Posted to commits@singa.apache.org by wa...@apache.org on 2015/09/27 16:34:24 UTC

[01/13] incubator-singa git commit: SINGA-72 Minor updates to be consistent with documentation

Repository: incubator-singa
Updated Branches:
  refs/heads/master 4e15c3444 -> 2f665370b


SINGA-72 Minor updates to be consistent with documentation

Modify the version number. Remove cmake and opencv from install.sh.


Project: http://git-wip-us.apache.org/repos/asf/incubator-singa/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-singa/commit/3d1b0dc1
Tree: http://git-wip-us.apache.org/repos/asf/incubator-singa/tree/3d1b0dc1
Diff: http://git-wip-us.apache.org/repos/asf/incubator-singa/diff/3d1b0dc1

Branch: refs/heads/master
Commit: 3d1b0dc1f0ce044534334e88f143d13d60294022
Parents: 8e7c6cc
Author: xiezl <xi...@comp.nus.edu.sg>
Authored: Tue Sep 22 14:27:39 2015 +0800
Committer: Wei Wang <wa...@comp.nus.edu.sg>
Committed: Sat Sep 26 23:23:33 2015 +0800

----------------------------------------------------------------------
 configure.ac          |   2 +-
 thirdparty/install.sh | 136 ++++++++++++++++++++++-----------------------
 2 files changed, 69 insertions(+), 69 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/3d1b0dc1/configure.ac
----------------------------------------------------------------------
diff --git a/configure.ac b/configure.ac
index bc11c34..1ae5cec 100644
--- a/configure.ac
+++ b/configure.ac
@@ -21,7 +21,7 @@
 # Process this file with autoconf to produce a configure script.
 
 AC_PREREQ(2.59)
-AC_INIT(singa, 0.1, dev@singa.incubator.apache.org)
+AC_INIT(singa, 0.1.0, dev@singa.incubator.apache.org)
 AC_CONFIG_AUX_DIR(config)
 AC_CONFIG_MACRO_DIR(config)
 AC_CONFIG_SRCDIR([src/utils/common.cc])

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/3d1b0dc1/thirdparty/install.sh
----------------------------------------------------------------------
diff --git a/thirdparty/install.sh b/thirdparty/install.sh
index 7bce5af..b9a3095 100755
--- a/thirdparty/install.sh
+++ b/thirdparty/install.sh
@@ -361,27 +361,27 @@ cd $BIN
 while [ $# != 0 ]
 do
 	case $1 in
-	"cmake")
-		echo "install cmake";
-		if [[ $2 == */* ]];then
-			install_cmake $2;
-		    if [ $? -ne 0 ] 
-		    then
-		        echo "ERROR during cmake installation" ;
-		        exit;
-		    fi  
-			shift
-			shift
-		else
-			install_cmake;
-		    if [ $? -ne 0 ] 
-		    then
-		        echo "ERROR during cmake installation" ;
-		        exit;
-		    fi  
-			shift
-		fi
-		;;
+#	"cmake")
+#		echo "install cmake";
+#		if [[ $2 == */* ]];then
+#			install_cmake $2;
+#		    if [ $? -ne 0 ] 
+#		    then
+#		        echo "ERROR during cmake installation" ;
+#		        exit;
+#		    fi  
+#			shift
+#			shift
+#		else
+#			install_cmake;
+#		    if [ $? -ne 0 ] 
+#		    then
+#		        echo "ERROR during cmake installation" ;
+#		        exit;
+#		    fi  
+#			shift
+#		fi
+#		;;
 	"czmq")
 		echo "install czmq";
 		if [ $2 == "-f" ]
@@ -496,27 +496,27 @@ do
 			shift
 		fi
 		;;
-	"opencv")
-		echo "install opencv";
-		if [[ $2 == */* ]];then
-			install_opencv $2;
-		    if [ $? -ne 0 ] 
-		    then
-		        echo "ERROR during opencv installation" ;
-		        exit;
-		    fi  
-			shift
-			shift
-		else
-			install_opencv;
-		    if [ $? -ne 0 ] 
-		    then
-		        echo "ERROR during opencv installation" ;
-		        exit;
-		    fi  
-			shift
-		fi
-		;;
+#	"opencv")
+#		echo "install opencv";
+#		if [[ $2 == */* ]];then
+#			install_opencv $2;
+#		    if [ $? -ne 0 ] 
+#		    then
+#		        echo "ERROR during opencv installation" ;
+#		        exit;
+#		    fi  
+#			shift
+#			shift
+#		else
+#			install_opencv;
+#		    if [ $? -ne 0 ] 
+#		    then
+#		        echo "ERROR during opencv installation" ;
+#		        exit;
+#		    fi  
+#			shift
+#		fi
+#		;;
 	 "protobuf")
 		echo "install protobuf";
 		if [[ $2 == */* ]];then
@@ -583,12 +583,12 @@ do
 	"all")
 		echo "install all dependencies";
 		if [[ $2 == */* ]];then
-			install_cmake $2;
-		    if [ $? -ne 0 ] 
-		    then
-		        echo "ERROR during cmake installation" ;
-		        exit;
-		    fi  
+#			install_cmake $2;
+#		    if [ $? -ne 0 ] 
+#		    then
+#		        echo "ERROR during cmake installation" ;
+#		        exit;
+#		    fi  
 			install_zeromq $2;
 		    if [ $? -ne 0 ] 
 		    then
@@ -619,12 +619,12 @@ do
 		        echo "ERROR during openblas installation" ;
 		        exit;
 		    fi  
-			install_opencv $2;
-		    if [ $? -ne 0 ] 
-		    then
-		        echo "ERROR during opencv installation" ;
-		        exit;
-		    fi  
+#			install_opencv $2;
+#		    if [ $? -ne 0 ] 
+#		    then
+#		        echo "ERROR during opencv installation" ;
+#		        exit;
+#		    fi  
 			install_protobuf $2;
 		    if [ $? -ne 0 ] 
 		    then
@@ -640,12 +640,12 @@ do
 			shift
 			shift
 		else
-			install_cmake;
-		    if [ $? -ne 0 ] 
-		    then
-		        echo "ERROR during cmake installation" ;
-		        exit;
-		    fi  
+#			install_cmake;
+#		    if [ $? -ne 0 ] 
+#		    then
+#		        echo "ERROR during cmake installation" ;
+#		        exit;
+#		    fi  
 			install_zeromq;
 		    if [ $? -ne 0 ] 
 		    then
@@ -676,12 +676,12 @@ do
 		        echo "ERROR during openblas installation" ;
 		        exit;
 		    fi  
-			install_opencv;
-		    if [ $? -ne 0 ] 
-		    then
-		        echo "ERROR during opencv installation" ;
-		        exit;
-		    fi  
+#			install_opencv;
+#		    if [ $? -ne 0 ] 
+#		    then
+#		        echo "ERROR during opencv installation" ;
+#		        exit;
+#		    fi  
 			install_protobuf;
 		    if [ $? -ne 0 ] 
 		    then
@@ -700,12 +700,12 @@ do
 	*)
 		echo "USAGE: ./install.sh [MISSING_LIBRARY_NAME1] [YOUR_INSTALL_PATH1] [MISSING_LIBRARY_NAME2] [YOUR_INSTALL_PATH2] ...";
 		echo " MISSING_LIBRARY_NAME can be:	"
-		echo "	cmake"
+#		echo "	cmake"
 		echo "	czmq"
 		echo "	glog"
 		echo "	lmdb"
 		echo "	OpenBLAS"
-		echo "	opencv"
+#		echo "	opencv"
 		echo "	protobuf"
 		echo "	zeromq"
 		echo "	zookeeper"


[09/13] incubator-singa git commit: SINGA-70 Refactor API of Layer, Worker, Server and Driver

Posted by wa...@apache.org.
SINGA-70 Refactor API of Layer, Worker, Server and Driver

For the Layer class
* Setup, ComputeFeature and ComputeGradient are updated to accept a
single argument that represents all source layers.
* DebugString() is changed to ToString() for displaying debug and other
info. For example, performance values can be aggregated in the
ComputeFeature function and then converted into a string in ToString().
* The srclayer and dstlayer fields are removed.

For the Worker class
* The Report function is removed. Performance is now collected via
Layer::ToString() and reported by each worker.

The Trainer class is renamed to Stub
* Only the Run() function and the message handling/generation functions
remain. Functions for creating servers and workers are moved into
Driver.

The Driver class
* The Submit function is renamed to Train.
* Functions for creating workers and servers are added.

All files under the trainer folder are moved out to sit directly under
src/ or include/.
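
A minimal sketch of a layer subclass written against the refactored API
(editor's illustration, not part of the commit; ToyLayer and its members
are hypothetical, while the overridden signatures follow the rnnlm example
changes included in this commit):

  #include <string>
  #include <vector>
  #include "neuralnet/layer.h"

  // Source layers are now passed in explicitly instead of being read from
  // the removed srclayer/dstlayer fields; performance values are aggregated
  // locally and reported through ToString() instead of a Metric object.
  class ToyLayer : public singa::Layer {
   public:
    void Setup(const singa::LayerProto& conf,
               const std::vector<singa::Layer*>& srclayers) override {
      Layer::Setup(conf, srclayers);
      data_.ReshapeLike(srclayers[0]->data(this));
    }
    void ComputeFeature(int flag,
                        const std::vector<singa::Layer*>& srclayers) override {
      // compute data_ from srclayers and accumulate loss_/num_ here
    }
    void ComputeGradient(int flag,
                         const std::vector<singa::Layer*>& srclayers) override {
      // compute gradients w.r.t. srclayers and params here
    }
    const std::string ToString(bool debug, int flag) override {
      return "loss = " + std::to_string(loss_ / num_);
    }

   private:
    float loss_ = 0.f;
    int num_ = 1;  // number of aggregated samples; kept >= 1 in this sketch
  };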


Project: http://git-wip-us.apache.org/repos/asf/incubator-singa/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-singa/commit/321ef96a
Tree: http://git-wip-us.apache.org/repos/asf/incubator-singa/tree/321ef96a
Diff: http://git-wip-us.apache.org/repos/asf/incubator-singa/diff/321ef96a

Branch: refs/heads/master
Commit: 321ef96a6e90ce7c70fae7e05b446c3dd38a3fef
Parents: ab984da
Author: Wei Wang <wa...@comp.nus.edu.sg>
Authored: Fri Sep 25 23:31:53 2015 +0800
Committer: Wei Wang <wa...@comp.nus.edu.sg>
Committed: Sat Sep 26 23:37:36 2015 +0800

----------------------------------------------------------------------
 .gitignore                           |   4 +-
 Makefile.am                          |  27 +-
 examples/rbm/autoencoder.conf        |   8 +-
 examples/rbm/rbm2.conf               |   2 +-
 examples/rbm/rbm3.conf               |   2 +-
 examples/rbm/rbm4.conf               |   2 +-
 examples/rnnlm/job.conf              |   6 +-
 examples/rnnlm/main.cc               |   2 +-
 examples/rnnlm/rnnlm.cc              | 121 ++++----
 examples/rnnlm/rnnlm.h               |  34 ++-
 include/comm/msg.h                   | 238 +++++++++++++++
 include/comm/socket.h                | 174 +++++++++++
 include/communication/msg.h          | 238 ---------------
 include/communication/socket.h       | 174 -----------
 include/driver.h                     |  66 ++++-
 include/neuralnet/connection_layer.h |  47 +--
 include/neuralnet/input_layer.h      |  48 ++-
 include/neuralnet/layer.h            | 212 ++++++++------
 include/neuralnet/loss_layer.h       |  26 +-
 include/neuralnet/neuralnet.h        |  33 ++-
 include/neuralnet/neuron_layer.h     |  80 ++---
 include/server.h                     | 133 +++++++++
 include/singa.h                      |   9 +-
 include/stub.h                       | 109 +++++++
 include/trainer/server.h             | 132 ---------
 include/trainer/trainer.h            | 163 -----------
 include/trainer/worker.h             | 258 ----------------
 include/utils/param.h                |  38 ++-
 include/worker.h                     | 311 ++++++++++++++++++++
 src/comm/msg.cc                      | 215 ++++++++++++++
 src/comm/socket.cc                   | 180 ++++++++++++
 src/communication/msg.cc             | 215 --------------
 src/communication/socket.cc          | 180 ------------
 src/driver.cc                        | 203 ++++++++++++-
 src/main.cc                          |  22 +-
 src/neuralnet/connection_layer.cc    |  66 +++--
 src/neuralnet/input_layer.cc         |  75 ++---
 src/neuralnet/layer.cc               |  19 +-
 src/neuralnet/loss_layer.cc          |  68 +++--
 src/neuralnet/neuralnet.cc           |  21 +-
 src/neuralnet/neuron_layer.cc        | 285 +++++++++---------
 src/proto/job.proto                  |  14 +-
 src/server.cc                        | 269 +++++++++++++++++
 src/stub.cc                          | 285 ++++++++++++++++++
 src/trainer/server.cc                | 263 -----------------
 src/trainer/trainer.cc               | 469 ------------------------------
 src/trainer/worker.cc                | 411 --------------------------
 src/utils/cluster.cc                 |   6 +-
 src/utils/common.cc                  |   8 +-
 src/utils/param.cc                   |  54 +++-
 src/worker.cc                        | 410 ++++++++++++++++++++++++++
 51 files changed, 3330 insertions(+), 3105 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/321ef96a/.gitignore
----------------------------------------------------------------------
diff --git a/.gitignore b/.gitignore
index 841b0d6..7ac9bc2 100644
--- a/.gitignore
+++ b/.gitignore
@@ -42,10 +42,10 @@ stamp-h1
 *.status
 config.h
 Makefile
-thirdparty/*
 config/*
 config.h.in
 configure
 aclocal.m4
 Makefile.in
-!thirdpary/install.sh
+thirdparty/*
+!thirdparty/install.sh

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/321ef96a/Makefile.am
----------------------------------------------------------------------
diff --git a/Makefile.am b/Makefile.am
index 3f68e29..00aacdd 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -8,7 +8,7 @@ DEFAULT_FLAGS = -Wall -pthread -fPIC -std=c++11 -Wno-unknown-pragmas \
               $(MSHADOW_FLAGS) -DCPU_ONLY=1 -funroll-loops -DTHREADED
 
 CFLAGS = $(DEBUG)
-CXXFLAGS = $(DEBUG) 
+CXXFLAGS = $(DEBUG)
 AC_CXXFLAGS = $(DEBUG)
 
 INCLUDES = -I$(top_srcdir)/include
@@ -35,9 +35,9 @@ SINGA_SRCS := src/driver.cc \
               src/utils/updater.cc \
               src/utils/data_shard.cc \
               src/utils/blob.cc \
-              src/trainer/server.cc \
-              src/trainer/worker.cc \
-              src/trainer/trainer.cc \
+              src/server.cc \
+              src/worker.cc \
+              src/stub.cc \
               src/neuralnet/layer.cc \
               src/neuralnet/connection_layer.cc \
               src/neuralnet/input_layer.cc \
@@ -45,8 +45,8 @@ SINGA_SRCS := src/driver.cc \
               src/neuralnet/neuron_layer.cc \
               src/neuralnet/output_layer.cc \
               src/neuralnet/neuralnet.cc \
-              src/communication/socket.cc \
-              src/communication/msg.cc
+              src/comm/socket.cc \
+              src/comm/msg.cc
 
 SINGA_HDRS := include/singa.h \
               include/utils/cluster.h \
@@ -60,9 +60,9 @@ SINGA_HDRS := include/singa.h \
               include/utils/blob.h \
               include/utils/updater.h \
               include/utils/tinydir.h \
-              include/trainer/server.h \
-              include/trainer/worker.h \
-              include/trainer/trainer.h \
+              include/server.h \
+              include/worker.h \
+              include/stub.h \
               include/neuralnet/layer.h \
               include/neuralnet/connection_layer.h \
               include/neuralnet/input_layer.h \
@@ -78,8 +78,8 @@ SINGA_HDRS := include/singa.h \
               include/mshadow/cxxnet_op.h \
               include/mshadow/tensor_base.h \
               include/mshadow/tensor_random.h \
-              include/communication/msg.h \
-              include/communication/socket.h
+              include/comm/msg.h \
+              include/comm/socket.h
 
 GTEST_SRCS := include/gtest/gtest-all.cc
 GTEST_HRDS := include/gtest/gtest.h
@@ -108,10 +108,9 @@ endif
 libsinga_la_LDFLAGS = -I./include
 
 
-
 #bin_PROGRAMS = singa
 singa_SOURCES = src/main.cc
-singa_CXXFLAGS = $(DEFAULT_FLAGS) -MMD 
+singa_CXXFLAGS = $(DEFAULT_FLAGS) -MMD
 singa_LDFLAGS = -I./include \
                 -lsinga \
                 -lglog  \
@@ -146,7 +145,7 @@ libgtest_la_LDFLAGS = -I./include
 #bin_PROGRAMS += singatest
 
 singatest_SOURCES = $(GTEST_HDRS) $(TEST_SRCS)
-singatest_CXXFLAGS = $(DEFAULT_FLAGS) 
+singatest_CXXFLAGS = $(DEFAULT_FLAGS)
 singatest_LDFLAGS = -I./include \
                 -lsinga \
                 -lglog  \

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/321ef96a/examples/rbm/autoencoder.conf
----------------------------------------------------------------------
diff --git a/examples/rbm/autoencoder.conf b/examples/rbm/autoencoder.conf
index 29f7729..c818c6e 100644
--- a/examples/rbm/autoencoder.conf
+++ b/examples/rbm/autoencoder.conf
@@ -3,10 +3,10 @@ train_steps: 12200
 test_steps:100
 test_freq:1000
 disp_freq:100
-checkpoint_path: "examples/rbm/rbm1/checkpoint/step6000-worker0.bin"
-checkpoint_path: "examples/rbm/rbm2/checkpoint/step6000-worker0.bin"
-checkpoint_path: "examples/rbm/rbm3/checkpoint/step6000-worker0.bin"
-checkpoint_path: "examples/rbm/rbm4/checkpoint/step6000-worker0.bin"
+checkpoint_path: "examples/rbm/rbm1/checkpoint/step6000-worker0"
+checkpoint_path: "examples/rbm/rbm2/checkpoint/step6000-worker0"
+checkpoint_path: "examples/rbm/rbm3/checkpoint/step6000-worker0"
+checkpoint_path: "examples/rbm/rbm4/checkpoint/step6000-worker0"
 train_one_batch{
   alg: kBP
 }

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/321ef96a/examples/rbm/rbm2.conf
----------------------------------------------------------------------
diff --git a/examples/rbm/rbm2.conf b/examples/rbm/rbm2.conf
index 8a16e0f..52dc698 100644
--- a/examples/rbm/rbm2.conf
+++ b/examples/rbm/rbm2.conf
@@ -6,7 +6,7 @@ disp_freq: 100
 train_one_batch{
   alg: kCD
 }
-checkpoint_path: "examples/rbm/rbm1/checkpoint/step6000-worker0.bin"
+checkpoint_path: "examples/rbm/rbm1/checkpoint/step6000-worker0"
 updater{
   type: kSGD
   momentum: 0.8

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/321ef96a/examples/rbm/rbm3.conf
----------------------------------------------------------------------
diff --git a/examples/rbm/rbm3.conf b/examples/rbm/rbm3.conf
index 75848d6..354fb3b 100644
--- a/examples/rbm/rbm3.conf
+++ b/examples/rbm/rbm3.conf
@@ -6,7 +6,7 @@ disp_freq: 100
 train_one_batch{
   alg: kCD
 }
-checkpoint_path: "examples/rbm/rbm2/checkpoint/step6000-worker0.bin"
+checkpoint_path: "examples/rbm/rbm2/checkpoint/step6000-worker0"
 
 updater{
   type: kSGD

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/321ef96a/examples/rbm/rbm4.conf
----------------------------------------------------------------------
diff --git a/examples/rbm/rbm4.conf b/examples/rbm/rbm4.conf
index 2b83afb..ebf39fa 100644
--- a/examples/rbm/rbm4.conf
+++ b/examples/rbm/rbm4.conf
@@ -6,7 +6,7 @@ disp_freq: 100
 train_one_batch{
   alg: kCD
 }
-checkpoint_path: "examples/rbm/rbm3/checkpoint/step6000-worker0.bin"
+checkpoint_path: "examples/rbm/rbm3/checkpoint/step6000-worker0"
 updater{
     type: kSGD
     momentum: 0.8

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/321ef96a/examples/rnnlm/job.conf
----------------------------------------------------------------------
diff --git a/examples/rnnlm/job.conf b/examples/rnnlm/job.conf
index db96e84..021692f 100644
--- a/examples/rnnlm/job.conf
+++ b/examples/rnnlm/job.conf
@@ -2,8 +2,8 @@ name: "rnnlm"
 #To scan the training file (81350) 10 times
 train_steps:81350
 #To scan the validation file (6828) once
-valid_steps:683
-valid_freq:8135
+validate_steps:683
+validate_freq:8135
 #disp_freq is specific to training
 disp_freq:8135
 train_one_batch {
@@ -36,7 +36,7 @@ layer {
     path: "examples/rnnlm/train_shard"
     max_window: 10
   }
-  exclude: kValidation
+  exclude: kVal
 }
 
 layer {

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/321ef96a/examples/rnnlm/main.cc
----------------------------------------------------------------------
diff --git a/examples/rnnlm/main.cc b/examples/rnnlm/main.cc
index 87db06a..ea1dcdd 100644
--- a/examples/rnnlm/main.cc
+++ b/examples/rnnlm/main.cc
@@ -40,6 +40,6 @@ int main(int argc, char **argv) {
 
   singa::JobProto jobConf = driver.job_conf();
 
-  driver.Submit(resume, jobConf);
+  driver.Train(resume, jobConf);
   return 0;
 }

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/321ef96a/examples/rnnlm/rnnlm.cc
----------------------------------------------------------------------
diff --git a/examples/rnnlm/rnnlm.cc b/examples/rnnlm/rnnlm.cc
index 0ad6dcd..c086972 100644
--- a/examples/rnnlm/rnnlm.cc
+++ b/examples/rnnlm/rnnlm.cc
@@ -57,19 +57,19 @@ DataLayer::~DataLayer() {
   shard_ = nullptr;
 }
 
-void DataLayer::Setup(const LayerProto& proto, int npartitions) {
-  RNNLayer::Setup(proto, npartitions);
+void DataLayer::Setup(const LayerProto& conf, const vector<Layer*>& srclayers) {
+  RNNLayer::Setup(conf, srclayers);
   shard_ = new singa::DataShard(
-               proto.GetExtension(data_conf).path(),
+               conf.GetExtension(data_conf).path(),
                singa::DataShard::kRead);
   string key;
-  max_window_ = proto.GetExtension(data_conf).max_window();
+  max_window_ = conf.GetExtension(data_conf).max_window();
   records_.resize(max_window_ + 1);  // resize to # of records in data layer
   window_ = 0;
   shard_->Next(&key, &records_[window_]);
 }
 
-void DataLayer::ComputeFeature(int flag, Metric *perf) {
+void DataLayer::ComputeFeature(int flag, const vector<Layer*>& srclayers) {
   CHECK(records_.size() <= shard_->Count());
   records_[0] = records_[window_];
   window_ = max_window_;
@@ -88,17 +88,18 @@ void DataLayer::ComputeFeature(int flag, Metric *perf) {
 }
 
 /*******LabelLayer**************/
-void LabelLayer::Setup(const LayerProto& proto, int npartitions) {
-  RNNLayer::Setup(proto, npartitions);
-  CHECK_EQ(srclayers_.size(), 1);
-  int max_window = dynamic_cast<DataLayer*>(srclayers_[0])->max_window();
+void LabelLayer::Setup(const LayerProto& conf,
+    const vector<Layer*>& srclayers) {
+  RNNLayer::Setup(conf, srclayers);
+  CHECK_EQ(srclayers.size(), 1);
+  int max_window = dynamic_cast<DataLayer*>(srclayers[0])->max_window();
   data_.Reshape(vector<int>{max_window, 4});
 }
 
-void LabelLayer::ComputeFeature(int flag, Metric *perf) {
-  const auto& records = dynamic_cast<DataLayer*>(srclayers_[0])->records();
+void LabelLayer::ComputeFeature(int flag, const vector<Layer*>& srclayers) {
+  const auto& records = dynamic_cast<DataLayer*>(srclayers[0])->records();
   float *label = data_.mutable_cpu_data();
-  window_ = dynamic_cast<RNNLayer*>(srclayers_[0])->window();
+  window_ = dynamic_cast<RNNLayer*>(srclayers[0])->window();
   for (int i = 0; i < window_; i++) {
     WordRecord wordrecord = records[i + 1].GetExtension(word);
     label[4 * i + 0] = wordrecord.class_start();
@@ -113,20 +114,21 @@ EmbeddingLayer::~EmbeddingLayer() {
   delete embed_;
 }
 
-void EmbeddingLayer::Setup(const LayerProto& proto, int npartitions) {
-  RNNLayer::Setup(proto, npartitions);
-  CHECK_EQ(srclayers_.size(), 1);
-  int max_window = dynamic_cast<DataLayer*>(srclayers_[0])->max_window();
-  word_dim_ = proto.GetExtension(embedding_conf).word_dim();
+void EmbeddingLayer::Setup(const LayerProto& conf,
+    const vector<Layer*>& srclayers) {
+  RNNLayer::Setup(conf, srclayers);
+  CHECK_EQ(srclayers.size(), 1);
+  int max_window = dynamic_cast<DataLayer*>(srclayers[0])->max_window();
+  word_dim_ = conf.GetExtension(embedding_conf).word_dim();
   data_.Reshape(vector<int>{max_window, word_dim_});
   grad_.ReshapeLike(data_);
-  vocab_size_ = proto.GetExtension(embedding_conf).vocab_size();
-  embed_ = Param::Create(proto.param(0));
+  vocab_size_ = conf.GetExtension(embedding_conf).vocab_size();
+  embed_ = Param::Create(conf.param(0));
   embed_->Setup(vector<int>{vocab_size_, word_dim_});
 }
 
-void EmbeddingLayer::ComputeFeature(int flag, Metric* perf) {
-  auto datalayer = dynamic_cast<DataLayer*>(srclayers_[0]);
+void EmbeddingLayer::ComputeFeature(int flag, const vector<Layer*>& srclayers) {
+  auto datalayer = dynamic_cast<DataLayer*>(srclayers[0]);
   window_ = datalayer->window();
   auto records = datalayer->records();
   auto words = RTensor2(&data_);
@@ -140,10 +142,11 @@ void EmbeddingLayer::ComputeFeature(int flag, Metric* perf) {
   }
 }
 
-void EmbeddingLayer::ComputeGradient(int flag, Metric* perf) {
+void EmbeddingLayer::ComputeGradient(int flag,
+    const vector<Layer*>& srclayers) {
   auto grad = RTensor2(&grad_);
   auto gembed = RTensor2(embed_->mutable_grad());
-  auto datalayer = dynamic_cast<DataLayer*>(srclayers_[0]);
+  auto datalayer = dynamic_cast<DataLayer*>(srclayers[0]);
   auto records = datalayer->records();
   gembed = 0;
   for (int t = 0; t < window_; t++) {
@@ -156,22 +159,23 @@ HiddenLayer::~HiddenLayer() {
   delete weight_;
 }
 
-void HiddenLayer::Setup(const LayerProto& proto, int npartitions) {
-  RNNLayer::Setup(proto, npartitions);
-  CHECK_EQ(srclayers_.size(), 1);
-  const auto& innerproductData = srclayers_[0]->data(this);
-  data_.ReshapeLike(srclayers_[0]->data(this));
-  grad_.ReshapeLike(srclayers_[0]->grad(this));
+void HiddenLayer::Setup(const LayerProto& conf,
+    const vector<Layer*>& srclayers) {
+  RNNLayer::Setup(conf, srclayers);
+  CHECK_EQ(srclayers.size(), 1);
+  const auto& innerproductData = srclayers[0]->data(this);
+  data_.ReshapeLike(srclayers[0]->data(this));
+  grad_.ReshapeLike(srclayers[0]->grad(this));
   int word_dim = data_.shape()[1];
-  weight_ = Param::Create(proto.param(0));
+  weight_ = Param::Create(conf.param(0));
   weight_->Setup(std::vector<int>{word_dim, word_dim});
 }
 
 // hid[t] = sigmoid(hid[t-1] * W + src[t])
-void HiddenLayer::ComputeFeature(int flag, Metric* perf) {
-  window_ = dynamic_cast<RNNLayer*>(srclayers_[0])->window();
+void HiddenLayer::ComputeFeature(int flag, const vector<Layer*>& srclayers) {
+  window_ = dynamic_cast<RNNLayer*>(srclayers[0])->window();
   auto data = RTensor2(&data_);
-  auto src = RTensor2(srclayers_[0]->mutable_data(this));
+  auto src = RTensor2(srclayers[0]->mutable_data(this));
   auto weight = RTensor2(weight_->mutable_data());
   for (int t = 0; t < window_; t++) {  // Skip the 1st component
     if (t == 0) {
@@ -184,12 +188,12 @@ void HiddenLayer::ComputeFeature(int flag, Metric* perf) {
   }
 }
 
-void HiddenLayer::ComputeGradient(int flag, Metric* perf) {
+void HiddenLayer::ComputeGradient(int flag, const vector<Layer*>& srclayers) {
   auto data = RTensor2(&data_);
   auto grad = RTensor2(&grad_);
   auto weight = RTensor2(weight_->mutable_data());
   auto gweight = RTensor2(weight_->mutable_grad());
-  auto gsrc = RTensor2(srclayers_[0]->mutable_grad(this));
+  auto gsrc = RTensor2(srclayers[0]->mutable_grad(this));
   gweight = 0;
   TensorContainer<cpu, 1> tmp(Shape1(data_.shape()[1]));
   // Check!!
@@ -210,30 +214,30 @@ LossLayer::~LossLayer() {
   delete class_weight_;
 }
 
-void LossLayer::Setup(const LayerProto& proto, int npartitions) {
-  RNNLayer::Setup(proto, npartitions);
-  CHECK_EQ(srclayers_.size(), 2);
-  const auto& src = srclayers_[0]->data(this);
+void LossLayer::Setup(const LayerProto& conf, const vector<Layer*>& srclayers) {
+  RNNLayer::Setup(conf, srclayers);
+  CHECK_EQ(srclayers.size(), 2);
+  const auto& src = srclayers[0]->data(this);
   int max_window = src.shape()[0];
   int vdim = src.count() / max_window;   // Dimension of input
-  int vocab_size = proto.GetExtension(loss_conf).vocab_size();
-  int nclass = proto.GetExtension(loss_conf).nclass();
-  word_weight_ = Param::Create(proto.param(0));
+  int vocab_size = conf.GetExtension(loss_conf).vocab_size();
+  int nclass = conf.GetExtension(loss_conf).nclass();
+  word_weight_ = Param::Create(conf.param(0));
   word_weight_->Setup(vector<int>{vocab_size, vdim});
-  class_weight_ = Param::Create(proto.param(1));
+  class_weight_ = Param::Create(conf.param(1));
   class_weight_->Setup(vector<int>{nclass, vdim});
 
   pword_.resize(max_window);
   pclass_.Reshape(vector<int>{max_window, nclass});
 }
 
-void LossLayer::ComputeFeature(int flag, Metric* perf) {
-  window_ = dynamic_cast<RNNLayer*>(srclayers_[0])->window();
+void LossLayer::ComputeFeature(int flag, const vector<Layer*>& srclayers) {
+  window_ = dynamic_cast<RNNLayer*>(srclayers[0])->window();
   auto pclass = RTensor2(&pclass_);
-  auto src = RTensor2(srclayers_[0]->mutable_data(this));
+  auto src = RTensor2(srclayers[0]->mutable_data(this));
   auto word_weight = RTensor2(word_weight_->mutable_data());
   auto class_weight = RTensor2(class_weight_->mutable_data());
-  const float * label = srclayers_[1]->data(this).cpu_data();
+  const float * label = srclayers[1]->data(this).cpu_data();
 
   float loss = 0.f, ppl = 0.f;
   for (int t = 0; t < window_; t++) {
@@ -254,24 +258,21 @@ void LossLayer::ComputeFeature(int flag, Metric* perf) {
     int cid = static_cast<int>(label[t * 4 + 3]);
     CHECK_GT(end, wid);
     CHECK_GE(wid, start);
-    loss += -log(std::max(pword[wid - start] * pclass[t][cid], FLT_MIN));
-    ppl += log10(std::max(pword[wid - start] * pclass[t][cid], FLT_MIN));
+    loss_ += -log(std::max(pword[wid - start] * pclass[t][cid], FLT_MIN));
+    ppl_ += log10(std::max(pword[wid - start] * pclass[t][cid], FLT_MIN));
   }
-
-  perf->Add("loss", loss, window_);
-  // users can compute the PPL value by 10^(ppl before exp)
-  perf->Add("ppl before exp", ppl, window_);
+  num_ += window_;
 }
 
-void LossLayer::ComputeGradient(int flag, Metric* perf) {
+void LossLayer::ComputeGradient(int flag, const vector<Layer*>& srclayers) {
   auto pclass = RTensor2(&pclass_);
-  auto src = RTensor2(srclayers_[0]->mutable_data(this));
-  auto gsrc = RTensor2(srclayers_[0]->mutable_grad(this));
+  auto src = RTensor2(srclayers[0]->mutable_data(this));
+  auto gsrc = RTensor2(srclayers[0]->mutable_grad(this));
   auto word_weight = RTensor2(word_weight_->mutable_data());
   auto gword_weight = RTensor2(word_weight_->mutable_grad());
   auto class_weight = RTensor2(class_weight_->mutable_data());
   auto gclass_weight = RTensor2(class_weight_->mutable_grad());
-  const float * label = srclayers_[1]->data(this).cpu_data();
+  const float * label = srclayers[1]->data(this).cpu_data();
   gclass_weight = 0;
   gword_weight = 0;
   for (int t = 0; t < window_; t++) {
@@ -299,4 +300,10 @@ void LossLayer::ComputeGradient(int flag, Metric* perf) {
     gsrc[t] += dot(pclass[t], class_weight);
   }
 }
+
+const std::string LossLayer::ToString(bool debug, int flag) {
+  float loss = loss_ / num_;
+  float ppl = exp10(- ppl_ / num_);
+  return "loss = " + std::to_string(loss) + ", ppl = " + std::to_string(ppl);
+}
 }   // end of namespace rnnlm

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/321ef96a/examples/rnnlm/rnnlm.h
----------------------------------------------------------------------
diff --git a/examples/rnnlm/rnnlm.h b/examples/rnnlm/rnnlm.h
index b848fa4..ad0918e 100644
--- a/examples/rnnlm/rnnlm.h
+++ b/examples/rnnlm/rnnlm.h
@@ -25,6 +25,7 @@
 #include "./rnnlm.pb.h"
 
 namespace rnnlm {
+using std::vector;
 using singa::LayerProto;
 using singa::Layer;
 using singa::Param;
@@ -57,8 +58,8 @@ class RNNLayer : virtual public Layer {
 class DataLayer : public RNNLayer, public singa::DataLayer {
  public:
   ~DataLayer();
-  void Setup(const LayerProto& proto, int npartitions) override;
-  void ComputeFeature(int flag, Metric *perf) override;
+  void Setup(const LayerProto& conf, const vector<Layer*>& srclayers) override;
+  void ComputeFeature(int flag, const vector<Layer*>& srclayers) override;
   int max_window() const {
     return max_window_;
   }
@@ -75,9 +76,9 @@ class DataLayer : public RNNLayer, public singa::DataLayer {
  */
 class LabelLayer : public RNNLayer {
  public:
-  void Setup(const LayerProto& proto, int npartitions) override;
-  void ComputeFeature(int flag, Metric *perf) override;
-  void ComputeGradient(int flag, Metric* perf) override {}
+  void Setup(const LayerProto& conf, const vector<Layer*>& srclayers) override;
+  void ComputeFeature(int flag, const vector<Layer*>& srclayers) override;
+  void ComputeGradient(int flag, const vector<Layer*>& srclayers) override {}
 };
 
 
@@ -88,9 +89,9 @@ class LabelLayer : public RNNLayer {
 class EmbeddingLayer : public RNNLayer {
  public:
   ~EmbeddingLayer();
-  void Setup(const LayerProto& proto, int npartitions) override;
-  void ComputeFeature(int flag, Metric *perf) override;
-  void ComputeGradient(int flag, Metric* perf) override;
+  void Setup(const LayerProto& conf, const vector<Layer*>& srclayers) override;
+  void ComputeFeature(int flag, const vector<Layer*>& srclayers) override;
+  void ComputeGradient(int flag, const vector<Layer*>& srclayers) override;
   const std::vector<Param*> GetParams() const override {
     std::vector<Param*> params{embed_};
     return params;
@@ -111,9 +112,10 @@ class EmbeddingLayer : public RNNLayer {
 class HiddenLayer : public RNNLayer {
  public:
   ~HiddenLayer();
-  void Setup(const LayerProto& proto, int npartitions) override;
-  void ComputeFeature(int flag, Metric *perf) override;
-  void ComputeGradient(int flag, Metric* perf) override;
+  void Setup(const LayerProto& conf, const vector<Layer*>& srclayers) override;
+  void ComputeFeature(int flag, const vector<Layer*>& srclayers) override;
+  void ComputeGradient(int flag, const vector<Layer*>& srclayers) override;
+
   const std::vector<Param*> GetParams() const override {
     std::vector<Param*> params{weight_};
     return params;
@@ -132,9 +134,11 @@ class HiddenLayer : public RNNLayer {
 class LossLayer : public RNNLayer {
  public:
   ~LossLayer();
-  void Setup(const LayerProto& proto, int npartitions) override;
-  void ComputeFeature(int flag, Metric *perf) override;
-  void ComputeGradient(int flag, Metric* perf) override;
+  void Setup(const LayerProto& conf, const vector<Layer*>& srclayers) override;
+  void ComputeFeature(int flag, const vector<Layer*>& srclayers) override;
+  void ComputeGradient(int flag, const vector<Layer*>& srclayers) override;
+
+  const std::string ToString(bool debug, int flag) override;
   const std::vector<Param*> GetParams() const override {
     std::vector<Param*> params{word_weight_, class_weight_};
     return params;
@@ -144,6 +148,8 @@ class LossLayer : public RNNLayer {
   std::vector<Blob<float>> pword_;
   Blob<float> pclass_;
   Param* word_weight_, *class_weight_;
+  float loss_, ppl_;
+  int num_;
 };
 }  // namespace rnnlm
 #endif  // EXAMPLES_RNNLM_RNNLM_H_

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/321ef96a/include/comm/msg.h
----------------------------------------------------------------------
diff --git a/include/comm/msg.h b/include/comm/msg.h
new file mode 100644
index 0000000..50a9b81
--- /dev/null
+++ b/include/comm/msg.h
@@ -0,0 +1,238 @@
+/************************************************************
+*
+* Licensed to the Apache Software Foundation (ASF) under one
+* or more contributor license agreements.  See the NOTICE file
+* distributed with this work for additional information
+* regarding copyright ownership.  The ASF licenses this file
+* to you under the Apache License, Version 2.0 (the
+* "License"); you may not use this file except in compliance
+* with the License.  You may obtain a copy of the License at
+*
+*   http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing,
+* software distributed under the License is distributed on an
+* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+* KIND, either express or implied.  See the License for the
+* specific language governing permissions and limitations
+* under the License.
+*
+*************************************************************/
+
+#ifndef SINGA_COMM_MSG_H_
+#define SINGA_COMM_MSG_H_
+
+// TODO(wangwei): make it a compiler argument
+#define USE_ZMQ
+
+#include <utility>
+#ifdef USE_ZMQ
+#include <czmq.h>
+#endif
+
+namespace singa {
+/**
+ * Wrapper to generate message address
+ * @param grp worker/server group id
+ * @param id_or_proc worker/server id or procs id
+ * @param type msg type
+ */
+inline int Addr(int grp, int id_or_proc, int type) {
+  return (grp << 16) | (id_or_proc << 8) | type;
+}
+
+/**
+ * Parse group id from addr.
+ *
+ * @return group id
+ */
+inline int AddrGrp(int addr) {
+  return addr >> 16;
+}
+
+/**
+ * Parse worker/server id from addr.
+ *
+ * @return id
+ */
+inline int AddrID(int addr) {
+  static const int mask = (1 << 8) - 1;
+  return (addr >> 8) & mask;
+}
+
+/**
+ * Parse worker/server procs from addr.
+ *
+ * @return procs id
+ */
+inline int AddrProc(int addr) {
+  return AddrID(addr);
+}
+
+/**
+ * Parse msg type from addr
+ * @return msg type
+ */
+inline int AddrType(int addr) {
+  static const int mask = (1 << 8) -1;
+  return addr & mask;
+}
+
+/**
+ * Msg used to transfer Param info (gradient or value), feature blob, etc
+ * between workers, stubs and servers.
+ *
+ * Each msg has a source addr and dest addr identified by a unique integer.
+ * It is also associated with a target field (value and version) for ease of
+ * getting some meta info (e.g., parameter id) from the msg.
+ *
+ * Other data is added into the message as frames.
+ */
+class Msg {
+ public:
+  ~Msg();
+  Msg();
+  /**
+   * Construct the msg providing source and destination addr.
+   */
+  Msg(int src, int dst);
+  /**
+   * Copy constructor.
+   */
+  Msg(const Msg& msg);
+  /**
+   * Swap the src/dst addr
+   */
+  void SwapAddr();
+  /**
+   * Add a frame (a chunk of bytes) into the message
+   */
+  void AddFrame(const void* addr, int nBytes);
+  /**
+   * @return num of bytes of the current frame.
+   */
+  int FrameSize();
+  /**
+   * @return the pointer to the current frame data.
+   */
+  void* FrameData();
+  /**
+   * @return the data of the current frame as c string
+   */
+  char* FrameStr();
+  /**
+   * Move the cursor to the first frame.
+   */
+  void FirstFrame();
+  /**
+   * Move the cursor to the last frame.
+   */
+  void LastFrame();
+  /**
+   * Move the cursor to the next frame
+   * @return true if the next frame is not NULL; otherwise false
+   */
+  bool NextFrame();
+  /**
+   *  Add a 'format' frame to the msg (like CZMQ's zsock_send).
+   *
+   *  The format is a string that defines the type of each field.
+   *  The format can contain any of these characters, each corresponding to
+   *  one or two arguments:
+   *  i = int (signed)
+   *  1 = uint8_t
+   *  2 = uint16_t
+   *  4 = uint32_t
+   *  8 = uint64_t
+   *  p = void * (sends the pointer value, only meaningful over inproc)
+   *  s = char**
+   *
+   *  Returns size of the added content.
+   */
+  int AddFormatFrame(const char *format, ...);
+  /**
+   *  Parse the current frame added using AddFormatFrame(const char*, ...).
+   *
+   *  The format is a string that defines the type of each field.
+   *  The format can contain any of these characters, each corresponding to
+   *  one or two arguments:
+   *  i = int (signed)
+   *  1 = uint8_t
+   *  2 = uint16_t
+   *  4 = uint32_t
+   *  8 = uint64_t
+   *  p = void * (sends the pointer value, only meaningful over inproc)
+   *  s = char**
+   *
+   *  Returns size of the parsed content.
+   */
+  int ParseFormatFrame(const char* format, ...);
+
+#ifdef USE_ZMQ
+  void ParseFromZmsg(zmsg_t* msg);
+  zmsg_t* DumpToZmsg();
+#endif
+
+  /**
+   * @return msg size in terms of bytes, ignore meta info.
+   */
+  int size() const;
+  /**
+   * Set source addr.
+   * @param addr uniquely identifies one worker/server/stub in the current job
+   */
+  inline void set_src(int addr) { src_ = addr; }
+  /**
+   * @return source addr.
+   */
+  inline int src() const { return src_; }
+  /**
+   * Set destination addr.
+   * @param addr uniquely identifies one worker/server/stub in the current job
+   */
+  inline void set_dst(int addr) { dst_ = addr; }
+  /**
+   * @return dst addr.
+   */
+  inline int dst() const { return dst_; }
+  /**
+   * Set msg type, e.g., kPut, kGet, kUpdate, kRequest
+   */
+  inline void set_type(int type) { type_ = type; }
+  /**
+   * @return msg type.
+   */
+  inline int type() const { return type_; }
+  /**
+   * Set msg target.
+   *
+   * One msg has a target to identify some entity in worker/server/stub.
+   * The target is associated with a version, e.g., Param version.
+   */
+  inline void set_trgt(int val, int version) {
+    trgt_val_ = val;
+    trgt_version_ = version;
+  }
+  inline int trgt_val() const { return trgt_val_; }
+  inline int trgt_version() const { return trgt_version_; }
+
+ protected:
+  int src_ = 0;
+  int dst_ = 0;
+  int type_ = 0;
+  int trgt_val_ = 0;
+  int trgt_version_ = 0;
+#ifdef USE_ZMQ
+  zmsg_t* msg_ = nullptr;
+  zframe_t *frame_ = nullptr;
+#endif
+};
+
+inline void DeleteMsg(Msg** msg) {
+  delete *msg;
+  *msg = nullptr;
+}
+
+}  // namespace singa
+
+#endif  // SINGA_COMM_MSG_H_
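
Editor's note: a short, assumed usage sketch of the address helpers and the
Msg class declared above (not code from the commit; the frame payloads and
the type value 0 are placeholders):

  #include "comm/msg.h"

  void SketchMsgUsage() {
    // Addr packs (grp << 16) | (id_or_proc << 8) | type into one int, so
    // AddrGrp(src) == 1, AddrID(src) == 2 and AddrType(src) == 0 below.
    int src = singa::Addr(1, 2, 0);
    int dst = singa::Addr(1, 3, 0);

    singa::Msg* msg = new singa::Msg(src, dst);
    msg->set_trgt(42, 0);           // e.g., a Param id plus its version
    msg->AddFormatFrame("i", 7);    // "i" packs one signed int into a frame
    msg->AddFrame("hello", 5);      // raw frame: pointer plus number of bytes
    // ... normally the msg is handed to a Dealer/Router, which frees it on Send ...
    singa::DeleteMsg(&msg);         // only if the msg was not sent
  }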

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/321ef96a/include/comm/socket.h
----------------------------------------------------------------------
diff --git a/include/comm/socket.h b/include/comm/socket.h
new file mode 100644
index 0000000..f2ffb4d
--- /dev/null
+++ b/include/comm/socket.h
@@ -0,0 +1,174 @@
+/************************************************************
+*
+* Licensed to the Apache Software Foundation (ASF) under one
+* or more contributor license agreements.  See the NOTICE file
+* distributed with this work for additional information
+* regarding copyright ownership.  The ASF licenses this file
+* to you under the Apache License, Version 2.0 (the
+* "License"); you may not use this file except in compliance
+* with the License.  You may obtain a copy of the License at
+*
+*   http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing,
+* software distributed under the License is distributed on an
+* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+* KIND, either express or implied.  See the License for the
+* specific language governing permissions and limitations
+* under the License.
+*
+*************************************************************/
+
+#ifndef SINGA_COMM_SOCKET_H_
+#define SINGA_COMM_SOCKET_H_
+
+#ifdef USE_ZMQ
+#include <czmq.h>
+#endif
+#include <map>
+#include <string>
+#include <vector>
+#include "comm/msg.h"
+
+namespace singa {
+
+const std::string kInprocRouterEndpoint = "inproc://router";
+
+class SocketInterface {
+ public:
+  virtual ~SocketInterface() {}
+  /**
+    * Send a message to connected socket(s), non-blocking. The message
+    * will be deallocated after sending, thus should not be used after
+    * calling Send();
+    *
+    * @param msg The message to be sent
+    * @return 1 for success queuing the message for sending, 0 for failure
+    */
+  virtual int Send(Msg** msg) = 0;
+  /**
+    * Receive a message from any connected socket.
+    *
+    * @return a message pointer if success; nullptr if failure
+    */
+  virtual Msg* Receive() = 0;
+  /**
+   * @return Identifier of the implementation dependent socket. E.g., zsock_t*
+   * for ZeroMQ implementation and rank for MPI implementation.
+   */
+  virtual void* InternalID() const = 0;
+};
+
+class Poller {
+ public:
+  Poller();
+  explicit Poller(SocketInterface* socket);
+  /**
+    * Add a socket for polling; Multiple sockets can be polled together by
+    * adding them into the same poller.
+    */
+  void Add(SocketInterface* socket);
+  /**
+    * Poll for all sockets added into this poller.
+    * @param timeout Stop after this number of milliseconds
+    * @return pointer to the socket if it has one message in the receiving
+    * queue; nullptr if no message in any socket.
+    */
+  SocketInterface* Wait(int duration);
+
+  /**
+   * @return true if the poller is terminated due to process interrupt
+   */
+  virtual bool Terminated();
+
+ protected:
+#ifdef USE_ZMQ
+  zpoller_t *poller_;
+  std::map<zsock_t*, SocketInterface*> zsock2Socket_;
+#endif
+};
+
+class Dealer : public SocketInterface {
+ public:
+  /*
+   * @param id Local dealer ID within a procs if the dealer is from worker or
+   * server thread, starts from 1 (0 is used by the router); or the connected
+   * remote procs ID for inter-process dealers from the stub thread.
+   */
+  Dealer();
+  explicit Dealer(int id);
+  ~Dealer() override;
+  /**
+    * Setup the connection with the router.
+    *
+    * @param endpoint Identifier of the router. For intra-process
+    * connection, the endpoint follows the format of ZeroMQ, i.e.,
+    * starting with "inproc://"; since each SINGA process has one router,
+    * the endpoint can be fixed to "inproc://router" for intra-process
+    * connections. For inter-process, the endpoint follows ZeroMQ's
+    * format, i.e., IP:port, where IP is the connected process.
+    * @return 1 connection sets up successfully; 0 otherwise
+    */
+  int Connect(const std::string& endpoint);
+  int Send(Msg** msg) override;
+  Msg* Receive() override;
+  void* InternalID() const override;
+
+ protected:
+  int id_ = -1;
+#ifdef USE_ZMQ
+  zsock_t* dealer_ = nullptr;
+  zpoller_t* poller_ = nullptr;
+#endif
+};
+
+class Router : public SocketInterface {
+ public:
+  Router();
+  /**
+   * There is only one router per procs, hence its local id is 0 and is not set
+   * explicitly.
+   *
+   * @param bufsize Buffer at most this number of messages
+   */
+  explicit Router(int bufsize);
+  ~Router() override;
+  /**
+   * Setup the connection with dealers.
+   *
+   * It automatically binds to the endpoint for intra-process communication,
+   * i.e., "inproc://router".
+   *
+   * @param endpoint The identifier for the Dealer socket in another process
+   * to connect to. It has the format IP:Port, where IP is the host machine.
+   * If endpoint is empty, all connections are intra-process.
+   * @return number of connected dealers.
+   */
+  int Bind(const std::string& endpoint);
+  /**
+   * If the destination socket has not connected yet, buffer the message.
+   */
+  int Send(Msg** msg) override;
+  Msg* Receive() override;
+  void* InternalID() const override;
+
+ protected:
+  int nBufmsg_ = 0;
+  int bufsize_ = 100;
+#ifdef USE_ZMQ
+  zsock_t* router_ = nullptr;
+  zpoller_t* poller_ = nullptr;
+  std::map<int, zframe_t*> id2addr_;
+  std::map<int, std::vector<zmsg_t*>> bufmsg_;
+#endif
+};
+
+#ifdef USE_MPI
+// TODO(wangsheng): add intra-process communication using shared queue
+std::vector<SafeQueue*> MPIQueues;
+#endif
+
+}  // namespace singa
+
+#endif  // SINGA_COMM_SOCKET_H_
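
Editor's note: a rough sketch of how the Dealer and Router declared above are
assumed to be wired together inside one process (not code from the commit;
threading and message routing details are omitted):

  #include "comm/socket.h"

  void SketchSocketUsage() {
    singa::Router router(100);           // buffer at most 100 messages
    router.Bind("");                     // empty endpoint: intra-process only

    singa::Dealer dealer(1);             // local id; 0 is reserved for the router
    dealer.Connect(singa::kInprocRouterEndpoint);  // "inproc://router"

    singa::Msg* msg = new singa::Msg();
    dealer.Send(&msg);                   // non-blocking; msg is freed by Send()
    singa::Msg* received = router.Receive();
    if (received != nullptr) singa::DeleteMsg(&received);
  }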

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/321ef96a/include/communication/msg.h
----------------------------------------------------------------------
diff --git a/include/communication/msg.h b/include/communication/msg.h
deleted file mode 100644
index 217d89a..0000000
--- a/include/communication/msg.h
+++ /dev/null
@@ -1,238 +0,0 @@
-/************************************************************
-*
-* Licensed to the Apache Software Foundation (ASF) under one
-* or more contributor license agreements.  See the NOTICE file
-* distributed with this work for additional information
-* regarding copyright ownership.  The ASF licenses this file
-* to you under the Apache License, Version 2.0 (the
-* "License"); you may not use this file except in compliance
-* with the License.  You may obtain a copy of the License at
-* 
-*   http://www.apache.org/licenses/LICENSE-2.0
-* 
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an
-* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-* KIND, either express or implied.  See the License for the
-* specific language governing permissions and limitations
-* under the License.
-*
-*************************************************************/
-
-#ifndef SINGA_COMMUNICATION_MSG_H_
-#define SINGA_COMMUNICATION_MSG_H_
-
-// TODO(wangwei): make it a compiler argument
-#define USE_ZMQ
-
-#include <utility>
-#ifdef USE_ZMQ
-#include <czmq.h>
-#endif
-
-namespace singa {
-/**
- * Wrapper to generate message address
- * @param grp worker/server group id
- * @param id_or_proc worker/server id or procs id
- * @param type msg type
- */
-inline int Addr(int grp, int id_or_proc, int type) {
-  return (grp << 16) | (id_or_proc << 8) | type;
-}
-
-/**
- * Parse group id from addr.
- *
- * @return group id
- */
-inline int AddrGrp(int addr) {
-  return addr >> 16;
-}
-
-/**
- * Parse worker/server id from addr.
- *
- * @return id
- */
-inline int AddrID(int addr) {
-  static const int mask = (1 << 8) - 1;
-  return (addr >> 8) & mask;
-}
-
-/**
- * Parse worker/server procs from addr.
- *
- * @return procs id
- */
-inline int AddrProc(int addr) {
-  return AddrID(addr);
-}
-
-/**
- * Parse msg type from addr
- * @return msg type
- */
-inline int AddrType(int addr) {
-  static const int mask = (1 << 8) -1;
-  return addr & mask;
-}
-
-/**
- * Msg used to transfer Param info (gradient or value), feature blob, etc
- * between workers, stubs and servers.
- *
- * Each msg has a source addr and dest addr identified by a unique integer.
- * It is also associated with a target field (value and version) for ease of
- * getting some meta info (e.g., parameter id) from the msg.
- *
- * Other data is added into the message as frames.
- */
-class Msg {
- public:
-  ~Msg();
-  Msg();
-  /**
-   * Construct the msg providing source and destination addr.
-   */
-  Msg(int src, int dst);
-  /**
-   * Copy constructor.
-   */
-  Msg(const Msg& msg);
-  /**
-   * Swap the src/dst addr
-   */
-  void SwapAddr();
-  /**
-   * Add a frame (a chunck of bytes) into the message
-   */
-  void AddFrame(const void* addr, int nBytes);
-  /**
-   * @return num of bytes of the current frame.
-   */
-  int FrameSize();
-  /**
-   * @return the pointer to the current frame data.
-   */
-  void* FrameData();
-  /**
-   * @return the data of the current frame as c string
-   */
-  char* FrameStr();
-  /**
-   * Move the cursor to the first frame.
-   */
-  void FirstFrame();
-  /**
-   * Move the cursor to the last frame.
-   */
-  void LastFrame();
-  /**
-   * Move the cursor to the next frame
-   * @return true if the next frame is not NULL; otherwise false
-   */
-  bool NextFrame();
-  /**
-   *  Add a 'format' frame to the msg (like CZMQ's zsock_send).
-   *
-   *  The format is a string that defines the type of each field.
-   *  The format can contain any of these characters, each corresponding to
-   *  one or two arguments:
-   *  i = int (signed)
-   *  1 = uint8_t
-   *  2 = uint16_t
-   *  4 = uint32_t
-   *  8 = uint64_t
-   *  p = void * (sends the pointer value, only meaningful over inproc)
-   *  s = char**
-   *
-   *  Returns size of the added content.
-   */
-  int AddFormatFrame(const char *format, ...);
-  /**
-   *  Parse the current frame added using AddFormatFrame(const char*, ...).
-   *
-   *  The format is a string that defines the type of each field.
-   *  The format can contain any of these characters, each corresponding to
-   *  one or two arguments:
-   *  i = int (signed)
-   *  1 = uint8_t
-   *  2 = uint16_t
-   *  4 = uint32_t
-   *  8 = uint64_t
-   *  p = void * (sends the pointer value, only meaningful over inproc)
-   *  s = char**
-   *
-   *  Returns size of the parsed content.
-   */
-  int ParseFormatFrame(const char* format, ...);
-
-#ifdef USE_ZMQ
-  void ParseFromZmsg(zmsg_t* msg);
-  zmsg_t* DumpToZmsg();
-#endif
-
-  /**
-   * @return msg size in terms of bytes, ignore meta info.
-   */
-  int size() const;
-  /**
-   * Set source addr.
-   * @param addr unique identify one worker/server/stub in the current job
-   */
-  inline void set_src(int addr) { src_ = addr; }
-  /**
-   * @return source addr.
-   */
-  inline int src() const { return src_; }
-  /**
-   * Set destination addr.
-   * @param addr unique identify one worker/server/stub in the current job
-   */
-  inline void set_dst(int addr) { dst_ = addr; }
-  /**
-   * @return dst addr.
-   */
-  inline int dst() const { return dst_; }
-  /**
-   * Set msg type, e.g., kPut, kGet, kUpdate, kRequest
-   */
-  inline void set_type(int type) { type_ = type; }
-  /**
-   * @return msg type.
-   */
-  inline int type() const { return type_; }
-  /**
-   * Set msg target.
-   *
-   * One msg has a target to identify some entity in worker/server/stub.
-   * The target is associated with a version, e.g., Param version.
-   */
-  inline void set_trgt(int val, int version) {
-    trgt_val_ = val;
-    trgt_version_ = version;
-  }
-  inline int trgt_val() const { return trgt_val_; }
-  inline int trgt_version() const { return trgt_version_; }
-
- protected:
-  int src_ = 0;
-  int dst_ = 0;
-  int type_ = 0;
-  int trgt_val_ = 0;
-  int trgt_version_ = 0;
-#ifdef USE_ZMQ
-  zmsg_t* msg_ = nullptr;
-  zframe_t *frame_ = nullptr;
-#endif
-};
-
-inline void DeleteMsg(Msg** msg) {
-  delete *msg;
-  *msg = nullptr;
-}
-
-}  // namespace singa
-
-#endif  // SINGA_COMMUNICATION_MSG_H_

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/321ef96a/include/communication/socket.h
----------------------------------------------------------------------
diff --git a/include/communication/socket.h b/include/communication/socket.h
deleted file mode 100644
index 3590577..0000000
--- a/include/communication/socket.h
+++ /dev/null
@@ -1,174 +0,0 @@
-/************************************************************
-*
-* Licensed to the Apache Software Foundation (ASF) under one
-* or more contributor license agreements.  See the NOTICE file
-* distributed with this work for additional information
-* regarding copyright ownership.  The ASF licenses this file
-* to you under the Apache License, Version 2.0 (the
-* "License"); you may not use this file except in compliance
-* with the License.  You may obtain a copy of the License at
-* 
-*   http://www.apache.org/licenses/LICENSE-2.0
-* 
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an
-* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-* KIND, either express or implied.  See the License for the
-* specific language governing permissions and limitations
-* under the License.
-*
-*************************************************************/
-
-#ifndef SINGA_COMMUNICATION_SOCKET_H_
-#define SINGA_COMMUNICATION_SOCKET_H_
-
-#ifdef USE_ZMQ
-#include <czmq.h>
-#endif
-#include <map>
-#include <string>
-#include <vector>
-#include "communication/msg.h"
-
-namespace singa {
-
-const std::string kInprocRouterEndpoint = "inproc://router";
-
-class SocketInterface {
- public:
-  virtual ~SocketInterface() {}
-  /**
-    * Send a message to connected socket(s), non-blocking. The message
-    * will be deallocated after sending, thus should not be used after
-    * calling Send();
-    *
-    * @param msg The message to be sent
-    * @return 1 for success queuing the message for sending, 0 for failure
-    */
-  virtual int Send(Msg** msg) = 0;
-  /**
-    * Receive a message from any connected socket.
-    *
-    * @return a message pointer if success; nullptr if failure
-    */
-  virtual Msg* Receive() = 0;
-  /**
-   * @return Identifier of the implementation dependent socket. E.g., zsock_t*
-   * for ZeroMQ implementation and rank for MPI implementation.
-   */
-  virtual void* InternalID() const = 0;
-};
-
-class Poller {
- public:
-  Poller();
-  explicit Poller(SocketInterface* socket);
-  /**
-    * Add a socket for polling; Multiple sockets can be polled together by
-    * adding them into the same poller.
-    */
-  void Add(SocketInterface* socket);
-  /**
-    * Poll for all sockets added into this poller.
-    * @param timeout Stop after this number of mseconds
-    * @return pointer To the socket if it has one message in the receiving
-    * queue; nullptr if no message in any sockets,
-    */
-  SocketInterface* Wait(int duration);
-
-  /**
-   * @return true if the poller is terminated due to process interupt
-   */
-  virtual bool Terminated();
-
- protected:
-#ifdef USE_ZMQ
-  zpoller_t *poller_;
-  std::map<zsock_t*, SocketInterface*> zsock2Socket_;
-#endif
-};
-
-class Dealer : public SocketInterface {
- public:
-  /*
-   * @param id Local dealer ID within a procs if the dealer is from worker or
-   * server thread, starts from 1 (0 is used by the router); or the connected
-   * remote procs ID for inter-process dealers from the stub thread.
-   */
-  Dealer();
-  explicit Dealer(int id);
-  ~Dealer() override;
-  /**
-    * Setup the connection with the router.
-    *
-    * @param endpoint Identifier of the router. For intra-process
-    * connection, the endpoint follows the format of ZeroMQ, i.e.,
-    * starting with "inproc://"; in Singa, since each process has one
-    * router, hence we can fix the endpoint to be "inproc://router" for
-    * intra-process. For inter-process, the endpoint follows ZeroMQ's
-    * format, i.e., IP:port, where IP is the connected process.
-    * @return 1 connection sets up successfully; 0 otherwise
-    */
-  int Connect(const std::string& endpoint);
-  int Send(Msg** msg) override;
-  Msg* Receive() override;
-  void* InternalID() const override;
-
- protected:
-  int id_ = -1;
-#ifdef USE_ZMQ
-  zsock_t* dealer_ = nullptr;
-  zpoller_t* poller_ = nullptr;
-#endif
-};
-
-class Router : public SocketInterface {
- public:
-  Router();
-  /**
-   * There is only one router per procs, hence its local id is 0 and is not set
-   * explicitly.
-   *
-   * @param bufsize Buffer at most this number of messages
-   */
-  explicit Router(int bufsize);
-  ~Router() override;
-  /**
-   * Setup the connection with dealers.
-   *
-   * It automatically binds to the endpoint for intra-process communication,
-   * i.e., "inproc://router".
-   *
-   * @param endpoint The identifier for the Dealer socket in other process
-   * to connect. It has the format IP:Port, where IP is the host machine.
-   * If endpoint is empty, it means that all connections are
-   * intra-process connection.
-   * @return number of connected dealers.
-   */
-  int Bind(const std::string& endpoint);
-  /**
-   * If the destination socket has not connected yet, buffer this the message.
-   */
-  int Send(Msg** msg) override;
-  Msg* Receive() override;
-  void* InternalID() const override;
-
- protected:
-  int nBufmsg_ = 0;
-  int bufsize_ = 100;
-#ifdef USE_ZMQ
-  zsock_t* router_ = nullptr;
-  zpoller_t* poller_ = nullptr;
-  std::map<int, zframe_t*> id2addr_;
-  std::map<int, std::vector<zmsg_t*>> bufmsg_;
-#endif
-};
-
-#ifdef USE_MPI
-// TODO(wangsheng): add intra-process communication using shared queue
-std::vector<SafeQueue*> MPIQueues;
-#endif
-
-}  // namespace singa
-
-#endif  // SINGA_COMMUNICATION_SOCKET_H_

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/321ef96a/include/driver.h
----------------------------------------------------------------------
diff --git a/include/driver.h b/include/driver.h
index b33c7cc..9ae4b27 100644
--- a/include/driver.h
+++ b/include/driver.h
@@ -22,6 +22,7 @@
 #ifndef SINGA_DRIVER_H_
 #define SINGA_DRIVER_H_
 
+#include <vector>
 #include "proto/job.pb.h"
 #include "proto/singa.pb.h"
 #include "utils/factory.h"
@@ -29,20 +30,70 @@
 #include "utils/singleton.h"
 #include "utils/updater.h"
 #include "neuralnet/layer.h"
-#include "trainer/worker.h"
+#include "./worker.h"
+#include "./server.h"
 
 namespace singa {
-
+using std::vector;
 class Driver {
  public:
   /**
-   * Init SINGA, including init glog, parse job id and job conf from cmd line,
-   * and register built-in layer, worker, updater, param subclasses.
+   * Init SINGA
+   * - init glog
+   * - parse job id and job conf from cmd line
+   * - register built-in layer, worker, updater, param subclasses.
    *
    * May be used for MPI init if it is used for message passing.
    */
   void Init(int argc, char** argv);
   /**
+   * Update job configuration and call Train(const JobProto&) to start the
+   * training.
+   *
+   * It sets up the logging path and checkpoint files (if resume), and checks
+   * the existence of the workspace folder.
+   *
+   * @param[in] resume if true, resume the training from the latest checkpoint
+   * files.
+   * @param[in] job_conf job configuration.
+   */
+  void Train(bool resume, const JobProto& job_conf);
+  /**
+   * Create workers and servers to conduct the training.
+   *
+   * @param[in] job_conf job configuration with all necessary fields set (e.g.,
+   * by Train(bool, const JobProto&)).
+   */
+  void Train(const JobProto& job_conf);
+  /**
+   * Set the checkpoint field of the job configuration to resume training.
+   *
+   * The checkpoint folder is searched for the files of the latest
+   * checkpoint, which are added to the checkpoint field. The workers
+   * will then load the values of params from the checkpoint files.
+   *
+   * @param job_conf job configuration
+   */
+  void SetupForResume(JobProto* job_conf);
+  /**
+   * Create server instances.
+   *
+   * @param[in] job_conf job configuration.
+   * @param[in] net training neural network.
+   * @return server instances
+   */
+  const vector<Server*> CreateServers(const JobProto& job_conf, NeuralNet* net);
+  /**
+   * Create worker instances.
+   * @param[in] job_conf job configuration.
+   * @param[in] net training neural network.
+   * @return worker instances
+   */
+  const vector<Worker*> CreateWorkers(const JobProto& job_conf, NeuralNet* net);
+
+
+  /*********** Subclass registration ************************/
+  /**
    * Register a Layer subclass.
    *
    * @param type layer type ID. If called to register built-in subclasses,
@@ -103,12 +154,7 @@ class Driver {
   template<typename Subclass, typename Type>
   int RegisterParamGenerator(const Type& type);
 
-  /**
-   * Submit the job configuration for starting the job.
-   * @param resume resume from last checkpoint if true.
-   * @param job job configuration
-   */
-  void Submit(bool resume, const JobProto& job);
+  /****************** Access functions *******************/
   /**
    * @return job ID which is generated by zookeeper and passed in by the
    * launching script.
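
For orientation, the following is a minimal sketch (not part of this commit) of how the refactored Driver API declared above could be driven from a user-side main(); the header paths, the way job_conf gets populated, and the hard-coded resume flag are illustrative assumptions:

    #include "driver.h"
    #include "proto/job.pb.h"

    int main(int argc, char** argv) {
      singa::Driver driver;
      // Parses the job id/conf from the command line and registers the
      // built-in Layer, Worker, Updater and Param subclasses.
      driver.Init(argc, argv);

      // Assume the job configuration has already been loaded into job_conf
      // (e.g., by the launching script); only the Train() entry point is
      // exercised here.
      singa::JobProto job_conf;
      bool resume = false;  // true would restart from the latest checkpoint
      driver.Train(resume, job_conf);
      return 0;
    }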

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/321ef96a/include/neuralnet/connection_layer.h
----------------------------------------------------------------------
diff --git a/include/neuralnet/connection_layer.h b/include/neuralnet/connection_layer.h
index 75f399c..1976fb9 100644
--- a/include/neuralnet/connection_layer.h
+++ b/include/neuralnet/connection_layer.h
@@ -7,9 +7,9 @@
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
-* 
+*
 *   http://www.apache.org/licenses/LICENSE-2.0
-* 
+*
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
@@ -56,12 +56,12 @@ class BridgeLayer : virtual public ConnectionLayer {
  */
 class BridgeDstLayer : public BridgeLayer {
  public:
-  void Setup(const LayerProto& proto, int npartitions) override;
-  void ComputeFeature(int flag, Metric* perf) override {
+  void Setup(const LayerProto& conf, const vector<Layer*>& srclayers) override;
+  void ComputeFeature(int flag, const vector<Layer*>& srclayers) override {
     // reset ready_ for next iteration.
     ready_ = false;
   }
-  void ComputeGradient(int flag, Metric* perf) override {}
+  void ComputeGradient(int flag,  const vector<Layer*>& srclayers) override {}
   bool is_bridgedstlayer() const {
     return true;
   }
@@ -73,25 +73,32 @@ class BridgeDstLayer : public BridgeLayer {
  */
 class BridgeSrcLayer : public BridgeLayer {
  public:
-  void ComputeFeature(int flag, Metric* perf) override {}
-  void ComputeGradient(int flag, Metric* perf) override {
+  void Setup(const LayerProto& conf, const vector<Layer*>& srclayers) override {
+    CHECK_GE(srclayers.size(), 1);
+    srclayer_ = srclayers.at(0);
+  }
+  void ComputeFeature(int flag, const vector<Layer*>& srclayers) override {}
+  void ComputeGradient(int flag,  const vector<Layer*>& srclayers) override {
     ready_ = false;
   }
   const Blob<float>& data(const Layer* from) const override {
-    return srclayers_[0]->data(this);
+    return srclayer_->data(this);
   }
   Blob<float>* mutable_data(const Layer* from) override {
-    return srclayers_[0]->mutable_data(this);
+    return srclayer_->mutable_data(this);
   }
   const Blob<float>& grad(const Layer* from) const override {
-    return srclayers_[0]->grad(this);
+    return srclayer_->grad(this);
   }
   Blob<float>* mutable_grad(const Layer* from) override {
-    return srclayers_[0]->mutable_grad(this);
+    return srclayer_->mutable_grad(this);
   }
   bool is_bridgesrclayer() const override {
     return true;
   }
+
+ private:
+  Layer* srclayer_;
 };
 
 
@@ -103,9 +110,9 @@ class BridgeSrcLayer : public BridgeLayer {
  */
 class ConcateLayer : public ConnectionLayer {
  public:
-  void Setup(const LayerProto& proto, int npartitions) override;
-  void ComputeFeature(int flag, Metric* perf) override;
-  void ComputeGradient(int flag, Metric* perf) override;
+  void Setup(const LayerProto& proto, const vector<Layer*>& srclayers) override;
+  void ComputeFeature(int flag, const vector<Layer*>& srclayers) override;
+  void ComputeGradient(int flag, const vector<Layer*>& srclayers) override;
 };
 
 /**
@@ -116,9 +123,9 @@ class ConcateLayer : public ConnectionLayer {
  */
 class SliceLayer : public ConnectionLayer {
  public:
-  void Setup(const LayerProto& proto, int npartitions) override;
-  void ComputeFeature(int flag, Metric *perf) override;
-  void ComputeGradient(int flag, Metric* perf) override;
+  void Setup(const LayerProto& proto, const vector<Layer*>& srclayers) override;
+  void ComputeFeature(int flag, const vector<Layer*>& srclayers) override;
+  void ComputeGradient(int flag, const vector<Layer*>& srclayers) override;
 
  private:
   std::vector<Blob<float>> datavec_;
@@ -136,9 +143,9 @@ class SliceLayer : public ConnectionLayer {
  */
 class SplitLayer : public ConnectionLayer {
  public:
-  void Setup(const LayerProto& proto, int npartitions) override;
-  void ComputeFeature(int flag, Metric* perf) override;
-  void ComputeGradient(int flag, Metric* perf) override;
+  void Setup(const LayerProto& proto, const vector<Layer*>& srclayers) override;
+  void ComputeFeature(int flag, const vector<Layer*>& srclayers) override;
+  void ComputeGradient(int flag, const vector<Layer*>& srclayers) override;
 
  protected:
   Blob<float> grads_;

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/321ef96a/include/neuralnet/input_layer.h
----------------------------------------------------------------------
diff --git a/include/neuralnet/input_layer.h b/include/neuralnet/input_layer.h
index 709912d..b5f2dd4 100644
--- a/include/neuralnet/input_layer.h
+++ b/include/neuralnet/input_layer.h
@@ -7,9 +7,9 @@
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
-* 
+*
 *   http://www.apache.org/licenses/LICENSE-2.0
-* 
+*
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
@@ -32,8 +32,8 @@
  *
  * The feature loading phase can be implemented using a single layer or
  * separated into DataLayer (for loading features as records) and ParserLayer
- * (for parsing features from records). SINGA has provided some built-in layers
- * for DataLayer and ParserLayer.
+ * (for parsing features from records). SINGA has provided some subclasses of
+ * DataLayer and ParserLayer.
  *
  * Data prefetching can be implemented as a sub-class of InputLayer.
  * SINGA provides a built-in PrefetchLayer which embeds DataLayer and
@@ -41,20 +41,15 @@
  */
 namespace singa {
 /**
- * Base layer for reading records from local Shard, HDFS, lmdb, etc.
+ * Base layer for reading ::Record from local Shard, HDFS, lmdb, etc.
  */
 class DataLayer: virtual public InputLayer {
  public:
-  void ComputeGradient(int flag, Metric* perf) override {}
-  Blob<float>* mutable_data(const Layer* layer) override {
-    return nullptr;
-  }
-  Blob<float>* mutable_grad(const Layer* layer) override {
-    return nullptr;
-  }
+  Blob<float>* mutable_data(const Layer* layer) override { return nullptr; }
   ConnectionType dst_layer_connection() const override {
     return kOneToMany;
   }
+
   inline int batchsize() const { return batchsize_; }
   virtual const Record& sample() const {
     return sample_;
@@ -81,8 +76,8 @@ class ShardDataLayer : public DataLayer {
  public:
   ~ShardDataLayer();
 
-  void Setup(const LayerProto& proto, int npartitions) override;
-  void ComputeFeature(int flag, Metric *perf) override;
+  void Setup(const LayerProto& proto, const vector<Layer*>& srclayers) override;
+  void ComputeFeature(int flag, const vector<Layer*>& srclayers) override;
 
  private:
   DataShard* shard_;
@@ -94,9 +89,9 @@ class LMDBDataLayer : public DataLayer {
  public:
   ~LMDBDataLayer();
 
-  void Setup(const LayerProto& proto, int npartitions) override;
+  void Setup(const LayerProto& proto, const vector<Layer*>& srclayers) override;
   void OpenLMDB(const std::string& path);
-  void ComputeFeature(int flag, Metric *perf) override;
+  void ComputeFeature(int flag, const vector<Layer*>& srclayers) override;
   void ConvertCaffeDatumToRecord(const CaffeDatum& datum,
                                  SingleLabelImageRecord* record);
 
@@ -114,8 +109,8 @@ class LMDBDataLayer : public DataLayer {
  */
 class ParserLayer : public InputLayer {
  public:
-  void ComputeFeature(int flag, Metric* perf) override;
-  void ComputeGradient(int flag, Metric* perf) override {}
+  void ComputeFeature(int flag, const vector<Layer*>& srclayers) override;
+  void ComputeGradient(int flag, const vector<Layer*>& srclayers) override {}
   ConnectionType dst_layer_connection() const override {
     return kOneToMany;
   }
@@ -124,13 +119,6 @@ class ParserLayer : public InputLayer {
    */
   virtual void ParseRecords(int flag, const std::vector<Record>& records,
       Blob<float>* blob) = 0;
-  Blob<float>* mutable_grad(const Layer* layer) override {
-    return nullptr;
-  }
-  const Blob<float>& grad(const Layer* from) const  override {
-    CHECK(false) << "Parser layer has not gradient blob";
-    return grad_;
-  }
 };
 
 /**
@@ -138,7 +126,7 @@ class ParserLayer : public InputLayer {
  */
 class LabelLayer : public ParserLayer {
  public:
-  void Setup(const LayerProto& proto, int npartitions) override;
+  void Setup(const LayerProto& proto, const vector<Layer*>& srclayers) override;
   void ParseRecords(int flag, const std::vector<Record>& records,
                     Blob<float>* blob) override;
 };
@@ -148,7 +136,7 @@ class LabelLayer : public ParserLayer {
  */
 class MnistLayer : public ParserLayer {
  public:
-  void Setup(const LayerProto& proto, int npartitions) override;
+  void Setup(const LayerProto& proto, const vector<Layer*>& srclayers) override;
   void ParseRecords(int flag, const std::vector<Record>& records,
                     Blob<float>* blob) override;
 
@@ -161,7 +149,7 @@ class MnistLayer : public ParserLayer {
  */
 class RGBImageLayer : public ParserLayer {
  public:
-  void Setup(const LayerProto& proto, int npartitions) override;
+  void Setup(const LayerProto& proto, const vector<Layer*>& srclayers) override;
   void ParseRecords(int flag, const std::vector<Record>& records,
                     Blob<float>* blob) override;
 
@@ -181,8 +169,8 @@ class RGBImageLayer : public ParserLayer {
 class PrefetchLayer : public Layer {
  public:
   ~PrefetchLayer();
-  void ComputeFeature(int flag, Metric* perf) override;
-  void ComputeGradient(int flag, Metric* perf) override {}
+  void ComputeFeature(int flag, const vector<Layer*>& srclayers) override;
+  void ComputeGradient(int flag, const vector<Layer*>& srclayers) override {}
 
  protected:
   std::thread thread_;

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/321ef96a/include/neuralnet/layer.h
----------------------------------------------------------------------
diff --git a/include/neuralnet/layer.h b/include/neuralnet/layer.h
index 05377b1..bf83163 100644
--- a/include/neuralnet/layer.h
+++ b/include/neuralnet/layer.h
@@ -7,9 +7,9 @@
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
-* 
+*
 *   http://www.apache.org/licenses/LICENSE-2.0
-* 
+*
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
@@ -33,16 +33,22 @@
 #include "utils/param.h"
 
 namespace singa {
-
+using std::vector;
 /**
  * Base layer class.
  *
- * Children should implement at least
+ * Subclasses should implement at least
  * Layer::ComputeFeature() and Layer::ComputGradient()
- * functions for contrastive-divergence/back-propagation algorithm.
+ * functions in accordance with the NeuralNet::TrainOneBatch function.
  */
 class Layer {
  public:
+  /**
+   * Create a layer instance of the subclass specified by proto.type().
+   *
+   * @param proto configuration of the layer instance.
+   * @return pointer to the newly created layer instance.
+   */
   static Layer* Create(const LayerProto& proto);
 
   Layer() {}
@@ -50,49 +56,51 @@ class Layer {
   /**
    * Setup layer properties.
    *
-   * Setup the shapes for data and parameters, also setup some properties
-   * based on the layer configuration and connected layers.
+   * Set up members, e.g., the shapes of Param objects, based on the layer
+   * configuration and connected layers.
+   * It should check the partition settings when setting up the properties.
    *
-   * @param proto layer configuration.
-   * @param npartitions num of total partitions of the original layer. This
-   * layer should be setup as one partition.
+   * @param conf layer configuration.
+   * @param srclayers source layers that connect to this layer.
    */
-  virtual void Setup(const LayerProto& proto, int npartitions = 1) {
-    CHECK_GE(npartitions, 1);
-    layer_proto_ = proto;
+  virtual void Setup(const LayerProto& conf, const vector<Layer*>& srclayers) {
+    layer_conf_ = conf;
   }
   /**
    * Compute features of this layer based on connected layers.
    *
-   * @param perf pointer to Metric obj for collect and aggregate performance
+   * @param[in] flag set by the TrainOneBatch function, e.g., to indicate the
+   * running phase (kForward|kTrain, kForward|kTest, etc).
+   * @param[in] srclayers source layers that connect to this layer.
    */
-  virtual void ComputeFeature(int flag, Metric* perf) = 0;
+  virtual void ComputeFeature(int flag, const vector<Layer*>& srclayers) = 0;
   /**
-   * Compute gradients for parameters and connected layers.
-   * @param flag used to get the calling phase, e.g., forward of training
-   * (kForward | kTrain)
-   * @param flag used to get the calling phase, e.g., forward of training
+   * Compute gradients for parameters associated with this layer.
+   * It may also compute the gradients of the loss w.r.t. the source layers.
+   *
+   * \copydetails ComputeFeature().
    */
-  virtual void ComputeGradient(int flag, Metric* perf) = 0;
+  virtual void ComputeGradient(int flag, const vector<Layer*>& srclayers) = 0;
   /**
-   * Layers that have paramters must override this function.
-   * @param flag used to get the calling phase, e.g., forward of training
-   * (kForward | kTrain)
-   * @return parameters associated with this layer
+   * Layers that have parameters must override this function to return all Param
+   * objects associated with this layer.
+   *
+   * @return parameters associated with this layer.
    */
   virtual const std::vector<Param*> GetParams() const {
     return std::vector<Param*> {};
   }
   /**
-   * Return the connection type between one neuron of this layer and
-   * its source layer.
+   * Return the connection type between one neuron of this layer and its source
+   * layer.
+   *
    * Currently support two connection types: kOneToOne, and kOneToAll.
-   * kOneToOne indicates the neuron depends on only one neuron from src layer.
-   * kOneToAll indicates the neuron depends on all neurons from src layer.
+   * - kOneToOne indicates the neuron depends on only one neuron from src layer.
+   * - kOneToAll indicates the neuron depends on all neurons from src layer.
    * TODO(wangwei) support kOneToMany.
    *
-   * @param k index of source layer (current only support k = 0.
-   * @param connection type.
+   * @param[in] k index of the source layer; currently only k = 0 is supported.
+   * @return connection type.
    */
   virtual ConnectionType src_neuron_connection(int k) const {
     // CHECK_LT(k, srclayers_.size());
@@ -102,89 +110,101 @@ class Layer {
    * Return the connection type of this layer and all dst layers.
    *
    * Currently support two connection types: kOneToOne, and kOneToMany.
-   * kOneToOne indicates the users implement the ComputeFeature and
-   * ComputeGradient function considering only one dest layer. In this case,
+   * - kOneToOne indicates the users implement the ComputeFeature and
+   * ComputeGradient function considering only one dst layer. In this case,
    * a SplitLayer will be added automatically to connect this layer with all
    * dest layer.
-   * kOneToMany indicates the users has already considered multiple dest layers
-   * in the implementation.
+   * - kOneToMany indicates this layer has already considered multiple dst
+   *   layers in the implementation.
+   *
    * @return connection type default is kOneToOne.
    */
   virtual ConnectionType dst_layer_connection() const {
     return kOneToOne;
   }
   /**
-   * For print debug info about each layer, e.g., norm of feature vector,
-   * norm of parameters.
+   * To display layer info, e.g., aggregated loss/accuracy, or norm of feature
+   * vector and norm of parameters.
    *
-   * @param step training/test/validation step
-   * @param flag used to get the calling phase, e.g., forward of training
-   * (kForward | kTrain)
-   * @return debug info about this layer.
+   * @param[in] debug whether to print the debug info
+   * @param[in] flag used to get the calling phase, e.g., forward of training
+   * (kForward | kTrain).
+   * @return info string about this layer, which is printed into the log.
    */
-  virtual const std::string DebugString(int step, int flag);
+  virtual const std::string ToString(bool debug, int flag);
   /**
-   * @return partition dimension of this layer.
-   * -1 for no partition;
-   *  0 for partition the mini-batch into sub-mini-batch.
-   *  1 for partition the layer feature vector into sub-vector.
+   * @return partition dimension of this layer,
+   * - -1 for no partition.
+   * -  0 for partition on the data dimension, i.e., partitioning the mini-batch
+   *    into sub-mini-batches.
+   * -  1 for partition this layer on feature dimension, i.e., the feature
+   *    vector of each instance is partitioned into sub-vectors.
    */
   inline int partition_dim() const {
-    CHECK_LE(layer_proto_.partition_dim(), 1);
-    return layer_proto_.partition_dim();
+    CHECK_LE(layer_conf_.partition_dim(), 1);
+    return layer_conf_.partition_dim();
   }
-  inline int partition_id() const { return layer_proto_.partition_id(); }
-  inline int type() const { return layer_proto_.type(); }
   /**
-   * Return name of this layer
+   * @return the partition ID (i.e., the worker ID to whom this layer is
+   * dispatched) of this layer, which is a sublayer partitioned from the
+   * original layer.
+   */
+  inline int partition_id() const { return layer_conf_.partition_id(); }
+  /**
+   * @return total number of partitions (i.e., sub-layers) of the original
+   * layer from which this layer is partitioned.
+   */
+  inline int num_partitions() const { return layer_conf_.num_partitions(); }
+  /**
+   * @return the type of this layer; only valid for built-in layer types.
    */
-  inline const std::string &name() const { return layer_proto_.name(); }
+  inline LayerType type() const { return layer_conf_.type(); }
   /**
-   * @return name of src data blob, used by prefetch layer to locate the data
-   * blob in parser layers; The default value is "unknown"; If the
-   * src layer is the prefetch layer and there are more than one parser layers,
-   * this value be set.
-  const std::string &datablob() const {
-    return layer_proto_.datablob();
+   * @return user-defined layer type.
+   */
+  inline const std::string& user_type() const {
+    return layer_conf_.user_type();
   }
+  /**
+   * @return name of this layer.
    */
+  inline const std::string& name() const { return layer_conf_.name(); }
   /**
-   * @return a const ref for Blob storing neuron values of this layer for BP
+   * @param[in] from pointer to one of the dst layers. Some layers have
+   * more than one data Blob. In this case, this argument identifies the layer
+   * that is requesting the data Blob.
+   * @return a const ref for Blob storing feature values of this layer.
    */
   virtual const Blob<float>& data(const Layer* from) const {
     return data_;
   }
+  /**
+   * @see data().
+   * @return the pointer to the Blob storing feature values of this layer.
+   */
   virtual Blob<float>* mutable_data(const Layer* from) {
     return &data_;
   }
+  /**
+   * @see data().
+   * @return the const ref of the Blob for the gradient of this layer, mainly
+   * used in BP algorithm.
+   */
   virtual const Blob<float>& grad(const Layer* from) const {
     return grad_;
   }
   /**
-   * @return a pointer to storing neuron grads of this layer for BP
+   * @see data().
+   * @return a pointer to the Blob storing gradients of this layer, mainly
+   * used in BP algorithm.
    */
   virtual Blob<float>* mutable_grad(const Layer* from) {
     return &grad_;
   }
-  /**
-   * return LayerS that connected to this layer
-   */
-  inline const std::vector<Layer*> srclayers() const { return srclayers_; }
-  /**
-   * return LayerS that this layer connected to
-   */
-  inline const std::vector<Layer*> dstlayers() const { return dstlayers_; }
-  inline int srclayers_size() const { return srclayers_.size(); }
-  inline int dstlayers_size() const { return dstlayers_.size(); }
-  inline void clear_dstlayers() { dstlayers_.clear(); }
-  inline void clear_srclayers() { srclayers_.clear(); }
-  inline void add_srclayer(Layer* src) { srclayers_.push_back(src); }
-  inline void add_dstlayer(Layer* dst) { dstlayers_.push_back(dst); }
 
  protected:
-  LayerProto layer_proto_;
+  LayerProto layer_conf_;
   Blob<float> data_, grad_;
-  std::vector<Layer*> srclayers_, dstlayers_;
 };
 
 /**
@@ -199,29 +219,59 @@ class ConnectionLayer : virtual public Layer {
  * parsing records.
  */
 class InputLayer : virtual public Layer {
-  // defined as a layer category
+ public:
+  void ComputeGradient(int flag, const vector<Layer*>& srclayers) override {}
+  Blob<float>* mutable_grad(const Layer* layer) override {
+    // LOG(FATAL) << "Input layer has no gradient blob";
+    return nullptr;
+  }
+  const Blob<float>& grad(const Layer* from) const override {
+    // LOG(FATAL) << "Input layer has no gradient blob";
+    return grad_;
+  }
 };
 
 
+/**
+ * Base layer for calculating loss and doing BackPropagation.
+ */
+class LossLayer : virtual public Layer {
+ public:
+  const std::string ToString(bool debug, int flag) override;
+  Blob<float>* mutable_grad(const Layer* layer) override {
+    LOG(FATAL) << "Loss layer has no gradient blob";
+    return nullptr;
+  }
+  const Blob<float>& grad(const Layer* from) const override {
+    LOG(FATAL) << "Loss layer has no gradient blob";
+    return grad_;
+  }
+ protected:
+  Metric metric_;
+};
+
+/**
+ * Base layer for feature transformation, e.g., ConvolutionLayer, PoolingLayer,
+ * etc.
+ */
 class NeuronLayer : virtual public Layer {
   // defined as a layer category
 };
 
 /**
- * Base layer for calculating loss and other metrics, e.g., precison.
+ * Base layer for collecting features into disk file, HTTP stream, etc.
  */
-class LossLayer : virtual public Layer {
+class OutputLayer : virtual public Layer {
  public:
+  void ComputeGradient(int flag, const vector<Layer*>& srclayers) override {}
   Blob<float>* mutable_grad(const Layer* layer) override {
+    LOG(FATAL) << "Loss layer has no gradient blob";
     return nullptr;
   }
   const Blob<float>& grad(const Layer* from) const override {
     LOG(FATAL) << "Loss layer has no gradient blob";
     return grad_;
   }
-
- protected:
-  Blob<float> metric_;
 };
 
 }  // namespace singa
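
To make the new Setup/ComputeFeature/ComputeGradient signatures above concrete, here is a minimal sketch (not code from this commit) of a user-defined feature-transformation layer; the layer name ScaleLayer, the fixed factor, and the Caffe-style Blob accessors (ReshapeLike, count, cpu_data, mutable_cpu_data) are assumptions based on the Blob class bundled with SINGA:

    #include <vector>
    #include "neuralnet/layer.h"

    namespace singa {

    // Multiplies the features of its single source layer by a fixed factor;
    // the backward pass scales the incoming gradient by the same factor.
    class ScaleLayer : public NeuronLayer {
     public:
      void Setup(const LayerProto& conf, const vector<Layer*>& srclayers) override {
        Layer::Setup(conf, srclayers);
        data_.ReshapeLike(srclayers.at(0)->data(this));
        grad_.ReshapeLike(data_);
      }
      void ComputeFeature(int flag, const vector<Layer*>& srclayers) override {
        const float* src = srclayers.at(0)->data(this).cpu_data();
        float* dst = data_.mutable_cpu_data();
        for (int i = 0; i < data_.count(); ++i)
          dst[i] = 2.0f * src[i];
      }
      void ComputeGradient(int flag, const vector<Layer*>& srclayers) override {
        const float* g = grad_.cpu_data();
        float* src_grad = srclayers.at(0)->mutable_grad(this)->mutable_cpu_data();
        for (int i = 0; i < grad_.count(); ++i)
          src_grad[i] = 2.0f * g[i];
      }
    };

    }  // namespace singa

Registration of such a layer with the Driver and the corresponding entry in the job configuration are omitted here.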

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/321ef96a/include/neuralnet/loss_layer.h
----------------------------------------------------------------------
diff --git a/include/neuralnet/loss_layer.h b/include/neuralnet/loss_layer.h
index 3af0b46..a48a8e7 100644
--- a/include/neuralnet/loss_layer.h
+++ b/include/neuralnet/loss_layer.h
@@ -7,9 +7,9 @@
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
-* 
+*
 *   http://www.apache.org/licenses/LICENSE-2.0
-* 
+*
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
@@ -22,30 +22,36 @@
 #ifndef SINGA_NEURALNET_LOSS_LAYER_H_
 #define SINGA_NEURALNET_LOSS_LAYER_H_
 
+#include <vector>
 #include "neuralnet/layer.h"
 
 /**
- * \file this file includes the declarations of layers that inherit the base
+ * @file this file includes the declarations of layers that inherit the base
  * LossLayer for measuring the objective training loss.
  */
 namespace singa {
+using std::vector;
 /**
- * Squared Euclidean loss as 0.5 ||predict - ground_truth||^2.
+ * Squared Euclidean loss as @f$0.5 ||p - t||^2@f$, where p is the prediction
+ * and t is the ground truth.
  */
 class EuclideanLossLayer : public LossLayer {
  public:
-  void ComputeFeature(int flag, Metric* perf) override;
-  void ComputeGradient(int flag, Metric* perf) override;
+  void Setup(const LayerProto& conf, const vector<Layer*>& srclayers) override;
+  void ComputeFeature(int flag, const vector<Layer*>& srclayers) override;
+  void ComputeGradient(int flag, const vector<Layer*>& srclayers) override;
 };
 
 /**
- * Cross-entropy loss applied to the probabilities after Softmax.
+ * Cross-entropy loss applied to the probabilities computed from Softmax.
+ * @f$ L_i = -\log P_{t_i} @f$, where @f$ t_i \in [0, C) @f$ is the label of
+ * the i-th instance and C is the total number of classes.
  */
 class SoftmaxLossLayer : public LossLayer {
  public:
-  void Setup(const LayerProto& proto, int npartitions) override;
-  void ComputeFeature(int flag, Metric* perf) override;
-  void ComputeGradient(int flag, Metric* perf) override;
+  void Setup(const LayerProto& conf, const vector<Layer*>& srclayers) override;
+  void ComputeFeature(int flag, const vector<Layer*>& srclayers) override;
+  void ComputeGradient(int flag, const vector<Layer*>& srclayers) override;
 
   /**
    * softmax is not recommendeded for partition because it requires the whole
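
As a side note (a standard derivation, not text from this commit), fusing the softmax with the cross-entropy loss keeps the backward pass simple, which is presumably why SoftmaxLossLayer computes both. With softmax outputs p_{i,j} over the logits o_{i,j} and label t_i, the gradient of L_i with respect to the logits is

    \frac{\partial L_i}{\partial o_{i,j}} = p_{i,j} - \mathbb{1}[j = t_i],

so the gradient can be formed directly from the softmax outputs and the labels.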

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/321ef96a/include/neuralnet/neuralnet.h
----------------------------------------------------------------------
diff --git a/include/neuralnet/neuralnet.h b/include/neuralnet/neuralnet.h
index 693fe19..a202f44 100644
--- a/include/neuralnet/neuralnet.h
+++ b/include/neuralnet/neuralnet.h
@@ -7,9 +7,9 @@
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
-* 
+*
 *   http://www.apache.org/licenses/LICENSE-2.0
-* 
+*
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
@@ -24,6 +24,7 @@
 
 #include <string>
 #include <vector>
+#include <unordered_map>
 
 #include "neuralnet/layer.h"
 #include "proto/job.pb.h"
@@ -31,7 +32,6 @@
 #include "utils/graph.h"
 
 namespace singa {
-
 /**
  * The neural network is constructed from user configurations in NetProto.
  *
@@ -60,23 +60,27 @@ class NeuralNet {
    * @param netproto neural net config
    * @param npartitions num of partitions. 1 for no partitioning.
    */
-  NeuralNet(NetProto netproto, int npartitions);
+  NeuralNet(NetProto net_conf, int num_partitions);
   ~NeuralNet();
   /**
    * To display the adjacency layers
-   */
   std::string ToAdjacency();
+   */
   /**
    * Share memory of parameter values from other neuralnet
    */
   void ShareParamsFrom(NeuralNet* other);
-  inline const std::vector<Layer*>& layers() { return layers_; }
+  inline const std::vector<Layer*>& layers() const { return layers_; }
   inline const std::vector<Param*>& params() const { return params_; }
   inline Layer* name2layer(std::string name) const {
-    if (name2layer_.find(name) != name2layer_.end())
-      return name2layer_.at(name);
-    else
-      return nullptr;
+    CHECK(name2layer_.find(name) != name2layer_.end())
+      << "No layer with name " << name;
+    return name2layer_.at(name);
+  }
+  inline const std::vector<Layer*>& srclayers(const Layer* layer) const {
+    CHECK(src_map_.find(layer) != src_map_.end())
+      << "layer (" << layer->name() << " ) has no source layers";
+    return src_map_.at(layer);
   }
   inline Param* paramid2param(int id) const { return paramid2param_.at(id); }
 
@@ -90,11 +94,11 @@ class NeuralNet {
    * @npartitions
    * @return neural net graph
    */
-  Graph* CreateGraph(const NetProto& netproto, int npartitions);
+  Graph* CreateGraph(const NetProto& netproto, int num_partitions);
   /**
    * Create neural net from graph, one layer per node.
    */
-  void CreateNetFromGraph(Graph* graph, int npartitions);
+  void CreateNetFromGraph(Graph* graph, int num_partitions);
   /**
    * prepare data structures, e.g., params_, layers_, etc.
    */
@@ -104,8 +108,9 @@ class NeuralNet {
   std::vector<Layer*> layers_;
   std::vector<Param*> params_;
 
-  std::map<std::string, Layer*> name2layer_;
-  std::map<int, Param*> paramid2param_;
+  std::unordered_map<std::string, Layer*> name2layer_;
+  std::unordered_map<int, Param*> paramid2param_;
+  std::unordered_map<const Layer*, std::vector<Layer*>> src_map_;
 };
 
 }  // namespace singa


[05/13] incubator-singa git commit: SINGA-72 Minor updates to be consistent with documentation

Posted by wa...@apache.org.
SINGA-72 Minor updates to be consistent with documentation

Update install.sh to make OpenBLAS with ONLY_CBLAS=1, which removes the
dependency on gfortran. Thanks to ZhangHao for reporting this solution.

Correct the configuration for rnnlm example (replacing test with
validation).

Add the license of cpplint.py to the LICENSE file.

Add the solution for the OpenBLAS installation error to README.md.


Project: http://git-wip-us.apache.org/repos/asf/incubator-singa/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-singa/commit/ab984da8
Tree: http://git-wip-us.apache.org/repos/asf/incubator-singa/tree/ab984da8
Diff: http://git-wip-us.apache.org/repos/asf/incubator-singa/diff/ab984da8

Branch: refs/heads/master
Commit: ab984da88cb6809ced489e0a2377e50c1856827d
Parents: 243f210
Author: Wei Wang <wa...@comp.nus.edu.sg>
Authored: Sat Sep 26 23:16:15 2015 +0800
Committer: Wei Wang <wa...@comp.nus.edu.sg>
Committed: Sat Sep 26 23:27:46 2015 +0800

----------------------------------------------------------------------
 LICENSE                 |  13 ++--
 README.md               |  43 +++++++-----
 RELEASE_NOTES           |   3 +-
 examples/rnnlm/job.conf |   8 +--
 src/test/test_common.cc |  21 ++++++
 thirdparty/install.sh   | 156 +++++++++++++++++++++----------------------
 6 files changed, 139 insertions(+), 105 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/ab984da8/LICENSE
----------------------------------------------------------------------
diff --git a/LICENSE b/LICENSE
index 64f06dd..7e79eef 100644
--- a/LICENSE
+++ b/LICENSE
@@ -206,8 +206,8 @@ subcomponents is subject to the terms and conditions of the following
 licenses.
 
 ============================================================================
-SINGA bundles the following under BSD 2-clause license: include/utils/blob.h, src/utils/blob.cc,
-include/utils/common.h, src/utils/common.cc
+SINGA bundles the following under BSD 2-clause license: include/utils/blob.h,
+src/utils/blob.cc, include/utils/common.h, src/utils/common.cc
 
 Copyright (c) 2014, 2015, The Regents of the University of California (Regents)
 https://github.com/BVLC/caffe/blob/master/LICENSE
@@ -216,7 +216,7 @@ https://github.com/BVLC/caffe/blob/master/LICENSE
 SINGA bundles the following under BSD 2-clause license: include/utils/tinydir.h
 
 Copyright (c) 2013, Cong Xu
-https://github.com/cxong/tinydir/blob/master/COPYING 
+https://github.com/cxong/tinydir/blob/master/COPYING
 
 =====================================================================
 SINGA bundles the following under Apache v2.0 license: include/mshadow/*
@@ -225,10 +225,13 @@ Copyright (c) 2014 by Contributors
 https://github.com/dmlc/mshadow/blob/master/LICENSE
 
 =====================================================================
-SINGA bundles the following under New BSD license: include/mshadow/*
+SINGA bundles the following under New BSD license: include/gtest/*
 
 Copyright 2008, Google Inc.
 https://code.google.com/p/googletest/source/browse/trunk/LICENSE
-=====================================================================
 
+=====================================================================
+SINGA bundles the following under New BSD license: tool/cpplint.py
 
+Copyright (c) 2009 Google Inc. All rights reserved.
+https://github.com/google/styleguide/tree/gh-pages/cpplint

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/ab984da8/README.md
----------------------------------------------------------------------
diff --git a/README.md b/README.md
index af903a9..bf7874a 100644
--- a/README.md
+++ b/README.md
@@ -1,4 +1,3 @@
-
 #Apache SINGA
 
 Distributed deep learning system
@@ -22,13 +21,20 @@ The current code depends on the following external libraries:
   * czmq (Mozilla Public License Version 2.0)
   * zookeeper (Apache 2.0)
 
-To install openblas, you need a fortran compiler.
+You can install all dependencies into the $PREFIX folder by
+
+    ./thirdparty/install.sh all $PREFIX
+
+If $PREFIX is not a system path (e.g., /usr/local/), you have to export the
+following environment variables,
+
+    export LD_LIBRARY_PATH=$PREFIX/lib:$LD_LIBRARY_PATH
+    export CPLUS_INCLUDE_PATH=$PREFIX/include
 
 ##Documentation
 
 Full documentation is available online at [Official Documentation](https://singa.incubator.apache.org/docs/overview.html#).
 
-
 ##Building SINGA
 
     $ ./autogen.sh (optional)
@@ -38,8 +44,7 @@ Full documentation is available online at [Official Documentation](https://singa
 
 ##Running Examples
 
-Let us train the [CNN
-model](http://papers.nips.cc/paper/4824-imagenet-classification-with-deep-convolutional-neural-networks) over the
+Let us train the [CNN model](http://papers.nips.cc/paper/4824-imagenet-classification-with-deep-convolutional-neural-networks) over the
 [CIFAR-10](http://www.cs.toronto.edu/~kriz/cifar.html) dataset without parallelism as an example. The hyper-parameters
 are set following [cuda-convnet](https://code.google.com/p/cuda-convnet/). More details about this example are available
 at [CNN example](http://singa.incubator.apache.org/docs/cnn).
@@ -54,7 +59,7 @@ First, download the dataset and create data shards:
 If it reports errors due to libopenblas.so missing, then include the
 lib folder of OpenBLAS in LD_LIBRARY_PATH
 
-    $ export LD_LIBRARY_PATH= OPENBLAS_FOLDER/lib:$LD_LIBRARY_PATH
+    $ export LD_LIBRARY_PATH=$OPENBLAS_FOLDER/lib:$LD_LIBRARY_PATH
     # delete the newly created folders
     $ rm -rf cifar10_t*
     $ make create
@@ -69,7 +74,7 @@ Now we just need to wait until it is done!
 
 ##LICENSE
 
-Apache Singa is licensed under the [Apache License, Version 2.0](http://www.apache.org/licenses/LICENSE-2.0).
+Apache SINGA is licensed under the [Apache License, Version 2.0](http://www.apache.org/licenses/LICENSE-2.0).
 
 For additional information, see the LICENSE and NOTICE files.
 
@@ -80,20 +85,20 @@ run `install.sh OpenBLAS`.
 
   A1: `OpenBLAS` library is installed in `/opt` folder by default or
   other folders if you use `sudo apt-get install`.
-  You need to include the OpenBLAS library folder in the LDFLAGS.
+  You need to include the OpenBLAS library folder in the LDFLAGS, e.g.,
 
-      $ export LDFLAGS=-L/opt/OpenBLAS/lib
+      $ export LDFLAGS=-L/opt/OpenBLAS
 
   Alternatively, you can include the path in LIBRARY_PATH.
 
 
-* Q2: I get error `cblas.h not such file or directory exists`.
+* Q2: I get error `cblas.h no such file or directory exists`.
 
-  Q2: You need to include the folder of the cblas.h (e.g., /opt/OpenBLAS/include)
-  into CPLUS_INCLUDE_PATH
+  A2: You need to include the folder containing cblas.h in CPLUS_INCLUDE_PATH,
+  e.g.,
 
       $ export CPLUS_INCLUDE_PATH=/opt/OpenBLAS/include:$CPLUS_INCLUDE_PATH
-      # reconfigure and make
+      # reconfigure and make SINGA
       $ ./configure
       $ make
 
@@ -147,12 +152,16 @@ google.protobuf.internal when I try to import .py files.
 
 * Q8: When I build OpenBLAS from source, I am told that I need a fortran compiler.
 
-  A8: Since OpenBLAS use fortran compiler to build the library, you need a compiler with fortran support. As an alternative, you can build OpenBLAS from system tools. For example, if you have APT, just run:
-	 
+  A8: You can compile OpenBLAS by
+
+      $ make ONLY_CBLAS=1
+
+  or install it using
+
 	  $ sudo apt-get install openblas
 
-  or you can also run the following command if you have yum:
+  or
 
 	  $ sudo yum install openblas-devel
 
-  It is worth noting that you need root access to run the aforementioned commands.
+  It is worth noting that you need root access to run the last two commands.

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/ab984da8/RELEASE_NOTES
----------------------------------------------------------------------
diff --git a/RELEASE_NOTES b/RELEASE_NOTES
index 8c5eae0..b2dd00f 100644
--- a/RELEASE_NOTES
+++ b/RELEASE_NOTES
@@ -1,4 +1,4 @@
-Release Notes - SINGA - Version singa-incubating-0.1-rc1
+Release Notes - SINGA - Version singa-incubating-0.1.0-rc1
 -----------------------------------------
 
 SINGA is a general distributed deep learning platform for training big deep learning models over large datasets. It is
@@ -19,6 +19,7 @@ This release includes following features:
     * [SINGA-39] - Avoid ssh in scripts for single node environment
     * [SINGA-43] - Remove Job-related output from workspace
     * [SINGA-56] - No automatic launching of zookeeper service
+    * [SINGA-73] - Refine the selection of available hosts from host list
 
   * Installation with GNU Auto tool
     * [SINGA-4] - Refine thirdparty-dependency installation

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/ab984da8/examples/rnnlm/job.conf
----------------------------------------------------------------------
diff --git a/examples/rnnlm/job.conf b/examples/rnnlm/job.conf
index 5779ff6..db96e84 100644
--- a/examples/rnnlm/job.conf
+++ b/examples/rnnlm/job.conf
@@ -2,8 +2,8 @@ name: "rnnlm"
 #To scan the training file (81350) 10 times
 train_steps:81350
 #To scan the validation file (6828) once
-test_steps:683
-test_freq:8135
+valid_steps:683
+valid_freq:8135
 #disp_freq is specific to training
 disp_freq:8135
 train_one_batch {
@@ -36,14 +36,14 @@ layer {
     path: "examples/rnnlm/train_shard"
     max_window: 10
   }
-  exclude: kTest
+  exclude: kValidation
 }
 
 layer {
   name: "data"
   user_type: "kData"
   [data_conf] {
-    path: "examples/rnnlm/test_shard"
+    path: "examples/rnnlm/valid_shard"
     max_window: 10
   }
   exclude: kTrain

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/ab984da8/src/test/test_common.cc
----------------------------------------------------------------------
diff --git a/src/test/test_common.cc b/src/test/test_common.cc
index e30c9cb..03a02fb 100644
--- a/src/test/test_common.cc
+++ b/src/test/test_common.cc
@@ -1,3 +1,24 @@
+/************************************************************
+*
+* Licensed to the Apache Software Foundation (ASF) under one
+* or more contributor license agreements.  See the NOTICE file
+* distributed with this work for additional information
+* regarding copyright ownership.  The ASF licenses this file
+* to you under the Apache License, Version 2.0 (the
+* "License"); you may not use this file except in compliance
+* with the License.  You may obtain a copy of the License at
+*
+*   http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing,
+* software distributed under the License is distributed on an
+* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+* KIND, either express or implied.  See the License for the
+* specific language governing permissions and limitations
+* under the License.
+*
+*************************************************************/
+
 #include <string>
 #include <unordered_map>
 #include <vector>

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/ab984da8/thirdparty/install.sh
----------------------------------------------------------------------
diff --git a/thirdparty/install.sh b/thirdparty/install.sh
index b9a3095..86211d6 100755
--- a/thirdparty/install.sh
+++ b/thirdparty/install.sh
@@ -64,7 +64,7 @@ function install_czmq()
 	fi
 	rm -rf czmq-3.0.0;
 	tar zxvf czmq-3.0.0-rc1.tar.gz && cd czmq-3.0.0;
-	
+
 	if [ $# == 2 ]
 	then
 		if [ $1 == "null" ]
@@ -109,7 +109,7 @@ function install_glog()
 	then
 		wget http://www.comp.nus.edu.sg/~dbsystem/singa/assets/file/thirdparty/glog-0.3.3.tar.gz;
 	fi
-	
+
 	rm -rf glog-0.3.3;
 	tar zxvf glog-0.3.3.tar.gz && cd glog-0.3.3;
 
@@ -178,7 +178,7 @@ function install_openblas()
 	rm -rf OpenBLAS-develop;
 	unzip OpenBLAS.zip && cd OpenBLAS-develop;
 
-	make;
+	make ONLY_CBLAS=1;
 	if [ $? -ne 0 ]
 	then
 		cd ..;
@@ -195,7 +195,7 @@ function install_openblas()
 			fi
 		elif [ $# == 0 ]
 		then
-			echo "install OpenBLAS in default path" 
+			echo "install OpenBLAS in default path"
 			sudo make install;
 			if [ $? -ne 0 ]
 			then
@@ -219,7 +219,7 @@ function install_opencv()
 
 	rm -rf opencv-2.4.10;
 	unzip opencv-2.4.10.zip && cd opencv-2.4.10;
-	
+
 	if [ $# == 1 ]
 		then
 			echo "install opencv in $1";
@@ -239,7 +239,7 @@ function install_opencv()
 		cd ..;
 		return -1;
 	fi
-	cd ..;	
+	cd ..;
 	return 0;
 }
 
@@ -307,7 +307,7 @@ function install_zeromq()
 		else
 			echo "wrong commands";
 	fi
-	
+
 	if [ $? -ne 0 ]
 	then
 		cd ..;
@@ -338,7 +338,7 @@ function install_zookeeper()
 			echo "install zookeeper in default path";
 			./configure;
 			make && sudo make install;
-		else 
+		else
 			echo "wrong commands";
 	fi
 
@@ -365,20 +365,20 @@ do
 #		echo "install cmake";
 #		if [[ $2 == */* ]];then
 #			install_cmake $2;
-#		    if [ $? -ne 0 ] 
+#		    if [ $? -ne 0 ]
 #		    then
 #		        echo "ERROR during cmake installation" ;
 #		        exit;
-#		    fi  
+#		    fi
 #			shift
 #			shift
 #		else
 #			install_cmake;
-#		    if [ $? -ne 0 ] 
+#		    if [ $? -ne 0 ]
 #		    then
 #		        echo "ERROR during cmake installation" ;
 #		        exit;
-#		    fi  
+#		    fi
 #			shift
 #		fi
 #		;;
@@ -392,7 +392,7 @@ do
 			else
 				install_czmq null $3;
 			fi
-			if [ $? -ne 0 ] 
+			if [ $? -ne 0 ]
 			then
 				echo "ERROR during czmq installation" ;
 				exit;
@@ -404,7 +404,7 @@ do
 		elif [ $3 == "-f" ]
 		then
 			install_czmq $2 $4;
-			if [ $? -ne 0 ] 
+			if [ $? -ne 0 ]
 			then
 				echo "ERROR during czmq installation" ;
 				exit;
@@ -416,7 +416,7 @@ do
 		elif [[ $2 == */* ]]
 		then
 			install_czmq $2;
-			if [ $? -ne 0 ] 
+			if [ $? -ne 0 ]
 			then
 				echo "ERROR during czmq installation" ;
 				exit;
@@ -425,11 +425,11 @@ do
 			shift
 		else
 			install_czmq null;
-			if [ $? -ne 0 ] 
+			if [ $? -ne 0 ]
 			then
 			    echo "ERROR during czmq installation" ;
 			    exit;
-			fi  
+			fi
 			shift
 		fi
 		;;
@@ -437,20 +437,20 @@ do
 		echo "install glog";
 		if [[ $2 == */* ]];then
 			install_glog $2;
-		    if [ $? -ne 0 ] 
+		    if [ $? -ne 0 ]
 		    then
 		        echo "ERROR during glog installation" ;
 		        exit;
-		    fi  
+		    fi
 			shift
 			shift
 		else
 			install_glog;
-		    if [ $? -ne 0 ] 
+		    if [ $? -ne 0 ]
 		    then
 		        echo "ERROR during glog installation" ;
 		        exit;
-		    fi  
+		    fi
 			shift
 		fi
 		;;
@@ -458,20 +458,20 @@ do
 		echo "install lmdb";
 		if [[ $2 == */* ]];then
 			install_lmdb $2;
-		    if [ $? -ne 0 ] 
+		    if [ $? -ne 0 ]
 		    then
 		        echo "ERROR during lmdb installation" ;
 		        exit;
-		    fi  
+		    fi
 			shift
 			shift
 		else
 			install_lmdb;
-		    if [ $? -ne 0 ] 
+		    if [ $? -ne 0 ]
 		    then
 		        echo "ERROR during lmdb installation" ;
 		        exit;
-		    fi  
+		    fi
 			shift
 		fi
 		;;
@@ -479,20 +479,20 @@ do
 		echo "install OpenBLAS";
 		if [[ $2 == */* ]];then
 			install_openblas $2;
-		    if [ $? -ne 0 ] 
+		    if [ $? -ne 0 ]
 		    then
 		        echo "ERROR during openblas installation" ;
 		        exit;
-		    fi  
+		    fi
 			shift
 			shift
 		else
 			install_openblas;
-		    if [ $? -ne 0 ] 
+		    if [ $? -ne 0 ]
 		    then
 		        echo "ERROR during openblas installation" ;
 		        exit;
-		    fi  
+		    fi
 			shift
 		fi
 		;;
@@ -500,20 +500,20 @@ do
 #		echo "install opencv";
 #		if [[ $2 == */* ]];then
 #			install_opencv $2;
-#		    if [ $? -ne 0 ] 
+#		    if [ $? -ne 0 ]
 #		    then
 #		        echo "ERROR during opencv installation" ;
 #		        exit;
-#		    fi  
+#		    fi
 #			shift
 #			shift
 #		else
 #			install_opencv;
-#		    if [ $? -ne 0 ] 
+#		    if [ $? -ne 0 ]
 #		    then
 #		        echo "ERROR during opencv installation" ;
 #		        exit;
-#		    fi  
+#		    fi
 #			shift
 #		fi
 #		;;
@@ -521,20 +521,20 @@ do
 		echo "install protobuf";
 		if [[ $2 == */* ]];then
 			install_protobuf $2;
-		    if [ $? -ne 0 ] 
+		    if [ $? -ne 0 ]
 		    then
 		        echo "ERROR during protobuf installation" ;
 		        exit;
-		    fi  
+		    fi
 			shift
 			shift
 		else
 			install_protobuf;
-		    if [ $? -ne 0 ] 
+		    if [ $? -ne 0 ]
 		    then
 		        echo "ERROR during protobuf installation" ;
 		        exit;
-		    fi  
+		    fi
 			shift
 		fi
 		;;
@@ -542,20 +542,20 @@ do
 		echo "install zeromq";
 		if [[ $2 == */* ]];then
 			install_zeromq $2;
-		    if [ $? -ne 0 ] 
+		    if [ $? -ne 0 ]
 		    then
 		        echo "ERROR during zeromq installation" ;
 		        exit;
-		    fi  
+		    fi
 			shift
 			shift
 		else
 			install_zeromq;
-		    if [ $? -ne 0 ] 
+		    if [ $? -ne 0 ]
 		    then
 		        echo "ERROR during zeromq installation" ;
 		        exit;
-		    fi  
+		    fi
 			shift
 		fi
 		;;
@@ -563,20 +563,20 @@ do
 		echo "install zookeeper";
 		if [[ $2 == */* ]];then
 			install_zookeeper $2;
-		    if [ $? -ne 0 ] 
+		    if [ $? -ne 0 ]
 		    then
 		        echo "ERROR during zookeeper installation" ;
 		        exit;
-		    fi  
+		    fi
 			shift
 			shift
 		else
 			install_zookeeper;
-		    if [ $? -ne 0 ] 
+		    if [ $? -ne 0 ]
 		    then
 		        echo "ERROR during zookeeper installation" ;
 		        exit;
-		    fi  
+		    fi
 			shift
 		fi
 		;;
@@ -584,53 +584,53 @@ do
 		echo "install all dependencies";
 		if [[ $2 == */* ]];then
 #			install_cmake $2;
-#		    if [ $? -ne 0 ] 
+#		    if [ $? -ne 0 ]
 #		    then
 #		        echo "ERROR during cmake installation" ;
 #		        exit;
-#		    fi  
+#		    fi
 			install_zeromq $2;
-		    if [ $? -ne 0 ] 
+		    if [ $? -ne 0 ]
 		    then
 		        echo "ERROR during zeromq installation" ;
 		        exit;
-		    fi  
+		    fi
 			install_czmq $2 $2;
-		    if [ $? -ne 0 ] 
+		    if [ $? -ne 0 ]
 		    then
 		        echo "ERROR during czmq installation" ;
 		        exit;
-		    fi  
+		    fi
 			install_glog $2;
-		    if [ $? -ne 0 ] 
+		    if [ $? -ne 0 ]
 		    then
 		        echo "ERROR during glog installation" ;
 		        exit;
-		    fi  
+		    fi
 			install_lmdb $2;
-		    if [ $? -ne 0 ] 
+		    if [ $? -ne 0 ]
 		    then
 		        echo "ERROR during lmdb installation" ;
 		        exit;
-		    fi  
+		    fi
 			install_openblas $2;
-		    if [ $? -ne 0 ] 
+		    if [ $? -ne 0 ]
 		    then
 		        echo "ERROR during openblas installation" ;
 		        exit;
-		    fi  
+		    fi
 #			install_opencv $2;
-#		    if [ $? -ne 0 ] 
+#		    if [ $? -ne 0 ]
 #		    then
 #		        echo "ERROR during opencv installation" ;
 #		        exit;
-#		    fi  
+#		    fi
 			install_protobuf $2;
-		    if [ $? -ne 0 ] 
+		    if [ $? -ne 0 ]
 		    then
 		        echo "ERROR during protobuf installation" ;
 		        exit;
-		    fi  
+		    fi
 			install_zookeeper $2;
 			if [ $? -ne 0 ]
 			then
@@ -641,53 +641,53 @@ do
 			shift
 		else
 #			install_cmake;
-#		    if [ $? -ne 0 ] 
+#		    if [ $? -ne 0 ]
 #		    then
 #		        echo "ERROR during cmake installation" ;
 #		        exit;
-#		    fi  
+#		    fi
 			install_zeromq;
-		    if [ $? -ne 0 ] 
+		    if [ $? -ne 0 ]
 		    then
 		        echo "ERROR during zeromq installation" ;
 		        exit;
-		    fi  
+		    fi
 			install_czmq null;
-		    if [ $? -ne 0 ] 
+		    if [ $? -ne 0 ]
 		    then
 		        echo "ERROR during czmq installation" ;
 		        exit;
-		    fi  
+		    fi
 			install_glog;
-		    if [ $? -ne 0 ] 
+		    if [ $? -ne 0 ]
 		    then
 		        echo "ERROR during glog installation" ;
 		        exit;
-		    fi  
+		    fi
 			install_lmdb;
-		    if [ $? -ne 0 ] 
+		    if [ $? -ne 0 ]
 		    then
 		        echo "ERROR during lmdb installation" ;
 		        exit;
-		    fi  
+		    fi
 			install_openblas;
-		    if [ $? -ne 0 ] 
+		    if [ $? -ne 0 ]
 		    then
 		        echo "ERROR during openblas installation" ;
 		        exit;
-		    fi  
+		    fi
 #			install_opencv;
-#		    if [ $? -ne 0 ] 
+#		    if [ $? -ne 0 ]
 #		    then
 #		        echo "ERROR during opencv installation" ;
 #		        exit;
-#		    fi  
+#		    fi
 			install_protobuf;
-		    if [ $? -ne 0 ] 
+		    if [ $? -ne 0 ]
 		    then
 		        echo "ERROR during protobuf installation" ;
 		        exit;
-		    fi  
+		    fi
 			install_zookeeper;
 			if [ $? -ne 0 ]
 			then
@@ -712,5 +712,5 @@ do
 		echo " To install all dependencies, you can run:	"
 		echo "	./install.sh all"
 		exit;
-	esac	
+	esac
 done


[06/13] incubator-singa git commit: SINGA-70 Refactor API of Layer, Worker, Server and Driver

Posted by wa...@apache.org.
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/321ef96a/src/trainer/trainer.cc
----------------------------------------------------------------------
diff --git a/src/trainer/trainer.cc b/src/trainer/trainer.cc
deleted file mode 100644
index ecfc94a..0000000
--- a/src/trainer/trainer.cc
+++ /dev/null
@@ -1,469 +0,0 @@
-/************************************************************
-*
-* Licensed to the Apache Software Foundation (ASF) under one
-* or more contributor license agreements.  See the NOTICE file
-* distributed with this work for additional information
-* regarding copyright ownership.  The ASF licenses this file
-* to you under the Apache License, Version 2.0 (the
-* "License"); you may not use this file except in compliance
-* with the License.  You may obtain a copy of the License at
-* 
-*   http://www.apache.org/licenses/LICENSE-2.0
-* 
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an
-* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-* KIND, either express or implied.  See the License for the
-* specific language governing permissions and limitations
-* under the License.
-*
-*************************************************************/
-
-#include "trainer/trainer.h"
-
-#include <glog/logging.h>
-#include <unistd.h>
-#include <map>
-#include <thread>
-#include "mshadow/tensor.h"
-#include "proto/common.pb.h"
-#include "utils/cluster.h"
-#include "utils/common.h"
-#include "utils/tinydir.h"
-
-namespace singa {
-
-using std::vector;
-using std::string;
-
-/***********************Trainer****************************/
-Trainer::~Trainer() {
-  delete router_;
-  for (NeuralNet* p : nets_)
-    delete p;
-}
-
-void Trainer::Start(bool resume, const SingaProto& singaConf, JobProto* job) {
-  // register job to zookeeper at the beginning
-  auto cluster = Cluster::Setup(job->id(), singaConf, job->cluster());
-  if (resume) Resume(job);
-  router_ = new Router();
-  router_->Bind(kInprocRouterEndpoint);
-  const string hostip = cluster->hostip();
-  int port = router_->Bind("tcp://" + hostip + ":*");
-  // register endpoint to zookeeper
-  cluster->Register(getpid(), hostip + ":" + std::to_string(port));
-  const vector<Worker*> workers = CreateWorkers(*job);
-  const vector<Server*> servers = CreateServers(*job);
-  SetupWorkerServer(*job, workers, servers);
-#ifdef USE_MPI
-  int nthreads = workers.size() + servers.size();
-  for (int i = 0; i < nthreads; i++)
-    MPIQueues.push_back(make_shared<SafeQueue>());
-#endif
-  vector<std::thread> threads;
-  for (auto server : servers)
-    threads.push_back(std::thread(&Server::Run, server));
-  for (auto worker : workers)
-    threads.push_back(std::thread(&Worker::Run, worker));
-  Run(workers, servers);
-  for (auto& thread : threads)
-    thread.join();
-  for (auto server : servers)
-    delete server;
-  for (auto worker : workers)
-    delete worker;
-}
-
-void Trainer::Resume(JobProto* jobConf) {
-  tinydir_dir dir;
-  string folder = Cluster::Get()->checkpoint_folder();
-  tinydir_open(&dir, folder.c_str());
-  int latest_step = 0;
-  // there would be multi checkpoint files (from diff workers) for one step
-  vector<string> ck_files;
-  // iterate all files to get the files for the last checkpoint
-  while (dir.has_next) {
-    tinydir_file file;
-    tinydir_readfile(&dir, &file);
-    tinydir_next(&dir);
-    char* ch = strstr(file.name, "step");
-    if (ch == nullptr) {
-      if (file.name[0] != '.')
-        LOG(INFO) << "Irregular file in checkpoint folder: " << file.name;
-      continue;
-    }
-    LOG(INFO) << "Add checkpoint file for resume: " << ch;
-    int step = atoi(ch+4);
-    if (step == latest_step) {
-      ck_files.push_back(file.name);
-    } else if (step > latest_step) {
-      latest_step = step;
-      ck_files.clear();
-      ck_files.push_back(string(file.name));
-    }
-  }
-  if (latest_step > 0) {
-    jobConf->set_step(latest_step);
-    if (!jobConf->has_reset_param_version())
-      jobConf->set_reset_param_version(false);
-    jobConf->clear_checkpoint_path();
-    for (auto ck_file : ck_files)
-      jobConf->add_checkpoint_path(folder + "/" + ck_file);
-  }
-  tinydir_close(&dir);
-}
-
-const vector<int> SliceParams(const vector<Param*>& params) {
-  // for load-balance among servers in a group and among server groups
-  int nserver_grps = Cluster::Get()->nserver_groups();
-  int nservers_per_grp = Cluster::Get()->nservers_per_group();
-  int lcm = LeastCommonMultiple(nserver_grps, nservers_per_grp);
-  // collect sizes of unique Params
-  std::vector<int> paramsize;
-  for (auto param : params)
-    if (param->id() == param->owner())
-      paramsize.push_back(param->size());
-  // slice into lcm pieces to achieve good load-balance for both intra-group
-  // partition (among servers in a group) and inter-group partition (each group
-  // is assigned a subset of slices)
-  auto param_slice = Slice(lcm, paramsize);
-  // construct map from Param ID to its slices <slice id, len>
-  std::unordered_map<int, vector<std::pair<int, int>>> paramid2slices;
-  vector<int> slices;
-  auto it = param_slice.begin();
-  int slice_id = 0;
-  for (auto param : params) {
-    if (param->id() == param->owner()) {
-      for (int len : *it) {
-        slices.push_back(len);
-        paramid2slices[param->id()].push_back(std::make_pair(slice_id++, len));
-      }
-      it++;
-    }
-  }
-  // add slice info for every Param
-  for (auto param : params)
-    for (auto entry : paramid2slices[param->owner()]) {
-      param->AddSlice(entry.first, entry.second);
-      LOG(INFO) << "param id " << param->id() << " owner=" << param->owner()
-        << ": " << entry.first << ", " << entry.second;
-    }
-  return slices;
-}
-
-void Trainer::SetupWorkerServer(const JobProto& job_conf,
-                                const vector<Worker*>& workers,
-                                const vector<Server*>& servers) {
-  auto cluster = Cluster::Get();
-  int grp_size = cluster->nworkers_per_group();
-  const auto& net_conf = job_conf.neuralnet();
-  auto net = NeuralNet::Create(net_conf, kTrain, grp_size);
-  nets_.push_back(net);
-  // MUST do SliceParam before share param/net with others
-  auto slices = SliceParams(net->params());
-  std::unordered_map<int, NeuralNet*> grp_net;
-  int first_grp = workers.size() ? workers.at(0)->grp_id() : -1;
-  for (auto worker : workers) {
-    int grp_id = worker->grp_id();
-    int worker_id = worker->id();
-    NeuralNet* test_net = nullptr;
-    NeuralNet* valid_net = nullptr;
-    if (grp_net.find(grp_id) == grp_net.end()) {
-      if (grp_id == first_grp) {
-        // tests are performed only by the first group for now.
-        // TODO(wangwei) update.
-        if (first_grp == 0 && job_conf.test_steps() && worker_id == 0) {
-          // hard code for exp
-          // TODO(wangwei) move test unit out as an independent module
-          test_net = NeuralNet::Create(net_conf, kTest, 1);
-          test_net->ShareParamsFrom(net);
-          nets_.push_back(test_net);
-        }
-        // validation is performed only by the first group.
-        // TODO(wangwei) update.
-        if (first_grp == 0 && job_conf.valid_steps() && worker_id == 0) {
-          valid_net = NeuralNet::Create(net_conf, kValidation, 1);
-          valid_net->ShareParamsFrom(net);
-          nets_.push_back(valid_net);
-        }
-        grp_net[grp_id] = net;
-      } else {
-        grp_net[grp_id] = NeuralNet::Create(net_conf, kTrain, grp_size);
-        nets_.push_back(grp_net[grp_id]);
-        if (cluster->share_memory())
-          grp_net[grp_id]->ShareParamsFrom(net);
-      }
-      for (auto layer : grp_net[grp_id]->layers()) {
-        bool local = layer->partition_id() >= workers.front()->id()
-          && layer->partition_id() <= workers.back()->id();
-        for (auto param : layer->GetParams()) {
-          int hash = Hash(grp_id, param->owner());
-          if (worker_shard_.find(hash) == worker_shard_.end())
-            worker_shard_[hash] = new ParamEntry();
-          worker_shard_[hash]->AddParam(local, param);
-        }
-      }
-    }
-    LOG(INFO) << "grp " << worker->grp_id() << ", worker "
-              << worker->id() << " net " << grp_net[grp_id];
-    worker->Setup(job_conf, grp_net[grp_id], valid_net, test_net);
-  }
-  //  partition among server groups, each group maintains one sub-set for sync
-  auto slice2group = PartitionSlices(cluster->nserver_groups(), slices);
-  //  partition within one server group, each server updates for one sub-set
-  slice2server_ = PartitionSlices(cluster->nservers_per_group(), slices);
-  for (auto server : servers)
-    server->Setup(job_conf.updater(), slice2group, slice2server_);
-}
-
-vector<Server*> Trainer::CreateServers(const JobProto& job) {
-  auto cluster = Cluster::Get();
-  vector<Server*> servers;
-  if (!cluster->has_server())
-    return servers;
-  int server_procs = cluster->procs_id();
-  // if true, server procs (logical) id starts after worker procs
-  if (cluster->server_worker_separate())
-    server_procs -= cluster->nworker_procs();
-  const vector<int> rng = cluster->ExecutorRng(server_procs,
-                                               cluster->nservers_per_group(),
-                                               cluster->nservers_per_procs());
-  int gstart = rng[0], gend = rng[1], start = rng[2], end = rng[3];
-  for (int gid = gstart; gid < gend; gid++) {
-    for (int sid = start; sid < end; sid++) {
-      auto server = new Server(gid, sid);
-      servers.push_back(server);
-    }
-  }
-  return servers;
-}
-
-vector<Worker*> Trainer::CreateWorkers(const JobProto& job) {
-  auto cluster = Cluster::Get();
-  vector<Worker*> workers;
-  if (!cluster->has_worker())
-    return workers;
-  const vector<int> rng = cluster->ExecutorRng(cluster->procs_id(),
-                                               cluster->nworkers_per_group(),
-                                               cluster->nworkers_per_procs());
-  int gstart = rng[0], gend = rng[1], wstart = rng[2], wend = rng[3];
-  for (int gid = gstart; gid < gend; gid++) {
-    for (int wid = wstart; wid < wend; wid++) {
-      auto *worker = Worker::Create(job);
-      worker->Init(gid, wid);
-      workers.push_back(worker);
-    }
-  }
-  return workers;
-}
-
-void Trainer::Run(const vector<Worker*>& workers,
-                  const vector<Server*>& servers) {
-  int nworkers = workers.size(), nservers = servers.size();
-  auto cluster = Cluster::Get();
-  procs_id_ = cluster->procs_id();
-  LOG(INFO) << "Stub in process " << procs_id_ << " starts";
-  std::map<int, Dealer*> inter_dealers;  // for sending msg to other procs
-  std::queue<Msg*> msg_queue;
-  while (true) {
-    Msg* msg = nullptr;
-    if (msg_queue.empty()) {
-      msg = router_->Receive();
-    } else {
-      msg = msg_queue.front();
-      msg_queue.pop();
-    }
-    int type = msg->type(), dst = msg->dst(), flag = AddrType(dst);
-    if (flag == kStub && (AddrProc(dst) == procs_id_ || AddrGrp(dst) == -1)) {
-      //  the following statements are ordered!
-      if (type == kConnect) {
-        DeleteMsg(&msg);
-      } else if (type == kMetric) {
-        DisplayMetric(&msg);
-      } else if (type == kStop) {
-        int src_flag = AddrType(msg->src());
-        if (src_flag == kServer) nservers--;
-        else if (src_flag == kWorkerParam) nworkers--;
-        DeleteMsg(&msg);
-        if (nworkers == 0 && nservers == 0) break;
-      } else {
-        HandleLocalMsg(&msg_queue, &msg);
-      }
-    } else {
-      int dst_procs = AddrProc(dst);
-      if (flag != kStub)
-        dst_procs = cluster->ProcsIDOf(AddrGrp(dst), AddrID(dst), flag);
-      if (dst_procs != procs_id_) {
-        if (inter_dealers.find(dst_procs) == inter_dealers.end())
-          inter_dealers[dst_procs] = CreateInterProcsDealer(dst_procs);
-        inter_dealers[dst_procs]->Send(&msg);
-      } else {
-        router_->Send(&msg);
-      }
-    }
-  }
-  LOG(ERROR) << "Stub in process " << procs_id_ << " stops";
-  for (auto& entry : inter_dealers)
-    delete entry.second;
-}
-
-void Trainer::DisplayMetric(Msg** msg) {
-  Msg* msgg = *msg;
-  // only display metrics from the first group
-  if (AddrGrp(msgg->src()) == 0) {
-    int step = msgg->trgt_version();
-    char prefix[128];
-    msgg->ParseFormatFrame("s", prefix);
-    CHECK(msgg->NextFrame());
-    const string perf(static_cast<char*>(msgg->FrameData()), msgg->FrameSize());
-    Metric cur(perf);
-    LOG(ERROR) << prefix << " step-" << step <<", " << cur.ToLogString();
-  }
-  DeleteMsg(msg);
-}
-
-Dealer* Trainer::CreateInterProcsDealer(int dst_procs) {
-  // forward to other procs
-  auto cluster = Cluster::Get();
-  auto dealer = new Dealer();
-  while (cluster->endpoint(dst_procs) == "") {
-    // kCollectSleepTime));
-    std::this_thread::sleep_for(std::chrono::milliseconds(3000));
-    LOG(ERROR) << "waiting for procs " << dst_procs << " to register";
-  }
-  dealer->Connect("tcp://"+cluster->endpoint(dst_procs));
-  return dealer;
-}
-
-void Trainer::HandleLocalMsg(std::queue<Msg*>* msg_queue, Msg** msg) {
-  Msg* msgg = *msg;
-  int paramid = ParamID(msgg->trgt_val());
-  int type = msgg->type();
-  int grp;
-  ParamEntry *entry = nullptr;
-  // TODO(wangwei) process other requests, e.g. RESTful
-  switch (type) {
-    case kUpdate:
-      grp = AddrGrp(msgg->src());
-      entry = worker_shard_.at(Hash(grp, paramid));
-      for (auto update_msg : HandleUpdate(entry, msg))
-        msg_queue->push(update_msg);
-      break;
-    case kRUpdate:
-      grp = AddrGrp(msgg->dst());
-      entry = worker_shard_.at(Hash(grp, paramid));
-      HandleUpdateResponse(entry, msg);
-      break;
-    case kGet:
-      grp = AddrGrp(msgg->src());
-      entry = worker_shard_.at(Hash(grp, paramid));
-      for (auto get_msg : HandleGet(entry, msg))
-        msg_queue->push(get_msg);
-      break;
-    case kRGet:
-      grp = AddrGrp(msgg->dst());
-      entry = worker_shard_.at(Hash(grp, paramid));
-      HandleGetResponse(entry, msg);
-      break;
-    case kPut:
-      grp = AddrGrp(msgg->src());
-      entry = worker_shard_.at(Hash(grp, paramid));
-      for (auto put_msg : HandlePut(entry, msg))
-        msg_queue->push(put_msg);
-      break;
-    default:
-      LOG(ERROR) << "Unknow message type:" << type;
-      break;
-  }
-}
-
-void Trainer::GenMsgs(int type, int version, ParamEntry* entry, Msg* msg,
-                      vector<Msg*> *ret) {
-  int src_grp = AddrGrp(msg->src());
-  int dst_grp = src_grp / Cluster::Get()->nworker_groups_per_server_group();
-  auto param = entry->shares.at(0);
-  for (int idx = 0 ; idx < param->num_slices(); idx++) {
-    int slice_id = param->slice_start() + idx;
-    int server = slice2server_[slice_id];
-    int dst_procs = Cluster::Get()->ProcsIDOf(dst_grp, server, kServer);
-    Msg* new_msg = nullptr;
-    if (type == kPut) {
-      CHECK_GT(entry->num_total, 0);
-      new_msg = param->GenPutMsg(dst_procs != procs_id_, idx);
-      new_msg->AddFormatFrame("i", entry->num_total);
-    } else if (type == kGet) {
-      new_msg = param->GenGetMsg(dst_procs != procs_id_, idx);
-    } else if (type == kUpdate) {
-      new_msg = param->GenUpdateMsg(dst_procs != procs_id_, idx);
-      new_msg->AddFormatFrame("i", entry->num_local);
-    } else {
-      LOG(FATAL) << "Wrong type";
-    }
-    new_msg->set_trgt(ParamTrgt(param->owner(), slice_id), version);
-    new_msg->set_src(Addr(src_grp, procs_id_, kStub));
-    new_msg->set_dst(Addr(dst_grp, server, kServer));
-    ret->push_back(new_msg);
-  }
-}
-
-const vector<Msg*> Trainer::HandleGet(ParamEntry* entry, Msg** msg) {
-  vector<Msg*> ret;
-  int version = (*msg)->trgt_version();
-  if (version > entry->next_version) {
-    entry->next_version = version;
-    GenMsgs(kGet, version, entry, *msg, &ret);
-  }
-  DeleteMsg(msg);
-  return ret;
-}
-
-const vector<Msg*> Trainer::HandleUpdate(ParamEntry *entry, Msg** msg) {
-  vector<Msg*> ret;
-  entry->num_update++;
-  if (entry->num_update >= entry->num_local) {
-    // average local gradient
-    if (entry->num_local > 1) {
-      auto it = entry->shares.begin();
-      auto shape = mshadow::Shape1((*it)->size());
-      mshadow::Tensor<mshadow::cpu, 1> sum((*it)->mutable_cpu_grad(), shape);
-      for (++it; it != entry->shares.end(); it++) {
-        mshadow::Tensor<mshadow::cpu, 1> grad((*it)->mutable_cpu_grad(), shape);
-        sum += grad;
-      }
-    }
-    int step = (*msg)->trgt_version();
-    GenMsgs(kUpdate, step, entry, *msg, &ret);
-    entry->num_update = 0;
-  }
-  DeleteMsg(msg);
-  return ret;
-}
-
-const vector<Msg*> Trainer::HandlePut(ParamEntry* entry, Msg** msg) {
-  vector<Msg*> ret;
-  int version = (*msg)->trgt_version();
-  GenMsgs(kPut, version, entry, *msg, &ret);
-  DeleteMsg(msg);
-  return ret;
-}
-
-void Trainer::HandleGetResponse(ParamEntry* entry, Msg** msg) {
-  int version = (*msg)->trgt_version();
-  int sliceid = SliceID((*msg)->trgt_val());
-  auto param = entry->shares.at(0);
-  if (param->ParseGetResponseMsg(*msg, sliceid-param->slice_start()))
-    param->set_version(version);
-  DeleteMsg(msg);
-}
-
-void Trainer::HandleUpdateResponse(ParamEntry* entry, Msg** msg) {
-  int version = (*msg)->trgt_version();
-  int sliceid = SliceID((*msg)->trgt_val());
-  auto param = entry->shares.at(0);
-  if (param->ParseUpdateResponseMsg(*msg, sliceid-param->slice_start()))
-    param->set_version(version);
-  DeleteMsg(msg);
-}
-
-}  // namespace singa

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/321ef96a/src/trainer/worker.cc
----------------------------------------------------------------------
diff --git a/src/trainer/worker.cc b/src/trainer/worker.cc
deleted file mode 100644
index 70859de..0000000
--- a/src/trainer/worker.cc
+++ /dev/null
@@ -1,411 +0,0 @@
-/************************************************************
-*
-* Licensed to the Apache Software Foundation (ASF) under one
-* or more contributor license agreements.  See the NOTICE file
-* distributed with this work for additional information
-* regarding copyright ownership.  The ASF licenses this file
-* to you under the Apache License, Version 2.0 (the
-* "License"); you may not use this file except in compliance
-* with the License.  You may obtain a copy of the License at
-* 
-*   http://www.apache.org/licenses/LICENSE-2.0
-* 
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an
-* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-* KIND, either express or implied.  See the License for the
-* specific language governing permissions and limitations
-* under the License.
-*
-*************************************************************/
-
-#include "trainer/worker.h"
-
-#include <glog/logging.h>
-#include <chrono>
-#include <thread>
-#include <typeinfo>
-#include "utils/cluster.h"
-#include "utils/factory.h"
-#include "utils/singleton.h"
-
-namespace singa {
-
-using std::string;
-
-Worker* Worker::Create(const JobProto& proto) {
-  auto factory = Singleton<Factory<singa::Worker>>::Instance();
-  Worker* worker = nullptr;
-  const auto& conf = proto.train_one_batch();
-  if (conf.has_user_alg())
-    worker = factory->Create(conf.user_alg());
-  else
-    worker = factory->Create(conf.alg());
-  return worker;
-}
-
-void Worker::Init(int grp_id, int id) {
-  grp_id_ = grp_id;
-  id_ = id;
-  layer_dealer_ = dealer_ = nullptr;
-}
-
-Worker::~Worker() {
-  if (layer_dealer_)
-    delete layer_dealer_;
-  if (dealer_)
-    delete dealer_;
-}
-
-void Worker::Setup(const JobProto& job, NeuralNet* train_net,
-                   NeuralNet* valid_net, NeuralNet* test_net) {
-  job_conf_.CopyFrom(job);
-  train_net_ = train_net;
-  validation_net_ = valid_net;
-  test_net_ = test_net;
-}
-
-void Worker::InitLocalParams() {
-  // for each server grp, its first subscriber worker grp does the param init
-  if (grp_id_ % Cluster::Get()->nworker_groups_per_server_group() == 0) {
-    // extract params that should be initialized by this worker
-    // must gen a name for each param if the user doesn't config it
-    std::unordered_map<string, Param*> name2param;
-    for (auto layer : train_net_->layers()) {
-      if (layer->partition_id() == id_) {
-        for (auto param : layer->GetParams()) {
-          // only owners fill the memory of parameter values.
-          if (param->owner() == param->id()) {
-            CHECK(name2param.find(param->name()) == name2param.end());
-            name2param[param->name()] = param;
-          }
-        }
-      }
-    }
-    // load from checkpoints. get param blob based on param name.
-    // the param from previous checkpoint files will be overwritten by
-    // the param with the same name in later checkpoint files.
-    for (const auto checkpoint : job_conf_.checkpoint_path()) {
-      LOG(ERROR) << "Load from checkpoint file " << checkpoint;
-      BlobProtos bps;
-      ReadProtoFromBinaryFile(checkpoint.c_str(), &bps);
-      for (int i = 0; i < bps.name_size(); i++) {
-        if (name2param.find(bps.name(i)) != name2param.end()) {
-          name2param.at(bps.name(i))->FromProto(bps.blob(i));
-          //  if load from pre-training params, reset version to start step
-          if (job_conf_.reset_param_version())
-            name2param.at(bps.name(i))->set_version(job_conf_.step());
-          else  // if resume training, use the same version as last checkpoint
-            name2param.at(bps.name(i))->set_version(bps.version(i));
-        }
-      }
-    }
-    // init other params who do not have checkpoint version
-    for (auto entry : name2param)
-      if (entry.second->version() < 0) {
-        entry.second->InitValues(job_conf_.step());
-        if (!job_conf_.reset_param_version())
-          LOG(ERROR) << "better reset version of params from checkpoints "
-            << "to the same as other newly initialized params!";
-      }
-
-    Metric perf;
-    // warmup training before put params to servers
-    for (; step_ < job_conf_.warmup_steps(); step_++)
-      TrainOneBatch(step_, &perf);
-    for (auto layer : train_net_->layers()) {
-      if (layer->partition_id() == id_)
-        for (auto param : layer->GetParams())
-          if (param->owner() == param->id())
-            Put(param, param->version());
-    }
-  }
-  // wait owners in the same procs init params, then no get requests sent
-  std::this_thread::sleep_for(std::chrono::milliseconds(1000));
-  for (auto layer : train_net_->layers()) {
-    if (layer->partition_id() == id_)
-      for (auto param : layer->GetParams())
-        Get(param, job_conf_.warmup_steps());
-  }
-}
-
-void ConnectStub(int grp, int id, Dealer* dealer, EntityType entity) {
-  dealer->Connect(kInprocRouterEndpoint);
-  Msg* ping = new Msg(Addr(grp, id, entity), Addr(-1, -1, kStub));
-  ping->set_type(kConnect);
-  dealer->Send(&ping);
-}
-
-void Worker::Run() {
-  LOG(ERROR) << "Worker (group = " << grp_id_ <<", id = " << id_ << ") start";
-  auto cluster = Cluster::Get();
-  int svr_grp = grp_id_ / cluster->nworker_groups_per_server_group();
-  CHECK(cluster->runtime()->JoinSGroup(grp_id_, id_, svr_grp));
-  // TODO(wangsh): provide a unique sock id from cluster
-  dealer_ = new Dealer(0);
-  ConnectStub(grp_id_, id_, dealer_, kWorkerParam);
-  for (auto layer : train_net_->layers()) {
-    if (layer->partition_id() == id_) {
-      if (typeid(layer) == typeid(BridgeDstLayer)
-          || typeid(layer) == typeid(BridgeSrcLayer)) {
-        // TODO(wangsh): provide a unique socket id from cluster
-        layer_dealer_ = new Dealer(1);
-        ConnectStub(grp_id_, id_, layer_dealer_, kWorkerLayer);
-        break;
-      }
-    }
-  }
-
-  step_ = job_conf_.step();
-  InitLocalParams();
-  Metric perf;
-  while (!StopNow(step_)) {
-    if (ValidateNow(step_) && validation_net_ != nullptr) {
-      // LOG(ERROR)<<"Validation at step "<<step;
-      CollectAll(validation_net_, step_);
-      Test(job_conf_.valid_steps(), kValidation, validation_net_);
-    }
-    if (TestNow(step_) && test_net_ != nullptr) {
-      // LOG(ERROR)<<"Test at step "<<step;
-      CollectAll(test_net_, step_);
-      Test(job_conf_.test_steps(), kTest, test_net_);
-    }
-    if (CheckpointNow(step_)) {
-      CollectAll(train_net_, step_);
-      Checkpoint(step_, train_net_);
-      job_conf_.set_step(step_);
-    }
-    TrainOneBatch(step_, &perf);
-    // LOG(ERROR) << "Train " << step_;
-    if (DisplayNow(step_)) {
-      Report("Train", perf);
-      perf.Reset();
-    }
-    step_++;
-  }
-
-  // save the model
-  Checkpoint(step_, train_net_);
-  // clean up
-  cluster->runtime()->LeaveSGroup(grp_id_, id_, svr_grp);
-  // notify the stub on worker stop
-  Msg* msg = new Msg(Addr(grp_id_, id_, kWorkerParam), Addr(-1, -1, kStub));
-  msg->set_type(kStop);
-  dealer_->Send(&msg);  // use param dealer to send the stop msg
-  LOG(ERROR) << "Worker (group = " <<grp_id_ << ", id = " << id_ << ") stops";
-}
-
-void Worker::Checkpoint(int step, NeuralNet* net) {
-  if (grp_id_ == 0) {
-    BlobProtos bps;
-    for (auto layer : net->layers()) {
-      if (layer->partition_id() == id_) {
-        for (auto param : layer->GetParams()) {
-          // only owners fill the memory of parameter values.
-          if (param->owner() == param->id()) {
-            auto *blob = bps.add_blob();
-            param->ToProto(blob);
-            bps.add_version(param->version());
-            bps.add_name(param->name());
-          }
-        }
-      }
-    }
-    char buf[256];
-    snprintf(buf, sizeof(buf), "%s/step%d-worker%d.bin",
-             Cluster::Get()->checkpoint_folder().c_str(), step, id_);
-    LOG(INFO) << "checkpoint to " << buf;
-    WriteProtoToBinaryFile(bps, buf);
-  }
-}
-
-int Worker::Put(Param* param, int step) {
-  Msg* msg = new Msg(Addr(grp_id_, id_, kWorkerParam), Addr(-1, -1, kStub));
-  msg->set_trgt(ParamTrgt(param->owner(), 0), step);
-  msg->set_type(kPut);
-  dealer_->Send(&msg);
-  return 1;
-}
-
-int Worker::Get(Param* param, int step) {
-  if (param->version() >= step)
-    return 1;
-  Msg* msg = new Msg(Addr(grp_id_, id_, kWorkerParam), Addr(-1, -1, kStub));
-  msg->set_trgt(ParamTrgt(param->owner(), 0), step);
-  msg->set_type(kGet);
-  dealer_->Send(&msg);
-  return 1;
-}
-
-int Worker::Update(Param* param, int step) {
-  param->set_local_version(param->version());
-  Msg* msg = new Msg(Addr(grp_id_, id_, kWorkerParam), Addr(-1, -1, kStub));
-  msg->set_trgt(ParamTrgt(param->owner(), 0), step);
-  msg->set_type(kUpdate);
-  dealer_->Send(&msg);
-  return 1;
-}
-
-int Worker::CollectAll(NeuralNet* net, int step) {
-  auto& layers = net->layers();
-  for (auto& layer : layers) {
-    if (layer->partition_id() == id_) {
-      for (Param* p : layer->GetParams()) {
-        Collect(p, step);
-      }
-    }
-  }
-  return 1;
-}
-
-int Worker::Collect(Param* param, int step) {
-  while (param->version() <= param->local_version())
-    std::this_thread::sleep_for(std::chrono::milliseconds(kCollectSleepTime));
-  return 1;
-}
-
-void Worker::Report(const string& prefix, const Metric & perf) {
-  Msg* msg = new Msg(Addr(grp_id_, id_, kWorkerParam), Addr(-1, -1, kStub));
-  msg->set_trgt(0, step_);
-  msg->set_type(kMetric);
-  const string disp = perf.ToString();
-  msg->AddFormatFrame("s", prefix.c_str());
-  msg->AddFrame(disp.c_str(), disp.length());
-  dealer_->Send(&msg);
-}
-
-void Worker::ReceiveBlobs(bool data, bool grad, BridgeLayer* layer,
-                          NeuralNet* net) {
-  while (!layer->ready()) {
-    auto msg = layer_dealer_->Receive();
-    CHECK_EQ(AddrGrp(msg->src()), grp_id_);
-    string name(static_cast<char*>(msg->FrameData()), msg->FrameSize());
-    auto receive_layer = net->name2layer(name);
-    auto data = receive_layer->mutable_data(nullptr);
-    msg->NextFrame();
-    memcpy(data->mutable_cpu_data(), msg->FrameData(), msg->FrameSize());
-    dynamic_cast<BridgeLayer*>(receive_layer)->set_ready(true);
-    delete msg;
-  }
-}
-
-void Worker::SendBlobs(bool data, bool grad, BridgeLayer* layer,
-                       NeuralNet* net) {
-  auto dst = layer->dstlayers().at(0);
-  Msg *msg = new Msg();
-  msg->set_src(Addr(grp_id_, id_, kWorkerLayer));
-  msg->set_dst(Addr(grp_id_, dst->partition_id(), kWorkerLayer));
-  msg->AddFrame(dst->name().c_str(), dst->name().length());
-  auto const & blob = layer->data(nullptr);
-  msg->AddFrame(blob.cpu_data(), blob.count() * sizeof(float));
-  layer_dealer_->Send(&msg);
-}
-
-void Worker::Test(int nsteps, Phase phase, NeuralNet* net) {
-  Metric perf;
-  for (int step = 0; step < nsteps; step++)
-    TestOneBatch(step, phase, net, &perf);
-  if (phase == kValidation)
-    Report("Validation", perf);
-  else if (phase == kTest)
-    Report("Test", perf);
-}
-
-/****************************BPWorker**********************************/
-void BPWorker::TrainOneBatch(int step, Metric* perf) {
-  Forward(step, kTrain, train_net_, perf);
-  Backward(step, train_net_);
-}
-
-void BPWorker::TestOneBatch(int step, Phase phase, NeuralNet* net,
-                            Metric* perf) {
-  Forward(step, phase, net, perf);
-}
-
-void BPWorker::Forward(int step, Phase phase, NeuralNet* net, Metric* perf) {
-  for (auto& layer : net->layers()) {
-    if (layer->partition_id() == id_) {
-      // TODO(wangwei): enable this for model partition
-      // recv data from other workers
-      // if (typeid(*layer) == typeid(BridgeDstLayer))
-      //   ReceiveBlobs(true, false, dynamic_cast<BridgeLayer*>(layer), net);
-      if (phase == kTrain) {
-        // wait until param is updated
-        for (Param* p : layer->GetParams()) {
-          Collect(p, step);
-        }
-      }
-      layer->ComputeFeature(phase | kForward, perf);
-      // TODO(wangwei): enable this for model partition
-      // send data to other workers
-      // if (typeid(*layer) == typeid(BridgeSrcLayer))
-      //   SendBlobs(true, false, dynamic_cast<BridgeLayer*>(layer), net);
-      if (DisplayDebugInfo(step))
-        LOG(INFO) << layer->DebugString(step, phase | kForward);
-    }
-  }
-}
-
-void BPWorker::Backward(int step, NeuralNet* net) {
-  auto& layers = net->layers();
-  for (auto it = layers.rbegin(); it != layers.rend(); it++) {
-    Layer* layer = *it;
-    if (layer->partition_id() == id_) {
-      // TODO(wangwei): enable this for model partition
-      // send data to other workers
-      // if (typeid(layer) == typeid(BridgeSrcLayer))
-      //   ReceiveBlobs(false, true, layer, net);
-      layer->ComputeGradient(kTrain | kBackward, nullptr);
-      if (DisplayDebugInfo(step))
-        LOG(INFO) << layer->DebugString(step, kTrain | kBackward);
-      for (Param* p : layer->GetParams())
-        Update(p, step);
-      // TODO(wangwei): enable this for model partition
-      // recv data from other workers
-      // if (typeid(layer) == typeid(BridgeDstLayer))
-      //   SendBlobs(false, true, dynamic_cast<BridgeDstLayer*>(layer), net);
-    }
-  }
-}
-
-/****************************CDWorker**********************************/
-void CDWorker::TrainOneBatch(int step, Metric* perf) {
-  const auto& layers = train_net_->layers();
-  for (auto* layer : layers) {
-    for (Param* p : layer->GetParams())  // wait until param is updated
-      Collect(p, step);
-    layer->ComputeFeature(kPositive, perf);
-  }
-  for (auto* layer : layers)
-    if (typeid(*layer) == typeid(RBMVisLayer)
-          || typeid(*layer) == typeid(RBMHidLayer))
-      layer->ComputeFeature(kNegative | kTest, perf);
-  for (int i = 1; i < job_conf_.train_one_batch().cd_conf().cd_k(); i++) {
-    for (auto* layer : layers) {
-      if (typeid(*layer) == typeid(RBMVisLayer)
-          || typeid(*layer) == typeid(RBMHidLayer))
-      layer->ComputeFeature(kNegative, perf);
-    }
-  }
-  for (auto* layer : layers) {
-    if (typeid(*layer) == typeid(RBMVisLayer)
-        || typeid(*layer) == typeid(RBMHidLayer)) {
-      layer->ComputeGradient(kTrain, nullptr);
-      for (Param* p : layer->GetParams()) {
-        Update(p, step);
-      }
-    }
-  }
-}
-
-void CDWorker::TestOneBatch(int step, Phase phase, NeuralNet* net,
-                            Metric* perf) {
-  auto& layers = net->layers();
-  for (auto *layer : layers)
-    layer->ComputeFeature(kPositive, perf);
-  for (auto *layer : layers)
-    if (typeid(*layer) == typeid(RBMVisLayer))
-      layer->ComputeFeature(kNegative | kTest, perf);
-}
-
-}  // namespace singa

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/321ef96a/src/utils/cluster.cc
----------------------------------------------------------------------
diff --git a/src/utils/cluster.cc b/src/utils/cluster.cc
index 3b09417..c3cdc62 100644
--- a/src/utils/cluster.cc
+++ b/src/utils/cluster.cc
@@ -7,9 +7,9 @@
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
-* 
+*
 *   http://www.apache.org/licenses/LICENSE-2.0
-* 
+*
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
@@ -38,7 +38,7 @@ Cluster* Cluster::Setup(int job, const SingaProto& singaConf,
 Cluster* Cluster::Get() {
   if (!Singleton<Cluster>::Instance()->nprocs_) {
     LOG(ERROR) << "The first call to Get should "
-               << "provide the sys/model conf path";
+               << "provide the job conf path";
   }
   return Singleton<Cluster>::Instance();
 }

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/321ef96a/src/utils/common.cc
----------------------------------------------------------------------
diff --git a/src/utils/common.cc b/src/utils/common.cc
index 65b2ec2..13f2552 100644
--- a/src/utils/common.cc
+++ b/src/utils/common.cc
@@ -7,9 +7,9 @@
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
-* 
+*
 *   http://www.apache.org/licenses/LICENSE-2.0
-* 
+*
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
@@ -244,7 +244,7 @@ string GetHostIP() {
   close(fd);
   string ip(inet_ntoa(((struct sockaddr_in *)&ifr.ifr_addr)->sin_addr));
   /* display result */
-  LOG(INFO) << "Host IP=(" << ip;
+  LOG(INFO) << "Host IP= " << ip;
   return ip;
 }
 
@@ -290,7 +290,7 @@ string Metric::ToLogString() const {
   string ret;
   size_t k = 0;
   for (auto e : entry_) {
-    ret += e.first + " : ";
+    ret += e.first + " = ";
     ret += std::to_string(e.second.second / e.second.first);
     if (++k < entry_.size())
       ret += ", ";

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/321ef96a/src/utils/param.cc
----------------------------------------------------------------------
diff --git a/src/utils/param.cc b/src/utils/param.cc
index 07c238c..1ee4dcd 100644
--- a/src/utils/param.cc
+++ b/src/utils/param.cc
@@ -7,9 +7,9 @@
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
-* 
+*
 *   http://www.apache.org/licenses/LICENSE-2.0
-* 
+*
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
@@ -24,9 +24,11 @@
 #include <glog/logging.h>
 #include <cmath>
 #include <random>
+#include <unordered_map>
 #include "mshadow/tensor.h"
 #include "utils/factory.h"
 #include "utils/singleton.h"
+#include "utils/common.h"
 
 namespace singa {
 
@@ -93,6 +95,7 @@ void UniformSqrtFanInOutGen::Fill(Blob<float>* blob) {
   data /= sqrt(blob->shape()[0] + blob->shape()[1]);
 }
 
+/****************** Param functions *********************************/
 Param* Param::Create(const ParamProto& proto) {
   Factory<Param>* factory = Singleton<Factory<Param>>::Instance();
   Param* p = nullptr;
@@ -104,6 +107,49 @@ Param* Param::Create(const ParamProto& proto) {
   return p;
 }
 
+const vector<int> Param::ComputeSlices(int num, const vector<Param*>& params) {
+  // collect sizes of unique Params
+  std::vector<int> paramsize;
+  for (auto param : params)
+    if (param->id() == param->owner())
+      paramsize.push_back(param->size());
+  // slice into lcm pieces to achieve good load-balance for both intra-group
+  // partition (among servers in a group) and inter-group partition (each group
+* is assigned a subset of slices)
+  auto param_slice = Slice(num, paramsize);
+  vector<int> slices;
+  for (auto const vec: param_slice)
+    for (int len : vec)
+      slices.push_back(len);
+  return slices;
+}
+
+void Param::SliceParams(int num, const vector<Param*>& params) {
+  auto slices = ComputeSlices(num, params);
+  // construct map from Param ID to its slices <slice id, len>
+  std::unordered_map<int, vector<std::pair<int, int>>> paramid2slices;
+  int slice_id = 0;
+  auto it = slices.begin();
+  for (auto param : params) {
+    if (param->id() == param->owner()) {
+      int len = 0;
+      while (len < param->size() && it != slices.end()) {
+        paramid2slices[param->id()].push_back(std::make_pair(slice_id++, *it));
+        len += *it;
+        it++;
+      }
+      CHECK_EQ(param->size(), len) << "length mismatch for ID=" << param->id();
+    }
+  }
+  for (auto param : params) {
+    for (auto entry : paramid2slices[param->owner()]) {
+      param->AddSlice(entry.first, entry.second);
+      LOG(INFO) << "param id " << param->id() << " owner=" << param->owner()
+        << ", slice id = " << entry.first << ", size = " << entry.second;
+    }
+  }
+}
+
 void Param::Setup(const vector<int>& shape) {
   data_ = std::make_shared<Blob<float>>(shape);
   grad_.Reshape(shape);
@@ -329,14 +375,14 @@ Msg* Param::HandleSyncMsg(Msg** msg, bool reserve) {
 }
 
 int Param::ParseGetResponseMsg(Msg *msg, int slice_idx) {
-  CHECK_EQ(pending_get_[slice_idx], true);
+  CHECK(pending_get_[slice_idx]) << slice_idx;
   pending_get_[slice_idx] = false;
   ParseResponseMsg(msg, slice_idx);
   return (--num_pending_requests_) % num_slices_ == 0;
 }
 
 int Param::ParseUpdateResponseMsg(Msg *msg, int slice_idx) {
-  CHECK_EQ(pending_update_[slice_idx], true);
+  CHECK(pending_update_[slice_idx]) << id() << " " << slice_idx;
   pending_update_[slice_idx] = false;
   ParseResponseMsg(msg, slice_idx);
   return (--num_pending_requests_) % num_slices_ == 0;

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/321ef96a/src/worker.cc
----------------------------------------------------------------------
diff --git a/src/worker.cc b/src/worker.cc
new file mode 100644
index 0000000..153e1a1
--- /dev/null
+++ b/src/worker.cc
@@ -0,0 +1,410 @@
+/************************************************************
+*
+* Licensed to the Apache Software Foundation (ASF) under one
+* or more contributor license agreements.  See the NOTICE file
+* distributed with this work for additional information
+* regarding copyright ownership.  The ASF licenses this file
+* to you under the Apache License, Version 2.0 (the
+* "License"); you may not use this file except in compliance
+* with the License.  You may obtain a copy of the License at
+*
+*   http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing,
+* software distributed under the License is distributed on an
+* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+* KIND, either express or implied.  See the License for the
+* specific language governing permissions and limitations
+* under the License.
+*
+*************************************************************/
+
+#include "./worker.h"
+
+#include <glog/logging.h>
+#include <chrono>
+#include <thread>
+#include <typeinfo>
+#include "utils/cluster.h"
+#include "utils/factory.h"
+#include "utils/singleton.h"
+
+namespace singa {
+
+using std::string;
+
+Worker* Worker::Create(const AlgProto& conf) {
+  auto factory = Singleton<Factory<singa::Worker>>::Instance();
+  Worker* worker = nullptr;
+  if (conf.has_user_alg())
+    worker = factory->Create(conf.user_alg());
+  else
+    worker = factory->Create(conf.alg());
+  return worker;
+}
+
+void Worker::Setup(int grp_id, int id, const JobProto& conf,
+    NeuralNet* train_net, NeuralNet* val_net, NeuralNet* test_net) {
+  grp_id_ = grp_id;
+  id_ = id;
+  job_conf_ = conf;
+  train_net_ = train_net;
+  val_net_ = val_net;
+  test_net_ = test_net;
+  layer_dealer_ = dealer_ = nullptr;
+}
+
+Worker::~Worker() {
+  if (layer_dealer_)
+    delete layer_dealer_;
+  if (dealer_)
+    delete dealer_;
+}
+
+void Worker::InitNetParams(const JobProto& job_conf, NeuralNet* net) {
+  // for each server grp, its first subscriber worker grp does the param init
+  if (grp_id_ % Cluster::Get()->nworker_groups_per_server_group() == 0) {
+    // extract params that should be initialized by this worker
+    // must gen a name for each param if the user doesn't config it
+    std::unordered_map<string, Param*> name2param;
+    for (auto layer : net->layers()) {
+      if (layer->partition_id() == id_) {
+        for (auto param : layer->GetParams()) {
+          // only owners fill the memory of parameter values.
+          if (param->owner() == param->id()) {
+            CHECK(name2param.find(param->name()) == name2param.end());
+            name2param[param->name()] = param;
+          }
+        }
+      }
+    }
+    // load from checkpoints. get param blob based on param name.
+    // the param from previous checkpoint files will be overwritten by
+    // the param with the same name in later checkpoint files.
+    for (const auto path : job_conf.checkpoint_path()) {
+      LOG(ERROR) << "Load from checkpoint file " << path;
+      BlobProtos bps;
+      ReadProtoFromBinaryFile(path.c_str(), &bps);
+      for (int i = 0; i < bps.name_size(); i++) {
+        if (name2param.find(bps.name(i)) != name2param.end()) {
+          name2param.at(bps.name(i))->FromProto(bps.blob(i));
+          //  if load from pre-training params, reset version to start step
+          if (job_conf.reset_param_version())
+            name2param.at(bps.name(i))->set_version(job_conf.step());
+          else  // if resume training, use the same version as last checkpoint
+            name2param.at(bps.name(i))->set_version(bps.version(i));
+        }
+      }
+    }
+    // init other params who do not have checkpoint version
+    for (auto entry : name2param)
+      if (entry.second->version() < 0) {
+        entry.second->InitValues(job_conf.step());
+        if (!job_conf.reset_param_version())
+          LOG(ERROR) << "better reset version of params from checkpoints "
+            << "to the same as other newly initialized params!";
+      }
+
+    // warmup training before put params to servers
+    for (; step_ < job_conf.warmup_steps(); step_++)
+      TrainOneBatch(step_, net);
+    for (auto layer : net->layers()) {
+      if (layer->partition_id() == id_)
+        for (auto param : layer->GetParams())
+          if (param->owner() == param->id())
+            Put(param->version(), param);
+    }
+  }
+  // wait owners in the same procs init params, then no get requests sent
+  std::this_thread::sleep_for(std::chrono::milliseconds(1000));
+  for (auto layer : net->layers()) {
+    if (layer->partition_id() == id_)
+      for (auto param : layer->GetParams())
+        Get(job_conf.warmup_steps(), param);
+  }
+}
+
+void ConnectStub(int grp, int id, Dealer* dealer, EntityType entity) {
+  dealer->Connect(kInprocRouterEndpoint);
+  Msg* ping = new Msg(Addr(grp, id, entity), Addr(-1, -1, kStub));
+  ping->set_type(kConnect);
+  dealer->Send(&ping);
+}
+
+void Worker::Run() {
+  LOG(ERROR) << "Worker (group = " << grp_id_ <<", id = " << id_ << ") start";
+  auto cluster = Cluster::Get();
+  int svr_grp = grp_id_ / cluster->nworker_groups_per_server_group();
+  CHECK(cluster->runtime()->JoinSGroup(grp_id_, id_, svr_grp));
+  // TODO(wangsh): provide a unique sock id from cluster
+  dealer_ = new Dealer(0);
+  ConnectStub(grp_id_, id_, dealer_, kWorkerParam);
+  for (auto layer : train_net_->layers()) {
+    if (layer->partition_id() == id_) {
+      if (typeid(layer) == typeid(BridgeDstLayer)
+          || typeid(layer) == typeid(BridgeSrcLayer)) {
+        // TODO(wangsh): provide a unique socket id from cluster
+        layer_dealer_ = new Dealer(1);
+        ConnectStub(grp_id_, id_, layer_dealer_, kWorkerLayer);
+        break;
+      }
+    }
+  }
+
+  step_ = job_conf_.step();
+  InitNetParams(job_conf_, train_net_);
+  while (!StopNow(step_)) {
+    if (ValidateNow(step_) && val_net_ != nullptr) {
+      CollectAll(step_, val_net_);
+      for (int step = 0; step < job_conf_.validate_steps(); step++)
+        TestOneBatch(step, kVal, val_net_);
+      Display(kVal, "Validation @ step " + std::to_string(step_), val_net_);
+    }
+    if (TestNow(step_) && test_net_ != nullptr) {
+      CollectAll(step_, test_net_);
+      for (int step = 0; step < job_conf_.test_steps(); step++)
+        TestOneBatch(step, kTest, test_net_);
+      Display(kTest, "Test @ step " + std::to_string(step_), test_net_);
+    }
+    if (CheckpointNow(step_) && grp_id_ == 0) {
+      CollectAll(step_, train_net_);
+      Checkpoint(step_, Cluster::Get()->checkpoint_folder(), train_net_);
+      job_conf_.set_step(step_);
+    }
+    TrainOneBatch(step_, train_net_);
+    if (DisplayNow(step_) && grp_id_ == 0 && id_ == 0)
+      Display(kTrain, "Train @ step " + std::to_string(step_), train_net_);
+    step_++;
+  }
+
+  // save the model
+  if (grp_id_ == 0)
+    Checkpoint(step_, Cluster::Get()->checkpoint_folder(), train_net_);
+  // clean up
+  cluster->runtime()->LeaveSGroup(grp_id_, id_, svr_grp);
+  // notify the stub on worker stop
+  Msg* msg = new Msg(Addr(grp_id_, id_, kWorkerParam), Addr(-1, -1, kStub));
+  msg->set_type(kStop);
+  dealer_->Send(&msg);  // use param dealer to send the stop msg
+  LOG(ERROR) << "Worker (group = " <<grp_id_ << ", id = " << id_ << ") stops";
+}
+
+void Worker::Checkpoint(int step, const std::string& folder, NeuralNet* net) {
+  BlobProtos bps;
+  for (auto layer : net->layers()) {
+    if (layer->partition_id() == id_) {
+      for (auto param : layer->GetParams()) {
+        // only owners fill the memory of parameter values.
+        if (param->owner() == param->id()) {
+          auto *blob = bps.add_blob();
+          param->ToProto(blob);
+          bps.add_version(param->version());
+          bps.add_name(param->name());
+        }
+      }
+    }
+  }
+  char buf[256];
+  snprintf(buf, sizeof(buf), "%s/step%d-worker%d", folder.c_str(), step, id_);
+  LOG(INFO) << "checkpoint to " << buf;
+  WriteProtoToBinaryFile(bps, buf);
+}
+
+int Worker::Put(int step, Param* param) {
+  if (dealer_ == nullptr) {
+    LOG(ERROR) << "Null dealer in worker (" << grp_id_ << ", " << id_ << ")";
+    return 1;
+  }
+  Msg* msg = new Msg(Addr(grp_id_, id_, kWorkerParam), Addr(-1, -1, kStub));
+  msg->set_trgt(ParamTrgt(param->owner(), 0), step);
+  msg->set_type(kPut);
+  dealer_->Send(&msg);
+  return 1;
+}
+
+int Worker::Get(int step, Param* param) {
+  if (param->version() >= step)
+    return 1;
+  if (dealer_ == nullptr) {
+    LOG(ERROR) << "Null dealer in worker (" << grp_id_ << ", " << id_ << ")";
+    return 1;
+  }
+  Msg* msg = new Msg(Addr(grp_id_, id_, kWorkerParam), Addr(-1, -1, kStub));
+  msg->set_trgt(ParamTrgt(param->owner(), 0), step);
+  msg->set_type(kGet);
+  dealer_->Send(&msg);
+  return 1;
+}
+
+int Worker::Update(int step, Param* param) {
+  param->set_local_version(param->version());
+  if (dealer_ == nullptr) {
+    LOG(ERROR) << "Null dealer in worker (" << grp_id_ << ", " << id_ << ")";
+    return 1;
+  }
+  Msg* msg = new Msg(Addr(grp_id_, id_, kWorkerParam), Addr(-1, -1, kStub));
+  msg->set_trgt(ParamTrgt(param->owner(), 0), step);
+  msg->set_type(kUpdate);
+  dealer_->Send(&msg);
+  return 1;
+}
+
+int Worker::CollectAll(int step, NeuralNet* net) {
+  auto& layers = net->layers();
+  for (auto& layer : layers) {
+    if (layer->partition_id() == id_) {
+      for (Param* p : layer->GetParams()) {
+        Collect(step, p);
+      }
+    }
+  }
+  return 1;
+}
+
+int Worker::Collect(int step, Param* param) {
+  while (param->version() <= param->local_version())
+    std::this_thread::sleep_for(std::chrono::milliseconds(kCollectSleepTime));
+  return 1;
+}
+
+void Worker::Display(int flag, const std::string& prefix, NeuralNet* net) {
+  for (auto layer : net->layers()) {
+    if (layer->partition_id() == id_) {
+      const string& disp = layer->ToString(false, flag);
+      if (disp.length())
+        LOG(ERROR) << prefix << ": " << disp;
+      if (job_conf_.debug()) {
+        const string& info = layer->ToString(true, flag);
+        if (info.length()) {
+          LOG(INFO) <<  prefix << info;
+        }
+      }
+    }
+  }
+}
+
+void Worker::ReceiveBlobs(bool data, bool grad, BridgeLayer* layer,
+                          NeuralNet* net) {
+  if (layer_dealer_ == nullptr) {
+    LOG(ERROR) << "Null dealer in worker (" << grp_id_ << ", " << id_ << ")";
+  }
+  while (!layer->ready()) {
+    auto msg = layer_dealer_->Receive();
+    CHECK_EQ(AddrGrp(msg->src()), grp_id_);
+    string name(static_cast<char*>(msg->FrameData()), msg->FrameSize());
+    auto receive_layer = net->name2layer(name);
+    auto data = receive_layer->mutable_data(nullptr);
+    msg->NextFrame();
+    memcpy(data->mutable_cpu_data(), msg->FrameData(), msg->FrameSize());
+    dynamic_cast<BridgeLayer*>(receive_layer)->set_ready(true);
+    delete msg;
+  }
+}
+
+void Worker::SendBlobs(bool data, bool grad, BridgeLayer* layer,
+                       NeuralNet* net) {
+  if (layer_dealer_ == nullptr) {
+    LOG(ERROR) << "Null dealer in worker (" << grp_id_ << ", " << id_ << ")";
+  }
+  auto dst = net->srclayers(layer).at(0);
+  Msg *msg = new Msg();
+  msg->set_src(Addr(grp_id_, id_, kWorkerLayer));
+  msg->set_dst(Addr(grp_id_, dst->partition_id(), kWorkerLayer));
+  msg->AddFrame(dst->name().c_str(), dst->name().length());
+  auto const & blob = layer->data(nullptr);
+  msg->AddFrame(blob.cpu_data(), blob.count() * sizeof(float));
+  layer_dealer_->Send(&msg);
+}
+
+/****************************BPWorker**********************************/
+void BPWorker::TrainOneBatch(int step, NeuralNet* net) {
+  Forward(step, kTrain, net);
+  Backward(step, net);
+}
+
+void BPWorker::TestOneBatch(int step, Phase phase, NeuralNet* net) {
+  Forward(step, phase, net);
+}
+
+void BPWorker::Forward(int step, Phase phase, NeuralNet* net) {
+  for (auto& layer : net->layers()) {
+    if (layer->partition_id() == id_) {
+      // TODO(wangwei): enable this for model partition
+      // recv data from other workers
+      // if (typeid(*layer) == typeid(BridgeDstLayer))
+      //   ReceiveBlobs(true, false, dynamic_cast<BridgeLayer*>(layer), net);
+      if (phase == kTrain) {
+        // wait until param is updated
+        for (Param* p : layer->GetParams()) {
+          Collect(step, p);
+        }
+      }
+      layer->ComputeFeature(phase | kForward, net->srclayers(layer));
+      // TODO(wangwei): enable this for model partition
+      // send data to other workers
+      // if (typeid(*layer) == typeid(BridgeSrcLayer))
+      //   SendBlobs(true, false, dynamic_cast<BridgeLayer*>(layer), net);
+    }
+  }
+}
+
+void BPWorker::Backward(int step, NeuralNet* net) {
+  auto& layers = net->layers();
+  for (auto it = layers.rbegin(); it != layers.rend(); it++) {
+    Layer* layer = *it;
+    if (layer->partition_id() == id_) {
+      // TODO(wangwei): enable this for model partition
+      // send data to other workers
+      // if (typeid(layer) == typeid(BridgeSrcLayer))
+      //   ReceiveBlobs(false, true, layer, net);
+      layer->ComputeGradient(kTrain | kBackward, net->srclayers(layer));
+      for (Param* p : layer->GetParams())
+        Update(step, p);
+      // TODO(wangwei): enable this for model partition
+      // recv data from other workers
+      // if (typeid(layer) == typeid(BridgeDstLayer))
+      //   SendBlobs(false, true, dynamic_cast<BridgeDstLayer*>(layer), net);
+    }
+  }
+}
+
+/****************************CDWorker**********************************/
+void CDWorker::TrainOneBatch(int step, NeuralNet* net) {
+  const auto& layers = net->layers();
+  for (auto* layer : layers) {
+    for (Param* p : layer->GetParams())  // wait until param is updated
+      Collect(step, p);
+    layer->ComputeFeature(kPositive, net->srclayers(layer));
+  }
+  for (auto* layer : layers)
+    if (typeid(*layer) == typeid(RBMVisLayer)
+          || typeid(*layer) == typeid(RBMHidLayer))
+      layer->ComputeFeature(kNegative | kTest, net->srclayers(layer));
+  for (int i = 1; i < job_conf_.train_one_batch().cd_conf().cd_k(); i++) {
+    for (auto* layer : layers) {
+      if (typeid(*layer) == typeid(RBMVisLayer)
+          || typeid(*layer) == typeid(RBMHidLayer))
+      layer->ComputeFeature(kNegative, net->srclayers(layer));
+    }
+  }
+  for (auto* layer : layers) {
+    if (typeid(*layer) == typeid(RBMVisLayer)
+        || typeid(*layer) == typeid(RBMHidLayer)) {
+      layer->ComputeGradient(kTrain, net->srclayers(layer));
+      for (Param* p : layer->GetParams()) {
+        Update(step, p);
+      }
+    }
+  }
+}
+
+void CDWorker::TestOneBatch(int step, Phase phase, NeuralNet* net) {
+  auto& layers = net->layers();
+  for (auto *layer : layers)
+    layer->ComputeFeature(kPositive, net->srclayers(layer));
+  for (auto *layer : layers)
+    if (typeid(*layer) == typeid(RBMVisLayer))
+      layer->ComputeFeature(kNegative | kTest, net->srclayers(layer));
+}
+
+}  // namespace singa


[03/13] incubator-singa git commit: SINGA-72 Minor updates to be consistent with documentation

Posted by wa...@apache.org.
SINGA-72 Minor updates to be consistent with documentation

* Update driver.cc to remove the dependency on OpenBLAS's cblas.h header,
which was included only for openblas_set_num_threads() (see the sketch below).
* Update README.md to add comments and instructions for users to install
SINGA and try one example.
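
As a reference for the driver.cc change above, here is a minimal,
self-contained sketch of the approach (not the actual driver.cc, which is
shown in the src/driver.cc hunk at the end of this commit); the
SetBlasThreads helper and the thread count are made up for illustration:

    // Forward-declare the single OpenBLAS entry point that is needed, so that
    // cblas.h is no longer required at compile time; the symbol is resolved
    // when linking against libopenblas (e.g., g++ sketch.cc -lopenblas).
    extern "C" void openblas_set_num_threads(int);

    // Hypothetical helper, for illustration only.
    void SetBlasThreads(int nthreads) {
      openblas_set_num_threads(nthreads);
    }

    int main() {
      SetBlasThreads(1);  // e.g., restrict OpenBLAS to a single thread
      return 0;
    }

The committed change simply replaces #include <cblas.h> with the extern
declaration, as shown in the diff below.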


Project: http://git-wip-us.apache.org/repos/asf/incubator-singa/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-singa/commit/6bb1a8a4
Tree: http://git-wip-us.apache.org/repos/asf/incubator-singa/tree/6bb1a8a4
Diff: http://git-wip-us.apache.org/repos/asf/incubator-singa/diff/6bb1a8a4

Branch: refs/heads/master
Commit: 6bb1a8a4ad713c157bf0617f9c5651094f9cf9f5
Parents: 3d1b0dc
Author: Wei Wang <wa...@comp.nus.edu.sg>
Authored: Wed Sep 23 21:24:05 2015 +0800
Committer: Wei Wang <wa...@comp.nus.edu.sg>
Committed: Sat Sep 26 23:23:41 2015 +0800

----------------------------------------------------------------------
 README.md     | 107 +++++++++++++++++++++++++++++++++++++++++++++++------
 RELEASE_NOTES |   6 +--
 src/driver.cc |   3 +-
 3 files changed, 100 insertions(+), 16 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/6bb1a8a4/README.md
----------------------------------------------------------------------
diff --git a/README.md b/README.md
index b7d4528..080f4f5 100644
--- a/README.md
+++ b/README.md
@@ -14,13 +14,13 @@ All the details can be found in [Project Website](http://singa.incubator.apache.
 
 ##Dependencies
 The current code depends on the following external libraries:
+
   * glog (New BSD)
   * google-protobuf (New BSD)
   * openblas (New BSD)
   * zeromq (LGPLv3 + static link exception)
   * czmq (Mozilla Public License Version 2.0)
   * zookeeper (Apache 2.0)
-  * lmdb (OpenLDAP)
 
 ##Documentation
 
@@ -28,10 +28,11 @@ Full documentation is available online at [Official Documentation](https://singa
 
 
 ##Building SINGA
-	
-	$ ./autogen.sh
-	$ ./configure
-	$ make
+
+    $ ./autogen.sh (optional)
+    # please refer to the FAQ below for solutions to common errors
+    $ ./configure
+    $ make
 
 ##Running Examples
 
@@ -43,19 +44,101 @@ at [CNN example](http://singa.incubator.apache.org/docs/cnn).
 
 First, download the dataset and create data shards:
 
-	$ cd examples/cifar10/
-	$ make download
-	$ make create
+    $ cd examples/cifar10/
+    $ cp Makefile.example Makefile
+    $ make download
+    $ make create
+
+If it reports errors due to libopenblas.so missing, then include the
+lib folder of OpenBLAS in LD_LIBRARY_PATH
 
-Next, start the training: 
+    $ export LD_LIBRARY_PATH=OPENBLAS_FOLDER/lib:$LD_LIBRARY_PATH
+    # delete the newly created folders
+    $ rm -rf cifar10_t*
+    $ make create
 
-	$ cd ../../
-    	$ ./bin/singa-run.sh -conf examples/cifar10/job.conf
+Next, start the training:
 
-Now we just need to wait until it is done! 
+    $ cd ../../
+    $ ./bin/zk-service.sh start
+    $ ./bin/singa-run.sh -conf examples/cifar10/job.conf
+
+Now we just need to wait until it is done!
 
 ##LICENSE
 
 Apache Singa is licensed under the [Apache License, Version 2.0](http://www.apache.org/licenses/LICENSE-2.0).
 
 For additional information, see the LICENSE and NOTICE files.
+
+## FAQ
+
+* Q1: I get the error `./configure --> cannot find blas_segmm() function` even
+though I run `install.sh OpenBLAS`.
+
+  A1: The `OpenBLAS` library is installed in the `/opt` folder by default, or
+  in another folder if you installed it via `sudo apt-get install`.
+  You need to include the OpenBLAS library folder in the LDFLAGS.
+
+      $ export LDFLAGS=-L/opt/OpenBLAS/lib
+
+  Alternatively, you can include the path in LIBRARY_PATH.
+
+
+* Q2: I get the error `cblas.h: no such file or directory`.
+
+  A2: You need to include the folder containing cblas.h (e.g., /opt/OpenBLAS/include)
+  in CPLUS_INCLUDE_PATH
+
+      $ export CPLUS_INCLUDE_PATH=/opt/OpenBLAS/include:$CPLUS_INCLUDE_PATH
+      # reconfigure and make
+      $ ./configure
+      $ make
+
+
+* Q3: While compiling SINGA, I get the error `SSE2 instruction set not enabled`.
+
+  A3: You can try the following command:
+
+      $ make CFLAGS='-msse2' CXXFLAGS='-msse2'
+
+
+* Q4: I get `ImportError: cannot import name enum_type_wrapper` from
+google.protobuf.internal when I try to import .py files.
+
+  A4: After installing google protobuf via `make install`, you also need to
+  install the python runtime libraries. Go to the protobuf source directory and run:
+
+      $ cd /PROTOBUF/SOURCE/FOLDER
+      $ cd python
+      $ python setup.py build
+      $ python setup.py install
+
+  You may need `sudo` when you try to install python runtime libraries in
+  the system folder.
+
+
+* Q5: I get a linking error caused by gflags.
+
+  A5: SINGA does not depend on gflags, but you may have installed glog with
+  gflags support. In that case you can reinstall glog using *thirdparty/install.sh*
+  into another folder and export LDFLAGS and CPPFLAGS to include that folder.
+
+
+* Q6: While compiling SINGA and installing `glog` on Mac OS X, I get the fatal error
+`'ext/slist' file not found`
+
+  A6: Please install `glog` separately and try:
+
+      $ make CFLAGS='-stdlib=libstdc++' CXXFLAGS='-stdlib=libstdc++'
+
+* Q7: When I start a training job, it reports an error related to "ZOO_ERROR...zk retcode=-4...".
+
+  A7: This is because the zookeeper service has not been started. Please start it:
+
+      $ ./bin/zk-service.sh start
+
+  If the error still exists, you probably do not have Java installed. You can
+  simply check this by running
+
+      $ java -version

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/6bb1a8a4/RELEASE_NOTES
----------------------------------------------------------------------
diff --git a/RELEASE_NOTES b/RELEASE_NOTES
index 2425cd0..8c5eae0 100644
--- a/RELEASE_NOTES
+++ b/RELEASE_NOTES
@@ -1,9 +1,9 @@
-Release Notes - SINGA - Version singa-incubating-0.1-rc0
+Release Notes - SINGA - Version singa-incubating-0.1-rc1
 -----------------------------------------
 
 SINGA is a general distributed deep learning platform for training big deep learning models over large datasets. It is
 designed with an intuitive programming model based on the layer abstraction. SINGA supports a wide variety of popular
-deep learning models. 
+deep learning models.
 
 This release includes following features:
 
@@ -75,7 +75,7 @@ Some bugs are fixed during the development of this release
   * [SINGA-49] Fix a bug in HandlePutMsg func that sets param fields to invalid values
   * [SINGA-66] Fix bugs in Worker::RunOneBatch function and ClusterProto
 
-Features planned for the next release 
+Features planned for the next release
   * [SINGA-11] Start SINGA using Mesos
   * [SINGA-31] Extend Blob to support xpu (cpu or gpu)
   * [SINGA-35] Add random number generators

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/6bb1a8a4/src/driver.cc
----------------------------------------------------------------------
diff --git a/src/driver.cc b/src/driver.cc
index 41b2342..6fa70ee 100644
--- a/src/driver.cc
+++ b/src/driver.cc
@@ -21,7 +21,6 @@
 
 #include "driver.h"
 
-#include <cblas.h>
 #include <glog/logging.h>
 #include <string>
 #include "neuralnet/layer.h"
@@ -29,6 +28,8 @@
 #include "utils/common.h"
 #include "utils/tinydir.h"
 
+extern "C" void openblas_set_num_threads(int);
+
 namespace singa {
 
 void Driver::Init(int argc, char **argv) {


[08/13] incubator-singa git commit: SINGA-70 Refactor API of Layer, Worker, Server and Driver

Posted by wa...@apache.org.
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/321ef96a/include/neuralnet/neuron_layer.h
----------------------------------------------------------------------
diff --git a/include/neuralnet/neuron_layer.h b/include/neuralnet/neuron_layer.h
index 6c4647d..51ba304 100644
--- a/include/neuralnet/neuron_layer.h
+++ b/include/neuralnet/neuron_layer.h
@@ -7,9 +7,9 @@
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
-* 
+*
 *   http://www.apache.org/licenses/LICENSE-2.0
-* 
+*
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
@@ -38,9 +38,9 @@ class ConvolutionLayer : public NeuronLayer {
  public:
   ~ConvolutionLayer();
 
-  void Setup(const LayerProto& proto, int npartitions) override;
-  void ComputeFeature(int flag, Metric* perf) override;
-  void ComputeGradient(int flag, Metric* perf) override;
+  void Setup(const LayerProto& proto, const vector<Layer*>& srclayers) override;
+  void ComputeFeature(int flag, const vector<Layer*>& srclayers) override;
+  void ComputeGradient(int flag, const vector<Layer*>& srclayers) override;
   const std::vector<Param*> GetParams() const override {
     std::vector<Param*> params{weight_, bias_};
     return params;
@@ -63,15 +63,15 @@ class ConvolutionLayer : public NeuronLayer {
  */
 class CConvolutionLayer : public ConvolutionLayer {
  public:
-  void ComputeFeature(int flag, Metric* perf) override;
-  void ComputeGradient(int flag, Metric* perf) override;
+  void ComputeFeature(int flag, const vector<Layer*>& srclayers) override;
+  void ComputeGradient(int flag, const vector<Layer*>& srclayers) override;
 };
 
 class DropoutLayer : public NeuronLayer {
  public:
-  void Setup(const LayerProto& proto, int npartitions) override;
-  void ComputeFeature(int flag, Metric* perf) override;
-  void ComputeGradient(int flag, Metric* perf) override;
+  void Setup(const LayerProto& proto, const vector<Layer*>& srclayers) override;
+  void ComputeFeature(int flag, const vector<Layer*>& srclayers) override;
+  void ComputeGradient(int flag, const vector<Layer*>& srclayers) override;
  protected:
   // drop probability
   float pdrop_;
@@ -90,9 +90,9 @@ class DropoutLayer : public NeuronLayer {
  * b_i, the neuron after normalization, N is the total num of kernels
  */
 class LRNLayer : public NeuronLayer {
-  void Setup(const LayerProto& proto, int npartitions) override;
-  void ComputeFeature(int flag, Metric *perf) override;
-  void ComputeGradient(int flag, Metric* perf) override;
+  void Setup(const LayerProto& proto, const vector<Layer*>& srclayers) override;
+  void ComputeFeature(int flag, const vector<Layer*>& srclayers) override;
+  void ComputeGradient(int flag, const vector<Layer*>& srclayers) override;
 
  protected:
   //! shape of the bottom layer feature
@@ -106,9 +106,9 @@ class LRNLayer : public NeuronLayer {
 
 class PoolingLayer : public NeuronLayer {
  public:
-  void Setup(const LayerProto& proto, int npartitions) override;
-  void ComputeFeature(int flag, Metric *perf) override;
-  void ComputeGradient(int flag, Metric* perf) override;
+  void Setup(const LayerProto& proto, const vector<Layer*>& srclayers) override;
+  void ComputeFeature(int flag, const vector<Layer*>& srclayers) override;
+  void ComputeGradient(int flag, const vector<Layer*>& srclayers) override;
 
  protected:
   int kernel_, pad_, stride_;
@@ -121,26 +121,26 @@ class PoolingLayer : public NeuronLayer {
  */
 class CPoolingLayer : public PoolingLayer {
  public:
-  void Setup(const LayerProto& proto, int npartitions);
-  void ComputeFeature(int flag, Metric *perf) override;
-  void ComputeGradient(int flag, Metric* perf) override;
+  void Setup(const LayerProto& proto, const vector<Layer*>& srclayers);
+  void ComputeFeature(int flag, const vector<Layer*>& srclayers) override;
+  void ComputeGradient(int flag, const vector<Layer*>& srclayers) override;
  private:
   Blob<float> mask_;
 };
 
 class ReLULayer : public NeuronLayer {
  public:
-  void Setup(const LayerProto& proto, int npartitions) override;
-  void ComputeFeature(int flag, Metric *perf) override;
-  void ComputeGradient(int flag, Metric* perf) override;
+  void Setup(const LayerProto& proto, const vector<Layer*>& srclayers) override;
+  void ComputeFeature(int flag, const vector<Layer*>& srclayers) override;
+  void ComputeGradient(int flag, const vector<Layer*>& srclayers) override;
 };
 
 class InnerProductLayer : public NeuronLayer {
  public:
   ~InnerProductLayer();
-  void Setup(const LayerProto& proto, int npartitions) override;
-  void ComputeFeature(int flag, Metric* perf) override;
-  void ComputeGradient(int flag, Metric* perf) override;
+  void Setup(const LayerProto& proto, const vector<Layer*>& srclayers) override;
+  void ComputeFeature(int flag, const vector<Layer*>& srclayers) override;
+  void ComputeGradient(int flag, const vector<Layer*>& srclayers) override;
   const std::vector<Param*> GetParams() const override {
     std::vector<Param*> params{weight_, bias_};
     return params;
@@ -159,9 +159,9 @@ class InnerProductLayer : public NeuronLayer {
  */
 class STanhLayer : public NeuronLayer {
  public:
-  void Setup(const LayerProto& proto, int npartitions) override;
-  void ComputeFeature(int flag, Metric *perf) override;
-  void ComputeGradient(int flag, Metric* perf) override;
+  void Setup(const LayerProto& proto, const vector<Layer*>& srclayers) override;
+  void ComputeFeature(int flag, const vector<Layer*>& srclayers) override;
+  void ComputeGradient(int flag, const vector<Layer*>& srclayers) override;
 };
 
 /**
@@ -174,19 +174,19 @@ class SigmoidLayer: public Layer {
   using Layer::ComputeFeature;
   using Layer::ComputeGradient;
 
-  void Setup(const LayerProto& proto, int npartitions) override;
-  void ComputeFeature(int flag, Metric* perf) override;
-  void ComputeGradient(int flag, Metric* perf) override;
+  void Setup(const LayerProto& proto, const vector<Layer*>& srclayers) override;
+  void ComputeFeature(int flag, const vector<Layer*>& srclayers) override;
+  void ComputeGradient(int flag, const vector<Layer*>& srclayers) override;
 };
 
 
 /**
  * Base layer for RBM models.
  */
-class RBMLayer: public Layer {
+class RBMLayer: virtual public Layer {
  public:
   virtual ~RBMLayer() {}
-  void Setup(const LayerProto& proto, int npartitions) override;
+  void Setup(const LayerProto& proto, const vector<Layer*>& srclayers) override;
   const Blob<float>& neg_data(const Layer* layer) {
     return neg_data_;
   }
@@ -218,12 +218,12 @@ class RBMLayer: public Layer {
 /**
  * RBM visible layer
  */
-class RBMVisLayer: public RBMLayer {
+class RBMVisLayer: public RBMLayer, public LossLayer {
  public:
   ~RBMVisLayer();
-  void Setup(const LayerProto& proto, int npartitions) override;
-  void ComputeFeature(int flag, Metric* perf) override;
-  void ComputeGradient(int flag, Metric* perf) override;
+  void Setup(const LayerProto& proto, const vector<Layer*>& srclayers) override;
+  void ComputeFeature(int flag, const vector<Layer*>& srclayers) override;
+  void ComputeGradient(int flag, const vector<Layer*>& srclayers) override;
 
  private:
   RBMLayer* hid_layer_;
@@ -235,9 +235,9 @@ class RBMVisLayer: public RBMLayer {
 class RBMHidLayer: public RBMLayer {
  public:
   ~RBMHidLayer();
-  void Setup(const LayerProto& proto, int npartitions) override;
-  void ComputeFeature(int flag, Metric* perf) override;
-  void ComputeGradient(int flag, Metric* perf) override;
+  void Setup(const LayerProto& proto, const vector<Layer*>& srclayers) override;
+  void ComputeFeature(int flag, const vector<Layer*>& srclayers) override;
+  void ComputeGradient(int flag, const vector<Layer*>& srclayers) override;
 
  private:
   RBMLayer *vis_layer_;
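
An illustrative sketch (not from the patch) of a user-defined layer written
against the refactored API above, where source layers are passed explicitly
instead of an npartitions argument; it assumes these three overrides are all a
concrete NeuronLayer subclass needs:

    #include <vector>
    #include "neuralnet/neuron_layer.h"

    class MyIdentityLayer : public singa::NeuronLayer {
     public:
      void Setup(const singa::LayerProto& proto,
                 const std::vector<singa::Layer*>& srclayers) override {
        // e.g., reshape this layer's blobs to match srclayers[0]
      }
      void ComputeFeature(int flag,
                          const std::vector<singa::Layer*>& srclayers) override {
        // read the feature blob of srclayers[0] and fill this layer's feature
      }
      void ComputeGradient(int flag,
                           const std::vector<singa::Layer*>& srclayers) override {
        // propagate this layer's gradient back to srclayers[0]
      }
    };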

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/321ef96a/include/server.h
----------------------------------------------------------------------
diff --git a/include/server.h b/include/server.h
new file mode 100644
index 0000000..4b75430
--- /dev/null
+++ b/include/server.h
@@ -0,0 +1,133 @@
+/************************************************************
+*
+* Licensed to the Apache Software Foundation (ASF) under one
+* or more contributor license agreements.  See the NOTICE file
+* distributed with this work for additional information
+* regarding copyright ownership.  The ASF licenses this file
+* to you under the Apache License, Version 2.0 (the
+* "License"); you may not use this file except in compliance
+* with the License.  You may obtain a copy of the License at
+*
+*   http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing,
+* software distributed under the License is distributed on an
+* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+* KIND, either express or implied.  See the License for the
+* specific language governing permissions and limitations
+* under the License.
+*
+*************************************************************/
+
+#ifndef SINGA_SERVER_H_
+#define SINGA_SERVER_H_
+
+#include <unordered_map>
+#include <vector>
+#include "comm/socket.h"
+#include "proto/job.pb.h"
+#include "utils/param.h"
+#include "utils/updater.h"
+
+namespace singa {
+
+ /* Respond to workers' get/put/update requests, and periodically sync with
+  * other servers.
+  *
+  * Normally, the Server creates a response message for each request, which
+  * will be sent back to the one who issued the request. However, if the request
+  * is not processed successfully, the original message will be returned. The
+  * server does not know whether the returned message is a response or the
+  * original message; it just sends it to the router. The router will decide
+  * whether to re-send the request to the server or send it to the worker.
+  */
+class Server {
+ public:
+  ~Server();
+  Server(int group_id, int server_id,
+      const JobProto& job_conf,
+      const std::vector<int>& slice2group,
+      const std::vector<int>& slice2server);
+  void Run();
+  inline int grp_id() const { return grp_id_; }
+  inline int id() const { return id_; }
+
+ protected:
+  /**
+   * Process GET request.
+   *
+   * @return the original message or a response message which contains the values
+   * of the Param with the request version.
+   */
+  Msg* HandleGet(Msg** msg);
+  /**
+   * Process Update request.
+   *
+   * It waits until it has received the gradients from all workers in the same
+   * worker group. After updating, it responds to each sender with the new Param
+   * values. It may generate a sync message to the server group that maintains
+   * the global version of the updated Param (slice).
+   *
+   * Note: there is no counter for each worker group on the number of received
+   * update requests. Hence it is possible that the server would conduct the
+   * update when it receives x requests from group a and y requests from group
+   * b where x + y = group size. To avoid this problem, we can
+   * -# maintain request list for each group for each Param at the server side
+   * -# do not span a worker group among multiple nodes. then the updates from
+   * the same group would be locally aggregated on the worker node. And the
+   * server would conduct the update immediately after receiving the aggregated
+   * request.
+   * -# launch only one worker group.
+   *
+   * @return the original message or a response message
+   */
+  const std::vector<Msg*> HandleUpdate(Msg **msg);
+  /**
+   * Process PUT request.
+   *
+   * @return the original message or response message. If we don't want to
+   * acknowledge the put request, then return nullptr.
+   */
+  Msg* HandlePut(Msg **msg);
+  /**
+   * Handle sync request from other server groups.
+   *
+   * It adds updates of Param (slice) from other server groups directly to
+   * local Param (slice). Currently, each Param (slice) has a master group,
+   * i.e., slice2group_[sliceid], which would receive such requests from all
+   * other server groups for the Param object.
+   *
+   * @param msg request msg containing the parameter updates
+   * @return response msg that contains the fresh parameter values.
+   */
+  Msg* HandleSyncRequest(Msg** msg);
+  /**
+   * Handle sync response.
+   *
+   * The response msg includes the latest values of a Param object from the
+   * server group that maintains this Param object.
+   * The local Param values are replaced with the sum of the local
+   * updates since the sync request was sent and the received Param values.
+   *
+   * @param response message
+   */
+  void HandleSyncResponse(Msg** msg);
+
+ protected:
+  int grp_id_ = -1;
+  int id_ = -1;
+  Updater* updater_ = nullptr;
+  //!< map from slice ID to slice and deleted in the destructor
+  std::unordered_map<int, ParamEntry*> shard_;
+  std::vector<int> slice2group_, slice2server_;
+  //!< num of updates from last sync with master server group for a param/slice
+  std::vector<int> n_updates_;
+  //!< num of sync requests that have not been responded
+  std::vector<int> n_pending_sync_;
+  std::vector<Blob<float>> last_sync_;
+  std::unordered_map<int, std::vector<Msg*>> buffer_requests_;
+};
+
+}  // namespace singa
+
+#endif  // SINGA_SERVER_H_
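
An illustrative sketch (not from the patch) of driving the new constructor-based
Server; the JobProto and the two slice maps are placeholders that Driver would
normally derive from the cluster configuration:

    #include <vector>
    #include "server.h"

    void RunOneServer(const singa::JobProto& job_conf) {
      std::vector<int> slice2group{0, 0};   // slice k -> master server group
      std::vector<int> slice2server{0, 0};  // slice k -> server in charge of it
      singa::Server server(0 /*group id*/, 0 /*server id*/, job_conf,
                           slice2group, slice2server);
      server.Run();  // handle get/put/update and sync messages until shutdown
    }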

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/321ef96a/include/singa.h
----------------------------------------------------------------------
diff --git a/include/singa.h b/include/singa.h
index d4ee557..6c801ab 100644
--- a/include/singa.h
+++ b/include/singa.h
@@ -7,9 +7,9 @@
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
-* 
+*
 *   http://www.apache.org/licenses/LICENSE-2.0
-* 
+*
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
@@ -22,16 +22,15 @@
 #ifndef SINGA_SINGA_H_
 #define SINGA_SINGA_H_
 
-#include "communication/socket.h"
+#include "comm/socket.h"
 #include "neuralnet/neuralnet.h"
 #include "neuralnet/layer.h"
 #include "proto/job.pb.h"
 #include "proto/singa.pb.h"
-#include "trainer/trainer.h"
 #include "utils/common.h"
 #include "utils/param.h"
 #include "utils/singleton.h"
 #include "utils/factory.h"
-#include "driver.h"
+#include "./driver.h"
 
 #endif  // SINGA_SINGA_H_

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/321ef96a/include/stub.h
----------------------------------------------------------------------
diff --git a/include/stub.h b/include/stub.h
new file mode 100644
index 0000000..719f033
--- /dev/null
+++ b/include/stub.h
@@ -0,0 +1,109 @@
+/************************************************************
+*
+* Licensed to the Apache Software Foundation (ASF) under one
+* or more contributor license agreements.  See the NOTICE file
+* distributed with this work for additional information
+* regarding copyright ownership.  The ASF licenses this file
+* to you under the Apache License, Version 2.0 (the
+* "License"); you may not use this file except in compliance
+* with the License.  You may obtain a copy of the License at
+*
+*   http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing,
+* software distributed under the License is distributed on an
+* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+* KIND, either express or implied.  See the License for the
+* specific language governing permissions and limitations
+* under the License.
+*
+*************************************************************/
+
+#ifndef SINGA_STUB_H_
+#define SINGA_STUB_H_
+
+#include <queue>
+#include <unordered_map>
+#include <vector>
+#include <string>
+#include "comm/socket.h"
+#include "neuralnet/neuralnet.h"
+#include "proto/job.pb.h"
+#include "proto/singa.pb.h"
+#include "utils/factory.h"
+#include "utils/param.h"
+#include "utils/singleton.h"
+#include "./server.h"
+#include "./worker.h"
+
+namespace singa {
+
+class Stub {
+ public:
+  ~Stub();
+  /**
+   * Find an endpoint to bind.
+   */
+  void Setup();
+  /**
+   * The Stub instance runs this function in the main thread to handle (e.g.,
+   * forward) messages from workers and servers.
+   *
+   * @param[in] slice2server the k-th value is the ID of the server that is in
+   * charge of updating the Param slice with ID k. Large Param objects are
+   * sliced into subsets for load-balance. Different subsets are updated by
+   * different servers.
+   */
+  void Run(const vector<int>& slice2server,
+      const std::vector<Worker*>& workers,
+      const std::vector<Server*>& servers);
+
+  const std::string& endpoint() const {
+    return endpoint_;
+  }
+
+ protected:
+  /**
+   * Create a socket to send msg to the specified process
+   * @param dst_procs the dst process (logical) ID
+   * @return the newly created socket
+   */
+  Dealer* CreateInterProcsDealer(int dst_procs);
+  /**
+   * Generate a request message to Get the parameter object.
+   */
+  const std::vector<Msg*> HandleGetRequest(ParamEntry* entry, Msg** msg);
+  void HandleGetResponse(ParamEntry* entry, Msg** msg);
+  /**
+   * Generate a request message to Update the parameter object.
+   */
+  const std::vector<Msg*> HandleUpdateRequest(ParamEntry* entry, Msg** msg);
+  /**
+   * Handle response msg from servers for the update requests.
+   */
+  void HandleUpdateResponse(ParamEntry* entry, Msg** msg);
+  /**
+   * Generate a request message to Put the parameter object.
+   */
+  const std::vector<Msg*> HandlePutRequest(ParamEntry* entry, Msg** msg);
+  /**
+   * Called by HandlePut, HandleUpdate and HandleGet functions
+   * @param type message type
+   * @param version param version
+   * @param entry
+   * @param msg
+   * @param ret generated messages
+   */
+  void GenMsgs(int type, int version, ParamEntry* entry,
+    Msg* msg, std::vector<Msg*> *ret);
+
+
+ protected:
+  Router *router_ = nullptr;
+  std::string endpoint_;
+  std::vector<int> slice2server_;
+};
+
+}  // namespace singa
+
+#endif  // SINGA_STUB_H_
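
An illustrative sketch (not from the patch) of the Stub's role in the refactored
design; the worker/server vectors are placeholders that Driver would normally
create from the cluster configuration:

    #include <vector>
    #include "stub.h"

    void RunStub(const std::vector<singa::Worker*>& workers,
                 const std::vector<singa::Server*>& servers,
                 const std::vector<int>& slice2server) {
      singa::Stub stub;
      stub.Setup();  // find and bind an endpoint for inter-process messages
      // stub.endpoint() is now available, e.g., for registration elsewhere
      stub.Run(slice2server, workers, servers);  // forward messages in main thread
    }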

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/321ef96a/include/trainer/server.h
----------------------------------------------------------------------
diff --git a/include/trainer/server.h b/include/trainer/server.h
deleted file mode 100644
index 84b3a41..0000000
--- a/include/trainer/server.h
+++ /dev/null
@@ -1,132 +0,0 @@
-/************************************************************
-*
-* Licensed to the Apache Software Foundation (ASF) under one
-* or more contributor license agreements.  See the NOTICE file
-* distributed with this work for additional information
-* regarding copyright ownership.  The ASF licenses this file
-* to you under the Apache License, Version 2.0 (the
-* "License"); you may not use this file except in compliance
-* with the License.  You may obtain a copy of the License at
-* 
-*   http://www.apache.org/licenses/LICENSE-2.0
-* 
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an
-* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-* KIND, either express or implied.  See the License for the
-* specific language governing permissions and limitations
-* under the License.
-*
-*************************************************************/
-
-#ifndef SINGA_TRAINER_SERVER_H_
-#define SINGA_TRAINER_SERVER_H_
-
-#include <unordered_map>
-#include <vector>
-#include "communication/socket.h"
-#include "proto/job.pb.h"
-#include "utils/param.h"
-#include "utils/updater.h"
-
-namespace singa {
-
- /* Repsond to worker's get/put/udpate request, and periodically syncing with
-  * other servers.
-  *
-  * Normally, the Server creates a response message for each request which
-  * will be sent back to the one who issued the request. However, if the request
-  * are not processed successfully, the original message will be returned. The
-  * sever does not know the returned message (response or the original message),
-  * it just sends it to the router. The router will decide to re-send the
-  * request to the server or send it to the worker.
-  */
-class Server {
- public:
-  Server(int group_id, int server_id);
-  ~Server();
-  void Setup(const UpdaterProto& proto, const std::vector<int>& slice2group,
-             const std::vector<int>& slice2server);
-  void Run();
-  inline int grp_id() const { return grp_id_; }
-  inline int id() const { return id_; }
-
- protected:
-  /**
-   * Process GET request.
-   *
-   * @return the orignal message or a response message which contains the values
-   * of the Param with the request version.
-   */
-  Msg* HandleGet(Msg** msg);
-  /**
-   * Process Update request.
-   *
-   * It waits until received the gradients from all workers from the same worker
-   * group. After updating, it responses to each sender with the new Param
-   * values. It may generate a sync message to the server group that maintains
-   * the global version of the updated Param (slice).
-   *
-   * Note: there is no counter for each worker group on the number of received
-   * update requests. Hence it is possible that the server would conduct the
-   * update when it receives x requests from group a and y requests from group
-   * b where x + y = group size. To avoid this problem, we can
-   * 1. maintain request list for each group for each Param at the server side
-   * 2. do not span a worker group among multiple nodes. then the updates from
-   * the same group would be locally aggregated on the worker node. And the
-   * server would conduct the update immediately after receiving the aggregated
-   * request.
-   * 3. launch only one worker group.
-   *
-   * @return the orignal message or response message
-   */
-  const std::vector<Msg*> HandleUpdate(Msg **msg);
-  /**
-   * Process PUT request.
-   *
-   * @return the original message or response message. If we don't want to
-   * acknowledge the put request, then return nullptr.
-   */
-  Msg* HandlePut(Msg **msg);
-  /**
-   * Handle sync request from other server groups.
-   *
-   * It adds updates of Param (slice) from other server groups directly to
-   * local Param (slice). Currently, each Param (slice) has a master group,
-   * i.e., slice2group_[sliceid], which would receive such requests from all
-   * other server groups for the Param object.
-   *
-   * @param msg request msg containing the parameter updates
-   * @return response msg that contains the fresh parameter values.
-   */
-  Msg* HandleSyncRequest(Msg** msg);
-  /**
-   * Handle sync response.
-   *
-   * The response msg includes the latest values of a Param object, for which
-   * this server sent the sync request to the master/maintainer group.
-   * The local Param values are replaced with the addition result of local
-   * udpates since the sync request was sent and the received Param values.
-   *
-   * @param response message
-   */
-  void HandleSyncResponse(Msg** msg);
-
- protected:
-  int grp_id_ = -1;
-  int id_ = -1;
-  Updater* updater_ = nullptr;
-  //!< map from slice ID to slice and deleted in the destructor
-  std::unordered_map<int, ParamEntry*> shard_;
-  std::vector<int> slice2group_, slice2server_;
-  //!< num of updates from last sync with master server group for a param/slice
-  std::vector<int> n_updates_;
-  //!< num of sync requests that have not been responded
-  std::vector<int> n_pending_sync_;
-  std::vector<Blob<float>> last_sync_;
-  std::unordered_map<int, std::vector<Msg*>> buffer_requests_;
-};
-
-}  // namespace singa
-
-#endif  // SINGA_TRAINER_SERVER_H_

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/321ef96a/include/trainer/trainer.h
----------------------------------------------------------------------
diff --git a/include/trainer/trainer.h b/include/trainer/trainer.h
deleted file mode 100644
index 1c0e039..0000000
--- a/include/trainer/trainer.h
+++ /dev/null
@@ -1,163 +0,0 @@
-/************************************************************
-*
-* Licensed to the Apache Software Foundation (ASF) under one
-* or more contributor license agreements.  See the NOTICE file
-* distributed with this work for additional information
-* regarding copyright ownership.  The ASF licenses this file
-* to you under the Apache License, Version 2.0 (the
-* "License"); you may not use this file except in compliance
-* with the License.  You may obtain a copy of the License at
-* 
-*   http://www.apache.org/licenses/LICENSE-2.0
-* 
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an
-* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-* KIND, either express or implied.  See the License for the
-* specific language governing permissions and limitations
-* under the License.
-*
-*************************************************************/
-
-#ifndef SINGA_TRAINER_TRAINER_H_
-#define SINGA_TRAINER_TRAINER_H_
-
-#include <queue>
-#include <unordered_map>
-#include <vector>
-#include "communication/socket.h"
-#include "neuralnet/neuralnet.h"
-#include "proto/job.pb.h"
-#include "proto/singa.pb.h"
-#include "trainer/server.h"
-#include "trainer/worker.h"
-#include "utils/factory.h"
-#include "utils/param.h"
-#include "utils/singleton.h"
-
-namespace singa {
-
-/**
- * Every running process has a training object which launches one or more
- * worker (and server) threads.
- *
- * The main thread runs a loop to forward messages between workers and servers.
- */
-class Trainer{
- public:
-  ~Trainer();
-  /**
-   * Entrance function which construct the workers and servers, and luanch
-   * one thread per worker/server.
-   *
-   * @param resume if true resume the training from the latest checkpoint files
-   * @param singaConf global singa configuration including zookeeper and
-   * @param jobConf job configuration, including cluster and model configuration
-   */
-  void Start(bool resume, const SingaProto& singaConf, JobProto* jobConf);
-
- protected:
-  /**
-   * Setting the checkpoint field of model configuration to resume training.
-   *
-   * The checkpoint folder will be searched to get the files for the latest
-   * checkpoint, which will be added into the checkpoint field. The workers
-   * would then load the values of params from the checkpoint files.
-   *
-   * @param jobConf job configuration
-   */
-  void Resume(JobProto* jobConf);
-  /**
-   * Create server instances.
-   * @param nthread total num of threads in current procs which is used to
-   * assign each thread a local thread ID. The number of workers is extracted
-   * from Cluster
-   * @param jobConf
-   * @return server instances
-   */
-  std::vector<Server*> CreateServers(const JobProto& jobConf);
-  /**
-   * Create workers instances.
-   * @param nthread total num of threads in current procs which is used to
-   * assign each thread a local thread ID. The number of workers is extracted
-   * from Cluster
-   * @param jobConf
-   * @return worker instances
-   */
-  std::vector<Worker*> CreateWorkers(const JobProto& jobConf);
-  /**
-   * Setup workers and servers.
-   *
-   * For each worker, create and assign a neuralnet to it.
-   * For each server, create and assign the param shard to it.
-   * Create the partition map from slice ID to server
-   * @param modelConf
-   * @param workers
-   * @param servers
-   */
-  void SetupWorkerServer(const JobProto& jobConf,
-                         const std::vector<Worker*>& workers,
-                         const std::vector<Server*>& servers);
-  void Run(const std::vector<Worker*>& workers,
-           const std::vector<Server*>& servers);
-  /**
-   * Display metrics to log (standard output)
-   */
-  void DisplayMetric(Msg** msg);
-  /**
-   * Create a socket to send msg to the specified process
-   * @param dst_procs the dst process (logical) ID
-   * @return the newly created socket
-   */
-  Dealer* CreateInterProcsDealer(int dst_procs);
-  /**
-   * Handle messages to local servers and local stub
-   */
-  void HandleLocalMsg(std::queue<Msg*>* msg_queue, Msg** msg);
-  /**
-   * Generate a request message to Get the parameter object.
-   */
-  const std::vector<Msg*> HandleGet(ParamEntry* entry, Msg** msg);
-  void HandleGetResponse(ParamEntry* entry, Msg** msg);
-  /**
-   * Generate a request message to Update the parameter object.
-   */
-  const std::vector<Msg*> HandleUpdate(ParamEntry* entry, Msg** msg);
-  void HandleUpdateResponse(ParamEntry* entry, Msg** msg);
-  /**
-   * Generate a request message to Put the parameter object.
-   */
-  const std::vector<Msg*> HandlePut(ParamEntry* entry, Msg** msg);
-  /**
-   * Called by HandlePut, HandleUpdate and HandleGet functions
-   * @param type message type
-   * @param version param version
-   * @param entry
-   * @param msg
-   * @param ret generated messages
-   */
-  void GenMsgs(int type, int version, ParamEntry* entry,
-    Msg* msg, std::vector<Msg*> *ret);
-  /**
-   * Get a hash id for a Param object from a group.
-   *
-   * Simple multiple group_id with a large prime number 997 (assuming there are
-   * no more than 997 worker groups) and plus owner param id.
-   */
-  inline int Hash(int grp_id, int param_id) {
-    return grp_id * 997 + param_id;
-  }
-
- protected:
-  int procs_id_ = -1;
-  Router *router_ = nullptr;
-  std::unordered_map<int, ParamEntry*> worker_shard_;
-  //!< map from slice to the server that updates it
-  std::vector<int> slice2server_;
-  // a buffer of created nets, will destroy them all in destructor
-  std::vector<NeuralNet*> nets_;
-};
-
-}  // namespace singa
-
-#endif  // SINGA_TRAINER_TRAINER_H_

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/321ef96a/include/trainer/worker.h
----------------------------------------------------------------------
diff --git a/include/trainer/worker.h b/include/trainer/worker.h
deleted file mode 100644
index 66439ec..0000000
--- a/include/trainer/worker.h
+++ /dev/null
@@ -1,258 +0,0 @@
-/************************************************************
-*
-* Licensed to the Apache Software Foundation (ASF) under one
-* or more contributor license agreements.  See the NOTICE file
-* distributed with this work for additional information
-* regarding copyright ownership.  The ASF licenses this file
-* to you under the Apache License, Version 2.0 (the
-* "License"); you may not use this file except in compliance
-* with the License.  You may obtain a copy of the License at
-* 
-*   http://www.apache.org/licenses/LICENSE-2.0
-* 
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an
-* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-* KIND, either express or implied.  See the License for the
-* specific language governing permissions and limitations
-* under the License.
-*
-*************************************************************/
-
-#ifndef SINGA_TRAINER_WORKER_H_
-#define SINGA_TRAINER_WORKER_H_
-
-#include <string>
-#include "communication/socket.h"
-#include "neuralnet/neuralnet.h"
-#include "proto/job.pb.h"
-
-namespace singa {
-
-//!< sleep 5 milliseconds if the Param is not updated to the expected version
-const int kCollectSleepTime = 5;
-/**
- * The Worker class which runs the training algorithm.
- * The first worker group will initialize parameters of the Net,
- * and put them into the distributed memory/table.
- * The virtual function TrainOneBatch and TestOneBatch implement the
- * training and test algorithm for one mini-batch data.
- *
- * Child workers override the two functions to implement their training
- * algorithms, e.g., the BPWorker/CDWorker/BPTTWorker implements the BP/CD/BPTT
- * algorithm respectively.
- */
-class Worker {
- public:
-  static Worker* Create(const JobProto& proto);
-  /**
-   * @param thread_id local thread index within the procs
-   * @param grp_id global worker group ID
-   * @param id worker ID within the group
-   */
-  virtual void Init(int grp_id, int id);
-  virtual ~Worker();
-  /**
-   * Setup members
-   */
-  void Setup(const JobProto& job, NeuralNet* train_net, NeuralNet* valid_net,
-             NeuralNet* test_net);
-  /**
-   * Init all local params (i.e., params from layers resident in this worker).
-   *
-   * If the param is owned by the worker, then init it and put it to servers.
-   * Otherwise call Get() to get the param. The Get may not send get request.
-   * Because the param's own is in the same procs. Once the owner initializes
-   * the param, its version is visiable to all shares.
-   * If the training starts from scrath, the params are initialzed using random
-   * distributions, e.g., Gaussian distribution. After that, the worker may
-   * train for a couple of steps to warmup the params before put
-   * them to servers (warmup of JobProto controls this).
-   *
-   * If the owner param is available from checkpoint file, then its
-   * values are parsed from the checkpoint file instead of randomly initialized.
-   * For params who do not have checkpoints, randomly init them.
-   */
-  void InitLocalParams();
-  /**
-    * Main function of Worker.
-    *
-    * Train the neuralnet step by step, test/validation is done periodically.
-    */
-  void Run();
-  /**
-   * Checkpoint all params owned by the worker from the first group onto disk.
-   * The serialization is done using BlobProtos which includes the name, version
-   * and values of each Param.
-   * Different worker would generate different checkpoint files. The file path
-   * is <workspace>/checkpoint-<jobname>-step<step>-worker<worker_id>.bin
-   * @param step training step of this worker
-   * @param net the training net whose params will be dumped.
-   */
-  void Checkpoint(int step, NeuralNet* net);
-  /**
-    * Test the perforance of the learned model on validation or test dataset.
-    * Test is done by the first group.
-    * @param net, neural network
-    */
-  void Test(int nsteps, Phase phase, NeuralNet* net);
-  /**
-    * Train one mini-batch.
-    * Test/Validation is done before training.
-    */
-  virtual void TrainOneBatch(int step, Metric* perf) = 0;
-  /**
-   * Test/validate one mini-batch.
-   */
-  virtual void TestOneBatch(int step, Phase phase, NeuralNet* net,
-                            Metric* perf) = 0;
-  /**
-   * Report performance to the stub.
-   *
-   * @param prefix display prefix, e.g., 'Train', 'Test'
-   * @param perf
-   */
-  void Report(const std::string& prefix, const Metric & perf);
-  /**
-   * Put Param to server.
-   * @param param
-   * @param step used as current param version for the put request
-   */
-  int Put(Param* param, int step);
-  /**
-   * Get Param with specific version from server
-   * If the current version >= the requested version, then return.
-   * Otherwise send a get request to stub who would forwards it to servers.
-   * @param param
-   * @param step requested param version
-   */
-  int Get(Param* param, int step);
-  /**
-   * Update Param
-   * @param param
-   * @param step training step used for updating (e.g., deciding learning rate)
-   */
-  int Update(Param* param, int step);
-  /**
-   * Block until the param is updated since sending the update request
-   *
-   * @param param
-   * @param step not used
-   */
-  int Collect(Param* param, int step);
-  /**
-   * Call Collect for every param of net
-   */
-  int CollectAll(NeuralNet* net, int step);
-  /**
-   * Receive blobs from other workers due to model partitions.
-   */
-  void ReceiveBlobs(bool data, bool grad, BridgeLayer* layer, NeuralNet* net);
-  /**
-   * Send blobs to other workers due to model partitions.
-   */
-  void SendBlobs(bool data, bool grad, BridgeLayer* layer, NeuralNet* net);
-  /**
-   * Check is it time to display training info, e.g., loss and precison.
-   */
-  inline bool DisplayNow(int step) const {
-    return job_conf_.disp_freq() > 0
-           && step >= job_conf_.disp_after()
-           && ((step - job_conf_.disp_after()) % job_conf_.disp_freq() == 0);
-  }
-  /**
-   * Check is it time to display training info, e.g., loss and precison.
-   */
-  inline bool DisplayDebugInfo(int step) const {
-    return DisplayNow(step) && job_conf_.debug() && grp_id_ == 0;
-  }
-  /**
-   * Check is it time to stop
-   */
-  inline bool StopNow(int step) const {
-    return step >= job_conf_.train_steps();
-  }
-  /**
-   * Check is it time to do checkpoint.
-   */
-  inline bool CheckpointNow(int step) const {
-    return grp_id_ == 0
-           && job_conf_.checkpoint_freq() > 0
-           && step >= job_conf_.checkpoint_after()
-           && ((step - job_conf_.checkpoint_after())
-              % job_conf_.checkpoint_freq() == 0);
-  }
-  /**
-   * Check is it time to do test.
-   * @param step the ::Train() has been called this num times.
-   */
-  inline bool TestNow(int step) const {
-    return grp_id_ == 0
-           && job_conf_.test_freq() > 0
-           && job_conf_.test_steps() > 0
-           && step >= job_conf_.test_after()
-           && ((step - job_conf_.test_after()) % job_conf_.test_freq() == 0);
-  }
-  /**
-   * Check is it time to do validation.
-   * @param step the ::Train() has been called step times.
-   */
-  inline bool ValidateNow(int step) const {
-    return grp_id_ == 0
-           && job_conf_.valid_freq() > 0
-           && job_conf_.valid_steps() > 0
-           && step >= job_conf_.valid_after()
-           && ((step - job_conf_.valid_after()) % job_conf_.valid_freq() == 0);
-  }
-  /**
-   * @return group ID
-   */
-  int grp_id() const { return grp_id_; }
-  /**
-   * @reutrn worker ID within the worker group.
-   */
-  int id() const { return id_; }
-
- protected:
-  int grp_id_ = -1, id_ = -1;
-  int step_ = 0;
-  JobProto job_conf_;
-  NeuralNet* train_net_ = nullptr;
-  NeuralNet* test_net_ = nullptr;
-  NeuralNet* validation_net_ = nullptr;
-  Dealer* layer_dealer_ = nullptr;
-  Dealer* dealer_ = nullptr;
-};
-
-class BPWorker: public Worker {
- public:
-  void TrainOneBatch(int step, Metric* perf) override;
-  void TestOneBatch(int step, Phase phase, NeuralNet* net, Metric* perf)
-      override;
-  void Forward(int step, Phase phase, NeuralNet* net, Metric* perf);
-  void Backward(int step, NeuralNet* net);
-};
-
-class CDWorker: public Worker {
- public:
-  void TrainOneBatch(int step, Metric* perf) override;
-  void TestOneBatch(int step, Phase phase, NeuralNet* net, Metric* perf)
-      override;
-};
-
-inline int BlobTrgt(int grp, int layer) {
-  return (grp << 16) | layer;
-}
-
-inline int BlobGrp(int blob_trgt) {
-  return blob_trgt >> 16;
-}
-
-inline int BlobLayer(int blob_trgt) {
-  static int mask = (1 << 16) -1;
-  return blob_trgt & mask;
-}
-
-}  // namespace singa
-
-#endif  // SINGA_TRAINER_WORKER_H_

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/321ef96a/include/utils/param.h
----------------------------------------------------------------------
diff --git a/include/utils/param.h b/include/utils/param.h
index e6c8c7c..f690438 100644
--- a/include/utils/param.h
+++ b/include/utils/param.h
@@ -7,9 +7,9 @@
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
-* 
+*
 *   http://www.apache.org/licenses/LICENSE-2.0
-* 
+*
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
@@ -25,12 +25,13 @@
 #include <memory>
 #include <string>
 #include <vector>
-#include "communication/msg.h"
+
+#include "comm/msg.h"
 #include "proto/job.pb.h"
 #include "utils/blob.h"
 
 namespace singa {
-
+using std::vector;
 /**
  * Base parameter generator which intializes parameter values.
  */
@@ -92,7 +93,34 @@ class UniformSqrtFanInOutGen : public UniformGen {
  */
 class Param {
  public:
-  static Param* Create(const ParamProto& proto);
+  /**
+   * Create an instance of (sub) Param class based on the type from the
+   * configuration.
+   *
+   * @param[in] conf configuration
+   * @return a pointer to an instance
+   */
+  static Param* Create(const ParamProto& conf);
+
+  /**
+   * Try to slice the Param objects (from a neural net) into a given number of
+   * servers (groups) evenly. This is to achieve load-balance among servers.
+   *
+   * It does not change the Param objects, but just computes the length of each
+   * slice.
+   *
+   * @param num number of servers (groups) for maintaining the Param objects.
+   * @param params all Param objects from a neural net.
+   * @return the length of each slice.
+   */
+  static const vector<int> ComputeSlices(int num, const vector<Param*>& params);
+  /**
+   * It computes the length of each slice and slices the Param objects by adding
+   * the slicing information into every Param object.
+   *
+   * @copydetails ComputeSlices()
+   */
+  static void SliceParams(int num, const vector<Param*>& params);
 
   Param() {}
   virtual ~Param() {}
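
A small sketch (not from the patch) of the new slicing helpers declared above;
`params` is assumed to contain all Param objects of a neural net:

    #include <vector>
    #include "utils/param.h"

    void PartitionParams(const std::vector<singa::Param*>& params) {
      // Compute slice lengths for 4 server groups (no Param is modified here).
      const std::vector<int> slice_lens = singa::Param::ComputeSlices(4, params);
      // Record the slicing info inside every Param object.
      singa::Param::SliceParams(4, params);
    }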

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/321ef96a/include/worker.h
----------------------------------------------------------------------
diff --git a/include/worker.h b/include/worker.h
new file mode 100644
index 0000000..58f02c4
--- /dev/null
+++ b/include/worker.h
@@ -0,0 +1,311 @@
+/************************************************************
+*
+* Licensed to the Apache Software Foundation (ASF) under one
+* or more contributor license agreements.  See the NOTICE file
+* distributed with this work for additional information
+* regarding copyright ownership.  The ASF licenses this file
+* to you under the Apache License, Version 2.0 (the
+* "License"); you may not use this file except in compliance
+* with the License.  You may obtain a copy of the License at
+*
+*   http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing,
+* software distributed under the License is distributed on an
+* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+* KIND, either express or implied.  See the License for the
+* specific language governing permissions and limitations
+* under the License.
+*
+*************************************************************/
+
+#ifndef SINGA_WORKER_H_
+#define SINGA_WORKER_H_
+
+#include <string>
+#include <vector>
+#include "comm/socket.h"
+#include "neuralnet/neuralnet.h"
+#include "proto/job.pb.h"
+
+namespace singa {
+
+//!< sleep 5 milliseconds if the Param is not updated to the expected version
+const int kCollectSleepTime = 5;
+/**
+ * The Worker class which runs the training algorithm.
+ * The first worker group will initialize parameters of the Net,
+ * and put them into the distributed memory/table.
+ * The virtual function TrainOneBatch and TestOneBatch implement the
+ * training and test algorithm for one mini-batch data.
+ *
+ * Child workers override the two functions to implement their training
+ * algorithms, e.g., the BPWorker/CDWorker/BPTTWorker implements the BP/CD/BPTT
+ * algorithm respectively.
+ */
+class Worker {
+ public:
+  /**
+   * Create an instance of the subclass of Worker.
+   *
+   * @param[in] conf configuration of the TrainOneBatch algorithm. Different
+   * Worker subclasses implement different algorithms. Hence the creation is
+   * based on the TrainOneBatch algorithm type. Currently SINGA
+   * provides two algorithms:
+   * -# Back-propagation for the feed-forward models, e.g., CNN and MLP, and the
+   *  recurrent neural networks.
+   * -# Contrastive divergence for the energy models, e.g., RBM.
+   *
+   * @return a pointer to the instance of the Worker subclass.
+   */
+  static Worker* Create(const AlgProto& conf);
+  virtual ~Worker();
+  /**
+   * @param[in] grp_id global worker group ID
+   * @param[in] id worker ID within the group
+   * @param[in] conf job configuration
+   * @param[in] train_net pointer to the training neural net, which could be
+   * shared with other workers from the same group. Different workers run over
+   * differnt subset of layers.
+   * @param[in] val_net pointer to the validation neural net. Currently only the
+   * first worker from the first group would have validation neural net. All
+   * other workers receive nullptr for this argument.
+   * @param[in] test_net pointer to the test neural net. Currently only the
+   * first worker from the first group would have test neural net. All other
+   * workers receive nullptr for this argument.
+   */
+  virtual void Setup(int grp_id, int id, const JobProto& conf,
+      NeuralNet* train_net, NeuralNet* val_net, NeuralNet* test_net);
+
+  /**
+   * Main function of Worker.
+   *
+   * Train the neuralnet step by step, test/validation is done periodically.
+   */
+  void Run();
+
+  /**
+   * Init values of Param instances associated with local layers (i.e., layers
+   * dispatched to this worker).
+   *
+   * If one Param is owned by the worker, then it should be initialized and put
+   * to servers. Otherwise Get() should be called to get the Param. The Get()
+   * may not send get requests if the Param owner is in the same procs, for
+   * which case the memory space of the Param objects are shared. But if this
+   * worker and the Param owner worker run on different devices (e.g., GPUs),
+   * then the get request would be sent.
+   *
+   * If the training starts from scratch, every Param object is initialized using
+   * ParamGenerator. After that, the worker may
+   * train for a couple of steps to warm up the params before putting
+   * them to servers (warmup of JobProto controls this).
+   *
+   * If one Param object's name matches that of one Param object from the
+   * checkpoint files, its Param values would be loaded from checkpoint files.
+   *
+   * @param[in] job_conf job configuration which provides settings for
+   * checkpoint file paths, warmup steps and Param versions.
+   * @param[out] net pointer to a neural net whose Param values will be
+   * initialized.
+   */
+  void InitNetParams(const JobProto& job_conf, NeuralNet* net);
+
+  /**
+   * Checkpoint all Param objects owned by the worker onto disk.
+   * The serialization is done using BlobProtos which includes the name, version
+   * and values of each Param object.
+   * Different workers would generate different checkpoint files. The file path
+   * is <workspace>/checkpoint-<jobname>-step<step>-worker<worker_id>.bin
+   * @param[in] step training step
+   * @param[in] folder directory to put the checkpoint file
+   * @param net the training net whose Param objects will be dumped.
+   */
+  void Checkpoint(int step, const std::string& folder, NeuralNet* net);
+
+  /**
+    * Train one mini-batch.
+    * Test/Validation is done before training.
+    *
+    * @param[in] step training step.
+    * @param[in] net neural net to be trained.
+    */
+  virtual void TrainOneBatch(int step, NeuralNet* net) = 0;
+
+  /**
+   * Test/validate one mini-batch data.
+   *
+   * @param[in] step test step.
+   * @param[in] phase test could be done for validation or test phase.
+   * @param[in] net neural net for test
+   */
+  virtual void TestOneBatch(int step, Phase phase, NeuralNet* net) = 0;
+
+  /**
+   * Display information from layers.
+   *
+   * @param flag could be a combination of multiple phases, e.g., kTest|kForward;
+   * it is passed to the Layer::ToString() function for each layer to decide
+   * what to display.
+   * @param prefix display prefix, e.g., 'Train step 100', 'Test step 90'.
+   * @param net display layers from this neural net.
+   */
+  void Display(int flag, const std::string& prefix, NeuralNet* net);
+
+  /**
+   * Put Param values to server.
+   *
+   * @param param
+   * @param step used as current param version for the put request
+   */
+  int Put(int step, Param* param);
+
+  /**
+   * Get Param with specific version from server
+   * If the current version >= the requested version, then return.
+   * Otherwise send a get request to the stub, which forwards it to servers.
+   * @param param
+   * @param step requested param version
+   */
+  int Get(int step, Param* param);
+
+  /**
+   * Update Param.
+   *
+   * @param param
+   * @param step training step used for updating (e.g., deciding learning rate).
+   */
+  int Update(int step, Param* param);
+
+  /**
+   * Wait for the response of the update/get requests.
+   *
+   * @param param
+   * @param step not used now.
+   */
+  int Collect(int step, Param* param);
+
+  /**
+   * Call Collect() for every param of net
+   */
+  int CollectAll(int step, NeuralNet* net);
+
+  /**
+   * Receive blobs from other workers due to model partitions.
+   */
+  void ReceiveBlobs(bool data, bool grad, BridgeLayer* layer, NeuralNet* net);
+
+  /**
+   * Send blobs to other workers due to model partitions.
+   */
+  void SendBlobs(bool data, bool grad, BridgeLayer* layer, NeuralNet* net);
+
+
+  /**
+   * @param[in] step
+   * @return true if it is time to display training info, e.g., loss; otherwise
+   * false.
+   */
+  inline bool DisplayNow(int step) const {
+    return job_conf_.disp_freq() > 0
+           && step >= job_conf_.disp_after()
+           && ((step - job_conf_.disp_after()) % job_conf_.disp_freq() == 0);
+  }
+  /**
+   * @param[in] step
+   * @return true if it is time to finish the training; otherwise false.
+   */
+  inline bool StopNow(int step) const {
+    return step >= job_conf_.train_steps();
+  }
+  /**
+   * @param[in] step
+   * @return true if it is time to do checkpoint Param objects; otherwise false.
+   */
+  inline bool CheckpointNow(int step) const {
+    return job_conf_.checkpoint_freq() > 0
+           && step >= job_conf_.checkpoint_after()
+           && ((step - job_conf_.checkpoint_after())
+              % job_conf_.checkpoint_freq() == 0);
+  }
+  /**
+   * @param[in] step
+   * @return true if it is time to do test over the test dataset.
+   */
+  inline bool TestNow(int step) const {
+    return job_conf_.test_freq() > 0
+      && job_conf_.test_steps() > 0
+      && step >= job_conf_.test_after()
+      && ((step - job_conf_.test_after()) % job_conf_.test_freq() == 0);
+  }
+  /**
+   * @param[in] step
+   * @return true if it is time to do test over the validation dataset.
+   */
+  inline bool ValidateNow(int step) const {
+    return job_conf_.validate_freq() > 0
+      && job_conf_.validate_steps() > 0
+      && step >= job_conf_.validate_after()
+      && ((step - job_conf_.validate_after()) % job_conf_.validate_freq() == 0);
+  }
+  /**
+   * @return a vector with pointers to all neural nets.
+   */
+  const std::vector<NeuralNet*> GetNets() const {
+    return std::vector<NeuralNet*> {train_net_, val_net_, test_net_};
+  }
+  /**
+   * @return training net.
+   */
+  inline NeuralNet* train_net() const {
+    return train_net_;
+  }
+  /**
+   * @return group ID
+   */
+  inline int grp_id() const { return grp_id_; }
+  /**
+   * @return the worker ID within the worker group.
+   */
+  inline int id() const { return id_; }
+
+ protected:
+  int grp_id_ = -1, id_ = -1;
+  int step_ = 0;
+  JobProto job_conf_;
+  NeuralNet* train_net_ = nullptr;
+  NeuralNet* test_net_ = nullptr;
+  NeuralNet* val_net_ = nullptr;
+  Dealer* layer_dealer_ = nullptr;
+  Dealer* dealer_ = nullptr;
+};
+
+class BPWorker: public Worker {
+ public:
+  void TrainOneBatch(int step, NeuralNet* net) override;
+  void TestOneBatch(int step, Phase phase, NeuralNet* net) override;
+  void Forward(int step, Phase phase, NeuralNet* net);
+  void Backward(int step, NeuralNet* net);
+};
+
+class CDWorker: public Worker {
+ public:
+  void TrainOneBatch(int step, NeuralNet* net) override;
+  void TestOneBatch(int step, Phase phase, NeuralNet* net) override;
+};
+
+inline int BlobTrgt(int grp, int layer) {
+  return (grp << 16) | layer;
+}
+
+inline int BlobGrp(int blob_trgt) {
+  return blob_trgt >> 16;
+}
+
+inline int BlobLayer(int blob_trgt) {
+  static int mask = (1 << 16) -1;
+  return blob_trgt & mask;
+}
+
+}  // namespace singa
+
+#endif  // SINGA_WORKER_H_
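
A small sketch (not from the patch) illustrating the BlobTrgt helpers declared
above, which pack a worker-group ID and a layer ID into a single int for
exchanging blobs across partitioned models:

    #include <cassert>
    #include "worker.h"

    void BlobTrgtExample() {
      int trgt = singa::BlobTrgt(3, 42);     // (3 << 16) | 42
      assert(singa::BlobGrp(trgt) == 3);     // high 16 bits -> group ID
      assert(singa::BlobLayer(trgt) == 42);  // low 16 bits  -> layer ID
    }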

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/321ef96a/src/comm/msg.cc
----------------------------------------------------------------------
diff --git a/src/comm/msg.cc b/src/comm/msg.cc
new file mode 100644
index 0000000..2521c28
--- /dev/null
+++ b/src/comm/msg.cc
@@ -0,0 +1,215 @@
+/************************************************************
+*
+* Licensed to the Apache Software Foundation (ASF) under one
+* or more contributor license agreements.  See the NOTICE file
+* distributed with this work for additional information
+* regarding copyright ownership.  The ASF licenses this file
+* to you under the Apache License, Version 2.0 (the
+* "License"); you may not use this file except in compliance
+* with the License.  You may obtain a copy of the License at
+*
+*   http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing,
+* software distributed under the License is distributed on an
+* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+* KIND, either express or implied.  See the License for the
+* specific language governing permissions and limitations
+* under the License.
+*
+*************************************************************/
+
+#include "comm/msg.h"
+
+#include <glog/logging.h>
+
+namespace singa {
+
+#ifdef USE_ZMQ
+Msg::~Msg() {
+  if (msg_ != nullptr)
+    zmsg_destroy(&msg_);
+  frame_ = nullptr;
+}
+
+Msg::Msg() {
+  msg_ = zmsg_new();
+}
+
+Msg::Msg(const Msg& msg) {
+  src_ = msg.src_;
+  dst_ = msg.dst_;
+  type_ = msg.type_;
+  trgt_val_ = msg.trgt_val_;
+  trgt_version_ = msg.trgt_version_;
+  msg_ = zmsg_dup(msg.msg_);
+}
+
+Msg::Msg(int src, int dst) {
+  src_ = src;
+  dst_ = dst;
+  msg_ = zmsg_new();
+}
+
+void Msg::SwapAddr() {
+  std::swap(src_, dst_);
+}
+
+int Msg::size() const {
+  return zmsg_content_size(msg_);
+}
+
+void Msg::AddFrame(const void* addr, int nBytes) {
+  zmsg_addmem(msg_, addr, nBytes);
+}
+
+int Msg::FrameSize() {
+  return zframe_size(frame_);
+}
+
+void* Msg::FrameData() {
+  return zframe_data(frame_);
+}
+
+char* Msg::FrameStr() {
+  return zframe_strdup(frame_);
+}
+bool Msg::NextFrame() {
+  frame_ = zmsg_next(msg_);
+  return frame_ != nullptr;
+}
+
+void Msg::FirstFrame() {
+  frame_ = zmsg_first(msg_);
+}
+
+void Msg::LastFrame() {
+  frame_ = zmsg_last(msg_);
+}
+
+void Msg::ParseFromZmsg(zmsg_t* msg) {
+  char* tmp = zmsg_popstr(msg);
+  sscanf(tmp, "%d %d %d %d %d",
+         &src_, &dst_, &type_, &trgt_val_, &trgt_version_);
+  frame_ = zmsg_first(msg);
+  msg_ = msg;
+}
+
+zmsg_t* Msg::DumpToZmsg() {
+  zmsg_pushstrf(msg_, "%d %d %d %d %d",
+      src_, dst_, type_, trgt_val_, trgt_version_);
+  zmsg_t *tmp = msg_;
+  msg_ = nullptr;
+  return tmp;
+}
+
+// frame marker indicating this frame is serialized like printf
+#define FMARKER "*singa*"
+
+#define kMaxFrameLen 2048
+
+int Msg::AddFormatFrame(const char *format, ...) {
+  va_list argptr;
+  va_start(argptr, format);
+  int size = strlen(FMARKER);
+  char dst[kMaxFrameLen];
+  memcpy(dst, FMARKER, size);
+  dst[size++] = 0;
+  while (*format) {
+    if (*format == 'i') {
+      int x = va_arg(argptr, int);
+      dst[size++] = 'i';
+      memcpy(dst + size, &x, sizeof(x));
+      size += sizeof(x);
+    } else if (*format == 'f') {
+      float x = static_cast<float> (va_arg(argptr, double));
+      dst[size++] = 'f';
+      memcpy(dst + size, &x, sizeof(x));
+      size += sizeof(x);
+    } else if (*format == '1') {
+      uint8_t x = va_arg(argptr, int);
+      memcpy(dst + size, &x, sizeof(x));
+      size += sizeof(x);
+    } else if (*format == '2') {
+      uint16_t x = va_arg(argptr, int);
+      memcpy(dst + size, &x, sizeof(x));
+      size += sizeof(x);
+    } else if (*format == '4') {
+      uint32_t x = va_arg(argptr, uint32_t);
+      memcpy(dst + size, &x, sizeof(x));
+      size += sizeof(x);
+    } else if (*format == 's') {
+      char* x = va_arg(argptr, char *);
+      dst[size++] = 's';
+      memcpy(dst + size, x, strlen(x));
+      size += strlen(x);
+      dst[size++] = 0;
+    } else if (*format == 'p') {
+      void* x = va_arg(argptr, void *);
+      dst[size++] = 'p';
+      memcpy(dst + size, &x, sizeof(x));
+      size += sizeof(x);
+    } else {
+      LOG(ERROR) << "Unknown format " << *format;
+    }
+    format++;
+    CHECK_LE(size, kMaxFrameLen);
+  }
+  va_end(argptr);
+  zmsg_addmem(msg_, dst, size);
+  return size;
+}
+
+int Msg::ParseFormatFrame(const char *format, ...) {
+  va_list argptr;
+  va_start(argptr, format);
+  char* src = zframe_strdup(frame_);
+  CHECK_STREQ(FMARKER, src);
+  int size = strlen(FMARKER) + 1;
+  while (*format) {
+    if (*format == 'i') {
+      int *x = va_arg(argptr, int *);
+      CHECK_EQ(src[size++], 'i');
+      memcpy(x, src + size, sizeof(*x));
+      size += sizeof(*x);
+    } else if (*format == 'f') {
+      float *x = va_arg(argptr, float *);
+      CHECK_EQ(src[size++], 'f');
+      memcpy(x, src + size, sizeof(*x));
+      size += sizeof(*x);
+    } else if (*format == '1') {
+      uint8_t *x = va_arg(argptr, uint8_t *);
+      memcpy(x, src + size, sizeof(*x));
+      size += sizeof(*x);
+    } else if (*format == '2') {
+      uint16_t *x = va_arg(argptr, uint16_t *);
+      memcpy(x, src + size, sizeof(*x));
+      size += sizeof(*x);
+    } else if (*format == '4') {
+      uint32_t *x = va_arg(argptr, uint32_t *);
+      memcpy(x, src + size, sizeof(*x));
+      size += sizeof(*x);
+    } else if (*format == 's') {
+      char* x = va_arg(argptr, char *);
+      CHECK_EQ(src[size++], 's');
+      int len = strlen(src + size);
+      memcpy(x, src + size, len);
+      x[len] = 0;
+      size += len + 1;
+    } else if (*format == 'p') {
+      void** x = va_arg(argptr, void **);
+      CHECK_EQ(src[size++], 'p');
+      memcpy(x, src + size, sizeof(*x));
+      size += sizeof(*x);
+    } else {
+      LOG(ERROR) << "Unknown format type " << *format;
+    }
+    format++;
+  }
+  va_end(argptr);
+  delete src;
+  return size;
+}
+#endif
+
+}  // namespace singa
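
For context, the format-frame helpers above serialize values into a single frame using printf-style format characters ('i' for int, 'f' for float, '1'/'2'/'4' for fixed-width integers, 's' for strings, 'p' for pointers). The following is a minimal usage sketch, assuming only the relocated "comm/msg.h" header from this patch; the function name and the packed values are illustrative, not part of the commit:

    // hypothetical usage of Msg's printf-style frames
    #include "comm/msg.h"

    void FormatFrameExample() {
      singa::Msg msg(/*src=*/0, /*dst=*/1);
      // pack an int, a float and a string into one frame
      msg.AddFormatFrame("ifs", 42, 0.5f, "hello");
      // position the frame cursor, then unpack in the same order
      msg.FirstFrame();
      int i;
      float f;
      char buf[64];
      msg.ParseFormatFrame("ifs", &i, &f, buf);
    }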

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/321ef96a/src/comm/socket.cc
----------------------------------------------------------------------
diff --git a/src/comm/socket.cc b/src/comm/socket.cc
new file mode 100644
index 0000000..b9c7810
--- /dev/null
+++ b/src/comm/socket.cc
@@ -0,0 +1,180 @@
+/************************************************************
+*
+* Licensed to the Apache Software Foundation (ASF) under one
+* or more contributor license agreements.  See the NOTICE file
+* distributed with this work for additional information
+* regarding copyright ownership.  The ASF licenses this file
+* to you under the Apache License, Version 2.0 (the
+* "License"); you may not use this file except in compliance
+* with the License.  You may obtain a copy of the License at
+*
+*   http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing,
+* software distributed under the License is distributed on an
+* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+* KIND, either express or implied.  See the License for the
+* specific language governing permissions and limitations
+* under the License.
+*
+*************************************************************/
+#include "comm/socket.h"
+
+#include <glog/logging.h>
+
+namespace singa {
+
+#ifdef USE_ZMQ
+Poller::Poller() {
+  poller_ = zpoller_new(nullptr);
+}
+
+Poller::Poller(SocketInterface* socket) {
+  poller_ = zpoller_new(nullptr);
+  Add(socket);
+}
+
+void Poller::Add(SocketInterface* socket) {
+  zsock_t* zsock = static_cast<zsock_t*>(socket->InternalID());
+  zpoller_add(poller_, zsock);
+  zsock2Socket_[zsock] = socket;
+}
+
+SocketInterface* Poller::Wait(int timeout) {
+  zsock_t* sock = static_cast<zsock_t*>(zpoller_wait(poller_, timeout));
+  if (sock != nullptr)
+    return zsock2Socket_[sock];
+  else
+  return nullptr;
+}
+
+bool Poller::Terminated() {
+  return zpoller_terminated(poller_);
+}
+
+
+Dealer::Dealer() : Dealer(-1) {}
+
+Dealer::Dealer(int id) : id_(id) {
+  dealer_ = zsock_new(ZMQ_DEALER);
+  CHECK_NOTNULL(dealer_);
+}
+
+Dealer::~Dealer() {
+  zsock_destroy(&dealer_);
+}
+
+int Dealer::Connect(const std::string& endpoint) {
+  CHECK_GT(endpoint.length(), 0);
+  if (endpoint.length()) {
+    CHECK_EQ(zsock_connect(dealer_, "%s", endpoint.c_str()), 0);
+    return 1;
+  }
+  return 0;
+}
+
+int Dealer::Send(Msg** msg) {
+  zmsg_t* zmsg = (*msg)->DumpToZmsg();
+  zmsg_send(&zmsg, dealer_);
+  delete *msg;
+  *msg = nullptr;
+  return 1;
+}
+
+Msg* Dealer::Receive() {
+  zmsg_t* zmsg = zmsg_recv(dealer_);
+  if (zmsg == nullptr)
+    return nullptr;
+  Msg* msg = new Msg();
+  msg->ParseFromZmsg(zmsg);
+  return msg;
+}
+
+void* Dealer::InternalID() const {
+  return dealer_;
+}
+
+Router::Router() : Router(100) {}
+
+Router::Router(int bufsize) {
+  nBufmsg_ = 0;
+  bufsize_ = bufsize;
+  router_ = zsock_new(ZMQ_ROUTER);
+  CHECK_NOTNULL(router_);
+  poller_ = zpoller_new(router_);
+  CHECK_NOTNULL(poller_);
+}
+
+Router::~Router() {
+  zsock_destroy(&router_);
+  for (auto it : id2addr_)
+    zframe_destroy(&it.second);
+  for (auto it : bufmsg_) {
+    for (auto *msg : it.second)
+      zmsg_destroy(&msg);
+  }
+}
+int Router::Bind(const std::string& endpoint) {
+  int port = -1;
+  if (endpoint.length()) {
+    port = zsock_bind(router_, "%s", endpoint.c_str());
+  }
+  CHECK_NE(port, -1) << endpoint;
+  LOG(INFO) << "bind successfully to " << endpoint + ":" + std::to_string(port);
+  return port;
+}
+
+int Router::Send(Msg **msg) {
+  zmsg_t* zmsg = (*msg)->DumpToZmsg();
+  int dstid = (*msg)->dst();
+  if (id2addr_.find(dstid) != id2addr_.end()) {
+    // the connection has already been set up
+    zframe_t* addr = zframe_dup(id2addr_[dstid]);
+    zmsg_prepend(zmsg, &addr);
+    zmsg_send(&zmsg, router_);
+  } else {
+    // the connection is not ready, buffer the message
+    if (bufmsg_.size() == 0)
+      nBufmsg_ = 0;
+    bufmsg_[dstid].push_back(zmsg);
+    ++nBufmsg_;
+    CHECK_LE(nBufmsg_, bufsize_);
+  }
+  delete *msg;
+  *msg = nullptr;
+  return 1;
+}
+
+Msg* Router::Receive() {
+  zmsg_t* zmsg = zmsg_recv(router_);
+  if (zmsg == nullptr) {
+    LOG(ERROR) << "Connection broken!";
+    exit(0);
+  }
+  zframe_t* dealer = zmsg_pop(zmsg);
+  Msg* msg = new Msg();
+  msg->ParseFromZmsg(zmsg);
+  if (id2addr_.find(msg->src()) == id2addr_.end()) {
+    // new connection, store the sender's identifier and send buffered messages
+    // for it
+    id2addr_[msg->src()] = dealer;
+    if (bufmsg_.find(msg->src()) != bufmsg_.end()) {
+      for (auto& it : bufmsg_.at(msg->src())) {
+        zframe_t* addr = zframe_dup(dealer);
+        zmsg_prepend(it, &addr);
+        zmsg_send(&it, router_);
+      }
+      bufmsg_.erase(msg->src());
+    }
+  } else {
+    zframe_destroy(&dealer);
+  }
+  return msg;
+}
+
+void* Router::InternalID() const {
+  return router_;
+}
+#endif
+
+}  // namespace singa
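
The Dealer and Router classes above wrap ZMQ_DEALER and ZMQ_ROUTER sockets: a Dealer connects and sends, while the Router maps destination ids to ZMQ addresses and buffers outgoing messages until the destination has contacted it once. Below is a minimal round-trip sketch under that reading; the endpoint string and the src/dst ids are illustrative, not taken from the patch:

    // hypothetical Dealer/Router round trip on one host
    #include "comm/msg.h"
    #include "comm/socket.h"

    void RoundTripExample() {
      singa::Router router;                 // server side
      router.Bind("tcp://*:5555");          // returns the bound port

      singa::Dealer dealer(/*id=*/0);       // client side
      dealer.Connect("tcp://localhost:5555");

      singa::Msg* req = new singa::Msg(/*src=*/0, /*dst=*/1);
      req->AddFormatFrame("i", 123);
      dealer.Send(&req);                    // deletes *req and sets it to nullptr

      singa::Msg* msg = router.Receive();   // records the sender's ZMQ address
      msg->SwapAddr();                      // reply to the original sender
      router.Send(&msg);

      delete dealer.Receive();              // the reply arrives at the dealer
    }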

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/321ef96a/src/communication/msg.cc
----------------------------------------------------------------------
diff --git a/src/communication/msg.cc b/src/communication/msg.cc
deleted file mode 100644
index 6042057..0000000
--- a/src/communication/msg.cc
+++ /dev/null
@@ -1,215 +0,0 @@
-/************************************************************
-*
-* Licensed to the Apache Software Foundation (ASF) under one
-* or more contributor license agreements.  See the NOTICE file
-* distributed with this work for additional information
-* regarding copyright ownership.  The ASF licenses this file
-* to you under the Apache License, Version 2.0 (the
-* "License"); you may not use this file except in compliance
-* with the License.  You may obtain a copy of the License at
-* 
-*   http://www.apache.org/licenses/LICENSE-2.0
-* 
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an
-* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-* KIND, either express or implied.  See the License for the
-* specific language governing permissions and limitations
-* under the License.
-*
-*************************************************************/
-
-#include "communication/msg.h"
-
-#include <glog/logging.h>
-
-namespace singa {
-
-#ifdef USE_ZMQ
-Msg::~Msg() {
-  if (msg_ != nullptr)
-    zmsg_destroy(&msg_);
-  frame_ = nullptr;
-}
-
-Msg::Msg() {
-  msg_ = zmsg_new();
-}
-
-Msg::Msg(const Msg& msg) {
-  src_ = msg.src_;
-  dst_ = msg.dst_;
-  type_ = msg.type_;
-  trgt_val_ = msg.trgt_val_;
-  trgt_version_ = msg.trgt_version_;
-  msg_ = zmsg_dup(msg.msg_);
-}
-
-Msg::Msg(int src, int dst) {
-  src_ = src;
-  dst_ = dst;
-  msg_ = zmsg_new();
-}
-
-void Msg::SwapAddr() {
-  std::swap(src_, dst_);
-}
-
-int Msg::size() const {
-  return zmsg_content_size(msg_);
-}
-
-void Msg::AddFrame(const void* addr, int nBytes) {
-  zmsg_addmem(msg_, addr, nBytes);
-}
-
-int Msg::FrameSize() {
-  return zframe_size(frame_);
-}
-
-void* Msg::FrameData() {
-  return zframe_data(frame_);
-}
-
-char* Msg::FrameStr() {
-  return zframe_strdup(frame_);
-}
-bool Msg::NextFrame() {
-  frame_ = zmsg_next(msg_);
-  return frame_ != nullptr;
-}
-
-void Msg::FirstFrame() {
-  frame_ = zmsg_first(msg_);
-}
-
-void Msg::LastFrame() {
-  frame_ = zmsg_last(msg_);
-}
-
-void Msg::ParseFromZmsg(zmsg_t* msg) {
-  char* tmp = zmsg_popstr(msg);
-  sscanf(tmp, "%d %d %d %d %d",
-         &src_, &dst_, &type_, &trgt_val_, &trgt_version_);
-  frame_ = zmsg_first(msg);
-  msg_ = msg;
-}
-
-zmsg_t* Msg::DumpToZmsg() {
-  zmsg_pushstrf(msg_, "%d %d %d %d %d",
-      src_, dst_, type_, trgt_val_, trgt_version_);
-  zmsg_t *tmp = msg_;
-  msg_ = nullptr;
-  return tmp;
-}
-
-// frame marker indicating this frame is serialize like printf
-#define FMARKER "*singa*"
-
-#define kMaxFrameLen 2048
-
-int Msg::AddFormatFrame(const char *format, ...) {
-  va_list argptr;
-  va_start(argptr, format);
-  int size = strlen(FMARKER);
-  char dst[kMaxFrameLen];
-  memcpy(dst, FMARKER, size);
-  dst[size++] = 0;
-  while (*format) {
-    if (*format == 'i') {
-      int x = va_arg(argptr, int);
-      dst[size++] = 'i';
-      memcpy(dst + size, &x, sizeof(x));
-      size += sizeof(x);
-    } else if (*format == 'f') {
-      float x = static_cast<float> (va_arg(argptr, double));
-      dst[size++] = 'f';
-      memcpy(dst + size, &x, sizeof(x));
-      size += sizeof(x);
-    } else if (*format == '1') {
-      uint8_t x = va_arg(argptr, int);
-      memcpy(dst + size, &x, sizeof(x));
-      size += sizeof(x);
-    } else if (*format == '2') {
-      uint16_t x = va_arg(argptr, int);
-      memcpy(dst + size, &x, sizeof(x));
-      size += sizeof(x);
-    } else if (*format == '4') {
-      uint32_t x = va_arg(argptr, uint32_t);
-      memcpy(dst + size, &x, sizeof(x));
-      size += sizeof(x);
-    } else if (*format == 's') {
-      char* x = va_arg(argptr, char *);
-      dst[size++] = 's';
-      memcpy(dst + size, x, strlen(x));
-      size += strlen(x);
-      dst[size++] = 0;
-    } else if (*format == 'p') {
-      void* x = va_arg(argptr, void *);
-      dst[size++] = 'p';
-      memcpy(dst + size, &x, sizeof(x));
-      size += sizeof(x);
-    } else {
-      LOG(ERROR) << "Unknown format " << *format;
-    }
-    format++;
-    CHECK_LE(size, kMaxFrameLen);
-  }
-  va_end(argptr);
-  zmsg_addmem(msg_, dst, size);
-  return size;
-}
-
-int Msg::ParseFormatFrame(const char *format, ...) {
-  va_list argptr;
-  va_start(argptr, format);
-  char* src = zframe_strdup(frame_);
-  CHECK_STREQ(FMARKER, src);
-  int size = strlen(FMARKER) + 1;
-  while (*format) {
-    if (*format == 'i') {
-      int *x = va_arg(argptr, int *);
-      CHECK_EQ(src[size++], 'i');
-      memcpy(x, src + size, sizeof(*x));
-      size += sizeof(*x);
-    } else if (*format == 'f') {
-      float *x = va_arg(argptr, float *);
-      CHECK_EQ(src[size++], 'f');
-      memcpy(x, src + size, sizeof(*x));
-      size += sizeof(*x);
-    } else if (*format == '1') {
-      uint8_t *x = va_arg(argptr, uint8_t *);
-      memcpy(x, src + size, sizeof(*x));
-      size += sizeof(*x);
-    } else if (*format == '2') {
-      uint16_t *x = va_arg(argptr, uint16_t *);
-      memcpy(x, src + size, sizeof(*x));
-      size += sizeof(*x);
-    } else if (*format == '4') {
-      uint32_t *x = va_arg(argptr, uint32_t *);
-      memcpy(x, src + size, sizeof(*x));
-      size += sizeof(*x);
-    } else if (*format == 's') {
-      char* x = va_arg(argptr, char *);
-      CHECK_EQ(src[size++], 's');
-      int len = strlen(src + size);
-      memcpy(x, src + size, len);
-      x[len] = 0;
-      size += len + 1;
-    } else if (*format == 'p') {
-      void** x = va_arg(argptr, void **);
-      CHECK_EQ(src[size++], 'p');
-      memcpy(x, src + size, sizeof(*x));
-      size += sizeof(*x);
-    } else {
-      LOG(ERROR) << "Unknown format type " << *format;
-    }
-    format++;
-  }
-  va_end(argptr);
-  delete src;
-  return size;
-}
-#endif
-
-}  // namespace singa

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/321ef96a/src/communication/socket.cc
----------------------------------------------------------------------
diff --git a/src/communication/socket.cc b/src/communication/socket.cc
deleted file mode 100644
index 60e1cc1..0000000
--- a/src/communication/socket.cc
+++ /dev/null
@@ -1,180 +0,0 @@
-/************************************************************
-*
-* Licensed to the Apache Software Foundation (ASF) under one
-* or more contributor license agreements.  See the NOTICE file
-* distributed with this work for additional information
-* regarding copyright ownership.  The ASF licenses this file
-* to you under the Apache License, Version 2.0 (the
-* "License"); you may not use this file except in compliance
-* with the License.  You may obtain a copy of the License at
-* 
-*   http://www.apache.org/licenses/LICENSE-2.0
-* 
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an
-* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-* KIND, either express or implied.  See the License for the
-* specific language governing permissions and limitations
-* under the License.
-*
-*************************************************************/
-#include "communication/socket.h"
-
-#include <glog/logging.h>
-
-namespace singa {
-
-#ifdef USE_ZMQ
-Poller::Poller() {
-  poller_ = zpoller_new(nullptr);
-}
-
-Poller::Poller(SocketInterface* socket) {
-  poller_ = zpoller_new(nullptr);
-  Add(socket);
-}
-
-void Poller::Add(SocketInterface* socket) {
-  zsock_t* zsock = static_cast<zsock_t*>(socket->InternalID());
-  zpoller_add(poller_, zsock);
-  zsock2Socket_[zsock] = socket;
-}
-
-SocketInterface* Poller::Wait(int timeout) {
-  zsock_t* sock = static_cast<zsock_t*>(zpoller_wait(poller_, timeout));
-  if (sock != nullptr)
-    return zsock2Socket_[sock];
-  else
-  return nullptr;
-}
-
-bool Poller::Terminated() {
-  return zpoller_terminated(poller_);
-}
-
-
-Dealer::Dealer() : Dealer(-1) {}
-
-Dealer::Dealer(int id) : id_(id) {
-  dealer_ = zsock_new(ZMQ_DEALER);
-  CHECK_NOTNULL(dealer_);
-}
-
-Dealer::~Dealer() {
-  zsock_destroy(&dealer_);
-}
-
-int Dealer::Connect(const std::string& endpoint) {
-  CHECK_GT(endpoint.length(), 0);
-  if (endpoint.length()) {
-    CHECK_EQ(zsock_connect(dealer_, "%s", endpoint.c_str()), 0);
-    return 1;
-  }
-  return 0;
-}
-
-int Dealer::Send(Msg** msg) {
-  zmsg_t* zmsg = (*msg)->DumpToZmsg();
-  zmsg_send(&zmsg, dealer_);
-  delete *msg;
-  *msg = nullptr;
-  return 1;
-}
-
-Msg* Dealer::Receive() {
-  zmsg_t* zmsg = zmsg_recv(dealer_);
-  if (zmsg == nullptr)
-    return nullptr;
-  Msg* msg = new Msg();
-  msg->ParseFromZmsg(zmsg);
-  return msg;
-}
-
-void* Dealer::InternalID() const {
-  return dealer_;
-}
-
-Router::Router() : Router(100) {}
-
-Router::Router(int bufsize) {
-  nBufmsg_ = 0;
-  bufsize_ = bufsize;
-  router_ = zsock_new(ZMQ_ROUTER);
-  CHECK_NOTNULL(router_);
-  poller_ = zpoller_new(router_);
-  CHECK_NOTNULL(poller_);
-}
-
-Router::~Router() {
-  zsock_destroy(&router_);
-  for (auto it : id2addr_)
-    zframe_destroy(&it.second);
-  for (auto it : bufmsg_) {
-    for (auto *msg : it.second)
-      zmsg_destroy(&msg);
-  }
-}
-int Router::Bind(const std::string& endpoint) {
-  int port = -1;
-  if (endpoint.length()) {
-    port = zsock_bind(router_, "%s", endpoint.c_str());
-  }
-  CHECK_NE(port, -1) << endpoint;
-  LOG(INFO) << "bind successfully to " << endpoint + ":" + std::to_string(port);
-  return port;
-}
-
-int Router::Send(Msg **msg) {
-  zmsg_t* zmsg = (*msg)->DumpToZmsg();
-  int dstid = (*msg)->dst();
-  if (id2addr_.find(dstid) != id2addr_.end()) {
-    // the connection has already been set up
-    zframe_t* addr = zframe_dup(id2addr_[dstid]);
-    zmsg_prepend(zmsg, &addr);
-    zmsg_send(&zmsg, router_);
-  } else {
-    // the connection is not ready, buffer the message
-    if (bufmsg_.size() == 0)
-      nBufmsg_ = 0;
-    bufmsg_[dstid].push_back(zmsg);
-    ++nBufmsg_;
-    CHECK_LE(nBufmsg_, bufsize_);
-  }
-  delete *msg;
-  *msg = nullptr;
-  return 1;
-}
-
-Msg* Router::Receive() {
-  zmsg_t* zmsg = zmsg_recv(router_);
-  if (zmsg == nullptr) {
-    LOG(ERROR) << "Connection broken!";
-    exit(0);
-  }
-  zframe_t* dealer = zmsg_pop(zmsg);
-  Msg* msg = new Msg();
-  msg->ParseFromZmsg(zmsg);
-  if (id2addr_.find(msg->src()) == id2addr_.end()) {
-    // new connection, store the sender's identfier and send buffered messages
-    // for it
-    id2addr_[msg->src()] = dealer;
-    if (bufmsg_.find(msg->src()) != bufmsg_.end()) {
-      for (auto& it : bufmsg_.at(msg->src())) {
-        zframe_t* addr = zframe_dup(dealer);
-        zmsg_prepend(it, &addr);
-        zmsg_send(&it, router_);
-      }
-      bufmsg_.erase(msg->src());
-    }
-  } else {
-    zframe_destroy(&dealer);
-  }
-  return msg;
-}
-
-void* Router::InternalID() const {
-  return router_;
-}
-#endif
-
-}  // namespace singa

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/321ef96a/src/driver.cc
----------------------------------------------------------------------
diff --git a/src/driver.cc b/src/driver.cc
index 6fa70ee..d3f0f3e 100644
--- a/src/driver.cc
+++ b/src/driver.cc
@@ -19,16 +19,17 @@
 *
 *************************************************************/
 
-#include "driver.h"
-
 #include <glog/logging.h>
+#include <set>
 #include <string>
 #include "neuralnet/layer.h"
-#include "trainer/trainer.h"
 #include "utils/common.h"
 #include "utils/tinydir.h"
+#include "utils/cluster.h"
+#include "./stub.h"
+#include "./driver.h"
 
-extern "C" void openblas_set_num_threads(int);
+extern "C" void openblas_set_num_threads(int num);
 
 namespace singa {
 
@@ -109,22 +110,192 @@ void Driver::Init(int argc, char **argv) {
 }
 
 
-void Driver::Submit(bool resume, const JobProto& jobConf) {
+void Driver::Train(bool resume, const JobProto& job_conf) {
+  Cluster::Setup(job_id_, singa_conf_, job_conf.cluster());
   if (singa_conf_.has_log_dir())
-    SetupLog(singa_conf_.log_dir(), std::to_string(job_id_)
-             + "-" + jobConf.name());
+    SetupLog(singa_conf_.log_dir(),
+        std::to_string(job_id_) + "-" + job_conf.name());
   tinydir_dir workspace;
-  if (tinydir_open(&workspace, jobConf.cluster().workspace().c_str()) == -1)
-    LOG(FATAL) << "workspace does not exist: " << jobConf.cluster().workspace();
-  if (jobConf.num_openblas_threads() != 1)
-    LOG(WARNING) << "openblas with "
-                 << jobConf.num_openblas_threads() << " threads";
-  openblas_set_num_threads(jobConf.num_openblas_threads());
+  if (tinydir_open(&workspace, job_conf.cluster().workspace().c_str()) == -1)
+    LOG(FATAL) << "workspace not exist: " << job_conf.cluster().workspace();
+  if (job_conf.num_openblas_threads() != 1)
+    LOG(WARNING) << "openblas launches "
+                 << job_conf.num_openblas_threads() << " threads";
+  openblas_set_num_threads(job_conf.num_openblas_threads());
+
   JobProto job;
-  job.CopyFrom(jobConf);
+  job.CopyFrom(job_conf);
+  if (resume)
+    SetupForResume(&job);
   job.set_id(job_id_);
-  Trainer trainer;
-  trainer.Start(resume, singa_conf_, &job);
+  Train(job);
 }
 
+void Driver::Train(const JobProto& job_conf) {
+  auto cluster = Cluster::Get();
+  int nserver_grps = cluster->nserver_groups();
+  int grp_size = cluster->nworkers_per_group();
+  Stub stub;
+  // no need to create Stub if there is only a single worker without servers,
+  // i.e., the training will be conducted by the single worker.
+  if (grp_size > 1 || nserver_grps > 0) {
+    stub.Setup();
+    // TODO(wangwei)  register endpoint to zookeeper if > 1 procs;
+    cluster->Register(getpid(), stub.endpoint());  // getpid() is from unistd.h
+  }
+
+  NeuralNet* net = NeuralNet::Create(job_conf.neuralnet(), kTrain, grp_size);
+  const vector<Worker*> workers = CreateWorkers(job_conf, net);
+  const vector<Server*> servers = CreateServers(job_conf, net);
+
+#ifdef USE_MPI
+  int nthreads = workers.size() + servers.size() + 1;
+  for (int i = 0; i < nthreads; i++)
+    MPIQueues.push_back(make_shared<SafeQueue>());
+#endif
+
+  vector<std::thread> threads;
+  for (auto server : servers)
+    threads.push_back(std::thread(&Server::Run, server));
+  for (auto worker : workers)
+    threads.push_back(std::thread(&Worker::Run, worker));
+  if (grp_size > 1 || nserver_grps > 0) {
+    int nservers_per_grp = cluster->nservers_per_group();
+    int lcm = LeastCommonMultiple(nservers_per_grp, nserver_grps);
+    auto slices = Param::ComputeSlices(lcm, net->params());
+    auto slice2server = PartitionSlices(nservers_per_grp, slices);
+    stub.Run(slice2server, workers, servers);
+  }
+
+  for (auto& thread : threads)
+    thread.join();
+  for (auto server : servers)
+    delete server;
+  delete net;
+  std::set<NeuralNet*> deleted{net, nullptr};
+  for (auto worker : workers) {
+    for (auto ptr : worker->GetNets())
+    if (deleted.find(ptr) == deleted.end()) {
+      delete ptr;
+      deleted.insert(ptr);
+    }
+    delete worker;
+  }
+}
+
+void Driver::SetupForResume(JobProto* job_conf) {
+  tinydir_dir dir;
+  std::string folder = Cluster::Get()->checkpoint_folder();
+  tinydir_open(&dir, folder.c_str());
+  int latest_step = 0;
+  // there would be multi checkpoint files (from diff workers) for one step
+  vector<std::string> ck_files;
+  // iterate all files to get the files for the last checkpoint
+  while (dir.has_next) {
+    tinydir_file file;
+    tinydir_readfile(&dir, &file);
+    tinydir_next(&dir);
+    char* ch = strstr(file.name, "step");
+    if (ch == nullptr) {
+      if (file.name[0] != '.')
+        LOG(INFO) << "Irregular file in checkpoint folder: " << file.name;
+      continue;
+    }
+    LOG(INFO) << "Add checkpoint file for resume: " << ch;
+    int step = atoi(ch+4);
+    if (step == latest_step) {
+      ck_files.push_back(file.name);
+    } else if (step > latest_step) {
+      latest_step = step;
+      ck_files.clear();
+      ck_files.push_back(std::string(file.name));
+    }
+  }
+  if (latest_step > 0) {
+    job_conf->set_step(latest_step);
+    if (!job_conf->has_reset_param_version())
+      job_conf->set_reset_param_version(false);
+    job_conf->clear_checkpoint_path();
+    for (auto ck_file : ck_files)
+      job_conf->add_checkpoint_path(folder + "/" + ck_file);
+  }
+  tinydir_close(&dir);
+}
+
+const vector<Worker*> Driver::CreateWorkers(const JobProto& job_conf,
+    NeuralNet* net) {
+  auto cluster = Cluster::Get();
+  vector<Worker*> workers;
+  if (!cluster->has_worker()) return workers;
+  int wgrp_size = cluster->nworkers_per_group();
+  int nservers_per_grp = cluster->nservers_per_group();
+  int nserver_grps = cluster->nserver_groups();
+  int lcm = LeastCommonMultiple(nserver_grps, nservers_per_grp);
+  const vector<int> rng = cluster->ExecutorRng(cluster->procs_id(),
+      cluster->nworkers_per_group(), cluster->nworkers_per_procs());
+  int gstart = rng[0], gend = rng[1], wstart = rng[2], wend = rng[3];
+  for (int gid = gstart; gid < gend; gid++) {
+    NeuralNet* train_net = nullptr, *test_net = nullptr, *val_net = nullptr;
+    if (gid == gstart) {
+      train_net = net;
+      Param::SliceParams(lcm, train_net->params());
+      // test and validation are performed by the 1st group.
+      if (gid == 0 && job_conf.test_steps() > 0) {
+        test_net = NeuralNet::Create(job_conf.neuralnet(), kTest, 1);
+        test_net->ShareParamsFrom(train_net);
+      }
+      if (gid == 0 && job_conf.validate_steps() > 0) {
+        val_net = NeuralNet::Create(job_conf.neuralnet(), kVal, 1);
+        val_net->ShareParamsFrom(train_net);
+      }
+    } else {
+      train_net = NeuralNet::Create(job_conf.neuralnet(), kTrain, wgrp_size);
+      if (cluster->share_memory()) {
+        train_net->ShareParamsFrom(net);
+      } else {
+        Param::SliceParams(lcm, train_net->params());
+      }
+    }
+    for (int wid = wstart; wid < wend; wid++) {
+      auto *worker = Worker::Create(job_conf.train_one_batch());
+      // TODO(wangwei) extend to test among workers in a grp
+      if (wid == 0)
+        worker->Setup(gid, wid, job_conf, train_net, val_net, test_net);
+      else
+        worker->Setup(gid, wid, job_conf, train_net, nullptr, nullptr);
+      workers.push_back(worker);
+    }
+  }
+  return workers;
+}
+
+const vector<Server*> Driver::CreateServers(const JobProto& job_conf,
+    NeuralNet* net) {
+  auto cluster = Cluster::Get();
+  vector<Server*> servers;
+  if (!cluster->has_server()) return servers;
+  int nservers_per_grp = cluster->nservers_per_group();
+  int nserver_grps = cluster->nserver_groups();
+  int lcm = LeastCommonMultiple(nserver_grps, nservers_per_grp);
+  auto slices = Param::ComputeSlices(lcm, net->params());
+  // partition among server groups, each group maintains one sub-set for sync
+  auto slice2group = PartitionSlices(nserver_grps, slices);
+  // partition within one server group, each server updates for one sub-set
+  auto slice2server = PartitionSlices(nservers_per_grp, slices);
+
+  int server_procs = cluster->procs_id();
+  // if true, server procs (logical) id starts after worker procs
+  if (cluster->server_worker_separate())
+    server_procs -= cluster->nworker_procs();
+  const vector<int> rng = cluster->ExecutorRng(server_procs,
+      cluster->nservers_per_group(), cluster->nservers_per_procs());
+  int gstart = rng[0], gend = rng[1], start = rng[2], end = rng[3];
+  for (int gid = gstart; gid < gend; gid++) {
+    for (int sid = start; sid < end; sid++) {
+      auto server = new Server(gid, sid, job_conf, slice2group, slice2server);
+      servers.push_back(server);
+    }
+  }
+  return servers;
+}
 }  // namespace singa

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/321ef96a/src/main.cc
----------------------------------------------------------------------
diff --git a/src/main.cc b/src/main.cc
index 5d2ab2f..99c91b8 100644
--- a/src/main.cc
+++ b/src/main.cc
@@ -7,9 +7,9 @@
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
-* 
+*
 *   http://www.apache.org/licenses/LICENSE-2.0
-* 
+*
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
@@ -19,9 +19,9 @@
 *
 *************************************************************/
 
-#include "singa.h"
+#include "./singa.h"
 /**
- * \file main.cc provides an example main func.
+ * \file main.cc provides an example main function.
  *
  * Like the main func of Hadoop, it prepares the job configuration and submit it
  * to the Driver which starts the training.
@@ -31,19 +31,17 @@
  * func must call Driver::Init at the beginning, and pass the job configuration
  * and resume option to the Driver for job submission.
  *
- * Optionally, users can register their own implemented classes, e.g., layer,
- * updater, through the registration func provided by the Driver.
+ * Optionally, users can register their own implemented subclasses of Layer,
+ * Updater, etc. through the registration function provided by the Driver.
  *
  * Users must pass at least one argument to the singa-run.sh, i.e., the job
  * configuration file which includes the cluster topology setting. Other fields
  * e.g, neuralnet, updater can be configured in main.cc.
  *
  * TODO
- * Add helper functions for users to generate their configurations easily.
- * e.g., AddLayer(layer_type, source_layers, meta_data),
- * or, MLP(layer1_size, layer2_size, tanh, loss);
+ * Add helper functions for users to generate configurations for popular models
+ * easily, e.g., MLP(layer1_size, layer2_size, tanh, loss);
  */
-
 int main(int argc, char **argv) {
   // must create driver at the beginning and call its Init method.
   singa::Driver driver;
@@ -58,7 +56,7 @@ int main(int argc, char **argv) {
   // get the job conf, and custmize it if need
   singa::JobProto jobConf = driver.job_conf();
 
-  // submit the job
-  driver.Submit(resume, jobConf);
+  // submit the job for training
+  driver.Train(resume, jobConf);
   return 0;
 }
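
To complement the example main function above, a user program may tweak the returned job configuration before submitting it. The sketch below relies only on Driver::Init, Driver::job_conf and Driver::Train as shown in this patch; the "-resume" flag parsing and set_train_steps() (assumed to be the protobuf-generated setter for the train_steps field used in the example job configurations) are illustrative assumptions:

    // hypothetical customized main (sketch)
    #include <string>
    #include "./singa.h"

    int main(int argc, char **argv) {
      singa::Driver driver;
      driver.Init(argc, argv);              // must be called first

      bool resume = false;                  // assumed command-line flag
      for (int i = 1; i < argc; i++)
        if (std::string(argv[i]) == "-resume") resume = true;

      singa::JobProto jobConf = driver.job_conf();
      jobConf.set_train_steps(1000);        // assumed protobuf setter

      driver.Train(resume, jobConf);        // submit the job for training
      return 0;
    }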


[04/13] incubator-singa git commit: SINGA-72 Minor updates to be consistent with documentation

Posted by wa...@apache.org.
SINGA-72 Minor updates to be consistent with documentation

Add OpenBLAS installation details to the FAQ of README.md.
Remove compiler flags (-O2, -gdwarf-2, -gstrict-dwarf,
-Woverloaded-virtual) that hinder debugging.


Project: http://git-wip-us.apache.org/repos/asf/incubator-singa/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-singa/commit/243f2106
Tree: http://git-wip-us.apache.org/repos/asf/incubator-singa/tree/243f2106
Diff: http://git-wip-us.apache.org/repos/asf/incubator-singa/diff/243f2106

Branch: refs/heads/master
Commit: 243f21061139b2934d7dac52645209f69f80eade
Parents: 6bb1a8a
Author: xiezl <xi...@comp.nus.edu.sg>
Authored: Fri Sep 25 12:03:00 2015 +0800
Committer: Wei Wang <wa...@comp.nus.edu.sg>
Committed: Sat Sep 26 23:23:45 2015 +0800

----------------------------------------------------------------------
 Makefile.am  | 14 ++++++--------
 README.md    | 14 ++++++++++++++
 configure.ac |  2 +-
 3 files changed, 21 insertions(+), 9 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/243f2106/Makefile.am
----------------------------------------------------------------------
diff --git a/Makefile.am b/Makefile.am
index 6f3be4c..3f68e29 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -1,15 +1,15 @@
 ACLOCAL_AMFLAGS = -I config
 AUTOMAKE_OPTIONS = foreign subdir-objects
 
-AM_CPPFLAGS = -I$(top_srcdir)/src
+#AM_CPPFLAGS = -I$(top_srcdir)/src
 
 MSHADOW_FLAGS = -DMSHADOW_USE_CUDA=0 -DMSHADOW_USE_CBLAS=1 -DMSHADOW_USE_MKL=0
 DEFAULT_FLAGS = -Wall -pthread -fPIC -std=c++11 -Wno-unknown-pragmas \
               $(MSHADOW_FLAGS) -DCPU_ONLY=1 -funroll-loops -DTHREADED
 
-CFLAGS = -O2 $(DEBUG)
-CXXFLAGS = -O2 $(DEBUG) 
-AC_CXXFLAGS = -O2 $(DEBUG)
+CFLAGS = $(DEBUG)
+CXXFLAGS = $(DEBUG) 
+AC_CXXFLAGS = $(DEBUG)
 
 INCLUDES = -I$(top_srcdir)/include
 
@@ -101,8 +101,7 @@ bin_PROGRAMS = singa singatool $(PROGS)
 
 #lib_LTLIBRARIES = libsinga.la
 libsinga_la_SOURCES = $(PROTO_HDRS) $(PROTO_SRCS) $(SINGA_HDRS) $(SINGA_SRCS)
-libsinga_la_CXXFLAGS = $(DEFAULT_FLAGS) -gdwarf-2 -msse3 \
-                       -gstrict-dwarf -Woverloaded-virtual -fpermissive
+libsinga_la_CXXFLAGS = $(DEFAULT_FLAGS) -msse3 -fpermissive
 if LMDB
 libsinga_la_CXXFLAGS += -DUSE_LMDB
 endif
@@ -138,8 +137,7 @@ singatool_LDFLAGS = -I./include \
 
 #lib_LTLIBRARIES += libgtest.la
 libgtest_la_SOURCES = $(GTEST_HDRS) $(GTEST_SRCS)
-libgtest_la_CXXFLAGS = $(DEFAULT_FLAGS) -gdwarf-2 -msse3 \
-					-gstrict-dwarf -Woverloaded-virtual -fpermissive
+libgtest_la_CXXFLAGS = $(DEFAULT_FLAGS) -msse3 -fpermissive
 if LMDB
 libgtest_la_CXXFLAGS += -DUSE_LMDB
 endif

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/243f2106/README.md
----------------------------------------------------------------------
diff --git a/README.md b/README.md
index 080f4f5..af903a9 100644
--- a/README.md
+++ b/README.md
@@ -22,6 +22,8 @@ The current code depends on the following external libraries:
   * czmq (Mozilla Public License Version 2.0)
   * zookeeper (Apache 2.0)
 
+To install openblas, you need a fortran compiler.
+
 ##Documentation
 
 Full documentation is available online at [Official Documentation](https://singa.incubator.apache.org/docs/overview.html#).
@@ -142,3 +144,15 @@ google.protobuf.internal when I try to import .py files.
   check it by
 
       $ java --version
+
+* Q8: When I build OpenBLAS from source, I am told that I need a fortran compiler.
+
+  A8: Since OpenBLAS use fortran compiler to build the library, you need a compiler with fortran support. As an alternative, you can build OpenBLAS from system tools. For example, if you have APT, just run:
+	 
+	  $ sudo apt-get install openblas
+
+  or you can also run the following command if you have yum:
+
+	  $ sudo yum install openblas-devel
+
+  It is worth noting that you need root access to run the aforementioned commands.

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/243f2106/configure.ac
----------------------------------------------------------------------
diff --git a/configure.ac b/configure.ac
index 1ae5cec..65068ba 100644
--- a/configure.ac
+++ b/configure.ac
@@ -84,7 +84,7 @@ AM_CONDITIONAL(DEBUG, test "$enable_debug" = yes)
 if test x"$enable_debug" != x"no"; then
 	DEBUG='-g'
 else
-	DEBUG=''
+	DEBUG='-O2'
 fi
 AC_SUBST([DEBUG])
 


[10/13] incubator-singa git commit: SINGA-72 Minor updates to be consistent with documentation

Posted by wa...@apache.org.
SINGA-72 Minor updates to be consistent with documentation

Comment out the installation instructions for the protocol buffer Python libs.

Update the README to install all dependent libs into $PREFIX.


Project: http://git-wip-us.apache.org/repos/asf/incubator-singa/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-singa/commit/88e2ba32
Tree: http://git-wip-us.apache.org/repos/asf/incubator-singa/tree/88e2ba32
Diff: http://git-wip-us.apache.org/repos/asf/incubator-singa/diff/88e2ba32

Branch: refs/heads/master
Commit: 88e2ba32734ebae5551234bc63e613da48f83c67
Parents: 321ef96
Author: Wei Wang <wa...@comp.nus.edu.sg>
Authored: Sun Sep 27 10:47:01 2015 +0800
Committer: Wei Wang <wa...@comp.nus.edu.sg>
Committed: Sun Sep 27 10:47:01 2015 +0800

----------------------------------------------------------------------
 README.md             | 43 ++++++++++++++++++++++++++++++++++---------
 thirdparty/install.sh | 16 ++++++++--------
 2 files changed, 42 insertions(+), 17 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/88e2ba32/README.md
----------------------------------------------------------------------
diff --git a/README.md b/README.md
index bf7874a..5e20ded 100644
--- a/README.md
+++ b/README.md
@@ -25,11 +25,17 @@ You can install all dependencies into $PREFIX folder by
 
     ./thirdparty/install.sh all $PREFIX
 
+You can also install these libraries one by one. The usage is listed by
+
+    ./thidparty/install.sh
+
 If $PREFIX is not a system path (e.g., /usr/local/), you have to export some
 environment variables,
 
     export LD_LIBRARY_PATH=$PREFIX/lib:$LD_LIBRARY_PATH
-    export CPLUS_INCLUDE_PATH=$PREFIX/include
+    export CPLUS_INCLUDE_PATH=$PREFIX/include:$CPLUS_INCLUDE_PATH
+    export LIBRARY_PATH=$PREFIX/lib:$LIBRARY_PATH
+    export PATH=$PREFIX/bin:$PATH
 
 ##Documentation
 
@@ -81,15 +87,15 @@ For additional information, see the LICENSE and NOTICE files.
 ## FAQ
 
 * Q1:I get error `./configure --> cannot find blas_segmm() function` even I
-run `install.sh OpenBLAS`.
+have installed OpenBLAS.
 
-  A1: `OpenBLAS` library is installed in `/opt` folder by default or
+  A1: `OpenBLAS` library is installed in `/opt` folder by default or $PREFIX or
   other folders if you use `sudo apt-get install`.
-  You need to include the OpenBLAS library folder in the LDFLAGS, e.g.,
+  You need to export the OpenBLAS library folder, e.g.,
 
-      $ export LDFLAGS=-L/opt/OpenBLAS
-
-  Alternatively, you can include the path in LIBRARY_PATH.
+      $ export LIBRARY_PATH=$PREFIX/lib:$LIBRARY_PATH
+      # or
+      $ export LIBRARY_PATH=/opt/OpenBLAS/lib:$LIBRARY_PATH
 
 
 * Q2: I get error `cblas.h no such file or directory exists`.
@@ -98,6 +104,8 @@ run `install.sh OpenBLAS`.
   e.g.,
 
       $ export CPLUS_INCLUDE_PATH=/opt/OpenBLAS/include:$CPLUS_INCLUDE_PATH
+      # or
+      $ export CPLUS_INCLUDE_PATH=$PREFIX/include:$CPLUS_INCLUDE_PATH
       # reconfigure and make SINGA
       $ ./configure
       $ make
@@ -158,10 +166,27 @@ google.protobuf.internal when I try to import .py files.
 
   or install it using
 
-	  $ sudo apt-get install openblas
+	    $ sudo apt-get install openblas
 
   or
 
-	  $ sudo yum install openblas-devel
+	    $ sudo yum install openblas-devel
 
   It is worth noting that you need root access to run the last two commands.
+
+* Q9: When I build protocol buffer, it reports that GLIBC++_3.4.20 not found in /usr/lib64/libstdc++.so.6.
+
+  A9: This means the linker found libstdc++.so.6 but that library
+  belongs to an older version of GCC than was used to compile and link the
+  program. The program depends on code defined in
+  the newer libstdc++ that belongs to the newer version of GCC, so the linker
+  must be told how to find the newer libstdc++ shared library.
+  The simplest way to fix this is to find the correct libstdc++ and export it to
+  LD_LIBRARY_PATH. For example, if GLIBC++_3.4.20 is listed in the output of the
+  following command,
+
+      $ strings /usr/local/lib64/libstdc++.so.6|grep GLIBC++
+
+  then you just set your environment variable as
+
+      $ export LD_LIBRARY_PATH=/usr/local/lib64:$LD_LIBRARY_PATH

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/88e2ba32/thirdparty/install.sh
----------------------------------------------------------------------
diff --git a/thirdparty/install.sh b/thirdparty/install.sh
index 86211d6..9e7b73f 100755
--- a/thirdparty/install.sh
+++ b/thirdparty/install.sh
@@ -258,19 +258,19 @@ function install_protobuf()
 			echo "install protobuf in $1";
 			./configure --prefix=$1;
 			make && make install;
-			cd python;
-			python setup.py build;
-			python setup.py install --prefix=$1;
-			cd ..;
+			#cd python;
+			#python setup.py build;
+			#python setup.py install --prefix=$1;
+			#cd ..;
 		elif [ $# == 0 ]
 		then
 			echo "install protobuf in default path";
 			./configure;
 			make && sudo make install;
-			cd python;
-			python setup.py build;
-			sudo python setup.py install;
-			cd ..;
+			#cd python;
+			#python setup.py build;
+			#sudo python setup.py install;
+			#cd ..;
 		else
 			echo "wrong commands";
 	fi


[13/13] incubator-singa git commit: SINGA-72 Minor updates to be consistent with documentation

Posted by wa...@apache.org.
SINGA-72 Minor updates to be consistent with documentation

Format driver.cc, rnnlm.h, and param.cc.

close #69
close #65
close #63
close #39


Project: http://git-wip-us.apache.org/repos/asf/incubator-singa/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-singa/commit/2f665370
Tree: http://git-wip-us.apache.org/repos/asf/incubator-singa/tree/2f665370
Diff: http://git-wip-us.apache.org/repos/asf/incubator-singa/diff/2f665370

Branch: refs/heads/master
Commit: 2f665370bda5ff217fb5562125567ff305ddf8af
Parents: 45e5026
Author: wang sheng <wa...@gmail.com>
Authored: Sun Sep 27 22:12:00 2015 +0800
Committer: wang sheng <wa...@gmail.com>
Committed: Sun Sep 27 22:31:37 2015 +0800

----------------------------------------------------------------------
 examples/rnnlm/rnnlm.h | 3 +++
 src/driver.cc          | 7 +++++--
 src/utils/param.cc     | 2 +-
 3 files changed, 9 insertions(+), 3 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/2f665370/examples/rnnlm/rnnlm.h
----------------------------------------------------------------------
diff --git a/examples/rnnlm/rnnlm.h b/examples/rnnlm/rnnlm.h
index ad0918e..8ad7a68 100644
--- a/examples/rnnlm/rnnlm.h
+++ b/examples/rnnlm/rnnlm.h
@@ -18,8 +18,11 @@
 * under the License.
 *
 *************************************************************/
+
 #ifndef EXAMPLES_RNNLM_RNNLM_H_
 #define EXAMPLES_RNNLM_RNNLM_H_
+
+#include <string>
 #include <vector>
 #include "./singa.h"
 #include "./rnnlm.pb.h"

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/2f665370/src/driver.cc
----------------------------------------------------------------------
diff --git a/src/driver.cc b/src/driver.cc
index d3f0f3e..28669fa 100644
--- a/src/driver.cc
+++ b/src/driver.cc
@@ -19,15 +19,19 @@
 *
 *************************************************************/
 
+#include "./driver.h"
+
 #include <glog/logging.h>
 #include <set>
 #include <string>
+#include <vector>
 #include "neuralnet/layer.h"
 #include "utils/common.h"
 #include "utils/tinydir.h"
 #include "utils/cluster.h"
+#include "./server.h"
 #include "./stub.h"
-#include "./driver.h"
+#include "./worker.h"
 
 extern "C" void openblas_set_num_threads(int num);
 
@@ -109,7 +113,6 @@ void Driver::Init(int argc, char **argv) {
   RegisterParamGenerator<UniformSqrtFanInOutGen>(kUniformSqrtFanInOut);
 }
 
-
 void Driver::Train(bool resume, const JobProto& job_conf) {
   Cluster::Setup(job_id_, singa_conf_, job_conf.cluster());
   if (singa_conf_.has_log_dir())

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/2f665370/src/utils/param.cc
----------------------------------------------------------------------
diff --git a/src/utils/param.cc b/src/utils/param.cc
index 1ee4dcd..83bd818 100644
--- a/src/utils/param.cc
+++ b/src/utils/param.cc
@@ -118,7 +118,7 @@ const vector<int> Param::ComputeSlices(int num, const vector<Param*>& params) {
   // is assgined a sub-set of slices)
   auto param_slice = Slice(num, paramsize);
   vector<int> slices;
-  for (auto const vec: param_slice)
+  for (auto const vec : param_slice)
     for (int len : vec)
       slices.push_back(len);
   return slices;


[12/13] incubator-singa git commit: SINGA-72 Minor updates to be consistent with documentation

Posted by wa...@apache.org.
SINGA-72 Minor updates to be consistent with documentation

Polish the text of README.md (by Anh).


Project: http://git-wip-us.apache.org/repos/asf/incubator-singa/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-singa/commit/45e50261
Tree: http://git-wip-us.apache.org/repos/asf/incubator-singa/tree/45e50261
Diff: http://git-wip-us.apache.org/repos/asf/incubator-singa/diff/45e50261

Branch: refs/heads/master
Commit: 45e50261f639c511b841c896c8a5ce9fc6298c06
Parents: 2160ab2
Author: Wei Wang <wa...@comp.nus.edu.sg>
Authored: Sun Sep 27 17:48:15 2015 +0800
Committer: Wei Wang <wa...@comp.nus.edu.sg>
Committed: Sun Sep 27 17:48:15 2015 +0800

----------------------------------------------------------------------
 README.md | 30 ++++++++++++++++--------------
 1 file changed, 16 insertions(+), 14 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/45e50261/README.md
----------------------------------------------------------------------
diff --git a/README.md b/README.md
index c8171e3..9a81402 100644
--- a/README.md
+++ b/README.md
@@ -28,10 +28,10 @@ You can install all dependencies into $PREFIX folder by
 If $PREFIX is not a system path (e.g., /usr/local/), please export the following
 variables to continue the building instructions,
 
-    export LD_LIBRARY_PATH=$PREFIX/lib:$LD_LIBRARY_PATH
-    export CPLUS_INCLUDE_PATH=$PREFIX/include:$CPLUS_INCLUDE_PATH
-    export LIBRARY_PATH=$PREFIX/lib:$LIBRARY_PATH
-    export PATH=$PREFIX/bin:$PATH
+    $ export LD_LIBRARY_PATH=$PREFIX/lib:$LD_LIBRARY_PATH
+    $ export CPLUS_INCLUDE_PATH=$PREFIX/include:$CPLUS_INCLUDE_PATH
+    $ export LIBRARY_PATH=$PREFIX/lib:$LIBRARY_PATH
+    $ export PATH=$PREFIX/bin:$PATH
 
 ##Documentation
 
@@ -39,12 +39,11 @@ Full documentation is available online at [Official Documentation](https://singa
 
 ##Building SINGA
 
-Please make sure your g++ >= 4.8.1 before building SINGA.
+Please make sure you have g++ >= 4.8.1 before building SINGA.
 
     $ ./autogen.sh (optional)
-    # pls refer to FAQ for solutions of errors
-    $ ./configure
-    $ make
+    $ ./configure #refer to the FAQs below for errors during this step, including blas_segmm() function error. 
+    $ make 	  #refer to the FAQs when encountering errors during this step. 
 
 ##Running Examples
 
@@ -87,11 +86,14 @@ For additional information, see the LICENSE and NOTICE files.
 * Q1:I get error `./configure --> cannot find blas_segmm() function` even I
 have installed OpenBLAS.
 
-  A1: This means the compiler cannot find the `OpenBLAS` library. If you installed
-  it to $PREFIX (e.g., /opt/OpenBLAS), then you need to export it as
+  A1: This means the compiler cannot find the `OpenBLAS` library. If you have installed OpenBLAS via `apt-get install`,
+  then export the path to $LD_LIBRARY_PATH (e.g. /usr/lib/openblas-base). If you installed it with
+  `./thirdparty/install.sh`, then export the correct path based on $PREFIX (e.g. /opt/OpenBLAS/lib): 
 
-      $ export LIBRARY_PATH=$PREFIX/lib:$LIBRARY_PATH
-      # e.g.,
+      # using apt-get install for openblas	
+      $ export LIBRARY_PATH=$PATH_TO_OPENBLAS_LIB:$LIBRARY_PATH
+
+      # using ./thirdparty/install.sh for openblas:
       $ export LIBRARY_PATH=/opt/OpenBLAS/lib:$LIBRARY_PATH
 
 
@@ -163,11 +165,11 @@ google.protobuf.internal when I try to import .py files.
 
   or install it using
 
-	    $ sudo apt-get install openblas-dev
+      $ sudo apt-get install openblas-dev
 
   or
 
-	    $ sudo yum install openblas-devel
+      $ sudo yum install openblas-devel
 
   It is worth noting that you need root access to run the last two commands.
   Remember to set the environment variables to include the header and library


[11/13] incubator-singa git commit: SINGA-72 Minor updates to be consistent with documentation

Posted by wa...@apache.org.
SINGA-72 Minor updates to be consistent with documentation

* Remove lmdb from the default installation list (./install.sh all PREFIX).
* Add -msse3 to examples/rnnlm/Makefile.example.
* Tested on a virtual machine running Ubuntu 12.04 (with g++ updated to 4.8.1).


Project: http://git-wip-us.apache.org/repos/asf/incubator-singa/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-singa/commit/2160ab21
Tree: http://git-wip-us.apache.org/repos/asf/incubator-singa/tree/2160ab21
Diff: http://git-wip-us.apache.org/repos/asf/incubator-singa/diff/2160ab21

Branch: refs/heads/master
Commit: 2160ab21bd2c899369304808af4dfd4656b9d06c
Parents: 88e2ba3
Author: Wei Wang <wa...@comp.nus.edu.sg>
Authored: Sun Sep 27 15:34:29 2015 +0800
Committer: Wei Wang <wa...@comp.nus.edu.sg>
Committed: Sun Sep 27 15:34:29 2015 +0800

----------------------------------------------------------------------
 README.md                       | 33 ++++++++++++++++-----------------
 examples/rnnlm/Makefile.example |  2 +-
 thirdparty/install.sh           | 24 ++++++++++++------------
 3 files changed, 29 insertions(+), 30 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/2160ab21/README.md
----------------------------------------------------------------------
diff --git a/README.md b/README.md
index 5e20ded..c8171e3 100644
--- a/README.md
+++ b/README.md
@@ -25,12 +25,8 @@ You can install all dependencies into $PREFIX folder by
 
     ./thirdparty/install.sh all $PREFIX
 
-You can also install these libraries one by one. The usage is listed by
-
-    ./thidparty/install.sh
-
-If $PREFIX is not a system path (e.g., /usr/local/), you have to export some
-environment variables,
+If $PREFIX is not a system path (e.g., /usr/local/), please export the following
+variables to continue the building instructions,
 
     export LD_LIBRARY_PATH=$PREFIX/lib:$LD_LIBRARY_PATH
     export CPLUS_INCLUDE_PATH=$PREFIX/include:$CPLUS_INCLUDE_PATH
@@ -43,6 +39,8 @@ Full documentation is available online at [Official Documentation](https://singa
 
 ##Building SINGA
 
+Please make sure your g++ >= 4.8.1 before building SINGA.
+
     $ ./autogen.sh (optional)
     # pls refer to FAQ for solutions of errors
     $ ./configure
@@ -62,10 +60,10 @@ First, download the dataset and create data shards:
     $ make download
     $ make create
 
-If it reports errors due to libopenblas.so missing, then include the
-lib folder of OpenBLAS in LD_LIBRARY_PATH
+If it reports errors due to library missing, e.g., libopenblas.so or libprotobuf,
+please export the environment variables shown in the Dependencies section and
+continue with the following instructions,
 
-    $ export LD_LIBRARY_PATH=$OPENBLAS_FOLDER/lib:$LD_LIBRARY_PATH
     # delete the newly created folders
     $ rm -rf cifar10_t*
     $ make create
@@ -89,12 +87,11 @@ For additional information, see the LICENSE and NOTICE files.
 * Q1:I get error `./configure --> cannot find blas_segmm() function` even I
 have installed OpenBLAS.
 
-  A1: `OpenBLAS` library is installed in `/opt` folder by default or $PREFIX or
-  other folders if you use `sudo apt-get install`.
-  You need to export the OpenBLAS library folder, e.g.,
+  A1: This means the compiler cannot find the `OpenBLAS` library. If you installed
+  it to $PREFIX (e.g., /opt/OpenBLAS), then you need to export it as
 
       $ export LIBRARY_PATH=$PREFIX/lib:$LIBRARY_PATH
-      # or
+      # e.g.,
       $ export LIBRARY_PATH=/opt/OpenBLAS/lib:$LIBRARY_PATH
 
 
@@ -103,10 +100,10 @@ have installed OpenBLAS.
   Q2: You need to include the folder of the cblas.h into CPLUS_INCLUDE_PATH,
   e.g.,
 
-      $ export CPLUS_INCLUDE_PATH=/opt/OpenBLAS/include:$CPLUS_INCLUDE_PATH
-      # or
       $ export CPLUS_INCLUDE_PATH=$PREFIX/include:$CPLUS_INCLUDE_PATH
-      # reconfigure and make SINGA
+      # e.g.,
+      $ export CPLUS_INCLUDE_PATH=/opt/OpenBLAS/include:$CPLUS_INCLUDE_PATH
+      # then reconfigure and make SINGA
       $ ./configure
       $ make
 
@@ -166,13 +163,15 @@ google.protobuf.internal when I try to import .py files.
 
   or install it using
 
-	    $ sudo apt-get install openblas
+	    $ sudo apt-get install openblas-dev
 
   or
 
 	    $ sudo yum install openblas-devel
 
   It is worth noting that you need root access to run the last two commands.
+  Remember to set the environment variables to include the header and library
+  paths of OpenBLAS after installation (please refer to the Dependencies section).
 
 * Q9: When I build protocol buffer, it reports that GLIBC++_3.4.20 not found in /usr/lib64/libstdc++.so.6.
 

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/2160ab21/examples/rnnlm/Makefile.example
----------------------------------------------------------------------
diff --git a/examples/rnnlm/Makefile.example b/examples/rnnlm/Makefile.example
index 0e2333f..48efd17 100644
--- a/examples/rnnlm/Makefile.example
+++ b/examples/rnnlm/Makefile.example
@@ -47,6 +47,6 @@ create:
 
 rnnlm:
 	protoc --proto_path=../../src/proto --proto_path=. --cpp_out=. rnnlm.proto
-	$(CXX) main.cc rnnlm.cc rnnlm.pb.cc $(MSHADOW_FLAGS) -std=c++11 -lsinga -lglog -lprotobuf -lopenblas -I../../include -I../../include/proto \
+	$(CXX) main.cc rnnlm.cc rnnlm.pb.cc $(MSHADOW_FLAGS) -msse3 -std=c++11 -lsinga -lglog -lprotobuf -lopenblas -I../../include -I../../include/proto \
 		-L../../.libs/ -L/usr/local  -Wl,-unresolved-symbols=ignore-in-shared-libs -Wl,-rpath=../../.libs/\
 		-o rnnlm.bin

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/2160ab21/thirdparty/install.sh
----------------------------------------------------------------------
diff --git a/thirdparty/install.sh b/thirdparty/install.sh
index 9e7b73f..99403b9 100755
--- a/thirdparty/install.sh
+++ b/thirdparty/install.sh
@@ -607,12 +607,12 @@ do
 		        echo "ERROR during glog installation" ;
 		        exit;
 		    fi
-			install_lmdb $2;
-		    if [ $? -ne 0 ]
-		    then
-		        echo "ERROR during lmdb installation" ;
-		        exit;
-		    fi
+#			install_lmdb $2;
+#		    if [ $? -ne 0 ]
+#		    then
+#		        echo "ERROR during lmdb installation" ;
+#		        exit;
+#		    fi
 			install_openblas $2;
 		    if [ $? -ne 0 ]
 		    then
@@ -664,12 +664,12 @@ do
 		        echo "ERROR during glog installation" ;
 		        exit;
 		    fi
-			install_lmdb;
-		    if [ $? -ne 0 ]
-		    then
-		        echo "ERROR during lmdb installation" ;
-		        exit;
-		    fi
+#			install_lmdb;
+#		    if [ $? -ne 0 ]
+#		    then
+#		        echo "ERROR during lmdb installation" ;
+#		        exit;
+#		    fi
 			install_openblas;
 		    if [ $? -ne 0 ]
 		    then


[02/13] incubator-singa git commit: SINGA-72 Minor updates to be consistent with documentation

Posted by wa...@apache.org.
SINGA-72 Minor updates to be consistent with documentation

Update configuration files (e.g., parameter names, file names) for the RBM example to be consistent with the
online documentation.


Project: http://git-wip-us.apache.org/repos/asf/incubator-singa/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-singa/commit/8e7c6cc1
Tree: http://git-wip-us.apache.org/repos/asf/incubator-singa/tree/8e7c6cc1
Diff: http://git-wip-us.apache.org/repos/asf/incubator-singa/diff/8e7c6cc1

Branch: refs/heads/master
Commit: 8e7c6cc18b21e3151fecba605c6e240a3f05aeff
Parents: 4e15c34
Author: Wei Wang <wa...@comp.nus.edu.sg>
Authored: Sun Sep 20 17:49:40 2015 +0800
Committer: Wei Wang <wa...@comp.nus.edu.sg>
Committed: Sat Sep 26 23:23:33 2015 +0800

----------------------------------------------------------------------
 examples/rbm/autoencoder.conf |  80 ++++++++---------
 examples/rbm/rbm0.conf        | 107 -----------------------
 examples/rbm/rbm1.conf        |  49 +++--------
 examples/rbm/rbm2.conf        |  79 ++++++-----------
 examples/rbm/rbm3.conf        |  79 +++++++----------
 examples/rbm/rbm4.conf        | 170 +++++++++++++++++++++++++++++++++++++
 6 files changed, 282 insertions(+), 282 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/8e7c6cc1/examples/rbm/autoencoder.conf
----------------------------------------------------------------------
diff --git a/examples/rbm/autoencoder.conf b/examples/rbm/autoencoder.conf
index bc32cc7..29f7729 100644
--- a/examples/rbm/autoencoder.conf
+++ b/examples/rbm/autoencoder.conf
@@ -3,10 +3,10 @@ train_steps: 12200
 test_steps:100
 test_freq:1000
 disp_freq:100
-checkpoint_path: "examples/rbm/rbm0/checkpoint/step6000-worker0.bin"
 checkpoint_path: "examples/rbm/rbm1/checkpoint/step6000-worker0.bin"
 checkpoint_path: "examples/rbm/rbm2/checkpoint/step6000-worker0.bin"
 checkpoint_path: "examples/rbm/rbm3/checkpoint/step6000-worker0.bin"
+checkpoint_path: "examples/rbm/rbm4/checkpoint/step6000-worker0.bin"
 train_one_batch{
   alg: kBP
 }
@@ -56,7 +56,7 @@ neuralnet {
   }
 
   layer{
-    name: "fc1"
+    name: "Inner1"
     type: kInnerProduct
     srclayers:"mnist"
     innerproduct_conf{
@@ -66,19 +66,19 @@ neuralnet {
       name: "w1"
     }
     param{
-      name: "rb12"
+      name: "b12"
     }
   }
 
   layer{
-    name: "sigmoid1"
+    name: "Sigmoid1"
     type: kSigmoid
-    srclayers:"fc1"
+    srclayers:"Inner1"
   }
   layer{
-    name: "fc2"
+    name: "Inner2"
     type: kInnerProduct
-    srclayers:"sigmoid1"
+    srclayers:"Sigmoid1"
     innerproduct_conf{
       num_output: 500
     }
@@ -86,20 +86,20 @@ neuralnet {
       name: "w2"
     }
     param{
-      name: "rb22"
+      name: "b22"
     }
   }
 
   layer{
-    name: "sigmoid2"
+    name: "Sigmoid2"
     type: kSigmoid
-    srclayers:"fc2"
+    srclayers:"Inner2"
   }
 
   layer{
-    name: "fc3"
+    name: "Inner3"
     type:  kInnerProduct
-    srclayers:"sigmoid2"
+    srclayers:"Sigmoid2"
     innerproduct_conf{
       num_output: 250
     }
@@ -107,20 +107,20 @@ neuralnet {
       name: "w3"
     }
     param{
-      name: "rb32"
+      name: "b32"
     }
   }
 
   layer{
-    name: "sigmoid3"
+    name: "Sigmoid3"
     type: kSigmoid
-    srclayers:"fc3"
+    srclayers:"Inner3"
   }
 
   layer{
-    name: "fc4"
+    name: "Inner4"
     type: kInnerProduct
-    srclayers:"sigmoid3"
+    srclayers:"Sigmoid3"
     innerproduct_conf{
       num_output: 30
     }
@@ -128,16 +128,16 @@ neuralnet {
       name: "w4"
     }
     param{
-      name: "rb42"
+      name: "b42"
 
     }
   }
 
   layer{
-    name: "fc5"
+    name: "Inner5"
     type: kInnerProduct
-    #srclayers:"sigmoid4"
-    srclayers:"fc4"
+    #srclayers:"Sigmoid4"
+    srclayers:"Inner4"
     innerproduct_conf{
       num_output: 250
       transpose: true
@@ -147,19 +147,19 @@ neuralnet {
       share_from: "w4"
     }
     param{
-      name: "rb41"
+      name: "b41"
     }
   }
 
   layer{
-    name: "sigmoid5"
+    name: "Sigmoid5"
     type: kSigmoid
-    srclayers:"fc5"
+    srclayers:"Inner5"
   }
   layer{
-    name: "fc6"
+    name: "Inner6"
     type: kInnerProduct
-    srclayers:"sigmoid5"
+    srclayers:"Sigmoid5"
     innerproduct_conf{
       num_output: 500
       transpose: true
@@ -169,19 +169,19 @@ neuralnet {
       share_from: "w3"
     }
     param{
-      name: "rb31"
+      name: "b31"
     }
   }
 
   layer{
-    name: "sigmoid6"
+    name: "Sigmoid6"
     type: kSigmoid
-    srclayers:"fc6"
+    srclayers:"Inner6"
   }
  layer{
-    name: "fc7"
+    name: "Inner7"
     type: kInnerProduct
-    srclayers:"sigmoid6"
+    srclayers:"Sigmoid6"
     innerproduct_conf{
       num_output: 1000
       transpose: true
@@ -191,20 +191,20 @@ neuralnet {
       share_from: "w2"
     }
     param{
-      name: "rb21"
+      name: "b21"
     }
 
   }
 
   layer{
-    name: "sigmoid7"
+    name: "Sigmoid7"
     type: kSigmoid
-    srclayers:"fc7"
+    srclayers:"Inner7"
   }
  layer{
-    name: "fc8"
+    name: "Inner8"
     type: kInnerProduct
-    srclayers:"sigmoid7"
+    srclayers:"Sigmoid7"
     innerproduct_conf{
       num_output: 784
       transpose: true
@@ -214,20 +214,20 @@ neuralnet {
       share_from: "w1"
     }
     param{
-      name: "rb11"
+      name: "b11"
     }
   }
 
   layer{
-    name: "sigmoid8"
+    name: "Sigmoid8"
     type: kSigmoid
-    srclayers:"fc8"
+    srclayers:"Inner8"
   }
 
   layer{
     name: "loss"
     type:kEuclideanLoss
-    srclayers:"sigmoid8"
+    srclayers:"Sigmoid8"
     srclayers:"mnist"
   }
 }

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/8e7c6cc1/examples/rbm/rbm0.conf
----------------------------------------------------------------------
diff --git a/examples/rbm/rbm0.conf b/examples/rbm/rbm0.conf
deleted file mode 100644
index dba4f37..0000000
--- a/examples/rbm/rbm0.conf
+++ /dev/null
@@ -1,107 +0,0 @@
-name: "rbm0"
-train_steps: 6000
-test_steps:100
-test_freq:500
-disp_freq: 100
-train_one_batch{
-  alg: kCD
-}
-updater{
-  type: kSGD
-  momentum: 0.8
-  weight_decay: 0.0002
-  learning_rate{
-    base_lr: 0.1
-    type: kFixed
-  }
-}
-
-neuralnet {
-layer {
-  name: "data"
-  type: kShardData
-  sharddata_conf {
-    path: "examples/mnist/mnist_train_shard"
-    batchsize: 100
-  }
-  exclude: kTest
-}
-
-
-layer {
-  name: "data"
-  type: kShardData
-  sharddata_conf {
-    path: "examples/mnist/mnist_test_shard"
-    batchsize: 100
-  }
-  exclude: kTrain
-}
-
-
-layer{
-  name:"mnist"
-  type: kMnist
-  srclayers: "data"
-  mnist_conf {
-    norm_a: 255
-    norm_b: 0
-  }
-}
-
-layer{
-  name: "RBMVis"
-  type: kRBMVis
-  srclayers:"mnist"
-  srclayers:"RBMHid"
-  rbm_conf{
-    hdim: 1000
-  }
-  param{
-    name: "w1"
-    init{
-      type: kGaussian
-      mean: 0.0
-      std: 0.1
-    }
-  }
-
-  param{
-    name: "rb11"
-    wd_scale: 0
-    init{
-      type: kConstant
-      value: 0.0
-    }
-  }
-}
-
-layer{
-  name: "RBMHid"
-  type: kRBMHid
-  srclayers:"RBMVis"
-  rbm_conf{
-    hdim: 1000
-  }
-  param{
-    name: "w1_"
-    share_from: "w1"
-  }
-
-  param{
-    name: "rb12"
-    wd_scale: 0
-    init{
-      type: kConstant
-      value: 0.0
-    }
-  }
-}
-}
-cluster {
-  nworker_groups: 1
-  nserver_groups: 1
-  nservers_per_group: 1
-  nworkers_per_group: 1
-  workspace: "examples/rbm/rbm0/"
-}

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/8e7c6cc1/examples/rbm/rbm1.conf
----------------------------------------------------------------------
diff --git a/examples/rbm/rbm1.conf b/examples/rbm/rbm1.conf
index ac6c059..d185766 100644
--- a/examples/rbm/rbm1.conf
+++ b/examples/rbm/rbm1.conf
@@ -6,14 +6,13 @@ disp_freq: 100
 train_one_batch{
   alg: kCD
 }
-checkpoint_path: "examples/rbm/rbm0/checkpoint/step6000-worker0.bin"
 updater{
   type: kSGD
   momentum: 0.8
   weight_decay: 0.0002
   learning_rate{
-  base_lr: 0.1
-  type: kFixed
+    base_lr: 0.1
+    type: kFixed
   }
 }
 
@@ -51,36 +50,15 @@ layer{
 }
 
 layer{
-  name: "fc1"
-  type: kInnerProduct
-  srclayers:"mnist"
-  innerproduct_conf{
-    num_output: 1000
-  }
-  param{
-    name: "w1"
-  }
-  param{
-    name: "rb12"
-  }
-}
-
-layer{
-  name: "sigmoid1"
-  type: kSigmoid
-  srclayers:"fc1"
-}
-
-layer{
   name: "RBMVis"
   type: kRBMVis
-  srclayers:"sigmoid1"
+  srclayers:"mnist"
   srclayers:"RBMHid"
   rbm_conf{
-    hdim: 500
+    hdim: 1000
   }
   param{
-    name: "w2"
+    name: "w1"
     init{
       type: kGaussian
       mean: 0.0
@@ -89,11 +67,11 @@ layer{
   }
 
   param{
-    name: "rb21"
+    name: "b11"
     wd_scale: 0
     init{
-    type: kConstant
-    value: 0.0
+      type: kConstant
+      value: 0.0
     }
   }
 }
@@ -103,14 +81,15 @@ layer{
   type: kRBMHid
   srclayers:"RBMVis"
   rbm_conf{
-    hdim: 500
+    hdim: 1000
   }
   param{
-    name: "w2_"
-    share_from: "w2"
+    name: "w1_"
+    share_from: "w1"
   }
- param{
-    name: "rb22"
+
+  param{
+    name: "b12"
     wd_scale: 0
     init{
       type: kConstant

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/8e7c6cc1/examples/rbm/rbm2.conf
----------------------------------------------------------------------
diff --git a/examples/rbm/rbm2.conf b/examples/rbm/rbm2.conf
index 96841ff..8a16e0f 100644
--- a/examples/rbm/rbm2.conf
+++ b/examples/rbm/rbm2.conf
@@ -7,18 +7,16 @@ train_one_batch{
   alg: kCD
 }
 checkpoint_path: "examples/rbm/rbm1/checkpoint/step6000-worker0.bin"
-
 updater{
   type: kSGD
   momentum: 0.8
   weight_decay: 0.0002
   learning_rate{
-    base_lr: 0.1
-    type: kFixed
+  base_lr: 0.1
+  type: kFixed
   }
 }
 
-
 neuralnet {
 layer {
   name: "data"
@@ -53,56 +51,36 @@ layer{
 }
 
 layer{
-    name: "fc1"
-    type: kInnerProduct
-    srclayers:"mnist"
-    innerproduct_conf{
-      num_output: 1000
-    }
-    param{
-      name: "w1"
-    }
-    param{
-      name: "rb12"
-    }
+  name: "Inner1"
+  type: kInnerProduct
+  srclayers:"mnist"
+  innerproduct_conf{
+    num_output: 1000
   }
-
-  layer{
-    name: "sigmoid1"
-    type: kSigmoid
-    srclayers:"fc1"
+  param{
+    name: "w1"
   }
+  param{
+    name: "b12"
+  }
+}
 
 layer{
-    name: "fc2"
-    type: kInnerProduct
-    srclayers:"sigmoid1"
-    innerproduct_conf{
-      num_output: 500
-    }
-    param{
-      name: "w2"
-    }
-    param{
-      name: "rb22"
-    }
-  }
+  name: "Sigmoid1"
+  type: kSigmoid
+  srclayers:"Inner1"
+}
 
-  layer{
-    name: "sigmoid2"
-    type: kSigmoid
-    srclayers:"fc2"
-  }
 layer{
   name: "RBMVis"
   type: kRBMVis
-  srclayers:"sigmoid2"
+  srclayers:"Sigmoid1"
   srclayers:"RBMHid"
   rbm_conf{
-    hdim: 250
+    hdim: 500
   }
   param{
-    name: "w3"
+    name: "w2"
     init{
       type: kGaussian
       mean: 0.0
@@ -111,7 +89,7 @@ layer{
   }
 
   param{
-    name: "rb31"
+    name: "b21"
     wd_scale: 0
     init{
     type: kConstant
@@ -125,19 +103,18 @@ layer{
   type: kRBMHid
   srclayers:"RBMVis"
   rbm_conf{
-    hdim: 250
+    hdim: 500
   }
   param{
-    name: "w3_"
-    share_from: "w3"
+    name: "w2_"
+    share_from: "w2"
   }
-
-  param{
-    name: "rb32"
+ param{
+    name: "b22"
     wd_scale: 0
     init{
-    type: kConstant
-    value: 0.0
+      type: kConstant
+      value: 0.0
     }
   }
 }

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/8e7c6cc1/examples/rbm/rbm3.conf
----------------------------------------------------------------------
diff --git a/examples/rbm/rbm3.conf b/examples/rbm/rbm3.conf
index fa60114..75848d6 100644
--- a/examples/rbm/rbm3.conf
+++ b/examples/rbm/rbm3.conf
@@ -1,22 +1,24 @@
 name: "rbm3"
 train_steps: 6000
-test_steps: 100
-test_freq: 500
+test_steps:100
+test_freq:500
 disp_freq: 100
 train_one_batch{
   alg: kCD
 }
 checkpoint_path: "examples/rbm/rbm2/checkpoint/step6000-worker0.bin"
+
 updater{
-    type: kSGD
-    momentum: 0.8
-    weight_decay: 0.0002
-    learning_rate{
-      base_lr: 0.001
-      type: kFixed
-    }
+  type: kSGD
+  momentum: 0.8
+  weight_decay: 0.0002
+  learning_rate{
+    base_lr: 0.1
+    type: kFixed
+  }
 }
 
+
 neuralnet {
 layer {
   name: "data"
@@ -51,7 +53,7 @@ layer{
 }
 
 layer{
-    name: "fc1"
+    name: "Inner1"
     type: kInnerProduct
     srclayers:"mnist"
     innerproduct_conf{
@@ -61,20 +63,20 @@ layer{
       name: "w1"
     }
     param{
-      name: "rb12"
+      name: "b12"
     }
   }
 
   layer{
-    name: "sigmoid1"
+    name: "Sigmoid1"
     type: kSigmoid
-    srclayers:"fc1"
+    srclayers:"Inner1"
   }
 
 layer{
-    name: "fc2"
+    name: "Inner2"
     type: kInnerProduct
-    srclayers:"sigmoid1"
+    srclayers:"Sigmoid1"
     innerproduct_conf{
       num_output: 500
     }
@@ -82,55 +84,34 @@ layer{
       name: "w2"
     }
     param{
-      name: "rb22"
+      name: "b22"
     }
   }
 
   layer{
-    name: "sigmoid2"
+    name: "Sigmoid2"
     type: kSigmoid
-    srclayers:"fc2"
+    srclayers:"Inner2"
   }
-
-layer{
-    name: "fc3"
-    type: kInnerProduct
-    srclayers:"sigmoid2"
-    innerproduct_conf{
-      num_output: 250
-    }
-    param{
-      name: "w3"
-    }
-    param{
-      name: "rb32"
-    }
-  }
-
-  layer{
-    name: "sigmoid3"
-    type: kSigmoid
-    srclayers:"fc3"
-  }
-
 layer{
   name: "RBMVis"
   type: kRBMVis
-  srclayers:"sigmoid3"
+  srclayers:"Sigmoid2"
   srclayers:"RBMHid"
   rbm_conf{
-    hdim: 30
+    hdim: 250
   }
   param{
-    name: "w4"
+    name: "w3"
     init{
       type: kGaussian
       mean: 0.0
       std: 0.1
     }
   }
+
   param{
-    name: "rb41"
+    name: "b31"
     wd_scale: 0
     init{
     type: kConstant
@@ -144,15 +125,15 @@ layer{
   type: kRBMHid
   srclayers:"RBMVis"
   rbm_conf{
-    hdim: 30
-    gaussian: true
+    hdim: 250
   }
   param{
-    name: "w4_"
-    share_from: "w4"
+    name: "w3_"
+    share_from: "w3"
   }
+
   param{
-    name: "rb42"
+    name: "b32"
     wd_scale: 0
     init{
     type: kConstant

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/8e7c6cc1/examples/rbm/rbm4.conf
----------------------------------------------------------------------
diff --git a/examples/rbm/rbm4.conf b/examples/rbm/rbm4.conf
new file mode 100644
index 0000000..2b83afb
--- /dev/null
+++ b/examples/rbm/rbm4.conf
@@ -0,0 +1,170 @@
+name: "rbm4"
+train_steps: 6000
+test_steps: 100
+test_freq: 500
+disp_freq: 100
+train_one_batch{
+  alg: kCD
+}
+checkpoint_path: "examples/rbm/rbm3/checkpoint/step6000-worker0.bin"
+updater{
+    type: kSGD
+    momentum: 0.8
+    weight_decay: 0.0002
+    learning_rate{
+      base_lr: 0.001
+      type: kFixed
+    }
+}
+
+neuralnet {
+layer {
+  name: "data"
+  type: kShardData
+  sharddata_conf {
+    path: "examples/mnist/mnist_train_shard"
+    batchsize: 100
+  }
+  exclude: kTest
+}
+
+
+layer {
+  name: "data"
+  type: kShardData
+  sharddata_conf {
+    path: "examples/mnist/mnist_test_shard"
+    batchsize: 100
+  }
+  exclude: kTrain
+}
+
+
+layer{
+  name:"mnist"
+  type: kMnist
+  srclayers: "data"
+  mnist_conf {
+    norm_a: 255
+    norm_b: 0
+  }
+}
+
+layer{
+    name: "Inner1"
+    type: kInnerProduct
+    srclayers:"mnist"
+    innerproduct_conf{
+      num_output: 1000
+    }
+    param{
+      name: "w1"
+    }
+    param{
+      name: "b12"
+    }
+  }
+
+  layer{
+    name: "Sigmoid1"
+    type: kSigmoid
+    srclayers:"Inner1"
+  }
+
+layer{
+    name: "Inner2"
+    type: kInnerProduct
+    srclayers:"Sigmoid1"
+    innerproduct_conf{
+      num_output: 500
+    }
+    param{
+      name: "w2"
+    }
+    param{
+      name: "b22"
+    }
+  }
+
+  layer{
+    name: "Sigmoid2"
+    type: kSigmoid
+    srclayers:"Inner2"
+  }
+
+layer{
+    name: "Inner3"
+    type: kInnerProduct
+    srclayers:"Sigmoid2"
+    innerproduct_conf{
+      num_output: 250
+    }
+    param{
+      name: "w3"
+    }
+    param{
+      name: "b32"
+    }
+  }
+
+  layer{
+    name: "Sigmoid3"
+    type: kSigmoid
+    srclayers:"Inner3"
+  }
+
+layer{
+  name: "RBMVis"
+  type: kRBMVis
+  srclayers:"Sigmoid3"
+  srclayers:"RBMHid"
+  rbm_conf{
+    hdim: 30
+  }
+  param{
+    name: "w4"
+    init{
+      type: kGaussian
+      mean: 0.0
+      std: 0.1
+    }
+  }
+  param{
+    name: "b41"
+    wd_scale: 0
+    init{
+    type: kConstant
+    value: 0.0
+    }
+  }
+}
+
+layer{
+  name: "RBMHid"
+  type: kRBMHid
+  srclayers:"RBMVis"
+  rbm_conf{
+    hdim: 30
+    gaussian: true
+  }
+  param{
+    name: "w4_"
+    share_from: "w4"
+  }
+  param{
+    name: "b42"
+    wd_scale: 0
+    init{
+    type: kConstant
+    value: 0.0
+    }
+  }
+}
+}
+cluster {
+  nworker_groups: 1
+  nserver_groups: 1
+  nservers_per_group: 1
+  nworkers_per_group: 1
+  workspace: "examples/rbm/rbm4/"
+}


[07/13] incubator-singa git commit: SINGA-70 Refactor API of Layer, Worker, Server and Driver

Posted by wa...@apache.org.
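
The diffs below carry out the Layer API refactoring: Setup(), ComputeFeature() and
ComputeGradient() now receive the source layers as an explicit const vector<Layer*>&
argument (replacing the npartitions and Metric* parameters and the srclayers_ member
lookups), DebugString(int step, int flag) becomes ToString(bool debug, int flag), and
loss layers accumulate their metrics internally (see LossLayer::ToString in layer.cc).
The following is a minimal, self-contained C++ sketch of the new method shapes only;
LayerProto, Blob, DataStubLayer and CopyLayer are simplified stand-ins for illustration,
not the real SINGA classes.

// Sketch of the refactored Layer interface: source layers are passed explicitly.
#include <iostream>
#include <string>
#include <vector>

struct LayerProto { std::string name; };    // stand-in for the protobuf message
struct Blob { std::vector<float> values; }; // stand-in for singa::Blob<float>

class Layer {
 public:
  virtual ~Layer() = default;
  // Previously: Setup(const LayerProto& proto, int npartitions).
  virtual void Setup(const LayerProto& conf,
                     const std::vector<Layer*>& srclayers) {
    (void)srclayers;
    conf_ = conf;
  }
  // Previously: ComputeFeature(int flag, Metric* perf) reading srclayers_.
  virtual void ComputeFeature(int flag,
                              const std::vector<Layer*>& srclayers) = 0;
  virtual void ComputeGradient(int flag,
                               const std::vector<Layer*>& srclayers) = 0;
  // Previously: DebugString(int step, int flag).
  virtual std::string ToString(bool debug, int /*flag*/) {
    return debug ? "Layer " + conf_.name : "";
  }
  const Blob& data() const { return data_; }
 protected:
  LayerProto conf_;
  Blob data_;
};

// Toy source layer that just holds a few values.
class DataStubLayer : public Layer {
 public:
  void Setup(const LayerProto& conf,
             const std::vector<Layer*>& srclayers) override {
    Layer::Setup(conf, srclayers);
    data_.values = {1.0f, 2.0f, 3.0f};
  }
  void ComputeFeature(int, const std::vector<Layer*>&) override {}
  void ComputeGradient(int, const std::vector<Layer*>&) override {}
};

// Toy layer that copies its single source layer's data, showing how a
// subclass consumes the explicitly passed srclayers vector.
class CopyLayer : public Layer {
 public:
  void Setup(const LayerProto& conf,
             const std::vector<Layer*>& srclayers) override {
    Layer::Setup(conf, srclayers);
    data_.values.resize(srclayers.at(0)->data().values.size());
  }
  void ComputeFeature(int, const std::vector<Layer*>& srclayers) override {
    data_.values = srclayers.at(0)->data().values;  // "forward" pass
  }
  void ComputeGradient(int, const std::vector<Layer*>&) override {}
};

int main() {
  DataStubLayer src;
  src.Setup(LayerProto{"data"}, {});
  CopyLayer copy;
  copy.Setup(LayerProto{"copy"}, {&src});
  copy.ComputeFeature(0, {&src});
  std::cout << copy.ToString(true, 0)
            << " copied " << copy.data().values.size() << " values\n";
  return 0;
}

Compare the method shapes above with the actual changes in connection_layer.cc,
input_layer.cc, layer.cc, loss_layer.cc and neuron_layer.cc below.
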
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/321ef96a/src/neuralnet/connection_layer.cc
----------------------------------------------------------------------
diff --git a/src/neuralnet/connection_layer.cc b/src/neuralnet/connection_layer.cc
index 1ba2d95..acf243d 100644
--- a/src/neuralnet/connection_layer.cc
+++ b/src/neuralnet/connection_layer.cc
@@ -7,9 +7,9 @@
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
-* 
+*
 *   http://www.apache.org/licenses/LICENSE-2.0
-* 
+*
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
@@ -24,17 +24,24 @@
 namespace singa {
 
 using std::vector;
-
+/********* Implementation for BridgeDstLayer **************/
+void BridgeDstLayer::Setup(const LayerProto& proto,
+    const vector<Layer*>& srclayers) {
+  Layer::Setup(proto, srclayers);
+  CHECK_EQ(srclayers.size(), 1);
+  data_.Reshape(srclayers[0]->data(this).shape());
+  grad_.ReshapeLike(data_);
+}
 /************* Implementation for ConcateLayer ***********/
-void ConcateLayer::Setup(const LayerProto& proto, int npartitions) {
-  // CHECK_EQ(npartitions, 1);
-  Layer::Setup(proto, npartitions);
-  size_t concate_dim = proto.concate_conf().concate_dim();
+void ConcateLayer::Setup(const LayerProto& conf,
+    const vector<Layer*>& srclayers) {
+  Layer::Setup(conf, srclayers);
+  size_t concate_dim = conf.concate_conf().concate_dim();
   CHECK_GE(concate_dim, 0);
-  CHECK_GT(srclayers_.size(), 1);
-  vector<int> shape = srclayers_[0]->data(this).shape();
-  for (size_t i = 1; i < srclayers_.size(); i++) {
-    const vector<int>& srcshape = srclayers_[i]->data(this).shape();
+  CHECK_GT(srclayers.size(), 1);
+  vector<int> shape = srclayers[0]->data(this).shape();
+  for (size_t i = 1; i < srclayers.size(); i++) {
+    const vector<int>& srcshape = srclayers[i]->data(this).shape();
     for (size_t j = 0; j < shape.size(); j++)
       if (j == concate_dim)
         shape[j] += srcshape[j];
@@ -45,23 +52,24 @@ void ConcateLayer::Setup(const LayerProto& proto, int npartitions) {
   grad_.Reshape(shape);
 }
 
-void ConcateLayer::ComputeFeature(int flag, Metric *perf) {
+void ConcateLayer::ComputeFeature(int flag, const vector<Layer*>& srclayers) {
   LOG(FATAL) << "Not implemented for Concate Layer";
 }
 
-void ConcateLayer::ComputeGradient(int flag, Metric* perf) {
+void ConcateLayer::ComputeGradient(int flag, const vector<Layer*>& srclayers) {
   LOG(FATAL) << "Not implemented for Concate Layer";
 }
 
 /************* Implementation for SliceLayer****************/
-void SliceLayer::Setup(const LayerProto& proto, int npartitions) {
+void SliceLayer::Setup(const LayerProto& conf,
+    const vector<Layer*>& srclayers) {
   /*
-  Layer::Setup(proto, npartitions);
-  slice_dim_ = proto.slice_conf().slice_dim();
+  Layer::Setup(conf, npartitions);
+  slice_dim_ = conf.slice_conf().slice_dim();
   slice_num_ = npartitions;
   CHECK_GE(slice_dim_, 0);
   CHECK_EQ(slice_num_, dstlayers_.size());
-  data_.Reshape(srclayers_[0]->data(this).shape());
+  data_.Reshape(srclayers[0]->data(this).shape());
   grad_.ReshapeLike(data_);
   datavec_.resize(slice_num_);
   gradvec_.resize(slice_num_);
@@ -79,11 +87,11 @@ void SliceLayer::Setup(const LayerProto& proto, int npartitions) {
   LOG(FATAL) << "Not implemented";
 }
 
-void SliceLayer::ComputeFeature(int flag, Metric *perf) {
+void SliceLayer::ComputeFeature(int flag, const vector<Layer*>& srclayers) {
   /*
-  CHECK_EQ(srclayers_.size(), 1);
+  CHECK_EQ(srclayers.size(), 1);
   if (slice_dim_ == 0) {
-    const auto& blob = srclayers_.at(0)->data(this);
+    const auto& blob = srclayers.at(0)->data(this);
     int size = blob.count() / slice_num_;
     for (int i = 0; i < slice_num_; i++) {
       float* dst = datavec_[i].mutable_cpu_data();
@@ -95,7 +103,7 @@ void SliceLayer::ComputeFeature(int flag, Metric *perf) {
   LOG(FATAL) << "Not implemented";
 }
 
-void SliceLayer::ComputeGradient(int flag, Metric* perf) {
+void SliceLayer::ComputeGradient(int flag, const vector<Layer*>& srclayers) {
   LOG(FATAL) << "Not implemented";
 }
 
@@ -112,19 +120,19 @@ int SliceLayer::SliceID(const Layer* layer) const {
 }*/
 
 /************* Implementation for SplitLayer****************/
-void SplitLayer::Setup(const LayerProto& proto, int npartitions) {
-  // CHECK_EQ(npartitions, 1);
-  Layer::Setup(proto, npartitions);
-  CHECK_EQ(srclayers_.size(), 1);
-  data_.Reshape(srclayers_[0]->data(this).shape());
-  grad_.Reshape(srclayers_[0]->data(this).shape());
+void SplitLayer::Setup(const LayerProto& conf,
+    const vector<Layer*>& srclayers) {
+  Layer::Setup(conf, srclayers);
+  CHECK_EQ(srclayers.size(), 1);
+  data_.Reshape(srclayers[0]->data(this).shape());
+  grad_.Reshape(srclayers[0]->data(this).shape());
 }
 
-void SplitLayer::ComputeFeature(int flag, Metric *perf) {
+void SplitLayer::ComputeFeature(int flag, const vector<Layer*>& srclayers) {
   LOG(FATAL) << "Not implemented";
 }
 
-void SplitLayer::ComputeGradient(int flag, Metric* perf) {
+void SplitLayer::ComputeGradient(int flag, const vector<Layer*>& srclayers) {
   LOG(FATAL) << "Not implemented";
 }
 }  // namespace singa

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/321ef96a/src/neuralnet/input_layer.cc
----------------------------------------------------------------------
diff --git a/src/neuralnet/input_layer.cc b/src/neuralnet/input_layer.cc
index a608ba4..f89369c 100644
--- a/src/neuralnet/input_layer.cc
+++ b/src/neuralnet/input_layer.cc
@@ -7,9 +7,9 @@
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
-* 
+*
 *   http://www.apache.org/licenses/LICENSE-2.0
-* 
+*
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
@@ -34,9 +34,9 @@ using std::string;
 using std::vector;
 
 /************* Implementation for ParserLayer ***********/
-void ParserLayer::ComputeFeature(int flag, Metric *perf) {
-  CHECK_EQ(srclayers_.size(), 1);
-  auto datalayer = dynamic_cast<DataLayer*>(*srclayers_.begin());
+void ParserLayer::ComputeFeature(int flag, const vector<Layer*>& srclayers) {
+  CHECK_EQ(srclayers.size(), 1);
+  auto datalayer = dynamic_cast<DataLayer*>(*srclayers.begin());
   ParseRecords(flag, datalayer->records(), &data_);
 }
 
@@ -48,8 +48,9 @@ LMDBDataLayer::~LMDBDataLayer() {
   mdb_cursor_ = nullptr;
 }
 
-void LMDBDataLayer::Setup(const LayerProto& proto, int npartitions) {
-  Layer::Setup(proto, npartitions);
+void LMDBDataLayer::Setup(const LayerProto& proto,
+    const vector<Layer*>& srclayers) {
+  Layer::Setup(proto, srclayers);
   OpenLMDB(proto.lmdbdata_conf().path());
   CHECK_EQ(mdb_cursor_get(mdb_cursor_, &mdb_key_, &mdb_value_, MDB_NEXT),
            MDB_SUCCESS);
@@ -62,7 +63,7 @@ void LMDBDataLayer::Setup(const LayerProto& proto, int npartitions) {
   ConvertCaffeDatumToRecord(datum, record);
   batchsize_ = proto.lmdbdata_conf().batchsize();
   if (partition_dim() == 0)
-    batchsize_ /= npartitions;
+    batchsize_ /= proto.num_partitions();
   records_.resize(batchsize_);
   random_skip_ = proto.lmdbdata_conf().random_skip();
 }
@@ -83,9 +84,9 @@ void LMDBDataLayer::OpenLMDB(const std::string& path) {
            MDB_SUCCESS) << "mdb_cursor_get failed";
 }
 
-void LMDBDataLayer::ComputeFeature(int flag, Metric* perf) {
+void LMDBDataLayer::ComputeFeature(int flag, const vector<Layer*>& srclayers) {
   if (mdb_cursor_ == nullptr)
-    OpenLMDB(layer_proto_.lmdbdata_conf().path());
+    OpenLMDB(layer_conf_.lmdbdata_conf().path());
   if (random_skip_) {
     int nskip = rand() % random_skip_;
     int n = 0;
@@ -155,8 +156,9 @@ ShardDataLayer::~ShardDataLayer() {
   shard_ = nullptr;
 }
 
-void ShardDataLayer::Setup(const LayerProto& proto, int npartitions) {
-  Layer::Setup(proto, npartitions);
+void ShardDataLayer::Setup(const LayerProto& proto,
+    const vector<Layer*>& srclayers) {
+  Layer::Setup(proto, srclayers);
   shard_ = new DataShard(proto.sharddata_conf().path(), DataShard::kRead);
   string key;
   shard_->Next(&key, &sample_);
@@ -164,14 +166,14 @@ void ShardDataLayer::Setup(const LayerProto& proto, int npartitions) {
   shard_ = nullptr;
   batchsize_ = proto.sharddata_conf().batchsize();
   if (partition_dim() == 0)
-    batchsize_ /= npartitions;
+    batchsize_ /= proto.num_partitions();
   records_.resize(batchsize_);
   random_skip_ = proto.sharddata_conf().random_skip();
 }
 
-void ShardDataLayer::ComputeFeature(int flag, Metric* perf) {
+void ShardDataLayer::ComputeFeature(int flag, const vector<Layer*>& srclayers) {
   if (shard_ == nullptr)
-    shard_ = new DataShard(layer_proto_.sharddata_conf().path(),
+    shard_ = new DataShard(layer_conf_.sharddata_conf().path(),
                            DataShard::kRead);
   if (random_skip_) {
     int nskip = rand() % random_skip_;
@@ -193,15 +195,16 @@ void ShardDataLayer::ComputeFeature(int flag, Metric* perf) {
 }
 
 /********* Implementation for LabelLayer **************/
-void LabelLayer::Setup(const LayerProto& proto, int npartitions) {
-  Layer::Setup(proto, npartitions);
-  CHECK_EQ(srclayers_.size(), 1);
-  int batchsize = dynamic_cast<DataLayer*>(srclayers_[0])->batchsize();
+void LabelLayer::Setup(const LayerProto& proto,
+    const vector<Layer*>& srclayers) {
+  Layer::Setup(proto, srclayers);
+  CHECK_EQ(srclayers.size(), 1);
+  int batchsize = dynamic_cast<DataLayer*>(srclayers[0])->batchsize();
   data_.Reshape(vector<int>{batchsize});
 }
 
 void LabelLayer::ParseRecords(int flag, const vector<Record>& records,
-                              Blob<float>* blob) {
+    Blob<float>* blob) {
   int rid = 0;
   float *label = blob->mutable_cpu_data();
   for (const Record& record : records) {
@@ -212,8 +215,8 @@ void LabelLayer::ParseRecords(int flag, const vector<Record>& records,
 }
 
 /**************** Implementation for MnistLayer ******************/
-void MnistLayer::ParseRecords(int flag,
-    const vector<Record>& records, Blob<float>* blob) {
+void MnistLayer::ParseRecords(int flag, const vector<Record>& records,
+    Blob<float>* blob) {
   LOG_IF(ERROR, records.size() == 0) << "Empty records to parse";
   int ndim = records.at(0).image().shape_size();
   int inputsize = records.at(0).image().shape(ndim-1);
@@ -246,11 +249,12 @@ void MnistLayer::ParseRecords(int flag,
   CHECK_EQ(dptr, blob->mutable_cpu_data() + blob->count());
 }
 
-void MnistLayer::Setup(const LayerProto& proto, int npartitions) {
-  Layer::Setup(proto, npartitions);
-  CHECK_EQ(srclayers_.size(), 1);
-  int batchsize = dynamic_cast<DataLayer*>(srclayers_[0])->batchsize();
-  Record sample = dynamic_cast<DataLayer*>(srclayers_[0])->sample();
+void MnistLayer::Setup(const LayerProto& proto,
+    const vector<Layer*>& srclayers) {
+  Layer::Setup(proto, srclayers);
+  CHECK_EQ(srclayers.size(), 1);
+  int batchsize = dynamic_cast<DataLayer*>(srclayers[0])->batchsize();
+  Record sample = dynamic_cast<DataLayer*>(srclayers[0])->sample();
   norm_a_ = proto.mnist_conf().norm_a();
   norm_b_ = proto.mnist_conf().norm_b();
   int ndim = sample.image().shape_size();
@@ -261,8 +265,8 @@ void MnistLayer::Setup(const LayerProto& proto, int npartitions) {
 }
 
 /*************** Implementation for RGBImageLayer *************************/
-void RGBImageLayer::ParseRecords(int flag,
-    const vector<Record>& records, Blob<float>* blob) {
+void RGBImageLayer::ParseRecords(int flag, const vector<Record>& records,
+    Blob<float>* blob) {
   const vector<int>& s = blob->shape();
   Tensor<cpu, 4> images(data_.mutable_cpu_data(),
       Shape4(s[0], s[1], s[2], s[3]));
@@ -315,14 +319,15 @@ void RGBImageLayer::ParseRecords(int flag,
     FreeSpace(croped_image);
 }
 
-void RGBImageLayer::Setup(const LayerProto& proto, int npartitions) {
-  ParserLayer::Setup(proto, npartitions);
-  CHECK_EQ(srclayers_.size(), 1);
+void RGBImageLayer::Setup(const LayerProto& proto,
+    const vector<Layer*>& srclayers) {
+  ParserLayer::Setup(proto, srclayers);
+  CHECK_EQ(srclayers.size(), 1);
   scale_ = proto.rgbimage_conf().scale();
   cropsize_ = proto.rgbimage_conf().cropsize();
   mirror_ = proto.rgbimage_conf().mirror();
-  int batchsize = dynamic_cast<DataLayer*>(srclayers_[0])->batchsize();
-  Record sample = dynamic_cast<DataLayer*>(srclayers_[0])->sample();
+  int batchsize = dynamic_cast<DataLayer*>(srclayers[0])->batchsize();
+  Record sample = dynamic_cast<DataLayer*>(srclayers[0])->sample();
   vector<int> shape;
   shape.push_back(batchsize);
   for (int x : sample.image().shape()) {
@@ -361,7 +366,7 @@ PrefetchLayer::~PrefetchLayer() {
 }
 
 
-void PrefetchLayer::ComputeFeature(int flag, Metric* perf) {
+void PrefetchLayer::ComputeFeature(int flag, const vector<Layer*>& srclayers) {
   LOG(FATAL) << "Not implemented";
 }
 

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/321ef96a/src/neuralnet/layer.cc
----------------------------------------------------------------------
diff --git a/src/neuralnet/layer.cc b/src/neuralnet/layer.cc
index d818533..e229045 100644
--- a/src/neuralnet/layer.cc
+++ b/src/neuralnet/layer.cc
@@ -7,9 +7,9 @@
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
-* 
+*
 *   http://www.apache.org/licenses/LICENSE-2.0
-* 
+*
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
@@ -42,7 +42,9 @@ Layer* Layer::Create(const LayerProto& proto) {
   return layer;
 }
 
-const string Layer::DebugString(int step, int flag) {
+const std::string Layer::ToString(bool debug, int flag) {
+  if (!debug)
+    return "";
   string ret = StringPrintf("Layer %10s ", name().c_str());
   if ((flag & kForward) == kForward && data_.count() !=0) {
     ret += StringPrintf("data norm1 %13.9f", data_.asum_data());
@@ -60,4 +62,15 @@ const string Layer::DebugString(int step, int flag) {
   }
   return ret;
 }
+
+const std::string LossLayer::ToString(bool debug, int flag) {
+  std::string disp;
+  if (debug) {
+    disp = Layer::ToString(debug, flag);
+  } else {
+    disp = metric_.ToLogString();
+    metric_.Reset();
+  }
+  return disp;
+}
 }  // namespace singa

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/321ef96a/src/neuralnet/loss_layer.cc
----------------------------------------------------------------------
diff --git a/src/neuralnet/loss_layer.cc b/src/neuralnet/loss_layer.cc
index d8fd92b..b5447f6 100644
--- a/src/neuralnet/loss_layer.cc
+++ b/src/neuralnet/loss_layer.cc
@@ -7,9 +7,9 @@
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
-* 
+*
 *   http://www.apache.org/licenses/LICENSE-2.0
-* 
+*
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
@@ -40,50 +40,59 @@ using std::string;
 using std::vector;
 
 /********** * Implementation for EuclideanLossLayer*************************/
-void EuclideanLossLayer::ComputeFeature(int flag, Metric* perf) {
-  int count = srclayers_[0]->data(this).count();
-  CHECK_EQ(count, srclayers_[1]->data(this).count());
-  const float* reconstruct_dptr = srclayers_[0]->data(this).cpu_data();
-  const float* input_dptr = srclayers_[1]->data(this).cpu_data();
+void EuclideanLossLayer::Setup(const LayerProto& conf,
+    const vector<Layer*>& srclayers) {
+  CHECK_EQ(srclayers.size(), 2);
+  Layer::Setup(conf, srclayers);
+}
+
+void EuclideanLossLayer::ComputeFeature(int flag,
+    const vector<Layer*>& srclayers) {
+  int count = srclayers[0]->data(this).count();
+  CHECK_EQ(count, srclayers[1]->data(this).count());
+  const float* reconstruct_dptr = srclayers[0]->data(this).cpu_data();
+  const float* input_dptr = srclayers[1]->data(this).cpu_data();
   float loss = 0;
   for (int i = 0; i < count; i++) {
       loss += (input_dptr[i] - reconstruct_dptr[i]) *
         (input_dptr[i] - reconstruct_dptr[i]);
   }
-  perf->Add("loss", loss / srclayers_[0]->data(this).shape()[0]);
+  metric_.Add("loss", loss / srclayers[0]->data(this).shape()[0]);
 }
-void EuclideanLossLayer::ComputeGradient(int flag, Metric* perf) {
-  int count = srclayers_[0]->data(this).count();
-  CHECK_EQ(count, srclayers_[1]->data(this).count());
-  const float* reconstruct_dptr = srclayers_[0]->data(this).cpu_data();
-  const float* input_dptr = srclayers_[1]->data(this).cpu_data();
-  Blob<float>* gsrcblob = srclayers_[0]->mutable_grad(this);
+
+void EuclideanLossLayer::ComputeGradient(int flag,
+    const vector<Layer*>& srclayers) {
+  int count = srclayers[0]->data(this).count();
+  CHECK_EQ(count, srclayers[1]->data(this).count());
+  const float* reconstruct_dptr = srclayers[0]->data(this).cpu_data();
+  const float* input_dptr = srclayers[1]->data(this).cpu_data();
+  Blob<float>* gsrcblob = srclayers[0]->mutable_grad(this);
   float* gsrcptr = gsrcblob->mutable_cpu_data();
   for (int i = 0; i < count; i++) {
     gsrcptr[i] = reconstruct_dptr[i]-input_dptr[i];
   }
   Tensor<cpu, 1> gsrc(gsrcptr, Shape1(gsrcblob->count()));
-  gsrc /= srclayers_[0]->data(this).shape()[0];
+  gsrc /= srclayers[0]->data(this).shape()[0];
 }
 
-
 /********** * Implementation for SoftmaxLossLayer*************************/
-void SoftmaxLossLayer::Setup(const LayerProto& proto, int npartitions) {
-  LossLayer::Setup(proto, npartitions);
-  CHECK_EQ(srclayers_.size(), 2);
-  data_.Reshape(srclayers_[0]->data(this).shape());
+void SoftmaxLossLayer::Setup(const LayerProto& proto,
+    const vector<Layer*>& srclayers) {
+  CHECK_EQ(srclayers.size(), 2);
+  LossLayer::Setup(proto, srclayers);
+  data_.Reshape(srclayers[0]->data(this).shape());
   batchsize_ = data_.shape()[0];
   dim_ = data_.count() / batchsize_;
   topk_ = proto.softmaxloss_conf().topk();
-  metric_.Reshape(vector<int>{2});
   scale_ = proto.softmaxloss_conf().scale();
 }
-void SoftmaxLossLayer::ComputeFeature(int flag, Metric* perf) {
+void SoftmaxLossLayer::ComputeFeature(int flag,
+    const vector<Layer*>& srclayers) {
   Shape<2> s = Shape2(batchsize_, dim_);
   Tensor<cpu, 2> prob(data_.mutable_cpu_data(), s);
-  Tensor<cpu, 2> src(srclayers_[0]->mutable_data(this)->mutable_cpu_data(), s);
+  Tensor<cpu, 2> src(srclayers[0]->mutable_data(this)->mutable_cpu_data(), s);
   Softmax(prob, src);
-  const float* label = srclayers_[1]->data(this).cpu_data();
+  const float* label = srclayers[1]->data(this).cpu_data();
   const float* probptr = prob.dptr;
   float loss = 0, precision = 0;
   for (int n = 0; n < batchsize_; n++) {
@@ -108,13 +117,14 @@ void SoftmaxLossLayer::ComputeFeature(int flag, Metric* perf) {
     probptr += dim_;
   }
   CHECK_EQ(probptr, prob.dptr + prob.shape.Size());
-  perf->Add("loss", loss * scale_ / (1.0f * batchsize_));
-  perf->Add("accuracy", precision * scale_ / (1.0f * batchsize_));
+  metric_.Add("loss", loss * scale_ / (1.0f * batchsize_));
+  metric_.Add("accuracy", precision * scale_ / (1.0f * batchsize_));
 }
 
-void SoftmaxLossLayer::ComputeGradient(int flag, Metric* perf) {
-  const float* label = srclayers_[1]->data(this).cpu_data();
-  Blob<float>* gsrcblob = srclayers_[0]->mutable_grad(this);
+void SoftmaxLossLayer::ComputeGradient(int flag,
+    const vector<Layer*>& srclayers) {
+  const float* label = srclayers[1]->data(this).cpu_data();
+  Blob<float>* gsrcblob = srclayers[0]->mutable_grad(this);
   gsrcblob->CopyFrom(data_);
   float* gsrcptr = gsrcblob->mutable_cpu_data();
   for (int n = 0; n < batchsize_; n++) {

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/321ef96a/src/neuralnet/neuralnet.cc
----------------------------------------------------------------------
diff --git a/src/neuralnet/neuralnet.cc b/src/neuralnet/neuralnet.cc
index 775a5a7..ec23c23 100644
--- a/src/neuralnet/neuralnet.cc
+++ b/src/neuralnet/neuralnet.cc
@@ -7,9 +7,9 @@
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
-* 
+*
 *   http://www.apache.org/licenses/LICENSE-2.0
-* 
+*
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
@@ -94,6 +94,7 @@ NeuralNet::~NeuralNet() {
     delete layer;
 }
 
+/*
 std::string NeuralNet::ToAdjacency() {
   string disp = "";
   for (auto& layer : layers_) {
@@ -104,6 +105,7 @@ std::string NeuralNet::ToAdjacency() {
   }
   return disp;
 }
+*/
 
 void NeuralNet::ShareParamsFrom(NeuralNet* other) {
   for (auto& layer : layers_) {
@@ -215,6 +217,7 @@ Graph* NeuralNet::CreateGraph(const NetProto& netproto, int npartitions) {
         // differentiate partitions
         string nodename = layer.name() + "@" + string(suffix);
         proto->set_partition_id(i);
+        proto->set_num_partitions(npartitions);
         proto->set_name(nodename);
         auto node = new Node(nodename, layer.name(), i, proto);
         graph->AddNode(node);
@@ -321,21 +324,19 @@ void NeuralNet::CreateNetFromGraph(Graph* graph, int npartitions) {
   }
   // connect layers
   for (Node* node : graph->nodes()) {
-    auto layer = name2layer_[node->name];
-    layer->clear_dstlayers();
-    for (Node* dst : node->dstnodes)
-      layer->add_dstlayer(name2layer_[dst->name]);
-    layer->clear_srclayers();
+    auto layer = name2layer(node->name);
+    src_map_[layer] = vector<Layer*>{};
     for (Node* src : node->srcnodes)
-      layer->add_srclayer(name2layer_[src->name]);
+      src_map_[layer].push_back(name2layer(src->name));
   }
+
   // setup layers
   int paramid = 0;
   map<string, string> layerinfo;
   map<string, vector<Layer*>> share_param_layers;
   for (Node* node : graph->nodes()) {
-    auto layer = name2layer_[node->name];
-    layer->Setup(*(static_cast<LayerProto*>(node->proto)), npartitions);
+    auto layer = name2layer(node->name);
+    layer->Setup(*(static_cast<LayerProto*>(node->proto)), srclayers(layer));
     LOG(INFO) << "constructing graph: " << layer->name();
     layerinfo[layer->name()] = IntVecToString(layer->data(nullptr).shape());
     string param_name = "$";

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/321ef96a/src/neuralnet/neuron_layer.cc
----------------------------------------------------------------------
diff --git a/src/neuralnet/neuron_layer.cc b/src/neuralnet/neuron_layer.cc
index 9e7831a..4e3acf0 100644
--- a/src/neuralnet/neuron_layer.cc
+++ b/src/neuralnet/neuron_layer.cc
@@ -7,9 +7,9 @@
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
-* 
+*
 *   http://www.apache.org/licenses/LICENSE-2.0
-* 
+*
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
@@ -73,17 +73,19 @@ ConvolutionLayer::~ConvolutionLayer() {
   delete weight_;
   delete bias_;
 }
-void ConvolutionLayer::Setup(const LayerProto& proto, int npartitions) {
-  Layer::Setup(proto, npartitions);
-  ConvolutionProto conv_conf = proto.convolution_conf();
+void ConvolutionLayer::Setup(const LayerProto& conf,
+    const vector<Layer*>& srclayers) {
+  CHECK_EQ(srclayers.size(), 1);
+  Layer::Setup(conf, srclayers);
+  ConvolutionProto conv_conf = conf.convolution_conf();
   kernel_ = conv_conf.kernel();
   CHECK_GT(kernel_, 0) << "Filter size cannot be zero.";
   pad_ = conv_conf.pad();
   stride_ = conv_conf.stride();
   num_filters_ = conv_conf.num_filters();
   if (partition_dim() > 0)
-    num_filters_ /= npartitions;
-  const vector<int>& srcshape = srclayers_[0]->data(this).shape();
+    num_filters_ /= srclayers.at(0)->num_partitions();
+  const vector<int>& srcshape = srclayers[0]->data(this).shape();
   int dim = srcshape.size();
   CHECK_GT(dim, 2);
   width_ = srcshape[dim - 1];
@@ -102,14 +104,15 @@ void ConvolutionLayer::Setup(const LayerProto& proto, int npartitions) {
   grad_.Reshape(shape);
   col_data_.Reshape(vector<int>{col_height_, col_width_});
   col_grad_.Reshape(vector<int>{col_height_, col_width_});
-  weight_ = Param::Create(proto.param(0));
-  bias_ = Param::Create(proto.param(1));
+  weight_ = Param::Create(conf.param(0));
+  bias_ = Param::Create(conf.param(1));
   weight_->Setup(vector<int>{num_filters_, col_height_});
   bias_->Setup(vector<int>{num_filters_});
 }
 
-void ConvolutionLayer::ComputeFeature(int flag, Metric* perf) {
-  auto src = Tensor4(srclayers_[0]->mutable_data(this));
+void ConvolutionLayer::ComputeFeature(int flag,
+    const vector<Layer*>& srclayers) {
+  auto src = Tensor4(srclayers[0]->mutable_data(this));
   auto data = Tensor3(&data_);
   auto col = Tensor2(&col_data_);
   auto weight = Tensor2(weight_->mutable_data());
@@ -124,15 +127,16 @@ void ConvolutionLayer::ComputeFeature(int flag, Metric* perf) {
   data += expr::broadcast<1>(bias, data.shape);
 }
 
-void ConvolutionLayer::ComputeGradient(int flag, Metric* perf) {
-  auto src = Tensor4(srclayers_[0]->mutable_data(this));
+void ConvolutionLayer::ComputeGradient(int flag,
+    const vector<Layer*>& srclayers) {
+  auto src = Tensor4(srclayers[0]->mutable_data(this));
   auto col = Tensor2(&col_data_);
   auto weight = Tensor2(weight_->mutable_data());
   auto grad = Tensor3(&grad_);
   auto gcol = Tensor2(&col_grad_);
   auto gweight = Tensor2(weight_->mutable_grad());
   auto gbias = Tensor1(bias_->mutable_grad());
-  Blob<float>* gsrcblob = srclayers_[0]->mutable_grad(this);
+  Blob<float>* gsrcblob = srclayers[0]->mutable_grad(this);
   Tensor<cpu, 4> gsrc(nullptr, Shape4(batchsize_, channels_, height_, width_));
   if (gsrcblob != nullptr)
     gsrc.dptr = gsrcblob->mutable_cpu_data();
@@ -157,8 +161,9 @@ void ConvolutionLayer::ComputeGradient(int flag, Metric* perf) {
 }
 
 /******************* Implementation for CConvolutionLayer *********/
-void CConvolutionLayer::ComputeFeature(int flag, Metric* perf) {
-  auto src = Tensor4(srclayers_[0]->mutable_data(this));
+void CConvolutionLayer::ComputeFeature(int flag,
+    const vector<Layer*>& srclayers) {
+  auto src = Tensor4(srclayers[0]->mutable_data(this));
   auto data = Tensor3(&data_);
   auto col = Tensor2(&col_data_);
   auto weight = Tensor2(weight_->mutable_data());
@@ -172,8 +177,9 @@ void CConvolutionLayer::ComputeFeature(int flag, Metric* perf) {
   data += expr::broadcast<1>(bias, data.shape);
 }
 
-void CConvolutionLayer::ComputeGradient(int flag, Metric* perf) {
-  auto src = Tensor4(srclayers_[0]->mutable_data(this));
+void CConvolutionLayer::ComputeGradient(int flag,
+    const vector<Layer*>& srclayers) {
+  auto src = Tensor4(srclayers[0]->mutable_data(this));
   auto col = Tensor2(&col_data_);
   auto weight = Tensor2(weight_->mutable_data());
 
@@ -182,7 +188,7 @@ void CConvolutionLayer::ComputeGradient(int flag, Metric* perf) {
   auto gweight = Tensor2(weight_->mutable_grad());
   auto gbias = Tensor1(bias_->mutable_grad());
   gweight = 0.f;
-  Blob<float>* gsrcblob = srclayers_[0]->mutable_grad(this);
+  Blob<float>* gsrcblob = srclayers[0]->mutable_grad(this);
   Tensor<cpu, 4> gsrc(nullptr, Shape4(batchsize_, channels_, height_, width_));
   if (gsrcblob != nullptr)
     gsrc.dptr = gsrcblob->mutable_cpu_data();
@@ -200,18 +206,19 @@ void CConvolutionLayer::ComputeGradient(int flag, Metric* perf) {
 }
 
 /****************** Implementation for DropoutLayer ***********************/
-void DropoutLayer::Setup(const LayerProto& proto, int npartitions) {
-  Layer::Setup(proto, npartitions);
-  data_.ReshapeLike(srclayers_[0]->data(this));
-  grad_.ReshapeLike(*srclayers_[0]->mutable_grad(this));
-  mask_.Reshape(srclayers_[0]->data(this).shape());
-  pdrop_ = proto.dropout_conf().dropout_ratio();
+void DropoutLayer::Setup(const LayerProto& conf,
+    const vector<Layer*>& srclayers) {
+  Layer::Setup(conf, srclayers);
+  data_.ReshapeLike(srclayers[0]->data(this));
+  grad_.ReshapeLike(*srclayers[0]->mutable_grad(this));
+  mask_.Reshape(srclayers[0]->data(this).shape());
+  pdrop_ = conf.dropout_conf().dropout_ratio();
 }
 
-void DropoutLayer::ComputeFeature(int flag, Metric* perf) {
+void DropoutLayer::ComputeFeature(int flag, const vector<Layer*>& srclayers) {
   // check training
   if ((flag & kTrain) != kTrain) {
-    data_.CopyFrom(srclayers_[0]->data(this));
+    data_.CopyFrom(srclayers[0]->data(this));
     return;
   }
   float pkeep = 1 - pdrop_;
@@ -219,14 +226,14 @@ void DropoutLayer::ComputeFeature(int flag, Metric* perf) {
   mask = expr::F<op::threshold>(TSingleton<Random<cpu>>::Instance() \
                       ->uniform(mask.shape), pkeep) * (1.0f/pkeep);
   auto data = Tensor1(&data_);
-  auto src = Tensor1(srclayers_[0]->mutable_data(this));
+  auto src = Tensor1(srclayers[0]->mutable_data(this));
   data = src * mask;
 }
 
-void DropoutLayer::ComputeGradient(int flag, Metric* perf)  {
+void DropoutLayer::ComputeGradient(int flag, const vector<Layer*>& srclayers)  {
   auto mask = Tensor1(&mask_);
   auto grad = Tensor1(&grad_);
-  auto gsrc = Tensor1(srclayers_[0]->mutable_grad(this));
+  auto gsrc = Tensor1(srclayers[0]->mutable_grad(this));
   gsrc = grad * mask;
 }
 
@@ -251,11 +258,10 @@ Blob<float>* RBMLayer::Sample(int flag) {
   return (flag & kPositive) == kPositive || first_gibbs_ ?
     &sample_ : &neg_sample_;
 }
-void RBMLayer::Setup(const LayerProto& proto, int npartitions) {
-  CHECK_EQ(npartitions, 1);  // TODO(wangwei) test for npartitions > 1
-  Layer::Setup(proto, npartitions);
-  hdim_ = proto.rbm_conf().hdim();
-  gaussian_ = proto.rbm_conf().gaussian();
+void RBMLayer::Setup(const LayerProto& conf, const vector<Layer*>& srclayers) {
+  Layer::Setup(conf, srclayers);
+  hdim_ = conf.rbm_conf().hdim();
+  gaussian_ = conf.rbm_conf().gaussian();
   first_gibbs_ = true;
 }
 /**************** Implementation for RBMVisLayer********************/
@@ -264,32 +270,33 @@ RBMVisLayer::~RBMVisLayer() {
   delete bias_;
 }
 
-void RBMVisLayer::Setup(const LayerProto& proto, int npartitions) {
-  RBMLayer::Setup(proto, npartitions);
-  CHECK_EQ(srclayers_.size(), 2);
+void RBMVisLayer::Setup(const LayerProto& conf,
+    const vector<Layer*>& srclayers) {
+  CHECK_EQ(srclayers.size(), 2);
+  RBMLayer::Setup(conf, srclayers);
+  CHECK_EQ(srclayers.size(), 2);
   hid_layer_ = nullptr;
-  for (auto src : srclayers_) {
-    for (auto dst : src->srclayers()) {
-      if (dst->name() == name()) {
-        CHECK(hid_layer_ == nullptr);
-        hid_layer_ = static_cast<RBMHidLayer*>(src);
-      }
+  for (auto src : srclayers) {
+    if (typeid(*src) == typeid(RBMHidLayer)) {
+      // note the hid layer may not have been set up yet.
+      CHECK(hid_layer_ == nullptr);
+      hid_layer_ = dynamic_cast<RBMHidLayer*>(src);
     }
   }
-  input_layer_ = srclayers_[0] != hid_layer_ ? srclayers_[0]: srclayers_[1];
+  input_layer_ = srclayers[0] != hid_layer_ ? srclayers[0]: srclayers[1];
   const auto& src = input_layer_->data(this);
   batchsize_ = src.shape()[0];
   data_.ReshapeLike(src);
   neg_data_.ReshapeLike(data_);
   neg_sample_.ReshapeLike(data_);
   vdim_ = src.count() / batchsize_;
-  weight_ = Param::Create(proto.param(0));
+  weight_ = Param::Create(conf.param(0));
   weight_ ->Setup(vector<int>{hdim_, vdim_});
-  bias_ = Param::Create(proto.param(1));
+  bias_ = Param::Create(conf.param(1));
   bias_->Setup(vector<int>{vdim_});
 }
 
-void RBMVisLayer::ComputeFeature(int flag, Metric* perf) {
+void RBMVisLayer::ComputeFeature(int flag, const vector<Layer*>& srclayers) {
   if ((flag & kPositive) == kPositive) {
     data_.CopyFrom(input_layer_->data(this), true);
     first_gibbs_ = true;
@@ -308,13 +315,13 @@ void RBMVisLayer::ComputeFeature(int flag, Metric* perf) {
       for (int i = 0; i < data_.count(); i++) {
         err += (dptr[i] - rcns[i]) * (dptr[i] - rcns[i]);
       }
-      perf->Add("Squared Error", err / batchsize_);
+      metric_.Add("Squared Error", err / batchsize_);
     }
     first_gibbs_ = false;
   }
 }
 
-void RBMVisLayer::ComputeGradient(int flag, Metric* perf) {
+void RBMVisLayer::ComputeGradient(int flag, const vector<Layer*>& srclayers) {
   auto vis_pos = Tensor2(&data_);
   auto vis_neg = Tensor2(&neg_data_);
   auto hid_pos = Tensor2(hid_layer_->mutable_data(this));
@@ -336,25 +343,25 @@ RBMHidLayer::~RBMHidLayer() {
   delete bias_;
 }
 
-void RBMHidLayer::Setup(const LayerProto& proto,
-      int npartitions) {
-  RBMLayer::Setup(proto, npartitions);
-  CHECK_EQ(srclayers_.size(), 1);
-  const auto& src_data = srclayers_[0]->data(this);
+void RBMHidLayer::Setup(const LayerProto& conf,
+      const vector<Layer*>& srclayers) {
+  RBMLayer::Setup(conf, srclayers);
+  CHECK_EQ(srclayers.size(), 1);
+  const auto& src_data = srclayers[0]->data(this);
   batchsize_ = src_data.shape()[0];
   vdim_ = src_data.count() / batchsize_;
   data_.Reshape(vector<int>{batchsize_, hdim_});
   neg_data_.ReshapeLike(data_);
   sample_.ReshapeLike(data_);
   neg_sample_.ReshapeLike(data_);
-  weight_ = Param::Create(proto.param(0));
+  weight_ = Param::Create(conf.param(0));
   weight_->Setup(vector<int>{hdim_, vdim_});
-  bias_ = Param::Create(proto.param(1));
+  bias_ = Param::Create(conf.param(1));
   bias_->Setup(vector<int>{hdim_});
-  vis_layer_ = static_cast<RBMVisLayer*> (srclayers_[0]);
+  vis_layer_ = dynamic_cast<RBMVisLayer*> (srclayers[0]);
 }
 
-void RBMHidLayer::ComputeFeature(int flag, Metric* perf) {
+void RBMHidLayer::ComputeFeature(int flag, const vector<Layer*>& srclayers) {
   auto weight = Tensor2(weight_->mutable_data());
   auto bias = Tensor1(bias_->mutable_data());
 
@@ -376,7 +383,7 @@ void RBMHidLayer::ComputeFeature(int flag, Metric* perf) {
     data = expr::F<op::sigmoid>(data);
 }
 
-void RBMHidLayer::ComputeGradient(int flag, Metric* perf) {
+void RBMHidLayer::ComputeGradient(int flag, const vector<Layer*>& srclayers) {
   auto hid_pos = Tensor2(&data_);
   auto hid_neg = Tensor2(&neg_data_);
   auto gbias = Tensor1(bias_->mutable_grad());
@@ -390,20 +397,21 @@ InnerProductLayer::~InnerProductLayer() {
   delete bias_;
 }
 
-void InnerProductLayer::Setup(const LayerProto& proto, int npartitions) {
-  Layer::Setup(proto, npartitions);
-  CHECK_EQ(srclayers_.size(), 1);
-  const auto& src = srclayers_[0]->data(this);
+void InnerProductLayer::Setup(const LayerProto& conf,
+    const vector<Layer*>& srclayers) {
+  Layer::Setup(conf, srclayers);
+  CHECK_EQ(srclayers.size(), 1);
+  const auto& src = srclayers[0]->data(this);
   batchsize_ = src.shape()[0];
   vdim_ = src.count() / batchsize_;
-  hdim_ = layer_proto_.innerproduct_conf().num_output();
-  transpose_ = proto.innerproduct_conf().transpose();
+  hdim_ = layer_conf_.innerproduct_conf().num_output();
+  transpose_ = conf.innerproduct_conf().transpose();
   if (partition_dim() > 0)
-    hdim_ /= npartitions;
+    hdim_ /= srclayers.at(0)->num_partitions();
   data_.Reshape(vector<int>{batchsize_, hdim_});
   grad_.ReshapeLike(data_);
-  weight_ = Param::Create(proto.param(0));
-  bias_ = Param::Create(proto.param(1));
+  weight_ = Param::Create(conf.param(0));
+  bias_ = Param::Create(conf.param(1));
   if (transpose_)
     weight_->Setup(vector<int>{vdim_, hdim_});
   else
@@ -411,9 +419,10 @@ void InnerProductLayer::Setup(const LayerProto& proto, int npartitions) {
   bias_->Setup(vector<int>{hdim_});
 }
 
-void InnerProductLayer::ComputeFeature(int flag, Metric* perf) {
+void InnerProductLayer::ComputeFeature(int flag,
+    const vector<Layer*>& srclayers) {
   auto data = Tensor2(&data_);
-  auto src = Tensor2(srclayers_[0]->mutable_data(this));
+  auto src = Tensor2(srclayers[0]->mutable_data(this));
   auto weight = Tensor2(weight_->mutable_data());
   auto bias = Tensor1(bias_->mutable_data());
   if (transpose_)
@@ -424,8 +433,9 @@ void InnerProductLayer::ComputeFeature(int flag, Metric* perf) {
   data += expr::repmat(bias, batchsize_);
 }
 
-void InnerProductLayer::ComputeGradient(int flag, Metric* perf) {
-  auto src = Tensor2(srclayers_[0]->mutable_data(this));
+void InnerProductLayer::ComputeGradient(int flag,
+    const vector<Layer*>& srclayers) {
+  auto src = Tensor2(srclayers[0]->mutable_data(this));
   auto grad = Tensor2(&grad_);
   auto weight = Tensor2(weight_->mutable_data());
   auto gweight = Tensor2(weight_->mutable_grad());
@@ -436,8 +446,8 @@ void InnerProductLayer::ComputeGradient(int flag, Metric* perf) {
     gweight = dot(src.T(), grad);
   else
     gweight = dot(grad.T(), src);
-  if (srclayers_[0]->mutable_grad(this) != nullptr) {
-    auto gsrc = Tensor2(srclayers_[0]->mutable_grad(this));
+  if (srclayers[0]->mutable_grad(this) != nullptr) {
+    auto gsrc = Tensor2(srclayers[0]->mutable_grad(this));
     if (transpose_)
       gsrc = dot(grad, weight.T());
     else
@@ -445,15 +455,15 @@ void InnerProductLayer::ComputeGradient(int flag, Metric* perf) {
   }
 }
 /***************** Implementation for LRNLayer *************************/
-void LRNLayer::Setup(const LayerProto& proto, int npartitions) {
-  Layer::Setup(proto, npartitions);
-  CHECK_EQ(srclayers_.size(), 1);
-  lsize_ = proto.lrn_conf().local_size();
+void LRNLayer::Setup(const LayerProto& conf, const vector<Layer*>& srclayers) {
+  Layer::Setup(conf, srclayers);
+  CHECK_EQ(srclayers.size(), 1);
+  lsize_ = conf.lrn_conf().local_size();
   CHECK_EQ(lsize_ % 2, 1) << "LRN only supports odd values for Localvol";
-  knorm_ = proto.lrn_conf().knorm();
-  alpha_ = proto.lrn_conf().alpha();
-  beta_ = proto.lrn_conf().beta();
-  const vector<int>& s = srclayers_[0]->data(this).shape();
+  knorm_ = conf.lrn_conf().knorm();
+  alpha_ = conf.lrn_conf().alpha();
+  beta_ = conf.lrn_conf().beta();
+  const vector<int>& s = srclayers[0]->data(this).shape();
   data_.Reshape(s);
   grad_.Reshape(s);
   norm_.Reshape(s);
@@ -463,9 +473,9 @@ void LRNLayer::Setup(const LayerProto& proto, int npartitions) {
   width_ = s[3];
 }
 
-void LRNLayer::ComputeFeature(int flag, Metric* perf) {
+void LRNLayer::ComputeFeature(int flag, const vector<Layer*>& srclayers) {
   const float salpha = alpha_ / lsize_;
-  auto src = Tensor4(srclayers_[0]->mutable_data(this));
+  auto src = Tensor4(srclayers[0]->mutable_data(this));
   auto data = Tensor4(&data_);
   auto norm = Tensor4(&norm_);
   // stores normalizer without power
@@ -474,12 +484,12 @@ void LRNLayer::ComputeFeature(int flag, Metric* perf) {
   data = src * expr::F<op::power>(norm, -beta_);
 }
 
-void LRNLayer::ComputeGradient(int flag, Metric* perf) {
+void LRNLayer::ComputeGradient(int flag, const vector<Layer*>& srclayers) {
   const float salpha = alpha_ / lsize_;
-  auto src = Tensor4(srclayers_[0]->mutable_data(this));
+  auto src = Tensor4(srclayers[0]->mutable_data(this));
   auto norm = Tensor4(&norm_);
   auto grad = Tensor4(&grad_);
-  auto gsrc = Tensor4(srclayers_[0]->mutable_grad(this));
+  auto gsrc = Tensor4(srclayers[0]->mutable_grad(this));
 
   gsrc = grad * expr::F<op::power>(norm, -beta_);
   gsrc += (- 2.0f * beta_ * salpha) * expr::chpool<red::sum>(
@@ -487,18 +497,19 @@ void LRNLayer::ComputeGradient(int flag, Metric* perf) {
 }
 
 /******************** Implementation for PoolingLayer******************/
-void PoolingLayer::Setup(const LayerProto& proto, int npartitions) {
-  Layer::Setup(proto, npartitions);
-  CHECK_EQ(srclayers_.size(), 1);
-  PoolingProto pool_conf = proto.pooling_conf();
+void PoolingLayer::Setup(const LayerProto& conf,
+    const vector<Layer*>& srclayers) {
+  Layer::Setup(conf, srclayers);
+  CHECK_EQ(srclayers.size(), 1);
+  PoolingProto pool_conf = conf.pooling_conf();
   kernel_ = pool_conf.kernel();
   stride_ = pool_conf.stride();
   CHECK_LT(pad_, kernel_);
-  pool_ = proto.pooling_conf().pool();
+  pool_ = conf.pooling_conf().pool();
   CHECK(pool_ == PoolingProto_PoolMethod_AVG
         || pool_ == PoolingProto_PoolMethod_MAX)
         << "Padding implemented only for average and max pooling.";
-  const auto& srcshape = srclayers_[0]->data(this).shape();
+  const auto& srcshape = srclayers[0]->data(this).shape();
   int dim = srcshape.size();
   CHECK_GT(dim, 2);
   width_ = srcshape[dim - 1];
@@ -515,8 +526,8 @@ void PoolingLayer::Setup(const LayerProto& proto, int npartitions) {
   grad_.ReshapeLike(data_);
 }
 
-void PoolingLayer::ComputeFeature(int flag, Metric* perf) {
-  auto src = Tensor4(srclayers_[0]->mutable_data(this));
+void PoolingLayer::ComputeFeature(int flag, const vector<Layer*>& srclayers) {
+  auto src = Tensor4(srclayers[0]->mutable_data(this));
   auto data = Tensor4(&data_);
   if (pool_ == PoolingProto_PoolMethod_MAX)
     data = expr::pool<red::maximum>(src, kernel_, stride_);
@@ -529,9 +540,9 @@ void PoolingLayer::ComputeFeature(int flag, Metric* perf) {
  * partition only on num/channel dim
  * assume grad and data have the same paritition
  */
-void PoolingLayer::ComputeGradient(int flag, Metric* perf) {
-  auto src = Tensor4(srclayers_[0]->mutable_data(this));
-  auto gsrc = Tensor4(srclayers_[0]->mutable_grad(this));
+void PoolingLayer::ComputeGradient(int flag, const vector<Layer*>& srclayers) {
+  auto src = Tensor4(srclayers[0]->mutable_data(this));
+  auto gsrc = Tensor4(srclayers[0]->mutable_grad(this));
   auto data = Tensor4(&data_);
   auto grad = Tensor4(&grad_);
   if (pool_ == PoolingProto_PoolMethod_MAX)
@@ -543,101 +554,99 @@ void PoolingLayer::ComputeGradient(int flag, Metric* perf) {
 
 /***************** Implementation of CPoolingLayer ***************/
 
-void CPoolingLayer::Setup(const LayerProto& proto, int npartitions) {
-  PoolingLayer::Setup(proto, npartitions);
+void CPoolingLayer::Setup(const LayerProto& conf,
+    const vector<Layer*>& srclayers) {
+  PoolingLayer::Setup(conf, srclayers);
   if (pool_ == PoolingProto_PoolMethod_MAX)
       mask_.ReshapeLike(data_);
 }
-void CPoolingLayer::ComputeFeature(int flag, Metric* perf) {
+void CPoolingLayer::ComputeFeature(int flag, const vector<Layer*>& srclayers) {
   if (pool_ == PoolingProto_PoolMethod_MAX)
-    ForwardMaxPooling(srclayers_[0]->mutable_data(this)->mutable_cpu_data(),
+    ForwardMaxPooling(srclayers[0]->mutable_data(this)->mutable_cpu_data(),
         batchsize_, channels_, height_, width_, kernel_, kernel_, pad_, pad_,
         stride_, stride_, data_.mutable_cpu_data(), mask_.mutable_cpu_data());
   else if (pool_ == PoolingProto_PoolMethod_AVG)
-    ForwardAvgPooling(srclayers_[0]->mutable_data(this)->mutable_cpu_data(),
+    ForwardAvgPooling(srclayers[0]->mutable_data(this)->mutable_cpu_data(),
         batchsize_, channels_, height_, width_, kernel_, kernel_, pad_, pad_,
         stride_, stride_, data_.mutable_cpu_data());
   else
     LOG(FATAL) << "unknow pooling method";
 }
 
-void CPoolingLayer::ComputeGradient(int flag, Metric* perf) {
+void CPoolingLayer::ComputeGradient(int flag, const vector<Layer*>& srclayers) {
   if (pool_ == PoolingProto_PoolMethod_MAX)
     BackwardMaxPooling(grad_.cpu_data(), mask_.cpu_data(), batchsize_,
         channels_, height_, width_, kernel_, kernel_, pad_, pad_,
-        stride_, stride_,srclayers_[0]->mutable_grad(this)->mutable_cpu_data());
+        stride_, stride_, srclayers[0]->mutable_grad(this)->mutable_cpu_data());
   else if (pool_ == PoolingProto_PoolMethod_AVG)
     BackwardAvgPooling(grad_.cpu_data(), batchsize_,
         channels_, height_, width_, kernel_, kernel_, pad_, pad_,
-        stride_, stride_,srclayers_[0]->mutable_grad(this)->mutable_cpu_data());
+        stride_, stride_, srclayers[0]->mutable_grad(this)->mutable_cpu_data());
   else
     LOG(FATAL) << "unknow pooling method";
 }
 
 /***************** Implementation for ReLULayer *****************************/
-void ReLULayer::Setup(const LayerProto& proto, int npartitions) {
-  Layer::Setup(proto, npartitions);
-  data_.ReshapeLike(srclayers_[0]->data(this));
-  grad_.ReshapeLike(*(srclayers_[0]->mutable_grad(this)));
+void ReLULayer::Setup(const LayerProto& conf,
+    const vector<Layer*>& srclayers) {
+  Layer::Setup(conf, srclayers);
+  data_.ReshapeLike(srclayers[0]->data(this));
+  grad_.ReshapeLike(*(srclayers[0]->mutable_grad(this)));
 }
 
-void ReLULayer::ComputeFeature(int flag, Metric* perf) {
+void ReLULayer::ComputeFeature(int flag, const vector<Layer*>& srclayers) {
   auto data = Tensor1(&data_);
-  auto src = Tensor1(srclayers_[0]->mutable_data(this));
+  auto src = Tensor1(srclayers[0]->mutable_data(this));
   data = expr::F<op::relu>(src);
 }
 
-void ReLULayer::ComputeGradient(int flag, Metric* perf) {
+void ReLULayer::ComputeGradient(int flag, const vector<Layer*>& srclayers) {
   auto data = Tensor1(&data_);
   auto grad = Tensor1(&grad_);
-  auto gsrc = Tensor1(srclayers_[0]->mutable_grad(this));
+  auto gsrc = Tensor1(srclayers[0]->mutable_grad(this));
   gsrc = expr::F<op::relu_grad>(data)*grad;
 }
 
 /*******************Implementation of SigmoidLayer***************************/
-void SigmoidLayer::Setup(const LayerProto& proto, int npartitions) {
-  Layer::Setup(proto, npartitions);
-  data_.ReshapeLike(srclayers_[0]->data(this));
-  grad_.ReshapeLike(srclayers_[0]->grad(this));
+void SigmoidLayer::Setup(const LayerProto& conf,
+    const vector<Layer*>& srclayers) {
+  Layer::Setup(conf, srclayers);
+  data_.ReshapeLike(srclayers[0]->data(this));
+  grad_.ReshapeLike(srclayers[0]->grad(this));
 }
 
-void SigmoidLayer::ComputeFeature(int flag, Metric* perf) {
+void SigmoidLayer::ComputeFeature(int flag, const vector<Layer*>& srclayers) {
   auto data = Tensor1(&data_);
-  auto src = Tensor1(srclayers_[0]->mutable_data(this));
+  auto src = Tensor1(srclayers[0]->mutable_data(this));
   data = expr::F<op::sigmoid>(src);
 }
 
-void SigmoidLayer::ComputeGradient(int flag, Metric* perf) {
+void SigmoidLayer::ComputeGradient(int flag, const vector<Layer*>& srclayers) {
   auto data = Tensor1(&data_);
   auto grad = Tensor1(&grad_);
-  auto gsrc = Tensor1(srclayers_[0]->mutable_grad(this));
+  auto gsrc = Tensor1(srclayers[0]->mutable_grad(this));
   gsrc = expr::F<op::sigmoid_grad>(data) * grad;
 }
 /*******************Implementation of TanLayer***************************/
-void STanhLayer::Setup(const LayerProto& proto, int npartitions) {
-  Layer::Setup(proto, npartitions);
-  data_.ReshapeLike(srclayers_[0]->data(this));
-  grad_.ReshapeLike(srclayers_[0]->grad(this));
+void STanhLayer::Setup(const LayerProto& conf,
+    const vector<Layer*>& srclayers) {
+  Layer::Setup(conf, srclayers);
+  data_.ReshapeLike(srclayers[0]->data(this));
+  grad_.ReshapeLike(srclayers[0]->grad(this));
 }
 
-void STanhLayer::ComputeFeature(int flag, Metric* perf) {
+void STanhLayer::ComputeFeature(int flag, const vector<Layer*>& srclayers) {
   auto data = Tensor1(&data_);
-  auto src = Tensor1(srclayers_[0]->mutable_data(this));
+  auto src = Tensor1(srclayers[0]->mutable_data(this));
   data = expr::F<op::stanh>(src);
 }
 
-void STanhLayer::ComputeGradient(int flag, Metric* perf) {
+void STanhLayer::ComputeGradient(int flag, const vector<Layer*>& srclayers) {
   auto data = Tensor1(&data_);
   auto grad = Tensor1(&grad_);
-  auto gsrc = Tensor1(srclayers_[0]->mutable_grad(this));
+  auto gsrc = Tensor1(srclayers[0]->mutable_grad(this));
   gsrc = expr::F<op::stanh_grad>(data) * grad;
 }
-/********* Implementation for BridgeDstLayer **************/
-void BridgeDstLayer::Setup(const LayerProto& proto, int npartitions) {
-  Layer::Setup(proto, npartitions);
-  CHECK_EQ(srclayers_.size(), 1);
-  data_.Reshape(srclayers_[0]->data(this).shape());
-  grad_.ReshapeLike(data_);
-}
+
 
 }  // namespace singa
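
The hunks above switch every layer callback from the old (proto, npartitions) and Metric* signatures to explicit (conf, srclayers) arguments. As a quick orientation aid, here is a minimal sketch of a hypothetical user-defined layer written against the refactored interface. The class name ScaleLayer, the fixed factor 2.0f, the base class and the header paths are assumptions; the method signatures, Tensor1 and the Blob accessors mirror the code in this commit.

// scale_layer_sketch.cc -- illustration only, not part of the commit
#include <glog/logging.h>
#include "neuralnet/layer.h"   // assumed location of the Layer base class
#include "proto/job.pb.h"      // assumed location of LayerProto

namespace singa {

using std::vector;

class ScaleLayer : public Layer {  // the real base class may be NeuronLayer
 public:
  void Setup(const LayerProto& conf,
      const vector<Layer*>& srclayers) override {
    Layer::Setup(conf, srclayers);
    CHECK_EQ(srclayers.size(), 1);
    data_.ReshapeLike(srclayers[0]->data(this));
    grad_.ReshapeLike(srclayers[0]->grad(this));
  }
  void ComputeFeature(int flag, const vector<Layer*>& srclayers) override {
    auto data = Tensor1(&data_);
    auto src = Tensor1(srclayers[0]->mutable_data(this));
    data = src * 2.0f;  // y = 2x
  }
  void ComputeGradient(int flag, const vector<Layer*>& srclayers) override {
    auto grad = Tensor1(&grad_);
    auto gsrc = Tensor1(srclayers[0]->mutable_grad(this));
    gsrc = grad * 2.0f;  // dL/dx = 2 * dL/dy
  }
};

}  // namespace singa
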

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/321ef96a/src/proto/job.proto
----------------------------------------------------------------------
diff --git a/src/proto/job.proto b/src/proto/job.proto
index dc202d9..950f785 100644
--- a/src/proto/job.proto
+++ b/src/proto/job.proto
@@ -7,9 +7,9 @@
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
-* 
+*
 *   http://www.apache.org/licenses/LICENSE-2.0
-* 
+*
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
@@ -59,9 +59,9 @@ message JobProto {
   // TODO(wangwei): set -1 for test forever
   optional int32 test_steps =  21 [default = 0];
   // frequency of validation, e.g., do validation every 100 training steps
-  optional int32 valid_freq = 25 [default = 0];
+  optional int32 validate_freq = 25 [default = 0];
   // total num of steps for validating all validation data
-  optional int32 valid_steps = 26 [default = 0];
+  optional int32 validate_steps = 26 [default = 0];
   // frequency of checkpoint
   optional int32 checkpoint_freq = 30 [default = 0];
 
@@ -83,7 +83,7 @@ message JobProto {
   // start test after this num steps
   optional int32 test_after = 82 [default = 0];
   // start validation after this num steps
-  optional int32 valid_after = 83 [default = 0];
+  optional int32 validate_after = 83 [default = 0];
 
   // for internal use
   // users typically do not touch following fields
@@ -224,6 +224,8 @@ message LayerProto {
   optional int32 partition_dim = 60 [default = -1];
   // names of parameters shared from other layers
   optional int32 partition_id = 90 [default = 0];
+  // num of partitions for this layer
+  optional int32 num_partitions = 91 [default = 1];
 
   extensions 101 to 200;
 }
@@ -571,7 +573,7 @@ enum PartitionType {
 enum Phase {
   kUnknown = 0;
   kTrain = 1;
-  kValidation = 2;
+  kVal = 2;
   kTest= 4;
  // positive phase for contrastive divergence algorithm
   kPositive = 8;
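
The renaming above (valid_* to validate_*) also renames the accessors that protoc generates for JobProto. Below is a hedged sketch of how the renamed fields would typically be consumed, following the comments in job.proto; the function name and surrounding logic are illustrative rather than SINGA's actual worker code, and the header path and singa namespace for the generated classes are assumed.

// validate_schedule_sketch.cc -- illustration only, not part of the commit
#include "proto/job.pb.h"

bool TimeToValidate(const singa::JobProto& job_conf, int step) {
  if (job_conf.validate_freq() == 0) return false;    // 0 disables validation
  if (step < job_conf.validate_after()) return false;  // not started yet
  return step % job_conf.validate_freq() == 0;  // every validate_freq steps
}
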

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/321ef96a/src/server.cc
----------------------------------------------------------------------
diff --git a/src/server.cc b/src/server.cc
new file mode 100644
index 0000000..3e0f4cb
--- /dev/null
+++ b/src/server.cc
@@ -0,0 +1,269 @@
+/************************************************************
+*
+* Licensed to the Apache Software Foundation (ASF) under one
+* or more contributor license agreements.  See the NOTICE file
+* distributed with this work for additional information
+* regarding copyright ownership.  The ASF licenses this file
+* to you under the Apache License, Version 2.0 (the
+* "License"); you may not use this file except in compliance
+* with the License.  You may obtain a copy of the License at
+*
+*   http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing,
+* software distributed under the License is distributed on an
+* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+* KIND, either express or implied.  See the License for the
+* specific language governing permissions and limitations
+* under the License.
+*
+*************************************************************/
+
+#include "./server.h"
+
+#include <thread>
+#include <chrono>
+#include "mshadow/tensor.h"
+#include "proto/common.pb.h"
+#include "utils/param.h"
+#include "utils/singleton.h"
+#include "utils/factory.h"
+#include "utils/cluster.h"
+
+namespace singa {
+
+using namespace mshadow;
+using std::vector;
+
+Server::Server(int group_id, int server_id,
+    const JobProto& job_conf,
+    const vector<int>& slice2group,
+    const vector<int>& slice2server) {
+  grp_id_ = group_id;
+  id_ = server_id;
+  updater_ = Updater::Create(job_conf.updater());
+  slice2group_ = slice2group;
+  slice2server_ = slice2server;
+}
+
+Server::~Server() {
+  delete updater_;
+  // free Params (i.e., slices) in server shard
+  for (auto entry : shard_)
+    for (auto param : entry.second->shares)
+      delete param;
+}
+
+void Stop(void* running) {
+  *static_cast<bool *>(running) = false;
+}
+
+void Server::Run() {
+  LOG(ERROR) << "Server (group = " << grp_id_ <<", id = " << id_ << ") start";
+  auto cluster = Cluster::Get();
+  if (cluster->nserver_groups()) {
+    CHECK_GT(slice2group_.size(), 0);
+    if (cluster->nservers_per_group()) {
+      CHECK_GT(slice2server_.size(), 0);
+    }
+  }
+  n_updates_.resize(slice2group_.size(), 0);
+  n_pending_sync_.resize(slice2group_.size(), 0);
+  last_sync_.resize(slice2group_.size());
+
+  // TODO(wangsh): give each dealer a unique id
+  auto dealer = new Dealer(0);
+  CHECK(dealer->Connect(kInprocRouterEndpoint));
+  Msg* ping = new Msg(Addr(grp_id_, id_, kServer), Addr(-1, -1, kStub));
+  ping->set_type(kConnect);
+  dealer->Send(&ping);
+
+  bool running = true;
+  CHECK(cluster->runtime()->WatchSGroup(grp_id_, id_, Stop, &running));
+  Poller poll(dealer);
+  // start recv loop and process requests
+  while (running) {
+    // must use poller here; otherwise Receive() gets stuck after workers stop.
+    auto* sock = poll.Wait(cluster->poll_time());
+    if (poll.Terminated()) {
+      LOG(ERROR) << "Connection broken!";
+      exit(0);
+    } else if (sock == nullptr) {
+      continue;
+    }
+    Msg* msg = dealer->Receive();
+    if (msg == nullptr) break;  // interrupted
+    Msg* response = nullptr;
+    int type = msg->type();
+    int slice_id = SliceID(msg->trgt_val());
+    if (type == kPut) {
+      response = HandlePut(&msg);
+    } else if (shard_.find(slice_id) == shard_.end()) {
+      // TODO(wangsh): buffer the msg instead, and process it after the
+      //               corresponding put request is done
+      // delay the processing by re-queuing the msg. May sleep for a while?
+      response = msg;
+    } else {
+      switch (type) {
+        case kGet:
+          response = HandleGet(&msg);
+          break;
+        case kUpdate:
+          for (auto reply : HandleUpdate(&msg))
+            dealer->Send(&reply);
+          break;
+        case kSyncRequest:
+          response = HandleSyncRequest(&msg);
+          break;
+        case kSyncResponse:
+          HandleSyncResponse(&msg);
+          break;
+        default:
+          LOG(ERROR) << "Unknown message type: " << type;
+          break;
+      }
+    }
+    if (response != nullptr)
+      dealer->Send(&response);
+  }
+
+  // send stop msg to stub
+  Msg* msg = new Msg(Addr(grp_id_, id_, kServer), Addr(-1, -1, kStub));
+  msg->set_type(kStop);
+  dealer->Send(&msg);
+  std::this_thread::sleep_for(std::chrono::milliseconds(1000));
+  LOG(ERROR) << "Server (group = " << grp_id_ << ", id = " << id_ << ") stops";
+  delete dealer;
+}
+
+Msg* Server::HandlePut(Msg **msg) {
+  int version = (*msg)->trgt_version();
+  int slice_id = SliceID((*msg)->trgt_val());
+  if (shard_.find(slice_id) != shard_.end())
+    LOG(FATAL) << "Param (" << slice_id << ") is put more than once";
+
+  // TODO(wangwei) replace hard coded param type 0
+  auto  param = Singleton<Factory<Param>>::Instance()->Create(0);
+  auto response = param->HandlePutMsg(msg, true);
+  // parse num of shares of this param from a worker group
+  int num_shares = 1;
+  if ((*msg)->NextFrame())
+    (*msg)->ParseFormatFrame("i", &num_shares);
+  DeleteMsg(msg);
+  shard_[slice_id] = new ParamEntry(num_shares, param);
+  // must set version after HandlePutMsg which allocates the memory
+  param->set_version(version);
+  param->set_local_version(version);
+  param->set_id(slice_id);
+  // allocate blob for param sync between groups.
+  if (slice2group_[slice_id] != grp_id_) {
+    last_sync_[slice_id].ReshapeLike(param->data());
+    last_sync_[slice_id].CopyFrom(param->data());
+  }
+  LOG(INFO) << "server (group = " << grp_id_ << ", id = " << id_
+            <<") put slice=" << slice_id << " size=" << param->size();
+  return response;
+}
+
+Msg* Server::HandleGet(Msg **msg) {
+  int val = (*msg)->trgt_val();
+  auto param = shard_.at(SliceID(val))->shares.at(0);
+  // re-queue the request if the param is not updated to the required version
+  if (param->version() < (*msg)->trgt_version()) {
+    return *msg;
+  } else {
+    // LOG(ERROR) << "get " << slice << " from "<<(*msg)->src_first();
+    auto reply = param->HandleGetMsg(msg, false);
+    reply->set_trgt(val, param->version());
+    return reply;
+  }
+}
+
+const vector<Msg*> Server::HandleUpdate(Msg **msg) {
+  vector<Msg*> ret;
+  int sliceid = SliceID((*msg)->trgt_val());
+  auto entry = shard_.at(sliceid);
+  buffer_requests_[sliceid].push_back(*msg);
+  int num_update;
+  (*msg)->LastFrame();
+  (*msg)->ParseFormatFrame("i", &num_update);
+  (*msg)->FirstFrame();
+  entry->num_update += num_update;
+  // LOG(ERROR) << "update "<< sliceid << " from " << AddrGrp((*msg)->src())
+  //            << ", " << num_update << " total " << entry->num_total;
+  // update only after receiving gradients from all shares of this param/slice
+  if (entry->num_update >= entry->num_total) {
+    CHECK_EQ(entry->num_update, entry->num_total);
+    auto& request = buffer_requests_.at(sliceid);
+    int step = (*msg)->trgt_version();
+    int trgt_val = (*msg)->trgt_val();
+    auto param = entry->shares.at(0);
+    // extract and aggregate gradients
+    param->ParseUpdateMsgs(request);
+    updater_->Update(step, param, 1.0f / entry->num_total);
+    param->set_local_version(param->local_version() + 1);
+    // response to all shares of this param
+    for (auto response : param->GenUpdateResponseMsgs(&request, false)) {
+      response->set_trgt(trgt_val, param->local_version());
+      ret.push_back(response);
+    }
+    entry->num_update = 0;
+    n_updates_[sliceid]++;
+    // sync with master group after at least sync_freq local updates
+    // the last check is to avoid sending msg to stopped servers
+    if (slice2group_[sliceid] != grp_id_
+        && n_updates_[sliceid] >= Cluster::Get()->sync_freq()
+        && n_pending_sync_[sliceid] <= Cluster::Get()->sync_freq()) {
+      auto shape = Shape1(param->size());
+      Tensor<cpu, 1> tmp(last_sync_[sliceid].mutable_cpu_data(), shape);
+      Tensor<cpu, 1> cur(param->mutable_cpu_data(), shape);
+      tmp = cur - tmp;
+      int addr = Addr(slice2group_[sliceid], slice2server_[sliceid], kServer);
+      Msg* sync = new Msg(Addr(grp_id_, id_, kServer), addr);
+      sync->set_type(kSyncRequest);
+      sync->set_trgt(trgt_val, param->local_version());
+      sync->AddFrame(tmp.dptr, param->size() * sizeof(float));
+      Copy(tmp, cur);
+      ret.push_back(sync);
+      n_updates_[sliceid] = 0;
+      n_pending_sync_[sliceid]++;
+    }
+  }
+  // message already pushed to buffer, just need to reset the pointer
+  *msg = nullptr;
+  return ret;
+}
+
+Msg* Server::HandleSyncRequest(Msg **msg) {
+  Msg* msgg = *msg;
+  int slice = SliceID(msgg->trgt_val());
+  auto param = shard_.at(slice)->shares.at(0);
+  auto shape = Shape1(param->size());
+  CHECK_EQ(msgg->FrameSize(), param->size()*sizeof(float));
+  Tensor<cpu, 1> inc(static_cast<float*>(msgg->FrameData()), shape);
+  Tensor<cpu, 1> cur(param->mutable_cpu_data(), shape);
+  // recv sync msg on the slice I am maintaining
+  cur += inc;
+  msgg->SwapAddr();
+  msgg->set_type(kSyncResponse);
+  // copy the fresh param value into the response msg
+  Copy(inc, cur);
+  return msgg;
+}
+
+// recv sync msg on slice mastered by others
+void Server::HandleSyncResponse(Msg **msg) {
+  Msg* msgg = *msg;
+  int slice = SliceID(msgg->trgt_val());
+  auto param = shard_.at(slice)->shares.at(0);
+  auto shape = Shape1(param->size());
+  Tensor<cpu, 1> prev(last_sync_[param->id()].mutable_cpu_data(), shape);
+  Tensor<cpu, 1> cur(param->mutable_cpu_data(), shape);
+  Tensor<cpu, 1> master(static_cast<float*>(msgg->FrameData()), shape);
+  cur += master - prev;  // cur = master + (cur - prev);
+  Copy(prev, cur);
+  DeleteMsg(msg);
+  n_pending_sync_[slice]--;
+}
+
+}  // namespace singa
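
The delta-based synchronization between server groups in the new server.cc is spread across HandleUpdate, HandleSyncRequest and HandleSyncResponse. The toy program below (not SINGA code) replays the same arithmetic on a single scalar parameter so the three steps can be checked in isolation; the variable names mirror the roles of last_sync_, the local param value and the master group's copy.

// sync_arithmetic_sketch.cc -- illustration only, not part of the commit
#include <cassert>

int main() {
  // slave (non-master group) state
  float cur = 5.0f, last_sync = 3.0f;
  // master group state
  float master = 10.0f;

  // HandleUpdate on the slave: send the local delta since the last sync
  float delta = cur - last_sync;   // tmp = cur - tmp
  last_sync = cur;                 // Copy(tmp, cur)

  // HandleSyncRequest on the master: apply the delta, reply with fresh value
  master += delta;                 // cur += inc
  float reply = master;            // Copy(inc, cur)

  // HandleSyncResponse on the slave: cur = master + (cur - prev)
  cur += reply - last_sync;
  last_sync = cur;                 // Copy(prev, cur)

  assert(cur == 12.0f && master == 12.0f);  // both groups agree again
  return 0;
}
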

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/321ef96a/src/stub.cc
----------------------------------------------------------------------
diff --git a/src/stub.cc b/src/stub.cc
new file mode 100644
index 0000000..7b439e5
--- /dev/null
+++ b/src/stub.cc
@@ -0,0 +1,285 @@
+/************************************************************
+*
+* Licensed to the Apache Software Foundation (ASF) under one
+* or more contributor license agreements.  See the NOTICE file
+* distributed with this work for additional information
+* regarding copyright ownership.  The ASF licenses this file
+* to you under the Apache License, Version 2.0 (the
+* "License"); you may not use this file except in compliance
+* with the License.  You may obtain a copy of the License at
+*
+*   http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing,
+* software distributed under the License is distributed on an
+* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+* KIND, either express or implied.  See the License for the
+* specific language governing permissions and limitations
+* under the License.
+*
+*************************************************************/
+
+#include "./stub.h"
+
+#include <glog/logging.h>
+#include <unistd.h>
+#include <map>
+#include <thread>
+#include <set>
+#include "mshadow/tensor.h"
+#include "proto/common.pb.h"
+#include "utils/cluster.h"
+#include "utils/common.h"
+#include "utils/tinydir.h"
+
+namespace singa {
+
+using std::vector;
+using std::string;
+
+/***********************Stub****************************/
+Stub::~Stub() {
+  delete router_;
+}
+void Stub::Setup() {
+  router_ = new Router();
+  router_->Bind(kInprocRouterEndpoint);
+  auto cluster = Cluster::Get();
+  const string hostip = cluster->hostip();
+  int port = router_->Bind("tcp://" + hostip + ":*");
+  endpoint_ = hostip + ":" + std::to_string(port);
+}
+/**
+ * Get a hash id for a Param object from a group.
+ *
+ * Simply multiply group_id by a large prime number 997 (assuming there are
+ * no more than 997 worker groups) and add the owner param id.
+ */
+inline int Hash(int grp_id, int param_id) {
+  return grp_id * 997 + param_id;
+}
+const std::unordered_map<int, ParamEntry*>  CreateParamShard(
+    const vector<Worker*>& workers) {
+  std::unordered_map<int, ParamEntry*> shard;
+  // grp id -> net
+  std::unordered_map<int, NeuralNet*> grp2net;
+  // grp id -> worker id range
+  std::unordered_map<int, std::pair<int, int>> grp2workers;
+  for (auto worker : workers) {
+    int grp = worker->grp_id(), id = worker->id();
+    if (grp2net.find(grp) == grp2net.end()) {
+      grp2net[grp] = worker->train_net();
+      grp2workers[grp] = std::make_pair(id, id + 1);
+    } else {
+      CHECK_EQ(grp2net[grp], worker->train_net());
+      int start = grp2workers[grp].first, end = grp2workers[grp].second;
+      if (start > id) start = id;
+      if (end < id + 1) end = id + 1;
+      grp2workers[grp] = std::make_pair(start, end);
+    }
+  }
+
+  for (const auto entry : grp2net) {
+    int grp = entry.first;
+    int wstart = grp2workers[grp].first, wend = grp2workers[grp].second;
+    for (auto layer : entry.second->layers()) {
+      int partition = layer->partition_id();
+      bool local =  partition >= wstart && partition < wend;
+      for (auto param : layer->GetParams()) {
+        int hash = Hash(grp, param->owner());
+        if (shard.find(hash) == shard.end())
+          shard[hash] = new ParamEntry();
+        shard[hash]->AddParam(local, param);
+      }
+    }
+  }
+  return shard;
+}
+
+void Stub::Run(const vector<int>& slice2server,
+    const vector<Worker*>& workers, const vector<Server*>& servers) {
+  slice2server_ = slice2server;
+  int nworkers = workers.size(), nservers = servers.size();
+  auto cluster = Cluster::Get();
+  int procs_id = cluster->procs_id();
+  LOG(INFO) << "Stub in process " << procs_id << " starts";
+  auto shard = CreateParamShard(workers);
+  std::map<int, Dealer*> inter_dealers;  // for sending msg to other procs
+  std::queue<Msg*> msg_queue;
+  while (true) {
+    Msg* msg = nullptr;
+    if (msg_queue.empty()) {
+      msg = router_->Receive();
+    } else {
+      msg = msg_queue.front();
+      msg_queue.pop();
+    }
+    int type = msg->type(), dst = msg->dst(), flag = AddrType(dst);
+    if (flag == kStub && (AddrProc(dst) == procs_id || AddrGrp(dst) == -1)) {
+      //  the following statements are ordered!
+      if (type == kConnect) {
+        DeleteMsg(&msg);
+      } else if (type == kStop) {
+        int src_flag = AddrType(msg->src());
+        if (src_flag == kServer) nservers--;
+        else if (src_flag == kWorkerParam) nworkers--;
+        DeleteMsg(&msg);
+        if (nworkers == 0 && nservers == 0) break;
+      } else {
+        int grp;
+        int paramid = ParamID(msg->trgt_val());
+        ParamEntry *entry = nullptr;
+        switch (type) {
+          case kUpdate:
+            grp = AddrGrp(msg->src());
+            entry = shard.at(Hash(grp, paramid));
+            for (auto update_msg : HandleUpdateRequest(entry, &msg))
+              msg_queue.push(update_msg);
+            break;
+          case kRUpdate:
+            grp = AddrGrp(msg->dst());
+            entry = shard.at(Hash(grp, paramid));
+            HandleUpdateResponse(entry, &msg);
+            break;
+          case kGet:
+            grp = AddrGrp(msg->src());
+            entry = shard.at(Hash(grp, paramid));
+            for (auto get_msg : HandleGetRequest(entry, &msg))
+              msg_queue.push(get_msg);
+            break;
+          case kRGet:
+            grp = AddrGrp(msg->dst());
+            entry = shard.at(Hash(grp, paramid));
+            HandleGetResponse(entry, &msg);
+            break;
+          case kPut:
+            grp = AddrGrp(msg->src());
+            entry = shard.at(Hash(grp, paramid));
+            for (auto put_msg : HandlePutRequest(entry, &msg))
+              msg_queue.push(put_msg);
+            break;
+          default:
+            LOG(ERROR) << "Unknow message type:" << type;
+            break;
+        }
+      }
+    } else {
+      int dst_procs = AddrProc(dst);
+      if (flag != kStub)
+        dst_procs = cluster->ProcsIDOf(AddrGrp(dst), AddrID(dst), flag);
+      if (dst_procs != procs_id) {
+        if (inter_dealers.find(dst_procs) == inter_dealers.end())
+          inter_dealers[dst_procs] = CreateInterProcsDealer(dst_procs);
+        inter_dealers[dst_procs]->Send(&msg);
+      } else {
+        router_->Send(&msg);
+      }
+    }
+  }
+  LOG(ERROR) << "Stub in process " << procs_id << " stops";
+  for (auto& entry : inter_dealers)
+    delete entry.second;
+}
+
+Dealer* Stub::CreateInterProcsDealer(int dst_procs) {
+  // forward to other procs
+  auto cluster = Cluster::Get();
+  auto dealer = new Dealer();
+  while (cluster->endpoint(dst_procs) == "") {
+    // kCollectSleepTime));
+    std::this_thread::sleep_for(std::chrono::milliseconds(3000));
+    LOG(ERROR) << "waiting for procs " << dst_procs << " to register";
+  }
+  dealer->Connect("tcp://"+cluster->endpoint(dst_procs));
+  return dealer;
+}
+
+void Stub::GenMsgs(int type, int version, ParamEntry* entry, Msg* msg,
+                      vector<Msg*> *ret) {
+  int procs_id = Cluster::Get()->procs_id();
+  int src_grp = AddrGrp(msg->src());
+  int dst_grp = src_grp / Cluster::Get()->nworker_groups_per_server_group();
+  auto param = entry->shares.at(0);
+  for (int idx = 0 ; idx < param->num_slices(); idx++) {
+    int slice_id = param->slice_start() + idx;
+    int server = slice2server_[slice_id];
+    int dst_procs = Cluster::Get()->ProcsIDOf(dst_grp, server, kServer);
+    Msg* new_msg = nullptr;
+    if (type == kPut) {
+      CHECK_GT(entry->num_total, 0);
+      new_msg = param->GenPutMsg(dst_procs != procs_id, idx);
+      new_msg->AddFormatFrame("i", entry->num_total);
+    } else if (type == kGet) {
+      new_msg = param->GenGetMsg(dst_procs != procs_id, idx);
+    } else if (type == kUpdate) {
+      new_msg = param->GenUpdateMsg(dst_procs != procs_id, idx);
+      new_msg->AddFormatFrame("i", entry->num_local);
+    } else {
+      LOG(FATAL) << "Wrong type";
+    }
+    new_msg->set_trgt(ParamTrgt(param->owner(), slice_id), version);
+    new_msg->set_src(Addr(src_grp, procs_id, kStub));
+    new_msg->set_dst(Addr(dst_grp, server, kServer));
+    ret->push_back(new_msg);
+  }
+}
+
+const vector<Msg*> Stub::HandleGetRequest(ParamEntry* entry, Msg** msg) {
+  vector<Msg*> ret;
+  int version = (*msg)->trgt_version();
+  if (version > entry->next_version) {
+    entry->next_version = version;
+    GenMsgs(kGet, version, entry, *msg, &ret);
+  }
+  DeleteMsg(msg);
+  return ret;
+}
+
+const vector<Msg*> Stub::HandleUpdateRequest(ParamEntry *entry, Msg** msg) {
+  vector<Msg*> ret;
+  entry->num_update++;
+  if (entry->num_update >= entry->num_local) {
+    // average local gradient
+    if (entry->num_local > 1) {
+      auto it = entry->shares.begin();
+      auto shape = mshadow::Shape1((*it)->size());
+      mshadow::Tensor<mshadow::cpu, 1> sum((*it)->mutable_cpu_grad(), shape);
+      for (++it; it != entry->shares.end(); it++) {
+        mshadow::Tensor<mshadow::cpu, 1> grad((*it)->mutable_cpu_grad(), shape);
+        sum += grad;
+      }
+    }
+    int step = (*msg)->trgt_version();
+    GenMsgs(kUpdate, step, entry, *msg, &ret);
+    entry->num_update = 0;
+  }
+  DeleteMsg(msg);
+  return ret;
+}
+
+const vector<Msg*> Stub::HandlePutRequest(ParamEntry* entry, Msg** msg) {
+  vector<Msg*> ret;
+  int version = (*msg)->trgt_version();
+  GenMsgs(kPut, version, entry, *msg, &ret);
+  DeleteMsg(msg);
+  return ret;
+}
+
+void Stub::HandleGetResponse(ParamEntry* entry, Msg** msg) {
+  int version = (*msg)->trgt_version();
+  int sliceid = SliceID((*msg)->trgt_val());
+  auto param = entry->shares.at(0);
+  if (param->ParseGetResponseMsg(*msg, sliceid-param->slice_start()))
+    param->set_version(version);
+  DeleteMsg(msg);
+}
+
+void Stub::HandleUpdateResponse(ParamEntry* entry, Msg** msg) {
+  int version = (*msg)->trgt_version();
+  int sliceid = SliceID((*msg)->trgt_val());
+  auto param = entry->shares.at(0);
+  if (param->ParseUpdateResponseMsg(*msg, sliceid-param->slice_start()))
+    param->set_version(version);
+  DeleteMsg(msg);
+}
+}  // namespace singa
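
CreateParamShard in the new stub.cc keys its map with Hash(grp, param->owner()). The short check below (illustration only, not part of the commit) reuses the same formula to show that distinct (group, param) pairs map to distinct shard keys as long as owner param ids stay below the multiplier 997.

// hash_check_sketch.cc -- illustration only, not part of the commit
#include <cassert>
#include <set>

inline int Hash(int grp_id, int param_id) {  // same formula as in stub.cc
  return grp_id * 997 + param_id;
}

int main() {
  std::set<int> keys;
  for (int grp = 0; grp < 4; ++grp)
    for (int param = 0; param < 10; ++param)
      keys.insert(Hash(grp, param));
  assert(keys.size() == 4 * 10);  // distinct pairs -> distinct keys
  return 0;
}
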

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/321ef96a/src/trainer/server.cc
----------------------------------------------------------------------
diff --git a/src/trainer/server.cc b/src/trainer/server.cc
deleted file mode 100644
index 5e74c1b..0000000
--- a/src/trainer/server.cc
+++ /dev/null
@@ -1,263 +0,0 @@
-/************************************************************
-*
-* Licensed to the Apache Software Foundation (ASF) under one
-* or more contributor license agreements.  See the NOTICE file
-* distributed with this work for additional information
-* regarding copyright ownership.  The ASF licenses this file
-* to you under the Apache License, Version 2.0 (the
-* "License"); you may not use this file except in compliance
-* with the License.  You may obtain a copy of the License at
-* 
-*   http://www.apache.org/licenses/LICENSE-2.0
-* 
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an
-* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-* KIND, either express or implied.  See the License for the
-* specific language governing permissions and limitations
-* under the License.
-*
-*************************************************************/
-
-#include "trainer/server.h"
-
-#include <thread>
-#include <chrono>
-#include "mshadow/tensor.h"
-#include "proto/common.pb.h"
-#include "utils/param.h"
-#include "utils/singleton.h"
-#include "utils/factory.h"
-#include "utils/cluster.h"
-
-namespace singa {
-
-using namespace mshadow;
-using std::vector;
-
-Server::Server(int group_id, int server_id) {
-  grp_id_ = group_id;
-  id_ = server_id;
-}
-
-void Server::Setup(const UpdaterProto& proto, const vector<int>& slice2group,
-                   const vector<int>& slice2server) {
-  updater_ = Updater::Create(proto);
-  slice2group_ = slice2group;
-  slice2server_ = slice2server;
-  n_updates_.resize(slice2group_.size(), 0);
-  n_pending_sync_.resize(slice2group_.size(), 0);
-  last_sync_.resize(slice2group_.size());
-}
-
-Server::~Server() {
-  delete updater_;
-  // free Params (i.e., slices) in server shard
-  for (auto entry : shard_)
-    for (auto param : entry.second->shares)
-      delete param;
-}
-
-void Stop(void* running) {
-  *static_cast<bool *>(running) = false;
-}
-
-void Server::Run() {
-  LOG(ERROR) << "Server (group = " << grp_id_ <<", id = " << id_ << ") start";
-  // TODO(wangsh): give each dealer a unique id
-  auto dealer = new Dealer(0);
-  CHECK(dealer->Connect(kInprocRouterEndpoint));
-  Msg* ping = new Msg(Addr(grp_id_, id_, kServer), Addr(-1, -1, kStub));
-  ping->set_type(kConnect);
-  dealer->Send(&ping);
-
-  auto cluster = Cluster::Get();
-  bool running = true;
-  CHECK(cluster->runtime()->WatchSGroup(grp_id_, id_, Stop, &running));
-  Poller poll(dealer);
-  // start recv loop and process requests
-  while (running) {
-    // must use poller here; otherwise Receive() gets stuck after workers stop.
-    auto* sock = poll.Wait(cluster->poll_time());
-    if (poll.Terminated()) {
-      LOG(ERROR) << "Connection broken!";
-      exit(0);
-    } else if (sock == nullptr) {
-      continue;
-    }
-    Msg* msg = dealer->Receive();
-    if (msg == nullptr) break;  // interrupted
-    Msg* response = nullptr;
-    int type = msg->type();
-    int slice_id = SliceID(msg->trgt_val());
-    if (type == kPut) {
-      response = HandlePut(&msg);
-    } else if (shard_.find(slice_id) == shard_.end()) {
-      // TODO(wangsh): buffer the msg instead, and process it after the
-      //               corresponding put request is done
-      // delay the processing by re-queue the msg. May sleep for a while?
-      response = msg;
-    } else {
-      switch (type) {
-        case kGet:
-          response = HandleGet(&msg);
-          break;
-        case kUpdate:
-          for (auto reply : HandleUpdate(&msg))
-            dealer->Send(&reply);
-          break;
-        case kSyncRequest:
-          response = HandleSyncRequest(&msg);
-          break;
-        case kSyncResponse:
-          HandleSyncResponse(&msg);
-          break;
-        default:
-          LOG(ERROR) << "Unknown message type: " << type;
-          break;
-      }
-    }
-    if (response != nullptr)
-      dealer->Send(&response);
-  }
-
-  // send stop msg to stub
-  Msg* msg = new Msg(Addr(grp_id_, id_, kServer), Addr(-1, -1, kStub));
-  msg->set_type(kStop);
-  dealer->Send(&msg);
-  std::this_thread::sleep_for(std::chrono::milliseconds(1000));
-  LOG(ERROR) << "Server (group = " << grp_id_ << ", id = " << id_ << ") stops";
-  delete dealer;
-}
-
-Msg* Server::HandlePut(Msg **msg) {
-  int version = (*msg)->trgt_version();
-  int slice_id = SliceID((*msg)->trgt_val());
-  if (shard_.find(slice_id) != shard_.end())
-    LOG(FATAL) << "Param (" << slice_id << ") is put more than once";
-
-  // TODO(wangwei) replace hard coded param type 0
-  auto  param = Singleton<Factory<Param>>::Instance()->Create(0);
-  auto response = param->HandlePutMsg(msg, true);
-  // parse num of shares of this param from a worker group
-  int num_shares = 1;
-  if ((*msg)->NextFrame())
-    (*msg)->ParseFormatFrame("i", &num_shares);
-  DeleteMsg(msg);
-  shard_[slice_id] = new ParamEntry(num_shares, param);
-  // must set version after HandlePutMsg which allocates the memory
-  param->set_version(version);
-  param->set_local_version(version);
-  param->set_id(slice_id);
-  // allocate blob for param sync between groups.
-  if (slice2group_[slice_id] != grp_id_) {
-    last_sync_[slice_id].ReshapeLike(param->data());
-    last_sync_[slice_id].CopyFrom(param->data());
-  }
-  LOG(INFO) << "server (group = " << grp_id_ << ", id = " << id_
-            <<") put slice=" << slice_id << " size=" << param->size();
-  return response;
-}
-
-Msg* Server::HandleGet(Msg **msg) {
-  int val = (*msg)->trgt_val();
-  auto param = shard_.at(SliceID(val))->shares.at(0);
-  // re-queue the request if the param is not updated to the required version
-  if (param->version() < (*msg)->trgt_version()) {
-    return *msg;
-  } else {
-    // LOG(ERROR) << "get " << slice << " from "<<(*msg)->src_first();
-    auto reply = param->HandleGetMsg(msg, false);
-    reply->set_trgt(val, param->version());
-    return reply;
-  }
-}
-
-const vector<Msg*> Server::HandleUpdate(Msg **msg) {
-  vector<Msg*> ret;
-  int sliceid = SliceID((*msg)->trgt_val());
-  auto entry = shard_.at(sliceid);
-  buffer_requests_[sliceid].push_back(*msg);
-  int num_update;
-  (*msg)->LastFrame();
-  (*msg)->ParseFormatFrame("i", &num_update);
-  (*msg)->FirstFrame();
-  entry->num_update += num_update;
-  // LOG(ERROR) << "update "<< sliceid << " from " << AddrGrp((*msg)->src())
-  //            << ", " << num_update << " total " << entry->num_total;
-  // do update until recv gradients from all shares of this param/slice
-  if (entry->num_update >= entry->num_total) {
-    CHECK_EQ(entry->num_update, entry->num_total);
-    auto& request = buffer_requests_.at(sliceid);
-    int step = (*msg)->trgt_version();
-    int trgt_val = (*msg)->trgt_val();
-    auto param = entry->shares.at(0);
-    // extract and aggregate gradients
-    param->ParseUpdateMsgs(request);
-    updater_->Update(step, param, 1.0f / entry->num_total);
-    param->set_local_version(param->local_version() + 1);
-    // response to all shares of this param
-    for (auto response : param->GenUpdateResponseMsgs(&request, false)) {
-      response->set_trgt(trgt_val, param->local_version());
-      ret.push_back(response);
-    }
-    entry->num_update = 0;
-    n_updates_[sliceid]++;
-    // sync with master group after at least sync_freq local updates
-    // the last check is to avoid sending msg to stopped servers
-    if (slice2group_[sliceid] != grp_id_
-        && n_updates_[sliceid] >= Cluster::Get()->sync_freq()
-        && n_pending_sync_[sliceid] <= Cluster::Get()->sync_freq()) {
-      auto shape = Shape1(param->size());
-      Tensor<cpu, 1> tmp(last_sync_[sliceid].mutable_cpu_data(), shape);
-      Tensor<cpu, 1> cur(param->mutable_cpu_data(), shape);
-      tmp = cur - tmp;
-      int addr = Addr(slice2group_[sliceid], slice2server_[sliceid], kServer);
-      Msg* sync = new Msg(Addr(grp_id_, id_, kServer), addr);
-      sync->set_type(kSyncRequest);
-      sync->set_trgt(trgt_val, param->local_version());
-      sync->AddFrame(tmp.dptr, param->size() * sizeof(float));
-      Copy(tmp, cur);
-      ret.push_back(sync);
-      n_updates_[sliceid] = 0;
-      n_pending_sync_[sliceid]++;
-    }
-  }
-  // message already pushed to buffer, just need to reset the pointer
-  *msg = nullptr;
-  return ret;
-}
-
-Msg* Server::HandleSyncRequest(Msg **msg) {
-  Msg* msgg = *msg;
-  int slice = SliceID(msgg->trgt_val());
-  auto param = shard_.at(slice)->shares.at(0);
-  auto shape = Shape1(param->size());
-  CHECK_EQ(msgg->FrameSize(), param->size()*sizeof(float));
-  Tensor<cpu, 1> inc(static_cast<float*>(msgg->FrameData()), shape);
-  Tensor<cpu, 1> cur(param->mutable_cpu_data(), shape);
-  // recv sync msg on the slice I am maintaining
-  cur += inc;
-  msgg->SwapAddr();
-  msgg->set_type(kSyncResponse);
-  // copy the fresh param value into the response msg
-  Copy(inc, cur);
-  return msgg;
-}
-
-// recv sync msg on slice mastered by others
-void Server::HandleSyncResponse(Msg **msg) {
-  Msg* msgg = *msg;
-  int slice = SliceID(msgg->trgt_val());
-  auto param = shard_.at(slice)->shares.at(0);
-  auto shape = Shape1(param->size());
-  Tensor<cpu, 1> prev(last_sync_[param->id()].mutable_cpu_data(), shape);
-  Tensor<cpu, 1> cur(param->mutable_cpu_data(), shape);
-  Tensor<cpu, 1> master(static_cast<float*>(msgg->FrameData()), shape);
-  cur += master - prev;  // cur = master + (cur - prev);
-  Copy(prev, cur);
-  DeleteMsg(msg);
-  n_pending_sync_[slice]--;
-}
-
-}  // namespace singa