Posted to commits@singa.apache.org by wa...@apache.org on 2015/07/11 10:53:59 UTC

[1/3] incubator-singa git commit: SINGA-29 Update NeuralNet class to enable customizing layer partition type

Repository: incubator-singa
Updated Branches:
  refs/heads/master ea7cfea49 -> 9a6e09fa2


http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/9a6e09fa/src/utils/graph.cc
----------------------------------------------------------------------
diff --git a/src/utils/graph.cc b/src/utils/graph.cc
index b1f5b9f..d92e241 100644
--- a/src/utils/graph.cc
+++ b/src/utils/graph.cc
@@ -1,166 +1,202 @@
+
+#include "utils/graph.h"
+#include <glog/logging.h>
 #include <algorithm>
 #include <queue>
 #include <unordered_set>
-#include "utils/graph.h"
 
-const string Graph::ToString() const {
+namespace singa {
+/************************Node********************************/
+
+Node::~Node() {
+  // the proto field is deleted outside by other functions
+}
+
+Node::Node(string name) {
+  this->name = name;
+}
+
+Node::Node(const string& name, const string& origin, int id, void* proto) {
+  this->name = name;
+  this->origin = origin;
+  this->proto = proto;
+  this->partition_id = id;
+}
+
+void Node::AddDstNode(Node* dstnode) {
+  dstnodes.push_back(dstnode);
+}
+
+void Node::AddSrcNode(Node* srcnode) {
+  srcnodes.push_back(srcnode);
+}
+
+void Node::RemoveDstNode(Node* dst) {
+  auto iter = dstnodes.begin();
+  while (iter != dstnodes.end() && (*iter)->name != dst->name)
+    iter++;
+  CHECK(iter != dstnodes.end()) << "can't find dst node " << dst->name;
+  dstnodes.erase(iter);
+}
+
+void Node::RemoveSrcNode(Node* src) {
+  auto iter = srcnodes.begin();
+  while (iter != srcnodes.end() && (*iter)->name != src->name)
+    iter++;
+  CHECK(iter != srcnodes.end()) << "can't find src node " << src->name;
+  srcnodes.erase(iter);
+}
+
+/*************************Graph****************************/
+Graph::~Graph() {
+  for (Node* node : nodes_)
+    delete node;
+}
+
+void Graph::AddNode(Node* node) {
+  nodes_.push_back(node);
+  name2node_[node->name] = node;
+}
+
+Node* Graph::AddNode(const string& name) {
+  Node* node = new Node(name);
+  AddNode(node);
+  return node;
+}
+
+void Graph::AddEdge(Node* srcnode, Node* dstnode) {
+  srcnode->AddDstNode(dstnode);
+  dstnode->AddSrcNode(srcnode);
+}
+
+void Graph::AddEdge(const string& src, const string& dst) {
+  CHECK(name2node_.find(src) != name2node_.end())
+    <<"can't find src node " << src;
+  CHECK(name2node_.find(dst) != name2node_.end())
+    <<"can't find dst node " << dst;
+
+  Node* srcnode = name2node_[src], *dstnode = name2node_[dst];
+  AddEdge(srcnode, dstnode);
+}
+
+void Graph::RemoveEdge(Node* src, Node* dst) {
+  src->RemoveDstNode(dst);
+  dst->RemoveSrcNode(src);
+}
+
+void Graph::RemoveEdge(const string &src, const string& dst) {
+  CHECK(name2node_.find(src) != name2node_.end())
+    <<"can't find src node " << src;
+  CHECK(name2node_.find(dst) != name2node_.end())
+    <<"can't find dst node " << dst;
+
+  Node* srcnode = name2node_[src], *dstnode = name2node_[dst];
+  RemoveEdge(srcnode, dstnode);
+}
+
+const string Graph::ToJson() const {
   map<string, string> info;
-  return ToString(info);
+  return ToJson(info);
 }
-const string Graph::ToString(const map<string, string>& info) const {
+
+const string Graph::ToJson(const map<string, string>& info) const {
   map<string, int> nodeid;
-  string disp="{\"directed\":1,\n";
+  string disp = "{\"directed\":1,\n";
 
   // add nodes
-  disp+="\"nodes\":[\n";
-  bool first=true;
+  disp += "\"nodes\":[\n";
+  bool first = true;
 
-  vector<string> colors={"red", "blue", "black", "green"};
+  vector<string> colors = {"red", "blue", "black", "green"};
   // see for more shapes at http://www.graphviz.org/doc/info/shapes.html
-  vector<string> shapes={"box", "ellipse"};
-  int id=0;
-  for(auto node: nodes_){
+  vector<string> shapes = {"box", "ellipse"};
+  int id = 0;
+  for (auto node : nodes_) {
     char str[1024];
-    string name=node->name();
-    string color=colors[(node->val().partitionid)%colors.size()];
+    string name = node->name;
+    string color = colors[(node->partition_id)%colors.size()];
     string shape;
-    string origin=node->val().origin;
-    if(origin=="kSlice"||origin=="kConcate"||origin=="kSplit"
-        ||origin=="kBridgeSrc"||origin=="kBridgeDst")
-      shape=shapes[1];
+    string origin = node->origin;
+    if (origin.find("##") != string::npos)
+      shape = shapes[1];
     else
-      shape=shapes[0];
-    sprintf(str, "{\"id\":\"%s%s\", \"color\":\"%s\",\"shape\":\"%s\"}\n",
-        name.c_str(), info.find(name)!=info.end()?info.at(name).c_str():"",
+      shape = shapes[0];
+    snprintf(str, sizeof(str),
+        "{\"id\":\"%s%s\", \"color\":\"%s\",\"shape\":\"%s\"}\n", name.c_str(),
+        info.find(name) != info.end() ? info.at(name).c_str() : "",
         color.c_str(), shape.c_str());
-    if(!first)
-      disp+=",";
+    if (!first)
+      disp += ",";
     else
-      first=false;
-    disp+=string(str);
-    nodeid[name]=id++;
+      first = false;
+    disp += string(str);
+    nodeid[name] = id++;
   }
-  disp+="]\n,";
+  disp += "]\n,";
 
   // add edges
-  disp+="\"links\":[\n";
-  first=true;
-  for(auto src: nodes_)
-    for(auto dst: src->dstnodes()){
-    char str[1024];
-    sprintf(str, "{\"source\":%d, \"target\":%d, \"color\":\"%s\"}\n",
-        nodeid[src->name()], nodeid[dst->name()], "black");
-    if(!first)
-      disp+=",";
-    else
-      first=false;
-    disp+=string(str);
+  disp += "\"links\":[\n";
+  first = true;
+  for (auto src : nodes_) {
+    for (auto dst : src->dstnodes) {
+      char str[1024];
+      snprintf(str, sizeof(str),
+          "{\"source\":%d, \"target\":%d, \"color\":\"%s\"}\n",
+          nodeid[src->name], nodeid[dst->name], "black");
+      if (!first)
+        disp += ",";
+      else
+        first = false;
+      disp += string(str);
+    }
   }
-  disp+="]\n";
+  disp += "]\n";
   return disp+"}";
 }
-bool Graph::Check() const {
-  return true;
-}
-
-
-// visited all dst nodes and then push current node into the stack
-void Graph::topology_sort_inner(SNode node,
-    map<string, bool> *visited,
-    std::stack<string> *stack) {
-  (*visited)[node->name()] = true;
-  const vector<SNode>& dstnodes=node->dstnodes();
-  for (auto it=dstnodes.rbegin();it!=dstnodes.rend();it++) {
-    if ((*visited)[(*it)->name()])
-      continue;
-    topology_sort_inner((*it),visited, stack);
-  }
-  stack->push(node->name());
-}
 
 // sort to make `bottom' nodes be placed in the front positions
 void Graph::Sort() {
-  SNode start=nullptr;
-  map<string, bool> visited;
-  for(auto node: nodes_){
-    if(node->srcnodes().size()==0){
-      CHECK(start==nullptr);
-      start=node;
+  // nodes to be visited
+  std::queue<Node*> visiting_nodes;
+  // visited node set
+  std::unordered_set<Node*> visited_set;
+  // visiting_nodes + visited_set
+  std::unordered_set<Node*> visit_set;
+  for (auto node : nodes_) {
+    // visit nodes without source nodes first
+    if (node->srcnodes.size() == 0) {
+      visiting_nodes.push(node);
+      visit_set.insert(node);
     }
-    visited[node->name()]=false;
   }
-  int n=nodes_.size();
-  std::unordered_set<SNode> pushed;
-  std::queue<SNode> tmp;
-  tmp.push(start);
-  pushed.insert(start);
+  int n = nodes_.size();
   nodes_.clear();
-  while(!tmp.empty()){
-    auto node=tmp.front();
-    tmp.pop();
-    bool visit=true;
-    for(auto src: node->srcnodes())
-      if(visited[src->name()]==false){
-        visit=false;
+  while (!visiting_nodes.empty()) {
+    auto node = visiting_nodes.front();
+    visiting_nodes.pop();
+    bool visit = true;
+    for (auto src : node->srcnodes) {
+      // visit this node only if all source nodes have been visited
+      if (visited_set.find(src) == visited_set.end()) {
+        visit = false;
         break;
       }
-    if(visit){
+    }
+    if (visit) {
       nodes_.push_back(node);
-      visited[node->name()]=true;
-      for(auto dst: node->dstnodes()){
-        if(pushed.find(dst) == pushed.end()){
-          tmp.push(dst);
-          pushed.insert(dst);
+      visited_set.insert(node);
+      for (auto dst : node->dstnodes) {
+        // queue the dst node if it has not been queued before
+        if (visit_set.find(dst) == visit_set.end()) {
+          visiting_nodes.push(dst);
+          visit_set.insert(dst);
         }
       }
-    }else
-      tmp.push(node);
+    } else {
+      visiting_nodes.push(node);
+    }
   }
   CHECK_EQ(nodes_.size(), n);
 }
 
-
-SNode Graph::InsertSliceNode(SNode srcnode, const vector<SNode>& dstnodes,
-    const V& info, bool connect_dst){
-  V myinfo=info;
-  myinfo.origin="kSlice";
-  SNode node=AddNode("slice-"+srcnode->name(),myinfo);
-  AddEdge(srcnode, node);
-  if(connect_dst)
-    for(SNode dst: dstnodes)
-      AddEdge(node, dst);
-  return node;
-}
-SNode Graph::InsertConcateNode(const vector<SNode>&srcnodes, SNode dstnode,
-    const V& info){
-  V myinfo=info;
-  myinfo.origin="kConcate";
-  SNode node=AddNode("concate-"+dstnode->name(),myinfo);
-  AddEdge(node, dstnode);
-  for(SNode src: srcnodes)
-    AddEdge(src, node);
-  return node;
-}
-SNode Graph::InsertSplitNode(SNode srcnode, const vector<SNode>& dstnodes){
-  V myinfo=srcnode->val();
-  myinfo.origin="kSplit";
-  SNode node=AddNode("split-"+srcnode->name(), myinfo);
-  AddEdge(srcnode, node);
-  for(SNode dst: dstnodes)
-    AddEdge(node, dst);
-  return node;
-}
-std::pair<SNode, SNode> Graph::InsertBridgeNode(SNode srcnode, SNode dstnode){
-  LayerInfo info=srcnode->val();
-  info.origin="kBridgeSrc";
-  SNode src=AddNode("s-"+srcnode->name()+"-"+dstnode->name(), info);
-  info=dstnode->val();
-  info.origin="kBridgeDst";
-  SNode dst=AddNode("d-"+srcnode->name()+"-"+dstnode->name(), info);
-  AddEdge(srcnode, src);
-  AddEdge(src, dst);
-  AddEdge(dst, dstnode);
-  return pair<SNode, SNode>{src, dst};
-}
-
-
+}  // namespace singa
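
For reference, a minimal sketch of how the cleaned-up Graph API above might be driven. The calling NeuralNet code is not part of this hunk, so the driver below is illustrative only and its function name is hypothetical:

  #include <glog/logging.h>
  #include "utils/graph.h"

  void BuildToyGraph() {
    singa::Graph graph;
    // nodes are identified by name; AddNode(name) creates and registers a Node
    graph.AddNode("data");
    graph.AddNode("hidden");
    graph.AddNode("loss");
    // edges can be added by node name once both endpoints exist
    graph.AddEdge("data", "hidden");
    graph.AddEdge("hidden", "loss");
    // topology sort: nodes without source nodes are placed at the front
    graph.Sort();
    // dump the sorted graph as JSON for visualization
    LOG(INFO) << graph.ToJson();
  }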

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/9a6e09fa/src/utils/param.cc
----------------------------------------------------------------------
diff --git a/src/utils/param.cc b/src/utils/param.cc
index 02d80a1..24a0541 100644
--- a/src/utils/param.cc
+++ b/src/utils/param.cc
@@ -27,8 +27,7 @@ void Param::AddSlice(int slice_id, int size){
     //must be added in order
     CHECK_EQ(slice_start_+num_slices_, slice_id);
     offset=slice_offset_.back()+slice_size_.back();
-  }
-  else{
+  } else {
     slice_start_=slice_id;
     offset=0;
   }

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/9a6e09fa/src/utils/updater.cc
----------------------------------------------------------------------
diff --git a/src/utils/updater.cc b/src/utils/updater.cc
index 80e3619..8e949ef 100644
--- a/src/utils/updater.cc
+++ b/src/utils/updater.cc
@@ -64,7 +64,7 @@ void SGDUpdater::Init(const UpdaterProto& proto){
   weight_decay_=proto.weight_decay();
 }
 
-void SGDUpdater::Update(int step, shared_ptr<Param> param, float grad_scale){
+void SGDUpdater::Update(int step, Param* param, float grad_scale){
   Shape<1> s=Shape1(param->size());
   Tensor<cpu, 1> data(param->mutable_cpu_data(), s);
   Tensor<cpu, 1> grad(param->mutable_cpu_grad(), s);
@@ -92,7 +92,7 @@ void NesterovUpdater::Init(const UpdaterProto& proto){
   weight_decay_=proto.weight_decay();
 }
 
-void NesterovUpdater::Update(int step, shared_ptr<Param> param, float grad_scale){
+void NesterovUpdater::Update(int step, Param* param, float grad_scale){
   Shape<1> s=Shape1(param->size());
   Tensor<cpu, 1> data(param->mutable_cpu_data(), s);
   Tensor<cpu, 1> grad(param->mutable_cpu_grad(), s);
@@ -118,7 +118,7 @@ void AdaGradUpdater::Init(const UpdaterProto& proto){
   weight_decay_=proto.weight_decay();
 }
 
-void AdaGradUpdater::Update(int step, shared_ptr<Param> param, float grad_scale){
+void AdaGradUpdater::Update(int step, Param* param, float grad_scale){
   Shape<1> s=Shape1(param->size());
   Tensor<cpu, 1> data(param->mutable_cpu_data(), s);
   Tensor<cpu, 1> grad(param->mutable_cpu_grad(), s);
@@ -143,7 +143,7 @@ void RMSPropUpdater::Init(const UpdaterProto& proto){
   weight_decay_=proto.weight_decay();
 }
 
-void RMSPropUpdater::Update(int step, shared_ptr<Param> param, float grad_scale){
+void RMSPropUpdater::Update(int step, Param* param, float grad_scale){
   Shape<1> s=Shape1(param->size());
   Tensor<cpu, 1> data(param->mutable_cpu_data(), s);
   Tensor<cpu, 1> grad(param->mutable_cpu_grad(), s);
@@ -166,7 +166,7 @@ void AdaDeltaUpdater::Init(const UpdaterProto& proto){
   weight_decay_=proto.weight_decay();
 }
 
-void AdaDeltaUpdater::Update(int step, shared_ptr<Param> param, float grad_scale){
+void AdaDeltaUpdater::Update(int step, Param* param, float grad_scale){
   Shape<1> s=Shape1(param->size());
   Tensor<cpu, 1> data(param->mutable_cpu_data(), s);
   Tensor<cpu, 1> grad(param->mutable_cpu_grad(), s);
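
The signature change above replaces shared_ptr<Param> with a raw Param*, so an update loop now passes the pointers returned by Layer::GetParams() directly. A hedged sketch of such a call site; the worker code is not shown in this commit, so the surrounding names (updater, layer, step) are illustrative:

  // `updater` is any Updater subclass (e.g., SGDUpdater) that has been Init()-ed;
  // `layer` is a Layer* whose parameters should be updated at this step
  for (singa::Param* param : layer->GetParams()) {
    updater->Update(step, param, 1.0f);  // grad_scale = 1.0f leaves the gradient unscaled
  }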


[3/3] incubator-singa git commit: SINGA-29 Update NeuralNet class to enable customizing layer partition type

Posted by wa...@apache.org.
SINGA-29 Update NeuralNet class to enable customizing layer partition type

1. Clean the code for NeuralNet and Graph classes.
   The Graph class only provides functions for Node and Edge management, e.g., add, remove and topology sort.
   NeuralNet provides one function (CreateGraph) to convert the net configuration into a Graph. Net partitioning
   is done in the CreateGraph function. The CreateNetFromGraph function creates and connects layers from the graph.
2. Users can customize the partitioning for the whole net and for a specific layer through the partition_dim field of NetProto and LayerProto;
   the configuration in LayerProto overrides that of NetProto (see the sketch below).
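
A hedged sketch of how this per-layer override could look when building the configuration programmatically. The partition_dim values follow the semantics documented in base_layer.h below (0 = partition the mini-batch, 1 = partition the feature vector); the exact accessors (add_layer, set_partition_dim) are assumptions based on the description above:

  #include "proto/model.pb.h"

  // minimal sketch, assuming NetProto keeps its layers in a repeated `layer`
  // field and that both messages expose partition_dim as described above
  singa::NetProto net;
  net.set_partition_dim(0);            // net-wide default: partition the mini-batch
  singa::LayerProto* fc = net.add_layer();
  fc->set_name("fc1");
  fc->set_partition_dim(1);            // per-layer override: partition the feature vector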

Tested in a single process with non-distributed training, shared-memory Hogwild, and one worker group with 2 workers.
Tested with two processes for Downpour and distributed Hogwild. Downpour has performance similar to shared-memory Hogwild, while distributed Hogwild does not perform as well as non-distributed training.


Project: http://git-wip-us.apache.org/repos/asf/incubator-singa/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-singa/commit/9a6e09fa
Tree: http://git-wip-us.apache.org/repos/asf/incubator-singa/tree/9a6e09fa
Diff: http://git-wip-us.apache.org/repos/asf/incubator-singa/diff/9a6e09fa

Branch: refs/heads/master
Commit: 9a6e09fa2e56ea4c2563264c378ee9f3eb314acf
Parents: ea7cfea
Author: wang wei <wa...@comp.nus.edu.sg>
Authored: Sat Jul 11 14:31:54 2015 +0800
Committer: wang sheng <wa...@gmail.com>
Committed: Sun Jul 12 00:44:29 2015 +0800

----------------------------------------------------------------------
 include/neuralnet/base_layer.h | 493 +++++++++++-----------------
 include/neuralnet/layer.h      | 215 ++++---------
 include/neuralnet/neuralnet.h  | 152 ++++-----
 include/trainer/server.h       |   9 +-
 include/trainer/trainer.h      |  15 +-
 include/trainer/worker.h       |  23 +-
 include/utils/common.h         |  76 ++---
 include/utils/graph.h          | 201 +++++-------
 include/utils/param.h          |   2 +-
 include/utils/updater.h        |  12 +-
 src/neuralnet/base_layer.cc    | 191 +++++------
 src/neuralnet/layer.cc         | 435 ++++++++++++-------------
 src/neuralnet/neuralnet.cc     | 622 ++++++++++++++++--------------------
 src/proto/common.proto         |   7 +
 src/proto/model.proto          |  23 +-
 src/trainer/server.cc          |  12 +-
 src/trainer/trainer.cc         |  63 ++--
 src/trainer/worker.cc          | 110 +++----
 src/utils/common.cc            |  48 +++
 src/utils/graph.cc             | 294 +++++++++--------
 src/utils/param.cc             |   3 +-
 src/utils/updater.cc           |  10 +-
 22 files changed, 1313 insertions(+), 1703 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/9a6e09fa/include/neuralnet/base_layer.h
----------------------------------------------------------------------
diff --git a/include/neuralnet/base_layer.h b/include/neuralnet/base_layer.h
index 8b5b1bf..047e43d 100644
--- a/include/neuralnet/base_layer.h
+++ b/include/neuralnet/base_layer.h
@@ -1,14 +1,11 @@
-#ifndef INCLUDE_BASE_LAYER_H_
-#define INCLUDE_BASE_LAYER_H_
+#ifndef SINGA_NEURALNET_BASE_LAYER_H_
+#define SINGA_NEURALNET_BASE_LAYER_H_
 
 #include <vector>
 #include <string>
 #include <map>
-#include <functional>
 #include <utility>
 #include <memory>
-#include <chrono>
-#include <algorithm>
 #include <thread>
 
 #include "proto/model.pb.h"
@@ -17,162 +14,110 @@
 #include "utils/common.h"
 #include "utils/blob.h"
 
+namespace singa {
+
 using std::vector;
-using std::shared_ptr;
-using std::make_shared;
 using std::string;
 using std::map;
 
-namespace singa{
 
 class Layer;
-typedef shared_ptr<Layer> SLayer;
 /**
  * Base layer class.
- * Children should implement at least Layer::Setup, Layer::ComputeFeature(),
- * Layer::ComputGradient() functions for backpropagation method;
- * TODO(zhaojing) subclass the base layer class to support contrastive divergence,
- * The identifier of each layer is the literal string of the class name without
- * the suffix "Layer", which is used in layer registration and creation.
+ *
+ * Children should implement at least
+ * Layer::ComputeFeature() and Layer::ComputeGradient()
+ * functions for the contrastive-divergence/back-propagation algorithms.
  */
 class Layer {
  public:
-  Layer(){}
-  virtual ~Layer(){}
-  /**
-   * Layer initialization.
-   *
-   * It simply saves the proto configuation, most initializations are done by
-   * Setup().
-   *
-   * @param proto user defined layer configuration
-   */
-  virtual void Init(const LayerProto &proto);
-  /**
-   * Copy layer configuration from the other Layer, and use the shape argument
-   * to as its data shape.
-   */
-  void Init(const Layer& other, const vector<int>& shape);
-  /**
-   * TODO(wangsheng) Marshal layer properties and data into google protobuf
-   * object (i.e., snapshot).
-   *
-   * Parameters are marshalled separately into another object (i.e., model).
-   *
-   * @param layer_proto
-   * @param copyData if true marshal layer data, e.g., feature value
-   */
-  virtual void ToProto(LayerProto *layer_proto, bool copyData);
+  Layer() { }
+  virtual ~Layer() {}
   /**
    * Setup layer properties.
    *
    * Setup the shapes for data and parameters, also setup some properties
-   * based on the layer configuration and connected src layers.
+   * based on the layer configuration and connected layers.
    *
-   * @param srclayers layers connecting to this layer
-   */
-  virtual void Setup(const LayerProto& proto,
-      const vector<SLayer>& srclayers)=0;
-  /**
-   * \copydoc Setup(const LayerProto&, const vector<SLayer>&)
+   * @param proto layer configuration.
+   * @param npartitions num of total partitions of the original layer. This
+   * layer should be setup as one partition.
    */
-  virtual void Setup();
+  virtual void Setup(const LayerProto& proto, int npartitions = 1);
+
   /**
-   * Setup the layer properties except shape.
+   * Compute features of this layer based on connected layers.
    *
-   * The shape is already set and passed in to set other properties.
-   * properties are set according to shapes of itself and connected layers, and
-   * configuration. this should not change the current shape_(
-   * shape check is done outside the function).
+   * @param phase kTrain, kTest, kPositive, etc.
    */
-  virtual void SetupAfterPartition(const LayerProto& proto,
-      const vector<int> &shape,
-      const vector<SLayer>& srclayers)=0;
+  virtual void ComputeFeature(Phase phase, Metric* perf) = 0;
   /**
-   * \copybrief SetupAfterPartition(const LayerProto&, const vector<int> &,
-   * const vector<SLayer>& ).
-   */
-  virtual void SetupAfterPartition();
-  /**
-   * Layers that have paramters must overload this function.
+   * Compute gradients for parameters and connected layers.
    *
-   * @return parameters associated with this layer
+   * @param phase kTrain, kTest, kPositive, etc.
    */
-  virtual vector<shared_ptr<Param>> GetParams(){
-    return vector<shared_ptr<Param>>();
-  }
+  virtual void ComputeGradient(Phase phase) = 0;
+
   /**
-   * Compute features of this layer based on connected layers.
-   *
-   * Implement forward propagation for BP.
-   * TODO(zhaojing) Implement both postive phase and negative phase for CD.
+   * Print debug info about each layer, e.g., the norm of the feature vector
+   * and the norm of the parameters.
    *
-   * @param training true if in training phase
-   * @param srclayers layers connecting to this layer
+   * @param step training/test/validation step
+   * @param phase forward/backward/positive/negative...
+   * @return debug info about this layer.
    */
-  virtual void ComputeFeature(Phase phase, const vector<SLayer>& srclayers)=0;
+  const string DebugString(int step, Phase phase);
   /**
-   * \copybrief ComputeFeature(const vector<SLayer>& srclayers)
-   */
-  virtual void ComputeFeature(Phase phase);
-  /**
-   * Compute gradients for parameters and connecting layers.
-   *
-   * Implement backward propagation for BP.
-   * TODO(zhaojing) Calculate gradients for parameters for CD.
+   * Layers that have parameters must override this function.
    *
-   * @param srclayers layers connecting to this layer.
-   */
-  virtual void ComputeGradient(const vector<SLayer>& srclayers)=0;
-  /**
-   * \copybrief ComputeGradient(const vector<SLayer>& srclayers)
+   * @return parameters associated with this layer
    */
-  virtual void ComputeGradient();
+  virtual const vector<Param*> GetParams() const {
+    return vector<Param*> {};
+  }
   /**
-   * Decide on which dimension to do the partitioning.
+   * Return the connection type between one neuron of this layer and
+   * its source layer.
+   * Currently support two connection types: kOneToOne, and kOneToAll.
+   * kOneToOne indicates the neuron depends on only one neuron from src layer.
+   * kOneToAll indicates the neuron depends on all neurons from src layer.
+   * TODO support kOneToMany.
    *
-   * @mode kLayer, kData, kNone (no partition)
-   * @return the partition dimension, -1 for no partition
+   * @param k index of the source layer (currently only k = 0 is supported).
+   * @return connection type.
    */
-  virtual int partition_dimension() const {
-    int ret=0;
-    if(partition_type()==kLayerPartition)
-      ret= 1;
-    else if(partition_type()==kNone)
-      ret= -1;
-    return ret;
+  virtual ConnectionType src_neuron_connection(int k) const {
+    // CHECK_LT(k, srclayers_.size());
+    return kOneToOne;
   }
 
   /**
-   * Return connection type between two layers.
+   * Return the connection type of this layer and all dst layers.
    *
-   * Currently support two connections: kOneToOne, and kOneToAll.
-   * kOneToOne indicates the dst neuron depends on only one neuron from src
-   * layer. kOneToAll indicates the dst neuron depends on all neurons from src
-   * layer. TODO support kOneToMany.
+   * Currently support two connection types: kOneToOne, and kOneToMany.
+   * kOneToOne indicates the user implements the ComputeFeature and
+   * ComputeGradient functions considering only one dest layer. In this case,
+   * a SplitLayer will be added automatically to connect this layer with all
+   * dest layers.
+   * kOneToMany indicates the user has already considered multiple dest layers
+   * in the implementation.
+   * @return connection type; the default is kOneToOne.
    */
-  virtual ConnectionType connection_type(int k) const {
-    CHECK_LT(k, srclayers_.size());
+  virtual ConnectionType dst_layer_connection() const {
     return kOneToOne;
   }
   /**
-   * @return partition type of this layer, e.g., kNone, kLayer or kData.
+   * @return partition dimension of this layer.
+   * -1 for no partition;
+   *  0 for partitioning the mini-batch into sub-mini-batches;
+   *  1 for partitioning the layer feature vector into sub-vectors.
    */
-  virtual PartitionType partition_type() const {
-    return layer_proto_.partition_type();
+  virtual int partition_dim() const {
+    return layer_proto_.partition_dim();
   }
-  /**
-   * partition id is the ID of the layer in the original layer.
-   */
-  virtual void set_partitionid(int id){
-    layer_proto_.set_partitionid(id);
-  }
-  virtual int partitionid() const {
-    return layer_proto_.partitionid();
-  }
-  virtual void set_name(string name){
-    name_=name;
-    layer_proto_.set_name(name);
+
+  virtual int partition_id() const {
+    return layer_proto_.partition_id();
   }
   virtual int type() const {
     return layer_proto_.type();
@@ -187,22 +132,18 @@ class Layer {
    * @return name of src data blob, used by prefetch layer to locate the data
    * blob in parser layers; The default value is "unknown"; If the
    * src layer is the prefetch layer and there are more than one parser layers,
-   * this value value be set.
+   * this value should be set.
    */
   const std::string &datablob() const {
     return layer_proto_.datablob();
   }
-  const vector<int>& shape(const Layer* layer) const{
-    return data(layer).shape();
-  }
-
   /**
    * @return a const ref for Blob storing neuron values of this layer for BP
    */
   virtual const Blob<float>& data(const Layer* from) const {
     return data_;
   }
-  virtual Blob<float>* mutable_data(const Layer* from){
+  virtual Blob<float>* mutable_data(const Layer* from) {
     return &data_;
   }
 
@@ -215,37 +156,36 @@ class Layer {
   virtual Blob<float>* mutable_grad(const Layer* from) {
     return &grad_;
   }
-
   /**
    * return LayerS that connected to this layer
    */
-  virtual const vector< SLayer> srclayers() const {
+  virtual const vector<Layer*> srclayers() const {
     return srclayers_;
   }
   /**
    * return LayerS that this layer connected to
    */
-  virtual const vector<SLayer> dstlayers() const {
+  virtual const vector<Layer*> dstlayers() const {
     return dstlayers_;
   }
 
-  virtual const int srclayers_size() const {
+  virtual int srclayers_size() const {
     return srclayers_.size();
   }
-  virtual const int dstlayers_size() const {
+  virtual int dstlayers_size() const {
     return dstlayers_.size();
   }
-  virtual void ClearDstLayers() {
+  virtual void clear_dstlayers() {
     dstlayers_.clear();
   }
-  virtual void ClearSrcLayers() {
+  virtual void clear_srclayers() {
     srclayers_.clear();
   }
 
-  virtual void AddSrcLayer(SLayer src){
+  virtual void add_srclayer(Layer* src) {
     srclayers_.push_back(src);
   }
-  virtual void AddDstLayer(SLayer dst){
+  virtual void add_dstlayer(Layer* dst) {
     dstlayers_.push_back(dst);
   }
 
@@ -264,11 +204,11 @@ class Layer {
   virtual bool is_bridgedstlayer() const {
     return false;
   }
-protected:
-  string name_;
-  Blob<float> data_, grad_;
+
+ protected:
   LayerProto layer_proto_;
-  vector<SLayer> srclayers_, dstlayers_;
+  Blob<float> data_, grad_;
+  vector<Layer*> srclayers_, dstlayers_;
 };
 
 /**
@@ -277,42 +217,44 @@ protected:
  */
 class BridgeSrcLayer: public Layer {
  public:
-  using Layer::Setup;
   using Layer::ComputeFeature;
   using Layer::ComputeGradient;
+  using Layer::data;
+  using Layer::mutable_data;
+  using Layer::grad;
+  using Layer::mutable_grad;
+  using Layer::is_bridgesrclayer;
 
-  virtual void Setup(const LayerProto& proto, const vector<SLayer>& srclayers);
-  virtual void SetupAfterPartition();
-  virtual void SetupAfterPartition(const LayerProto& proto,
-      const vector<int> &shape,
-      const vector<SLayer>& srclayers){}
+  void ComputeFeature(Phase phase, Metric* perf) override {}
+  void ComputeGradient(Phase phase) override {
+    ready_ = false;
+  }
 
-  virtual void ComputeFeature(Phase phase, const vector<SLayer>& srclayers);
-  virtual void ComputeGradient(const vector<SLayer>& srclayers);
-  virtual const Blob<float>& data(const Layer* from) const {
+  const Blob<float>& data(const Layer* from) const override {
     return srclayers_[0]->data(this);
   }
-  virtual Blob<float>* mutable_data(const Layer* from){
+  Blob<float>* mutable_data(const Layer* from) override {
     return srclayers_[0]->mutable_data(this);
   }
-
-  virtual const Blob<float>& grad(const Layer* from) const {
+  const Blob<float>& grad(const Layer* from) const override {
     return srclayers_[0]->grad(this);
   }
-  virtual Blob<float>* mutable_grad(const Layer* from) {
+  Blob<float>* mutable_grad(const Layer* from) override {
     return srclayers_[0]->mutable_grad(this);
   }
-  int dst_partition() const;
-  virtual bool is_bridgesrclayer() const {
+
+  bool is_bridgesrclayer() const override {
     return true;
   }
-  virtual void set_ready(bool a) {
-    ready_=a;
+  void set_ready(bool a) {
+    ready_ = a;
   }
-  virtual bool ready() const {
+  bool ready() const {
     return ready_;
   }
+
  protected:
+  //!< true if received grad from BridgeDstLayer
   bool ready_;
 };
 /**
@@ -321,30 +263,26 @@ class BridgeSrcLayer: public Layer {
  */
 class BridgeDstLayer: public Layer {
  public:
-  using Layer::Setup;
   using Layer::ComputeFeature;
   using Layer::ComputeGradient;
 
-  virtual void Setup(const LayerProto& proto, const vector<SLayer>& srclayers);
-  virtual void SetupAfterPartition();
-  virtual void SetupAfterPartition(const LayerProto& proto,
-      const vector<int> &shape,
-      const vector<SLayer>& srclayers){}
-
-  virtual void ComputeFeature(Phase phase, const vector<SLayer>& srclayers){
-    ready_=false;
+  void Setup(const LayerProto& proto, int npartitions) override;
+  void ComputeFeature(Phase phase, Metric* perf) override {
+    // reset ready_ for next iteration.
+    ready_ = false;
   }
-  virtual void ComputeGradient(const vector<SLayer>& srclayers){}
-  virtual bool is_bridgedstlayer() const {
+  void ComputeGradient(Phase phase) override {}
+  bool is_bridgedstlayer() const {
     return true;
   }
-  virtual void set_ready(bool a) {
-    ready_=a;
+  void set_ready(bool ready) {
+    ready_ = ready;
   }
-  virtual bool ready() const {
+  bool ready() const {
     return ready_;
   }
  protected:
+  //!< true if received data from BridgeSrcLayer
   bool ready_;
 };
 
@@ -353,71 +291,52 @@ class BridgeDstLayer: public Layer {
  */
 class ConcateLayer: public Layer {
  public:
-  using Layer::Setup;
   using Layer::ComputeFeature;
   using Layer::ComputeGradient;
 
-  virtual void Setup(const LayerProto& proto, const vector<SLayer>& srclayers);
-  virtual void SetupAfterPartition();
-  virtual void SetupAfterPartition(const LayerProto& proto,
-      const vector<int> &shape,
-      const vector<SLayer>& srclayers){}
-
-  virtual void ComputeFeature(Phase phase, const vector<shared_ptr<Layer>>& srclayers);
-  virtual void ComputeGradient(const vector<shared_ptr<Layer>>& srclayers);
+  void Setup(const LayerProto& proto, int npartitions) override;
+  void ComputeFeature(Phase phase, Metric* perf) override;
+  void ComputeGradient(Phase phase) override;
 };
 
-
 /**
  * Base layer for reading records from local Shard, HDFS, lmdb, etc.
- * Cannot be partitioned, always returns kNone for partition type.
  */
-
 class DataLayer: public Layer{
  public:
-  using Layer::Setup;
-  using Layer::ComputeFeature;
   using Layer::ComputeGradient;
+  using Layer::mutable_data;
+  using Layer::mutable_grad;
+  using Layer::dst_layer_connection;
 
-  virtual void ComputeFeature(Phase phase, const vector<SLayer>& srclayers)=0;
-  virtual void Setup(const LayerProto& proto, const vector<SLayer>& srclayers)=0;
-  virtual bool is_datalayer() const {
+  void ComputeGradient(Phase phase) override {}
+  bool is_datalayer() const override {
     return true;
   }
-  virtual void ComputeGradient(const vector<SLayer>& srclayers){};
-  virtual const vector<Record>& records() const {
-    return records_;
-  }
-  virtual void Setup(){
-    vector<SLayer> dummy;
-    Setup(layer_proto_,dummy);
-    has_setup_=true;
+  Blob<float>* mutable_data(const Layer* layer) override {
+    return nullptr;
   }
-  virtual void SetupAfterPartition(const LayerProto& proto,
-      const vector<int> &shape,
-      const vector<SLayer>& srclayers){}
-
-  virtual void SetupAfterPartition(){
-    if(!has_setup_)
-    Setup();
+  Blob<float>* mutable_grad(const Layer* layer) override {
+    return nullptr;
   }
-  virtual PartitionType partition_type () const {
-    return kNone;
+  ConnectionType dst_layer_connection() const override {
+    return kOneToMany;
   }
 
-  virtual int batchsize() const=0;
+  int batchsize() const {
+    return batchsize_;
+  }
   virtual const Record& sample() const {
     return sample_;
   }
-
-  virtual Blob<float>* mutable_data(const Layer* layer) {
-    return nullptr;
-  }
-  virtual Blob<float>* mutable_grad(const Layer* layer) {
-    return nullptr;
+  /**
+   * @return the loaded records
+   */
+  virtual const vector<Record>& records() const {
+    return records_;
   }
+
  protected:
-  bool has_setup_;
   int random_skip_, batchsize_;
   Record sample_;
   vector<Record> records_;
@@ -432,36 +351,29 @@ class DataLayer: public Layer{
  */
 class PrefetchLayer : public Layer {
  public:
-  using Layer::Setup;
   using Layer::ComputeFeature;
   using Layer::ComputeGradient;
-  using Layer::SetupAfterPartition;
 
-  virtual ~PrefetchLayer();
-  virtual void Setup(const LayerProto& proto, const vector<SLayer>& srclayers);
-  virtual void ComputeFeature(Phase phase, const vector<SLayer>& srclayers);
-  virtual void ComputeGradient(const vector<SLayer>& srclayers){};
-  virtual void SetupAfterPartition(const LayerProto& proto,
-      const vector<int> &shape,
-      const vector<SLayer>& srclayers){}
+  void Setup(const LayerProto& proto, int npartitions) override;
+  void ComputeFeature(Phase phase, Metric* perf) override;
+  void ComputeGradient(Phase phase) override {};
 
-  virtual const Blob<float>& data(const Layer* from) const ;
-  virtual Blob<float>* mutable_data(const Layer* layer) ;
+  const Blob<float>& data(const Layer* from) const override;
+  Blob<float>* mutable_data(const Layer* layer) override;
 
-  virtual Blob<float>* mutable_grad(const Layer* layer){
+  Blob<float>* mutable_grad(const Layer* layer) override {
     return nullptr;
   }
-  virtual const Blob<float>& grad(const Layer* from) const {
-    CHECK(false)<<"Loss layer has not gradient blob";
+  const Blob<float>& grad(const Layer* from) const override {
+    CHECK(false) << "Prefetch layer has no gradient blob";
     return grad_;
   }
-  virtual PartitionType partition_type () const {
-    return kNone;
-  }
 
   void Prefetch(Phase phase);
+  virtual ~PrefetchLayer();
+
  protected:
-  vector<shared_ptr<Layer>> sublayers_;
+  vector<Layer*> sublayers_;
   map<string, Blob<float>> datablobs_;
   std::thread thread_;
 };
@@ -471,46 +383,46 @@ class PrefetchLayer : public Layer {
  */
 class SliceLayer: public Layer {
  public:
-  using Layer::Setup;
   using Layer::ComputeFeature;
   using Layer::ComputeGradient;
 
-  virtual void ComputeFeature(Phase phase, const vector<shared_ptr<Layer>>& srclayers);
-  virtual void ComputeGradient(const vector<shared_ptr<Layer>>& srclayers);
-  virtual void Setup(const LayerProto& proto, const vector<SLayer>& srclayers);
-  virtual void SetupAfterPartition();
-  virtual void SetupAfterPartition(const LayerProto& proto,
-      const vector<int> &shape,
-      const vector<SLayer>& srclayers){}
-
-  virtual const Blob<float>& data(const Layer* layer) const;
-  virtual const Blob<float>& grad(const Layer* layer) const;
-  virtual Blob<float>* mutable_data(const Layer* layer);
-  virtual Blob<float>* mutable_grad(const Layer* layer);
+  void Setup(const LayerProto& proto, int npartitions) override;
+  void ComputeFeature(Phase phase, Metric* perf) override;
+  void ComputeGradient(Phase phase) override;
+  ConnectionType dst_layer_connection() const override {
+    return kOneToMany;
+  }
+  const Blob<float>& data(const Layer* layer) const override;
+  const Blob<float>& grad(const Layer* layer) const override;
+  Blob<float>* mutable_data(const Layer* layer) override;
+  Blob<float>* mutable_grad(const Layer* layer) override;
+
  protected:
   int SliceID(const Layer* layer) const;
+
+ private:
   vector<Blob<float>> datavec_, gradvec_;
   int slice_dim_, slice_num_;
 };
 
 /**
- * Replciate this layer into multiple dst layers
- * TODO change name to ReplicateLayer.
+ * Connect the source layer with multiple dst layers.
+ * Pass source layer's data blob directly to dst layers.
+ * Aggregate dst layer's gradients into source layer's gradient.
  */
 class SplitLayer: public Layer {
  public:
-  using Layer::Setup;
   using Layer::ComputeFeature;
   using Layer::ComputeGradient;
 
-  virtual void Setup(const LayerProto& proto, const vector<SLayer>& srclayers);
-  virtual void SetupAfterPartition();
-  virtual void SetupAfterPartition(const LayerProto& proto,
-      const vector<int> &shape,
-      const vector<SLayer>& srclayers){}
-
-  virtual void ComputeFeature(Phase phase, const vector<shared_ptr<Layer>>& srclayers);
-  virtual void ComputeGradient(const vector<shared_ptr<Layer>>& srclayers);
+  void Setup(const LayerProto& proto, int npartitions) override;
+  void ComputeFeature(Phase phase, Metric* perf) override;
+  void ComputeGradient(Phase phase) override;
+  ConnectionType dst_layer_connection() const override {
+    return kOneToMany;
+  }
+ protected:
+  Blob<float> grads_;
 };
 
 /**
@@ -518,28 +430,21 @@ class SplitLayer: public Layer {
  */
 class LossLayer: public Layer{
  public:
-  using Layer::Setup;
-  using Layer::SetupAfterPartition;
-
-  virtual void Setup(const LayerProto& proto,
-      const vector<SLayer>& srclayers)=0;
-  virtual void SetupAfterPartition(const LayerProto& proto,
-      const vector<int> &shape,
-      const vector<SLayer>& srclayers)=0;
+  using Layer::mutable_grad;
+  using Layer::grad;
+  using Layer::is_losslayer;
 
-  virtual Blob<float>* mutable_grad(const Layer* layer){
+  Blob<float>* mutable_grad(const Layer* layer) override {
     return nullptr;
   }
-  virtual const Blob<float>& grad(const Layer* from) const {
-    CHECK(false)<<"Loss layer has not gradient blob";
+  const Blob<float>& grad(const Layer* from) const override {
+    CHECK(false) << "Loss layer has no gradient blob";
     return grad_;
   }
-  virtual bool is_losslayer() const {
+  bool is_losslayer() const override {
     return true;
   }
-  virtual const Blob<float>& metric() const {
-    return metric_;
-  }
+
  protected:
   Blob<float> metric_;
 };
@@ -549,56 +454,30 @@ class LossLayer: public Layer{
  */
 class ParserLayer: public Layer {
  public:
-  using Layer::Setup;
-  using Layer::SetupAfterPartition;
   using Layer::ComputeFeature;
   using Layer::ComputeGradient;
+  using Layer::is_parserlayer;
+  using Layer::mutable_grad;
+  using Layer::grad;
 
-  virtual void Setup(const LayerProto& proto,
-      const vector<SLayer>& srclayers)=0;
+  void ComputeFeature(Phase phase, Metric* perf) override;
+  void ComputeGradient(Phase phase) override {};
   /**
    * Parse records from DataLayer into blob.
-   * This function is called by
-   * ComputeFeature(Phase, const vector<SLayer>& srclayers)  or Prefetch(Phase).
    */
   virtual void ParseRecords(Phase phase, const vector<Record>& records,
-      Blob<float>* blob)=0;
-
-  virtual bool is_parserlayer() const {
+      Blob<float>* blob) = 0;
+  bool is_parserlayer() const override {
     return true;
   }
-
-  virtual void ComputeFeature(Phase phase, const vector<SLayer>& srclayers);
-  /**
-   * Dummy function. ParserLayer does not compute gradients.
-   */
-  virtual void ComputeGradient(const vector<SLayer>& srclayers){};
-  virtual void Setup(){
-    Setup(layer_proto_,srclayers_);
-    has_setup_=true;
-  }
-  virtual void SetupAfterPartition(){
-    if(!has_setup_)
-      Setup();
-  }
-  virtual void SetupAfterPartition(const LayerProto& proto,
-      const vector<int> &shape,
-      const vector<SLayer>& srclayers){}
-
-  virtual PartitionType partition_type () const{
-    return kNone;
-  }
-  virtual Blob<float>* mutable_grad(const Layer* layer) {
+  Blob<float>* mutable_grad(const Layer* layer) override {
     return nullptr;
   }
-  virtual const Blob<float>& grad(const Layer* from) const {
-    CHECK(false)<<"Parser layer has not gradient blob";
+  const Blob<float>& grad(const Layer* from) const override {
+    CHECK(false) << "Parser layer has no gradient blob";
     return grad_;
   }
-
- private:
-  bool has_setup_;
 };
-} // singa
+}  // namespace singa
 
-#endif // INCLUDE_BASE_LAYER_H_
+#endif  // SINGA_NEURALNET_BASE_LAYER_H_
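
A minimal sketch of a user-defined layer written against the refactored interface above. The class name and method bodies are hypothetical; only the overridden signatures and the protected members come from this header:

  #include "neuralnet/base_layer.h"

  namespace singa {

  // hypothetical example layer; real layers are also registered with the layer factory
  class ScaleLayer : public Layer {
   public:
    using Layer::ComputeFeature;
    using Layer::ComputeGradient;

    void Setup(const LayerProto& proto, int npartitions) override {
      Layer::Setup(proto, npartitions);
      // shape data_ and grad_ based on srclayers_[0]->data(this)
    }
    void ComputeFeature(Phase phase, Metric* perf) override {
      // forward pass: fill data_ from the source layer's feature blob
    }
    void ComputeGradient(Phase phase) override {
      // backward pass: write gradients into srclayers_[0]->mutable_grad(this)
    }
  };

  }  // namespace singa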

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/9a6e09fa/include/neuralnet/layer.h
----------------------------------------------------------------------
diff --git a/include/neuralnet/layer.h b/include/neuralnet/layer.h
index 48cffa2..b678e63 100644
--- a/include/neuralnet/layer.h
+++ b/include/neuralnet/layer.h
@@ -1,5 +1,7 @@
-#ifndef INCLUDE_NET_LAYER_H_
-#define INCLUDE_NET_LAYER_H_
+#ifndef SINGA_NEURALNET_LAYER_H_
+#define SINGA_NEURALNET_LAYER_H_
+
+#include <lmdb.h>
 
 #include <vector>
 #include <string>
@@ -9,13 +11,11 @@
 #include <memory>
 #include <chrono>
 #include <random>
-#include <lmdb.h>
 
 #include "proto/model.pb.h"
 #include "utils/data_shard.h"
 #include "neuralnet/base_layer.h"
 
-
 /**
  * \file this file includes the declarations of neuron layer classes that conduct
  * the transformation of features.
@@ -27,54 +27,39 @@ namespace singa {
  */
 class ConvolutionLayer: public Layer {
  public:
-  using Layer::Setup;
-  using Layer::SetupAfterPartition;
   using Layer::ComputeFeature;
   using Layer::ComputeGradient;
 
-  virtual void Setup(const LayerProto& proto,
-      const vector<SLayer>& srclayers);
-  /**
-   * need to reset some properties (e.g., weight matrix) according to
-   * shapes (after partition, e.g., partition is done against channel dimension)
-   */
-  virtual void SetupAfterPartition(const LayerProto& proto,
-      const vector<int> &shape,
-      const vector<SLayer>& srclayers);
-
-  virtual void ComputeFeature(Phase phase, const vector<shared_ptr<Layer>>& srclayers);
-  virtual void ComputeGradient(const vector<shared_ptr<Layer>>& srclayers);
-  virtual vector<shared_ptr<Param>> GetParams() {
-    return vector<shared_ptr<Param>>{weight_, bias_};
+  void Setup(const LayerProto& proto, int npartitions) override;
+  void ComputeFeature(Phase phase, Metric *perf) override;
+  void ComputeGradient(Phase phase) override;
+  const vector<Param*> GetParams() const override {
+    vector<Param*> params{weight_, bias_};
+    return params;
   }
-  virtual ConnectionType connection_type(int k) const {
-    CHECK_LT(k, srclayers_.size());
+  ConnectionType src_neuron_connection(int k) const  override {
+    // CHECK_LT(k, srclayers_.size());
     return kOneToAll;
   }
+  ~ConvolutionLayer();
+
  protected:
-  int kernel_, pad_,  stride_ ;
-  int batchsize_,  channels_, height_,width_;
+  int kernel_, pad_,  stride_;
+  int batchsize_,  channels_, height_, width_;
   int col_height_, col_width_, conv_height_, conv_width_, num_filters_;
-  shared_ptr<Param> weight_, bias_;
+  Param* weight_, *bias_;
   Blob<float> col_data_, col_grad_;
 };
 
 class DropoutLayer: public Layer {
  public:
-  using Layer::Setup;
-  using Layer::SetupAfterPartition;
   using Layer::ComputeFeature;
   using Layer::ComputeGradient;
 
-  virtual void Setup(const LayerProto& proto,
-      const vector<SLayer>& srclayers);
-
-  virtual void SetupAfterPartition(const LayerProto& proto,
-      const vector<int> &shape,
-      const vector<SLayer>& srclayers);
+  void Setup(const LayerProto& proto, int npartitions) override;
+  void ComputeFeature(Phase phase, Metric *perf) override;
+  void ComputeGradient(Phase phase) override;
 
-  virtual void ComputeFeature(Phase phase, const vector<shared_ptr<Layer>>& srclayers);
-  virtual void ComputeGradient(const vector<shared_ptr<Layer>>& srclayers);
  protected:
   // drop probability
   float pdrop_;
@@ -89,31 +74,23 @@ class DropoutLayer: public Layer {
   */
 class InnerProductLayer: public Layer {
  public:
-  using Layer::Setup;
-  using Layer::SetupAfterPartition;
   using Layer::ComputeFeature;
   using Layer::ComputeGradient;
 
-  virtual void Setup(const LayerProto& proto,
-      const vector<SLayer>& srclayers);
+  void Setup(const LayerProto& proto, int npartitions) override;
+  void ComputeFeature(Phase phase, Metric *perf) override;
+  void ComputeGradient(Phase phase) override;
 
-  /**
-   * need to reset weight matrix in case of LayerPartition
-   */
-  virtual void SetupAfterPartition(const LayerProto& proto,
-      const vector<int> &shape,
-      const vector<SLayer>& srclayers);
-  virtual ConnectionType connection_type(int k) const {
-    CHECK_LT(k, srclayers_.size());
+
+  ConnectionType src_neuron_connection(int k) const override {
+    // CHECK_LT(k, srclayers_.size());
     return kOneToAll;
   }
-
-  virtual void ComputeFeature(Phase phase, const vector<shared_ptr<Layer>>& srclayers);
-  virtual void ComputeGradient(const vector<shared_ptr<Layer>>& srclayers);
-  //virtual void ToProto(LayerProto *layer_proto, bool copyData);
-  virtual vector<shared_ptr<Param>> GetParams() {
-    return vector<shared_ptr<Param>>{weight_, bias_};
+  const vector<Param*> GetParams() const override {
+    vector<Param*> params{weight_, bias_};
+    return params;
   }
+  ~InnerProductLayer();
 
  private:
   //! dimension of the hidden layer
@@ -121,16 +98,16 @@ class InnerProductLayer: public Layer {
   //! dimension of the visible layer
   int vdim_;
   int batchsize_;
-  shared_ptr<Param> weight_, bias_;
+  Param* weight_, *bias_;
 };
 
 class LabelLayer: public ParserLayer {
  public:
-  using ParserLayer::Setup;
+  using ParserLayer::ParseRecords;
 
-  virtual void Setup(const LayerProto& proto, const vector<SLayer>& srclayers);
-  virtual void ParseRecords(Phase phase, const vector<Record>& records,
-      Blob<float>* blob);
+  void Setup(const LayerProto& proto, int npartitions) override;
+  void ParseRecords(Phase phase, const vector<Record>& records,
+      Blob<float>* blob) override;
 };
 
 class LRNLayer: public Layer {
@@ -142,22 +119,13 @@ class LRNLayer: public Layer {
  * a_i, the activation (after ReLU) of a neuron convolved with the i-th kernel.
  * b_i, the neuron after normalization, N is the total num of kernels
  */
-  using Layer::Setup;
-  using Layer::SetupAfterPartition;
   using Layer::ComputeFeature;
   using Layer::ComputeGradient;
 
- public:
-  virtual void Setup(const LayerProto& proto,
-      const vector<SLayer>& srclayers);
-
-  virtual void SetupAfterPartition(const LayerProto& proto,
-      const vector<int> &shape,
-      const vector<SLayer>& srclayers);
-
+  void Setup(const LayerProto& proto, int npartitions) override;
+  void ComputeFeature(Phase phase, Metric *perf) override;
+  void ComputeGradient(Phase phase) override;
 
-  virtual void ComputeFeature(Phase phase, const vector<shared_ptr<Layer>>& srclayers);
-  virtual void ComputeGradient(const vector<shared_ptr<Layer>>& srclayers);
  protected:
   //! shape of the bottom layer feature
   int batchsize_, channels_, height_, width_;
@@ -170,11 +138,11 @@ class LRNLayer: public Layer {
 
 class MnistLayer: public ParserLayer {
  public:
-  using Layer::Setup;
+  using ParserLayer::ParseRecords;
 
-  virtual void Setup(const LayerProto& proto, const vector<SLayer>& srclayers);
-  virtual void ParseRecords(Phase phase, const vector<Record>& records,
-      Blob<float>* blob);
+  void Setup(const LayerProto& proto, int npartitions) override;
+  void ParseRecords(Phase phase, const vector<Record>& records,
+      Blob<float>* blob) override;
 
  protected:
   // height and width of the image after deformation
@@ -182,47 +150,34 @@ class MnistLayer: public ParserLayer {
   // n^2 images are processed as a batch for elastic distortion
   // conv height and conv width
   // gauss kernel values, displacements, column image and tmp buffer
-  //float* gauss_, *displacementx_, *displacementy_, *colimg_, *tmpimg_;
+  // float* gauss_, *displacementx_, *displacementy_, *colimg_, *tmpimg_;
   float  gamma_, beta_, sigma_, kernel_, alpha_, norm_a_, norm_b_;
   int resize_, elastic_freq_;
 };
 
 class PoolingLayer: public Layer {
  public:
-  using Layer::Setup;
-  using Layer::SetupAfterPartition;
   using Layer::ComputeFeature;
   using Layer::ComputeGradient;
 
-  virtual void Setup(const LayerProto& proto,
-      const vector<SLayer>& srclayers);
-  virtual void SetupAfterPartition(const LayerProto& proto,
-      const vector<int> &shape,
-      const vector<SLayer>& srclayers);
-  virtual void ComputeFeature(Phase phase, const vector<shared_ptr<Layer>>& srclayers);
-  virtual void ComputeGradient(const vector<shared_ptr<Layer>>& srclayers);
+  void Setup(const LayerProto& proto, int npartitions) override;
+  void ComputeFeature(Phase phase, Metric *perf) override;
+  void ComputeGradient(Phase phase) override;
+
  protected:
   int kernel_, pad_, stride_;
-  int batchsize_,channels_, height_, width_, pooled_height_, pooled_width_;
+  int batchsize_, channels_, height_, width_, pooled_height_, pooled_width_;
   PoolingProto_PoolMethod pool_;
 };
 
 class ReLULayer: public Layer {
  public:
-  using Layer::Setup;
-  using Layer::SetupAfterPartition;
   using Layer::ComputeFeature;
   using Layer::ComputeGradient;
 
-
-  virtual void Setup(const LayerProto& proto,
-      const vector<SLayer>& srclayers);
-  virtual void SetupAfterPartition(const LayerProto& proto,
-      const vector<int> &shape,
-      const vector<SLayer>& srclayers);
-
-  virtual void ComputeFeature(Phase phase, const vector<shared_ptr<Layer>>& srclayers);
-  virtual void ComputeGradient(const vector<shared_ptr<Layer>>& srclayers);
+  void Setup(const LayerProto& proto, int npartitions = 1) override;
+  void ComputeFeature(Phase phase, Metric *perf) override;
+  void ComputeGradient(Phase phase) override;
 };
 
 
@@ -231,34 +186,26 @@ class SoftmaxLossLayer: public LossLayer {
    * connected from the label layer and the last fc layer
    */
  public:
-  using Layer::Setup;
-  using Layer::SetupAfterPartition;
   using Layer::ComputeFeature;
   using Layer::ComputeGradient;
 
-  virtual void Setup(const LayerProto& proto,
-      const vector<SLayer>& srclayers);
+  void Setup(const LayerProto& proto, int npartitions) override;
+  void ComputeFeature(Phase phase, Metric *perf) override;
+  void ComputeGradient(Phase phase) override;
 
-  virtual void SetupAfterPartition(const LayerProto& proto,
-      const vector<int> &shape,
-      const vector<SLayer>& srclayers);
   /**
    * softmax is not recommended for partition because it requires the whole
    * src layer for normalization.
    */
-  virtual PartitionType partition_type() const {
-    if(layer_proto_.partition_type()==kLayerPartition)
-      return kNone;
-    else
-      return layer_proto_.partition_type();
+  int partition_dim() const override {
+    CHECK_LE(layer_proto_.partition_dim(), 1);
+    return layer_proto_.partition_dim();
   }
-  virtual ConnectionType connection_type(int k) const {
-    CHECK_LT(k, srclayers_.size());
+  ConnectionType src_neuron_connection(int k) const override {
+    // CHECK_LT(k, srclayers_.size());
     return kOneToAll;
   }
 
-  virtual void ComputeFeature(Phase phase, const vector<shared_ptr<Layer>>& srclayers);
-  virtual void ComputeGradient(const vector<shared_ptr<Layer>>& srclayers);
  private:
   int batchsize_;
   int dim_;
@@ -268,11 +215,11 @@ class SoftmaxLossLayer: public LossLayer {
 
 class RGBImageLayer: public ParserLayer {
  public:
-  using Layer::Setup;
+  using ParserLayer::ParseRecords;
 
-  virtual void Setup(const LayerProto& proto, const vector<SLayer>& srclayers);
-  virtual void ParseRecords(Phase phase, const vector<Record>& records,
-      Blob<float>* blob);
+  void Setup(const LayerProto& proto, int npartitions) override;
+  void ParseRecords(Phase phase, const vector<Record>& records,
+      Blob<float>* blob) override;
 
  private:
   float scale_;
@@ -283,33 +230,21 @@ class RGBImageLayer: public ParserLayer {
 
 class ShardDataLayer: public DataLayer{
  public:
-  using Layer::Setup;
   using Layer::ComputeFeature;
-  using Layer::ComputeGradient;
 
-  virtual void ComputeFeature(Phase phase, const vector<shared_ptr<Layer>>& srclayers);
-  virtual void ComputeGradient(const vector<shared_ptr<Layer>>& srclayers){};
-  virtual void Setup(const LayerProto& proto, const vector<SLayer>& srclayers);
-  virtual int batchsize() const {
-    return layer_proto_.sharddata_conf().batchsize();
-  }
+  void Setup(const LayerProto& proto, int npartitions) override;
+  void ComputeFeature(Phase phase, Metric *perf) override;
  private:
   shared_ptr<DataShard> shard_;
 };
 class LMDBDataLayer: public DataLayer{
  public:
-  using Layer::Setup;
   using Layer::ComputeFeature;
-  using Layer::ComputeGradient;
 
-  virtual void ComputeFeature(Phase phase, const vector<shared_ptr<Layer>>& srclayers);
-  virtual void ComputeGradient(const vector<shared_ptr<Layer>>& srclayers){};
-  virtual void Setup(const LayerProto& proto, const vector<SLayer>& srclayers);
+  void Setup(const LayerProto& proto, int npartitions) override;
+  void ComputeFeature(Phase phase, Metric *perf) override;
   void ConvertDatumToSingleLableImageRecord(const Datum& datum,
     SingleLabelImageRecord* record);
-  virtual int batchsize() const {
-    return layer_proto_.lmdbdata_conf().batchsize();
-  }
  private:
   MDB_env* mdb_env_;
   MDB_dbi mdb_dbi_;
@@ -325,21 +260,13 @@ class LMDBDataLayer: public DataLayer{
  */
 class TanhLayer: public Layer {
  public:
-  using Layer::Setup;
-  using Layer::SetupAfterPartition;
   using Layer::ComputeFeature;
   using Layer::ComputeGradient;
 
-  virtual void Setup(const LayerProto& proto,
-      const vector<SLayer>& srclayers);
-
-  virtual void SetupAfterPartition(const LayerProto& proto,
-      const vector<int> &shape,
-      const vector<SLayer>& srclayers);
-
+  void Setup(const LayerProto& proto, int npartitions) override;
+  void ComputeFeature(Phase phase, Metric *perf) override;
+  void ComputeGradient(Phase phase) override;
 
-  virtual void ComputeFeature(Phase phase, const vector<shared_ptr<Layer>>& srclayers);
-  virtual void ComputeGradient(const vector<shared_ptr<Layer>>& srclayers);
  private:
   float outer_scale_, inner_scale_;
 };
@@ -347,4 +274,4 @@ class TanhLayer: public Layer {
 
 }  // namespace singa
 
-#endif  // INCLUDE_NET_LAYER_H_
+#endif  // SINGA_NEURALNET_LAYER_H_

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/9a6e09fa/include/neuralnet/neuralnet.h
----------------------------------------------------------------------
diff --git a/include/neuralnet/neuralnet.h b/include/neuralnet/neuralnet.h
index ec6797c..2e19d0c 100644
--- a/include/neuralnet/neuralnet.h
+++ b/include/neuralnet/neuralnet.h
@@ -1,159 +1,119 @@
-#ifndef INCLUDE_NET_NET_H_
-#define INCLUDE_NET_NET_H_
+#ifndef SINGA_NEURALNET_NEURALNET_H_
+#define SINGA_NEURALNET_NEURALNET_H_
 
-#include <glog/logging.h>
 #include <vector>
 #include <map>
 #include <memory>
+#include <string>
 
 #include "proto/model.pb.h"
 #include "neuralnet/layer.h"
 #include "utils/factory.h"
 #include "utils/graph.h"
 
+namespace singa {
 using std::vector;
 using std::string;
 using std::map;
 using std::shared_ptr;
-namespace singa {
+
 /**
- * The neural network is constructed from user configured layers through google
- * protocol buffer. TODO support constructing neural network by adding layers
- * explicitly. E.g., users create layers and connect them manually in the code.
+ * The neural network is constructed from user configurations in NetProto.
  *
- * Some layers, e.g., SplitLayer and BridgeSrcLayer/BridgeDstLayer will be added
- * implicitly to partition the neural network.
+ * Some layers, e.g., SplitLayer and BridgeSrcLayer/BridgeDstLayer
+ * will be added implicitly to partition the neural network.
+ * TODO create wrappers for popular models, e.g., MLP, CNN.
  */
 class NeuralNet {
  public:
   /**
-   * Register Layers
+   * Register Layers, i.e., map layer type to layer class
    */
   static void RegisterLayers();
   /**
-   * Setup the neural network for training, test or validation.
+   * Create the neural network for training, test or validation.
    *
    * Parameters for test/validation net can share those from training after
    * setup (done outside of this function).
    *
-   * @param np proto for the neural network.
+   * @param np proto for the neural network
    * @param phase test/training/validation
-   * @param group_size partition the net among this num of workers
+   * @param num num of partitions, do partitioning if num > 1
+   * @return shared pointer to a neural net
    */
-  static shared_ptr<NeuralNet> SetupNeuralNet(const NetProto& np, Phase phase,
-      int group_size);
+  static shared_ptr<NeuralNet> Create(const NetProto& np, Phase phase, int num);
 
  public:
   /**
    * construct the net structure from protocol buffer.
+   * @param netproto neural net config
+   * @param npartitions num of partitions. 1 for no partitioning.
    */
-  NeuralNet(NetProto net_proto, int group_size=1);
+  explicit NeuralNet(NetProto netproto, int npartitions = 1);
+  ~NeuralNet();
   /**
-   * construct a json string representing the neuralnet graph.
-   * The json string can be used by other graph engine to draw a figure for
-   * displaying the neuralnet structure.
-   */
-  std::string ToString();
-  /**
-   * Print Norm1 of data and grad of each Layer and parameter.
-   * @param net, neural network
-   */
-  string DebugInfo();
-
-  /**
-   * to display the adjacency layers
+   * Display the adjacency list of the layers
    */
   std::string ToAdjacency();
   /**
-   * Add layer explicitly used in manually programming/constructing neural net.
-   */
-  void AddLayer(const LayerProto &layer_proto){};
-  /**
-   * Add layer explicitly used in manually programming/constructing neural net.
-   */
-  void AddLayer(const Layer* layer){};
-  /**
-   * share weights from other neuralnet
+   * Share memory of parameter values from another neural net
    */
-  void ShareParams(shared_ptr<NeuralNet> other,int flag);
-  void ToProto(NetProto *net_proto, bool copyData=false);
-  const std::vector<shared_ptr<Layer>>& layers() {
+  void ShareParams(shared_ptr<NeuralNet> other);
+
+  const std::vector<Layer*>& layers() {
     return layers_;
   }
-  /**
-   * return ParserLayer of the neuralnet.
-   */
-  const std::vector<ParserLayer*>& parserlayers() {
-    if(parserlayers_.size()==0){
-      for(auto& layer: layers_)
-        if(layer->is_parserlayer())
-          parserlayers_.push_back(static_cast<ParserLayer*>(layer.get()));
-    }
+  const std::vector<ParserLayer*>& parserlayers() const {
     return parserlayers_;
   }
-  const std::vector<LossLayer*>& losslayers() {
-    if(losslayers_.size()==0){
-      for(auto& layer: layers_)
-        if(layer->is_losslayer())
-          losslayers_.push_back(static_cast<LossLayer*>(layer.get()));
-    }
+  const std::vector<LossLayer*>& losslayers() const {
     return losslayers_;
   }
-  const std::vector<DataLayer*>& datalayers() {
-    if(datalayers_.size()==0){
-      for(auto& layer: layers_)
-        if(layer->is_datalayer())
-          datalayers_.push_back(static_cast<DataLayer*>(layer.get()));
-    }
+  const std::vector<DataLayer*>& datalayers() const {
     return datalayers_;
   }
-  const std::vector<shared_ptr<Param>> &params()const {
+  const std::vector<Param*>& params() const {
     return params_;
   }
-  shared_ptr<Layer> name2layer(string name){
-    if (name2layer_.find(name)!=name2layer_.end())
-      return name2layer_[name];
-    else return nullptr;
+  Layer* name2layer(string name) const {
+    if (name2layer_.find(name) != name2layer_.end())
+      return name2layer_.at(name);
+    else
+      return nullptr;
   }
-
-  shared_ptr<Param> paramid2param(int id) {
-    if(paramid2param_.size()==0){
-      for(auto& layer: layers_){
-        for(shared_ptr<Param> p: layer->GetParams()){
-          paramid2param_[p->id()]=p;
-        }
-      }
-    }
-    return paramid2param_[id];
+  Param* paramid2param(int id) const {
+    return paramid2param_.at(id);
   }
 
  protected:
-  void ConstructNeuralNet(const NetProto &net_proto);
-  void PartitionNeuralNet();
-  map<string, shared_ptr<Layer>> GetNameToLayer(
-    const vector<shared_ptr<Layer>>& layers);
-  Graph CreatePartitonedGraph(const vector<shared_ptr<Layer>>& layers,
-    const map<string, shared_ptr<Layer>>& name2layer);
-
   /**
-   * Partition each layer according its partition type and dimension.
-   * @param layers original unpartitioned layers
+   * Create a neural net graph, one node for each layer.
+   *
+   * Partition the graph if npartitions > 1, each layer is sliced according to
+   * its own partition setting.
+   * @param netproto
+   * @param npartitions num of partitions
+   * @return neural net graph
+   */
+  Graph* CreateGraph(const NetProto& netproto, int npartitions);
+  /**
+   * Create neural net from graph, one layer per node.
+   */
+  void CreateNetFromGraph(Graph* graph, int npartitions);
+  /**
+   * prepare data structures, e.g., params_, layers_, etc.
    */
-  map<string, vector<shared_ptr<Layer>>> PartitionLayers(
-      const vector<shared_ptr<Layer>>& layers);
+  void PrepareDataStructures();
 
  protected:
-  vector<shared_ptr<Layer>> layers_;
+  vector<Layer*> layers_;
   vector<ParserLayer*> parserlayers_;
   vector<LossLayer*> losslayers_;
   vector<DataLayer*> datalayers_;
-  vector<shared_ptr<Param>> params_;
-  map<string, shared_ptr<Layer>> name2layer_;
-  map<int, shared_ptr<Param>> paramid2param_;
+  vector<Param*> params_;
 
-  map<string, LayerProto> name2layerproto_;
-  int group_size_;
-  Graph graph_;
+  map<string, Layer*> name2layer_;
+  map<int, Param*> paramid2param_;
 };
 }  // namespace singa
-#endif  // INCLUDE_NET_NET_H_
+#endif  // SINGA_NEURALNET_NEURALNET_H_
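
CreateGraph()/CreateNetFromGraph() above split net construction into building a (possibly partitioned) graph and then instantiating one layer per node. A self-contained sketch of the second step, assuming the nodes are already topologically sorted (Node and Layer below are simplified stand-ins, not the SINGA classes):

    #include <map>
    #include <memory>
    #include <string>
    #include <vector>

    struct Node {
      std::string name;
      std::vector<Node*> srcnodes;  // filled while building the graph
    };

    struct Layer {
      std::string name;
      std::vector<Layer*> srclayers;
    };

    // Walks the sorted node list and connects each new layer to the layers
    // already created for its source nodes, mirroring what CreateNetFromGraph()
    // is responsible for.
    std::vector<std::unique_ptr<Layer>> CreateLayers(const std::vector<Node*>& nodes) {
      std::vector<std::unique_ptr<Layer>> layers;
      std::map<std::string, Layer*> name2layer;
      for (Node* node : nodes) {
        auto layer = std::make_unique<Layer>();
        layer->name = node->name;
        for (Node* src : node->srcnodes)
          layer->srclayers.push_back(name2layer.at(src->name));
        name2layer[layer->name] = layer.get();
        layers.push_back(std::move(layer));
      }
      return layers;
    }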

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/9a6e09fa/include/trainer/server.h
----------------------------------------------------------------------
diff --git a/include/trainer/server.h b/include/trainer/server.h
index a8995fb..96a1437 100644
--- a/include/trainer/server.h
+++ b/include/trainer/server.h
@@ -9,7 +9,7 @@
 
 using std::shared_ptr;
 namespace singa {
-typedef std::unordered_map<int, shared_ptr<Param>> ServerShard;
+typedef std::unordered_map<int, Param*> ServerShard;
 /* Respond to worker's get/put/update requests, and periodically sync with
   * other servers.
   *
@@ -24,6 +24,7 @@ class Server{
  public:
 
   Server(int thread_id, int group_id, int server_id);
+  virtual ~Server() {};
   void Setup(const UpdaterProto& proto, shared_ptr<ServerShard> shard,
       const vector<int>& slice2group);
   void Run();
@@ -41,14 +42,14 @@ class Server{
    *
   * @return the original message or response message
    */
-	virtual Msg* HandleGet(shared_ptr<Param> param, Msg** msg);
+	virtual Msg* HandleGet(Param* param, Msg** msg);
 
 	/**
 	 * Process Update request.
    *
   * @return the original message or response message
    */
-	virtual Msg* HandleUpdate(shared_ptr<Param> param, Msg** msg);
+	virtual Msg* HandleUpdate(Param* param, Msg** msg);
 
 	/**
 	 * Process PUT request.
@@ -61,7 +62,7 @@ class Server{
 	/**
    * TODO Process SYNC request.
 	 */
-	virtual Msg* HandleSyncRequest(shared_ptr<Param> param, Msg** msg);
+	virtual Msg* HandleSyncRequest(Param* param, Msg** msg);
 
  protected:
   int thread_id_,group_id_, server_id_;
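
With ServerShard now holding raw Param* pointers, a request handler looks the parameter up by id and applies the update in place. A simplified, self-contained sketch of that flow (Msg, Param and the SGD step below are stand-ins chosen for illustration, not the SINGA types):

    #include <unordered_map>

    struct Param { int id = 0; float value = 0.f; };
    struct Msg { int param_id = 0; float grad = 0.f; };

    using ServerShard = std::unordered_map<int, Param*>;

    // Returns the response message; here the reply simply carries the fresh
    // parameter value back to the requesting worker.
    Msg* HandleUpdate(ServerShard* shard, Msg* msg, float lr) {
      Param* param = shard->at(msg->param_id);  // owned elsewhere, not by the shard
      param->value -= lr * msg->grad;           // stand-in for Updater::Update()
      msg->grad = param->value;
      return msg;
    }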

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/9a6e09fa/include/trainer/trainer.h
----------------------------------------------------------------------
diff --git a/include/trainer/trainer.h b/include/trainer/trainer.h
index c19a0ae..2419dc4 100644
--- a/include/trainer/trainer.h
+++ b/include/trainer/trainer.h
@@ -43,7 +43,7 @@ typedef struct HandleContext_{
   */
 class ParamInfo{
    public:
-  ParamInfo(shared_ptr<Param> p,int local, int owner):
+  ParamInfo(Param* p,int local, int owner):
     num_update(0), next_version(-1),num_local(local), num_total(1),
     owner_procs(owner){
       shares.push_back(p);
@@ -57,7 +57,7 @@ class ParamInfo{
     *  otherwise
    * @param owner the procs id of the worker who owns this Param object
     */
-  void AddParam(shared_ptr<Param> p, bool local){
+  void AddParam(Param* p, bool local){
     num_local+=local;
     num_total+=1;
     if(local)
@@ -68,7 +68,7 @@ class ParamInfo{
   int num_local; //!< # local workers using the shared parameter
   int num_total; //!< # total workers using the shared parameter
   int owner_procs; //!< the procs id of the worker that owns the parameter
-  vector<shared_ptr<Param>> shares;
+  vector<Param*> shares;
 };
 
 typedef std::map<int, shared_ptr<ParamInfo>> WorkerShard;
@@ -95,13 +95,12 @@ class Trainer{
   // point.
 
  protected:
-  vector<shared_ptr<Server>> CreateServers(int nthread, const ModelProto& mproto,
+  vector<Server*> CreateServers(int nthread, const ModelProto& mproto,
       const vector<int> slices, vector<HandleContext*>* ctx);
-  vector<shared_ptr<Worker>> CreateWorkers(int nthread,
-      const ModelProto& mproto, vector<int> *slice_size);
+  vector<Worker*> CreateWorkers(int nthread, const ModelProto& mproto,
+      vector<int> *slice_size);
 
-  void Run(const vector<shared_ptr<Worker>>& workers,
-      const vector<shared_ptr<Server>>& servers);
+  void Run(const vector<Worker*>& workers, const vector<Server*>& servers);
   /**
    * Register default implementations for all base classes used in the system,
    * e.g., the Updater, BaseMsg, etc.
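
The ParamInfo change above only swaps shared_ptr<Param> for Param*; the replica-counting logic stays the same. A compact sketch of that logic (Param is a stand-in, and the body of AddParam is completed here under the assumption that only local replicas are appended to shares):

    #include <vector>

    struct Param {};

    struct ToyParamInfo {
      int num_local = 0;            // replicas hosted by workers in this process
      int num_total = 0;            // replicas across all workers sharing the Param
      std::vector<Param*> shares;

      void AddParam(Param* p, bool local) {
        num_local += local;         // bool promotes to 0 or 1
        num_total += 1;
        if (local)
          shares.push_back(p);      // assumed: only local replicas are recorded
      }
    };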

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/9a6e09fa/include/trainer/worker.h
----------------------------------------------------------------------
diff --git a/include/trainer/worker.h b/include/trainer/worker.h
index 04a68ea..3283ee9 100644
--- a/include/trainer/worker.h
+++ b/include/trainer/worker.h
@@ -19,7 +19,7 @@ const int kCollectSleepTime=5;//milliseconds;
 class Worker {
  public:
   Worker(int thread_id, int group_id, int worker_id);
-  ~Worker(){}
+  virtual ~Worker(){}
   void Setup(const ModelProto& model, shared_ptr<NeuralNet> train_net);
   void set_test_net(shared_ptr<NeuralNet> test_net){
     test_net_=test_net;
@@ -29,10 +29,10 @@ class Worker {
   }
 
   void Stop();
-  int Put(shared_ptr<Param> param, int step);
-  int Get(shared_ptr<Param> param, int step);
-  int Update(shared_ptr<Param> param, int step);
-  int Collect(shared_ptr<Param> param, int step);
+  int Put(Param* param, int step);
+  int Get(Param* param, int step);
+  int Update(Param* param, int step);
+  int Collect(Param* param, int step);
   int CollectAll(shared_ptr<NeuralNet> net, int step);
   /**
    * Check validation/test first, then call TrainOneBatch
@@ -49,7 +49,8 @@ class Worker {
   /**
    * Test/validate one mini-batch.
    */
-  virtual void TestOneBatch(int step, Phase phase, shared_ptr<NeuralNet> net, Metric* perf)=0;
+  virtual void TestOneBatch(int step, Phase phase, shared_ptr<NeuralNet> net,
+      Metric* perf)=0;
   /**
    * Test the performance of the learned model on the validation or test dataset.
     * Test is done by the first group.
@@ -77,7 +78,7 @@ class Worker {
   const bool DisplayDebugInfo(const int step) const {
     return DisplayNow(step)&&modelproto_.debug()&&group_id_==0;
   }
-  const void DisplayPerformance(const Metric & perf, const string& prefix);
+  void DisplayPerformance(const string& prefix, const Metric & perf);
 
   /**
    * return true if the stop condition is satisfied, e.g., the maximum number
@@ -142,9 +143,11 @@ class BPWorker: public Worker{
  public:
   BPWorker(int thread_id, int group_id, int worker_id);
   ~BPWorker(){}
-  virtual void TrainOneBatch(int step, Metric* perf);
-  virtual void TestOneBatch(int step, Phase phase, shared_ptr<NeuralNet> net, Metric* perf);
-  void Forward(int step, Phase phase, shared_ptr<NeuralNet> net);
+  void TrainOneBatch(int step, Metric* perf) override;
+  void TestOneBatch(int step, Phase phase, shared_ptr<NeuralNet> net,
+      Metric* perf) override;
+
+  void Forward(int step, Phase phase, shared_ptr<NeuralNet> net, Metric* perf);
   void Backward(int step, shared_ptr<NeuralNet> net);
 };
 }  // namespace singa
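
The Worker API above keeps the "check validation/test first, then TrainOneBatch" control flow while switching the Param calls to raw pointers. A minimal sketch of that per-step flow, with assumed display/test cadences and stand-in types (not the SINGA Worker):

    struct Metric { /* aggregated performance values */ };

    class ToyWorker {
     public:
      // One unit of work per global step: maybe log, maybe test, always train.
      void RunOneBatch(int step, Metric* perf) {
        if (DisplayNow(step)) { /* e.g., DisplayPerformance("Train", *perf) */ }
        if (TestNow(step))    TestOneBatch(step);
        TrainOneBatch(step, perf);
      }
     private:
      bool DisplayNow(int step) const { return step % 100 == 0; }          // assumed cadence
      bool TestNow(int step) const    { return step > 0 && step % 1000 == 0; }
      void TestOneBatch(int /*step*/) { /* forward passes on the test net */ }
      void TrainOneBatch(int /*step*/, Metric*) { /* forward + backward + update */ }
    };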

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/9a6e09fa/include/utils/common.h
----------------------------------------------------------------------
diff --git a/include/utils/common.h b/include/utils/common.h
index 619e06a..022a1dd 100644
--- a/include/utils/common.h
+++ b/include/utils/common.h
@@ -3,10 +3,11 @@
 
 #include <google/protobuf/message.h>
 #include <stdlib.h>
-#include <map>
+#include <unordered_map>
 #include <sstream>
 #include <string>
 #include <vector>
+#include "proto/common.pb.h"
 
 namespace singa {
 
@@ -40,54 +41,39 @@ inline float rand_real() {
 const std::string GetHostIP();
 void SetupLog(const std::string& workspace, const std::string& model);
 
+/**
+ * Performance metrics.
+ */
 class Metric {
  public:
-  Metric() : counter_(0) {}
-  inline void AddMetric(const std::string& name, float value) {
-    std::string prefix = name;
-    if (name.find("@") != std::string::npos)
-      prefix = name.substr(0, name.find("@"));
-    if (data_.find(prefix) == data_.end())
-      data_[prefix] = value;
-    else
-      data_[prefix] += value;
-  }
-  inline void AddMetrics(const Metric& other) {
-    for (auto& entry : other.data_)
-      AddMetric(entry.first, entry.second);
-  }
-  inline void Reset() {
-    data_.clear();
-    counter_ = 0;
-  }
-  inline void Inc() { ++counter_; }
-  inline std::string ToString() const {
-    std::string disp = std::to_string(data_.size()) + " fields, ";
-    for (const auto& entry : data_) {
-      disp += entry.first + " : " + std::to_string(entry.second / counter_)
-              + "\t";
-    }
-    return disp;
-  }
-  inline void ParseString(const std::string& perf) {
-    std::stringstream stream(perf);
-    int n;
-    std::string str;
-    stream >> n >> str;
-    for (int i = 0; i < n; ++i) {
-      float f;
-      std::string sep;
-      stream >> str >> sep >> f;
-      data_[str] = f;
-    }
-    counter_ = 1;
-  }
-
+  /**
+   * Add one metric.
+   *
+   * If the metric exists, aggregate the value. Otherwise create a new entry for it.
+   *
+   * @param name metric name, e.g., 'loss'
+   * @param value metric value
+   */
+  void Add(const std::string& name, float value);
+  /**
+   * Reset all metric counters and values to 0
+   */
+  void Reset();
+  /**
+   * Generate a one line string for logging
+   */
+  const std::string ToLogString() const;
+  /**
+   * Serialize the object into a string
+   */
+  const std::string ToString() const;
+  /**
+   * Parse the metric from a string
+   */
+  void ParseFrom(const std::string& msg);
  private:
-  std::map<std::string, float> data_;
-  int counter_;
+  std::unordered_map<std::string, std::pair<int, float>> entry_;
 };
-
 }  // namespace singa
 
 #endif  // SINGA_UTILS_COMMON_H_
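
The rewritten Metric above stores a <count, accumulated value> pair per metric name. A minimal sketch of how Add() and ToLogString() could be backed by that map, reporting running averages (an illustration only, not the actual SINGA implementation):

    #include <string>
    #include <unordered_map>
    #include <utility>

    class ToyMetric {
     public:
      void Add(const std::string& name, float value) {
        auto& e = entry_[name];     // default-constructs {0, 0.f} on first use
        e.first += 1;               // count
        e.second += value;          // accumulated value
      }
      void Reset() { entry_.clear(); }
      std::string ToLogString() const {
        std::string disp;
        for (const auto& kv : entry_)
          disp += kv.first + " : " +
                  std::to_string(kv.second.second / kv.second.first) + "\t";
        return disp;
      }
     private:
      std::unordered_map<std::string, std::pair<int, float>> entry_;
    };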

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/9a6e09fa/include/utils/graph.h
----------------------------------------------------------------------
diff --git a/include/utils/graph.h b/include/utils/graph.h
index 93348dd..04e122d 100644
--- a/include/utils/graph.h
+++ b/include/utils/graph.h
@@ -1,150 +1,101 @@
-#ifndef INCLUDE_UTILS_GRAPH_H_
-#define INCLUDE_UTILS_GRAPH_H_
-#include <glog/logging.h>
+#ifndef SINGA_UTILS_GRAPH_H_
+#define SINGA_UTILS_GRAPH_H_
 #include <vector>
 #include <string>
 #include <map>
 #include <stack>
 #include <memory>
 
+namespace singa {
 using std::vector;
 using std::string;
 using std::map;
-using std::pair;
-using std::shared_ptr;
-using std::make_shared;
 
-
-typedef struct _LayerInfo{
-  // origin identifies the origin of this node, i.e., the corresponding layer
-  string origin;
-  //int locationid;// locationidation id;
-  int partitionid;
-  int slice_dimension;
-  int concate_dimension;
-}LayerInfo;
-typedef LayerInfo V;
-
-
-class Node;
-typedef shared_ptr<Node> SNode;
-
-class Node{
+class Node {
  public:
-  typedef shared_ptr<Node> SNode;
-  Node(string name): name_(name){}
-  Node(string name, const V& v):
-    name_(name), val_(v){}
+  /**
+   * Node constructor.
+   *
+   * @param name name of the corresponding layer
+   */
+  explicit Node(string name);
+  /**
+   * Node constructor.
+   *
+   * This node is a partition of some node.
+   * @param name node name
+   * @param origin  name of the original node
+   * @param id partition id of this node
+   * @param proto conf of the corresponding layer
+   */
+  Node(const string& name, const string& origin, int id, void* proto);
+  ~Node();
+  void AddDstNode(Node* dstnode);
+  void AddSrcNode(Node* srcnode);
+  void RemoveDstNode(Node* dst);
+  void RemoveSrcNode(Node* src);
 
-  void AddDstNode(SNode dstnode){
-    dstnodes_.push_back(dstnode);
-  }
-  void AddSrcNode(SNode srcnode){
-    srcnodes_.push_back(srcnode);
-  }
-
-  void RemoveDstNode(SNode dst){
-    auto iter=dstnodes_.begin();
-    while((*iter)->name_!=dst->name_&&iter!=dstnodes_.end()) iter++;
-    CHECK((*iter)->name_==dst->name_);
-    dstnodes_.erase(iter);
-  }
-  void RemoveSrcNode(SNode src){
-    auto iter=srcnodes_.begin();
-    while((*iter)->name_!=src->name_&&iter!=srcnodes_.end()) iter++;
-    CHECK((*iter)->name_==src->name_);
-    srcnodes_.erase(iter);
-  }
-  const string& name() const {return name_;}
-  const V& val() const {return val_;}
-  const SNode srcnodes(int k) const {return srcnodes_[k]; }
-  const SNode dstnodes(int k) const {return dstnodes_[k]; }
-  const vector<SNode>& srcnodes() const {return srcnodes_; }
-  const vector<SNode>& dstnodes() const {return dstnodes_; }
-  int  dstnodes_size() const {return dstnodes_.size(); }
-  int  srcnodes_size() const {return srcnodes_.size(); }
-
- private:
-  string name_;
-  vector<SNode> srcnodes_;
-  vector<SNode> dstnodes_;
+ public:
+  string name;
+  //! name of the origin node/layer from which this node is derived
+  string origin;
+  //! partition id
+  int partition_id;
+  //! proto of the corresponding layer
+  void* proto;
 
-  V val_;
-    // properties
-  string color_, weight_, shape_;
+  vector<Node*> srcnodes;
+  vector<Node*> dstnodes;
 };
 
-
 /**
- * For partition neuralnet and displaying the neuralnet structure
+ * The neural net is constructed by first creating a graph, with one node per
+ * layer. After a topological sort of the graph nodes, layers are created and
+ * connected.
  */
-class Graph{
+class Graph {
  public:
-  Graph(){}
-  void Sort();
-  const SNode& AddNode(string name, V origin){
-    nodes_.push_back(make_shared<Node>(name, origin));
-    name2node_[name]=nodes_.back();
-    return nodes_.back();
-  }
-  const SNode& AddNode(string name){
-    nodes_.push_back(make_shared<Node>(name));
-    name2node_[name]=nodes_.back();
-    return nodes_.back();
-  }
-
-  void AddEdge(SNode srcnode, SNode dstnode){
-    srcnode->AddDstNode(dstnode);
-    dstnode->AddSrcNode(srcnode);
-  }
-
-  void AddEdge(const string& src, const string& dst){
-    CHECK(name2node_.find(src)!=name2node_.end())<<"can't find src node "<<src;
-    CHECK(name2node_.find(dst)!=name2node_.end())<<"can't find dst node "<<dst;
-
-    SNode srcnode=name2node_[src], dstnode=name2node_[dst];
-    AddEdge(srcnode, dstnode);
-  }
-
-  void RemoveEdge(const string &src, const string& dst){
-    CHECK(name2node_.find(src)!=name2node_.end())<<"can't find src node "<<src;
-    CHECK(name2node_.find(dst)!=name2node_.end())<<"can't find dst node "<<dst;
-
-    SNode srcnode=name2node_[src], dstnode=name2node_[dst];
-    RemoveEdge(srcnode, dstnode);
-  }
-
-  void RemoveEdge(SNode src, SNode dst){
-    src->RemoveDstNode(dst);
-    dst->RemoveSrcNode(src);
-  }
-
-  const vector<SNode>& nodes() const{
+  Graph() {}
+  ~Graph();
+  /**
+   * @return all nodes of the graph
+   */
+  const vector<Node*>& nodes() const {
     return nodes_;
-  };
-
-  const SNode& node(string name) const{
-    CHECK(name2node_.find(name)!= name2node_.end())
-      <<"can't find dst node "<<name;
+  }
+  /**
+   * @param name node name
+   * @return return the node of given name
+   */
+  Node* node(const string& name) const {
     return name2node_.at(name);
   }
 
-  const string ToString() const;
-  const string ToString(const map<string, string>& info) const ;
-
-  bool Check() const;
-
-  SNode InsertSliceNode(SNode srcnode, const vector<SNode>& dstnodes,
-      const V& info, bool connect_dst=true);
-  SNode InsertConcateNode(const vector<SNode>&srcnodes, SNode dstnode,
-      const V& info);
-  SNode InsertSplitNode(SNode srcnode, const vector<SNode>& dstnodes);
-  std::pair<SNode, SNode> InsertBridgeNode(SNode srcnode, SNode dstnode);
-  void topology_sort_inner(SNode node, map<string, bool> *visited,
-    std::stack<string> *stack);
+  void AddNode(Node* node);
+  Node* AddNode(const string& name);
+  void AddEdge(Node* srcnode, Node* dstnode);
+  void AddEdge(const string& src, const string& dst);
+  void RemoveEdge(Node* src, Node* dst);
+  void RemoveEdge(const string &src, const string& dst);
+  /**
+   * Dump the graph into a JSON string which can be used to draw a figure
+   * with graphviz
+   */
+  const string ToJson() const;
+  /**
+   * \copybrief ToJson()
+   *
+   * @param info info associated with each node
+   */
+  const string ToJson(const map<string, string>& info) const;
+  /**
+   * Do a topological sort of all nodes in the graph.
+   */
+  void Sort();
 
  private:
-  vector<SNode> nodes_;
-  map<string, SNode> name2node_;
+  vector<Node*> nodes_;
+  map<string, Node*> name2node_;
 };
-#endif // INCLUDE_UTILS_GRAPH_H_
+}  // namespace singa
+#endif  // SINGA_UTILS_GRAPH_H_
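
Graph::Sort() above performs the topological sort that layer creation depends on. A self-contained sketch using Kahn's algorithm over the same srcnodes/dstnodes layout (the real implementation may instead use DFS with an explicit stack, as hinted by the removed topology_sort_inner()):

    #include <queue>
    #include <string>
    #include <unordered_map>
    #include <vector>

    struct Node {
      std::string name;
      std::vector<Node*> srcnodes;
      std::vector<Node*> dstnodes;
    };

    std::vector<Node*> TopoSort(const std::vector<Node*>& nodes) {
      std::unordered_map<Node*, int> indegree;
      std::queue<Node*> ready;
      for (Node* n : nodes) {
        indegree[n] = static_cast<int>(n->srcnodes.size());
        if (indegree[n] == 0) ready.push(n);
      }
      std::vector<Node*> sorted;
      while (!ready.empty()) {
        Node* n = ready.front();
        ready.pop();
        sorted.push_back(n);
        for (Node* dst : n->dstnodes)
          if (--indegree[dst] == 0) ready.push(dst);
      }
      return sorted;   // size < nodes.size() would indicate a cycle
    }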

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/9a6e09fa/include/utils/param.h
----------------------------------------------------------------------
diff --git a/include/utils/param.h b/include/utils/param.h
index 61e862b..781fdb6 100644
--- a/include/utils/param.h
+++ b/include/utils/param.h
@@ -115,7 +115,7 @@ class Param {
    *
    * @param other the Param object whose owner owns the data blob
    */
-  void ShareData(shared_ptr<Param> other){
+  void ShareData(Param* other){
     proto_.set_owner(other->owner());
     if(data_!=nullptr)
       CHECK(std::equal(data_->shape().begin(), data_->shape().end(),
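
ShareData(Param*) above lets a replica adopt the owner of another Param while requiring matching blob shapes. A tiny sketch of that contract with a stand-in struct holding only the fields the check needs:

    #include <cassert>
    #include <vector>

    struct ToyParam {
      int owner = -1;
      std::vector<int> shape;

      void ShareData(ToyParam* other) {
        owner = other->owner;            // adopt the data owner
        assert(shape == other->shape);   // mirrors the CHECK on blob shapes
      }
    };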

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/9a6e09fa/include/utils/updater.h
----------------------------------------------------------------------
diff --git a/include/utils/updater.h b/include/utils/updater.h
index 2a6dd43..0d408d8 100644
--- a/include/utils/updater.h
+++ b/include/utils/updater.h
@@ -12,7 +12,7 @@ class Updater{
   virtual void Init(const UpdaterProto &proto){
     proto_=proto;
   }
-  virtual void Update(int step, shared_ptr<Param> param, float grad_scale=1.0f)=0;
+  virtual void Update(int step, Param* param, float grad_scale=1.0f)=0;
 
   float GetLearningRate(int step);
  protected:
@@ -21,7 +21,7 @@ class Updater{
 class SGDUpdater : public Updater{
  public:
   virtual void Init(const UpdaterProto& proto);
-  virtual void Update(int step, shared_ptr<Param> param, float grad_scale=1.0f);
+  virtual void Update(int step, Param* param, float grad_scale=1.0f);
 
  protected:
   float base_lr_;
@@ -31,7 +31,7 @@ class SGDUpdater : public Updater{
 class NesterovUpdater : public Updater{
  public:
   virtual void Init(const UpdaterProto& proto);
-  virtual void Update(int step, shared_ptr<Param> param, float grad_scale=1.0f);
+  virtual void Update(int step, Param* param, float grad_scale=1.0f);
 
  protected:
   float base_lr_;
@@ -41,7 +41,7 @@ class NesterovUpdater : public Updater{
 class AdaGradUpdater : public Updater{
  public:
   virtual void Init(const UpdaterProto& proto);
-  virtual void Update(int step, shared_ptr<Param> param, float grad_scale=1.0f);
+  virtual void Update(int step, Param* param, float grad_scale=1.0f);
 
  protected:
   float base_lr_;
@@ -52,7 +52,7 @@ class AdaGradUpdater : public Updater{
 class RMSPropUpdater : public Updater{
  public:
   virtual void Init(const UpdaterProto& proto);
-  virtual void Update(int step, shared_ptr<Param> param, float grad_scale=1.0f);
+  virtual void Update(int step, Param* param, float grad_scale=1.0f);
 
  protected:
   float base_lr_;
@@ -65,7 +65,7 @@ class RMSPropUpdater : public Updater{
 class AdaDeltaUpdater : public Updater{
  public:
   virtual void Init(const UpdaterProto& proto);
-  virtual void Update(int step, shared_ptr<Param> param, float grad_scale=1.0f);
+  virtual void Update(int step, Param* param, float grad_scale=1.0f);
 
  protected:
   float rho_;
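
All updaters above now take Param* plus an optional grad_scale. A sketch of what a plain SGD Update(step, param, grad_scale) typically does, with an assumed inverse-decay learning-rate schedule and a stand-in Param holding flat data/grad arrays (not the SINGA SGDUpdater):

    #include <cstddef>
    #include <vector>

    struct ToyParam {
      std::vector<float> data;
      std::vector<float> grad;
    };

    class ToySGDUpdater {
     public:
      explicit ToySGDUpdater(float base_lr) : base_lr_(base_lr) {}
      void Update(int step, ToyParam* param, float grad_scale = 1.0f) {
        const float lr = GetLearningRate(step);
        for (std::size_t i = 0; i < param->data.size(); ++i)
          param->data[i] -= lr * grad_scale * param->grad[i];
      }
     private:
      float GetLearningRate(int step) const {
        return base_lr_ / (1.0f + 0.01f * step);   // assumed inverse decay
      }
      float base_lr_;
    };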

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/9a6e09fa/src/neuralnet/base_layer.cc
----------------------------------------------------------------------
diff --git a/src/neuralnet/base_layer.cc b/src/neuralnet/base_layer.cc
index 95628cb..e5fd822 100644
--- a/src/neuralnet/base_layer.cc
+++ b/src/neuralnet/base_layer.cc
@@ -9,76 +9,46 @@
 #include "neuralnet/base_layer.h"
 
 namespace singa {
-/********* Implementation for Layer **************/
-void Layer::Init(const LayerProto &proto) {
-  layer_proto_=proto;
-}
-
-void Layer::Init(const Layer& other, const vector<int>& shape){
-  data_.Reshape(shape);
-  grad_.Reshape(shape);
-  layer_proto_=other.layer_proto_;
-}
-void Layer::Setup(){
-  Setup(layer_proto_, srclayers_);
-}
-void Layer::SetupAfterPartition(){
-  vector<int> shape=data_.shape();
-  SetupAfterPartition(layer_proto_, shape, srclayers_);
-  //LOG(ERROR)<<name()<<":"<<IntVecToString(shape_);
-  CHECK(std::equal(shape.begin(), shape.end(), data_.shape().begin()))<<name()
-    <<IntVecToString(shape)<<"--"<<IntVecToString(data_.shape());
-}
-void Layer::ComputeFeature(Phase phase){
-  ComputeFeature(phase, srclayers_);
-}
-void Layer::ComputeGradient(){
-  ComputeGradient(srclayers_);
-}
-
-void Layer::ToProto(LayerProto *proto, bool copyData) {
-}
-
-/********* Implementation for BridgeSrcLayer **************/
-void BridgeSrcLayer::Setup(const LayerProto& proto,
-    const vector<SLayer>& srclayers){
-  CHECK_EQ(srclayers.size(),1);
-  data_.Reshape(srclayers[0]->data(this).shape());
-  grad_.ReshapeLike(data_);
-}
-void BridgeSrcLayer::SetupAfterPartition(){
-  Setup(layer_proto_, srclayers_);
-  //LOG(ERROR)<<name()<<":"<<IntVecToString(shape_);
-}
 
-void BridgeSrcLayer::ComputeFeature(Phase phase,
-    const vector<SLayer>& srclayers){
-}
-void BridgeSrcLayer::ComputeGradient(const vector<SLayer>& srclayers){
+void Layer::Setup(const LayerProto& proto, int npartitions) {
+  CHECK_GE(npartitions, 1);
+  layer_proto_ = proto;
+}
+
+const string Layer::DebugString(int step, Phase phase) {
+  string ret =StringPrintf("Layer %10s ", name().c_str());
+  if (data_.count() == 0)
+    return ret;
+  if(phase == kForward) {
+    ret += StringPrintf("data norm1 %13.9f", data_.asum_data());
+  }else if(phase == kBackward) {
+    ret += StringPrintf("grad norm1 %13.9f\n", grad_.asum_data());
+    for(Param* p: GetParams())
+      ret += StringPrintf("param id %2d, name %10s,\
+          value norm1 %13.9f, grad norm1 %13.9f\n",
+          p->id(), p->name().c_str(),
+          p->data().asum_data(), p->grad().asum_data());
+  }
+  return ret;
 }
-
 /********* Implementation for BridgeDstLayer **************/
-void BridgeDstLayer::Setup(const LayerProto& proto,
-    const vector<SLayer>& srclayers){
-  CHECK_EQ(srclayers.size(),1);
-  data_.Reshape(srclayers[0]->data(this).shape());
+void BridgeDstLayer::Setup(const LayerProto& proto, int npartitions) {
+  Layer::Setup(proto, npartitions);
+  CHECK_EQ(srclayers_.size(),1);
+  data_.Reshape(srclayers_[0]->data(this).shape());
   grad_.ReshapeLike(data_);
 }
-void BridgeDstLayer::SetupAfterPartition(){
-  Setup(layer_proto_, srclayers_);
-  //LOG(ERROR)<<name()<<":"<<IntVecToString(shape_);
-}
-
 
 /************* Implementation for ConcateLayer ***********/
-void ConcateLayer::Setup(const LayerProto& proto,
-    const vector<SLayer>& srclayers){
-  size_t concate_dim=proto.concate_conf().concate_dimension();
+void ConcateLayer::Setup(const LayerProto& proto, int npartitions) {
+  // CHECK_EQ(npartitions, 1);
+  Layer::Setup(proto, npartitions);
+  size_t concate_dim=proto.concate_conf().concate_dim();
   CHECK_GE(concate_dim,0);
-  CHECK_GT(srclayers.size(),1);
-  vector<int> shape=srclayers[0]->data(this).shape();
-  for(size_t i=1;i<srclayers.size();i++){
-    const vector<int>& srcshape=srclayers[i]->data(this).shape();
+  CHECK_GT(srclayers_.size(),1);
+  vector<int> shape=srclayers_[0]->data(this).shape();
+  for(size_t i=1;i<srclayers_.size();i++){
+    const vector<int>& srcshape=srclayers_[i]->data(this).shape();
     for(size_t j=0;j<shape.size();j++)
       if(j==concate_dim)
         shape[j]+=srcshape[j];
@@ -89,19 +59,18 @@ void ConcateLayer::Setup(const LayerProto& proto,
   grad_.Reshape(shape);
 }
 
-void ConcateLayer::SetupAfterPartition(){
-  Setup(layer_proto_, srclayers_);
-//  LOG(ERROR)<<name()<<":"<<IntVecToString(shape_);
+void ConcateLayer::ComputeFeature(Phase phase, Metric *perf){
+  LOG(FATAL) << "Not implemented for Concate Layer";
 }
 
-void ConcateLayer::ComputeFeature(Phase phase, const vector<SLayer>& srclayers){}
-
-void ConcateLayer::ComputeGradient(const vector<shared_ptr<Layer>>& srclayers){}
+void ConcateLayer::ComputeGradient(Phase phase){
+  LOG(FATAL) << "Not implemented for Concate Layer";
+}
 
 /************* Implementation for ParserLayer ***********/
-void ParserLayer::ComputeFeature(Phase phase, const vector<SLayer>& srclayers){
-  CHECK_EQ(srclayers.size(),1);
-  auto datalayer=static_cast<DataLayer*>(srclayers.begin()->get());
+void ParserLayer::ComputeFeature(Phase phase, Metric *perf){
+  CHECK_EQ(srclayers_.size(),1);
+  auto datalayer=static_cast<DataLayer*>(*srclayers_.begin());
   ParseRecords(phase, datalayer->records(), &data_);
 }
 
@@ -109,12 +78,11 @@ void ParserLayer::ComputeFeature(Phase phase, const vector<SLayer>& srclayers){
 void PrefetchLayer::Prefetch(Phase phase){
   //clock_t s=clock();
   for(auto layer: sublayers_)
-    layer->ComputeFeature(phase);
+    layer->ComputeFeature(phase, nullptr);
   //LOG(ERROR)<<(clock()-s)*1.0/CLOCKS_PER_SEC;
 }
 
-void PrefetchLayer::ComputeFeature(Phase phase,
-    const vector<SLayer>& srclayers){
+void PrefetchLayer::ComputeFeature(Phase phase, Metric* perf){
   if(thread_.joinable())
     thread_.join();
   else{
@@ -128,27 +96,27 @@ void PrefetchLayer::ComputeFeature(Phase phase,
   thread_=std::thread(&PrefetchLayer::Prefetch, this, phase);
 }
 
-void PrefetchLayer::Setup(const LayerProto& proto,
-    const vector<SLayer>& srclayers){
+void PrefetchLayer::Setup(const LayerProto& proto, int npartitions) {
+  Layer::Setup(proto, npartitions);
+  // CHECK_EQ(npartitions, 1);
   Factory<Layer>* factory=Singleton<Factory<Layer>>::Instance();
   const auto& sublayers=proto.prefetch_conf().sublayers();
   CHECK_GE(sublayers.size(), 1);
-  map<string, SLayer> layers;
+  map<string, Layer*> layers;
   for(auto const &p:sublayers){
-    auto layer=shared_ptr<Layer>(factory->Create(p.type()));
-    layer->Init(p);
+    auto layer=factory->Create(p.type());
     sublayers_.push_back(layer);
     layers[p.name()]= layer;
   }
   // TODO topology sort layers
   auto layer=sublayers_.begin();
-  for(auto const &p:sublayers){
-    std::vector<SLayer> src;
+  for(auto const &p : sublayers){
+    std::vector<Layer*> src;
     for(auto const &srcname: p.srclayers()){
       src.push_back(layers[srcname]);
-      (*layer)->AddSrcLayer(layers[srcname]);
+      (*layer)->add_srclayer(layers[srcname]);
     }
-    (*layer)->Setup(p, src);
+    (*layer)->Setup(p);
     layer++;
   }
   for(auto layer: sublayers_)
@@ -177,15 +145,18 @@ Blob<float>* PrefetchLayer::mutable_data(const Layer* from) {
 PrefetchLayer::~PrefetchLayer(){
   if(thread_.joinable())
     thread_.join();
+  for(auto layer : sublayers_)
+    delete layer;
 }
 /************* Implementation for SliceLayer****************/
-void SliceLayer::Setup(const LayerProto& proto,
-    const vector<SLayer>& srclayers){
-  slice_dim_=proto.slice_conf().slice_dimension();
-  slice_num_=proto.slice_conf().slice_num();
+void SliceLayer::Setup(const LayerProto& proto, int npartitions){
+  // CHECK_EQ(npartitions, 1);
+  Layer::Setup(proto, npartitions);
+  slice_dim_=proto.slice_conf().slice_dim();
+  slice_num_= npartitions;
   CHECK_GE(slice_dim_,0);
   CHECK_EQ(slice_num_, dstlayers_.size());
-  data_.Reshape(srclayers[0]->data(this).shape());
+  data_.Reshape(srclayers_[0]->data(this).shape());
   grad_.ReshapeLike(data_);
   datavec_.resize(slice_num_);
   gradvec_.resize(slice_num_);
@@ -201,17 +172,11 @@ void SliceLayer::Setup(const LayerProto& proto,
   }
 }
 
-void SliceLayer::SetupAfterPartition(){
-  Setup(layer_proto_, srclayers_);
-  //LOG(ERROR)<<name()<<":"<<IntVecToString(shape_);
-}
-
-
 int SliceLayer::SliceID(const Layer* layer) const {
   CHECK(layer!= nullptr);
   for(size_t i=0;i<datavec_.size();i++){
     //LOG(ERROR)<<"get slice "<<IntVecToString(shapes_[i]);
-    if(dstlayers_[i].get() == layer)
+    if(dstlayers_[i] == layer)
       return i;
   }
   CHECK(false);
@@ -238,11 +203,10 @@ Blob<float>* SliceLayer::mutable_grad(const Layer* layer){
     return &grad_;
   return &gradvec_[SliceID(layer)];
 }
-void SliceLayer::ComputeFeature(Phase phase,
-    const vector<shared_ptr<Layer>>& srclayers){
-  CHECK_EQ(srclayers.size(),1);
+void SliceLayer::ComputeFeature(Phase phase, Metric *perf) {
+  CHECK_EQ(srclayers_.size(),1);
   if(slice_dim_==0){
-    const auto& blob=srclayers.at(0)->data(this);
+    const auto& blob=srclayers_.at(0)->data(this);
     int size=blob.count()/slice_num_;
     for(int i=0;i<slice_num_;i++){
       float* dst=datavec_[i].mutable_cpu_data();
@@ -251,27 +215,26 @@ void SliceLayer::ComputeFeature(Phase phase,
     }
   }
 }
-void SliceLayer::ComputeGradient(const vector<shared_ptr<Layer>>& srclayers){
-
-}
-
-void SplitLayer::Setup(const LayerProto& proto,
-    const vector<SLayer>& srclayers){
-  CHECK_EQ(srclayers.size(),1);
-  data_.Reshape(srclayers[0]->data(this).shape());
-  grad_.Reshape(srclayers[0]->data(this).shape());
+void SliceLayer::ComputeGradient(Phase phase) {
+  // LOG(FATAL) << "Not implemented";
 }
 
 /************* Implementation for SplitLayer****************/
-void SplitLayer::SetupAfterPartition(){
-  Setup(layer_proto_, srclayers_);
-  //LOG(ERROR)<<name()<<":"<<IntVecToString(shape_);
-}
-void SplitLayer::ComputeFeature(Phase phase, const vector<shared_ptr<Layer>>& srclayers){
+void SplitLayer::Setup(const LayerProto& proto, int npartitions) {
+  // CHECK_EQ(npartitions, 1);
+  Layer::Setup(proto, npartitions);
 
+  CHECK_EQ(srclayers_.size(),1);
+  data_.Reshape(srclayers_[0]->data(this).shape());
+  grad_.Reshape(srclayers_[0]->data(this).shape());
 }
-void SplitLayer::ComputeGradient(const vector<shared_ptr<Layer>>& srclayers){
 
+void SplitLayer::ComputeFeature(Phase phase, Metric *perf) {
+  LOG(FATAL) << "Not implemented";
+
+}
+void SplitLayer::ComputeGradient(Phase phase) {
+  LOG(FATAL) << "Not implemented";
 }
 
 }  // namespace singa
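
SliceLayer::ComputeFeature() above copies equal chunks of the source blob into datavec_ when slicing on dimension 0. A standalone sketch of that copy with raw float vectors standing in for Blob<float> (assumes the batch dimension divides evenly by the number of slices):

    #include <cstddef>
    #include <cstring>
    #include <vector>

    std::vector<std::vector<float>> SliceDim0(const std::vector<float>& src,
                                              int slice_num) {
      std::vector<std::vector<float>> slices(slice_num);
      const std::size_t chunk = src.size() / slice_num;   // per-partition size
      for (int i = 0; i < slice_num; ++i) {
        slices[i].resize(chunk);
        std::memcpy(slices[i].data(), src.data() + i * chunk,
                    chunk * sizeof(float));
      }
      return slices;
    }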


[2/3] incubator-singa git commit: SINGA-29 Update NeuralNet class to enable customizing layer partition type

Posted by wa...@apache.org.
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/9a6e09fa/src/neuralnet/layer.cc
----------------------------------------------------------------------
diff --git a/src/neuralnet/layer.cc b/src/neuralnet/layer.cc
index db13824..b40b676 100644
--- a/src/neuralnet/layer.cc
+++ b/src/neuralnet/layer.cc
@@ -13,18 +13,47 @@ using namespace mshadow;
 using namespace mshadow::expr;
 
 namespace singa {
+inline Tensor<cpu, 4> Tensor4(Blob<float>* blob) {
+  const vector<int>& shape = blob->shape();
+  Tensor<cpu, 4> tensor(blob->mutable_cpu_data(),
+      Shape4(shape[0], shape[1], shape[2], shape[3]));
+  return tensor;
+}
+
+inline Tensor<cpu, 3> Tensor3(Blob<float>* blob){
+  const vector<int>& shape = blob->shape();
+  Tensor<cpu, 3> tensor(blob->mutable_cpu_data(),
+      Shape3(shape[0], shape[1], blob->count() / shape[0] / shape[1]));
+  return tensor;
+}
+inline Tensor<cpu, 2> Tensor2(Blob<float>* blob){
+  const vector<int>& shape = blob->shape();
+  Tensor<cpu, 2> tensor(blob->mutable_cpu_data(),
+      Shape2(shape[0], blob->count() / shape[0]));
+  return tensor;
+}
+inline Tensor<cpu, 1> Tensor1(Blob<float>* blob){
+  Tensor<cpu, 1> tensor(blob->mutable_cpu_data(), Shape1(blob->count()));
+  return tensor;
+}
 
 /************ Implementation for ConvProductLayer*************************/
-void ConvolutionLayer::Setup(const LayerProto& proto,
-      const vector<SLayer>& srclayers){
-  CHECK_EQ(srclayers.size(),1);
+ConvolutionLayer::~ConvolutionLayer() {
+  delete weight_;
+  delete bias_;
+}
+void ConvolutionLayer::Setup(const LayerProto& proto, int npartitions) {
+  Layer::Setup(proto, npartitions);
   ConvolutionProto conv_conf=proto.convolution_conf();
   kernel_=conv_conf.kernel();
   CHECK_GT(kernel_, 0) << "Filter size cannot be zero.";
   pad_=conv_conf.pad();
   stride_=conv_conf.stride();
   num_filters_=conv_conf.num_filters();
-  const vector<int>& srcshape=srclayers[0]->data(this).shape();
+  if(partition_dim() > 0)
+    num_filters_ /= npartitions;
+
+  const vector<int>& srcshape=srclayers_[0]->data(this).shape();
   int dim=srcshape.size();
   CHECK_GT(dim, 2);
   width_=srcshape[dim-1];
@@ -45,32 +74,18 @@ void ConvolutionLayer::Setup(const LayerProto& proto,
   col_grad_.Reshape(vector<int>{col_height_, col_width_});
 
   Factory<Param>* factory=Singleton<Factory<Param>>::Instance();
-  weight_=shared_ptr<Param>(factory->Create("Param"));
+  weight_ = factory->Create("Param");
   weight_->Setup(proto.param(0), vector<int>{num_filters_, col_height_});
-  bias_=shared_ptr<Param>(factory->Create("Param"));
+  bias_ = factory->Create("Param");
   bias_->Setup(proto.param(1), vector<int>{num_filters_});
 }
 
-void ConvolutionLayer::SetupAfterPartition(const LayerProto& proto,
-      const vector<int> &shape,
-      const vector<SLayer>& srclayers){
-  LayerProto newproto(proto);
-  ConvolutionProto *conv_conf=newproto.mutable_convolution_conf();
-  conv_conf->set_num_filters(shape[1]);
-  Setup(newproto, srclayers);
-}
-
-void ConvolutionLayer::ComputeFeature(Phase phase, const vector<SLayer>& srclayers){
-  Tensor<cpu, 4> src(srclayers[0]->mutable_data(this)->mutable_cpu_data(),
-      Shape4(batchsize_, channels_, height_, width_));
-  Tensor<cpu, 3> data(data_.mutable_cpu_data(),
-      Shape3(batchsize_, num_filters_, conv_height_* conv_width_));
-  Tensor<cpu, 2> col(col_data_.mutable_cpu_data(),
-      Shape2(col_height_, col_width_));
-  Tensor<cpu, 2> weight(weight_->mutable_cpu_data(),
-      Shape2(num_filters_, col_height_));
-  Tensor<cpu, 1> bias(bias_->mutable_cpu_data(),
-      Shape1(num_filters_));
+void ConvolutionLayer::ComputeFeature(Phase phase, Metric* perf){
+  auto src = Tensor4(srclayers_[0]->mutable_data(this));
+  auto data = Tensor3(&data_);
+  auto col = Tensor2(&col_data_);
+  auto weight = Tensor2(weight_->mutable_data());
+  auto bias = Tensor1(bias_->mutable_data());
 
   for(int n=0;n<batchsize_;n++){
     if(pad_>0)
@@ -82,144 +97,126 @@ void ConvolutionLayer::ComputeFeature(Phase phase, const vector<SLayer>& srclaye
   data+=broadcast<1>(bias, data.shape);
 }
 
-void ConvolutionLayer::ComputeGradient(const vector<SLayer>& srclayers) {
-  Tensor<cpu, 4> src(srclayers[0]->mutable_data(this)->mutable_cpu_data(),
-      Shape4(batchsize_, channels_, height_, width_));
-  Tensor<cpu, 2> col(col_data_.mutable_cpu_data(),
-      Shape2(col_height_, col_width_));
-  Tensor<cpu, 2> weight(weight_->mutable_cpu_data(),
-      Shape2(num_filters_, col_height_));
+void ConvolutionLayer::ComputeGradient(Phase phase) {
+  auto src = Tensor4(srclayers_[0]->mutable_data(this));
+  auto col = Tensor2(&col_data_);
+  auto weight = Tensor2(weight_->mutable_data());
+
+  auto grad = Tensor3(&grad_);
+  auto gcol = Tensor2(&col_grad_);
+  auto gweight = Tensor2(weight_->mutable_grad());
+  auto gbias = Tensor1(bias_->mutable_grad());
 
-  Blob<float>* gsrcblob=srclayers[0]->mutable_grad(this);
+  Blob<float>* gsrcblob=srclayers_[0]->mutable_grad(this);
   Tensor<cpu, 4> gsrc(nullptr, Shape4(batchsize_, channels_, height_, width_));
   if(gsrcblob!=nullptr)
     gsrc.dptr=gsrcblob->mutable_cpu_data();
-  Tensor<cpu, 3> grad(grad_.mutable_cpu_data(),
-      Shape3(batchsize_, num_filters_, conv_height_* conv_width_));
-  Tensor<cpu, 2> gcol(col_grad_.mutable_cpu_data(),
-      Shape2(col_height_, col_width_));
-  Tensor<cpu, 2> gweight(weight_->mutable_cpu_grad(),
-      Shape2(num_filters_, col_height_));
-  Tensor<cpu, 1> gbias(bias_->mutable_cpu_grad(),
-      Shape1(num_filters_));
-
-  gweight=0.0f;
   gbias=sumall_except_dim<1>(grad);
-  Shape<3> padshape(gsrc.shape.SubShape());
-  padshape[0]+=2*pad_;padshape[1]+=2*pad_;
-  Shape<2> imgshape=Shape2(height_, width_);
+
+  gweight = 0.0f;
+  Shape<3> padshp(gsrc.shape.SubShape());
+  padshp[0] += 2 * pad_;
+  padshp[1] += 2 * pad_;
+  Shape<2> imgshp = Shape2(height_, width_);
   for(int n=0;n<batchsize_;n++){
     if(pad_>0)
       col=unpack_patch2col(pad(src[n], pad_), kernel_, stride_);
     else
       col=unpack_patch2col(src[n], kernel_, stride_);
-    gweight+=dot(grad[n], col.T());
+    gweight += dot(grad[n], col.T());
 
     if(gsrcblob!=nullptr){
-      gcol=dot(weight.T(), grad[n]);
-      gsrc[n]=crop(pack_col2patch(gcol, padshape, kernel_, stride_), imgshape);
+      gcol = dot(weight.T(), grad[n]);
+      gsrc[n] = crop(pack_col2patch(gcol, padshp, kernel_, stride_), imgshp);
     }
   }
 }
 
 /****************** Implementation for DropoutLayer ***********************/
-void DropoutLayer::Setup(const LayerProto& proto,
-      const vector<SLayer>& srclayers){
-  data_.ReshapeLike(srclayers[0]->data(this));
-  grad_.ReshapeLike(*srclayers[0]->mutable_grad(this));
-  mask_.Reshape(srclayers[0]->data(this).shape());
-  pdrop_=proto.dropout_conf().dropout_ratio();
-}
-
-void DropoutLayer::SetupAfterPartition(const LayerProto& proto,
-      const vector<int> &shape,
-      const vector<SLayer>& srclayers){
-  Setup(proto, srclayers);
+void DropoutLayer::Setup(const LayerProto& proto, int npartitions) {
+  Layer::Setup(proto, npartitions);
+  data_.ReshapeLike(srclayers_[0]->data(this));
+  grad_.ReshapeLike(*srclayers_[0]->mutable_grad(this));
+  mask_.Reshape(srclayers_[0]->data(this).shape());
+  pdrop_ = proto.dropout_conf().dropout_ratio();
 }
 
-void DropoutLayer::ComputeFeature(Phase phase, const vector<SLayer>& srclayers) {
+void DropoutLayer::ComputeFeature(Phase phase, Metric* perf) {
   // check training
-  if(phase!= kTrain){//!training){
-    data_.CopyFrom(srclayers[0]->data(this));
+  if(phase != kTrain){//!training){
+    data_.CopyFrom(srclayers_[0]->data(this));
     return;
   }
   float pkeep=1-pdrop_;
-  Tensor<cpu, 1> mask(mask_.mutable_cpu_data(), Shape1(mask_.count()));
+  auto mask = Tensor1(&mask_);
   mask = F<op::threshold>(TSingleton<Random<cpu>>::Instance()\
       ->uniform(mask.shape), pkeep ) * (1.0f/pkeep);
-  Tensor<cpu, 1> data(data_.mutable_cpu_data(), Shape1(data_.count()));
-  Blob<float>* srcblob=srclayers[0]->mutable_data(this);
-  Tensor<cpu, 1> src(srcblob->mutable_cpu_data(), Shape1(srcblob->count()));
-  data=src*mask;
-}
-
-void DropoutLayer::ComputeGradient(const vector<SLayer>& srclayers)  {
-  Tensor<cpu, 1> grad(grad_.mutable_cpu_data(), Shape1(data_.count()));
-  Tensor<cpu, 1> mask(mask_.mutable_cpu_data(), Shape1(mask_.count()));
-  Blob<float>* gsrcblob=srclayers[0]->mutable_grad(this);
-  Tensor<cpu, 1> gsrc(gsrcblob->mutable_cpu_data(), Shape1(gsrcblob->count()));
-  gsrc=grad*mask;
-}
-/**************** Implementation for InnerProductLayer********************/
-void InnerProductLayer::Setup(const LayerProto& proto,
-      const vector<SLayer>& srclayers){
-  CHECK_EQ(srclayers.size(),1);
-  const auto& src=srclayers[0]->data(this);
+  auto data = Tensor1(&data_);
+  auto src = Tensor1(srclayers_[0]->mutable_data(this));
+  data = src * mask;
+}
+
+void DropoutLayer::ComputeGradient(Phase phase)  {
+  auto mask = Tensor1(&mask_);
+  auto grad = Tensor1(&grad_);
+  auto gsrc = Tensor1(srclayers_[0]->mutable_grad(this));
+  gsrc = grad * mask;
+}
+
+/*********** Implementation for InnerProductLayer**********/
+InnerProductLayer::~InnerProductLayer() {
+  delete weight_;
+  delete bias_;
+}
+void InnerProductLayer::Setup(const LayerProto& proto, int npartitions) {
+  Layer::Setup(proto, npartitions);
+  CHECK_EQ(srclayers_.size(), 1);
+  const auto& src=srclayers_[0]->data(this);
   batchsize_=src.shape()[0];
   vdim_=src.count()/batchsize_;
   hdim_=proto.innerproduct_conf().num_output();
+  if(partition_dim()>0)
+    hdim_ /= npartitions;
   data_.Reshape(vector<int>{batchsize_, hdim_});
   grad_.ReshapeLike(data_);
   Factory<Param>* factory=Singleton<Factory<Param>>::Instance();
-  weight_=shared_ptr<Param>(factory->Create("Param"));
-  bias_=shared_ptr<Param>(factory->Create("Param"));
+  weight_ = factory->Create("Param");
+  bias_ = factory->Create("Param");
   weight_->Setup(proto.param(0), vector<int>{vdim_, hdim_});
   bias_->Setup(proto.param(1), vector<int>{hdim_});
 }
-void InnerProductLayer::SetupAfterPartition(const LayerProto& proto,
-      const vector<int> &shape,
-      const vector<SLayer>& srclayers){
-  LayerProto newproto(proto);
-  InnerProductProto * innerproto=newproto.mutable_innerproduct_conf();
-  innerproto->set_num_output(shape[1]);
-  Setup(newproto, srclayers);
-}
-
-void InnerProductLayer::ComputeFeature(Phase phase, const vector<SLayer>& srclayers) {
-  Tensor<cpu, 2> data(data_.mutable_cpu_data(), Shape2(batchsize_,hdim_));
-  CHECK_EQ(srclayers[0]->data(this).count(), batchsize_*vdim_);
-  Tensor<cpu, 2> src(srclayers[0]->mutable_data(this)->mutable_cpu_data(),
-      Shape2(batchsize_,vdim_));
-  Tensor<cpu, 2> weight(weight_->mutable_cpu_data(), Shape2(vdim_,hdim_));
-  Tensor<cpu, 1> bias(bias_->mutable_cpu_data(), Shape1(hdim_));
+
+void InnerProductLayer::ComputeFeature(Phase phase, Metric* perf) {
+  auto data = Tensor2(&data_);
+  auto src = Tensor2(srclayers_[0]->mutable_data(this));
+  auto weight = Tensor2(weight_->mutable_data());
+  auto bias = Tensor1(bias_->mutable_data());
   data=dot(src, weight);
   // repmat: repeat bias vector into batchsize rows
   data+=repmat(bias, batchsize_);
 }
 
-void InnerProductLayer::ComputeGradient(const vector<SLayer>& srclayers) {
-  Tensor<cpu, 2> src(srclayers[0]->mutable_data(this)->mutable_cpu_data(),
-      Shape2(batchsize_,vdim_));
-  Tensor<cpu, 2> grad(grad_.mutable_cpu_data(),Shape2(batchsize_,hdim_));
-  Tensor<cpu, 2> weight(weight_->mutable_cpu_data(), Shape2(vdim_,hdim_));
-  Tensor<cpu, 2> gweight(weight_->mutable_cpu_grad(), Shape2(vdim_,hdim_));
-  Tensor<cpu, 1> gbias(bias_->mutable_cpu_grad(), Shape1(hdim_));
+void InnerProductLayer::ComputeGradient(Phase phase) {
+  auto src = Tensor2(srclayers_[0]->mutable_data(this));
+  auto grad = Tensor2(&grad_);
+  auto weight = Tensor2(weight_->mutable_data());
+  auto gweight = Tensor2(weight_->mutable_grad());
+  auto gbias = Tensor1(bias_->mutable_grad());
 
   gbias=sum_rows(grad);
   gweight=dot(src.T(), grad);
-  if(srclayers[0]->mutable_grad(this)!=nullptr){
-    Tensor<cpu, 2> gsrc(srclayers[0]->mutable_grad(this)->mutable_cpu_data(),
-        Shape2(batchsize_,vdim_));
+  if(srclayers_[0]->mutable_grad(this)!=nullptr){
+    auto gsrc = Tensor2(srclayers_[0]->mutable_grad(this));
     gsrc=dot(grad, weight.T());
   }
 }
 /*****************************************************************************
  * Implementation for LabelLayer
  *****************************************************************************/
-void LabelLayer::Setup(const LayerProto& proto,
-    const vector<SLayer>& srclayers){
-  CHECK_EQ(srclayers.size(),1);
-  int batchsize=static_cast<DataLayer*>(srclayers[0].get())->batchsize();
+void LabelLayer::Setup(const LayerProto& proto, int npartitions){
+  Layer::Setup(proto, npartitions);
+  CHECK_EQ(srclayers_.size(),1);
+  int batchsize=static_cast<DataLayer*>(srclayers_[0])->batchsize();
   data_.Reshape(vector<int>{batchsize});
 }
 
@@ -236,7 +233,7 @@ void LabelLayer::ParseRecords(Phase phase, const vector<Record>& records,
 
 
 /*********************LMDBDataLayer**********************************/
-void LMDBDataLayer::ComputeFeature(Phase phase, const vector<SLayer>& srclayers){
+void LMDBDataLayer::ComputeFeature(Phase phase, Metric* perf){
   if(random_skip_){
     int nskip=rand()%random_skip_;
     int n=0;
@@ -296,8 +293,8 @@ void LMDBDataLayer::ConvertDatumToSingleLableImageRecord(const Datum& datum,
   }
 }
 
-void LMDBDataLayer::Setup(const LayerProto& proto,
-    const vector<SLayer>& srclayers){
+void LMDBDataLayer::Setup(const LayerProto& proto, int npartitions) {
+  Layer::Setup(proto, npartitions);
   CHECK_EQ(mdb_env_create(&mdb_env_), MDB_SUCCESS) << "mdb_env_create failed";
   CHECK_EQ(mdb_env_set_mapsize(mdb_env_, 1099511627776), MDB_SUCCESS); // 1TB
   CHECK_EQ(mdb_env_open(mdb_env_,
@@ -325,21 +322,23 @@ void LMDBDataLayer::Setup(const LayerProto& proto,
   ConvertDatumToSingleLableImageRecord(datum, record);
 
   batchsize_=batchsize();
+  if(partition_dim() == 0)
+    batchsize_ /= npartitions;
   records_.resize(batchsize_);
   random_skip_=proto.lmdbdata_conf().random_skip();
 }
 
 /***************** Implementation for LRNLayer *************************/
-void LRNLayer::Setup(const LayerProto& proto,
-      const vector<SLayer>& srclayers){
-  CHECK_EQ(srclayers.size(),1);
+void LRNLayer::Setup(const LayerProto& proto, int npartitions) {
+  Layer::Setup(proto, npartitions);
+  CHECK_EQ(srclayers_.size(),1);
   lsize_ = proto.lrn_conf().local_size();
   CHECK_EQ(lsize_ % 2, 1) << "LRN only supports odd values for Localvol";
   knorm_=proto.lrn_conf().knorm();
   alpha_ = proto.lrn_conf().alpha();
   beta_ = proto.lrn_conf().beta();
 
-  const vector<int>& s=srclayers[0]->data(this).shape();
+  const vector<int>& s=srclayers_[0]->data(this).shape();
   data_.Reshape(s);
   grad_.Reshape(s);
   norm_.Reshape(s);
@@ -349,30 +348,22 @@ void LRNLayer::Setup(const LayerProto& proto,
   width_=s[3];
 }
 
-void LRNLayer::SetupAfterPartition(const LayerProto& proto,
-      const vector<int> &shape,
-      const vector<SLayer>& srclayers){
-  Setup(proto, srclayers);
-}
-
-void LRNLayer::ComputeFeature(Phase phase, const vector<SLayer>& srclayers){
+void LRNLayer::ComputeFeature(Phase phase, Metric* perf) {
   const float salpha = alpha_ / lsize_;
-  Shape<4> s=Shape4(batchsize_,channels_, height_, width_);
-  Tensor<cpu, 4> src(srclayers[0]->mutable_data(this)->mutable_cpu_data(), s);
-  Tensor<cpu, 4> data(data_.mutable_cpu_data(), s);
-  Tensor<cpu, 4> norm(norm_.mutable_cpu_data(), s);
+  auto src = Tensor4(srclayers_[0]->mutable_data(this));
+  auto data = Tensor4(&data_);
+  auto norm = Tensor4(&norm_);
   // stores normalizer without power
   norm= chpool<red::sum>( F<op::square>(src) , lsize_ ) * salpha + knorm_;
   data = src * F<op::power>(norm, -beta_ );
 }
 
-void LRNLayer::ComputeGradient(const vector<SLayer>& srclayers) {
+void LRNLayer::ComputeGradient(Phase phase) {
   const float salpha = alpha_ / lsize_;
-  Shape<4> s=Shape4(batchsize_,channels_, height_, width_);
-  Tensor<cpu, 4> src(srclayers[0]->mutable_data(this)->mutable_cpu_data(), s);
-  Tensor<cpu, 4> norm(norm_.mutable_cpu_data(), s);
-  Tensor<cpu, 4> grad(grad_.mutable_cpu_data(), s);
-  Tensor<cpu, 4> gsrc(srclayers[0]->mutable_grad(this)->mutable_cpu_data(), s);
+  auto src = Tensor4(srclayers_[0]->mutable_data(this));
+  auto norm = Tensor4(&norm_);
+  auto grad = Tensor4(&grad_);
+  auto gsrc = Tensor4(srclayers_[0]->mutable_grad(this));
 
   gsrc = grad * F<op::power>( norm, -beta_ );
   gsrc += ( - 2.0f * beta_ * salpha ) * chpool<red::sum>(
@@ -448,11 +439,11 @@ void MnistLayer::ParseRecords(Phase phase,
   }
   CHECK_EQ(dptr, blob->mutable_cpu_data()+blob->count());
 }
-void MnistLayer::Setup(const LayerProto& proto,
-    const vector<SLayer>& srclayers){
-  CHECK_EQ(srclayers.size(),1);
-  int batchsize=static_cast<DataLayer*>(srclayers[0].get())->batchsize();
-  Record sample=static_cast<DataLayer*>(srclayers[0].get())->sample();
+void MnistLayer::Setup(const LayerProto& proto, int npartitions) {
+  Layer::Setup(proto, npartitions);
+  CHECK_EQ(srclayers_.size(),1);
+  int batchsize=static_cast<DataLayer*>(srclayers_[0])->batchsize();
+  Record sample=static_cast<DataLayer*>(srclayers_[0])->sample();
   kernel_=proto.mnist_conf().kernel();
   sigma_=proto.mnist_conf().sigma();
   alpha_=proto.mnist_conf().alpha();
@@ -475,9 +466,9 @@ void MnistLayer::Setup(const LayerProto& proto,
 }
 
 /******************** Implementation for PoolingLayer******************/
-void PoolingLayer::Setup(const LayerProto& proto,
-      const vector<SLayer>& srclayers){
-  CHECK_EQ(srclayers.size(),1);
+void PoolingLayer::Setup(const LayerProto& proto, int npartitions) {
+  Layer::Setup(proto, npartitions);
+  CHECK_EQ(srclayers_.size(),1);
   PoolingProto pool_conf = proto.pooling_conf();
   kernel_=pool_conf.kernel();
   stride_=pool_conf.stride();
@@ -487,7 +478,7 @@ void PoolingLayer::Setup(const LayerProto& proto,
         || pool_ == PoolingProto_PoolMethod_MAX)
       << "Padding implemented only for average and max pooling.";
 
-  const auto& srcshape=srclayers[0]->data(this).shape();
+  const auto& srcshape=srclayers_[0]->data(this).shape();
   int dim=srcshape.size();
   CHECK_GT(dim,2);
   width_ = srcshape[dim-1];
@@ -503,68 +494,49 @@ void PoolingLayer::Setup(const LayerProto& proto,
   grad_.ReshapeLike(data_);
 }
 
-void PoolingLayer::SetupAfterPartition(const LayerProto& proto,
-      const vector<int> &shape,
-      const vector<SLayer>& srclayers){
-  Setup(proto, srclayers);
-}
-
-void PoolingLayer::ComputeFeature(Phase phase, const vector<SLayer>& srclayers){
-  Tensor<cpu, 4> src(srclayers[0]->mutable_data(this)->mutable_cpu_data(),
-      Shape4(batchsize_, channels_, height_, width_));
-  Tensor<cpu, 4> data(data_.mutable_cpu_data(),
-      Shape4(batchsize_, channels_, pooled_height_, pooled_width_));
+void PoolingLayer::ComputeFeature(Phase phase, Metric* perf) {
+  auto src = Tensor4(srclayers_[0]->mutable_data(this));
+  auto data = Tensor4(&data_);
   if(pool_ == PoolingProto_PoolMethod_MAX)
     data=pool<red::maximum>(src, kernel_, stride_);
   else if(pool_ == PoolingProto_PoolMethod_AVE)
-    data=pool<red::sum>(src, kernel_, stride_)
-      *(1.0f/(kernel_*kernel_));
+    data=pool<red::sum>(src, kernel_, stride_) *(1.0f/(kernel_*kernel_));
 }
 
 /*
  * partition only on num/channel dim
  * assume grad and data have the same partition
  */
-void PoolingLayer::ComputeGradient(const vector<SLayer>& srclayers) {
-  Shape<4> s1= Shape4(batchsize_, channels_, height_, width_);
-  Tensor<cpu, 4> src(srclayers[0]->mutable_data(this)->mutable_cpu_data(),s1);
-  Tensor<cpu, 4> gsrc(srclayers[0]->mutable_grad(this)->mutable_cpu_data(),s1);
-  Shape<4> s2= Shape4(batchsize_, channels_, pooled_height_, pooled_width_);
-  Tensor<cpu, 4> data(data_.mutable_cpu_data(), s2);
-  Tensor<cpu, 4> grad(grad_.mutable_cpu_data(), s2);
+void PoolingLayer::ComputeGradient(Phase phase) {
+  auto src = Tensor4(srclayers_[0]->mutable_data(this));
+  auto gsrc = Tensor4(srclayers_[0]->mutable_grad(this));
+  auto data = Tensor4(&data_);
+  auto grad = Tensor4(&grad_);
   if(pool_ == PoolingProto_PoolMethod_MAX)
-      gsrc = unpool<red::maximum>(src, data, grad, kernel_, stride_);
+    gsrc = unpool<red::maximum>(src, data, grad, kernel_, stride_);
   else if(pool_ == PoolingProto_PoolMethod_AVE)
-      gsrc = unpool<red::sum>(src, data, grad, kernel_, stride_)
-        *(1.0f/(kernel_*kernel_));
+    gsrc = unpool<red::sum>(src, data, grad, kernel_, stride_)
+      *(1.0f/(kernel_*kernel_));
 }
 
 /***************** Implementation for ReLULayer *****************************/
 
-void ReLULayer::Setup(const LayerProto& proto,
-      const vector<SLayer>& srclayers){
-  data_.ReshapeLike(srclayers[0]->data(this));
-  grad_.ReshapeLike(*(srclayers[0]->mutable_grad(this)));
-}
-
-void ReLULayer::SetupAfterPartition(const LayerProto& proto,
-      const vector<int> &shape,
-      const vector<SLayer>& srclayers){
-  Setup(proto, srclayers);
+void ReLULayer::Setup(const LayerProto& proto, int npartitions) {
+  Layer::Setup(proto, npartitions);
+  data_.ReshapeLike(srclayers_[0]->data(this));
+  grad_.ReshapeLike(*(srclayers_[0]->mutable_grad(this)));
 }
 
-void ReLULayer::ComputeFeature(Phase phase, const vector<SLayer>& srclayers){
-  Tensor<cpu, 1> data(data_.mutable_cpu_data(), Shape1(data_.count()));
-  Tensor<cpu, 1> src(srclayers[0]->mutable_data(this)->mutable_cpu_data(),
-      Shape1(data_.count()));
+void ReLULayer::ComputeFeature(Phase phase, Metric* perf) {
+  auto data = Tensor1(&data_);
+  auto src = Tensor1(srclayers_[0]->mutable_data(this));
   data=F<op::relu>(src);
 }
 
-void ReLULayer::ComputeGradient(const vector<SLayer>& srclayers) {
-  Tensor<cpu, 1> grad(grad_.mutable_cpu_data(), Shape1(grad_.count()));
-  Tensor<cpu, 1> data(data_.mutable_cpu_data(), Shape1(data_.count()));
-  Tensor<cpu, 1> gsrc(srclayers[0]->mutable_grad(this)->mutable_cpu_data(),
-      Shape1(data_.count()));
+void ReLULayer::ComputeGradient(Phase phase) {
+  auto data = Tensor1(&data_);
+  auto grad = Tensor1(&grad_);
+  auto gsrc = Tensor1(srclayers_[0]->mutable_grad(this));
   gsrc=F<op::relu_grad>(data)*grad;
 }
 
@@ -573,7 +545,7 @@ void ReLULayer::ComputeGradient(const vector<SLayer>& srclayers) {
 void RGBImageLayer::ParseRecords(Phase phase,
     const vector<Record>& records, Blob<float>* blob){
   const vector<int>& s=blob->shape();
-  Tensor<cpu, 4> images(data_.mutable_cpu_data(), Shape4(s[0],s[1],s[2],s[3]));
+  auto images = Tensor4(&data_);
   const SingleLabelImageRecord& r=records.at(0).image();
   Tensor<cpu, 3> raw_image(Shape3(r.shape(0),r.shape(1),r.shape(2)));
   AllocSpace(raw_image);
@@ -625,14 +597,14 @@ void RGBImageLayer::ParseRecords(Phase phase,
   if(cropsize_)
     FreeSpace(croped_image);
 }
-void RGBImageLayer::Setup(const LayerProto& proto,
-    const vector<SLayer>& srclayers){
-  CHECK_EQ(srclayers.size(),1);
+void RGBImageLayer::Setup(const LayerProto& proto, int npartitions) {
+  ParserLayer::Setup(proto, npartitions);
+  CHECK_EQ(srclayers_.size(),1);
   scale_=proto.rgbimage_conf().scale();
   cropsize_=proto.rgbimage_conf().cropsize();
   mirror_=proto.rgbimage_conf().mirror();
-  int batchsize=static_cast<DataLayer*>(srclayers[0].get())->batchsize();
-  Record sample=static_cast<DataLayer*>(srclayers[0].get())->sample();
+  int batchsize=static_cast<DataLayer*>(srclayers_[0])->batchsize();
+  Record sample=static_cast<DataLayer*>(srclayers_[0])->sample();
   vector<int> shape;
   shape.push_back(batchsize);
   for(int x: sample.image().shape()){
@@ -663,7 +635,7 @@ void RGBImageLayer::Setup(const LayerProto& proto,
 }
 
 /***************Implementation for ShardDataLayer**************************/
-void ShardDataLayer::ComputeFeature(Phase phase, const vector<SLayer>& srclayers){
+void ShardDataLayer::ComputeFeature(Phase phase, Metric* perf){
   if(random_skip_){
     int nskip=rand()%random_skip_;
     LOG(INFO)<<"Random Skip "<<nskip<<" records, there are "<<shard_->Count()
@@ -683,67 +655,55 @@ void ShardDataLayer::ComputeFeature(Phase phase, const vector<SLayer>& srclayers
   }
 }
 
-void ShardDataLayer::Setup(const LayerProto& proto,
-    const vector<SLayer>& srclayers){
+void ShardDataLayer::Setup(const LayerProto& proto, int npartitions) {
+  Layer::Setup(proto, npartitions);
   shard_= std::make_shared<DataShard>(proto.sharddata_conf().path(),
       DataShard::kRead);
   string key;
   shard_->Next(&key, &sample_);
   batchsize_=proto.sharddata_conf().batchsize();
+  if(partition_dim() == 0)
+    batchsize_ /= npartitions;
 
   records_.resize(batchsize_);
   random_skip_=proto.sharddata_conf().random_skip();
 }
 /*******************Implementation of TanhLayer**************************/
-void TanhLayer::Setup(const LayerProto& proto,
-      const vector<SLayer>& srclayers){
-  data_.ReshapeLike(srclayers[0]->data(this));
-  grad_.ReshapeLike(srclayers[0]->grad(this));
+void TanhLayer::Setup(const LayerProto& proto, int npartitions){
+  Layer::Setup(proto, npartitions);
+  data_.ReshapeLike(srclayers_[0]->data(this));
+  grad_.ReshapeLike(srclayers_[0]->grad(this));
 }
 
-void TanhLayer::SetupAfterPartition(const LayerProto& proto,
-      const vector<int> &shape,
-      const vector<SLayer>& srclayers){
-  Setup(proto, srclayers);
-}
-
-
-void TanhLayer::ComputeFeature(Phase phase, const vector<SLayer>& srclayers){
-  Tensor<cpu, 1> data(data_.mutable_cpu_data(), Shape1(data_.count()));
-  Tensor<cpu, 1> src(srclayers[0]->mutable_data(this)->mutable_cpu_data(),
-      Shape1(data_.count()));
+void TanhLayer::ComputeFeature(Phase phase, Metric* perf) {
+  auto data = Tensor1(&data_);
+  auto src = Tensor1(srclayers_[0]->mutable_data(this));
   data=F<op::stanh>(src);
 }
 
-void TanhLayer::ComputeGradient(const vector<SLayer>& srclayers) {
-  Tensor<cpu, 1> data(data_.mutable_cpu_data(), Shape1(data_.count()));
-  Tensor<cpu, 1> grad(grad_.mutable_cpu_data(), Shape1(grad_.count()));
-  Tensor<cpu, 1> gsrc(srclayers[0]->mutable_grad(this)->mutable_cpu_data(),
-      Shape1(data_.count()));
+void TanhLayer::ComputeGradient(Phase phase) {
+  auto data = Tensor1(&data_);
+  auto grad = Tensor1(&grad_);
+  auto gsrc = Tensor1(srclayers_[0]->mutable_grad(this));
   gsrc=F<op::stanh_grad>(data)*grad;
 }
 /************ Implementation for SoftmaxLossLayer ************************/
-void SoftmaxLossLayer::Setup(const LayerProto& proto,
-    const vector<SLayer>& srclayers){
-  CHECK_EQ(srclayers.size(),2);
-  data_.Reshape(srclayers[0]->data(this).shape());
+void SoftmaxLossLayer::Setup(const LayerProto& proto, int npartitions) {
+  LossLayer::Setup(proto, npartitions);
+  CHECK_EQ(srclayers_.size(),2);
+  data_.Reshape(srclayers_[0]->data(this).shape());
   batchsize_=data_.shape()[0];
   dim_=data_.count()/batchsize_;
   topk_=proto.softmaxloss_conf().topk();
   metric_.Reshape(vector<int>{2});
   scale_=proto.softmaxloss_conf().scale();
 }
-void SoftmaxLossLayer::SetupAfterPartition(const LayerProto& proto,
-      const vector<int> &shape,
-      const vector<SLayer>& srclayers){
-  Setup(proto, srclayers);
-}
-void SoftmaxLossLayer::ComputeFeature(Phase phase, const vector<SLayer>& srclayers) {
+void SoftmaxLossLayer::ComputeFeature(Phase phase, Metric* perf) {
   Shape<2> s=Shape2(batchsize_, dim_);
   Tensor<cpu, 2> prob(data_.mutable_cpu_data(), s);
-  Tensor<cpu, 2> src(srclayers[0]->mutable_data(this)->mutable_cpu_data(), s);
+  Tensor<cpu, 2> src(srclayers_[0]->mutable_data(this)->mutable_cpu_data(), s);
   Softmax(prob, src);
-  const float* label=srclayers[1]->data(this).cpu_data();
+  const float* label=srclayers_[1]->data(this).cpu_data();
   const float* probptr=prob.dptr;
   float loss=0, precision=0;
   for(int n=0;n<batchsize_;n++){
@@ -769,14 +729,13 @@ void SoftmaxLossLayer::ComputeFeature(Phase phase, const vector<SLayer>& srclaye
     probptr+=dim_;
   }
   CHECK_EQ(probptr, prob.dptr+prob.shape.Size());
-  float *metric=metric_.mutable_cpu_data();
-  metric[0]=loss*scale_/(1.0f*batchsize_);
-  metric[1]=precision*scale_/(1.0f*batchsize_);
+  perf->Add("loss", loss*scale_/(1.0f*batchsize_));
+  perf->Add("accuracy", precision*scale_/(1.0f*batchsize_));
 }
 
-void SoftmaxLossLayer::ComputeGradient(const vector<SLayer>& srclayers) {
-  const float* label=srclayers[1]->data(this).cpu_data();
-  Blob<float>* gsrcblob=srclayers[0]->mutable_grad(this);
+void SoftmaxLossLayer::ComputeGradient(Phase phase) {
+  const float* label=srclayers_[1]->data(this).cpu_data();
+  Blob<float>* gsrcblob=srclayers_[0]->mutable_grad(this);
   gsrcblob->CopyFrom(data_);
   float* gsrcptr=gsrcblob->mutable_cpu_data();
   for(int n=0;n<batchsize_;n++){

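For reference, the changes above collapse Setup/SetupAfterPartition into a single Setup(proto, npartitions) and thread a Metric* through ComputeFeature, so layers report values via perf->Add() instead of writing into a metric_ blob. Below is a minimal sketch of what a user-defined layer could look like against this interface; FooLayer, the header path, and the chosen mshadow ops are illustrative assumptions, not part of this commit, and registration (a new LayerType enum value plus the RegisterLayer macro) is omitted.

    #include "neuralnet/layer.h"  // assumed location of the Layer base class

    namespace singa {

    class FooLayer: public Layer {    // hypothetical layer, mirrors ReLULayer
     public:
      void Setup(const LayerProto& proto, int npartitions) {
        Layer::Setup(proto, npartitions);
        data_.ReshapeLike(srclayers_[0]->data(this));
        grad_.ReshapeLike(*(srclayers_[0]->mutable_grad(this)));
      }
      void ComputeFeature(Phase phase, Metric* perf) {
        auto data = Tensor1(&data_);
        auto src = Tensor1(srclayers_[0]->mutable_data(this));
        data = F<op::relu>(src);
        if (perf != nullptr)  // loss/metric layers accumulate values here
          perf->Add("foo-norm", data_.asum_data() / data_.count());
      }
      void ComputeGradient(Phase phase) {
        auto data = Tensor1(&data_);
        auto grad = Tensor1(&grad_);
        auto gsrc = Tensor1(srclayers_[0]->mutable_grad(this));
        gsrc = F<op::relu_grad>(data) * grad;
      }
    };

    }  // namespace singa
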
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/9a6e09fa/src/neuralnet/neuralnet.cc
----------------------------------------------------------------------
diff --git a/src/neuralnet/neuralnet.cc b/src/neuralnet/neuralnet.cc
index 2240499..6d82734 100644
--- a/src/neuralnet/neuralnet.cc
+++ b/src/neuralnet/neuralnet.cc
@@ -1,22 +1,19 @@
 #include <algorithm>
 #include <queue>
 
-#include "proto/model.pb.h"
 #include "neuralnet/neuralnet.h"
 #include "utils/singleton.h"
-#include "utils/factory.h"
-#include "utils/graph.h"
-#include "utils/cluster.h"
 
 namespace singa {
 #define LayerT(x) LayerProto_LayerType_k##x
 
 #define RegisterLayer(factory, id) \
-  factory->Register(LayerProto_LayerType_k##id,\
+  factory->Register(LayerProto_LayerType_k##id, \
       CreateInstance(id##Layer, Layer))
 
-void NeuralNet::RegisterLayers(){
-  Factory<Layer>* factory=Singleton<Factory<Layer>>::Instance();
+void NeuralNet::RegisterLayers() {
+  Factory<Layer>* factory = Singleton<Factory<Layer>>::Instance();
+  // FooLayer's type is kFoo, register using Foo
   RegisterLayer(factory, BridgeDst);
   RegisterLayer(factory, BridgeSrc);
   RegisterLayer(factory, Convolution);
@@ -37,402 +34,329 @@ void NeuralNet::RegisterLayers(){
   RegisterLayer(factory, Split);
   RegisterLayer(factory, Tanh);
 }
-shared_ptr<NeuralNet> NeuralNet::SetupNeuralNet(const NetProto& np, Phase phase,
-    int group_size){
+
+shared_ptr<NeuralNet> NeuralNet::Create(
+    const NetProto& conf,
+    Phase phase,
+    int npartitions) {
   NetProto proto;
-  proto.set_partition_type(np.partition_type());
-  // exclude layers if necessary
-  for(auto& layer:np.layer()){
-    bool include=true;
-    for(int x: layer.exclude()){
-      if(x==phase)
-        include=false;
+  proto.CopyFrom(conf);
+  proto.clear_layer();
+  // exclude layers according to phase
+  for (const auto& layer : conf.layer()) {
+    bool include = true;
+    for (auto x : layer.exclude()) {
+      if (x == phase)
+        include = false;
     }
-    if(include){
-      LayerProto* lp=proto.add_layer();
+    if (include) {
+      LayerProto* lp = proto.add_layer();
       lp->CopyFrom(layer);
+      // use the net-level partition_dim if the layer does not set its own
+      if (!lp->has_partition_dim())
+        lp->set_partition_dim(proto.partition_dim());
     }
   }
-  LOG(INFO)<<"NeuralNet config is "<<proto.DebugString();
-  return make_shared<NeuralNet>(proto, group_size);
-}
-NeuralNet::NeuralNet(NetProto net_proto, int group_size) {
-  group_size_=group_size;
-  for(int i=0;i<net_proto.layer_size();i++){
-    LayerProto * layer_proto=net_proto.mutable_layer(i);
-    if(!layer_proto->has_partition_type())
-      layer_proto->set_partition_type(net_proto.partition_type());
-  }
+  LOG(INFO) << "NeuralNet config is\n" << proto.DebugString();
 
-  LOG(INFO)<<"Construct Neural Net...";
-  ConstructNeuralNet(net_proto);
-  {
-    string vis_folder=Cluster::Get()->vis_folder();
-    std::ofstream fout(vis_folder+"/nopartition.json", std::ofstream::out);
-    fout<<ToString();
-    fout.flush();
-    fout.close();
-  }
-  if(group_size_>1){
-    PartitionNeuralNet();
-    string vis_folder=Cluster::Get()->vis_folder();
-    std::ofstream fout(vis_folder+"/partition.json", std::ofstream::out);
-    fout<<ToString();
-    fout.flush();
-    fout.close();
-  }
-  for(auto layer: layers_){
-    DLOG(INFO)<<layer->name();
-  }
-  for(auto& layer: layers_){
-    for(shared_ptr<Param> p: layer->GetParams()){
-      params_.push_back(p);
-    }
-  }
-  LOG(INFO)<<"Neural Net constructed";
-  // init all data members to avoid conflicts from multi-thread access
-  losslayers();
-  paramid2param(0);
-  datalayers();
-  parserlayers();
+  // TODO(wangwei) create net based on net type, e.g., directed, undirected, etc.
+  auto net = std::make_shared<NeuralNet>(proto, npartitions);
+  return net;
 }
 
-void NeuralNet::ConstructNeuralNet(const NetProto& net_proto){
-  // construct graph, one node for one layer, identified by layer name
-  map<string, LayerProto> protos;
-  for (auto &layer_proto : net_proto.layer()){
-    graph_.AddNode(layer_proto.name());
-    protos[layer_proto.name()]=layer_proto;
-  }
-  for (auto &layer_proto : net_proto.layer())
-    if(layer_proto.srclayers_size())
-      for(const string& src: layer_proto.srclayers())
-        graph_.AddEdge(src, layer_proto.name());
+NeuralNet::~NeuralNet() {
+  for (auto layer : layers_)
+    delete layer;
+}
 
-  // topology sort
-  graph_.Sort();
-  //LOG(ERROR)<<"pure graph without partition\n"<< graph_.ToString();
+NeuralNet::NeuralNet(NetProto netproto, int npartitions) {
+  LOG(INFO) << "Constructing Neural Net...";
+  auto graph = CreateGraph(netproto, npartitions);
+  CreateNetFromGraph(graph, npartitions);
+  PrepareDataStructures();
+  for (Node* node : graph->nodes())
+    delete static_cast<LayerProto*>(node->proto);
+  delete graph;
+  LOG(INFO) << "Neural net constructed";
+}
 
-  auto* factory=Singleton<Factory<Layer>>::Instance();
-  // create Layers according to topology order
-  for(SNode node: graph_.nodes()){
-    shared_ptr<Layer> layer(factory->Create(protos[node->name()].type()));
-    layer->Init(protos[node->name()]);
-    name2layer_[node->name()]=layer;
+void NeuralNet::CreateNetFromGraph(Graph* graph, int npartitions) {
+  auto* factory = Singleton<Factory<Layer>>::Instance();
+  // create one layer per node
+  for (Node* node : graph->nodes()) {
+    auto layer = factory->Create(static_cast<LayerProto*>(node->proto)->type());
     layers_.push_back(layer);
+    name2layer_[node->name] = layer;
   }
-
-  // connect Layers.
-  for(SNode node: graph_.nodes()){
-    auto layer=name2layer_[node->name()];
-    for(SNode dst: node->dstnodes())
-      layer->AddDstLayer(name2layer_[dst->name()]);
-    for(SNode src: node->srcnodes())
-      layer->AddSrcLayer(name2layer_[src->name()]);
+  // connect layers
+  for (Node* node : graph->nodes()) {
+    auto layer = name2layer_[node->name];
+    layer->clear_dstlayers();
+    for (Node* dst : node->dstnodes)
+      layer->add_dstlayer(name2layer_[dst->name]);
+    layer->clear_srclayers();
+    for (Node* src : node->srcnodes)
+      layer->add_srclayer(name2layer_[src->name]);
   }
-  // setup layer properties, e.g., shapes
-  int paramid=0;
-  for(auto& layer: layers_){
-      layer->Setup();
-      for(auto param: layer->GetParams())
-        param->set_id(paramid++);
+  // setup layers
+  int paramid = 0;
+  map<string, string> layerinfo;
+  map<string, vector<Layer*>> share_param_layers;
+  for (Node* node : graph->nodes()) {
+    auto layer = name2layer_[node->name];
+    layer->Setup(*(static_cast<LayerProto*>(node->proto)), npartitions);
+    layerinfo[layer->name()] = IntVecToString(layer->data(nullptr).shape());
+    for (auto param : layer->GetParams())
+      param->set_id(paramid++);
+    if (layer->partition_dim() == 0)
+      share_param_layers[node->origin].push_back(layer);
   }
-  LOG(INFO)<<"network graph witout partition\n"<<ToString();
-}
-
-void NeuralNet::PartitionNeuralNet(){
-  graph_=CreatePartitonedGraph(layers_, name2layer_);
-  //DLOG(ERROR)<<"pure graph after partition\n"<<graph_.ToString();
-  map<string, shared_ptr<Layer>> name2layer(name2layer_);
-  map<string, vector<shared_ptr<Layer>>> share_conf_layers;
-  name2layer_.clear();
-  layers_.clear();
-  int gsize=group_size_;
-  auto* factory=Singleton<Factory<Layer>>::Instance();
-  // create Layers according to topology order
-  for(SNode node: graph_.nodes()){
-    LayerProto proto;
-    proto.set_name(node->name());
-    proto.set_partitionid(node->val().partitionid);
-    string origin=node->val().origin;
-    if (origin=="kSlice"){
-      proto.set_type(LayerT(Slice));
-      SliceProto *slice=proto.mutable_slice_conf();
-      slice->set_slice_dimension(node->val().slice_dimension);
-      slice->set_slice_num(node->dstnodes().size());
-    }else if(origin== "kConcate"){
-      proto.set_type(LayerT(Concate));
-      ConcateProto *concate=proto.mutable_concate_conf();
-      concate->set_concate_dimension(node->val().concate_dimension);
-      concate->set_concate_num(node->srcnodes().size());
-    }else if(origin=="kSplit"){
-      proto.set_type(LayerT(Split));
-      SplitProto *split=proto.mutable_split_conf();
-      split->set_num_splits(node->dstnodes().size());
-    }else if(origin=="kBridgeSrc"){
-      proto.set_type(LayerT(BridgeSrc));
-    }else if(origin =="kBridgeDst"){
-      proto.set_type(LayerT(BridgeDst));
-    }else{
-      CHECK(name2layer.find(node->val().origin)!=name2layer_.end())
-        <<"Unkown origin for node "<<node->val().origin;
+  LOG(INFO) << "Neural net structure\n"  << graph->ToJson(layerinfo);
+  // share Params for layers generated from the same origin layer
+  for (auto & entry : share_param_layers) {
+    auto owner = entry.second.begin();
+    auto owner_params = (*owner)->GetParams();
+    for (auto it = owner + 1; it != entry.second.end(); it++) {
+      auto params = (*it)->GetParams();
+      CHECK_EQ(params.size(), owner_params.size());
+      for (size_t i = 0; i < params.size(); i++)
+        params.at(i)->ShareData(owner_params.at(i));
     }
-    shared_ptr<Layer> newlayer;
-    if(proto.has_type()){
-      // layers added due to partition
-      shared_ptr<Layer> layer(factory->Create(proto.type()));
-      layer->Init(proto);
-      newlayer=layer;
-    }else{
-      // partitioned layers from origin neuralnet
-      auto oldlayer=name2layer.at(node->val().origin);
-      vector<int> shape=oldlayer->shape(nullptr);
-      if(oldlayer->partition_type()==kNone){
-        newlayer=oldlayer;
-      } else{
-        int pdim=oldlayer->partition_dimension();
-        shape[pdim]=shape[pdim]/gsize+
-          ((node->val().partitionid==gsize-1)?shape[pdim]%gsize:0);
-        shared_ptr<Layer> layer(factory->Create(oldlayer->type()));
-        layer->Init(*oldlayer, shape);
-        layer->set_name(node->name());
-        newlayer=layer;
-        if(oldlayer->partition_type()==kDataPartition)
-          share_conf_layers[node->val().origin].push_back(newlayer);
-      }
-      newlayer->set_partitionid(node->val().partitionid);
-    }
-    layers_.push_back(newlayer);
-    name2layer_[node->name()]=newlayer;
   }
+}
 
-  // connect Layers.
-  for(SNode node: graph_.nodes()){
-    auto layer=name2layer_[node->name()];
-    layer->ClearDstLayers();
-    for(SNode dst: node->dstnodes())
-      layer->AddDstLayer(name2layer_[dst->name()]);
-    layer->ClearSrcLayers();
-    for(SNode src: node->srcnodes())
-      layer->AddSrcLayer(name2layer_[src->name()]);
-  }
+// add a node for SliceLayer between srcnode and dstnodes
+Node* SliceNode(Graph* graph, Node* srcnode,
+    const vector<Node*>& dstnodes, bool connect_dst) {
+  string name = srcnode->name + "<";
+  LayerProto *proto = new LayerProto();
+  proto->set_name(name);
+  proto->set_type(LayerProto_LayerType_kSlice);
+  proto->set_partition_id(
+      static_cast<LayerProto*>(srcnode->proto)->partition_id());
+  auto conf = proto->mutable_slice_conf();
+  conf->set_slice_dim(
+      static_cast<LayerProto*>(dstnodes[0]->proto)->partition_dim());
+  Node* node = new Node(name, "##" + name, proto->partition_id(), proto);
+  graph->AddNode(node);
+  graph->AddEdge(srcnode, node);
+  if (connect_dst)
+    for (Node* dst : dstnodes)
+      graph->AddEdge(node, dst);
+  return node;
+}
 
-  LOG(INFO)<<"Adjacency matrix\n"<<ToAdjacency();
+// add a node for ConcateLayer between srcnodes and dstnode
+Node* ConcateNodes(Graph* graph, const vector<Node*>& srcnodes, Node* dstnode) {
+  string name = ">" + dstnode->name;
+  LayerProto *proto = new LayerProto();
+  proto->set_name(name);
+  proto->set_type(LayerProto_LayerType_kConcate);
+  proto->set_partition_id(
+      static_cast<LayerProto*>(dstnode->proto)->partition_id());
+  auto conf = proto->mutable_concate_conf();
+  conf->set_concate_dim(
+      static_cast<LayerProto*>(srcnodes[0]->proto)->partition_dim());
+  Node* node = new Node(name, "##" + name, proto->partition_id(), proto);
+  graph->AddNode(node);
+  graph->AddEdge(node, dstnode);
+  for (Node* src : srcnodes)
+    graph->AddEdge(src, node);
+  return node;
+}
 
-  // set up layers after
-  int paramid=0;
-  for(shared_ptr<Layer> layer: layers_){
-    const vector<int>& shape=layer->shape(nullptr);
-    layer->SetupAfterPartition();
-    for(auto param: layer->GetParams())
-      param->set_id(paramid++);
-    const vector<int>& newshape=layer->shape(nullptr);
-    if(shape.size())
-      CHECK(std::equal(shape.begin(),shape.end(),newshape.begin()));
-  }
+// add a node for SplitLayer between srcnode and dstnodes
+Node* SplitNode(Graph* graph, Node* srcnode, const vector<Node*>& dstnodes) {
+  string name = srcnode->name + "+";
+  LayerProto *proto = new LayerProto();
+  proto->set_name(name);
+  proto->set_type(LayerProto_LayerType_kSplit);
+  proto->set_partition_id(
+      static_cast<LayerProto*>(srcnode->proto)->partition_id());
+  Node* node = new Node(name, "##" + name, proto->partition_id(), proto);
+  graph->AddNode(node);
+  graph->AddEdge(srcnode, node);
+  for (Node* dst : dstnodes)
+    graph->AddEdge(node, dst);
+  return node;
+}
 
-  // share Params for layers generated from the same origin layer due to
-  // data partition
-  for(auto & entry: share_conf_layers){
-    auto layers= entry.second;
-    auto owner=layers.begin();
-    auto owner_confs=(*owner)->GetParams();
-    for(auto it=owner+1; it!=layers.end();it++){
-      auto params=(*it)->GetParams();
-      CHECK_EQ(params.size(), owner_confs.size());
-      for(size_t i=0;i<params.size();i++)
-        params.at(i)->ShareData(owner_confs.at(i));
-    }
-  }
-  LOG(INFO)<<"network graph after partition layers\n"<<ToString();
+// add a pair of nodes for BridgeSrcLayer and BridgeDstLayer between srcnode
+// and dstnode
+void BridgeNodes(Graph* graph, Node* srcnode, Node* dstnode) {
+  string sname = srcnode->name + ":-";
+  LayerProto *sproto = new LayerProto();
+  sproto->set_name(sname);
+  sproto->set_type(LayerProto_LayerType_kBridgeSrc);
+  sproto->set_partition_id(
+      static_cast<LayerProto*>(srcnode->proto)->partition_id());
+  auto sbridge = new Node(sname, "##" + sname, sproto->partition_id(), sproto);
+  string dname = "-:" + dstnode->name;
+  LayerProto *dproto = new LayerProto();
+  dproto->set_name(dname);
+  dproto->set_type(LayerProto_LayerType_kBridgeDst);
+  dproto->set_partition_id(
+      static_cast<LayerProto*>(dstnode->proto)->partition_id());
+  auto dbridge = new Node(dname, "##" + dname, dproto->partition_id(), dproto);
+  graph->AddNode(sbridge);
+  graph->AddNode(dbridge);
+  graph->AddEdge(srcnode, sbridge);
+  graph->AddEdge(sbridge, dbridge);
+  graph->AddEdge(dbridge, dstnode);
 }
 
-Graph NeuralNet::CreatePartitonedGraph(const vector<shared_ptr<Layer>>& layers,
-    const map<string, shared_ptr<Layer>>& name2layer){
-  Graph graph;
-  // partition origin nodes/layers
-  map<string, vector<SNode>> layer2nodes; //from name of original layer to nodes
-  int gsize=group_size_;
-  for(const auto& layer: layers){
-    vector<SNode> nodes;
-    if(layer->partition_type()==kDataPartition||
-        layer->partition_type()==kLayerPartition){
+Graph* NeuralNet::CreateGraph(const NetProto& netproto, int npartitions) {
+  Graph *graph = new Graph();
+  // from name of original layer to nodes
+  map<string, vector<Node*>> name2nodes;
+  map<string, const LayerProto*> name2proto;
+  for (const auto& layer : netproto.layer()) {
+    vector<Node*> nodes;
+    int pdim = layer.partition_dim();
+    if (pdim == 0 || pdim == 1) {
       char suffix[4];
-      for(int i=0;i<gsize;i++){
-        sprintf(suffix, "%02d", i);
+      for (int i = 0; i < npartitions; i++) {
+        LayerProto *proto = new LayerProto(layer);
+        snprintf(suffix, sizeof(suffix), "%02d", i);
         // differentiate partitions
-        string nodename=layer->name()+"@"+string(suffix);
-        auto node=graph.AddNode(nodename, LayerInfo{layer->name(), i,-1,-1});
+        string nodename = layer.name() + "@" + string(suffix);
+        proto->set_partition_id(i);
+        auto node = new Node(nodename, layer.name(), i, proto);
+        graph->AddNode(node);
         nodes.push_back(node);
       }
-    }else if(layer->partition_type()==kNone){
-      auto node=graph.AddNode(layer->name(),
-          LayerInfo{layer->name(), 0,-1,-1});
+    } else if (pdim == -1) {
+      LayerProto *proto = new LayerProto(layer);
+      auto node = new Node(layer.name(), layer.name(), 0, proto);
+      graph->AddNode(node);
       nodes.push_back(node);
-    }else{
-      LOG(FATAL)<<"Unknown partition type "<<layer->partition_type();
+    } else {
+      LOG(FATAL) << "Cannot partition layer (" << layer.name() <<") on dim: "
+        << layer.partition_dim();
     }
-    layer2nodes[layer->name()]=nodes;
+    name2nodes[layer.name()] = nodes;
+    name2proto[layer.name()] = &layer;
   }
 
-  // connect nodes, nodes for ConcateLayer and SliceLayer are added.
-  for(shared_ptr<Layer> layer: layers){
-    string name=layer->name();
-    PartitionType type=layer->partition_type();
-    const vector<SNode>& nodes=layer2nodes.at(name);
-    for(int srcid=0;srcid<layer->srclayers_size();srcid++){
-      shared_ptr<Layer> srclayer=layer->srclayers()[srcid];
-      string srcname=srclayer->name();
-      const vector<SNode> srcnodes=layer2nodes.at(srcname);
-      PartitionType srctype=srclayer->partition_type();
-      ConnectionType connection=layer->connection_type(srcid);
-      if(srctype==kNone){
-        CHECK_EQ(srcnodes.size(),1)
-          <<"local layer "<<srcname<<" should not be partitioned";
-        SNode srcnode=srcnodes[0];
-        if(type==kDataPartition||(type==kLayerPartition&&connection==kOneToOne)){
-          LayerInfo info=srcnode->val();
-          info.slice_dimension=name2layer.at(name)->partition_dimension();
-          graph.InsertSliceNode(srcnode, nodes, info);
-        } else if(type==kNone){
-          CHECK_EQ(nodes.size(),1)
-            <<"local layer "<<name<<" should not be nodeed";
-          graph.AddEdge(srcnode, nodes[0]);
-        } else { // type==kLayerPartition&&connection==kOneToAll
-          graph.InsertSplitNode(srcnode, nodes);
-        }
-      }else if((type==kNone
-                &&(srctype==kDataPartition||srctype==kLayerPartition))
-               ||(type==kLayerPartition&&connection==kOneToAll&&
-                  (srctype==kDataPartition||srctype==kLayerPartition))){
+  // connect nodes, nodes for ConcateLayer, SliceLayer and SplitLayer are added.
+  auto* factory = Singleton<Factory<Layer>>::Instance();
+  for (const auto& layerproto : netproto.layer()) {
+    string name = layerproto.name();
+    int pdim = layerproto.partition_dim();
+    const vector<Node*>& nodes = name2nodes.at(name);
+    for (auto srcname : layerproto.srclayers()) {
+      const vector<Node*>& srcnodes = name2nodes.at(srcname);
+      // TODO(wangwei): consider the type of each connection
+      auto *layer = factory->Create(layerproto.type());
+      ConnectionType connection = layer->src_neuron_connection(0);
+      delete layer;
+      int src_pdim = name2proto[srcname]->partition_dim();
+      // no partition of src layer
+      if (src_pdim == -1) {
+        Node* srcnode = srcnodes[0];
+        if (pdim == 0 || (pdim == 1 && connection == kOneToOne))
+          SliceNode(graph, srcnode, nodes, true);
+        else if (pdim == -1)
+          graph->AddEdge(srcnode, nodes[0]);
+        else  // pdim == 1 && connection == kOneToAll
+          SplitNode(graph, srcnode, nodes);
+      } else if ((pdim == -1 && (src_pdim == 0 || src_pdim == 1))
+          ||(pdim == 1 && connection == kOneToAll && src_pdim == 0)) {
         // copy/concate the whole srclayer for every dst partition
-        for(SNode node:nodes){
-          LayerInfo info=node->val();
-          info.concate_dimension=name2layer.at(srcname)->partition_dimension();
-          CHECK_GE(info.concate_dimension,0);
-          graph.InsertConcateNode(srcnodes, node, info);
-        }
-      }else if((srctype==kLayerPartition&&type==kDataPartition)
-          || (srctype==kDataPartition&&type==kLayerPartition)){
+        for (Node* node : nodes)
+          ConcateNodes(graph, srcnodes, node);
+      } else if ((src_pdim == 1 && pdim == 0) || (src_pdim == 0 && pdim == 1)) {
         // the most complex scenario
-        vector<SNode> slicenodes;
-        for(SNode srcnode: srcnodes){
-          LayerInfo info=srcnode->val();
-          info.slice_dimension=name2layer.at(name)->partition_dimension();
-          slicenodes.push_back(graph.InsertSliceNode(srcnode, nodes,
-              info, false));
-        }
-        for(SNode node: nodes){
-          LayerInfo info=node->val();
-          info.concate_dimension=name2layer.at(srcname)->partition_dimension();
-          CHECK_GE(info.concate_dimension,0);
-          graph.InsertConcateNode(slicenodes, node, info);
-        }
-      }else if((srctype==kDataPartition&&type==kDataPartition)||
-          (srctype==kLayerPartition&&type==kLayerPartition&&
-           layer->connection_type(srcid)==kOneToOne)){
+        vector<Node*> slicenodes;
+        for (Node* srcnode : srcnodes)
+          slicenodes.push_back(SliceNode(graph, srcnode, nodes, false));
+        for (Node* node : nodes)
+          ConcateNodes(graph, slicenodes, node);
+      } else if ((src_pdim == 0 && pdim == 0)||
+          (src_pdim == 1 && pdim == 1 && connection == kOneToOne)) {
         CHECK_EQ(srcnodes.size(), nodes.size());
-        for(size_t i=0;i<srcnodes.size();i++){
-          graph.AddEdge(srcnodes[i], nodes[i]);
-        }
+        for (size_t i = 0; i < srcnodes.size(); i++)
+          graph->AddEdge(srcnodes[i], nodes[i]);
       }
     }
   }
   // must do topology sort, because we have added new nodes.
-  graph.Sort();
-  //LOG(ERROR)<<graph.ToString();
+  graph->Sort();
 
-  // add node for split layer
-  bool data_node=true;
-  vector<SNode> oldnodes=graph.nodes();
-  for(SNode node: oldnodes){
-    if(node->dstnodes_size()>1&&node->val().origin!="kSlice"
-        &&node->val().origin!="kSplit"&&!data_node){
-      vector<SNode> dstnodes=node->dstnodes();
-      for(SNode dst: dstnodes)
-        graph.RemoveEdge(node, dst);
-      graph.InsertSplitNode(node, dstnodes);
+  // add nodes for SplitLayer
+  vector<Node*> oldnodes = graph->nodes();
+  for (Node* node : oldnodes) {
+    auto layer = factory->Create(static_cast<LayerProto*>(node->proto)->type());
+    if (node->dstnodes.size() > 1
+        && layer->dst_layer_connection() == kOneToOne) {
+      vector<Node*> dstnodes = node->dstnodes;
+      for (Node* dst : dstnodes)
+        graph->RemoveEdge(node, dst);
+      SplitNode(graph, node, dstnodes);
     }
-    data_node=false;
+    delete layer;
   }
 
-  // add bridge
-  oldnodes=graph.nodes();
-  for(SNode node: oldnodes){
-    vector<SNode> dstnodes=node->dstnodes();
-    for(size_t i=0;i<dstnodes.size();i++){
-      SNode dstnode=dstnodes.at(i);
-      if(node->val().partitionid!=dstnode->val().partitionid){
-        graph.RemoveEdge(node, dstnode);
-        graph.InsertBridgeNode(node, dstnode);
+  // add nodes for bridge layers
+  for (Node* node : oldnodes) {
+    vector<Node*> dstnodes = node->dstnodes;
+    auto pid1 = static_cast<LayerProto*>(node->proto)->partition_id();
+    for (size_t i = 0; i < dstnodes.size(); i++) {
+      Node* dstnode = dstnodes.at(i);
+      auto pid2 = static_cast<LayerProto*>(dstnode->proto)->partition_id();
+      if (pid1 != pid2) {
+        graph->RemoveEdge(node, dstnode);
+        BridgeNodes(graph, node, dstnode);
       }
     }
   }
-  graph.Sort();
+  graph->Sort();
+  DLOG(INFO) << "Pure graph structure\n" << graph->ToJson();
   return graph;
 }
 
-std::string NeuralNet::ToString(){
-  map<string, string> info;
-  for(auto layer: layers_){
-    info[layer->name()]=IntVecToString(layer->shape(nullptr));
-  }
-  return graph_.ToString(info);
-}
-
-std::string NeuralNet::ToAdjacency(){
-  string disp="";
-  for(auto& layer: layers_){
-    disp+=layer->name()+": ";
-    for(const auto& dst: layer->dstlayers())
-      disp+=dst->name()+", ";
-    disp+="\n";
-  }
-  return disp;
-}
-
 
-void NeuralNet::ToProto(NetProto *proto, bool copyData) {
-  proto->clear_layer();
-}
+void NeuralNet::PrepareDataStructures() {
+  parserlayers_.clear();
+  losslayers_.clear();
+  datalayers_.clear();
+  params_.clear();
+  paramid2param_.clear();
+  name2layer_.clear();
 
-string NeuralNet::DebugInfo(){
-  string ret;
-  char display[4096];
-  for(auto& layer: layers_){
-    if(!layer->is_datalayer()){
-      sprintf(display, "Forward layer  %10s data norm1 %13.9f\n",
-          layer->name().c_str(), layer->data(nullptr).asum_data());
-      ret+=string(display);
-    }
-  }
-  for (auto it = layers_.rbegin(); it != layers_.rend(); it++){
-    shared_ptr<Layer> layer=*it;
-    if(!(layer->is_datalayer()||layer->is_losslayer()||layer->is_parserlayer())){
-      sprintf(display, "Backward layer %10s grad norm1 %13.9f\n",
-          layer->name().c_str(), layer->grad(nullptr).asum_data());
-      ret+=string(display);
+  for (auto& layer : layers_) {
+    name2layer_[layer->name()] = layer;
+    if (layer->is_parserlayer())
+      parserlayers_.push_back(static_cast<ParserLayer*>(layer));
+    if (layer->is_losslayer())
+      losslayers_.push_back(static_cast<LossLayer*>(layer));
+    if (layer->is_datalayer())
+      datalayers_.push_back(static_cast<DataLayer*>(layer));
+    for (Param* p : layer->GetParams()) {
+      paramid2param_[p->id()] = p;
+      params_.push_back(p);
     }
   }
-  for(auto& layer: layers_){
-    for(auto param: layer->GetParams()){
-      sprintf(display, "Layer %10s, param id %2d, name %10s,\
-          value norm1 %13.9f, grad norm1 %13.9f\n",
-          layer->name().c_str(), param->id(), param->name().c_str(),
-          param->data().asum_data(), param->grad().asum_data());
-      ret+=string(display);
-    }
+}
+std::string NeuralNet::ToAdjacency() {
+  string disp = "";
+  for (auto& layer : layers_) {
+    disp += layer->name()+": ";
+    for (const auto& dst : layer->dstlayers())
+      disp += dst->name()+", ";
+    disp += "\n";
   }
-  return ret;
+  return disp;
 }
-void NeuralNet::ShareParams(shared_ptr<NeuralNet> other, int flag){
-  for(auto& layer: layers_){
-    auto otherlayer=other->name2layer(layer->name());
-    if(otherlayer!=nullptr){
-      const auto& otherparams=otherlayer->GetParams();
-      const auto& params=layer->GetParams();
+
+void NeuralNet::ShareParams(shared_ptr<NeuralNet> other) {
+  for (auto& layer : layers_) {
+    auto otherlayer = other->name2layer(layer->name());
+    if (otherlayer != nullptr) {
+      const auto& otherparams = otherlayer->GetParams();
+      const auto& params = layer->GetParams();
       CHECK_EQ(params.size(), otherparams.size());
-      for(size_t i=0;i<params.size();i++){
+      for (size_t i = 0; i < params.size(); i++) {
         params[i]->ShareData(otherparams[i]);
       }
     }

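The partitioning path above is driven entirely by partition_dim: -1 leaves a layer unpartitioned, 0 splits it on the batch dimension, 1 splits it on the feature dimension, and a layer without its own partition_dim inherits the net-level value in Create(). A sketch of how a caller might drive the new API follows; the layer types are taken from the registered list above, the names and values are made up, and the layer-specific *_conf messages are omitted, so this is a shape sketch rather than a runnable configuration.

    NetProto conf;
    conf.set_partition_dim(0);          // default: partition every layer on the batch dimension
    LayerProto* conv = conf.add_layer();
    conv->set_name("conv1");
    conv->set_type(LayerProto_LayerType_kConvolution);
    LayerProto* act = conf.add_layer();
    act->set_name("tanh1");
    act->set_type(LayerProto_LayerType_kTanh);
    act->add_srclayers("conv1");
    act->set_partition_dim(1);          // per-layer override of the net-level setting

    // create a training net with 4 partitions per worker group; CreateGraph() inserts
    // Slice/Concate/Split/Bridge nodes wherever the partition settings differ
    auto net = NeuralNet::Create(conf, kTrain, 4);
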
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/9a6e09fa/src/proto/common.proto
----------------------------------------------------------------------
diff --git a/src/proto/common.proto b/src/proto/common.proto
index 70b743c..256206c 100644
--- a/src/proto/common.proto
+++ b/src/proto/common.proto
@@ -38,6 +38,7 @@ message BlobProtos {
 enum ConnectionType {
   kOneToOne = 0;
   kOneToAll = 1;
+  kOneToMany = 2;
 }
 
 // to import caffe's lmdb dataset
@@ -79,3 +80,9 @@ message SingleLabelImageRecord {
   optional bytes pixel = 3;
   repeated float data = 4;
 }
+
+message MetricProto {
+  repeated string name = 1;
+  repeated int32 count = 2;
+  repeated float val = 3;
+}

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/9a6e09fa/src/proto/model.proto
----------------------------------------------------------------------
diff --git a/src/proto/model.proto b/src/proto/model.proto
index 4256491..a8de5d5 100644
--- a/src/proto/model.proto
+++ b/src/proto/model.proto
@@ -7,6 +7,8 @@ enum Phase {
   kPositive = 3;
   // negative phase for contrastive divergence algorithm
   kNegative = 4;
+  kForward = 5;
+  kBackward = 6;
 }
 
 message ModelProto {
@@ -58,7 +60,7 @@ message ModelProto {
 message NetProto {
   repeated LayerProto layer = 1;
   // partitioning type for parallelism
-  optional PartitionType partition_type = 3 [default = kNone];
+  optional int32 partition_dim = 2 [default = -1];
 }
 
 // weight matrix should be defined before bias vector
@@ -99,7 +101,7 @@ message ParamProto {
   // multiplied on the global weight decay.
   optional float weight_decay_multiplier = 16 [default = 1];
   // partition dimension, -1 for no partition
-  optional int32 partition_dim = 30 [default = -1];
+  optional int32 partition_dim = 30;
   // usually, the program will infer the param shape
   repeated int32 shape = 31;
 
@@ -185,15 +187,15 @@ message LayerProto {
   optional SplitProto split_conf = 42;
   // configuration for tanh layer
   optional TanhProto tanh_conf = 43;
-  // partition type which overrides the partition type for neural net
-  optional PartitionType partition_type = 59;
+
+
+  // overrides the neural net's partition_dim
+  optional int32 partition_dim = 59 [default = -1];
   optional string datablob = 58 [default = "unknow"];
 
   // names of parameters shared from other layers
   repeated string share_param = 60;
-  // TODO(wangwei): make location ID an array
-  optional int32 locationid = 61 [default = 0];
-  optional int32 partitionid = 62 [default = 0];
+  optional int32 partition_id = 62 [default = 0];
 }
 
 message RGBImageProto {
@@ -246,9 +248,7 @@ message ConvolutionProto {
 
 message ConcateProto {
   // on which dimension, starts from 0
-  required int32 concate_dimension = 1;
-  // concatenate offset
-  optional int32 concate_num = 30;
+  required int32 concate_dim = 1;
 }
 
 message DataProto {
@@ -328,8 +328,7 @@ message PoolingProto {
 }
 
 message SliceProto{
-  required int32 slice_dimension=1;
-  required int32 slice_num=2;
+  required int32 slice_dim = 1;
 }
 
 message ReLUProto {

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/9a6e09fa/src/trainer/server.cc
----------------------------------------------------------------------
diff --git a/src/trainer/server.cc b/src/trainer/server.cc
index 42d6a79..cbb0ee1 100644
--- a/src/trainer/server.cc
+++ b/src/trainer/server.cc
@@ -36,7 +36,7 @@ void Server::Run(){
   ping->add_frame("PING", 4);
   ping->set_type(kConnect);
   dealer_->Send(&ping);
-  vector<shared_ptr<Param>> master_params;
+  vector<Param*> master_params;
   size_t syncEntry=0;
   //start recv loop and process requests
   while (true){
@@ -121,13 +121,13 @@ void Server::Run(){
 Msg* Server::HandlePut(Msg **msg){
   int version=(*msg)->trgt_third();
   int pid=(*msg)->trgt_second();
-  shared_ptr<Param> param=nullptr;
+  Param* param=nullptr;
   if(shard_->find(pid)!=shard_->end()){
     LOG(ERROR)<<"Param ("<<pid<<") is put more than once";
     param=shard_->at(pid);
   }else{
     auto factory=Singleton<Factory<Param>>::Instance();
-    param=shared_ptr<Param>(factory ->Create("Param"));
+    param=factory->Create("Param");
     (*shard_)[pid]=param;
   }
   auto response=param->HandlePutMsg(msg);
@@ -147,7 +147,7 @@ Msg* Server::HandlePut(Msg **msg){
   return response;
 }
 
-Msg* Server::HandleGet(shared_ptr<Param> param, Msg **msg){
+Msg* Server::HandleGet(Param* param, Msg **msg){
   if(param->version()<(*msg)->trgt_third())
     return *msg;
   else{
@@ -158,7 +158,7 @@ Msg* Server::HandleGet(shared_ptr<Param> param, Msg **msg){
   }
 }
 
-Msg* Server::HandleUpdate(shared_ptr<Param> param, Msg **msg) {
+Msg* Server::HandleUpdate(Param* param, Msg **msg) {
   auto* tmp=static_cast<Msg*>((*msg)->CopyAddr());
   tmp->SwapAddr();
   int paramid=(*msg)->trgt_first();
@@ -174,7 +174,7 @@ Msg* Server::HandleUpdate(shared_ptr<Param> param, Msg **msg) {
   return response;
 }
 
-Msg* Server::HandleSyncRequest(shared_ptr<Param> param, Msg **msg){
+Msg* Server::HandleSyncRequest(Param* param, Msg **msg){
   Msg* response=nullptr;
   auto shape=Shape1(param->size());
   CHECK_EQ((*msg)->frame_size(), param->size()*sizeof(float));

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/9a6e09fa/src/trainer/trainer.cc
----------------------------------------------------------------------
diff --git a/src/trainer/trainer.cc b/src/trainer/trainer.cc
index ce135cc..f4e52a6 100644
--- a/src/trainer/trainer.cc
+++ b/src/trainer/trainer.cc
@@ -7,14 +7,15 @@
 #include "proto/common.pb.h"
 #include "trainer/trainer.h"
 #include "mshadow/tensor.h"
+
+namespace singa {
 using std::vector;
 using std::map;
 using namespace std::chrono;
+using std::make_shared;
 
 typedef std::chrono::milliseconds TimeT;
 
-namespace singa {
-
 void Trainer::RegisterDefaultClasses(const singa::ModelProto& proto){
   // register all layers appearing in the neural net
   singa::NeuralNet::RegisterLayers();
@@ -33,8 +34,8 @@ void HandleWorkerFinish(void * ctx){
   hctx->dealer->Send(&msg);
 }
 
-const std::unordered_map<int, vector<std::pair<int, int>>> SliceParams(int num,
-    const vector<shared_ptr<Param>>& params){
+const std::unordered_map<int, vector<std::pair<int, int>>>
+SliceParams(int num, const vector<Param*>& params){
   std::unordered_map<int, vector<std::pair<int, int>>> paramid2slices;
   if (num==0)
     return paramid2slices;
@@ -114,15 +115,15 @@ const vector<int> PartitionSlice(int num, const vector<int>& slices){
       previd=slice2box[i];
     } else
       disp+=" "+std::to_string(slices[i]);
-  LOG(INFO)<<"partition slice (av ="<<avg<<", num="<<num<<"):"<<disp;
+  LOG(INFO)<<"partition slice (avg ="<<avg<<", num="<<num<<"):"<<disp;
   return slice2box;
 }
-vector<shared_ptr<Server>> Trainer::CreateServers(int nthreads,
+vector<Server*> Trainer::CreateServers(int nthreads,
     const ModelProto & mproto,
     const vector<int> slices,
     vector<HandleContext*>* ctx){
   auto cluster=Cluster::Get();
-  vector<shared_ptr<Server>> servers;
+  vector<Server*> servers;
   if(!cluster->has_server())
     return servers;
 
@@ -139,7 +140,7 @@ vector<shared_ptr<Server>> Trainer::CreateServers(int nthreads,
     auto dealer=make_shared<Dealer>();
     dealer->Connect(kInprocRouterEndpoint);
     for(int sid=start;sid<end;sid++){
-      auto server=make_shared<Server>(nthreads++, gid, sid);
+      auto server=new Server(nthreads++, gid, sid);
       server->Setup(mproto.updater(), server_shard_, slice2group);
       servers.push_back(server);
       auto *hc=new HandleContext{dealer, gid, sid};
@@ -151,20 +152,20 @@ vector<shared_ptr<Server>> Trainer::CreateServers(int nthreads,
   return servers;
 }
 
-vector<shared_ptr<Worker>> Trainer::CreateWorkers(int nthreads,
+vector<Worker*> Trainer::CreateWorkers(int nthreads,
     const ModelProto& mproto, vector<int> *slice_size){
   auto cluster=Cluster::Get();
-  auto net=NeuralNet::SetupNeuralNet(mproto.neuralnet(), kTrain,
+  auto net=NeuralNet::Create(mproto.neuralnet(), kTrain,
       cluster->nworkers_per_group());
   int lcm=LeastCommonMultiple(cluster->nserver_groups(), cluster->nservers_per_group());
   auto paramid2slices=SliceParams(lcm, net->params()); // sliceid, size
   for(auto param: net->params()){
-    if(param->id()==param->owner())
+    if(param->id() == param->owner())
       for(auto entry: paramid2slices[param->id()])
         slice_size->push_back(entry.second);
   }
 
-  vector<shared_ptr<Worker>> workers;
+  vector<Worker*> workers;
   if(!cluster->has_worker())
     return workers;
   //LOG(ERROR)<<net->ToString();
@@ -191,33 +192,33 @@ vector<shared_ptr<Worker>> Trainer::CreateWorkers(int nthreads,
     if(gid==gstart)
       train_net=net;
     else{
-      train_net=NeuralNet::SetupNeuralNet(mproto.neuralnet(), kTrain,
+      train_net=NeuralNet::Create(mproto.neuralnet(), kTrain,
           cluster->nworkers_per_group());
       // the train net for other groups may share parameter values from the
       // first group
       if(cluster->share_memory())
-        train_net->ShareParams(net, kValueOnly);
+        train_net->ShareParams(net);
     }
     if(gid==0){
       // validation and test are performed only by the first group
       if(mproto.test_steps()){
-        test_net=NeuralNet::SetupNeuralNet(mproto.neuralnet(), kTest,
+        test_net=NeuralNet::Create(mproto.neuralnet(), kTest,
             cluster->nworkers_per_group());
         if(test_net!=nullptr)
-          test_net->ShareParams(train_net, kValueOnly);
+          test_net->ShareParams(train_net);
       }
       if(mproto.validation_steps()){
-        validation_net=NeuralNet::SetupNeuralNet(mproto.neuralnet(), kValidation,
+        validation_net=NeuralNet::Create(mproto.neuralnet(), kValidation,
             cluster->nworkers_per_group());
         if(validation_net!=nullptr)
-          validation_net->ShareParams(train_net, kValueOnly);
+          validation_net->ShareParams(train_net);
       }
     }
     // create ServerShard for the workers
     auto shard=make_shared<WorkerShard>();
     worker_shards_[gid]=shard;
     for(auto layer: train_net->layers()){
-      int procsid=cluster->ProcsIDOf(gid, layer->partitionid(), kWorkerLayer);
+      int procsid=cluster->ProcsIDOf(gid, layer->partition_id(), kWorkerLayer);
       bool local=procsid==cluster->procs_id();
       for(auto param: layer->GetParams()){
         for(auto entry :paramid2slices[param->owner()]){
@@ -232,9 +233,9 @@ vector<shared_ptr<Worker>> Trainer::CreateWorkers(int nthreads,
       }
     }
     for(int wid=wstart;wid<wend;wid++){
-      shared_ptr<Worker> worker=nullptr;
+      Worker* worker=nullptr;
       if(mproto.alg()==ModelProto_GradCalcAlg_kBackPropagation)
-        worker=make_shared<BPWorker>(nthreads++,gid, wid);
+        worker = new BPWorker(nthreads++,gid, wid);
       else{
         // TODO add CDWorker
       }
@@ -267,13 +268,13 @@ void Trainer::Start(const ModelProto& mproto, const ClusterProto& cproto,
   int nthreads=1;
   // create workers
   vector<int> slices;
-  vector<shared_ptr<Worker>> workers=CreateWorkers(nthreads, mproto, &slices);
+  vector<Worker*> workers=CreateWorkers(nthreads, mproto, &slices);
   if(cluster->nserver_groups()&&cluster->nservers_per_group())
     slice2server_=PartitionSlice(cluster->nservers_per_group(), slices);
   nthreads+=workers.size();
   // create servers
   vector<HandleContext*> ctx;
-  vector<shared_ptr<Server>> servers=CreateServers(nthreads, mproto, slices,
+  vector<Server*> servers=CreateServers(nthreads, mproto, slices,
       &ctx);
 
 #ifdef USE_MPI
@@ -283,14 +284,18 @@ void Trainer::Start(const ModelProto& mproto, const ClusterProto& cproto,
 #endif
   vector<std::thread> threads;
   for(auto server: servers)
-    threads.push_back(std::thread(&Server::Run,server.get()));
+    threads.push_back(std::thread(&Server::Run,server));
   for(auto worker: workers)
-    threads.push_back(std::thread(&Worker::Run,worker.get()));
+    threads.push_back(std::thread(&Worker::Run,worker));
   Run(workers, servers);
   for(auto& thread: threads)
     thread.join();
   for(auto x: ctx)
     delete x;
+  for(auto x : servers)
+    delete x;
+  for(auto x : workers)
+    delete x;
 }
 
 inline int bandwidth(int bytes, system_clock::time_point start){
@@ -299,8 +304,8 @@ inline int bandwidth(int bytes, system_clock::time_point start){
   return static_cast<int>(bytes*1000.f/duration.count());
 }
 
-void Trainer::Run(const vector<shared_ptr<Worker>>& workers,
-    const vector<shared_ptr<Server>>& servers){
+void Trainer::Run(const vector<Worker*>& workers,
+    const vector<Server*>& servers){
   auto cluster=Cluster::Get();
   procs_id_=cluster->procs_id();
   LOG(INFO)<<"Stub in process "<<procs_id_<<" starts";
@@ -364,8 +369,8 @@ void Trainer::Run(const vector<shared_ptr<Worker>>& workers,
             string prefix((char*)msg->frame_data(), msg->frame_size());
             msg->next_frame();
             Metric cur;
-            cur.ParseString(string((char*)msg->frame_data(), msg->frame_size()));
-            LOG(ERROR)<<prefix<<" step-" <<step<<", "<<cur.ToString();
+            cur.ParseFrom(string((char*)msg->frame_data(), msg->frame_size()));
+            LOG(ERROR)<<prefix<<" step-" <<step<<", "<<cur.ToLogString();
           }
           DeleteMsg(&msg);
         }else if(cluster->nserver_groups()>0){

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/9a6e09fa/src/trainer/worker.cc
----------------------------------------------------------------------
diff --git a/src/trainer/worker.cc b/src/trainer/worker.cc
index a92ba2c..80a6283 100644
--- a/src/trainer/worker.cc
+++ b/src/trainer/worker.cc
@@ -8,8 +8,10 @@
 #include "utils/factory.h"
 #include "trainer/worker.h"
 #include "proto/model.pb.h"
-using std::thread;
 namespace singa {
+using std::thread;
+using std::make_shared;
+
 Worker::Worker(int thread_id, int group_id, int worker_id):
   thread_id_(thread_id), group_id_(group_id), worker_id_(worker_id){
 }
@@ -52,7 +54,7 @@ void Worker::Run(){
   dealer_=make_shared<Dealer>(2*thread_id_);
   ConnectStub(dealer_, kWorkerParam);
   for(auto layer: train_net_->layers())
-    if(layer->partitionid()==worker_id_)
+    if(layer->partition_id()==worker_id_)
       if(layer->is_bridgedstlayer()||layer->is_bridgesrclayer()){
         layer_dealer_=make_shared<Dealer>(2*thread_id_+1);
         ConnectStub(layer_dealer_, kWorkerLayer);
@@ -61,7 +63,7 @@ void Worker::Run(){
   step_=modelproto_.step();
   // init params
   for(auto layer: train_net_->layers()){
-    if(layer->partitionid()==worker_id_)
+    if(layer->partition_id()==worker_id_)
       for(auto param: layer->GetParams()){
         // only owners fill the memory of parameter values.
         // others share the memory with owners hence do not need to put/get.
@@ -79,7 +81,7 @@ void Worker::Run(){
     for(step_=0;step_<modelproto_.warmup_steps();step_++)
       RunOneBatch(step_, &perf);
     for(auto layer: train_net_->layers()){
-      if(layer->partitionid()==worker_id_)
+      if(layer->partition_id()==worker_id_)
         for(auto param: layer->GetParams())
           if(param->owner()==param->id())
             Put(param, step_);
@@ -107,7 +109,7 @@ void Worker::Stop(){
   msg->set_type(kStop);
   dealer_->Send(&msg); // use param dealer to send the stop msg
 }
-int Worker::Put(shared_ptr<Param> param, int step){
+int Worker::Put(Param* param, int step){
   Msg* msg=new Msg();
   msg->set_src(group_id_, worker_id_, kWorkerParam);
   msg->set_dst(-1, -1, kStub);
@@ -116,7 +118,7 @@ int Worker::Put(shared_ptr<Param> param, int step){
   dealer_->Send(&msg);
   return 1;
 }
-int Worker::Get(shared_ptr<Param> param, int step){
+int Worker::Get(Param* param, int step){
   Msg* msg=new Msg();
   msg->set_src(group_id_, worker_id_, kWorkerParam);
   msg->set_dst(-1, -1, kStub);
@@ -125,7 +127,7 @@ int Worker::Get(shared_ptr<Param> param, int step){
   dealer_->Send(&msg);
   return 1;
 }
-int Worker::Update(shared_ptr<Param> param, int step){
+int Worker::Update(Param* param, int step){
   param->set_local_version(param->version());
   if(updater_){
     updater_->Update(step, param);
@@ -144,30 +146,29 @@ int Worker::Update(shared_ptr<Param> param, int step){
 int Worker::CollectAll(shared_ptr<NeuralNet> net, int step){
   auto& layers=net->layers();
   for(auto& layer: layers){
-    if(layer->partitionid()==worker_id_)
-      for(shared_ptr<Param> p: layer->GetParams()){
+    if(layer->partition_id()==worker_id_)
+      for(Param* p: layer->GetParams()){
         Collect(p, step);
       }
   }
   return 1;
 }
-int Worker::Collect(shared_ptr<Param> param, int step){
+int Worker::Collect(Param* param, int step){
   while(param->version()<=param->local_version()){
     std::this_thread::sleep_for(std::chrono::milliseconds(kCollectSleepTime));
   }
   return 1;
 }
-const void Worker::DisplayPerformance(const Metric & perf, const string& prefix){
+void Worker::DisplayPerformance(const string& prefix, const Metric & perf) {
   Msg* msg=new Msg();
   msg->set_src(group_id_, worker_id_, kWorkerParam);
   msg->set_dst(-1,-1, kStub);
   msg->set_type(kMetric);
   msg->set_trgt(step_,0,0);
-  const string disp=perf.ToString();
   msg->add_frame(prefix.c_str(), prefix.length());
+  const string disp = perf.ToString();
   msg->add_frame(disp.c_str(), disp.length());
   dealer_->Send(&msg);
-  //LOG(ERROR)<<prefix<<" "<<perf.ToString();
 }
 
 void Worker::RunOneBatch(int step, Metric* perf){
@@ -184,10 +185,8 @@ void Worker::RunOneBatch(int step, Metric* perf){
   TrainOneBatch(step, perf);
   //LOG(ERROR)<<"Train "<<step;
   if(perf!=nullptr){
-    perf->Inc();
     if(DisplayNow(step)){
-      //perf->Avg();
-      DisplayPerformance(*perf, "Train");
+      DisplayPerformance("Train", *perf);
       perf->Reset();
     }
   }
@@ -208,13 +207,12 @@ void Worker::Test(int nsteps, Phase phase, shared_ptr<NeuralNet> net){
   Metric perf;
   for(int step=0;step<nsteps;step++){
     TestOneBatch(step, phase, net, &perf);
-    perf.Inc();
   }
   //perf.Avg();
   if(phase==kValidation)
-    DisplayPerformance(perf, "Validation");
+    DisplayPerformance("Validation", perf);
   else if (phase==kTest)
-    DisplayPerformance(perf, "Test");
+    DisplayPerformance("Test", perf);
 }
 
 /****************************BPWorker**********************************/
@@ -223,19 +221,20 @@ BPWorker::BPWorker(int thread_id, int group_id, int worker_id):
   Worker(thread_id, group_id, worker_id){
 }
 
-void BPWorker::Forward(int step, Phase phase, shared_ptr<NeuralNet> net){
+void BPWorker::Forward(int step, Phase phase, shared_ptr<NeuralNet> net,
+    Metric* perf){
   auto& layers=net->layers();
   for(auto& layer: layers){
-    if(layer->partitionid()==worker_id_){
+    if(layer->partition_id()==worker_id_){
       if(layer->is_bridgedstlayer()){
-        auto* dst=static_cast<BridgeDstLayer*>(layer.get());
+        auto* dst=static_cast<BridgeDstLayer*>(layer);
         while(!dst->ready()){
           auto msg=layer_dealer_->Receive();
           CHECK_EQ(msg->src_first(), group_id_);
           string name((char*)msg->frame_data(), msg->frame_size());
           auto tmp=net->name2layer(name);
           CHECK(tmp->is_bridgedstlayer());
-          auto* dstlayer=static_cast<BridgeDstLayer*>(tmp.get());
+          auto* dstlayer=static_cast<BridgeDstLayer*>(tmp);
           auto data=dstlayer->mutable_data(nullptr);
           msg->next_frame();
           memcpy(data->mutable_cpu_data(), msg->frame_data(), msg->frame_size());
@@ -244,28 +243,25 @@ void BPWorker::Forward(int step, Phase phase, shared_ptr<NeuralNet> net){
         }
       }
       if(phase==kTrain){
-        for(shared_ptr<Param> p: layer->GetParams()){
+        for(Param* p: layer->GetParams()){
           Collect(p, step);
         }
       }
       //clock_t s=clock();
-      layer->ComputeFeature(phase);
+      layer->ComputeFeature(phase, perf);
       //LOG(ERROR)<<layer->name()<<":"<<(clock()-s)*1.0/CLOCKS_PER_SEC;
       if(layer->is_bridgesrclayer()){
         auto dst=layer->dstlayers().at(0);
         Msg *msg=new Msg();
         msg->set_src(group_id_, worker_id_, kWorkerLayer);
-        msg->set_dst(group_id_, dst->partitionid(), kWorkerLayer);
+        msg->set_dst(group_id_, dst->partition_id(), kWorkerLayer);
         msg->add_frame(dst->name().c_str(), dst->name().length());
         auto const & blob=layer->data(nullptr);
         msg->add_frame(blob.cpu_data(), blob.count()*sizeof(float));
         layer_dealer_->Send(&msg);
       }
-      if(phase==kTrain&&DisplayDebugInfo(step)
-          &&layer->mutable_data(nullptr)!=nullptr){
-        LOG(INFO)<<StringPrintf("Forward layer  %10s data norm1 %13.9f",
-            layer->name().c_str(), layer->data(nullptr).asum_data());
-      }
+      if(phase == kTrain && DisplayDebugInfo(step))
+        LOG(INFO) << layer->DebugString(step, kForward);
     }
   }
 }
@@ -273,25 +269,17 @@ void BPWorker::Forward(int step, Phase phase, shared_ptr<NeuralNet> net){
 void BPWorker::Backward(int step, shared_ptr<NeuralNet> net){
   auto& layers=net->layers();
   for (auto it = layers.rbegin(); it != layers.rend(); it++){
-    shared_ptr<Layer> layer=*it;
-    if(layer->partitionid()==worker_id_){
+    Layer* layer=*it;
+    if(layer->partition_id()==worker_id_){
       if(layer->is_bridgesrclayer()){
         //auto* src=static_cast<BridgeSrcLayer*>(layer.get());
         // receive grad blobs
       }
-      layer->ComputeGradient();
-      if(layer->mutable_grad(nullptr)!=nullptr&&DisplayDebugInfo(step)){
-        LOG(INFO)<<StringPrintf("Backward layer %10s grad norm1 %13.9f\t",
-            layer->name().c_str(), layer->grad(nullptr).asum_data());
-        for(shared_ptr<Param> p: layer->GetParams())
-          LOG(INFO)<<StringPrintf("param id %2d, name %10s,\
-              value norm1 %13.9f, grad norm1 %13.9f",
-              p->id(), p->name().c_str(),
-              p->data().asum_data(), p->grad().asum_data());
-      }
-      for(shared_ptr<Param> p: layer->GetParams()){
+      layer->ComputeGradient(kTrain);
+      if(DisplayDebugInfo(step))
+        LOG(INFO) << layer->DebugString(step, kBackward);
+      for(Param* p: layer->GetParams())
         Update(p, step);
-      }
       if(layer->is_bridgedstlayer()){
         // send grad blobs
       }
@@ -300,38 +288,14 @@ void BPWorker::Backward(int step, shared_ptr<NeuralNet> net){
 }
 
 void BPWorker::TrainOneBatch(int step, Metric* perf){
-  Forward(step, kTrain, train_net_);
+  Forward(step, kTrain, train_net_, perf);
   Backward(step, train_net_);
   auto losslayers=train_net_->losslayers();
-  for(auto layer: losslayers){
-      if(layer->partitionid()==worker_id_){
-        const float * ptr=layer->metric().cpu_data();
-        /*
-        for(int j=0;j<layer->metric().count();j++)
-          perf->AddMetric(std::to_string(j)+"#"+layer->name(), ptr[j]);
-        */
-        // hard code display info
-        perf->AddMetric(std::to_string(0)+"#loss", ptr[0]);
-        perf->AddMetric(std::to_string(1)+"#accuracy", ptr[1]);
-      }
-    }
 }
 
-void BPWorker::TestOneBatch(int step, Phase phase, shared_ptr<NeuralNet> net, Metric* perf){
-  Forward(step, phase, net);
-  const auto& losslayers=net->losslayers();
-  for(auto layer: losslayers){
-      if(layer->partitionid()==worker_id_){
-        const float * ptr=layer->metric().cpu_data();
-        /*
-        for(int j=0;j<layer->metric().count();j++)
-          perf.AddMetric(std::to_string(j)+"#"+layer->name(), ptr[j]);
-        */
-        // hard code display info
-        perf->AddMetric(std::to_string(0)+"#loss", ptr[0]);
-        perf->AddMetric(std::to_string(1)+"#accuracy", ptr[1]);
-      }
-    }
+void BPWorker::TestOneBatch(int step, Phase phase,
+    shared_ptr<NeuralNet> net, Metric* perf){
+  Forward(step, phase, net, perf);
 }
 
 }  // namespace singa

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/9a6e09fa/src/utils/common.cc
----------------------------------------------------------------------
diff --git a/src/utils/common.cc b/src/utils/common.cc
index 67b4486..11a19f8 100644
--- a/src/utils/common.cc
+++ b/src/utils/common.cc
@@ -160,4 +160,52 @@ void SetupLog(const std::string& log_dir, const std::string& model) {
   google::SetLogDestination(google::FATAL, fatal.c_str());
 }
 
+void Metric::Add(const string& name, float value) {
+  if(entry_.find(name) == entry_.end())
+    entry_[name] = std::make_pair(1, value);
+  else{
+    auto& e = entry_.at(name);
+    e.first += 1;
+    e.second += value;
+  }
+}
+
+void Metric::Reset() {
+  for(auto& e : entry_) {
+    e.second.first = 0;
+    e.second.second = 0;
+  }
+}
+const string Metric::ToLogString() const{
+  string ret;
+  size_t k = 0;
+  for(auto e : entry_) {
+    ret += e.first + " : ";
+    ret += std::to_string(e.second.second / e.second.first);
+    if(++k < entry_.size())
+      ret +=  ", ";
+  }
+  return ret;
+}
+
+const string Metric::ToString() const{
+  MetricProto proto;
+  for(auto e : entry_) {
+    proto.add_name(e.first);
+    proto.add_count(e.second.first);
+    proto.add_val(e.second.second);
+  }
+  string ret;
+  proto.SerializeToString(&ret);
+  return ret;
+}
+
+void Metric::ParseFrom(const string& msg) {
+  MetricProto proto;
+  proto.ParseFromString(msg);
+  Reset();
+  for(int i = 0; i < proto.name_size(); i++) {
+    entry_[proto.name(i)] = std::make_pair(proto.count(i), proto.val(i));
+  }
+}
 }  // namespace singa
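
A short sketch of how the new Metric class is meant to be used end to end: each Add() accumulates a (count, sum) pair per name, ToLogString() reports the running averages, and ToString()/ParseFrom() round-trip the counters through MetricProto so a worker can ship them to the stub inside a message frame. The values below are made up.

    Metric perf;
    perf.Add("loss", 2.0f);
    perf.Add("loss", 4.0f);
    perf.Add("accuracy", 0.5f);
    LOG(INFO) << perf.ToLogString();   // logs averages, e.g. "loss" shows 3, not 6

    // serialize on the worker side, parse on the stub side (as trainer.cc does)
    const std::string wire = perf.ToString();
    Metric received;
    received.ParseFrom(wire);
    LOG(INFO) << received.ToLogString();

    perf.Reset();                      // zero the counters after each display interval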