You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@singa.apache.org by wa...@apache.org on 2015/05/03 16:04:06 UTC

[01/12] incubator-singa git commit: Transfer code from nusinga repo to singa apache repo. New communication framework is implemented to unify the frameworks of existing distributed deep learning systems. Communication is now implemented using ZeroMQ. API

Repository: incubator-singa
Updated Branches:
  refs/heads/master 95b1e6dd3 -> b2dc51d23


http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/b2dc51d2/src/utils/cluster.cc
----------------------------------------------------------------------
diff --git a/src/utils/cluster.cc b/src/utils/cluster.cc
new file mode 100644
index 0000000..ac47422
--- /dev/null
+++ b/src/utils/cluster.cc
@@ -0,0 +1,52 @@
+#include <glog/logging.h>
+#include <fcntl.h>
+#include <fstream>
+#include "utils/cluster.h"
+#include "proto/cluster.pb.h"
+#include <sys/stat.h>
+#include <sys/types.h>
+namespace singa {
+
+std::shared_ptr<Cluster> Cluster::instance_;
+/**
+ * Build the cluster description for the process `procs_id`.
+ *
+ * Copies the configuration, calls SetupFolders, derives the number of
+ * processes from the worker/server settings and, when more than one process
+ * is involved, loads one endpoint per process from the hostfile.
+ *
+ * @param cluster cluster configuration
+ * @param procs_id ID of this process; must be less than the process count
+ */
+Cluster::Cluster(const ClusterProto &cluster, int procs_id) {
+  procs_id_=procs_id;
+  cluster_ = cluster;
+  SetupFolders(cluster);
+  int nprocs;
+  if(server_worker_separate())
+    nprocs=nworker_procs()+nserver_procs();
+  else
+    nprocs=std::max(nworker_procs(), nserver_procs());
+  CHECK_LT(procs_id, nprocs);
+  // a configured process count must agree with the derived one
+  if (cluster_.has_nprocs())
+    CHECK_EQ(cluster.nprocs(), nprocs);
+  else
+    cluster_.set_nprocs(nprocs);
+  if(nprocs>1){
+    std::ifstream ifs(cluster.hostfile(), std::ifstream::in);
+    // fail with a clear message instead of a size mismatch further down
+    CHECK(ifs.is_open())<<"Cannot open hostfile "<<cluster.hostfile();
+    const size_t expected=static_cast<size_t>(nprocs);
+    std::string line;
+    // one line per process; extra lines in the hostfile are ignored
+    while(endpoints_.size()<expected&&std::getline(ifs, line)){
+      endpoints_.push_back(line);
+    }
+    CHECK_EQ(endpoints_.size(), expected);
+  }
+}
+
+/**
+ * Create the folders used by this cluster.
+ *
+ * Only the visualization folder (vis_folder()) is created here, with mode
+ * rwxrwxr-x. The `cluster` argument is not read.
+ * NOTE(review): the return value of mkdir is ignored, so any failure
+ * (including "already exists") passes silently.
+ */
+void Cluster::SetupFolders(const ClusterProto &cluster){
+  // create visualization folder
+  mkdir(vis_folder().c_str(),  S_IRWXU | S_IRWXG | S_IROTH | S_IXOTH);
+}
+
+/**
+ * Create the singleton from `cluster` for process `procs_id`, replacing any
+ * instance created earlier, and return it.
+ */
+shared_ptr<Cluster> Cluster::Get(const ClusterProto& cluster, int procs_id){
+  instance_=shared_ptr<Cluster>(new Cluster(cluster, procs_id));
+  return instance_;
+}
+
+/**
+ * Return the singleton created by Get(cluster, procs_id).
+ * Logs an error and returns an empty pointer if none has been created yet.
+ */
+shared_ptr<Cluster> Cluster::Get() {
+  if(instance_)
+    return instance_;
+  LOG(ERROR)<<"The first call to Get should "
+            <<"provide the sys/model conf path";
+  return instance_;
+}
+}  // namespace singa

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/b2dc51d2/src/utils/common.cc
----------------------------------------------------------------------
diff --git a/src/utils/common.cc b/src/utils/common.cc
new file mode 100644
index 0000000..0697060
--- /dev/null
+++ b/src/utils/common.cc
@@ -0,0 +1,89 @@
+#include <fcntl.h>
+#include <google/protobuf/io/coded_stream.h>
+#include <google/protobuf/text_format.h>
+#include <google/protobuf/io/zero_copy_stream_impl.h>
+#include "utils/common.h"
+using std::ios;
+using std::max;
+using google::protobuf::io::FileInputStream;
+using google::protobuf::io::FileOutputStream;
+using google::protobuf::io::ZeroCopyInputStream;
+using google::protobuf::io::CodedInputStream;
+using google::protobuf::io::ZeroCopyOutputStream;
+using google::protobuf::io::CodedOutputStream;
+
+namespace singa {
+
+const int kBufLen=1024;
+/**
+ * Render a vector of ints as "(a, b, c, )".
+ * Every element is followed by ", ", so the last one is too.
+ */
+std::string IntVecToString(const vector<int>& vec) {
+  string text="(";
+  for(auto it=vec.begin(); it!=vec.end(); ++it){
+    text+=std::to_string(*it);
+    text+=", ";
+  }
+  text+=")";
+  return text;
+}
+
+/**
+ * printf-style formatting into a string, taking the arguments as a va_list.
+ * The result is cut off at 32767 characters (fixed 32768-byte buffer).
+ */
+string VStringPrintf(string fmt, va_list l) {
+  const size_t kMaxLen=32768;
+  char formatted[kMaxLen];
+  vsnprintf(formatted, kMaxLen, fmt.c_str(), l);
+  return string(formatted);
+}
+
+/**
+ * printf-style formatting into a string.
+ *
+ * Collects the variadic arguments into a va_list and forwards them to
+ * VStringPrintf.
+ *
+ * @param fmt printf format string
+ * @return the formatted text
+ */
+string StringPrintf(string fmt, ...) {
+  va_list l;
+  va_start(l, fmt); // fmt is the last named parameter
+  string result = VStringPrintf(fmt, l);
+  va_end(l);
+  return result;
+}
+
+/**
+ * Print the PID and host name, then wait in a sleep loop so that a debugger
+ * can be attached. The loop ends once `i` is set to a non-zero value, which
+ * nothing in this function does.
+ */
+void Debug() {
+  // volatile: the flag is never changed by the program itself, so without it
+  // the compiler may turn this into an unconditional loop that a debugger
+  // can no longer release by writing to i.
+  volatile int i = 0;
+  char hostname[256];
+  gethostname(hostname, sizeof(hostname));
+  printf("PID %d on %s ready for attach\n", getpid(), hostname);
+  fflush(stdout);
+  while (0 == i)
+    sleep(5);
+}
+
+// the proto related functions are from Caffe.
+/**
+ * Parse a text-format protobuf message from `filename` into `proto`.
+ * Aborts (CHECK) if the file cannot be opened or the content fails to parse.
+ */
+void ReadProtoFromTextFile(const char* filename,
+    ::google::protobuf::Message* proto) {
+  const int fd = open(filename, O_RDONLY);
+  CHECK_NE(fd, -1) << "File not found: " << filename;
+  {
+    // scoped so that the stream is destroyed before the descriptor is closed
+    FileInputStream input(fd);
+    CHECK(google::protobuf::TextFormat::Parse(&input, proto));
+  }
+  close(fd);
+}
+/**
+ * Write `proto` to `filename` in protobuf text format, replacing any
+ * previous content. Aborts (CHECK) if the file cannot be opened, printed to
+ * or flushed.
+ */
+void WriteProtoToTextFile(const Message& proto, const char* filename) {
+  // O_TRUNC: overwriting a longer file must not leave its old tail behind
+  int fd = open(filename, O_WRONLY | O_CREAT | O_TRUNC, 0644);
+  CHECK_NE(fd, -1) << "Cannot open file for writing: " << filename;
+  FileOutputStream output(fd);
+  CHECK(google::protobuf::TextFormat::Print(proto, &output));
+  // Close() flushes the buffered text and closes fd; it reports write errors
+  CHECK(output.Close()) << "Cannot write file: " << filename;
+}
+/**
+ * Parse a binary protobuf message from `filename` into `proto`.
+ * Aborts (CHECK) if the file cannot be opened or the content fails to parse.
+ */
+void ReadProtoFromBinaryFile(const char* filename, Message* proto) {
+  const int fd = open(filename, O_RDONLY);
+  CHECK_NE(fd, -1) << "File not found: " << filename;
+  {
+    // locals are destroyed in reverse order: coded stream first, then raw
+    FileInputStream raw_input(fd);
+    CodedInputStream coded_input(&raw_input);
+    // upper limit 512MB, warning threshold 256MB
+    coded_input.SetTotalBytesLimit(536870912, 268435456);
+    CHECK(proto->ParseFromCodedStream(&coded_input));
+  }
+  close(fd);
+}
+/**
+ * Serialize `proto` in binary form to `filename`, replacing any previous
+ * content. Aborts (CHECK) if the file cannot be opened or written.
+ */
+void WriteProtoToBinaryFile(const Message& proto, const char* filename) {
+  int fd= open(filename, O_CREAT|O_WRONLY|O_TRUNC, 0644);
+  CHECK_NE(fd, -1) << "Cannot open file for writing: " << filename;
+  CHECK(proto.SerializeToFileDescriptor(fd));
+  // close the descriptor so that it is not leaked on every call
+  close(fd);
+}
+
+}  // namespace singa

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/b2dc51d2/src/utils/data_shard.cc
----------------------------------------------------------------------
diff --git a/src/utils/data_shard.cc b/src/utils/data_shard.cc
new file mode 100644
index 0000000..df311e1
--- /dev/null
+++ b/src/utils/data_shard.cc
@@ -0,0 +1,207 @@
+#include <sys/stat.h>
+#include <glog/logging.h>
+
+#include "utils/data_shard.h"
+namespace singa {
+
+/**
+ * Open the shard stored in `folder`; the data file is <folder>/shard.dat.
+ *
+ * @param folder existing directory of the shard; fatal if it is missing
+ * @param mode DataShard::kRead, DataShard::kCreate (truncates the file) or
+ *        DataShard::kAppend; no file is opened for any other value
+ * @param capacity size in bytes of the internal buffer
+ */
+DataShard::DataShard(std::string folder, char mode, int capacity){
+  struct stat sb;
+  if(stat(folder.c_str(), &sb) == 0 && S_ISDIR(sb.st_mode)){
+    LOG(INFO)<<"Open shard folder "<<folder;
+  }else{
+    LOG(FATAL)<<"Cannot open shard folder "<<folder;
+  }
+
+  path_= folder+"/shard.dat";
+  if(mode==DataShard::kRead){
+    fdat_.open(path_, std::ios::in|std::ios::binary);
+    // reading never creates the file, so report an open failure
+    CHECK(fdat_.is_open())<<"Cannot open file "<<path_;
+  }
+  if(mode==DataShard::kCreate){
+    fdat_.open(path_, std::ios::binary|std::ios::out|std::ios::trunc);
+    CHECK(fdat_.is_open())<<"Cannot create file "<<path_;
+  }
+  if(mode==DataShard::kAppend){
+    // scans the existing tuples (filling keys_) and returns the offset at
+    // which new tuples are to be written
+    int last_tuple=PrepareForAppend(path_);
+    fdat_.open(path_, std::ios::binary|std::ios::out|std::ios::in|std::ios::ate);
+    CHECK(fdat_.is_open())<<"Cannot open file "<<path_;
+    fdat_.seekp(last_tuple);
+  }
+
+  mode_=mode;
+  offset_=0;
+  bufsize_=0;
+  capacity_=capacity;
+  buf_=new char[capacity];
+}
+
+/**
+ * Release the buffer and close the data file.
+ * NOTE(review): bytes still held in the buffer are not written here; they
+ * reach the file only through Flush() or a buffer overflow in Insert().
+ */
+DataShard::~DataShard(){
+  // buf_ comes from new char[capacity], so it must be freed with delete[]
+  delete[] buf_;
+  fdat_.close();
+}
+
+/**
+ * Serialize `val` and insert it under `key`.
+ * @return the result of Insert(key, serialized string)
+ */
+bool DataShard::Insert(const std::string& key, const Message& val) {
+  std::string serialized;
+  val.SerializeToString(&serialized);
+  return Insert(key, serialized);
+}
+/**
+ * Insert one complete tuple into the write buffer.
+ *
+ * Tuple layout: key length (size_t), key bytes, value length (size_t),
+ * value bytes. When the tuple does not fit into the remaining buffer space
+ * the buffer is first written to the file.
+ *
+ * @return false if `key` is already in keys_ or `val` is empty, true once
+ *         the tuple is buffered
+ */
+bool DataShard::Insert(const std::string& key, const std::string& val) {
+  if(keys_.find(key)!=keys_.end()||val.size()==0)
+    return false;
+  int size=key.size()+val.size()+2*sizeof(size_t);
+  if(offset_+size>capacity_){
+    fdat_.write(buf_, offset_);
+    offset_=0;
+    CHECK_LE(size, capacity_)<<"Tuple size is larger than capacity. "
+      <<"Try a larger capacity size";
+  }
+  // The length fields land at arbitrary byte offsets; copy them with memcpy
+  // instead of storing through a possibly misaligned size_t pointer.
+  const size_t keylen=key.size();
+  memcpy(buf_+offset_, &keylen, sizeof(keylen));
+  offset_+=sizeof(keylen);
+  memcpy(buf_+offset_, key.data(), keylen);
+  offset_+=keylen;
+  const size_t vallen=val.size();
+  memcpy(buf_+offset_, &vallen, sizeof(vallen));
+  offset_+=sizeof(vallen);
+  memcpy(buf_+offset_, val.data(), vallen);
+  offset_+=vallen;
+  return true;
+}
+
+/**
+ * Write the buffered bytes (the first offset_ bytes of buf_) to the data
+ * file, flush the stream and mark the buffer as empty.
+ */
+void DataShard::Flush() {
+  fdat_.write(buf_, offset_);
+  fdat_.flush();
+  offset_=0;
+}
+
+/**
+ * Read the key of the next tuple and make its value available in the buffer.
+ *
+ * On success the value occupies buf_[offset_, offset_+return value); the
+ * caller consumes it and advances offset_.
+ *
+ * @param key receives the key; cleared first
+ * @return length of the value in bytes, or 0 when PrepareNextField reports
+ *         that no further field is available
+ */
+int DataShard::Next(std::string *key){
+  key->clear();
+  int ssize=sizeof(size_t);
+  size_t len=0;
+  if(!PrepareNextField(ssize))
+    return 0;
+  CHECK_LE(offset_+ssize, bufsize_);
+  // length fields are not aligned inside the buffer: read them via memcpy
+  memcpy(&len, buf_+offset_, sizeof(len));
+  int keylen=len;
+  offset_+=ssize;
+
+  if(!PrepareNextField(keylen))
+    return 0;
+  CHECK_LE(offset_+keylen, bufsize_);
+  key->assign(buf_+offset_, keylen);
+  offset_+=keylen;
+
+  if(!PrepareNextField(ssize))
+    return 0;
+  CHECK_LE(offset_+ssize, bufsize_);
+  memcpy(&len, buf_+offset_, sizeof(len));
+  int vallen=len;
+  offset_+=ssize;
+
+  if(!PrepareNextField(vallen))
+    return 0;
+  CHECK_LE(offset_+vallen, bufsize_);
+  return vallen;
+}
+
+/**
+ * Read the next tuple, parsing its value into `val`.
+ * @return false when Next(key) reports no further tuple, true otherwise
+ */
+bool DataShard::Next(std::string *key, Message* val) {
+  const int vallen=Next(key);
+  if(vallen!=0){
+    val->ParseFromArray(buf_+offset_, vallen);
+    offset_+=vallen;
+    return true;
+  }
+  return false;
+}
+
+/**
+ * Read the next tuple, copying its raw value bytes into `val`.
+ * @return false when Next(key) reports no further tuple (val is left
+ *         untouched), true otherwise
+ */
+bool DataShard::Next(std::string *key, std::string* val) {
+  const int vallen=Next(key);
+  if(vallen==0)
+    return false;
+  // one bulk copy instead of appending the value character by character
+  val->assign(buf_+offset_, vallen);
+  offset_+=vallen;
+  return true;
+}
+
+/**
+ * Restart reading from the first tuple by emptying the buffer and reopening
+ * the data file. Only valid in kRead mode.
+ */
+void DataShard::SeekToFirst(){
+  CHECK_EQ(mode_, kRead);
+  bufsize_=0;
+  offset_=0;
+  fdat_.close();
+  // drop the eof/fail flags left by the previous pass; PrepareNextField
+  // stops reading as soon as eof() is set
+  fdat_.clear();
+  fdat_.open(path_, std::ios::in|std::ios::binary);
+  CHECK(fdat_.is_open())<<"Cannot open file "<<path_;
+}
+
+/**
+ * Make sure the next `size` bytes of the file are in the buffer, starting
+ * at offset_.
+ *
+ * If the buffer does not hold the complete field, the unread bytes are
+ * moved to the front of the buffer and the rest is filled from disk.
+ *
+ * @param size length in bytes of the next field
+ * @return true if buf_[offset_, offset_+size) is valid; false if the file
+ *         ends before the field is complete
+ */
+bool DataShard::PrepareNextField(int size){
+  if(offset_+size>bufsize_){
+    CHECK_LE(size, capacity_)<<"Field is larger than the buffer capacity";
+    bufsize_-=offset_;
+    // memmove copes with overlapping ranges, so the number of unread bytes
+    // may exceed the number of bytes already consumed
+    if(bufsize_>0)
+      memmove(buf_, buf_+offset_, bufsize_);
+    offset_=0;
+    if(fdat_.eof())
+      return false;
+    fdat_.read(buf_+bufsize_, capacity_-bufsize_);
+    bufsize_+=fdat_.gcount();
+    // the read may reach the end of the file without delivering the field,
+    // e.g. when the previous read stopped exactly at the last byte
+    if(size>bufsize_)
+      return false;
+  }
+  return true;
+}
+
+/**
+ * Count the tuples in the data file by scanning it with a separate stream;
+ * keys and values are skipped, only the length fields are read.
+ *
+ * @return number of tuples found
+ */
+const int DataShard::Count() {
+  std::ifstream fin(path_, std::ios::in|std::ios::binary);
+  // check the stream that is scanned here (fin), not the member fdat_
+  CHECK(fin.is_open())<<"Cannot open file "<<path_;
+  int count=0;
+  while(true){
+    size_t len;
+    // key length, then skip the key
+    fin.read(reinterpret_cast<char*>(&len), sizeof(len));
+    if(fin.good())
+      fin.seekg(len, std::ios_base::cur);
+    else break;
+    // value length, then skip the value
+    if(fin.good())
+      fin.read(reinterpret_cast<char*>(&len), sizeof(len));
+    else break;
+    if(fin.good())
+      fin.seekg(len, std::ios_base::cur);
+    else break;
+    if(!fin.good())
+      break;
+    count++;
+  }
+  fin.close();
+  return count;
+}
+
+/**
+ * Scan an existing data file before appending to it.
+ *
+ * Every key found is added to keys_. If the file does not exist, an empty
+ * one is created.
+ *
+ * @param path path of the data file
+ * @return file offset just after the last tuple that was read, or 0 for a
+ *         new or empty file
+ */
+int DataShard::PrepareForAppend(std::string path){
+  std::ifstream fin(path, std::ios::in|std::ios::binary);
+  if(!fin.is_open()){
+    // no data file yet: create an empty one
+    fdat_.open(path, std::ios::out|std::ios::binary);
+    fdat_.flush();
+    fdat_.close();
+    return 0;
+  }
+
+  int last_tuple_offset=0;
+  std::string key;
+  size_t len;
+  while(true){
+    fin.read(reinterpret_cast<char*>(&len), sizeof(len));
+    if(!fin.good()) break;
+    // Size the key from its stored length: a fixed stack buffer overflows
+    // for long keys and a C-string copy cuts keys at the first NUL byte.
+    key.resize(len);
+    if(len>0)
+      fin.read(&key[0], len);
+    if(!fin.good()) break;
+    // value length, then skip the value
+    fin.read(reinterpret_cast<char*>(&len), sizeof(len));
+    if(!fin.good()) break;
+    fin.seekg(len, std::ios_base::cur);
+    if(!fin.good()) break;
+    keys_.insert(key);
+    last_tuple_offset=fin.tellg();
+  }
+  fin.close();
+  return last_tuple_offset;
+}
+} /* singa */

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/b2dc51d2/src/utils/graph.cc
----------------------------------------------------------------------
diff --git a/src/utils/graph.cc b/src/utils/graph.cc
new file mode 100644
index 0000000..d1cece6
--- /dev/null
+++ b/src/utils/graph.cc
@@ -0,0 +1,148 @@
+#include <algorithm>
+#include "utils/graph.h"
+
+/** Render the graph as JSON without any per-node annotation. */
+const string Graph::ToString() const {
+  return ToString(map<string, string>());
+}
+/**
+ * Render the graph as a JSON object with a "nodes" and a "links" array.
+ *
+ * Each node entry carries an id (node name followed by info[name], if
+ * present), a color chosen by locationid and a shape chosen by origin.
+ * Each link refers to its end points by their position in nodes_.
+ *
+ * @param info optional text appended to the id of the node with that name
+ * @return the JSON text
+ */
+const string Graph::ToString(const map<string, string>& info) const {
+  map<string, int> nodeid;
+  string disp="{\"directed\":1,\n";
+
+  // add nodes
+  disp+="\"nodes\":[\n";
+  bool first=true;
+
+  vector<string> colors={"red", "blue", "black", "green"};
+  // see for more shapes at http://www.graphviz.org/doc/info/shapes.html
+  vector<string> shapes={"box", "ellipse"};
+  int id=0;
+  for(auto node: nodes_){
+    string name=node->name();
+    string color=colors[(node->val().locationid)%colors.size()];
+    string shape;
+    string origin=node->val().origin;
+    if(origin=="kSlice"||origin=="kConcate"||origin=="kSplit"
+        ||origin=="kBridgeSrc"||origin=="kBridgeDst")
+      shape=shapes[1];
+    else
+      shape=shapes[0];
+    string suffix;
+    auto extra=info.find(name);
+    if(extra!=info.end())
+      suffix=extra->second;
+    if(!first)
+      disp+=",";
+    else
+      first=false;
+    // built by concatenation: a fixed-size sprintf buffer would overflow
+    // for long node names
+    disp+="{\"id\":\""+name+suffix+"\", \"color\":\""+color
+      +"\",\"shape\":\""+shape+"\"}\n";
+    nodeid[name]=id++;
+  }
+  disp+="]\n,";
+
+  // add edges
+  disp+="\"links\":[\n";
+  first=true;
+  for(auto src: nodes_)
+    for(auto dst: src->dstnodes()){
+    if(!first)
+      disp+=",";
+    else
+      first=false;
+    disp+="{\"source\":"+std::to_string(nodeid[src->name()])
+      +", \"target\":"+std::to_string(nodeid[dst->name()])
+      +", \"color\":\"black\"}\n";
+  }
+  disp+="]\n";
+  return disp+"}";
+}
+/** Validity check of the graph; performs no checks and always returns true. */
+bool Graph::Check() const {
+  return true;
+}
+
+
+/**
+ * Depth-first step of the topological sort: visit all unvisited dst nodes of
+ * `node`, then push the name of `node` onto the stack.
+ *
+ * @param node node to process; marked as visited on entry
+ * @param visited visited flag per node name
+ * @param stack receives node names in post-order
+ */
+void Graph::topology_sort_inner(SNode node,
+    map<string, bool> *visited,
+    std::stack<string> *stack) {
+  (*visited)[node->name()] = true;
+  const vector<SNode>& children=node->dstnodes();
+  // walk the dst nodes from the last one to the first one
+  for (auto it=children.rbegin(); it!=children.rend(); ++it) {
+    const SNode& child=*it;
+    if (!(*visited)[child->name()])
+      topology_sort_inner(child, visited, stack);
+  }
+  stack->push(node->name());
+}
+
+/**
+ * Topologically sort nodes_ so that a node is placed before its dst nodes
+ * (`bottom' nodes end up in the front positions).
+ */
+void Graph::Sort() {
+  // every node starts as unvisited
+  std::map<string, bool> visited;
+  for (const SNode& node: nodes_)
+    visited[node->name()] = false;
+  // depth-first traversal: a node is pushed after all of its dst nodes, so
+  // the `top' nodes sit at the bottom of the stack
+  std::stack<string> order;
+  for (const SNode& node: nodes_) {
+    if (!visited[node->name()])
+      topology_sort_inner(node, &visited, &order);
+  }
+  // rebuild nodes_ in pop order, i.e. the reverse of the push order
+  nodes_.clear();
+  for (; !order.empty(); order.pop())
+    nodes_.push_back(name2node_[order.top()]);
+}
+
+
+
+/**
+ * Add a node named "slice-<srcnode name>" with origin "kSlice" and an edge
+ * from srcnode to it.
+ *
+ * @param srcnode node whose output is sliced
+ * @param dstnodes nodes to connect to the new node
+ * @param info node value copied into the new node (origin is overwritten)
+ * @param connect_dst if true, add an edge from the new node to every dst node
+ * @return the new node
+ */
+SNode Graph::InsertSliceNode(SNode srcnode, const vector<SNode>& dstnodes,
+    const V& info, bool connect_dst){
+  V myinfo=info;
+  myinfo.origin="kSlice";
+  SNode node=AddNode("slice-"+srcnode->name(),myinfo);
+  AddEdge(srcnode, node);
+  if(connect_dst)
+    for(SNode dst: dstnodes)
+      AddEdge(node, dst);
+  return node;
+}
+/**
+ * Add a node named "concate-<dstnode name>" with origin "kConcate", an edge
+ * from it to dstnode and an edge from every src node to it.
+ *
+ * @param srcnodes nodes feeding the new node
+ * @param dstnode node fed by the new node
+ * @param info node value copied into the new node (origin is overwritten)
+ * @return the new node
+ */
+SNode Graph::InsertConcateNode(const vector<SNode>&srcnodes, SNode dstnode,
+    const V& info){
+  V myinfo=info;
+  myinfo.origin="kConcate";
+  SNode node=AddNode("concate-"+dstnode->name(),myinfo);
+  AddEdge(node, dstnode);
+  for(SNode src: srcnodes)
+    AddEdge(src, node);
+  return node;
+}
+/**
+ * Add a node named "split-<srcnode name>" with origin "kSplit"; its value is
+ * otherwise a copy of srcnode's value. Edges are added from srcnode to the
+ * new node and from the new node to every dst node.
+ *
+ * @return the new node
+ */
+SNode Graph::InsertSplitNode(SNode srcnode, const vector<SNode>& dstnodes){
+  V myinfo=srcnode->val();
+  myinfo.origin="kSplit";
+  SNode node=AddNode("split-"+srcnode->name(), myinfo);
+  AddEdge(srcnode, node);
+  for(SNode dst: dstnodes)
+    AddEdge(node, dst);
+  return node;
+}
+/**
+ * Insert a pair of bridge nodes between srcnode and dstnode.
+ *
+ * The "s-<src>-<dst>" node copies srcnode's value with origin "kBridgeSrc";
+ * the "d-<src>-<dst>" node copies dstnode's value with origin "kBridgeDst".
+ * Edges added: srcnode -> s, s -> d, d -> dstnode.
+ * NOTE(review): no existing edge between srcnode and dstnode is removed here.
+ *
+ * @return the pair (bridge src node, bridge dst node)
+ */
+std::pair<SNode, SNode> Graph::InsertBridgeNode(SNode srcnode, SNode dstnode){
+  LayerInfo info=srcnode->val();
+  info.origin="kBridgeSrc";
+  SNode src=AddNode("s-"+srcnode->name()+"-"+dstnode->name(), info);
+  info=dstnode->val();
+  info.origin="kBridgeDst";
+  SNode dst=AddNode("d-"+srcnode->name()+"-"+dstnode->name(), info);
+  AddEdge(srcnode, src);
+  AddEdge(src, dst);
+  AddEdge(dst, dstnode);
+  return pair<SNode, SNode>{src, dst};
+}
+
+

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/b2dc51d2/src/utils/param.cc
----------------------------------------------------------------------
diff --git a/src/utils/param.cc b/src/utils/param.cc
new file mode 100644
index 0000000..d64c65d
--- /dev/null
+++ b/src/utils/param.cc
@@ -0,0 +1,345 @@
+#include <glog/logging.h>
+#include <cmath>
+#include <chrono>
+#include <random>
+#include "utils/param.h"
+#include "mshadow/tensor.h"
+#include "utils/singleton.h"
+using namespace mshadow;
+using std::vector;
+using std::string;
+namespace singa {
+
+/** Create an empty parameter with owner_ = -1, fan_in_ = 0 and version -1. */
+Param::Param(){
+  owner_=-1;
+  fan_in_=0;
+  set_version(-1);
+}
+
+Param::~Param(){}
+
+/**
+ * Build a kPut message carrying this parameter.
+ *
+ * Frame 1: text header "<version> <size> <lr multiplier> <wd multiplier>".
+ * Frame 2: the parameter values, size() floats.
+ *
+ * @param arg pointer to an int holding the version to send
+ * @return message allocated with new
+ */
+Msg* Param::GenPutMsg(void* arg){
+  char buf[256];
+  int v=*static_cast<int*>(arg);
+  // bounded formatting: the header can never overrun buf
+  snprintf(buf, sizeof(buf), "%d %d %f %f", v, size(),
+      learning_rate_multiplier(), weight_decay_multiplier());
+  Msg* msg=new Msg();
+  msg->set_type(kPut);
+  msg->add_frame(buf, strlen(buf));
+  msg->add_frame(mutable_cpu_data(), size()*sizeof(float));
+  return msg;
+}
+
+/**
+ * Build a kGet message whose single frame is the requested version as text.
+ *
+ * @param arg pointer to an int holding the version
+ * @return message allocated with new
+ */
+Msg* Param::GenGetMsg(void* arg){
+  // an int needs up to 11 characters plus the terminator; 10 bytes overflow
+  char buf[16];
+  int v=*static_cast<int*>(arg);
+  snprintf(buf, sizeof(buf), "%d", v);
+  Msg* msg=new Msg();
+  msg->set_type(kGet);
+  msg->add_frame(buf, strlen(buf));
+  return msg;
+}
+
+/**
+ * Build a kUpdate message.
+ *
+ * Frame 1: the version as text. Frame 2: the gradient, size() floats.
+ *
+ * @param arg pointer to an int holding the version
+ * @return message allocated with new
+ */
+Msg* Param::GenUpdateMsg(void* arg){
+  // an int needs up to 11 characters plus the terminator; 10 bytes overflow
+  char buf[16];
+  int v=*static_cast<int*>(arg);
+  snprintf(buf, sizeof(buf), "%d", v);
+  Msg* msg=new Msg();
+  msg->set_type(kUpdate);
+  msg->add_frame(buf, strlen(buf));
+
+  msg->add_frame(mutable_cpu_grad(), size()*sizeof(float));
+  return msg;
+}
+
+/** Sync messages are not generated by this class; always returns nullptr. */
+Msg* Param::GenSyncMsg(void* arg){
+  return nullptr;
+}
+
+/**
+ * Initialize this parameter from a put message and free the message.
+ *
+ * Frame 1 is the text header "<version> <size> <lr mult> <wd mult>",
+ * frame 2 holds <size> floats. The blobs are reshaped to one dimension of
+ * <size> elements and the values are copied into data_.
+ *
+ * @param msg message to consume; *msg is deleted and set to nullptr
+ * @return always nullptr (no response)
+ */
+Msg* Param::HandlePutMsg(Msg** msg){
+  int v, size;
+  float lr, wc;
+  // GenPutMsg adds the header with strlen(buf) bytes, i.e. without the
+  // terminating NUL; copy it into a string before handing it to sscanf
+  const string header(static_cast<char*>((*msg)->frame_data()),
+      (*msg)->frame_size());
+  CHECK_EQ(4, sscanf(header.c_str(), "%d %d %f %f", &v, &size, &lr, &wc))
+    <<"Malformed put header: "<<header;
+  CHECK_GE(size, 0);
+  set_version(v);
+  proto_.set_learning_rate_multiplier(lr);
+  proto_.set_weight_decay_multiplier(wc);
+  CHECK((*msg)->next_frame());
+  // validate the payload before touching the blobs
+  CHECK_EQ(size* sizeof(float), (*msg)->frame_size());
+  vector<int> shape{size};
+  data_.Reshape(shape);
+  grad_.Reshape(shape);
+  history_.Reshape(shape);
+  memcpy(data_.mutable_cpu_data(), (*msg)->frame_data(), size*sizeof(float));
+  delete (*msg);
+  *msg=nullptr;
+  return nullptr;
+}
+
+/**
+ * Answer a get message by turning it into the response.
+ *
+ * The requested version (frame 1, text) must not exceed version(). The
+ * parameter values are added as a new frame, the addresses are swapped and
+ * the type becomes kRGet.
+ *
+ * @param msg request; reused as the response
+ * @return *msg
+ */
+Msg* Param::HandleGetMsg(Msg** msg){
+  int v;
+  // the version frame is sent without a terminating NUL; copy it into a
+  // string before handing it to sscanf
+  const string header(static_cast<char*>((*msg)->frame_data()),
+      (*msg)->frame_size());
+  CHECK_EQ(1, sscanf(header.c_str(), "%d", &v))
+    <<"Malformed get header: "<<header;
+  CHECK_LE(v, version());
+  CHECK(!(*msg)->next_frame());
+  (*msg)->add_frame(data_.mutable_cpu_data(), sizeof(float)*size());
+  (*msg)->SwapAddr();
+  (*msg)->set_type(kRGet);
+  return *msg;
+}
+
+/**
+ * Copy the gradient carried by an update message into this parameter's
+ * gradient buffer and free the message.
+ *
+ * Frame 1 is the version as text (must not exceed version()), frame 2 the
+ * gradient as size() floats.
+ *
+ * @param msg message to consume; *msg is deleted and set to nullptr
+ * @return always 1
+ */
+int Param::ParseUpdateMsg(Msg** msg){
+  int v;
+  // the version frame is sent without a terminating NUL; copy it into a
+  // string before handing it to sscanf
+  const string header(static_cast<char*>((*msg)->frame_data()),
+      (*msg)->frame_size());
+  CHECK_EQ(1, sscanf(header.c_str(), "%d", &v))
+    <<"Malformed update header: "<<header;
+  CHECK_LE(v, version());
+  CHECK((*msg)->next_frame());
+  // a payload of the wrong size would overrun the gradient buffer
+  CHECK_EQ(size()*sizeof(float), (*msg)->frame_size());
+  memcpy(mutable_cpu_grad(), (*msg)->frame_data(),(*msg)->frame_size());
+  delete (*msg);
+  *msg=nullptr;
+  return 1;
+}
+
+/**
+ * Build the kRUpdate response targeted at id().
+ *
+ * Frame 1: the current version as text. Frame 2: the parameter values,
+ * size() floats.
+ *
+ * @param arg not used
+ * @return message allocated with new
+ */
+Msg* Param::GenUpdateResponseMsg(void* arg){
+  Msg* msg=new Msg();
+  // an int needs up to 11 characters plus the terminator; 10 bytes overflow
+  char buf[16];
+  snprintf(buf, sizeof(buf), "%d", version());
+  msg->set_type(kRUpdate);
+  msg->set_target(id());
+  msg->add_frame(buf, strlen(buf));
+  msg->add_frame(mutable_cpu_data(), size()*sizeof(float));
+  return msg;
+}
+
+
+/**
+ * Sync messages are not processed: the message is deleted, *msg is set to
+ * nullptr and no response is returned.
+ */
+Msg* Param::HandleSyncMsg(Msg** msg){
+  delete *msg;
+  *msg=nullptr;
+  return nullptr;
+}
+
+/**
+ * Discard a sync response: the message is deleted and *msg is set to
+ * nullptr. Always returns 1.
+ */
+int Param::ParseSyncResponseMsg(Msg** msg){
+  delete *msg;
+  *msg=nullptr;
+  return 1;
+}
+/** Put responses are handled like sync responses: the message is discarded. */
+int Param::ParsePutResponseMsg(Msg **msg){
+  return ParseSyncResponseMsg(msg);
+}
+/**
+ * Load version and values from a get response.
+ *
+ * Frame 1 is the version as text, frame 2 the values as size() floats.
+ * NOTE(review): the message is not deleted here, unlike in ParseUpdateMsg;
+ * confirm that the caller releases it.
+ *
+ * @return always 1
+ */
+int Param::ParseGetResponseMsg(Msg **msg){
+  int v;
+  // the version frame is sent without a terminating NUL; copy it into a
+  // string before handing it to sscanf
+  const string header(static_cast<char*>((*msg)->frame_data()),
+      (*msg)->frame_size());
+  CHECK_EQ(1, sscanf(header.c_str(), "%d", &v))
+    <<"Malformed response header: "<<header;
+  set_version(v);
+  CHECK((*msg)->next_frame());
+  // a payload of the wrong size would overrun the data buffer
+  CHECK_EQ(size()*sizeof(float), (*msg)->frame_size());
+  memcpy(mutable_cpu_data(), (*msg)->frame_data(), (*msg)->frame_size());
+  return 1;
+}
+/** Update responses have the same layout as get responses. */
+int Param::ParseUpdateResponseMsg(Msg **msg){
+  return ParseGetResponseMsg(msg);
+}
+
+/**
+ * Configure the parameter: reshape data, gradient and history to `shape`,
+ * and store the configuration and the fan-in.
+ *
+ * @param proto parameter configuration (copied into proto_)
+ * @param shape shape of the parameter
+ * @param fan_in stored in fan_in_; read by Init for kUniformSqrtFanIn
+ */
+void Param::Setup(const ParamProto& proto, const vector<int>& shape,
+    int fan_in){
+  data_.Reshape(shape);
+  grad_.Reshape(shape);
+  history_.Reshape(shape);
+  proto_=proto;
+  fan_in_=fan_in;
+}
+
+/**
+ * Set the version to `v` and fill the values according to
+ * proto_.init_method().
+ *
+ * For every method except kConstant the sampled values are multiplied by a
+ * factor derived from proto_.value(); the scaling is skipped when
+ * proto_.value() is 0. An unknown method only logs an error.
+ *
+ * @param v version to store in proto_
+ */
+void Param::Init(int v){
+  proto_.set_version(v);
+  // 1-d view over all values of data_
+  Tensor<cpu, 1> data(data_.mutable_cpu_data(), Shape1(data_.count()));
+  // seed taken from the wall clock
+  // NOTE(review): whether the seed is applied on every call depends on
+  // ASingleton::Instance, which is not visible here
+  unsigned seed = std::chrono::system_clock::now().time_since_epoch().count();
+  auto random=ASingleton<Random<cpu>>::Instance(seed);
+  switch (proto_.init_method()) {
+  case ParamProto::kConstant:
+    data=proto_.value();
+    break;
+  case ParamProto::kUniform:
+    random->SampleUniform(data, proto_.low(), proto_.high());
+    if(proto_.value())
+      data*= proto_.value();
+    break;
+  case ParamProto::kUniformSqrtFanIn:
+    CHECK_GT(fan_in_,0);
+    random->SampleUniform(data, proto_.low(), proto_.high());
+    if(proto_.value())
+      data*= proto_.value()/ sqrt(fan_in_ / 3.0f);
+    break;
+  case ParamProto::kUniformSqrtFanInOut:
+    // NOTE(review): reads shape()[1], so the parameter needs at least two
+    // dimensions; this is not checked
+    random->SampleUniform(data, proto_.low(), proto_.high());
+    if(proto_.value())
+      data*= proto_.value()/ sqrt(data_.shape()[0] +data_.shape()[1]);
+    break;
+  case ParamProto::kGaussian:
+    random->SampleGaussian(data, proto_.mean(), proto_.std());
+    if(proto_.value())
+      data*= proto_.value();
+    break;
+  case ParamProto::kGaussainSqrtFanIn:
+    // NOTE(review): scales by shape()[0] rather than by fan_in_
+    random->SampleGaussian(data, proto_.mean(), proto_.std());
+    if(proto_.value())
+      data*= proto_.value()/ sqrt(data_.shape()[0]);
+    break;
+  default:
+    LOG(ERROR) << "Illegal parameter init method ";
+    break;
+  }
+}
+
+/**************************RandomSyncParam********************************
+const vector<int> RandomSyncParam::RandomSample(int seed, int m, int n){
+  vector<int> samples(m);
+  std::mt19937 gen(seed);
+  std::uniform_real_distribution<float> dist(0.f,1.f);
+  for(int i=0,k=0;i<n&&k<m;i++)
+    if((m-k)*1.0f/(n-i)>dist(gen)){
+      samples[k++]=i;
+    }
+  return samples;
+}
+
+zmsg_t* RandomSyncParam::HandleSyncMsg(zmsg_t** msg){
+  int64_t start=zclock_mono();
+  char* control=zframe_strdup(zmsg_first(*msg));
+  int seed, count;
+  sscanf(control, "%d-%d", &seed,&count);
+  delete control;
+  zframe_t* syncframe=zmsg_next(*msg);
+  CHECK_EQ(zframe_size(syncframe), count*sizeof(float));
+  float* syncptr=(float*)zframe_data(syncframe);
+  float* dptr=data_.mutable_cpu_data();
+  int k=0;
+  if(count==data_.count()){
+    for(int idx=0;idx<count;idx++){
+      float x=dptr[idx];
+      dptr[idx]+=syncptr[k];
+      syncptr[k]=x;
+      k++;
+    }
+  }else{
+    for(int idx: RandomSample(seed, count, data_.count())){
+      float x=dptr[idx];
+      dptr[idx]+=syncptr[k];
+      syncptr[k]=x;
+      k++;
+    }
+  }
+  CHECK_EQ(k,count);
+  CHECK_EQ(zframe_size(syncframe), count*sizeof(float));
+  return *msg;
+}
+
+zmsg_t *RandomSyncParam::GenSyncMsgFromWorker(float sample_ratio){
+  int64_t start=zclock_mono();
+  zmsg_t* msg=zmsg_new();
+  unsigned seed = std::chrono::system_clock::now().time_since_epoch().count();
+  int m=data_.count()*sample_ratio;
+  zmsg_addstrf(msg, "%u-%d", seed, m);
+  float* updateptr=new float[m];
+  float* dptr=data_.mutable_cpu_data();
+  float* sdptr=snapshot_.mutable_cpu_data();
+  int k=0;
+  if(m==data_.count()){
+    for(int idx=0;idx<m;idx++)
+      updateptr[k++]=dptr[idx]-sdptr[idx];
+  }else{
+    const vector<int> samples=RandomSample(seed, m, data_.count());
+    for(int idx:samples){
+      updateptr[k++]=dptr[idx]-sdptr[idx];
+    }
+  }
+  CHECK_EQ(k,m);
+  zframe_t* frame=zframe_new(updateptr, sizeof(float)*m);
+  zmsg_append(msg, &frame);
+  delete updateptr;
+  worker_gen_sync+=zclock_mono()-start;
+  return msg;
+}
+
+void RandomSyncParam::ParseSyncMsgFromPS(zmsg_t** msg){
+  int64_t start=zclock_mono();
+  //LOG(ERROR)<<"worker sync "<<id();
+  char* control=zmsg_popstr(*msg);
+  int seed, count;
+  sscanf(control, "%u-%d", &seed, &count);
+  //LOG(ERROR)<<"worker sync "<<id()<<" "<<control;
+  delete control;
+  zframe_t* psdataframe=zmsg_pop(*msg);
+  CHECK_EQ(zframe_size(psdataframe), count*sizeof(float));
+  float* psdptr=(float*)zframe_data(psdataframe);
+  float* dptr=data_.mutable_cpu_data();
+  float* sdptr=snapshot_.mutable_cpu_data();
+  int k=0;
+  if(count==data_.count()){
+    for(int idx=0;idx<count;idx++){
+      dptr[idx]+=psdptr[k++]-sdptr[idx];
+      sdptr[idx]=dptr[idx];
+    }
+  }else{
+    for(int idx: RandomSample(seed, count, data_.count())){
+      dptr[idx]+=psdptr[k++]-sdptr[idx];
+      sdptr[idx]=dptr[idx];
+    }
+  }
+  zframe_destroy(&psdataframe);
+  worker_handle_sync+=zclock_mono()-start;
+  zmsg_destroy(msg);
+}
+
+
+void RandomSyncParam::Setup(const ParamProto& proto, const vector<int>& shape,
+    int fan_in){
+  Param::Setup(proto, shape, fan_in);
+  snapshot_.Reshape(shape);
+}
+
+void RandomSyncParam::Init(){
+  Param::Init();
+  memcpy(snapshot_.mutable_cpu_data(), data_.mutable_cpu_data(),
+      sizeof(float)*data_.count());
+}
+*/
+
+/***************************ElasticParam************************************
+zmsg_t* ElasticParam::HandleSyncMsg(zmsg_t** msg){
+  int64_t start=zclock_mono();
+  char* control=zframe_strdup(zmsg_first(*msg));
+  float alpha;int count;
+  sscanf(control, "%f-%d", &alpha,&count);
+  delete control;
+  zframe_t* syncframe=zmsg_next(*msg);
+  CHECK_EQ(size(), count);
+  Tensor<cpu, 1> server(data_.mutable_cpu_data(), Shape1(count));
+  Tensor<cpu, 1> worker((float*)zframe_data(syncframe), Shape1(count));
+  worker=(worker-server)*alpha;
+  server+=worker;
+  return *msg;
+}
+
+zmsg_t *ElasticParam::GenSyncMsgFromWorker(float alpha){
+  int64_t start=zclock_mono();
+  zmsg_t* msg=zmsg_new();
+  zmsg_addstrf(msg, "%f-%d", alpha, size());
+  zmsg_addmem(msg, mutable_cpu_data(), sizeof(float)*size());
+  worker_gen_sync+=zclock_mono()-start;
+  return msg;
+}
+
+void ElasticParam::ParseSyncMsgFromPS(zmsg_t** msg){
+  int64_t start=zclock_mono();
+  //LOG(ERROR)<<"worker sync "<<id();
+  char* control=zmsg_popstr(*msg);
+  float alpha;int count;
+  sscanf(control, "%f-%d", &alpha, &count);
+  delete control;
+  zframe_t* frame=zmsg_pop(*msg);
+  CHECK_EQ(zframe_size(frame), count*sizeof(float));
+  Tensor<cpu, 1> diff((float*)zframe_data(frame), Shape1(count));
+  Tensor<cpu, 1> data(mutable_cpu_data(), Shape1(count));
+  data-=diff;
+  zframe_destroy(&frame);
+  zmsg_destroy(msg);
+  worker_handle_sync+=zclock_mono()-start;
+}
+*/
+}  // namespace singa

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/b2dc51d2/src/utils/updater.cc
----------------------------------------------------------------------
diff --git a/src/utils/updater.cc b/src/utils/updater.cc
new file mode 100644
index 0000000..0b89ee8
--- /dev/null
+++ b/src/utils/updater.cc
@@ -0,0 +1,192 @@
+
+#include "utils/updater.h"
+#include "mshadow/tensor.h"
+#include "mshadow/cxxnet_op.h"
+#include "proto/model.pb.h"
+using namespace mshadow;
+using namespace mshadow::expr;
+
+namespace  singa {
+
+/**
+ * Compute the learning rate for training step `step`.
+ *
+ * With base = base_learning_rate, final = final_learning_rate and
+ * freq = learning_rate_change_frequency:
+ *  - kFixed:       base
+ *  - kLinear:      (1-r)*base + r*final, r = step/freq
+ *  - kExponential: base / 2^(step/freq); requires base == 2*final
+ *  - kInverse_t:   base / (1 + step/final); requires base == 2*final
+ *  - kInverse:     base * (1 + gamma*step)^(-pow)
+ *  - kStep:        base * gamma^(step/freq), integer division
+ *  - kFixedStep:   step_lr(i) of the last i with step > step(i), else 0
+ *
+ * @param step current training step
+ * @return the learning rate; 0 for an unknown method (an error is logged)
+ */
+float Updater::GetLearningRate(int step){
+  float ret = 0., r = 0., base=proto_.base_learning_rate();
+  int freq=0;
+  switch (proto_.learning_rate_change_method()) {
+    case UpdaterProto_ChangeProto_kFixed:
+      ret = base;
+      break;
+    case UpdaterProto_ChangeProto_kLinear:
+      freq=proto_.learning_rate_change_frequency();
+      CHECK_GT(freq, 0) << "learning_rate_change_frequency must be positive";
+      r = step * 1.0  / freq;
+      ret = (1.0 - r) * base + r * proto_.final_learning_rate();
+      break;
+    case UpdaterProto_ChangeProto_kExponential:
+      // from convnet
+      CHECK_EQ(base, 2 * proto_.final_learning_rate())
+        << "final value should be the half";
+      freq=proto_.learning_rate_change_frequency();
+      CHECK_GT(freq, 0) << "learning_rate_change_frequency must be positive";
+      ret = base / pow(2, step * 1. / freq);
+      break;
+    case UpdaterProto_ChangeProto_kInverse_t:
+      // from convnet
+      CHECK_EQ(base, 2 * proto_.final_learning_rate())
+        << "final value should be the half";
+      ret = base / (1. + step * 1. / proto_.final_learning_rate());
+      break;
+    case UpdaterProto_ChangeProto_kInverse:
+      ret=base*pow(1.f+proto_.gamma()*step, -proto_.pow());
+      break;
+    case UpdaterProto_ChangeProto_kStep:
+      // from caffe
+      // notice it is step/change_steps, not step*1.0/change_steps
+      freq=proto_.learning_rate_change_frequency();
+      // the integer division below would trap on a zero frequency
+      CHECK_GT(freq, 0) << "learning_rate_change_frequency must be positive";
+      ret = base * pow(proto_.gamma(), step / freq);
+      break;
+    case UpdaterProto_ChangeProto_kFixedStep:
+      for(int i=0;i<proto_.step_size();i++){
+        if(step>proto_.step(i))
+          ret=proto_.step_lr(i);
+      }
+      break;
+    default:
+      LOG(ERROR) << "Wrong hyper-parameter update method";
+  }
+  return ret;
+}
+
+/***********************SGD with momentum******************************/
+/**
+ * Initialize from `proto`: base learning rate, momentum and weight decay.
+ * The base learning rate is not validated here (the check is commented out).
+ */
+void SGDUpdater::Init(const UpdaterProto& proto){
+  Updater::Init(proto);
+  base_lr_=proto.base_learning_rate();
+  //CHECK_GT(base_lr_, 0);
+  momentum_=proto.momentum();
+  weight_decay_=proto.weight_decay();
+}
+
+/**
+ * Apply one SGD step, with momentum if momentum_ > 0.
+ *
+ * The gradient buffer of `param` is modified in place (scaling, weight
+ * decay and, without momentum, multiplication by -lr).
+ *
+ * @param step training step; selects the learning rate and resets the
+ *        history at step 0
+ * @param param parameter to update
+ * @param grad_scale factor applied to the gradient before the update
+ */
+void SGDUpdater::Update(int step, shared_ptr<Param> param, float grad_scale){
+  Shape<1> s=Shape1(param->size());
+  Tensor<cpu, 1> data(param->mutable_cpu_data(), s);
+  Tensor<cpu, 1> grad(param->mutable_cpu_grad(), s);
+  float lr=GetLearningRate(step)*param->learning_rate_multiplier();
+  float wd=weight_decay_*param->weight_decay_multiplier();
+  // honor grad_scale; it must be applied before the weight decay term
+  if(grad_scale!=1.f)
+    grad*=grad_scale;
+  if(wd>0){ // L2 regularization
+    grad+=data*wd;
+  }
+  if(momentum_>0){
+    Tensor<cpu, 1> history(param->mutable_cpu_history(), s);
+    if(step==0) history=0;
+    history=history*momentum_-lr*grad;
+    data+=history;
+  }else{
+    grad*=-lr;
+    data+=grad;
+  }
+}
+
+/***********************Nesterov******************************/
+/**
+ * Initialize from `proto`: base learning rate (must be positive), momentum
+ * and weight decay.
+ */
+void NesterovUpdater::Init(const UpdaterProto& proto){
+  Updater::Init(proto);
+  base_lr_=proto.base_learning_rate();
+  CHECK_GT(base_lr_, 0);
+  // Update() relies on momentum_, so it has to be loaded here
+  momentum_=proto.momentum();
+  weight_decay_=proto.weight_decay();
+}
+
+/**
+ * Apply one Nesterov momentum step:
+ *   history_new = momentum*history_old + lr*grad
+ *   data -= (1+momentum)*history_new - momentum*history_old
+ *
+ * The gradient buffer of `param` is modified in place (scaling and weight
+ * decay).
+ *
+ * @param step training step; selects the learning rate and resets the
+ *        history at step 0
+ * @param param parameter to update
+ * @param grad_scale factor applied to the gradient before the update
+ */
+void NesterovUpdater::Update(int step, shared_ptr<Param> param, float grad_scale){
+  Shape<1> s=Shape1(param->size());
+  Tensor<cpu, 1> data(param->mutable_cpu_data(), s);
+  Tensor<cpu, 1> grad(param->mutable_cpu_grad(), s);
+  Tensor<cpu, 1> history(param->mutable_cpu_history(), s);
+  TensorContainer<cpu, 1> tmp(s);
+  if(step==0) history=0;
+  float lr=GetLearningRate(step)*param->learning_rate_multiplier();
+  float wd=weight_decay_*param->weight_decay_multiplier();
+  // honor grad_scale; it must be applied before the weight decay term
+  if(grad_scale!=1.f)
+    grad*=grad_scale;
+  if(wd>0){ // L2 regularization
+    grad+=data*wd;
+  }
+  // tmp keeps the history from before this step
+  Copy(tmp, history);
+  history=history*momentum_+lr*grad;
+  tmp=history*(1+momentum_)-tmp*momentum_;
+  data-=tmp;
+}
+/***********************AdaGrad******************************/
+/**
+ * Initialize from `proto`: base learning rate (must be positive), delta and
+ * weight decay.
+ */
+void AdaGradUpdater::Init(const UpdaterProto& proto){
+  Updater::Init(proto);
+  base_lr_=proto.base_learning_rate();
+  CHECK_GT(base_lr_, 0);
+  delta_=proto.delta();
+  weight_decay_=proto.weight_decay();
+}
+
+/**
+ * Apply one AdaGrad step: the history accumulates the squared scaled
+ * gradient, and the gradient is divided element-wise by
+ * sqrtop(history, delta_) before being applied with the learning rate.
+ *
+ * NOTE(review): grad_scale only enters the history; the gradient used in
+ * the final update is unscaled, and the weight decay term is added after
+ * the history has been updated. Confirm that this is intended.
+ */
+void AdaGradUpdater::Update(int step, shared_ptr<Param> param, float grad_scale){
+  Shape<1> s=Shape1(param->size());
+  Tensor<cpu, 1> data(param->mutable_cpu_data(), s);
+  Tensor<cpu, 1> grad(param->mutable_cpu_grad(), s);
+  Tensor<cpu, 1> history(param->mutable_cpu_history(), s);
+  // the history starts from zero at the first step
+  if(step==0) history=0;
+  history+=F<op::square>(grad*grad_scale);
+  float lr=GetLearningRate(step)*param->learning_rate_multiplier();
+  float wd=weight_decay_*param->weight_decay_multiplier();
+  if(wd>0){ // L2 regularization
+    grad+=data*wd;
+  }
+  data-=lr*grad/(F<op::sqrtop>(history,delta_));
+}
+
+/***********************RMSProp******************************/
+/**
+ * Initialize from `proto`: base learning rate (must be positive), delta,
+ * rho and weight decay.
+ */
+void RMSPropUpdater::Init(const UpdaterProto& proto){
+  Updater::Init(proto);
+  base_lr_=proto.base_learning_rate();
+  CHECK_GT(base_lr_, 0);
+  delta_=proto.delta();
+  rho_=proto.rho();
+  weight_decay_=proto.weight_decay();
+}
+
+/**
+ * Apply one RMSProp step: the history is the running average
+ * rho*history + (1-rho)*square(grad*grad_scale), and the gradient is
+ * divided element-wise by sqrtop(history, delta_) before being applied with
+ * the learning rate.
+ *
+ * NOTE(review): grad_scale only enters the history; the gradient used in
+ * the final update is unscaled. Confirm that this is intended.
+ */
+void RMSPropUpdater::Update(int step, shared_ptr<Param> param, float grad_scale){
+  Shape<1> s=Shape1(param->size());
+  Tensor<cpu, 1> data(param->mutable_cpu_data(), s);
+  Tensor<cpu, 1> grad(param->mutable_cpu_grad(), s);
+  Tensor<cpu, 1> history(param->mutable_cpu_history(), s);
+  // the history starts from zero at the first step
+  if(step==0) history=0;
+  history=history*rho_+(1-rho_)*F<op::square>(grad*grad_scale);
+  float lr=GetLearningRate(step)*param->learning_rate_multiplier();
+  float wd=weight_decay_*param->weight_decay_multiplier();
+  if(wd>0){ // L2 regularization
+    grad+=data*wd;
+  }
+  data-=lr*grad/(F<op::sqrtop>(history,delta_));
+}
+
+/***********************AdaDelta******************************
+void AdaDeltaUpdater::Init(const UpdaterProto& proto){
+  Updater::Init(proto);
+  delta_=proto.delta();
+  rho_=proto.rho();
+  weight_decay_=proto.weight_decay();
+}
+
+void AdaDeltaUpdater::Update(int step, shared_ptr<Param> param, float grad_scale){
+  Shape<1> s=Shape1(param->size());
+  Tensor<cpu, 1> data(param->mutable_cpu_data(), s);
+  Tensor<cpu, 1> grad(param->mutable_cpu_grad(), s);
+  Tensor<cpu, 1> history(param->mutable_cpu_history(), s);
+  Tensor<cpu, 1> update(param->mutable_cpu_update(), s);
+  TensorContainer<cpu, 1> tmp(s);
+  float wd=weight_decay_*param->weight_decay_multiplier();
+  if(wd>0){ // L2 regularization
+    grad+=data*wd;
+  }
+  if(step==0){
+    history=0;
+    update=0;
+  }
+  history=history*rho_+(1-rho_)*F<op::square>(grad*grad_scale);
+  tmp=grad*F<op::sqrtop>(update, delta_)/F<op::sqrtop>(history, delta_);
+  update=rho_*update+(1-rho_)*F<op::square>(tmp);
+  data-=tmp;
+}
+*/
+
+} /* singa */


[07/12] incubator-singa git commit: Transfer code from nusinga repo to singa apache repo. New communication framework is implemented to unify the frameworks of existing distributed deep learning systems. Communication is now implemented using ZeroMQ. API

Posted by wa...@apache.org.
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/b2dc51d2/include/mshadow/tensor_random.h
----------------------------------------------------------------------
diff --git a/include/mshadow/tensor_random.h b/include/mshadow/tensor_random.h
new file mode 100644
index 0000000..b3f0b84
--- /dev/null
+++ b/include/mshadow/tensor_random.h
@@ -0,0 +1,299 @@
+#ifndef MSHADOW_TENSOR_RANDOM_H
+#define MSHADOW_TENSOR_RANDOM_H
+/*!
+ *  \file tensor_random.h
+ *  \brief Random inline functions for tensor.
+ *  \author Bing Xu, Tianqi Chen
+ *   Based on curand|MKL|stdlib
+ */
+#include <cstdlib>
+#include "tensor.h"
+#include "tensor_container.h"
+
+namespace mshadow {
+    /*!
+     * \brief random number generator
+     *  The primary template is intentionally empty; the usable generators are
+     *  the explicit specializations for each device type that follow.
+     * \tparam Device the device of random number generator
+     */
+    template<typename Device>
+    class Random {};
+
+    /*!
+     * \brief CPU random number generator
+     *  Uses an MKL VSL stream when MSHADOW_USE_MKL is set; otherwise falls back to
+     *  rand()/srand() from the C library, whose state is shared by the whole process.
+     */
+    template<>
+    class Random<cpu> {
+    public:
+        /*!
+         * \brief constructor of random engine
+         * \param seed random number seed
+         */
+        Random<cpu>( int seed ){
+            #if MSHADOW_USE_MKL
+            int status = vslNewStream(&vStream_, VSL_BRNG_MT19937, seed);
+            utils::Assert( status == VSL_STATUS_OK, "MKL VSL Random engine failed to be initialized.\n" );
+            #else
+            srand(seed);
+            #endif
+            buffer_.Resize( Shape1( kRandBufferSize ) );
+        }
+        /*! \brief release the MKL stream; nothing to release for the stdlib backend */
+        ~Random<cpu>() {
+            #if MSHADOW_USE_MKL
+            vslDeleteStream(&vStream_);
+            #endif
+        }
+        /*!
+         * \brief seed random number generator using this seed
+         *  With MKL the current stream is deleted and a new one is created.
+         * \param seed seed of prng
+         */
+        inline void Seed( int seed ){
+            #if MSHADOW_USE_MKL
+            int status = vslDeleteStream(&vStream_);
+            utils::Assert(status == VSL_STATUS_OK);
+            status = vslNewStream(&vStream_, VSL_BRNG_MT19937, seed);
+            utils::Assert(status == VSL_STATUS_OK);
+            #else
+            srand( seed );
+            #endif
+        }
+        /*!
+         * \brief generate data from uniform [a,b)
+         *  dst is flattened to 2D and filled one row at a time.
+         * \param dst destination
+         * \param a lower bound of uniform
+         * \param b upper bound of uniform
+         * \tparam dim dimension of tensor
+         */
+        template<int dim>
+        inline void SampleUniform( Tensor<cpu, dim> &dst, real_t a=0.0f, real_t b=1.0f ) {
+            Tensor<cpu, 2> mat = dst.FlatTo2D();
+            for ( index_t i = 0; i < mat.shape[1]; ++i ) {
+                #if MSHADOW_USE_MKL
+                #if MSHADOW_SINGLE_PRECISION
+                int status = vsRngUniform( 0, vStream_, mat.shape[0], mat[i].dptr, a, b );
+                #else
+                int status = vdRngUniform( 0, vStream_, mat.shape[0], mat[i].dptr, a, b );
+                #endif
+                utils::Assert(status == VSL_STATUS_OK, "Failed to generate random number by MKL.\n" );
+                #else
+                // use stdlib
+                for ( index_t j = 0; j < mat.shape[0]; ++j ) {
+                    mat[i][j] = this->RandNext()*(b-a) + a;
+                }
+                #endif
+            }
+        }
+        /*!
+         * \brief generate data from gaussian with mean mu and standard deviation sigma
+         *  When sigma <= 0 every element of dst is simply set to mu.
+         * \param dst destination
+         * \param mu mean variable
+         * \param sigma standard deviation
+         * \tparam dim dimension of tensor
+         */
+        template<int dim>
+        inline void SampleGaussian( Tensor<cpu, dim> &dst, real_t mu = 0.0f, real_t sigma = 1.0f ) {
+            if( sigma <= 0.0f ) {
+                dst = mu; return;
+            }
+            Tensor<cpu, 2> mat = dst.FlatTo2D();
+            for (index_t i = 0; i < mat.shape[1]; ++i) {
+                #if MSHADOW_USE_MKL
+                #if MSHADOW_SINGLE_PRECISION
+                int status = vsRngGaussian( 0, vStream_, mat.shape[0], mat[i].dptr, mu, sigma );
+                #else
+                int status = vdRngGaussian( 0, vStream_, mat.shape[0], mat[i].dptr, mu, sigma );
+                #endif
+                utils::Assert(status == VSL_STATUS_OK, "Failed to generate random number by MKL.\n" );
+                #else
+                // SampleNormal2D yields two samples per call: draw a fresh pair on
+                // even columns and consume the second sample on the following odd column
+                real_t g1 = 0.0f, g2 = 0.0f;
+                for (index_t j = 0; j < mat.shape[0]; ++j) {
+                    if( (j & 1) == 0 ){
+                        this->SampleNormal2D( g1, g2 );
+                        mat[i][j] = mu + g1 * sigma;
+                    }else{
+                        mat[i][j] = mu + g2 * sigma;
+                    }
+                }
+                #endif
+            }
+        }
+        /*!
+         * \brief return a temporary expression storing standard gaussian random variables
+         *        the temporary tensor lives in an internal buffer and is only valid before
+         *        the next call of gaussian or uniform; can be used as part of expression
+         *  Caution: this means expression such as A = gaussian(s1) * gaussian(s2) will give invalid result,
+         *           since second call of gaussian(s2) makes gaussian(s1) invalid
+         *           A = gaussian(s1)*B+C; is correct; use one gaussian/uniform in each expression
+         * \param shape shape of the tensor
+         * \tparam dim dimension of tensor
+         */
+        template<int dim>
+        inline expr::ReshapeExp<Tensor<cpu,1>,dim,1> gaussian( Shape<dim> shape ){
+            buffer_.Resize( Shape1( shape.Size() ) );
+            this->SampleGaussian( buffer_, 0.0f, 1.0f );
+            return expr::reshape( buffer_, shape );
+        }
+        /*!
+         * \brief return a temporary expression storing standard uniform [0,1)
+         *        the temporary tensor lives in an internal buffer and is only valid before
+         *        the next call of gaussian or uniform; can be used as part of expression
+         *  Caution: this means expression such as A = gaussian(s1) * gaussian(s2) will give invalid result,
+         *           since second call of gaussian(s2) makes gaussian(s1) invalid
+         *           A = gaussian(s1)*B+C; is correct; use one gaussian/uniform in each expression
+         * \param shape shape of the tensor
+         * \tparam dim dimension of tensor
+         */
+        template<int dim>
+        inline expr::ReshapeExp<Tensor<cpu,1>,dim,1> uniform( Shape<dim> shape ){
+            buffer_.Resize( Shape1( shape.Size() ) );
+            this->SampleUniform( buffer_, 0.0f, 1.0f );
+            return expr::reshape( buffer_, shape );
+        }
+    private:
+        /*!
+         * \brief get next random number from rand, uniform in [0,1)
+         *  When real_t is single precision, rand() values close to RAND_MAX round up
+         *  during the conversion and the ratio becomes exactly 1; such draws are
+         *  rejected and redrawn so the upper bound stays exclusive.
+         */
+        inline real_t RandNext( void ){
+            real_t r;
+            do{
+                r = static_cast<real_t>(rand()) / (static_cast<real_t>(RAND_MAX)+1.0f);
+            }while( r >= 1.0f );
+            return r;
+        }
+        /*! \brief return a real number uniform in (0,1) */
+        inline real_t RandNext2( void ){
+            return (static_cast<real_t>( rand() ) + 1.0 ) / (static_cast<real_t>(RAND_MAX) + 2.0);
+        }
+        /*!
+         * \brief sample iid xx,yy ~N(0,1)
+         *  Draws points in the square [-1,1]x[-1,1], rejects those outside the open
+         *  unit disc (and the origin), then rescales the accepted point.
+         * \param xx first  gaussian output
+         * \param yy second gaussian output
+         */
+        inline void SampleNormal2D( real_t &xx, real_t &yy ){
+            real_t x,y,s;
+            do{
+                x = 2.0f * RandNext2() - 1.0f;
+                y = 2.0f * RandNext2() - 1.0f;
+                s = x*x + y*y;
+            }while( s >= 1.0f || s == 0.0f );
+            real_t t = std::sqrt( -2.0f * std::log( s ) / s ) ;
+            xx = x * t; yy = y * t;
+        }
+    private:
+        #if MSHADOW_USE_MKL
+        /*! \brief stream used by MKL VSL */
+        VSLStreamStatePtr vStream_;
+        #endif
+        /*! \brief temporary space used to store random numbers */
+        TensorContainer<cpu,1> buffer_;
+    }; // class Random<cpu>
+
+#ifdef __CUDACC__
+
+    /*!
+     * \brief GPU random number generator, backed by a cuRAND pseudo random generator
+     *  Only compiled when the translation unit is built with nvcc (__CUDACC__).
+     */
+    template<>
+    class Random<gpu> {
+    public:
+        /*!
+         * \brief constructor of random engine
+         * \param seed random number seed
+         */
+        Random<gpu>(int seed) {
+            curandStatus_t status;
+            status = curandCreateGenerator(&gen_, CURAND_RNG_PSEUDO_DEFAULT);
+            utils::Assert(status == CURAND_STATUS_SUCCESS, "Can not create CURAND Generator");
+            this->Seed( seed );
+            buffer_.Resize( Shape1(kRandBufferSize) );
+        }
+
+        /*! \brief destroy the cuRAND generator */
+        ~Random<gpu>() {
+            curandStatus_t status;
+            status = curandDestroyGenerator(gen_);
+            utils::Assert(status == CURAND_STATUS_SUCCESS, "Destory CURAND Gen failed");
+        }
+        /*!
+         * \brief seed random number generator using this seed
+         * \param seed seed of prng
+         */
+        inline void Seed( int seed ){
+            curandStatus_t status;
+            status = curandSetPseudoRandomGeneratorSeed(gen_, seed);
+            utils::Assert(status == CURAND_STATUS_SUCCESS, "Set CURAND seed failed.");
+        }
+        /*!
+         * \brief generate data from uniform between a and b
+         *  The samples come from uniform(), see the note there about which end
+         *  point of the interval is included.
+         * \param dst destination
+         * \param a lower bound of uniform
+         * \param b upper bound of uniform
+         * \tparam dim dimension of tensor
+         */
+        template<int dim>
+        inline void SampleUniform(Tensor<gpu, dim> &dst, real_t a=0.0f, real_t b=1.0f) {
+            // skip the affine rescaling for the default unit interval
+            if( a == 0.0f && b == 1.0f ){
+                dst = this->uniform( dst.shape );
+            }else{
+                dst = this->uniform( dst.shape ) *(b-a) + a;
+            }
+        }
+        /*!
+         * \brief generate data from gaussian with mean mu and standard deviation sigma
+         * \param dst destination
+         * \param mu mean variable
+         * \param sigma standard deviation
+         * \tparam dim dimension of tensor
+         */
+        template<int dim>
+        inline void SampleGaussian(Tensor<gpu, dim> &dst, real_t mu = 0.0f, real_t sigma = 1.0f) {
+            dst = this->gaussian( dst.shape, mu, sigma );
+        }
+        /*!
+         * \brief return a temporary expression storing gaussian random variables
+         *        the temporary tensor lives in an internal buffer and is only valid before
+         *        the next call of gaussian or uniform; can be used as part of expression
+         *  Caution: this means expression such as A = gaussian(s1) * gaussian(s2) will give invalid result,
+         *           since second call of gaussian(s2) makes gaussian(s1) invalid
+         *           A = gaussian(s1)*B+C; is correct; use one gaussian/uniform in each expression
+         * \param shape shape of the tensor
+         * \param mu mean
+         * \param sigma standard deviation (passed to cuRAND as its stddev argument)
+         * \tparam dim dimension of tensor
+         */
+        template<int dim>
+        inline expr::ReshapeExp<Tensor<gpu,1>,dim,1> gaussian( Shape<dim> shape, real_t mu=0.0f, real_t sigma=1.0f){
+            // cuRAND pseudo random generators produce normal samples in pairs and
+            // reject an odd count, so round the request up to the next even number
+            size_t aligned_sz = ((shape.Size() + 1UL)>>1)<<1;
+            // the first resize makes room for the even-sized request, the second
+            // restores the logical length that reshape expects
+            // NOTE(review): relies on Resize keeping the larger allocation when
+            // shrinking - confirm against TensorContainer::Resize
+            buffer_.Resize( Shape1( aligned_sz ) );
+            buffer_.Resize( Shape1( shape.Size() ) );
+            curandStatus_t status;
+            // both precisions must ask for the even aligned_sz, not the logical length
+            #if MSHADOW_SINGLE_PRECISION
+            status = curandGenerateNormal(gen_, buffer_.dptr, aligned_sz , mu, sigma);
+            #else
+            status = curandGenerateNormalDouble(gen_, buffer_.dptr, aligned_sz, mu, sigma);
+            #endif
+            utils::Assert(status == CURAND_STATUS_SUCCESS, "CURAND Gen Normal failed\n");
+            return expr::reshape( buffer_, shape );
+        }
+        /*!
+         * \brief return a temporary expression storing standard uniform random variables
+         *        the temporary tensor lives in an internal buffer and is only valid before
+         *        the next call of gaussian or uniform; can be used as part of expression
+         *  Note: the cuRAND reference documents curandGenerateUniform(Double) as returning
+         *        values in (0,1], i.e. 0 excluded and 1 included, unlike the CPU generator.
+         *  Caution: this means expression such as A = gaussian(s1) * gaussian(s2) will give invalid result,
+         *           since second call of gaussian(s2) makes gaussian(s1) invalid
+         *           A = gaussian(s1)*B+C; is correct; use one gaussian/uniform in each expression
+         * \param shape shape of the tensor
+         * \tparam dim dimension of tensor
+         */
+        template<int dim>
+        inline expr::ReshapeExp<Tensor<gpu,1>,dim,1> uniform(Shape<dim> shape) {
+            buffer_.Resize( Shape1( shape.Size() ) );
+            curandStatus_t status;
+            #if MSHADOW_SINGLE_PRECISION
+            status = curandGenerateUniform(gen_, buffer_.dptr, buffer_.shape[0] );
+            #else
+            status = curandGenerateUniformDouble(gen_, buffer_.dptr, buffer_.shape[0] );
+            #endif
+            utils::Assert(status == CURAND_STATUS_SUCCESS, "CURAND Gen Uniform failed\n");
+            return expr::reshape( buffer_, shape );
+        }
+    private:
+        /*! \brief random number generator */
+        curandGenerator_t gen_;
+        /*! \brief temporary buffer holding the most recently generated samples */
+        TensorContainer<gpu, 1> buffer_;
+    }; // class Random<gpu>
+    #endif
+
+}; // namespace mshadow
+
+#endif // MSHADOW_TENSOR_RANDOM_H

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/b2dc51d2/include/mshadow/tensor_sse-inl.hpp
----------------------------------------------------------------------
diff --git a/include/mshadow/tensor_sse-inl.hpp b/include/mshadow/tensor_sse-inl.hpp
new file mode 100644
index 0000000..b98383e
--- /dev/null
+++ b/include/mshadow/tensor_sse-inl.hpp
@@ -0,0 +1,431 @@
+#ifndef MSHADOW_TENSOR_SSE_INL_HPP
+#define MSHADOW_TENSOR_SSE_INL_HPP
+/*!
+ * \file tensor_sse-inl.hpp
+ * \brief support of sse2 optimization of some operations
+ * \author Tianqi Chen
+ */
+#ifdef __APPLE__
+#include <stdlib.h>
+#else
+#include <malloc.h>
+#endif
+
+#include "tensor_expr.h"
+#include "tensor.h"
+
+namespace mshadow {
+    /*! \brief namespace to support sse2 vectorization */
+    namespace sse2{
+        /*!
+         * \brief analog to cudaMallocPitch: allocate num_line lines, each padded to a multiple of 16
+         *  The padded width times num_line is handed directly to the allocator,
+         *  so the widths are effectively byte counts here.
+         * \param pitch output parameter, the actual space allocated for each line
+         * \param lspace space required for each line
+         * \param num_line number of lines to be allocated
+         * \return pointer to the allocated space; asserts that it is not NULL
+         */
+        inline void* AlignedMallocPitch( size_t &pitch, size_t lspace, size_t num_line ){
+            // round the line width up to the next multiple of 16
+            pitch = ( lspace + 15 ) & ~static_cast<size_t>( 15 );
+            const size_t total = pitch * num_line;
+            #if defined(_MSC_VER)
+            void *res = _aligned_malloc( total, 16 );
+            #elif defined(__APPLE__)
+            void *res = malloc( total );
+            #else
+            void *res = memalign( 16, total );
+            #endif
+            utils::Assert( res != NULL, "AlignedMallocPitch failed" );
+            return res;
+        }
+        /*!
+         * \brief free aligned space, the counterpart of AlignedMallocPitch
+         * \param ptr pointer to space to be freed
+         */
+        inline void AlignedFree( void *ptr ){
+            #ifndef _MSC_VER
+            free( ptr );
+            #else
+            // memory obtained from _aligned_malloc has to go back through _aligned_free
+            _aligned_free( ptr );
+            #endif
+        }
+        /*! \brief check if a size or pitch is a multiple of 16 */
+        inline bool CheckAlign( size_t pitch ){
+            const size_t low_bits = pitch & static_cast<size_t>( 15 );
+            return low_bits == 0;
+        }
+        /*! \brief check if the address held by a pointer is a multiple of 16 */
+        inline bool CheckAlign( void *ptr ){
+            const size_t addr = (size_t)ptr;
+            return CheckAlign( addr );
+        }
+        /*!
+         * \brief get upper bound of aligned index of size:
+         *        the byte length size*fsize is rounded up to a multiple of 16
+         *        and converted back to a number of elements
+         * \param size size of the array
+         * \param fsize size of float
+         */
+        inline index_t UpperAlign( index_t size, size_t fsize ){
+            const size_t nbyte = size * fsize;
+            const size_t padded = ( ( nbyte + 15 ) >> 4 ) << 4;
+            return padded / fsize;
+        }
+        /*!
+         * \brief get lower bound of aligned index of size:
+         *        the byte length size*fsize is rounded down to a multiple of 16
+         *        and converted back to a number of elements
+         * \param size size of the array
+         * \param fsize size of float
+         */
+        inline index_t LowerAlign( index_t size, size_t fsize ){
+            const size_t nbyte = size * fsize;
+            const size_t truncated = ( nbyte >> 4 ) << 4;
+            return truncated / fsize;
+        }
+    }; // namespace sse2
+}; // namespace  mshadow
+
+#if MSHADOW_USE_SSE
+// sse types are not compatible with nvcc, only use them in cpu mode
+#include <emmintrin.h>
+
+namespace mshadow{
+    namespace sse2{
+        /*!
+         * \brief float vector real type, used for vectorization
+         *  The primary template is empty; the specializations for float and
+         *  double wrap the matching sse register type.
+         * \tparam FloatType double or float
+         */
+        template<typename FloatType> struct FVec{};
+        
+        /*! \brief vector real type for float: wraps one __m128 holding kSize packed floats */
+        template<> 
+        struct FVec<float> {
+        public:
+            typedef __m128 DType;
+            /*! \brief number of float in vector */
+            const static index_t kSize = 4;
+            /*! \brief data content */
+            DType data_;
+        public:
+            /* constructors */
+            /*! \brief default constructor, data_ is left unset */
+            FVec( void ){}
+            /*! \brief wrap an existing register value */
+            FVec( DType data ):data_(data){}
+            /*! \brief set every lane to the float s */
+            FVec( const float &s ){
+                data_ = _mm_set1_ps( s );
+            }
+            /*!
+             * \brief load from pointer src
+             *  _mm_load_ps is the aligned load, so src has to be 16-byte aligned
+             */
+            FVec( const float *src ){
+                data_ = _mm_load_ps( src );                
+            } 
+        public:
+            /*!
+             * \brief store data into dst space
+             *  _mm_store_ps is the aligned store, so dst has to be 16-byte aligned
+             */
+            inline void Store( float *dst ) const{
+                return _mm_store_ps( dst, data_ );
+            }
+            /*! \brief sum of all content */
+            inline float Sum( void ) const{
+                // add the upper two lanes onto the lower two lanes
+                DType ans  = _mm_add_ps( data_, _mm_movehl_ps( data_, data_ ) );
+                // add lane 1 onto lane 0, the total ends up in the lowest lane
+                DType rst  = _mm_add_ss( ans, _mm_shuffle_ps( ans, ans, 1 ) );
+                // 64-bit MSVC up to version 1500 reads the lane through the union member,
+                // presumably because _mm_cvtss_f32 is not usable there - TODO confirm
+                #if defined(_MSC_VER) && ( _MSC_VER <= 1500 ) && defined(_WIN64)
+                return rst.m128_f32[ 0 ];
+                #else
+                float rr = _mm_cvtss_f32( rst ) ;
+                return rr;
+                #endif
+            }
+        };
+
+        /*! \brief vector real type for double: wraps one __m128d holding kSize packed doubles */
+        template<> 
+        struct FVec<double> {
+        public:
+            typedef __m128d DType;
+            /*! \brief number of double in vector */
+            const static index_t kSize = 2;
+            /*! \brief data content */
+            DType data_;
+        public:
+            /* constructors */
+            /*! \brief default constructor, data_ is left unset */
+            FVec( void ){}
+            /*! \brief wrap an existing register value */
+            FVec( DType data ):data_(data){}
+            /*! \brief set every lane to the double s */
+            FVec( const double &s ){
+                data_ = _mm_set1_pd( s );
+            }
+            /*!
+             * \brief load from pointer src
+             *  _mm_load_pd is the aligned load, so src has to be 16-byte aligned
+             */
+            FVec( const double *src ){
+                data_ = _mm_load_pd( src );                
+            } 
+        public:
+            /*!
+             * \brief store data into dst space
+             *  _mm_store_pd is the aligned store, so dst has to be 16-byte aligned
+             */
+            inline void Store( double *dst ) const{
+                return _mm_store_pd( dst, data_ );
+            }
+            /*! \brief sum of all content */
+            inline double Sum( void ) const{
+                // add the upper lane onto the lower lane, the total ends up in the lowest lane
+                DType tmp =  _mm_add_sd( data_, _mm_unpackhi_pd( data_,data_ ) ) ;
+                // 64-bit MSVC up to version 1500 reads the lane through the union member,
+                // presumably because _mm_cvtsd_f64 is not usable there - TODO confirm
+                #if defined(_MSC_VER) && ( _MSC_VER <= 1500 ) && defined(_WIN64)
+                return tmp.m128d_f64[0];
+                #else
+                double ans = _mm_cvtsd_f64( tmp );
+                return ans;
+                #endif
+            }
+        };
+    };
+
+    namespace sse2{
+        /*!
+         * \brief sse2 operator type of certain operator
+         *  The primary template marks an operator as not vectorized (kEnabled = false).
+         *  Each specialization below sets kEnabled = true and provides Map overloads
+         *  working on packed float and packed double vectors.
+         * \tparam OP the operator type
+         */
+        template<typename OP>
+        struct SSEOp{
+            const static bool kEnabled = false;
+        };        
+        /*! \brief packed addition, lhs + rhs */
+        template<>
+        struct SSEOp<op::plus>{
+            const static bool kEnabled = true;
+            MSHADOW_CINLINE static FVec<float> Map( const FVec<float> &lhs, const FVec<float> &rhs ){
+                return FVec<float>( _mm_add_ps( lhs.data_, rhs.data_ ) );
+            }
+            MSHADOW_CINLINE static FVec<double> Map( const FVec<double> &lhs, const FVec<double> &rhs ){
+                return FVec<double>( _mm_add_pd( lhs.data_, rhs.data_ ) );
+            }
+        };
+        /*! \brief packed subtraction, lhs - rhs */
+        template<>
+        struct SSEOp<op::minus>{
+            const static bool kEnabled = true;
+            MSHADOW_CINLINE static FVec<float> Map( const FVec<float> &lhs, const FVec<float> &rhs ){
+                return FVec<float>( _mm_sub_ps( lhs.data_, rhs.data_ ) );
+            }
+            MSHADOW_CINLINE static FVec<double> Map( const FVec<double> &lhs, const FVec<double> &rhs ){
+                return FVec<double>( _mm_sub_pd( lhs.data_, rhs.data_ ) );
+            }
+        };
+        /*! \brief packed multiplication, lhs * rhs */
+        template<>
+        struct SSEOp<op::mul>{
+            const static bool kEnabled = true;
+            MSHADOW_CINLINE static FVec<float> Map( const FVec<float> &lhs, const FVec<float> &rhs ){
+                return FVec<float>( _mm_mul_ps( lhs.data_, rhs.data_ ) );
+            }
+            MSHADOW_CINLINE static FVec<double> Map( const FVec<double> &lhs, const FVec<double> &rhs ){
+                return FVec<double>( _mm_mul_pd( lhs.data_, rhs.data_ ) );
+            }
+        };
+        /*! \brief packed division, lhs / rhs */
+        template<>
+        struct SSEOp<op::div>{
+            const static bool kEnabled = true;
+            MSHADOW_CINLINE static FVec<float> Map( const FVec<float> &lhs, const FVec<float> &rhs ){
+                return FVec<float>( _mm_div_ps( lhs.data_, rhs.data_ ) );
+            }
+            MSHADOW_CINLINE static FVec<double> Map( const FVec<double> &lhs, const FVec<double> &rhs ){
+                return FVec<double>( _mm_div_pd( lhs.data_, rhs.data_ ) );
+            }
+        };
+
+        /*! \brief unary identity, returns the source vector unchanged */
+        template<>
+        struct SSEOp<op::identity>{
+            const static bool kEnabled = true;
+            MSHADOW_CINLINE static FVec<float> Map( const FVec<float> &src ){
+                return src;
+            }
+            MSHADOW_CINLINE static FVec<double> Map( const FVec<double> &src ){
+                return src;
+            }
+        };
+    }; // namespace sse2
+    
+    namespace sse2{
+        // savers to do storage
+        /*!
+         * \brief generic saver: loads the packet currently at dst, combines it with src
+         *        through the sse operator of SV::OPType and stores the result back to dst
+         *  dst goes through the aligned FVec load and store.
+         * \tparam SV saver type, must expose OPType
+         * \tparam TFloat float or double
+         */
+        template<typename SV, typename TFloat>
+        struct Saver{
+            MSHADOW_CINLINE static void Save( TFloat *dst, const FVec<TFloat> &src ){
+                FVec<TFloat> lhs( dst );
+                FVec<TFloat> ans = SSEOp<typename SV::OPType>::Map( lhs, src );
+                ans.Store( dst );
+            }
+        };
+        /*! \brief plain assignment: src overwrites dst, the old content of dst is not read */
+        template<typename TFloat>
+        struct Saver<sv::saveto,TFloat>{
+            MSHADOW_CINLINE static void Save( TFloat *dst, const FVec<TFloat> &src ){
+                src.Store( dst );
+            }
+        };        
+    }; // namespace sse2
+}; // namespace mshadow
+
+namespace mshadow{
+    namespace expr{
+        /*!
+         * \brief same as plan, but use sse2
+         *  The primary template only declares the interface; the specializations
+         *  for the individual expression types provide the definitions.
+         * \tparam ExpType type of the expression to be evaluated
+         */
+        template<typename ExpType>
+        class SSEPlan {
+        public:
+            /*!
+             * \brief evaluate the expression at index [y][x], x will be aligned to 4
+             *        to be implemented by SubType
+             */
+            MSHADOW_CINLINE sse2::FVec<real_t> EvalSSE( index_t y, index_t x ) const;
+            /*! \brief evaluate the expression for the single element at index [y][x] */
+            MSHADOW_CINLINE real_t Eval( index_t y, index_t x ) const;
+        };
+
+        /*!
+         * \brief plan reading directly from tensor memory
+         *  Keeps a copy of the data pointer and the stride of the tensor,
+         *  element [y][x] lives at dptr_[ y*stride_+x ].
+         */
+        template <typename Device, int dim>
+        class SSEPlan< Tensor<Device,dim> >{
+        public:
+            SSEPlan( const Tensor<Device,dim> &t )
+                :dptr_(t.dptr),stride_(t.shape.stride_){}
+            /*! \brief load one packet starting at [y][x]; goes through the pointer constructor of FVec */
+            MSHADOW_CINLINE sse2::FVec<real_t> EvalSSE( index_t y, index_t x ) const{
+                return sse2::FVec<real_t>( &dptr_[ y*stride_+x ] );
+            }
+            /*! \brief read the single element at [y][x] */
+            MSHADOW_CINLINE real_t Eval( index_t y, index_t x ) const{
+                return dptr_[ y * stride_ + x ];
+            }
+        private:
+            /*! \brief start of the tensor data */
+            const real_t  *dptr_;
+            /*! \brief distance between consecutive rows, in elements */
+            index_t stride_;
+        };
+
+        /*! \brief plan of a scalar: every index evaluates to the same stored value */
+        template<>
+        class SSEPlan<ScalarExp>{
+        public:
+            SSEPlan( real_t scalar ):scalar_(scalar){}
+            /*! \brief packet with the scalar in every lane, y and x are ignored */
+            MSHADOW_CINLINE sse2::FVec<real_t> EvalSSE( index_t y, index_t x ) const{
+                return sse2::FVec<real_t>( scalar_ );
+            }
+            /*! \brief the scalar itself, y and x are ignored */
+            MSHADOW_CINLINE real_t Eval( index_t y, index_t x ) const{
+                return scalar_;
+            }
+        private:
+            real_t scalar_;
+        };
+
+        /*!
+         * \brief plan of a binary expression: evaluates both operand plans at the same
+         *        index and combines the results with OP
+         */
+        template<typename OP, typename TA, typename TB,int etype>
+        class SSEPlan< BinaryMapExp<OP,TA,TB,etype> >{
+        public:
+            SSEPlan( const SSEPlan<TA> &lhs, const SSEPlan<TB> &rhs )
+                :lhs_(lhs), rhs_(rhs){}
+            /*! \brief packet version, combines through the sse specialization of OP */
+            MSHADOW_CINLINE sse2::FVec<real_t> EvalSSE( index_t y, index_t x ) const{
+                return sse2::SSEOp<OP>::Map( lhs_.EvalSSE( y, x ), rhs_.EvalSSE( y, x ) );
+            }
+            /*! \brief single element version, combines through the scalar OP::Map */
+            MSHADOW_CINLINE real_t Eval( index_t y, index_t x ) const{
+                return OP::Map( lhs_.Eval( y, x ), rhs_.Eval( y, x ) );
+            }
+        private:
+            // operand plans are stored by value
+            SSEPlan<TA> lhs_;
+            SSEPlan<TB> rhs_;
+        };
+
+        /*! \brief plan of a unary expression: evaluates the source plan and applies OP to the result */
+        template<typename OP, typename TA, int etype>
+        class SSEPlan< UnaryMapExp<OP,TA,etype> >{
+        public:
+            SSEPlan( const SSEPlan<TA> &src ):src_(src){}
+            /*! \brief packet version, applies the sse specialization of OP */
+            MSHADOW_CINLINE sse2::FVec<real_t> EvalSSE( index_t y, index_t x ) const{
+                return sse2::SSEOp<OP>::Map( src_.EvalSSE( y, x ) );
+            }
+            /*! \brief single element version, applies the scalar OP::Map */
+            MSHADOW_CINLINE real_t Eval( index_t y, index_t x ) const{
+                return OP::Map( src_.Eval( y, x ) );
+            }
+        private:
+            // source plan is stored by value
+            SSEPlan<TA> src_;
+        };
+
+        // MakeSSEPlan: build the SSEPlan matching an expression, recursing into operands.
+        // The binary overload is declared first so that the overloads defined below
+        // can refer to it before its definition at the end of this group.
+        template<typename OP, typename TA, typename TB, int etype>
+        inline SSEPlan< BinaryMapExp<OP,TA,TB,etype> > MakeSSEPlan( const BinaryMapExp<OP,TA,TB,etype> &e );
+
+        /*! \brief plan of a scalar expression, carries the scalar value */
+        inline SSEPlan<ScalarExp> MakeSSEPlan( const ScalarExp &e ){
+            return SSEPlan<ScalarExp>( e.scalar_ );
+        }
+
+        /*! \brief plan of a container expression, built from e.self() */
+        template<typename T>
+        inline SSEPlan<T> MakeSSEPlan( const ContainerExp<T> &e ){
+            return SSEPlan<T>( e.self() );
+        }
+
+        /*!
+         * \brief plan of an expression that makes a cpu tensor, built from e.real_self()
+         *  NOTE(review): no SSEPlan specialization for such T is visible in this file,
+         *  so this presumably is never instantiated - confirm against SSECheck users
+         */
+        template<typename T,int dim>
+        inline SSEPlan<T> MakeSSEPlan( const MakeTensorExp<T,cpu,dim> &e ){
+            return SSEPlan<T>( e.real_self() );
+        }
+
+        /*! \brief plan of a unary expression, wraps the plan of its source */
+        template<typename OP, typename TA, int etype>
+        inline SSEPlan< UnaryMapExp<OP,TA,etype> > MakeSSEPlan( const UnaryMapExp<OP,TA,etype> &e ){
+            return SSEPlan< UnaryMapExp<OP,TA,etype> >( MakeSSEPlan(e.src_) );
+        }
+
+        /*! \brief plan of a binary expression, wraps the plans of both operands */
+        template<typename OP, typename TA, typename TB, int etype>
+        inline SSEPlan< BinaryMapExp<OP,TA,TB,etype> > MakeSSEPlan( const BinaryMapExp<OP,TA,TB,etype> &e ){
+                return SSEPlan< BinaryMapExp<OP,TA,TB,etype> >( MakeSSEPlan(e.lhs_), MakeSSEPlan(e.rhs_) );
+        }
+    };
+
+    namespace expr{
+        /*!
+         * \brief static check sse enable
+         *        if a expression E can not be evaluated using sse, then kPass = false
+         *  The primary template rejects every expression type; only the
+         *  specializations below can report kPass = true.
+         * \tparam E expression
+         */
+        template<typename E>
+        struct SSECheck{
+            const static bool kPass = false;
+        };
+        /*! \brief scalars always pass */
+        template<>
+        struct SSECheck<ScalarExp>{
+            const static bool kPass = true;
+        };
+        /*! \brief cpu tensors of any dimension pass */
+        template<int dim>
+        struct SSECheck<Tensor<cpu,dim> >{
+            const static bool kPass = true;
+        };
+        
+        /*! \brief a unary expression passes when its source passes and OP has an sse version */
+        template<typename OP, typename TA, int etype>
+        struct SSECheck<UnaryMapExp<OP,TA,etype> >{
+            const static bool kPass = SSECheck<TA>::kPass && sse2::SSEOp<OP>::kEnabled;
+        };
+        /*! \brief a binary expression passes when both operands pass and OP has an sse version */
+        template<typename OP, typename TA, typename TB, int etype>
+        struct SSECheck< BinaryMapExp<OP,TA,TB,etype> >{
+            const static bool kPass = SSECheck<TA>::kPass && SSECheck<TB>::kPass && sse2::SSEOp<OP>::kEnabled;
+        }; 
+    }; // namespace expr
+    namespace expr{
+        /*!
+         * \brief runtime check if data is aligned and allow sse operation
+         *  The primary template answers false for every expression type that has
+         *  no specialization below.
+         * \tparam dim dimension that tensors inside the expression must have
+         * \tparam E expression
+         */
+        template<int dim,typename E>
+        struct SSEAlignCheck{
+            inline static bool Check( const E &exp ){
+                return false;
+            }
+        };
+        /*! \brief a scalar has no memory to align, always true */
+        template<int dim>
+        struct SSEAlignCheck< dim, ScalarExp >{
+            inline static bool Check( const ScalarExp &exp ){
+                return true;
+            }
+        };
+        /*!
+         * \brief a cpu tensor is accepted when both its data pointer and its row
+         *        stride in bytes are multiples of 16
+         *  Only matches tensors whose dimension equals dim; other dimensions fall
+         *  back to the primary template.
+         */
+        template<int dim>
+        struct SSEAlignCheck< dim,Tensor<cpu,dim> >{
+            inline static bool Check( const Tensor<cpu,dim> &t ){
+                return sse2::CheckAlign( t.dptr ) && sse2::CheckAlign( t.shape.stride_ * sizeof( real_t ) );
+            }
+        };
+        /*! \brief a unary expression is aligned when its source is */
+        template<int dim, typename OP, typename TA, int etype>
+        struct SSEAlignCheck< dim, UnaryMapExp<OP,TA,etype> >{
+            inline static bool Check( const UnaryMapExp<OP,TA,etype> &t ){
+                return SSEAlignCheck<dim,TA>::Check( t.src_);
+            }
+        };
+        /*! \brief a binary expression is aligned when both operands are */
+        template<int dim, typename OP, typename TA, typename TB, int etype>
+        struct SSEAlignCheck< dim, BinaryMapExp<OP,TA,TB,etype> >{ 
+            inline static bool Check( const BinaryMapExp<OP,TA,TB,etype> &t ){
+                return SSEAlignCheck<dim,TA>::Check( t.lhs_ ) && 
+                    SSEAlignCheck<dim,TB>::Check( t.rhs_ );
+            }
+        };
+    }; // namespace expr
+
+    /*!
+     * \brief use SSEPlan to compute result
+     *  The destination is flattened to 2D. In every row the leading columns that
+     *  form whole 16-byte packets are written through the sse saver, the
+     *  remaining columns are written one element at a time.
+     * \param _dst destination tensor
+     * \param plan plan of the expression to be evaluated
+     * \tparam SV saver type
+     * \tparam E expression type
+     * \tparam dim dimension of the destination
+     */
+    template<typename SV, typename E, int dim>
+    inline void MapSSEPlan(Tensor<cpu,dim> _dst, const expr::SSEPlan<E> &plan){
+        Tensor<cpu,2> mat = _dst.FlatTo2D();
+        // number of leading columns covered by whole packets
+        const index_t packed = sse2::LowerAlign( mat.shape[0], sizeof(real_t) );
+        for( index_t row = 0; row < mat.shape[1]; ++row ){
+            // vectorized part of the row
+            index_t col = 0;
+            while( col < packed ){
+                sse2::Saver<SV,real_t>::Save( &mat[row][col], plan.EvalSSE( row, col ) );
+                col += sse2::FVec<real_t>::kSize;
+            }
+            // scalar tail of the row
+            for( col = packed; col < mat.shape[0]; ++col ){
+                SV::Save( mat[row][col], plan.Eval( row, col ) );
+            }
+        }
+    }
+}; // namespace mshadow
+#endif // MSHADOW_USE_SSE
+#endif // MSHADOW_TENSOR_SSE_INL_HPP

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/b2dc51d2/include/neuralnet/base_layer.h
----------------------------------------------------------------------
diff --git a/include/neuralnet/base_layer.h b/include/neuralnet/base_layer.h
new file mode 100644
index 0000000..863c223
--- /dev/null
+++ b/include/neuralnet/base_layer.h
@@ -0,0 +1,563 @@
+#ifndef INCLUDE_BASE_LAYER_H_
+#define INCLUDE_BASE_LAYER_H_
+
+#include <vector>
+#include <string>
+#include <map>
+#include <functional>
+#include <utility>
+#include <condition_variable>
+#include <mutex>
+#include <memory>
+#include <chrono>
+#include <algorithm>
+
+#include "proto/model.pb.h"
+#include "utils/param.h"
+#include "utils/common.h"
+#include "utils/blob.h"
+
+using std::vector;
+using std::shared_ptr;
+using std::make_shared;
+using std::string;
+using std::map;
+
+namespace singa{
+
+class Layer;
+typedef shared_ptr<Layer> SLayer;
+/**
+ * Base layer class.
+ * Children should implement at least Layer::Setup, Layer::ComputeFeature(),
+ * Layer::ComputeGradient() functions for backpropagation method;
+ * TODO(wangwei) implement children layers to support contrastive divergence,
+ * The identifier of each layer is the literal string of the class name without
+ * the suffix "Layer", which is used in layer registration and creation.
+ */
+class Layer {
+ public:
+  Layer(){}
+  /**
+   * simply save the proto configuration.
+   * most initializations are done by Setup().
+   * @param layer_proto user defined layer configuration
+   */
+  virtual void Init(const LayerProto &proto);
+  /**
+   * copy layer configuration from the other Layer, and set the shape.
+   */
+  void Init(const Layer& other, const vector<int>& shape);
+  virtual ~Layer(){}
+  /**
+   * Marshal layer properties and data into google protobuf object
+   * (i.e., snapshot).
+   * Parameters are marshalled separately into another object (i.e., model).
+   * @param layer_proto
+   * @param copyData if true marshal data of DArray
+   */
+  virtual void ToProto(LayerProto *layer_proto, bool copyData);
+  /**
+   * Setup layer properties.
+   * Setup the shapes for data and parameters, also setup some properties
+   * based on the layer configuration and connected src layers.
+   * @param srclayers layers connecting to this layer
+   */
+  virtual void Setup(const LayerProto& proto,
+      const vector<SLayer>& srclayers)=0;
+  /**
+   * \copydoc Setup(const LayerProto&, const vector<SLayer>&)
+   */
+  virtual void Setup();
+  /**
+   * Setup the layer properties except shape.
+   * the shape is already set and passed in to set other properties.
+   * properties are set according to shapes of itself and connected layers, and
+   * configuration. this should not change the current shape_(
+   * shape check is done outside the function).
+   */
+  virtual void SetupAfterPartition(const LayerProto& proto,
+      const vector<int> &shape,
+      const vector<SLayer>& srclayers)=0;
+  /**
+   * \copybrief SetupAfterPartition(const LayerProto&, const vector<int> &,
+   * const vector<SLayer>& ).
+   */
+  virtual void SetupAfterPartition();
+  /**
+   * Layers that have parameters must override this function.
+   * @return parameters associated with this layer
+   */
+  virtual vector<shared_ptr<Param>> GetParams(){
+    return vector<shared_ptr<Param>>();
+  }
+  /**
+   * Compute features of this layer based on connected layers.
+   * Implement forward propagation for BP; TODO Implement both positive phase
+   * and negative phase for CD.
+   * @param srclayers layers connecting to this layer
+   */
+  virtual void ComputeFeature(bool training, const vector<SLayer>& srclayers)=0;
+  /**
+   * \copybrief ComputeFeature(const vector<SLayer>& srclayers)
+   */
+  virtual void ComputeFeature(bool training);
+  /**
+   * Compute gradients for parameters and connecting layers.
+   * Implement backward propagation for BP; TODO Calculate gradients for
+   * parameters for CD.
+   * @param srclayers layers connecting to this layer.
+   */
+  virtual void ComputeGradient(const vector<SLayer>& srclayers)=0;
+  /**
+   * \copybrief ComputeGradient(const vector<SLayer>& srclayers)
+   */
+  virtual void ComputeGradient();
+  /**
+   * decide on which dimension to do the partitioning.
+   * @mode kLayer, kData, kNone (no partition)
+   * @return the partition dimension, -1 for no partition
+   */
+  virtual int partition_dimension() const {
+    int ret=0;
+    if(partition_type()==kLayerPartition)
+      ret= 1;
+    else if(partition_type()==kNone)
+      ret= -1;
+    return ret;
+  }
+
+  /**
+   * return connection type between two layers.
+   * Currently support two connections: kOneToOne, and kOneToAll.
+   * kOneToOne indicates the dst neuron depends on only one neuron from src
+   * layer. kOneToAll indicates the dst neuron depends on all neurons from src
+   * layer. TODO support kOneToMany.
+   */
+  virtual ConnectionType connection_type(int k) const {
+    CHECK_LT(k, srclayers_.size());
+    return kOneToOne;
+  }
+  /**
+   * return partition type of this layer.
+   * E.g., kNone, kLayer or kData
+   */
+  virtual PartitionType partition_type() const {
+    return layer_proto_.partition_type();
+  }
+  /**
+   * location id is the execution unit (i.e., thread from the working group) ID.
+   */
+  virtual void set_locationid(int id){
+    layer_proto_.set_locationid(id);
+  }
+  virtual int locationid() const {
+    return layer_proto_.locationid();
+  }
+  /**
+   * partition id is the ID of the layer in the original layer.
+   */
+  virtual void set_partitionid(int id){
+    layer_proto_.set_partitionid(id);
+  }
+  virtual int partitiionid() const {
+    return layer_proto_.partitionid();
+  }
+  virtual void set_name(string name){
+    name_=name;
+    layer_proto_.set_name(name);
+  }
+  virtual const string type() const {
+    return layer_proto_.type();
+  }
+  /**
+   * Return name of this layer
+   */
+  const std::string &name() const {
+    return layer_proto_.name();
+  }
+  const vector<int>& shape(const Layer* layer=nullptr) const{
+    return data(layer).shape();
+  }
+
+  /**
+   * @return a const ref for Blob storing neuron values of this layer for BP
+   */
+  virtual const Blob<float>& data(const Layer* from=nullptr) const {
+    return data_;
+  }
+  virtual Blob<float>* mutable_data(const Layer* from=nullptr){
+    return &data_;
+  }
+
+  virtual const Blob<float>& grad(const Layer* from=nullptr) const {
+    return grad_;
+  }
+  /**
+   * @return a pointer to storing neuron grads of this layer for BP
+   */
+  virtual Blob<float>* mutable_grad(const Layer* from=nullptr) {
+    return &grad_;
+  }
+
+  /**
+   * return LayerS that connected to this layer
+   */
+  virtual const vector< SLayer> srclayers() const {
+    return srclayers_;
+  }
+  /**
+   * return LayerS that this layer connected to
+   */
+  virtual const vector<SLayer> dstlayers() const {
+    return dstlayers_;
+  }
+
+  virtual const int srclayers_size() const {
+    return srclayers_.size();
+  }
+  virtual const int dstlayers_size() const {
+    return dstlayers_.size();
+  }
+  virtual void ClearDstLayers() {
+    dstlayers_.clear();
+  }
+  virtual void ClearSrcLayers() {
+    srclayers_.clear();
+  }
+
+  virtual void AddSrcLayer(SLayer src){
+    srclayers_.push_back(src);
+  }
+  virtual void AddDstLayer(SLayer dst){
+    dstlayers_.push_back(dst);
+  }
+
+  virtual bool is_datalayer() const {
+    return false;
+  }
+  virtual bool is_parserlayer() const {
+    return false;
+  }
+  virtual bool is_losslayer() const {
+    return false;
+  }
+  virtual bool is_bridgesrclayer() const {
+    return false;
+  }
+  virtual bool is_bridgedstlayer() const {
+    return false;
+  }
+protected:
+  string name_;
+  //vector<shared_ptr<SyncedMem>> memblobs_;
+  Blob<float> data_, grad_;
+  // DArray pos_, neg_;//for CD
+  LayerProto layer_proto_;
+  vector<SLayer> srclayers_, dstlayers_;
+};
+
+/**
+ * For sending data to a layer on other threads, which may reside on other
+ * nodes due to layer/data partition.
+ */
+class BridgeSrcLayer: public Layer {
+ public:
+  virtual void Setup(const LayerProto& proto, const vector<SLayer>& srclayers);
+  virtual void SetupAfterPartition();
+  virtual void SetupAfterPartition(const LayerProto& proto,
+      const vector<int> &shape,
+      const vector<SLayer>& srclayers){}
+
+  virtual void ComputeFeature(bool training, const vector<SLayer>& srclayers);
+  virtual void ComputeGradient(const vector<SLayer>& srclayers);
+  virtual bool is_bridgesrclayer() const {
+    return true;
+  }
+  // accessors for the ready_ flag; its meaning is defined by the callers.
+  virtual void set_ready(bool a) {
+    ready_=a;
+  }
+  virtual bool ready() const {
+    return ready_;
+  }
+ protected:
+  bool ready_=false;  // initialized so ready() is defined before set_ready()
+};
+/**
+ * For receiving data from a layer on other threads, which may reside on other
+ * nodes due to layer/data partition.
+ */
+class BridgeDstLayer: public Layer {
+ public:
+  virtual void Setup(const LayerProto& proto, const vector<SLayer>& srclayers);
+  virtual void SetupAfterPartition();
+  virtual void SetupAfterPartition(const LayerProto& proto,
+      const vector<int> &shape,
+      const vector<SLayer>& srclayers){}
+
+  virtual void ComputeFeature(bool training, const vector<SLayer>& srclayers);
+  virtual void ComputeGradient(const vector<SLayer>& srclayers);
+  virtual bool is_bridgedstlayer() const {
+    return true;
+  }
+  virtual void set_ready(bool a) {  // ready_ semantics are defined by callers
+    ready_=a;
+  }
+  virtual bool ready() const {
+    return ready_;
+  }
+ protected:
+  bool ready_=false;  // initialized so ready() is defined before set_ready()
+};
+
+/**
+ * Concatenate src layers on one dimension
+ */
+class ConcateLayer: public Layer {
+ public:
+  virtual void Setup(const LayerProto& proto, const vector<SLayer>& srclayers);
+  virtual void SetupAfterPartition();
+  virtual void SetupAfterPartition(const LayerProto& proto,
+      const vector<int> &shape,
+      const vector<SLayer>& srclayers){}
+
+  virtual void ComputeFeature(bool training, const vector<shared_ptr<Layer>>& srclayers);
+  virtual void ComputeGradient(const vector<shared_ptr<Layer>>& srclayers);
+};
+
+
+/**
+ * base layer for prefetching records from local Shard, HDFS, lmdb, etc.
+ * cannot be partitioned, always returns kNone for partition type.
+ * With prefetch enabled, ComputeFeature(bool) does nothing and Prefetching(bool)
+ * does the loading instead.
+ */
+class DataLayer: public Layer{
+ public:
+  virtual void ComputeFeature(bool training, const vector<SLayer>& srclayers)=0;
+  virtual void Setup(const LayerProto& proto, const vector<SLayer>& srclayers)=0;
+  virtual bool is_datalayer() const {
+    return true;
+  }
+  virtual void ComputeGradient(const vector<SLayer>& srclayers){}
+  virtual const vector<Record>& records() const {
+    return records_;
+  }
+  virtual void Setup(){
+    vector<SLayer> dummy;  // Setup(proto, srclayers) is called with no src layers
+    Setup(layer_proto_,dummy);
+    has_set_=true;
+  }
+  virtual void SetupAfterPartition(const LayerProto& proto,
+      const vector<int> &shape,
+      const vector<SLayer>& srclayers){}
+  // runs Setup() unless it has already been run through Setup().
+  virtual void SetupAfterPartition(){
+    if(!has_set_)
+      Setup();
+  }
+  virtual PartitionType partition_type () const {
+    return kNone;
+  }
+
+  virtual int batchsize() const {
+    return layer_proto_.data_param().batchsize();
+  }
+  virtual const Record& sample() const {
+    return sample_;
+  }
+  // no mutable data/grad blob is handed out by a data layer.
+  virtual Blob<float>* mutable_data(const Layer* layer=nullptr) {
+    return nullptr;
+  }
+  virtual Blob<float>* mutable_grad(const Layer* layer=nullptr) {
+    return nullptr;
+  }
+  void set_prefetch(bool prefetch){
+    prefetch_=prefetch;
+  }
+  // loads directly only when prefetching is off.
+  virtual void ComputeFeature(bool training) {
+    if(!prefetch_)
+      ComputeFeature(training, srclayers_);
+  }
+  // loading entry point when prefetching is on (checked by CHECK).
+  virtual void Prefetching(bool training){
+    CHECK(prefetch_);
+    ComputeFeature(training, srclayers_);
+  }
+
+ protected:
+  bool has_set_=false;   // was read uninitialized by SetupAfterPartition()
+  bool prefetch_=false;  // was read uninitialized by ComputeFeature(bool)
+  int random_skip_=0, batchsize_=0;
+  Record sample_;
+  vector<Record> records_;
+};
+
+/**
+ * Slice this layer into multiple dst layers on one dimension
+ */
+class SliceLayer: public Layer {
+ public:
+  virtual void Setup(const LayerProto& proto, const vector<SLayer>& srclayers);
+  virtual void SetupAfterPartition();
+  virtual void SetupAfterPartition(const LayerProto& proto,
+      const vector<int> &shape,
+      const vector<SLayer>& srclayers){}
+
+
+  virtual const Blob<float>& data(const Layer* layer=nullptr) const;
+  virtual const Blob<float>& grad(const Layer* layer=nullptr) const;
+  virtual Blob<float>* mutable_data(const Layer* layer=nullptr);
+  virtual Blob<float>* mutable_grad(const Layer* layer=nullptr);
+  virtual void ComputeFeature(bool training, const vector<shared_ptr<Layer>>& srclayers);
+  virtual void ComputeGradient(const vector<shared_ptr<Layer>>& srclayers);
+
+ protected:
+  int SliceID(const Layer* layer) const;
+  vector<Blob<float>> datavec_, gradvec_;
+};
+
+/**
+ * Replicate this layer into multiple dst layers
+ */
+class SplitLayer: public Layer {
+ public:
+  virtual void Setup(const LayerProto& proto, const vector<SLayer>& srclayers);
+  virtual void SetupAfterPartition();
+  virtual void SetupAfterPartition(const LayerProto& proto,
+      const vector<int> &shape,
+      const vector<SLayer>& srclayers){}
+
+  virtual void ComputeFeature(bool training, const vector<shared_ptr<Layer>>& srclayers);
+  virtual void ComputeGradient(const vector<shared_ptr<Layer>>& srclayers);
+};
+
+/**
+ * Loss layer to calculate loss and other metrics, e.g., precision.
+ */
+class LossLayer: public Layer{
+ public:
+  virtual void Setup(const LayerProto& proto,
+      const vector<SLayer>& srclayers)=0;
+
+  virtual void SetupAfterPartition(const LayerProto& proto,
+      const vector<int> &shape,
+      const vector<SLayer>& srclayers)=0;
+  virtual Blob<float>* mutable_grad(const Layer* layer=nullptr){
+    return nullptr;
+  }
+  virtual const Blob<float>& grad(const Layer* from=nullptr) const {
+    CHECK(false)<<"Loss layer has not gradient blob";
+    return grad_;
+  }
+  virtual bool is_losslayer() const {
+    return true;
+  }
+
+  virtual const Blob<float>& metric() const {
+    return metric_;
+  }
+ protected:
+  Blob<float> metric_;
+};
+
+/**
+ * parse the input records into Blobs.
+ */
+class ParserLayer: public Layer {
+ public:
+  virtual void Setup(const LayerProto& proto, const vector<SLayer>& srclayers)=0;
+  /**
+   * Parse records from DataLayer into blob.
+   * This function is called by
+   * ComputeFeature(bool, const vector<SLayer>& srclayers) or Prefetching(bool).
+   */
+  virtual void ParseRecords(bool training, const vector<Record>& records, Blob<float>* blob)=0;
+  virtual bool is_parserlayer() const {
+    return true;
+  }
+  /**
+   * Dummy function. ParserLayer does not compute gradients.
+   */
+  virtual void ComputeGradient(const vector<SLayer>& srclayers){}
+  virtual void Setup(){
+    Setup(layer_proto_,srclayers_);
+    has_set_=true;
+    ready_=true;
+    prefetch_=false;
+  }
+  virtual void SetupAfterPartition(){
+    if(!has_set_)
+      Setup();
+  }
+  virtual void SetupAfterPartition(const LayerProto& proto,
+      const vector<int> &shape,
+      const vector<SLayer>& srclayers){}
+  // parser layers are never partitioned.
+  virtual PartitionType partition_type () const{
+    return kNone;
+  }
+  virtual Blob<float>* mutable_grad(const Layer* layer=nullptr) {
+    return nullptr;
+  }
+  virtual const Blob<float>& grad(const Layer* from=nullptr) const {
+    CHECK(false)<<"Parser layer has not gradient blob";
+    return grad_;
+  }
+  // parses directly, or (prefetch mode) waits for and copies the prefetched blob.
+  virtual void ComputeFeature(bool training, const vector<SLayer>& srclayers){
+    if(!prefetch_){
+      DataLayer* datalayer=static_cast<DataLayer*>(srclayers[0].get());
+      ParseRecords(training, datalayer->records(), &data_);
+    }else{
+      std::unique_lock<std::mutex> lck(mtx_);
+      while(!ready_) cv_.wait(lck);  // block until Prefetching() has filled it
+      data_.CopyFrom(prefetch_data_);
+      ready_=false;                  // hand the buffer back to Prefetching()
+      cv_.notify_all();
+    }
+  }
+  /**
+   * prefetching is transparent to parsing logics.
+   * users implement parsing logics in ParseRecords
+   * worker/training algorithm calls this function to do prefetching in a
+   * separate thread. Records are in fact parsed into prefetch_data_, and later
+   * copied into data_.
+   */
+  void Prefetching(bool training){
+    std::unique_lock<std::mutex> lck(mtx_);
+    while(ready_) cv_.wait(lck);  // block until the previous batch is consumed
+    //data_.Swap(prefetch_data_);
+    DataLayer* datalayer=static_cast<DataLayer*>(srclayers_[0].get());
+    ParseRecords(training, datalayer->records(), &prefetch_data_);
+    ready_=true;
+    cv_.notify_all();
+  }
+
+  /**
+   * must be called before calling ComputeFeature(bool) if Prefetching runs in a
+   * separate thread
+   */
+  void set_prefetch(bool prefetch) {
+    if(prefetch){
+      if(prefetch_data_.count()==0)
+        prefetch_data_.ReshapeLike(data_);
+      ready_=false;
+    }
+    prefetch_=prefetch;
+  }
+
+ private:
+  std::mutex mtx_;
+  std::condition_variable cv_;
+  bool ready_=false;     // true while prefetch_data_ holds an unconsumed batch
+  bool has_set_=false;   // was read uninitialized by SetupAfterPartition()
+  bool prefetch_=false;  // was uninitialized until Setup()/set_prefetch() ran
+  //! prefetch_data_ is invisible to layer logics, i.e., parsing.
+  Blob<float> prefetch_data_;
+};
+} // singa
+
+#endif // INCLUDE_BASE_LAYER_H_

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/b2dc51d2/include/neuralnet/layer.h
----------------------------------------------------------------------
diff --git a/include/neuralnet/layer.h b/include/neuralnet/layer.h
new file mode 100644
index 0000000..263d249
--- /dev/null
+++ b/include/neuralnet/layer.h
@@ -0,0 +1,297 @@
+#ifndef INCLUDE_NET_LAYER_H_
+#define INCLUDE_NET_LAYER_H_
+
+#include <vector>
+#include <string>
+#include <map>
+#include <functional>
+#include <utility>
+#include <memory>
+#include <chrono>
+#include <random>
+#include <lmdb.h>
+
+#include "proto/model.pb.h"
+#include "utils/data_shard.h"
+#include "neuralnet/base_layer.h"
+
+
+/**
+ * \file this file includes the declarations of neuron layer classes that conduct
+ * the transformation of features.
+ */
+namespace singa {
+
+/**
+ * Convolution layer.
+ */
+class ConvolutionLayer: public Layer {
+ public:
+  virtual void Setup(const LayerProto& proto,
+      const vector<SLayer>& srclayers);
+
+  /**
+   * need to reset some properties (e.g., weight matrix) according to
+   * shapes (after partition, e.g., partition is done against channel dimension)
+   */
+  virtual void SetupAfterPartition(const LayerProto& proto,
+      const vector<int> &shape,
+      const vector<SLayer>& srclayers);
+
+  virtual void ComputeFeature(bool training, const vector<shared_ptr<Layer>>& srclayers);
+  virtual void ComputeGradient(const vector<shared_ptr<Layer>>& srclayers);
+  virtual vector<shared_ptr<Param>> GetParams() {
+    return vector<shared_ptr<Param>>{weight_, bias_};
+  }
+  virtual ConnectionType connection_type(int k) const {
+    CHECK_LT(k, srclayers_.size());
+    return kOneToAll;
+  }
+ protected:
+  int kernel_, pad_,  stride_ ;
+  int batchsize_,  channels_, height_,width_;
+  int col_height_, col_width_, conv_height_, conv_width_, num_filters_;
+  shared_ptr<Param> weight_, bias_;
+  Blob<float> col_data_, col_grad_;
+};
+
+class DropoutLayer: public Layer {
+ public:
+  virtual void Setup(const LayerProto& proto,
+      const vector<SLayer>& srclayers);
+
+  virtual void SetupAfterPartition(const LayerProto& proto,
+      const vector<int> &shape,
+      const vector<SLayer>& srclayers);
+
+  virtual void ComputeFeature(bool training, const vector<shared_ptr<Layer>>& srclayers);
+  virtual void ComputeGradient(const vector<shared_ptr<Layer>>& srclayers);
+ protected:
+  // drop probability
+  float pdrop_;
+  /* record which neuron is dropped, required for back propagating gradients,
+   * if mask[i]=0, then the i-th neuron is dropped.
+   */
+  Blob<float> mask_;
+};
+
+/**
+  * fully connected layer
+  */
+class InnerProductLayer: public Layer {
+ public:
+  virtual void Setup(const LayerProto& proto,
+      const vector<SLayer>& srclayers);
+
+  /**
+   * need to reset weight matrix in case of LayerPartition
+   */
+  virtual void SetupAfterPartition(const LayerProto& proto,
+      const vector<int> &shape,
+      const vector<SLayer>& srclayers);
+  virtual ConnectionType connection_type(int k) const {
+    CHECK_LT(k, srclayers_.size());
+    return kOneToAll;
+  }
+
+  virtual void ComputeFeature(bool training, const vector<shared_ptr<Layer>>& srclayers);
+  virtual void ComputeGradient(const vector<shared_ptr<Layer>>& srclayers);
+  //virtual void ToProto(LayerProto *layer_proto, bool copyData);
+  virtual vector<shared_ptr<Param>> GetParams() {
+    return vector<shared_ptr<Param>>{weight_, bias_};
+  }
+
+ private:
+  //! dimension of the hidden layer
+  int hdim_;
+  //! dimension of the visible layer
+  int vdim_;
+  int batchsize_;
+  shared_ptr<Param> weight_, bias_;
+};
+
+class LabelLayer: public ParserLayer {
+ public:
+  virtual void Setup(const LayerProto& proto, const vector<SLayer>& srclayers);
+  virtual void ParseRecords(bool training, const vector<Record>& records,
+      Blob<float>* blob);
+};
+
+class LRNLayer: public Layer {
+/**
+ * Local Response Normalization edge
+ * b_i=a_i/x_i^beta
+ * x_i=knorm+alpha*\sum_{j=max(0,i-n/2)}^{min(N,i+n/2)}(a_j)^2
+ * n is size of local response area.
+ * a_i, the activation (after ReLU) of a neuron convolved with the i-th kernel.
+ * b_i, the neuron after normalization, N is the total num of kernels
+ */
+
+ public:
+  virtual void Setup(const LayerProto& proto,
+      const vector<SLayer>& srclayers);
+
+  virtual void SetupAfterPartition(const LayerProto& proto,
+      const vector<int> &shape,
+      const vector<SLayer>& srclayers);
+
+
+  virtual void ComputeFeature(bool training, const vector<shared_ptr<Layer>>& srclayers);
+  virtual void ComputeGradient(const vector<shared_ptr<Layer>>& srclayers);
+ protected:
+  //! shape of the bottom layer feature
+  int batchsize_, channels_, height_, width_;
+  //! size local response (neighbor) area
+  int lsize_;
+  //! hyper-parameter
+  float alpha_, beta_, knorm_;
+  Blob<float> norm_;
+};
+
+class MnistImageLayer: public ParserLayer {
+ public:
+  virtual void Setup(const LayerProto& proto, const vector<SLayer>& srclayers);
+  virtual void ParseRecords(bool training, const vector<Record>& records,
+      Blob<float>* blob);
+
+ protected:
+  // height and width of the image after deformation
+  // kernel size for elastic distortion
+  // n^2 images are processed as a batch for elastic distortion
+  // conv height and conv width
+  // gauss kernel values, displacements, column image and tmp buffer
+  //float* gauss_, *displacementx_, *displacementy_, *colimg_, *tmpimg_;
+  float  gamma_, beta_, sigma_, kernel_, alpha_, norm_a_, norm_b_;
+  int resize_, elastic_freq_;
+};
+
+class PoolingLayer: public Layer {
+ public:
+  virtual void Setup(const LayerProto& proto,
+      const vector<SLayer>& srclayers);
+
+  virtual void SetupAfterPartition(const LayerProto& proto,
+      const vector<int> &shape,
+      const vector<SLayer>& srclayers);
+
+
+  virtual void ComputeFeature(bool training, const vector<shared_ptr<Layer>>& srclayers);
+  virtual void ComputeGradient(const vector<shared_ptr<Layer>>& srclayers);
+ protected:
+  int kernel_, pad_, stride_;
+  int batchsize_,channels_, height_, width_, pooled_height_, pooled_width_;
+  PoolingProto_PoolMethod pool_;
+};
+
+class ReLULayer: public Layer {
+ public:
+  virtual void Setup(const LayerProto& proto,
+      const vector<SLayer>& srclayers);
+
+  virtual void SetupAfterPartition(const LayerProto& proto,
+      const vector<int> &shape,
+      const vector<SLayer>& srclayers);
+
+
+  virtual void ComputeFeature(bool training, const vector<shared_ptr<Layer>>& srclayers);
+  virtual void ComputeGradient(const vector<shared_ptr<Layer>>& srclayers);
+};
+
+
+class SoftmaxLossLayer: public LossLayer {
+  /*
+   * connected from the label layer and the last fc layer
+   */
+ public:
+  virtual void Setup(const LayerProto& proto,
+      const vector<SLayer>& srclayers);
+
+  virtual void SetupAfterPartition(const LayerProto& proto,
+      const vector<int> &shape,
+      const vector<SLayer>& srclayers);
+  /**
+   * softmax is not recommended for partition because it requires the whole
+   * src layer for normalization.
+   */
+  virtual PartitionType partition_type() const {
+    if(layer_proto_.partition_type()==kLayerPartition)
+      return kNone;
+    else
+      return layer_proto_.partition_type();
+  }
+  virtual ConnectionType connection_type(int k) const {
+    CHECK_LT(k, srclayers_.size());
+    return kOneToAll;
+  }
+
+  virtual void ComputeFeature(bool training, const vector<shared_ptr<Layer>>& srclayers);
+  virtual void ComputeGradient(const vector<shared_ptr<Layer>>& srclayers);
+ private:
+  int batchsize_;
+  int dim_;
+  float scale_;
+  int topk_;
+};
+
+class RGBImageLayer: public ParserLayer {
+ public:
+  virtual void Setup(const LayerProto& proto, const vector<SLayer>& srclayers);
+  virtual void ParseRecords(bool training, const vector<Record>& records,
+      Blob<float>* blob);
+
+ private:
+  float scale_;
+  int cropsize_;
+  bool mirror_;
+  Blob<float> mean_;
+};
+
+class ShardDataLayer: public DataLayer{
+ public:
+  virtual void ComputeFeature(bool training, const vector<shared_ptr<Layer>>& srclayers);
+  virtual void ComputeGradient(const vector<shared_ptr<Layer>>& srclayers){};
+  virtual void Setup(const LayerProto& proto, const vector<SLayer>& srclayers);
+ private:
+  shared_ptr<DataShard> shard_;
+};
+class LMDBDataLayer: public DataLayer{
+ public:
+  virtual void ComputeFeature(bool training, const vector<shared_ptr<Layer>>& srclayers);
+  virtual void ComputeGradient(const vector<shared_ptr<Layer>>& srclayers){};
+  virtual void Setup(const LayerProto& proto, const vector<SLayer>& srclayers);
+  void ConvertDatumToSingleLableImageRecord(const Datum& datum,
+    SingleLabelImageRecord* record);
+
+ private:
+  MDB_env* mdb_env_;
+  MDB_dbi mdb_dbi_;
+  MDB_txn* mdb_txn_;
+  MDB_cursor* mdb_cursor_;
+  MDB_val mdb_key_, mdb_value_;
+};
+
+/**
+ * This layer applies the tanh function to neuron activations.
+ * f(x)=A tanh(Bx)
+ * f'(x)=B/A (A*A-f(x)*f(x))
+ */
+class TanhLayer: public Layer {
+ public:
+  virtual void Setup(const LayerProto& proto,
+      const vector<SLayer>& srclayers);
+
+  virtual void SetupAfterPartition(const LayerProto& proto,
+      const vector<int> &shape,
+      const vector<SLayer>& srclayers);
+
+
+  virtual void ComputeFeature(bool training, const vector<shared_ptr<Layer>>& srclayers);
+  virtual void ComputeGradient(const vector<shared_ptr<Layer>>& srclayers);
+ private:
+  float outer_scale_, inner_scale_;
+};
+
+
+}  // namespace singa
+
+#endif  // INCLUDE_NET_LAYER_H_

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/b2dc51d2/include/neuralnet/neuralnet.h
----------------------------------------------------------------------
diff --git a/include/neuralnet/neuralnet.h b/include/neuralnet/neuralnet.h
new file mode 100644
index 0000000..586a470
--- /dev/null
+++ b/include/neuralnet/neuralnet.h
@@ -0,0 +1,156 @@
+#ifndef INCLUDE_NET_NET_H_
+#define INCLUDE_NET_NET_H_
+
+#include <glog/logging.h>
+#include <vector>
+#include <map>
+#include <memory>
+
+#include "proto/model.pb.h"
+#include "neuralnet/layer.h"
+#include "utils/factory.h"
+#include "utils/graph.h"
+
+using std::vector;
+using std::string;
+using std::map;
+using std::shared_ptr;
+namespace singa {
+/**
+ * The neural network is constructed from user configured layers through google
+ * protocol buffer. TODO support constructing neural network by adding layers
+ * explicitly. E.g., users create layers and connect them manually in the code.
+ *
+ * Some layers, e.g., SplitLayer and BridgeSrcLayer/BridgeDstLayer will be added
+ * implicitly to partition the neural network.
+ */
+class NeuralNet {
+ public:
+  /**
+   * Register Layers
+   */
+  static void RegisterLayers();
+  /**
+   * Setup the neural network for training, test or validation.
+   * Parameters for test/validation net can share those from training after
+   * setup (done outside of this function).
+   *
+   * @param np proto for the neural network.
+   * @param phase presumably selects training, test or validation -- confirm.
+   */
+  static shared_ptr<NeuralNet> SetupNeuralNet(const NetProto& np, Phase phase);
+
+ public:
+  /**
+   * construct the net structure from protocol buffer.
+   */
+  NeuralNet(NetProto net_proto, int group_size=1);
+  /**
+   * construct a json string representing the neuralnet graph.
+   * The json string can be used by other graph engine to draw a figure for
+   * displaying the neuralnet structure.
+   */
+  std::string ToString();
+  /**
+   * Print Norm1 of data and grad of each Layer and parameter.
+   * @return the debug information as a string
+   */
+  string DebugInfo();
+
+  /**
+   * to display the adjacency layers
+   */
+  std::string ToAdjacency();
+  /**
+   * Add layer explicitly used in manually programming/constructing neural net.
+   * TODO currently an empty stub.
+   */
+  void AddLayer(const LayerProto &layer_proto){}
+  /**
+   * Add layer explicitly used in manually programming/constructing neural net.
+   * TODO currently an empty stub.
+   */
+  void AddLayer(const Layer* layer){}
+  /**
+   * share weights from other neuralnet
+   */
+  void ShareParams(shared_ptr<NeuralNet> other,int flag);
+  void ToProto(NetProto *net_proto, bool copyData=false);
+  const std::vector<shared_ptr<Layer>>& layers() {
+    return layers_;
+  }
+  /**
+   * return ParserLayer of the neuralnet (collected lazily on first call).
+   */
+  const std::vector<ParserLayer*>& parserlayers() {
+    if(parserlayers_.empty()){
+      for(auto& layer: layers_)
+        if(layer->is_parserlayer())
+          parserlayers_.push_back(static_cast<ParserLayer*>(layer.get()));
+    }
+    return parserlayers_;
+  }
+  const std::vector<LossLayer*>& losslayers() {
+    if(losslayers_.empty()){
+      for(auto& layer: layers_)
+        if(layer->is_losslayer())
+          losslayers_.push_back(static_cast<LossLayer*>(layer.get()));
+    }
+    return losslayers_;
+  }
+  const std::vector<DataLayer*>& datalayers() {
+    if(datalayers_.empty()){
+      for(auto& layer: layers_)
+        if(layer->is_datalayer())
+          datalayers_.push_back(static_cast<DataLayer*>(layer.get()));
+    }
+    return datalayers_;
+  }
+  const std::vector<shared_ptr<Param>> &params()const {
+    return params_;
+  }
+  shared_ptr<Layer> name2layer(string name){
+    // single lookup; an unknown name yields a null pointer
+    auto it=name2layer_.find(name);
+    return it==name2layer_.end() ? shared_ptr<Layer>() : it->second;
+  }
+
+  shared_ptr<Param> paramid2param(int id) {
+    if(paramid2param_.empty()){  // build the id->param cache on first use
+      for(auto& layer: layers_)
+        for(const shared_ptr<Param>& p: layer->GetParams())
+          paramid2param_[p->id()]=p;
+    }
+    // use find(): operator[] would insert a null entry for an unknown id
+    auto it=paramid2param_.find(id);
+    return it==paramid2param_.end() ? shared_ptr<Param>() : it->second;
+  }
+
+ protected:
+  void ConstructNeuralNet(const NetProto &net_proto);
+  void PartitionNeuralNet();
+  map<string, shared_ptr<Layer>> GetNameToLayer(
+    const vector<shared_ptr<Layer>>& layers);
+  Graph CreatePartitonedGraph(const vector<shared_ptr<Layer>>& layers,
+    const map<string, shared_ptr<Layer>>& name2layer);
+
+  /**
+   * Partition each layer according its partition type and dimension.
+   * @param layers original unpartitioned layers
+   */
+  map<string, vector<shared_ptr<Layer>>> PartitionLayers(
+      const vector<shared_ptr<Layer>>& layers);
+
+ protected:
+  vector<shared_ptr<Layer>> layers_;
+  vector<ParserLayer*> parserlayers_;
+  vector<LossLayer*> losslayers_;
+  vector<DataLayer*> datalayers_;
+  vector<shared_ptr<Param>> params_;
+  map<string, shared_ptr<Layer>> name2layer_;
+  map<int, shared_ptr<Param>> paramid2param_;
+
+  map<string, LayerProto> name2layerproto_;
+  int group_size_;
+  Graph graph_;
+};
+}  // namespace singa
+#endif  // INCLUDE_NET_NET_H_

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/b2dc51d2/include/trainer/pm_server.h
----------------------------------------------------------------------
diff --git a/include/trainer/pm_server.h b/include/trainer/pm_server.h
new file mode 100644
index 0000000..b759844
--- /dev/null
+++ b/include/trainer/pm_server.h
@@ -0,0 +1,91 @@
+#ifndef INCLUDE_TRAINER_PM_SERVER_H_
+#define INCLUDE_TRAINER_PM_SERVER_H_
+
+#include <czmq.h>
+#include <memory>
+#include <vector>
+#include <map>
+#include <string.h>
+#include "proto/model.pb.h"
+#include "utils/updater.h"
+#include "utils/param.h"
+#include "communication/msg.h"
+#include "communication/socket.h"
+using std::vector;
+using std::string;
+using std::shared_ptr;
+
+namespace singa{
+
+/**
+ * Parameter manager at the server side.
+ *
+ * Responds to workers' get/put/update requests, and periodically syncs with
+ * other servers.
+ *
+ * Normally, the PMServer creates a response message for each request which
+ * will be sent back to the one who issued the request. However, if the request
+ * is not processed successfully, the original message will be returned. The
+ * server does not know which kind of message it returns (response or the
+ * original message); it just sends it to the router. The router will decide
+ * whether to re-send the request to the server or send it to the worker.
+ *
+ */
+class PMServer{
+public:
+  typedef std::map<int, shared_ptr<Param>> ParamShard;
+
+  void Setup(int group_id, int server_id, shared_ptr<ParamShard> shard,
+       const UpdaterProto& proto);
+
+  virtual ~PMServer();  // virtual: the handlers below make this a polymorphic base
+
+  /**
+   * Process GET request.
+   *
+   * @return the original message or response message
+   */
+  virtual Msg* HandleGet(Msg** msg);
+
+  /**
+   * Process Update request.
+   *
+   * @return the original message or response message
+   */
+  virtual Msg* HandleUpdate(Msg** msg);
+
+  /**
+   * Process PUT request.
+   *
+   * @return the original message or response message. If the put request
+   * does not need to be acknowledged, nullptr is returned.
+   */
+  virtual Msg* HandlePut(Msg **msg);
+
+  /**
+   * TODO Process SYNC request.
+   */
+  virtual Msg* HandleSyncRequest(Msg** msg);
+
+  /**
+   * TODO Process SYNC response.
+   */
+  virtual int HandleSyncResponse(Msg** msg);
+
+  /**
+   * Scheduler for synchronizing server groups.
+   *
+   * TODO implement the Caffe's synchronization scheduler for data parallelism
+   */
+  virtual bool SyncNow();
+
+ protected:
+  int group_id_, server_id_;
+  shared_ptr<ParamShard> shard_;
+  shared_ptr<Dealer> dealer_;
+  shared_ptr<Updater> updater_;
+};
+
+} // namespace singa
+
+#endif // INCLUDE_TRAINER_PM_SERVER_H_

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/b2dc51d2/include/trainer/pm_worker.h
----------------------------------------------------------------------
diff --git a/include/trainer/pm_worker.h b/include/trainer/pm_worker.h
new file mode 100644
index 0000000..198f5bd
--- /dev/null
+++ b/include/trainer/pm_worker.h
@@ -0,0 +1,171 @@
+#ifndef INCLUDE_TRAINER_PM_WORKER_H_
+#define INCLUDE_TRAINER_PM_WORKER_H_
+
+#include <memory>
+#include <vector>
+#include <map>
+#include <string>
+#include <atomic>
+#include "utils/param.h"
+#include "communication/msg.h"
+
+using std::string;
+using std::vector;
+using std::shared_ptr;
+using std::map;
+
+namespace singa {
+
+/**
+ * Counters used to construct a parameter shard.
+ *
+ * For each worker group:
+ *   Every unique Param object is associated with a ParamCounter object whose
+ *   param field points to the Param object itself.
+ *
+ *   Param objects sharing the same values (due to data parallelism) are
+ *   associated with the same ParamCounter whose param field also shares the
+ *   same values.
+ *
+ *   Usage: we need to aggregate gradients from all workers for the shared
+ *   parameters before sending the update request. The nUpdate counter counts
+ *   the number.
+ *
+ * TODO test with different physical architectures.
+ */
+class ParamCounter{
+  public:
+  ParamCounter(shared_ptr<Param> p,int local, int owner):
+    nUpdate(0), nGet(0), nPut(0), nCollect(0), nLocal(local), nTotal(0),
+    owner_procs(owner), param(p){}
+
+  /**
+   * Associate the counter to a Param object.
+   * @param p the Param object (not used yet, see the TODO in the body)
+   * @param local 1 if this Param object is used by workers in this procs, 0
+   *  otherwise
+   * @param owner the procs id of the worker who owns this Param object; a
+   *  negative value keeps the owner recorded so far
+   */
+  void AddParam(shared_ptr<Param> p, int local, int owner){
+    nLocal+=local;
+    nTotal+=1;
+    if(owner>-1)  // test the argument, not the member: only a valid id may replace the owner
+      owner_procs=owner;
+    if(nLocal>1){
+      // TODO copy p->param;
+    }
+  }
+  std::atomic<int> nUpdate, nGet, nPut, nCollect; //!< all counters are atomic
+
+  int nLocal; //!< # local workers uses the shared parameter
+  int nTotal; //!< # total workers uses the shared parameter
+  int owner_procs; //!< the procs id of the worker that owns the parameter
+  shared_ptr<Param> param;
+};
+
+
+/**
+ * Parameter manager at the worker side.
+ */
+class PMWorker{
+public:
+  /**
+   * Workers from the same group resident in the same process share the same
+   * ParamShard which contains ParamCounters for Param objects used/updated by
+   * these workers. Shared Param objects are associated with the same
+   * ParamCounter.
+   */
+  typedef std::map<int, shared_ptr<ParamCounter>> ParamShard;
+  virtual ~PMWorker(){}  // virtual: the request builders below are overridable
+
+  void Setup(int group_id, int worker_id, shared_ptr<ParamShard> shard);
+
+  void set_id(int group_id, int worker_id){
+    group_id_=group_id;
+    worker_id_=worker_id;
+  }
+
+  /**
+   * @return server id where the parameter is maintained.
+   */
+  virtual int Sharding(int param_id);
+
+  /**
+   * Generate a request message to Get the parameter object.
+   */
+  virtual Msg* Get(shared_ptr<Param> param, int step);
+  virtual Msg* Get(Msg** msg);
+
+  /**
+   * Generate a request message to Update the parameter object.
+   */
+  virtual Msg* Update(shared_ptr<Param> param, int step);
+  virtual Msg* Update(Msg** msg);
+
+  /**
+   * Collect a Param object returned from server.
+   */
+  virtual Msg* Collect(Msg**);
+
+  /**
+   * Generate a request message to Put the parameter object.
+   */
+  virtual Msg* Put(shared_ptr<Param> param, int step);
+  virtual Msg* Put(Msg** msg);
+
+ protected:
+  int group_id_, worker_id_;
+  shared_ptr<ParamShard> shard_;
+};
+
+/**
+ * Testing worker functionality. The main thread reads the config file and sets up the socket.
+ *
+ * It creates the shared ParamShard, then starts the worker threads which basically carry out the work.
+ * Each thread creates a PMClient object.
+ *
+ * The main thread then enters the loop to forward messages.
+ *
+ * Requests from the worker thread are prepended with the paramId, which is stripped by the main thread
+ * before forwarding to the correct server.
+ *
+ * The 1st thread in Client 0 populates the servers with data (PUT request). Wait
+ * for a while before starting the client thread (which does get/update
+ * continuously).
+class SingaClient {
+public:
+	SingaClient(int worker_id, Topology &topology, vector<string> &hosts);
+	void StartClient();
+
+	int id() {
+		return id_;
+	}
+	ParamShard *param_shard() {
+		return param_shard_;
+	}
+	char *backend_endpoint() {
+		return backend_endpoint_;
+	}
+
+private:
+	int id_, local_id_, group_id_;
+	char backend_endpoint_[256];
+	vector<char*> neighbors_;
+	ParamShard *param_shard_;
+
+	int param_to_server_id(int paramId);//< mapping paramId to server ID
+};
+
+//Zthread function for the worker thread, in the global namespace.
+//Basically a loop of: compute, get, update, compute, etc.
+void ClientThread(void *args, zctx_t *ctx, void *pipe);
+
+vector<Param*> gen_random_params();
+void test_get(PMClient *client);
+void test_update(PMClient *client, vector<Param*> params);
+void test_collect(PMClient *client);
+ */
+
+} // namespace singa
+#endif // INCLUDE_TRAINER_PM_WORKER_H_

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/b2dc51d2/include/trainer/server.h
----------------------------------------------------------------------
diff --git a/include/trainer/server.h b/include/trainer/server.h
new file mode 100644
index 0000000..d113c7d
--- /dev/null
+++ b/include/trainer/server.h
@@ -0,0 +1,22 @@
+#ifndef INCLUDE_TRAINER_SERVER_H_
+#define INCLUDE_TRAINER_SERVER_H_
+#include <memory>
+#include "trainer/pm_server.h"
+#include "communication/socket.h"
+
+using std::shared_ptr;
+namespace singa {
+class Server{  // server-side runner; holds a PMServer and a Dealer (see members below)
+ public:
+  Server(int group_id, int server_id);
+  void Setup(const UpdaterProto& proto, shared_ptr<PMServer::ParamShard> shard,
+    shared_ptr<Dealer> dealer);
+  void Run();  // defined out of line; presumably the request-serving loop -- TODO confirm
+
+ protected:
+  int group_id_, server_id_;
+  shared_ptr<PMServer> pmserver_;
+  shared_ptr<Dealer> dealer_;
+};
+} /* singa */
+#endif //INCLUDE_TRAINER_SERVER_H_

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/b2dc51d2/include/trainer/trainer.h
----------------------------------------------------------------------
diff --git a/include/trainer/trainer.h b/include/trainer/trainer.h
new file mode 100644
index 0000000..34d95f1
--- /dev/null
+++ b/include/trainer/trainer.h
@@ -0,0 +1,50 @@
+#ifndef INCLUDE_TRAINER_TRAINER_H_
+#define INCLUDE_TRAINER_TRAINER_H_
+#include "proto/cluster.pb.h"
+#include "proto/model.pb.h"
+#include "utils/updater.h"
+#include "utils/param.h"
+#include "utils/singleton.h"
+#include "utils/factory.h"
+#include "neuralnet/neuralnet.h"
+#include "trainer/pm_worker.h"
+#include "trainer/pm_server.h"
+#include "trainer/worker.h"
+#include "trainer/server.h"
+
+namespace singa {
+/**
+ * Every running process has a training object which launches one or more
+ * worker (and server) threads.
+ *
+ * The main thread runs a loop to forward messages between workers and servers.
+ */
+class Trainer{
+ public:
+  /**
+   * Start the training in one process.
+   * @param modelproto model configuration (ModelProto message)
+   * @param clusterproto cluster configuration (ClusterProto message)
+   * @param procs_id id of this process -- presumably the global procs id; TODO confirm
+   */
+  void Start(const ModelProto& modelproto, const ClusterProto& clusterproto,
+    int procs_id);
+
+  // TODO add Resume() function to continue training from a previously stopped
+  // point.
+
+ protected:
+  void Run();
+  /**
+   * Register default implementations for all base classes used in the system,
+   * e.g., the Updater, BaseMsg, etc.
+   *
+   * All built-in layer implementations are
+   * registered here.
+   * For other base classes, use the base class name (string) as the key and the
+   * implementation class as the value, e.g., <"Updater" SGDUpdater>.
+   */
+  void RegisterDefaultClasses(const singa::ModelProto& proto);
+};
+} /* singa */
+#endif // INCLUDE_TRAINER_TRAINER_H_

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/b2dc51d2/include/trainer/worker.h
----------------------------------------------------------------------
diff --git a/include/trainer/worker.h b/include/trainer/worker.h
new file mode 100644
index 0000000..609e7dc
--- /dev/null
+++ b/include/trainer/worker.h
@@ -0,0 +1,218 @@
+#ifndef INCLUDE_TRAINER_WORKER_H_
+#define INCLUDE_TRAINER_WORKER_H_
+#include <map>
+#include <exception>
+#include "neuralnet/neuralnet.h"
+#include "proto/model.pb.h"
+#include "trainer/pm_worker.h"
+#include "utils/cluster.h"
+#include "communication/socket.h"
+#include "communication/msg.h"
+
+namespace singa {
+/**
+ * Collects metrics, like accuracy, loss, etc.
+ */
+class Performance{
+ public:
+  /**
+   * Collect from the LossLayers of net.
+   */
+  explicit Performance(shared_ptr<NeuralNet> net);
+  /**
+   * Aggregate metrics from the LossLayers.
+   */
+  void Update();
+  void Reset();
+  string ToString();
+ private:
+  vector<string> name_;
+  shared_ptr<NeuralNet> net_;
+  vector<vector<float>> metric_;
+  int counter_; //!< inc by 1 for every Update
+};
+
+/**
+ * The Worker class which runs the training algorithm.
+ * The first worker group will initialize parameters of the Net,
+ * and put them into the distributed memory/table.
+ */
+class Worker {
+ public:
+  Worker(int group_id, int worker_id);
+  virtual ~Worker(){}  // virtual: Worker is an abstract base (see BPWorker)
+  void Setup(const ModelProto& model, shared_ptr<NeuralNet> train_net,
+      shared_ptr<PMWorker::ParamShard> shard, shared_ptr<Dealer> layer_dealer,
+    shared_ptr<Dealer> param_dealer);
+  void set_test_net(shared_ptr<NeuralNet> test_net){
+    test_net_=test_net;
+  }
+  void set_validation_net(shared_ptr<NeuralNet> val_net){
+    validation_net_=val_net;
+  }
+
+  int Put(shared_ptr<Param> param, int step);
+  int Get(shared_ptr<Param> param, int step);
+  int Update(shared_ptr<Param> param, int step);
+  int Collect(shared_ptr<Param> param, int step);
+  /**
+    * Check validation/test first, then TrainOneBatch.
+    * Performance collects performance for the whole neuralnet.
+    * Hence, no need to collect performance in every thread.
+    * Only the main thread will pass a non-null perf.
+    */
+  void RunOneBatch(int step, Performance* perf=nullptr);
+  /**
+    * Train one mini-batch.
+    * Test/Validation is done before training.
+    */
+  virtual void TrainOneBatch(int step)=0;
+  /**
+   * Test/validate one mini-batch.
+   */
+  virtual void TestOneBatch(shared_ptr<NeuralNet> net, int step, Phase phase)=0;
+  /**
+    * Test the performance of the learned model on validation or test dataset.
+    * Test is done by the first group.
+    * @param net neural network to evaluate
+    * @param nsteps, dispperf presumably #mini-batches to run / whether to display performance -- TODO confirm
+    */
+  void Test(shared_ptr<NeuralNet> net, int nsteps, bool dispperf);
+
+  /**
+    * Main function of Worker.
+    * 1. Train the neuralnet step by step, test/validation is done periodically.
+    * 2. TODO Communicate with others, e.g., zookeeper, after every step.
+    */
+  virtual void Run();
+
+
+  /**
+   * Pull data from layers resident on other nodes due to Model Partition.
+  void Pull(zsock_t* pull, shared_ptr<NeuralNet> net);
+   */
+
+  /**
+   * Check whether it is time to display training info, e.g., loss and precision.
+   */
+  bool DisplayNow(const int step) const {
+    return (modelproto_.display_frequency() > 0
+        && step >= modelproto_.display_after_steps()
+        && ((step - modelproto_.display_after_steps())
+          % modelproto_.display_frequency() == 0));
+  }
+
+  bool DisplayDebugInfo(const int step) const {
+    return DisplayNow(step)&&modelproto_.debug()&&group_id_==0;
+  }
+
+  /**
+   * return true if the stop condition is satisfied, e.g., the maximum number
+   * of steps have been reached.
+   */
+  bool StopNow(const int step) const{
+    return (step >= modelproto_.train_steps());
+  }
+  /**
+   * Check whether it is time to do checkpoint (first group only).
+   * @param step the ::Train() has been called this num times.
+   */
+  bool CheckpointNow(const int step) const{
+    return (group_id_==0
+        && modelproto_.checkpoint_frequency() > 0
+        && step >= modelproto_.checkpoint_after_steps()
+        && ((step - modelproto_.checkpoint_after_steps())
+          % modelproto_.checkpoint_frequency() == 0));
+  }
+  /**
+   * Check whether it is time to do test (first group only).
+   * @param step the ::Train() has been called this num times.
+   */
+  bool TestNow(const int step) const{
+    return (group_id_==0
+        && modelproto_.test_frequency() > 0
+        && step >= modelproto_.test_after_steps()
+        && ((step - modelproto_.test_after_steps())
+          % modelproto_.test_frequency() == 0));
+  }
+  /**
+   * Check whether it is time to do validation (first group only).
+   * @param step the ::Train() has been called step times.
+   */
+  bool ValidateNow(const int step) const {
+    return (group_id_==0
+        && modelproto_.validation_frequency() > 0
+        && step >= modelproto_.validation_after_steps()
+        && ((step - modelproto_.validation_after_steps())
+          % modelproto_.validation_frequency() == 0));
+  }
+
+
+  /**
+   * start training from scratch.
+   * setup training/test/validation neuralnets, then call Run().
+  void Start(ModelProto model);
+   */
+  /**
+   * TODO Resume from snapshot
+  void Resume();
+   */
+  void ReceiveBlobs(shared_ptr<NeuralNet> net);
+  void SendBlob();
+ protected:
+  int group_id_, worker_id_;
+  int step_;
+  ModelProto modelproto_;
+  shared_ptr<PMWorker> pmworker_;
+  shared_ptr<NeuralNet> train_net_, test_net_, validation_net_;
+  shared_ptr<Dealer> layer_dealer_, param_dealer_;
+  Poller layer_poller_, param_poller_;
+};
+
+class WorkerException: public std::exception{  // Worker-specific exception with a fixed message
+ public:
+  const char* what() const noexcept override {  // const: must match std::exception::what() to override it
+    return "Worker Exception";
+  }
+};
+
+
+class BPWorker: public Worker{
+ public:
+  ~BPWorker(){}
+  BPWorker(int group_id, int worker_id):Worker(group_id, worker_id){}
+  virtual void TrainOneBatch(int step);
+  virtual void TestOneBatch(shared_ptr<NeuralNet> net, int step, Phase phase);
+  void Forward(shared_ptr<NeuralNet> net, int step, bool training);
+  void Backward(shared_ptr<NeuralNet> net, int step);
+  /**
+   * Profiling the time cost of training one batch (currently disabled code).
+  string TimerInfo(){
+    char buf[1024];
+    float ticks=ticks_*1000;
+    float tf=tForward_/ticks, tb=tBackward_/ticks,
+          td=tSyncData_/ticks, tp=tSyncParam_/ticks;
+    float total=tf+tb+td+tp;
+    sprintf(buf,
+        "Total\t%6.2f\tforward\t%6.2f\tbackward\t%6.2f\t"
+        // syncdata\t%6.2f\tsyncparam\t%6.2f\n"
+        , total,tf,tb);
+    float gensync=Param::worker_gen_sync/ticks;
+    float handlesync=Param::worker_handle_sync/ticks;
+    sprintf(buf+strlen(buf),
+        "worker_gen_sync\t%6.2f\tworker_handle_sync\t%6.2f\n",
+        gensync, handlesync);
+    Param::worker_gen_sync=0;
+    Param::worker_handle_sync=0;
+    tForward_=0;
+    tBackward_=0;
+    tSyncData_=0;
+    tSyncParam_=0;
+    ticks_=0;
+    return string(buf);
+  }
+   */
+};
+}  // namespace singa
+
+#endif  // INCLUDE_TRAINER_WORKER_H_

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/b2dc51d2/include/utils/blob.h
----------------------------------------------------------------------
diff --git a/include/utils/blob.h b/include/utils/blob.h
new file mode 100644
index 0000000..08068eb
--- /dev/null
+++ b/include/utils/blob.h
@@ -0,0 +1,166 @@
+/**
+ * The code is adapted from that of Caffe whose license is attached.
+ *
+ * COPYRIGHT
+ * All contributions by the University of California:
+ * Copyright (c) 2014, The Regents of the University of California (Regents)
+ * All rights reserved.
+ * All other contributions:
+ * Copyright (c) 2014, the respective contributors
+ * All rights reserved.
+ * Caffe uses a shared copyright model: each contributor holds copyright over
+ * their contributions to Caffe. The project versioning records all such
+ * contribution and copyright details. If a contributor wants to further mark
+ * their specific copyright on a particular contribution, they should indicate
+ * their copyright solely in the commit message of the change when it is
+ * committed.
+ * LICENSE
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ * CONTRIBUTION AGREEMENT
+ * By contributing to the BVLC/caffe repository through pull-request, comment,
+ * or otherwise, the contributor releases their content to the
+ * license and copyright terms herein.
+ *
+ */
+#ifndef INCLUDE_UTILS_BLOB_
+#define INCLUDE_UTILS_BLOB_
+#include <memory>
+#include <vector>
+#include <glog/logging.h>
+#include "proto/model.pb.h"
+using std::shared_ptr;
+using std::vector;
+
+#define NOT_IMPLEMENTED LOG(FATAL) << "Not implemented function"
+inline void MallocHost(void** ptr, size_t size) {  // stores malloc(size) in *ptr; the result is not checked for NULL
+  *ptr = malloc(size);
+}
+
+inline void FreeHost(void* ptr) {  // releases ptr with free(); counterpart of MallocHost
+  free(ptr);
+}
+
+/**
+ * @brief Manages memory allocation and synchronization between the host (CPU)
+ *        and device (GPU).
+ * head() exposes the current SyncedHead state and size() the size given at
+ * construction (presumably in bytes -- TODO confirm against the .cc file).
+ */
+class SyncedMemory {
+ public:
+  SyncedMemory()
+      : cpu_ptr_(NULL), gpu_ptr_(NULL), size_(0), head_(UNINITIALIZED),
+        own_cpu_data_(false) {}
+  explicit SyncedMemory(size_t size)
+      : cpu_ptr_(NULL), gpu_ptr_(NULL), size_(size), head_(UNINITIALIZED),
+        own_cpu_data_(false) {}
+  ~SyncedMemory();  // NOTE(review): copying is not disabled; if this frees cpu_ptr_, copies would double free -- confirm
+  const void* cpu_data();
+  void set_cpu_data(void* data);
+  const void* gpu_data();
+  void* mutable_cpu_data();
+  void* mutable_gpu_data();
+  enum SyncedHead { UNINITIALIZED, HEAD_AT_CPU, HEAD_AT_GPU, SYNCED };
+  SyncedHead head() { return head_; }
+  size_t size() { return size_; }
+
+ private:
+  void to_cpu();
+  void to_gpu();
+  void* cpu_ptr_;
+  void* gpu_ptr_;
+  size_t size_;
+  SyncedHead head_;
+  bool own_cpu_data_;  // presumably true when cpu_ptr_ was allocated by this object -- TODO confirm
+
+};  // class SyncedMemory
+
+
+template <typename Dtype>
+class Blob {
+ public:
+  Blob(): count_(0), capacity_(0) {}
+  Blob(const vector<int>&shape);
+  /**
+   * @brief Change the dimensions of the blob, allocating new memory if
+   *        necessary.
+   *
+   * This function can be called both to create an initial allocation
+   * of memory, and to adjust the dimensions of a top blob during Layer::Reshape
+   * or Layer::Forward. When changing the size of blob, memory will only be
+   * reallocated if sufficient memory does not already exist, and excess memory
+   * will never be freed.
+   *
+   * Note that reshaping an input blob and immediately calling Net::Backward is
+   * an error; either Net::Forward or Net::Reshape need to be called to
+   * propagate the new input shape to higher layers.
+   */
+  void Reshape(const vector<int>& shape);
+  void ReshapeLike(const Blob& other);
+  const vector<int>& shape() const{
+    return shape_;
+  }
+  inline int count() const { return count_; }
+  /**
+   * @brief Copy from a source Blob.
+   *
+   * @param source the Blob to copy from
+   * @param reshape if false, require this Blob to be pre-shaped to the shape
+   *        of other (and die otherwise); if true, Reshape this Blob to other's
+   *        shape if necessary
+   */
+  void CopyFrom(const Blob<Dtype>& source, bool reshape = false);
+
+  inline const shared_ptr<SyncedMemory>& data() const {
+    CHECK(data_);
+    return data_;
+  }
+
+  const Dtype* cpu_data() const;
+  void set_cpu_data(Dtype* data);
+  const Dtype* gpu_data() const;
+  Dtype* mutable_cpu_data();
+  Dtype* mutable_gpu_data();
+  /*
+  void FromProto(const BlobProto& proto);
+  */
+  void ToProto(singa::BlobProto* proto) const;
+
+  /// @brief Compute the sum of absolute values (L1 norm) of the data.
+  Dtype asum_data() const;
+  Dtype sum_data() const;
+
+  /**
+   * @brief Set the data_ shared_ptr to point to the SyncedMemory holding the
+   *        data_ of Blob other -- useful in Layers which simply perform a copy
+   *        in their Forward pass.
+   *
+   * This deallocates the SyncedMemory holding this Blob's data_, as
+   * shared_ptr calls its destructor when reset with the "=" operator.
+   */
+  void ShareData(const Blob& other);
+  void Swap(Blob& other);
+  shared_ptr<SyncedMemory> data_;  // public member; data() above additionally CHECKs that it is set
+ protected:
+  vector<int> shape_;
+  int count_;
+  int capacity_;  // presumably the number of elements currently allocated, as in Caffe -- TODO confirm
+};  // class Blob
+
+#endif // INCLUDE_UTILS_BLOB_

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/b2dc51d2/include/utils/cluster.h
----------------------------------------------------------------------
diff --git a/include/utils/cluster.h b/include/utils/cluster.h
new file mode 100644
index 0000000..4812987
--- /dev/null
+++ b/include/utils/cluster.h
@@ -0,0 +1,125 @@
+#ifndef INCLUDE_UTILS_CLUSTER_H_
+#define INCLUDE_UTILS_CLUSTER_H_
+#include <glog/logging.h>
+#include <string>
+#include <utility>
+#include <memory>
+#include <vector>
+#include "proto/cluster.pb.h"
+
+using std::shared_ptr;
+using std::string;
+using std::vector;
+
+namespace singa {
+
+/**
+ * Cluster is a singleton object, which provides cluster configurations,
+ * e.g., the topology of the cluster.
+ * All IDs start from 0.
+ */
+class Cluster {
+ public:
+  static shared_ptr<Cluster> Get();
+  static shared_ptr<Cluster> Get(const ClusterProto& cluster, int procs_id);
+
+  int nserver_groups()const{ return cluster_.nserver_groups(); }
+  int nworker_groups()const { return cluster_.nworker_groups(); }
+  int nworkers_per_group()const {return cluster_.nworkers_per_group();}
+  int nservers_per_group()const {return cluster_.nservers_per_group();}
+  int nworkers_per_procs()const{return cluster_.nworkers_per_procs();}
+  int nservers_per_procs()const{return cluster_.nservers_per_procs();}
+  int nworker_groups_per_server_group() const {
+    return cluster_.nworker_groups()/cluster_.nserver_groups();
+  }
+
+  /**
+   * @return true if the calling procs has server threads, otherwise false
+   */
+  bool has_server()const {
+    if(server_worker_separate()){
+      CHECK_LT(procs_id_, nprocs());
+      return procs_id_>=nworker_procs();
+    }else
+      return procs_id_<nserver_procs();
+  }
+  /**
+   * @return true if the calling procs has worker threads.
+   */
+  bool has_worker()const {
+    if(server_worker_separate()){
+      return procs_id_<nworker_procs();
+    }else
+      return procs_id_<nprocs();
+  }
+  /**
+   * @return global procs id, which starts from 0.
+   */
+  int procs_id()const {return procs_id_;}
+  bool server_worker_separate() const {
+    return cluster_.server_worker_separate();
+  }
+  int nworker_procs() const {
+    return nworker_groups()*nworkers_per_group()/nworkers_per_procs();
+  }
+  int nserver_procs() const {
+    return nserver_groups()*nservers_per_group()/nservers_per_procs();
+  }
+  int nprocs() const {
+    return cluster_.nprocs();
+  }
+
+  string endpoint() const {
+    return endpoint(procs_id());
+  }
+  /**
+   * @return endpoint of the router of a procs with the specified id
+   */
+  string endpoint(int procs_id) const {
+    CHECK_LT(procs_id, nprocs());
+    CHECK_GE(procs_id, 0);
+    return endpoints_.at(procs_id);
+  }
+  string workspace() const {return cluster_.workspace();}
+  string vis_folder() const {
+    return cluster_.workspace()+"/visualization";
+  }
+  string log_folder() const {  // NOTE(review): the log_dir value itself is not used -- confirm intent
+    if(cluster_.has_log_dir()){
+      return cluster_.workspace()+"/log";  // '/' added: matches vis_folder(), was "<workspace>log"
+    }else
+      return "";
+  }
+
+  int stub_timeout() const {
+    return cluster_.stub_timeout();
+  }
+  int worker_timeout() const {
+    return cluster_.worker_timeout();
+  }
+  int server_timeout() const {
+    return cluster_.server_timeout();
+  }
+
+  /**
+   * bandwidth MB/s
+  float bandwidth() const {
+    return cluster_.bandwidth();
+  }
+   */
+
+ private:
+  Cluster(const ClusterProto &cluster, int procs_id) ;
+  void SetupFolders(const ClusterProto &cluster);
+
+ private:
+  int procs_id_;
+  std::vector<std::string> endpoints_;
+  // cluster config proto
+  ClusterProto cluster_;
+  // make this class a singleton
+  static shared_ptr<Cluster> instance_;
+};
+}  // namespace singa
+
+#endif  // INCLUDE_UTILS_CLUSTER_H_


[10/12] incubator-singa git commit: Transfer code from nusinga repo to singa apache repo. New commuinication framework is implemented to unify the frameworks of existing distributed deep learning systems. Communication is now implmented using ZeroMQ. API

Posted by wa...@apache.org.
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/b2dc51d2/include/gtest/gtest.h
----------------------------------------------------------------------
diff --git a/include/gtest/gtest.h b/include/gtest/gtest.h
new file mode 100644
index 0000000..4f3804f
--- /dev/null
+++ b/include/gtest/gtest.h
@@ -0,0 +1,20061 @@
+// Copyright 2005, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Author: wan@google.com (Zhanyong Wan)
+//
+// The Google C++ Testing Framework (Google Test)
+//
+// This header file defines the public API for Google Test.  It should be
+// included by any test program that uses Google Test.
+//
+// IMPORTANT NOTE: Due to limitation of the C++ language, we have to
+// leave some internal implementation details in this header file.
+// They are clearly marked by comments like this:
+//
+//   // INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM.
+//
+// Such code is NOT meant to be used by a user directly, and is subject
+// to CHANGE WITHOUT NOTICE.  Therefore DO NOT DEPEND ON IT in a user
+// program!
+//
+// Acknowledgment: Google Test borrowed the idea of automatic test
+// registration from Barthelemy Dagenais' (barthelemy@prologique.com)
+// easyUnit framework.
+
+#ifndef GTEST_INCLUDE_GTEST_GTEST_H_
+#define GTEST_INCLUDE_GTEST_GTEST_H_
+
+#include <limits>
+#include <ostream>
+#include <vector>
+
+// Copyright 2005, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Authors: wan@google.com (Zhanyong Wan), eefacm@gmail.com (Sean Mcafee)
+//
+// The Google C++ Testing Framework (Google Test)
+//
+// This header file declares functions and macros used internally by
+// Google Test.  They are subject to change without notice.
+
+#ifndef GTEST_INCLUDE_GTEST_INTERNAL_GTEST_INTERNAL_H_
+#define GTEST_INCLUDE_GTEST_INTERNAL_GTEST_INTERNAL_H_
+
+// Copyright 2005, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Authors: wan@google.com (Zhanyong Wan)
+//
+// Low-level types and utilities for porting Google Test to various
+// platforms.  They are subject to change without notice.  DO NOT USE
+// THEM IN USER CODE.
+//
+// This file is fundamental to Google Test.  All other Google Test source
+// files are expected to #include this.  Therefore, it cannot #include
+// any other Google Test header.
+
+#ifndef GTEST_INCLUDE_GTEST_INTERNAL_GTEST_PORT_H_
+#define GTEST_INCLUDE_GTEST_INTERNAL_GTEST_PORT_H_
+
+// The user can define the following macros in the build script to
+// control Google Test's behavior.  If the user doesn't define a macro
+// in this list, Google Test will define it.
+//
+//   GTEST_HAS_CLONE          - Define it to 1/0 to indicate that clone(2)
+//                              is/isn't available.
+//   GTEST_HAS_EXCEPTIONS     - Define it to 1/0 to indicate that exceptions
+//                              are enabled.
+//   GTEST_HAS_GLOBAL_STRING  - Define it to 1/0 to indicate that ::string
+//                              is/isn't available (some systems define
+//                              ::string, which is different to std::string).
+//   GTEST_HAS_GLOBAL_WSTRING - Define it to 1/0 to indicate that ::wstring
+//                              is/isn't available (some systems define
+//                              ::wstring, which is different to std::wstring).
+//   GTEST_HAS_POSIX_RE       - Define it to 1/0 to indicate that POSIX regular
+//                              expressions are/aren't available.
+//   GTEST_HAS_PTHREAD        - Define it to 1/0 to indicate that <pthread.h>
+//                              is/isn't available.
+//   GTEST_HAS_RTTI           - Define it to 1/0 to indicate that RTTI is/isn't
+//                              enabled.
+//   GTEST_HAS_STD_WSTRING    - Define it to 1/0 to indicate that
+//                              std::wstring does/doesn't work (Google Test can
+//                              be used where std::wstring is unavailable).
+//   GTEST_HAS_TR1_TUPLE      - Define it to 1/0 to indicate tr1::tuple
+//                              is/isn't available.
+//   GTEST_HAS_SEH            - Define it to 1/0 to indicate whether the
+//                              compiler supports Microsoft's "Structured
+//                              Exception Handling".
+//   GTEST_HAS_STREAM_REDIRECTION
+//                            - Define it to 1/0 to indicate whether the
+//                              platform supports I/O stream redirection using
+//                              dup() and dup2().
+//   GTEST_USE_OWN_TR1_TUPLE  - Define it to 1/0 to indicate whether Google
+//                              Test's own tr1 tuple implementation should be
+//                              used.  Unused when the user sets
+//                              GTEST_HAS_TR1_TUPLE to 0.
+//   GTEST_LANG_CXX11         - Define it to 1/0 to indicate that Google Test
+//                              is building in C++11/C++98 mode.
+//   GTEST_LINKED_AS_SHARED_LIBRARY
+//                            - Define to 1 when compiling tests that use
+//                              Google Test as a shared library (known as
+//                              DLL on Windows).
+//   GTEST_CREATE_SHARED_LIBRARY
+//                            - Define to 1 when compiling Google Test itself
+//                              as a shared library.
+
+// This header defines the following utilities:
+//
+// Macros indicating the current platform (defined to 1 if compiled on
+// the given platform; otherwise undefined):
+//   GTEST_OS_AIX      - IBM AIX
+//   GTEST_OS_CYGWIN   - Cygwin
+//   GTEST_OS_HPUX     - HP-UX
+//   GTEST_OS_LINUX    - Linux
+//     GTEST_OS_LINUX_ANDROID - Google Android
+//   GTEST_OS_MAC      - Mac OS X
+//     GTEST_OS_IOS    - iOS
+//       GTEST_OS_IOS_SIMULATOR - iOS simulator
+//   GTEST_OS_NACL     - Google Native Client (NaCl)
+//   GTEST_OS_OPENBSD  - OpenBSD
+//   GTEST_OS_QNX      - QNX
+//   GTEST_OS_SOLARIS  - Sun Solaris
+//   GTEST_OS_SYMBIAN  - Symbian
+//   GTEST_OS_WINDOWS  - Windows (Desktop, MinGW, or Mobile)
+//     GTEST_OS_WINDOWS_DESKTOP  - Windows Desktop
+//     GTEST_OS_WINDOWS_MINGW    - MinGW
+//     GTEST_OS_WINDOWS_MOBILE   - Windows Mobile
+//   GTEST_OS_ZOS      - z/OS
+//
+// Among the platforms, Cygwin, Linux, Mac OS X, and Windows have the
+// most stable support.  Since core members of the Google Test project
+// don't have access to other platforms, support for them may be less
+// stable.  If you notice any problems on your platform, please notify
+// googletestframework@googlegroups.com (patches for fixing them are
+// even more welcome!).
+//
+// Note that it is possible that none of the GTEST_OS_* macros are defined.
+//
+// Macros indicating available Google Test features (defined to 1 if
+// the corresponding feature is supported; otherwise undefined):
+//   GTEST_HAS_COMBINE      - the Combine() function (for value-parameterized
+//                            tests)
+//   GTEST_HAS_DEATH_TEST   - death tests
+//   GTEST_HAS_PARAM_TEST   - value-parameterized tests
+//   GTEST_HAS_TYPED_TEST   - typed tests
+//   GTEST_HAS_TYPED_TEST_P - type-parameterized tests
+//   GTEST_USES_POSIX_RE    - enhanced POSIX regex is used. Do not confuse with
+//                            GTEST_HAS_POSIX_RE (see above) which users can
+//                            define themselves.
+//   GTEST_USES_SIMPLE_RE   - our own simple regex is used;
+//                            the above two are mutually exclusive.
+//   GTEST_CAN_COMPARE_NULL - accepts untyped NULL in EXPECT_EQ().
+//
+// Macros for basic C++ coding:
+//   GTEST_AMBIGUOUS_ELSE_BLOCKER_ - for disabling a gcc warning.
+//   GTEST_ATTRIBUTE_UNUSED_  - declares that a class' instances or a
+//                              variable don't have to be used.
+//   GTEST_DISALLOW_ASSIGN_   - disables operator=.
+//   GTEST_DISALLOW_COPY_AND_ASSIGN_ - disables copy ctor and operator=.
+//   GTEST_MUST_USE_RESULT_   - declares that a function's result must be used.
+//
+// Synchronization:
+//   Mutex, MutexLock, ThreadLocal, GetThreadCount()
+//                  - synchronization primitives.
+//   GTEST_IS_THREADSAFE - defined to 1 to indicate that the above
+//                         synchronization primitives have real implementations
+//                         and Google Test is thread-safe; or 0 otherwise.
+//
+// Template meta programming:
+//   is_pointer     - as in TR1; needed on Symbian and IBM XL C/C++ only.
+//   IteratorTraits - partial implementation of std::iterator_traits, which
+//                    is not available in libCstd when compiled with Sun C++.
+//
+// Smart pointers:
+//   scoped_ptr     - as in TR2.
+//
+// Regular expressions:
+//   RE             - a simple regular expression class using the POSIX
+//                    Extended Regular Expression syntax on UNIX-like
+//                    platforms, or a reduced regular expression syntax on
+//                    other platforms, including Windows.
+//
+// Logging:
+//   GTEST_LOG_()   - logs messages at the specified severity level.
+//   LogToStderr()  - directs all log messages to stderr.
+//   FlushInfoLog() - flushes informational log messages.
+//
+// Stdout and stderr capturing:
+//   CaptureStdout()     - starts capturing stdout.
+//   GetCapturedStdout() - stops capturing stdout and returns the captured
+//                         string.
+//   CaptureStderr()     - starts capturing stderr.
+//   GetCapturedStderr() - stops capturing stderr and returns the captured
+//                         string.
+//
+// Integer types:
+//   TypeWithSize   - maps an integer to an int type.
+//   Int32, UInt32, Int64, UInt64, TimeInMillis
+//                  - integers of known sizes.
+//   BiggestInt     - the biggest signed integer type.
+//
+// Command-line utilities:
+//   GTEST_FLAG()       - references a flag.
+//   GTEST_DECLARE_*()  - declares a flag.
+//   GTEST_DEFINE_*()   - defines a flag.
+//   GetInjectableArgvs() - returns the command line as a vector of strings.
+//
+// Environment variable utilities:
+//   GetEnv()             - gets the value of an environment variable.
+//   BoolFromGTestEnv()   - parses a bool environment variable.
+//   Int32FromGTestEnv()  - parses an Int32 environment variable.
+//   StringFromGTestEnv() - parses a string environment variable.
+
+#include <ctype.h>   // for isspace, etc
+#include <stddef.h>  // for ptrdiff_t
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#ifndef _WIN32_WCE
+# include <sys/types.h>
+# include <sys/stat.h>
+#endif  // !_WIN32_WCE
+
+#if defined __APPLE__
+# include <AvailabilityMacros.h>
+# include <TargetConditionals.h>
+#endif
+
+#include <iostream>  // NOLINT
+#include <sstream>  // NOLINT
+#include <string>  // NOLINT
+
+#define GTEST_DEV_EMAIL_ "googletestframework@@googlegroups.com"
+#define GTEST_FLAG_PREFIX_ "gtest_"
+#define GTEST_FLAG_PREFIX_DASH_ "gtest-"
+#define GTEST_FLAG_PREFIX_UPPER_ "GTEST_"
+#define GTEST_NAME_ "Google Test"
+#define GTEST_PROJECT_URL_ "http://code.google.com/p/googletest/"
+
+// Determines the version of gcc that is used to compile this.
+#ifdef __GNUC__
+// 40302 means version 4.3.2.
+# define GTEST_GCC_VER_ \
+    (__GNUC__*10000 + __GNUC_MINOR__*100 + __GNUC_PATCHLEVEL__)
+#endif  // __GNUC__
+
+// Determines the platform on which Google Test is compiled.
+#ifdef __CYGWIN__
+# define GTEST_OS_CYGWIN 1
+#elif defined __SYMBIAN32__
+# define GTEST_OS_SYMBIAN 1
+#elif defined _WIN32
+# define GTEST_OS_WINDOWS 1
+# ifdef _WIN32_WCE
+#  define GTEST_OS_WINDOWS_MOBILE 1
+# elif defined(__MINGW__) || defined(__MINGW32__)
+#  define GTEST_OS_WINDOWS_MINGW 1
+# else
+#  define GTEST_OS_WINDOWS_DESKTOP 1
+# endif  // _WIN32_WCE
+#elif defined __APPLE__
+# define GTEST_OS_MAC 1
+# if TARGET_OS_IPHONE
+#  define GTEST_OS_IOS 1
+#  if TARGET_IPHONE_SIMULATOR
+#   define GTEST_OS_IOS_SIMULATOR 1
+#  endif
+# endif
+#elif defined __linux__
+# define GTEST_OS_LINUX 1
+# if defined __ANDROID__
+#  define GTEST_OS_LINUX_ANDROID 1
+# endif
+#elif defined __MVS__
+# define GTEST_OS_ZOS 1
+#elif defined(__sun) && defined(__SVR4)
+# define GTEST_OS_SOLARIS 1
+#elif defined(_AIX)
+# define GTEST_OS_AIX 1
+#elif defined(__hpux)
+# define GTEST_OS_HPUX 1
+#elif defined __native_client__
+# define GTEST_OS_NACL 1
+#elif defined __OpenBSD__
+# define GTEST_OS_OPENBSD 1
+#elif defined __QNX__
+# define GTEST_OS_QNX 1
+#endif  // __CYGWIN__
+
+#ifndef GTEST_LANG_CXX11
+// gcc and clang define __GXX_EXPERIMENTAL_CXX0X__ when
+// -std={c,gnu}++{0x,11} is passed.  The C++11 standard specifies a
+// value for __cplusplus, and recent versions of clang, gcc, and
+// probably other compilers set that too in C++11 mode.
+# if __GXX_EXPERIMENTAL_CXX0X__ || __cplusplus >= 201103L
+// Compiling in at least C++11 mode.
+#  define GTEST_LANG_CXX11 1
+# else
+#  define GTEST_LANG_CXX11 0
+# endif
+#endif
+
+// Brings in definitions for functions used in the testing::internal::posix
+// namespace (read, write, close, chdir, isatty, stat). We do not currently
+// use them on Windows Mobile.
+#if !GTEST_OS_WINDOWS
+// This assumes that non-Windows OSes provide unistd.h. For OSes where this
+// is not the case, we need to include headers that provide the functions
+// mentioned above.
+# include <unistd.h>
+# include <strings.h>
+#elif !GTEST_OS_WINDOWS_MOBILE
+# include <direct.h>
+# include <io.h>
+#endif
+
+#if GTEST_OS_LINUX_ANDROID
+// Used to define __ANDROID_API__ matching the target NDK API level.
+#  include <android/api-level.h>  // NOLINT
+#endif
+
+// Defines this to true iff Google Test can use POSIX regular expressions.
+#ifndef GTEST_HAS_POSIX_RE
+# if GTEST_OS_LINUX_ANDROID
+// On Android, <regex.h> is only available starting with Gingerbread.
+#  define GTEST_HAS_POSIX_RE (__ANDROID_API__ >= 9)
+# else
+#  define GTEST_HAS_POSIX_RE (!GTEST_OS_WINDOWS)
+# endif
+#endif
+
+#if GTEST_HAS_POSIX_RE
+
+// On some platforms, <regex.h> needs someone to define size_t, and
+// won't compile otherwise.  We can #include it here as we already
+// included <stdlib.h>, which is guaranteed to define size_t through
+// <stddef.h>.
+# include <regex.h>  // NOLINT
+
+# define GTEST_USES_POSIX_RE 1
+
+#elif GTEST_OS_WINDOWS
+
+// <regex.h> is not available on Windows.  Use our own simple regex
+// implementation instead.
+# define GTEST_USES_SIMPLE_RE 1
+
+#else
+
+// <regex.h> may not be available on this platform.  Use our own
+// simple regex implementation instead.
+# define GTEST_USES_SIMPLE_RE 1
+
+#endif  // GTEST_HAS_POSIX_RE
+
+#ifndef GTEST_HAS_EXCEPTIONS
+// The user didn't tell us whether exceptions are enabled, so we need
+// to figure it out.
+# if defined(_MSC_VER) || defined(__BORLANDC__)
+// MSVC's and C++Builder's implementations of the STL use the _HAS_EXCEPTIONS
+// macro to enable exceptions, so we'll do the same.
+// Assumes that exceptions are enabled by default.
+#  ifndef _HAS_EXCEPTIONS
+#   define _HAS_EXCEPTIONS 1
+#  endif  // _HAS_EXCEPTIONS
+#  define GTEST_HAS_EXCEPTIONS _HAS_EXCEPTIONS
+# elif defined(__GNUC__) && __EXCEPTIONS
+// gcc defines __EXCEPTIONS to 1 iff exceptions are enabled.
+#  define GTEST_HAS_EXCEPTIONS 1
+# elif defined(__SUNPRO_CC)
+// Sun Pro CC supports exceptions.  However, there is no compile-time way of
+// detecting whether they are enabled or not.  Therefore, we assume that
+// they are enabled unless the user tells us otherwise.
+#  define GTEST_HAS_EXCEPTIONS 1
+# elif defined(__IBMCPP__) && __EXCEPTIONS
+// xlC defines __EXCEPTIONS to 1 iff exceptions are enabled.
+#  define GTEST_HAS_EXCEPTIONS 1
+# elif defined(__HP_aCC)
+// Exception handling is in effect by default in HP aCC compiler. It has to
+// be turned off by +noeh compiler option if desired.
+#  define GTEST_HAS_EXCEPTIONS 1
+# else
+// For other compilers, we assume exceptions are disabled to be
+// conservative.
+#  define GTEST_HAS_EXCEPTIONS 0
+# endif  // defined(_MSC_VER) || defined(__BORLANDC__)
+#endif  // GTEST_HAS_EXCEPTIONS
+
+#if !defined(GTEST_HAS_STD_STRING)
+// Even though we don't use this macro any longer, we keep it in case
+// some clients still depend on it.
+# define GTEST_HAS_STD_STRING 1
+#elif !GTEST_HAS_STD_STRING
+// The user told us that ::std::string isn't available.
+# error "Google Test cannot be used where ::std::string isn't available."
+#endif  // !defined(GTEST_HAS_STD_STRING)
+
+#ifndef GTEST_HAS_GLOBAL_STRING
+// The user didn't tell us whether ::string is available, so we need
+// to figure it out.
+
+# define GTEST_HAS_GLOBAL_STRING 0
+
+#endif  // GTEST_HAS_GLOBAL_STRING
+
+#ifndef GTEST_HAS_STD_WSTRING
+// The user didn't tell us whether ::std::wstring is available, so we need
+// to figure it out.
+// TODO(wan@google.com): uses autoconf to detect whether ::std::wstring
+//   is available.
+
+// Cygwin 1.7 and below doesn't support ::std::wstring.
+// Solaris' libc++ doesn't support it either.  Android has
+// no support for it at least as recent as Froyo (2.2).
+# define GTEST_HAS_STD_WSTRING \
+    (!(GTEST_OS_LINUX_ANDROID || GTEST_OS_CYGWIN || GTEST_OS_SOLARIS))
+
+#endif  // GTEST_HAS_STD_WSTRING
+
+#ifndef GTEST_HAS_GLOBAL_WSTRING
+// The user didn't tell us whether ::wstring is available, so we need
+// to figure it out.
+# define GTEST_HAS_GLOBAL_WSTRING \
+    (GTEST_HAS_STD_WSTRING && GTEST_HAS_GLOBAL_STRING)
+#endif  // GTEST_HAS_GLOBAL_WSTRING
+
+// Determines whether RTTI is available.
+#ifndef GTEST_HAS_RTTI
+// The user didn't tell us whether RTTI is enabled, so we need to
+// figure it out.
+
+# ifdef _MSC_VER
+
+#  ifdef _CPPRTTI  // MSVC defines this macro iff RTTI is enabled.
+#   define GTEST_HAS_RTTI 1
+#  else
+#   define GTEST_HAS_RTTI 0
+#  endif
+
+// Starting with version 4.3.2, gcc defines __GXX_RTTI iff RTTI is enabled.
+# elif defined(__GNUC__) && (GTEST_GCC_VER_ >= 40302)
+
+#  ifdef __GXX_RTTI
+// When building against STLport with the Android NDK and with
+// -frtti -fno-exceptions, the build fails at link time with undefined
+// references to __cxa_bad_typeid. Not sure if STL or toolchain bug,
+// so disable RTTI when detected.
+#   if GTEST_OS_LINUX_ANDROID && defined(_STLPORT_MAJOR) && \
+       !defined(__EXCEPTIONS)
+#    define GTEST_HAS_RTTI 0
+#   else
+#    define GTEST_HAS_RTTI 1
+#   endif  // GTEST_OS_LINUX_ANDROID && __STLPORT_MAJOR && !__EXCEPTIONS
+#  else
+#   define GTEST_HAS_RTTI 0
+#  endif  // __GXX_RTTI
+
+// Clang defines __GXX_RTTI starting with version 3.0, but its manual recommends
+// using has_feature instead. has_feature(cxx_rtti) is supported since 2.7, the
+// first version with C++ support.
+# elif defined(__clang__)
+
+#  define GTEST_HAS_RTTI __has_feature(cxx_rtti)
+
+// Starting with version 9.0 IBM Visual Age defines __RTTI_ALL__ to 1 if
+// both the typeid and dynamic_cast features are present.
+# elif defined(__IBMCPP__) && (__IBMCPP__ >= 900)
+
+#  ifdef __RTTI_ALL__
+#   define GTEST_HAS_RTTI 1
+#  else
+#   define GTEST_HAS_RTTI 0
+#  endif
+
+# else
+
+// For all other compilers, we assume RTTI is enabled.
+#  define GTEST_HAS_RTTI 1
+
+# endif  // _MSC_VER
+
+#endif  // GTEST_HAS_RTTI
+
+// It's this header's responsibility to #include <typeinfo> when RTTI
+// is enabled.
+#if GTEST_HAS_RTTI
+# include <typeinfo>
+#endif
+
+// Determines whether Google Test can use the pthreads library.
+#ifndef GTEST_HAS_PTHREAD
+// The user didn't tell us explicitly, so we assume pthreads support is
+// available on Linux and Mac.
+//
+// To disable threading support in Google Test, add -DGTEST_HAS_PTHREAD=0
+// to your compiler flags.
+# define GTEST_HAS_PTHREAD (GTEST_OS_LINUX || GTEST_OS_MAC || GTEST_OS_HPUX \
+    || GTEST_OS_QNX)
+#endif  // GTEST_HAS_PTHREAD
+
+#if GTEST_HAS_PTHREAD
+// gtest-port.h guarantees to #include <pthread.h> when GTEST_HAS_PTHREAD is
+// true.
+# include <pthread.h>  // NOLINT
+
+// For timespec and nanosleep, used below.
+# include <time.h>  // NOLINT
+#endif
+
+// Determines whether Google Test can use tr1/tuple.  You can define
+// this macro to 0 to prevent Google Test from using tuple (any
+// feature depending on tuple will be disabled in this mode).
+#ifndef GTEST_HAS_TR1_TUPLE
+# if GTEST_OS_LINUX_ANDROID && defined(_STLPORT_MAJOR)
+// STLport, provided with the Android NDK, has neither <tr1/tuple> nor <tuple>.
+#  define GTEST_HAS_TR1_TUPLE 0
+# else
+// The user didn't tell us not to do it, so we assume it's OK.
+#  define GTEST_HAS_TR1_TUPLE 1
+# endif
+#endif  // GTEST_HAS_TR1_TUPLE
+
+// Determines whether Google Test's own tr1 tuple implementation
+// should be used.
+#ifndef GTEST_USE_OWN_TR1_TUPLE
+// The user didn't tell us, so we need to figure it out.
+
+// We use our own TR1 tuple if we aren't sure the user has an
+// implementation of it already.  At this time, libstdc++ 4.0.0+ and
+// MSVC 2010 are the only mainstream standard libraries that come
+// with a TR1 tuple implementation.  NVIDIA's CUDA NVCC compiler
+// pretends to be GCC by defining __GNUC__ and friends, but cannot
+// compile GCC's tuple implementation.  MSVC 2008 (9.0) provides TR1
+// tuple in a 323 MB Feature Pack download, which we cannot assume the
+// user has.  QNX's QCC compiler is a modified GCC but it doesn't
+// support TR1 tuple.  libc++ only provides std::tuple, in C++11 mode,
+// and it can be used with some compilers that define __GNUC__.
+# if (defined(__GNUC__) && !defined(__CUDACC__) && (GTEST_GCC_VER_ >= 40000) \
+      && !GTEST_OS_QNX && !defined(_LIBCPP_VERSION)) || _MSC_VER >= 1600
+#  define GTEST_ENV_HAS_TR1_TUPLE_ 1
+# endif
+
+// C++11 specifies that <tuple> provides std::tuple. Use that if gtest is used
+// in C++11 mode and libstdc++ isn't very old (binaries targeting OS X 10.6
+// can build with clang but need to use gcc4.2's libstdc++).
+# if GTEST_LANG_CXX11 && (!defined(__GLIBCXX__) || __GLIBCXX__ > 20110325)
+#  define GTEST_ENV_HAS_STD_TUPLE_ 1
+# endif
+
+# if GTEST_ENV_HAS_TR1_TUPLE_ || GTEST_ENV_HAS_STD_TUPLE_
+#  define GTEST_USE_OWN_TR1_TUPLE 0
+# else
+#  define GTEST_USE_OWN_TR1_TUPLE 1
+# endif
+
+#endif  // GTEST_USE_OWN_TR1_TUPLE
+
+// To avoid conditional compilation everywhere, we make it
+// gtest-port.h's responsibility to #include the header implementing
+// tr1/tuple.
+#if GTEST_HAS_TR1_TUPLE
+
+# if GTEST_USE_OWN_TR1_TUPLE
+// This file was GENERATED by command:
+//     pump.py gtest-tuple.h.pump
+// DO NOT EDIT BY HAND!!!
+
+// Copyright 2009 Google Inc.
+// All Rights Reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Author: wan@google.com (Zhanyong Wan)
+
+// Implements a subset of TR1 tuple needed by Google Test and Google Mock.
+
+#ifndef GTEST_INCLUDE_GTEST_INTERNAL_GTEST_TUPLE_H_
+#define GTEST_INCLUDE_GTEST_INTERNAL_GTEST_TUPLE_H_
+
+#include <utility>  // For ::std::pair.
+
+// The compiler used in Symbian has a bug that prevents us from declaring the
+// tuple template as a friend (it complains that tuple is redefined).  This
+// hack bypasses the bug by declaring the members that should otherwise be
+// private as public.
+// Sun Studio versions < 12 also have the above bug.
+#if defined(__SYMBIAN32__) || (defined(__SUNPRO_CC) && __SUNPRO_CC < 0x590)
+# define GTEST_DECLARE_TUPLE_AS_FRIEND_ public:
+#else
+# define GTEST_DECLARE_TUPLE_AS_FRIEND_ \
+    template <GTEST_10_TYPENAMES_(U)> friend class tuple; \
+   private:
+#endif
+
+// GTEST_n_TUPLE_(T) is the type of an n-tuple.
+#define GTEST_0_TUPLE_(T) tuple<>
+#define GTEST_1_TUPLE_(T) tuple<T##0, void, void, void, void, void, void, \
+    void, void, void>
+#define GTEST_2_TUPLE_(T) tuple<T##0, T##1, void, void, void, void, void, \
+    void, void, void>
+#define GTEST_3_TUPLE_(T) tuple<T##0, T##1, T##2, void, void, void, void, \
+    void, void, void>
+#define GTEST_4_TUPLE_(T) tuple<T##0, T##1, T##2, T##3, void, void, void, \
+    void, void, void>
+#define GTEST_5_TUPLE_(T) tuple<T##0, T##1, T##2, T##3, T##4, void, void, \
+    void, void, void>
+#define GTEST_6_TUPLE_(T) tuple<T##0, T##1, T##2, T##3, T##4, T##5, void, \
+    void, void, void>
+#define GTEST_7_TUPLE_(T) tuple<T##0, T##1, T##2, T##3, T##4, T##5, T##6, \
+    void, void, void>
+#define GTEST_8_TUPLE_(T) tuple<T##0, T##1, T##2, T##3, T##4, T##5, T##6, \
+    T##7, void, void>
+#define GTEST_9_TUPLE_(T) tuple<T##0, T##1, T##2, T##3, T##4, T##5, T##6, \
+    T##7, T##8, void>
+#define GTEST_10_TUPLE_(T) tuple<T##0, T##1, T##2, T##3, T##4, T##5, T##6, \
+    T##7, T##8, T##9>
+
+// GTEST_n_TYPENAMES_(T) declares a list of n typenames.
+#define GTEST_0_TYPENAMES_(T)
+#define GTEST_1_TYPENAMES_(T) typename T##0
+#define GTEST_2_TYPENAMES_(T) typename T##0, typename T##1
+#define GTEST_3_TYPENAMES_(T) typename T##0, typename T##1, typename T##2
+#define GTEST_4_TYPENAMES_(T) typename T##0, typename T##1, typename T##2, \
+    typename T##3
+#define GTEST_5_TYPENAMES_(T) typename T##0, typename T##1, typename T##2, \
+    typename T##3, typename T##4
+#define GTEST_6_TYPENAMES_(T) typename T##0, typename T##1, typename T##2, \
+    typename T##3, typename T##4, typename T##5
+#define GTEST_7_TYPENAMES_(T) typename T##0, typename T##1, typename T##2, \
+    typename T##3, typename T##4, typename T##5, typename T##6
+#define GTEST_8_TYPENAMES_(T) typename T##0, typename T##1, typename T##2, \
+    typename T##3, typename T##4, typename T##5, typename T##6, typename T##7
+#define GTEST_9_TYPENAMES_(T) typename T##0, typename T##1, typename T##2, \
+    typename T##3, typename T##4, typename T##5, typename T##6, \
+    typename T##7, typename T##8
+#define GTEST_10_TYPENAMES_(T) typename T##0, typename T##1, typename T##2, \
+    typename T##3, typename T##4, typename T##5, typename T##6, \
+    typename T##7, typename T##8, typename T##9
+
+// In theory, defining stuff in the ::std namespace is undefined
+// behavior.  We can do this as we are playing the role of a standard
+// library vendor.
+namespace std {
+namespace tr1 {
+
+// Forward declaration of the tuple class template.  It accepts up to ten
+// element types and every parameter defaults to void; the GTEST_n_TUPLE_
+// macros above fill the unused trailing positions with void.
+template <typename T0 = void, typename T1 = void, typename T2 = void,
+    typename T3 = void, typename T4 = void, typename T5 = void,
+    typename T6 = void, typename T7 = void, typename T8 = void,
+    typename T9 = void>
+class tuple;
+
+// Anything in namespace gtest_internal is Google Test's INTERNAL
+// IMPLEMENTATION DETAIL and MUST NOT BE USED DIRECTLY in user code.
+namespace gtest_internal {
+
+// ByRef<T>::type: const T& for a non-reference T; a reference T stays as is.
+template <typename T>
+struct ByRef { typedef const T& type; };  // NOLINT
+template <typename T>
+struct ByRef<T&> { typedef T& type; };  // NOLINT
+
+// Shorthand for ByRef<T>::type, fully qualified so it resolves in any scope.
+#define GTEST_BY_REF_(T) typename ::std::tr1::gtest_internal::ByRef<T>::type
+
+// AddRef<T>::type is T& for a non-reference T; a reference T stays as is.
+// This is the same as tr1::add_reference<T>::type.
+template <typename T>
+struct AddRef { typedef T& type; };  // NOLINT
+template <typename T>
+struct AddRef<T&> { typedef T& type; };  // NOLINT
+
+// Shorthand for AddRef<T>::type, fully qualified so it resolves in any scope.
+#define GTEST_ADD_REF_(T) typename ::std::tr1::gtest_internal::AddRef<T>::type
+
+// Helper for get<k>(); declared here so the tuple classes can befriend it.
+template <int k> class Get;
+
+// A helper for implementing tuple_element<k, T>.  kIndexValid is true
+// iff k < the number of fields in tuple type T; only "true" is specialized.
+template <bool kIndexValid, int kIndex, class Tuple>
+struct TupleElement;
+// The specialization for index k exposes template argument Tk as "type".
+template <GTEST_10_TYPENAMES_(T)>
+struct TupleElement<true, 0, GTEST_10_TUPLE_(T) > {
+  typedef T0 type;
+};
+
+template <GTEST_10_TYPENAMES_(T)>
+struct TupleElement<true, 1, GTEST_10_TUPLE_(T) > {
+  typedef T1 type;
+};
+
+template <GTEST_10_TYPENAMES_(T)>
+struct TupleElement<true, 2, GTEST_10_TUPLE_(T) > {
+  typedef T2 type;
+};
+
+template <GTEST_10_TYPENAMES_(T)>
+struct TupleElement<true, 3, GTEST_10_TUPLE_(T) > {
+  typedef T3 type;
+};
+
+template <GTEST_10_TYPENAMES_(T)>
+struct TupleElement<true, 4, GTEST_10_TUPLE_(T) > {
+  typedef T4 type;
+};
+
+template <GTEST_10_TYPENAMES_(T)>
+struct TupleElement<true, 5, GTEST_10_TUPLE_(T) > {
+  typedef T5 type;
+};
+
+template <GTEST_10_TYPENAMES_(T)>
+struct TupleElement<true, 6, GTEST_10_TUPLE_(T) > {
+  typedef T6 type;
+};
+
+template <GTEST_10_TYPENAMES_(T)>
+struct TupleElement<true, 7, GTEST_10_TUPLE_(T) > {
+  typedef T7 type;
+};
+
+template <GTEST_10_TYPENAMES_(T)>
+struct TupleElement<true, 8, GTEST_10_TUPLE_(T) > {
+  typedef T8 type;
+};
+
+template <GTEST_10_TYPENAMES_(T)>
+struct TupleElement<true, 9, GTEST_10_TUPLE_(T) > {
+  typedef T9 type;
+};
+
+}  // namespace gtest_internal
+
+template <>
+class tuple<> {  // The empty tuple: no fields, so copy and assign do nothing.
+ public:
+  tuple() {}
+  tuple(const tuple& /* t */)  {}
+  tuple& operator=(const tuple& /* t */) { return *this; }
+};
+
+template <GTEST_1_TYPENAMES_(T)>
+class GTEST_1_TUPLE_(T) {  // Tuple with a single field, f0_.
+ public:
+  template <int k> friend class gtest_internal::Get;
+  // Value-initializes the field.
+  tuple() : f0_() {}
+  // Initializes the field from f0.
+  explicit tuple(GTEST_BY_REF_(T0) f0) : f0_(f0) {}
+
+  tuple(const tuple& t) : f0_(t.f0_) {}
+  // Converting copy: f0_ is constructed from the other tuple's field.
+  template <GTEST_1_TYPENAMES_(U)>
+  tuple(const GTEST_1_TUPLE_(U)& t) : f0_(t.f0_) {}
+
+  tuple& operator=(const tuple& t) { return CopyFrom(t); }
+
+  template <GTEST_1_TYPENAMES_(U)>
+  tuple& operator=(const GTEST_1_TUPLE_(U)& t) {
+    return CopyFrom(t);
+  }
+
+  GTEST_DECLARE_TUPLE_AS_FRIEND_  // NOTE(review): defined out of view; confirm
+  // Field-wise assignment shared by both operator= overloads.
+  template <GTEST_1_TYPENAMES_(U)>
+  tuple& CopyFrom(const GTEST_1_TUPLE_(U)& t) {
+    f0_ = t.f0_;
+    return *this;
+  }
+
+  T0 f0_;
+};
+
+template <GTEST_2_TYPENAMES_(T)>
+class GTEST_2_TUPLE_(T) {  // Tuple with two fields, f0_ and f1_.
+ public:
+  template <int k> friend class gtest_internal::Get;
+  // Value-initializes every field.
+  tuple() : f0_(), f1_() {}
+  // Initializes each field from the matching argument.
+  explicit tuple(GTEST_BY_REF_(T0) f0, GTEST_BY_REF_(T1) f1) : f0_(f0),
+      f1_(f1) {}
+
+  tuple(const tuple& t) : f0_(t.f0_), f1_(t.f1_) {}
+  // Converting copies: from another 2-tuple, or from a std::pair.
+  template <GTEST_2_TYPENAMES_(U)>
+  tuple(const GTEST_2_TUPLE_(U)& t) : f0_(t.f0_), f1_(t.f1_) {}
+  template <typename U0, typename U1>  // first -> f0_, second -> f1_
+  tuple(const ::std::pair<U0, U1>& p) : f0_(p.first), f1_(p.second) {}
+
+  tuple& operator=(const tuple& t) { return CopyFrom(t); }
+
+  template <GTEST_2_TYPENAMES_(U)>
+  tuple& operator=(const GTEST_2_TUPLE_(U)& t) {
+    return CopyFrom(t);
+  }
+  template <typename U0, typename U1>  // first -> f0_, second -> f1_
+  tuple& operator=(const ::std::pair<U0, U1>& p) {
+    f0_ = p.first;
+    f1_ = p.second;
+    return *this;
+  }
+
+  GTEST_DECLARE_TUPLE_AS_FRIEND_
+  // Field-wise assignment shared by the tuple operator= overloads.
+  template <GTEST_2_TYPENAMES_(U)>
+  tuple& CopyFrom(const GTEST_2_TUPLE_(U)& t) {
+    f0_ = t.f0_;
+    f1_ = t.f1_;
+    return *this;
+  }
+
+  T0 f0_;
+  T1 f1_;
+};
+
+template <GTEST_3_TYPENAMES_(T)>
+class GTEST_3_TUPLE_(T) {  // Tuple with three fields, f0_ .. f2_.
+ public:
+  template <int k> friend class gtest_internal::Get;
+  // Value-initializes every field.
+  tuple() : f0_(), f1_(), f2_() {}
+  // Initializes each field from the matching argument.
+  explicit tuple(GTEST_BY_REF_(T0) f0, GTEST_BY_REF_(T1) f1,
+      GTEST_BY_REF_(T2) f2) : f0_(f0), f1_(f1), f2_(f2) {}
+
+  tuple(const tuple& t) : f0_(t.f0_), f1_(t.f1_), f2_(t.f2_) {}
+  // Converting copy: each field is constructed from the matching U field.
+  template <GTEST_3_TYPENAMES_(U)>
+  tuple(const GTEST_3_TUPLE_(U)& t) : f0_(t.f0_), f1_(t.f1_), f2_(t.f2_) {}
+
+  tuple& operator=(const tuple& t) { return CopyFrom(t); }
+
+  template <GTEST_3_TYPENAMES_(U)>
+  tuple& operator=(const GTEST_3_TUPLE_(U)& t) {
+    return CopyFrom(t);
+  }
+
+  GTEST_DECLARE_TUPLE_AS_FRIEND_
+  // Field-wise assignment shared by both operator= overloads.
+  template <GTEST_3_TYPENAMES_(U)>
+  tuple& CopyFrom(const GTEST_3_TUPLE_(U)& t) {
+    f0_ = t.f0_;
+    f1_ = t.f1_;
+    f2_ = t.f2_;
+    return *this;
+  }
+
+  T0 f0_;
+  T1 f1_;
+  T2 f2_;
+};
+
+template <GTEST_4_TYPENAMES_(T)>
+class GTEST_4_TUPLE_(T) {  // Tuple with four fields, f0_ .. f3_.
+ public:
+  template <int k> friend class gtest_internal::Get;
+  // Value-initializes every field.
+  tuple() : f0_(), f1_(), f2_(), f3_() {}
+  // Initializes each field from the matching argument.
+  explicit tuple(GTEST_BY_REF_(T0) f0, GTEST_BY_REF_(T1) f1,
+      GTEST_BY_REF_(T2) f2, GTEST_BY_REF_(T3) f3) : f0_(f0), f1_(f1), f2_(f2),
+      f3_(f3) {}
+
+  tuple(const tuple& t) : f0_(t.f0_), f1_(t.f1_), f2_(t.f2_), f3_(t.f3_) {}
+  // Converting copy: each field is constructed from the matching U field.
+  template <GTEST_4_TYPENAMES_(U)>
+  tuple(const GTEST_4_TUPLE_(U)& t) : f0_(t.f0_), f1_(t.f1_), f2_(t.f2_),
+      f3_(t.f3_) {}
+
+  tuple& operator=(const tuple& t) { return CopyFrom(t); }
+
+  template <GTEST_4_TYPENAMES_(U)>
+  tuple& operator=(const GTEST_4_TUPLE_(U)& t) {
+    return CopyFrom(t);
+  }
+
+  GTEST_DECLARE_TUPLE_AS_FRIEND_
+  // Field-wise assignment shared by both operator= overloads.
+  template <GTEST_4_TYPENAMES_(U)>
+  tuple& CopyFrom(const GTEST_4_TUPLE_(U)& t) {
+    f0_ = t.f0_;
+    f1_ = t.f1_;
+    f2_ = t.f2_;
+    f3_ = t.f3_;
+    return *this;
+  }
+
+  T0 f0_;
+  T1 f1_;
+  T2 f2_;
+  T3 f3_;
+};
+
+template <GTEST_5_TYPENAMES_(T)>
+class GTEST_5_TUPLE_(T) {  // Tuple with five fields, f0_ .. f4_.
+ public:
+  template <int k> friend class gtest_internal::Get;
+  // Value-initializes every field.
+  tuple() : f0_(), f1_(), f2_(), f3_(), f4_() {}
+  // Initializes each field from the matching argument.
+  explicit tuple(GTEST_BY_REF_(T0) f0, GTEST_BY_REF_(T1) f1,
+      GTEST_BY_REF_(T2) f2, GTEST_BY_REF_(T3) f3,
+      GTEST_BY_REF_(T4) f4) : f0_(f0), f1_(f1), f2_(f2), f3_(f3), f4_(f4) {}
+
+  tuple(const tuple& t) : f0_(t.f0_), f1_(t.f1_), f2_(t.f2_), f3_(t.f3_),
+      f4_(t.f4_) {}
+  // Converting copy: each field is constructed from the matching U field.
+  template <GTEST_5_TYPENAMES_(U)>
+  tuple(const GTEST_5_TUPLE_(U)& t) : f0_(t.f0_), f1_(t.f1_), f2_(t.f2_),
+      f3_(t.f3_), f4_(t.f4_) {}
+
+  tuple& operator=(const tuple& t) { return CopyFrom(t); }
+
+  template <GTEST_5_TYPENAMES_(U)>
+  tuple& operator=(const GTEST_5_TUPLE_(U)& t) {
+    return CopyFrom(t);
+  }
+
+  GTEST_DECLARE_TUPLE_AS_FRIEND_
+  // Field-wise assignment shared by both operator= overloads.
+  template <GTEST_5_TYPENAMES_(U)>
+  tuple& CopyFrom(const GTEST_5_TUPLE_(U)& t) {
+    f0_ = t.f0_;
+    f1_ = t.f1_;
+    f2_ = t.f2_;
+    f3_ = t.f3_;
+    f4_ = t.f4_;
+    return *this;
+  }
+
+  T0 f0_;
+  T1 f1_;
+  T2 f2_;
+  T3 f3_;
+  T4 f4_;
+};
+
+template <GTEST_6_TYPENAMES_(T)>
+class GTEST_6_TUPLE_(T) {  // Tuple with six fields, f0_ .. f5_.
+ public:
+  template <int k> friend class gtest_internal::Get;
+  // Value-initializes every field.
+  tuple() : f0_(), f1_(), f2_(), f3_(), f4_(), f5_() {}
+  // Initializes each field from the matching argument.
+  explicit tuple(GTEST_BY_REF_(T0) f0, GTEST_BY_REF_(T1) f1,
+      GTEST_BY_REF_(T2) f2, GTEST_BY_REF_(T3) f3, GTEST_BY_REF_(T4) f4,
+      GTEST_BY_REF_(T5) f5) : f0_(f0), f1_(f1), f2_(f2), f3_(f3), f4_(f4),
+      f5_(f5) {}
+
+  tuple(const tuple& t) : f0_(t.f0_), f1_(t.f1_), f2_(t.f2_), f3_(t.f3_),
+      f4_(t.f4_), f5_(t.f5_) {}
+  // Converting copy: each field is constructed from the matching U field.
+  template <GTEST_6_TYPENAMES_(U)>
+  tuple(const GTEST_6_TUPLE_(U)& t) : f0_(t.f0_), f1_(t.f1_), f2_(t.f2_),
+      f3_(t.f3_), f4_(t.f4_), f5_(t.f5_) {}
+
+  tuple& operator=(const tuple& t) { return CopyFrom(t); }
+
+  template <GTEST_6_TYPENAMES_(U)>
+  tuple& operator=(const GTEST_6_TUPLE_(U)& t) {
+    return CopyFrom(t);
+  }
+
+  GTEST_DECLARE_TUPLE_AS_FRIEND_
+  // Field-wise assignment shared by both operator= overloads.
+  template <GTEST_6_TYPENAMES_(U)>
+  tuple& CopyFrom(const GTEST_6_TUPLE_(U)& t) {
+    f0_ = t.f0_;
+    f1_ = t.f1_;
+    f2_ = t.f2_;
+    f3_ = t.f3_;
+    f4_ = t.f4_;
+    f5_ = t.f5_;
+    return *this;
+  }
+
+  T0 f0_;
+  T1 f1_;
+  T2 f2_;
+  T3 f3_;
+  T4 f4_;
+  T5 f5_;
+};
+
+template <GTEST_7_TYPENAMES_(T)>
+class GTEST_7_TUPLE_(T) {  // Tuple with seven fields, f0_ .. f6_.
+ public:
+  template <int k> friend class gtest_internal::Get;
+  // Value-initializes every field.
+  tuple() : f0_(), f1_(), f2_(), f3_(), f4_(), f5_(), f6_() {}
+  // Initializes each field from the matching argument.
+  explicit tuple(GTEST_BY_REF_(T0) f0, GTEST_BY_REF_(T1) f1,
+      GTEST_BY_REF_(T2) f2, GTEST_BY_REF_(T3) f3, GTEST_BY_REF_(T4) f4,
+      GTEST_BY_REF_(T5) f5, GTEST_BY_REF_(T6) f6) : f0_(f0), f1_(f1), f2_(f2),
+      f3_(f3), f4_(f4), f5_(f5), f6_(f6) {}
+
+  tuple(const tuple& t) : f0_(t.f0_), f1_(t.f1_), f2_(t.f2_), f3_(t.f3_),
+      f4_(t.f4_), f5_(t.f5_), f6_(t.f6_) {}
+  // Converting copy: each field is constructed from the matching U field.
+  template <GTEST_7_TYPENAMES_(U)>
+  tuple(const GTEST_7_TUPLE_(U)& t) : f0_(t.f0_), f1_(t.f1_), f2_(t.f2_),
+      f3_(t.f3_), f4_(t.f4_), f5_(t.f5_), f6_(t.f6_) {}
+
+  tuple& operator=(const tuple& t) { return CopyFrom(t); }
+
+  template <GTEST_7_TYPENAMES_(U)>
+  tuple& operator=(const GTEST_7_TUPLE_(U)& t) {
+    return CopyFrom(t);
+  }
+
+  GTEST_DECLARE_TUPLE_AS_FRIEND_
+  // Field-wise assignment shared by both operator= overloads.
+  template <GTEST_7_TYPENAMES_(U)>
+  tuple& CopyFrom(const GTEST_7_TUPLE_(U)& t) {
+    f0_ = t.f0_;
+    f1_ = t.f1_;
+    f2_ = t.f2_;
+    f3_ = t.f3_;
+    f4_ = t.f4_;
+    f5_ = t.f5_;
+    f6_ = t.f6_;
+    return *this;
+  }
+
+  T0 f0_;
+  T1 f1_;
+  T2 f2_;
+  T3 f3_;
+  T4 f4_;
+  T5 f5_;
+  T6 f6_;
+};
+
+template <GTEST_8_TYPENAMES_(T)>
+class GTEST_8_TUPLE_(T) {  // Tuple with eight fields, f0_ .. f7_.
+ public:
+  template <int k> friend class gtest_internal::Get;
+  // Value-initializes every field.
+  tuple() : f0_(), f1_(), f2_(), f3_(), f4_(), f5_(), f6_(), f7_() {}
+  // Initializes each field from the matching argument.
+  explicit tuple(GTEST_BY_REF_(T0) f0, GTEST_BY_REF_(T1) f1,
+      GTEST_BY_REF_(T2) f2, GTEST_BY_REF_(T3) f3, GTEST_BY_REF_(T4) f4,
+      GTEST_BY_REF_(T5) f5, GTEST_BY_REF_(T6) f6,
+      GTEST_BY_REF_(T7) f7) : f0_(f0), f1_(f1), f2_(f2), f3_(f3), f4_(f4),
+      f5_(f5), f6_(f6), f7_(f7) {}
+
+  tuple(const tuple& t) : f0_(t.f0_), f1_(t.f1_), f2_(t.f2_), f3_(t.f3_),
+      f4_(t.f4_), f5_(t.f5_), f6_(t.f6_), f7_(t.f7_) {}
+  // Converting copy: each field is constructed from the matching U field.
+  template <GTEST_8_TYPENAMES_(U)>
+  tuple(const GTEST_8_TUPLE_(U)& t) : f0_(t.f0_), f1_(t.f1_), f2_(t.f2_),
+      f3_(t.f3_), f4_(t.f4_), f5_(t.f5_), f6_(t.f6_), f7_(t.f7_) {}
+
+  tuple& operator=(const tuple& t) { return CopyFrom(t); }
+
+  template <GTEST_8_TYPENAMES_(U)>
+  tuple& operator=(const GTEST_8_TUPLE_(U)& t) {
+    return CopyFrom(t);
+  }
+
+  GTEST_DECLARE_TUPLE_AS_FRIEND_
+  // Field-wise assignment shared by both operator= overloads.
+  template <GTEST_8_TYPENAMES_(U)>
+  tuple& CopyFrom(const GTEST_8_TUPLE_(U)& t) {
+    f0_ = t.f0_;
+    f1_ = t.f1_;
+    f2_ = t.f2_;
+    f3_ = t.f3_;
+    f4_ = t.f4_;
+    f5_ = t.f5_;
+    f6_ = t.f6_;
+    f7_ = t.f7_;
+    return *this;
+  }
+
+  T0 f0_;
+  T1 f1_;
+  T2 f2_;
+  T3 f3_;
+  T4 f4_;
+  T5 f5_;
+  T6 f6_;
+  T7 f7_;
+};
+
+template <GTEST_9_TYPENAMES_(T)>
+class GTEST_9_TUPLE_(T) {  // Tuple with nine fields, f0_ .. f8_.
+ public:
+  template <int k> friend class gtest_internal::Get;
+  // Value-initializes every field.
+  tuple() : f0_(), f1_(), f2_(), f3_(), f4_(), f5_(), f6_(), f7_(), f8_() {}
+  // Initializes each field from the matching argument.
+  explicit tuple(GTEST_BY_REF_(T0) f0, GTEST_BY_REF_(T1) f1,
+      GTEST_BY_REF_(T2) f2, GTEST_BY_REF_(T3) f3, GTEST_BY_REF_(T4) f4,
+      GTEST_BY_REF_(T5) f5, GTEST_BY_REF_(T6) f6, GTEST_BY_REF_(T7) f7,
+      GTEST_BY_REF_(T8) f8) : f0_(f0), f1_(f1), f2_(f2), f3_(f3), f4_(f4),
+      f5_(f5), f6_(f6), f7_(f7), f8_(f8) {}
+
+  tuple(const tuple& t) : f0_(t.f0_), f1_(t.f1_), f2_(t.f2_), f3_(t.f3_),
+      f4_(t.f4_), f5_(t.f5_), f6_(t.f6_), f7_(t.f7_), f8_(t.f8_) {}
+  // Converting copy: each field is constructed from the matching U field.
+  template <GTEST_9_TYPENAMES_(U)>
+  tuple(const GTEST_9_TUPLE_(U)& t) : f0_(t.f0_), f1_(t.f1_), f2_(t.f2_),
+      f3_(t.f3_), f4_(t.f4_), f5_(t.f5_), f6_(t.f6_), f7_(t.f7_), f8_(t.f8_) {}
+
+  tuple& operator=(const tuple& t) { return CopyFrom(t); }
+
+  template <GTEST_9_TYPENAMES_(U)>
+  tuple& operator=(const GTEST_9_TUPLE_(U)& t) {
+    return CopyFrom(t);
+  }
+
+  GTEST_DECLARE_TUPLE_AS_FRIEND_
+  // Field-wise assignment shared by both operator= overloads.
+  template <GTEST_9_TYPENAMES_(U)>
+  tuple& CopyFrom(const GTEST_9_TUPLE_(U)& t) {
+    f0_ = t.f0_;
+    f1_ = t.f1_;
+    f2_ = t.f2_;
+    f3_ = t.f3_;
+    f4_ = t.f4_;
+    f5_ = t.f5_;
+    f6_ = t.f6_;
+    f7_ = t.f7_;
+    f8_ = t.f8_;
+    return *this;
+  }
+
+  T0 f0_;
+  T1 f1_;
+  T2 f2_;
+  T3 f3_;
+  T4 f4_;
+  T5 f5_;
+  T6 f6_;
+  T7 f7_;
+  T8 f8_;
+};
+
+template <GTEST_10_TYPENAMES_(T)>
+class tuple {  // Primary template: all ten fields, f0_ .. f9_, are present.
+ public:
+  template <int k> friend class gtest_internal::Get;
+  // Value-initializes every field.
+  tuple() : f0_(), f1_(), f2_(), f3_(), f4_(), f5_(), f6_(), f7_(), f8_(),
+      f9_() {}
+  // Initializes each field from the matching argument.
+  explicit tuple(GTEST_BY_REF_(T0) f0, GTEST_BY_REF_(T1) f1,
+      GTEST_BY_REF_(T2) f2, GTEST_BY_REF_(T3) f3, GTEST_BY_REF_(T4) f4,
+      GTEST_BY_REF_(T5) f5, GTEST_BY_REF_(T6) f6, GTEST_BY_REF_(T7) f7,
+      GTEST_BY_REF_(T8) f8, GTEST_BY_REF_(T9) f9) : f0_(f0), f1_(f1), f2_(f2),
+      f3_(f3), f4_(f4), f5_(f5), f6_(f6), f7_(f7), f8_(f8), f9_(f9) {}
+
+  tuple(const tuple& t) : f0_(t.f0_), f1_(t.f1_), f2_(t.f2_), f3_(t.f3_),
+      f4_(t.f4_), f5_(t.f5_), f6_(t.f6_), f7_(t.f7_), f8_(t.f8_), f9_(t.f9_) {}
+  // Converting copy: each field is constructed from the matching U field.
+  template <GTEST_10_TYPENAMES_(U)>
+  tuple(const GTEST_10_TUPLE_(U)& t) : f0_(t.f0_), f1_(t.f1_), f2_(t.f2_),
+      f3_(t.f3_), f4_(t.f4_), f5_(t.f5_), f6_(t.f6_), f7_(t.f7_), f8_(t.f8_),
+      f9_(t.f9_) {}
+
+  tuple& operator=(const tuple& t) { return CopyFrom(t); }
+
+  template <GTEST_10_TYPENAMES_(U)>
+  tuple& operator=(const GTEST_10_TUPLE_(U)& t) {
+    return CopyFrom(t);
+  }
+
+  GTEST_DECLARE_TUPLE_AS_FRIEND_
+  // Field-wise assignment shared by both operator= overloads.
+  template <GTEST_10_TYPENAMES_(U)>
+  tuple& CopyFrom(const GTEST_10_TUPLE_(U)& t) {
+    f0_ = t.f0_;
+    f1_ = t.f1_;
+    f2_ = t.f2_;
+    f3_ = t.f3_;
+    f4_ = t.f4_;
+    f5_ = t.f5_;
+    f6_ = t.f6_;
+    f7_ = t.f7_;
+    f8_ = t.f8_;
+    f9_ = t.f9_;
+    return *this;
+  }
+
+  T0 f0_;
+  T1 f1_;
+  T2 f2_;
+  T3 f3_;
+  T4 f4_;
+  T5 f5_;
+  T6 f6_;
+  T7 f7_;
+  T8 f8_;
+  T9 f9_;
+};
+
+// 6.1.3.2 Tuple creation functions.
+
+// Known limitations: we don't support passing an
+// std::tr1::reference_wrapper<T> to make_tuple().  And we don't
+// implement tie().
+// Each overload takes its arguments by const reference and returns by value.
+inline tuple<> make_tuple() { return tuple<>(); }
+
+template <GTEST_1_TYPENAMES_(T)>
+inline GTEST_1_TUPLE_(T) make_tuple(const T0& f0) {
+  return GTEST_1_TUPLE_(T)(f0);
+}
+
+template <GTEST_2_TYPENAMES_(T)>
+inline GTEST_2_TUPLE_(T) make_tuple(const T0& f0, const T1& f1) {
+  return GTEST_2_TUPLE_(T)(f0, f1);
+}
+
+template <GTEST_3_TYPENAMES_(T)>
+inline GTEST_3_TUPLE_(T) make_tuple(const T0& f0, const T1& f1, const T2& f2) {
+  return GTEST_3_TUPLE_(T)(f0, f1, f2);
+}
+
+template <GTEST_4_TYPENAMES_(T)>
+inline GTEST_4_TUPLE_(T) make_tuple(const T0& f0, const T1& f1, const T2& f2,
+    const T3& f3) {
+  return GTEST_4_TUPLE_(T)(f0, f1, f2, f3);
+}
+
+template <GTEST_5_TYPENAMES_(T)>
+inline GTEST_5_TUPLE_(T) make_tuple(const T0& f0, const T1& f1, const T2& f2,
+    const T3& f3, const T4& f4) {
+  return GTEST_5_TUPLE_(T)(f0, f1, f2, f3, f4);
+}
+
+template <GTEST_6_TYPENAMES_(T)>
+inline GTEST_6_TUPLE_(T) make_tuple(const T0& f0, const T1& f1, const T2& f2,
+    const T3& f3, const T4& f4, const T5& f5) {
+  return GTEST_6_TUPLE_(T)(f0, f1, f2, f3, f4, f5);
+}
+
+template <GTEST_7_TYPENAMES_(T)>
+inline GTEST_7_TUPLE_(T) make_tuple(const T0& f0, const T1& f1, const T2& f2,
+    const T3& f3, const T4& f4, const T5& f5, const T6& f6) {
+  return GTEST_7_TUPLE_(T)(f0, f1, f2, f3, f4, f5, f6);
+}
+
+template <GTEST_8_TYPENAMES_(T)>
+inline GTEST_8_TUPLE_(T) make_tuple(const T0& f0, const T1& f1, const T2& f2,
+    const T3& f3, const T4& f4, const T5& f5, const T6& f6, const T7& f7) {
+  return GTEST_8_TUPLE_(T)(f0, f1, f2, f3, f4, f5, f6, f7);
+}
+
+template <GTEST_9_TYPENAMES_(T)>
+inline GTEST_9_TUPLE_(T) make_tuple(const T0& f0, const T1& f1, const T2& f2,
+    const T3& f3, const T4& f4, const T5& f5, const T6& f6, const T7& f7,
+    const T8& f8) {
+  return GTEST_9_TUPLE_(T)(f0, f1, f2, f3, f4, f5, f6, f7, f8);
+}
+
+template <GTEST_10_TYPENAMES_(T)>
+inline GTEST_10_TUPLE_(T) make_tuple(const T0& f0, const T1& f1, const T2& f2,
+    const T3& f3, const T4& f4, const T5& f5, const T6& f6, const T7& f7,
+    const T8& f8, const T9& f9) {
+  return GTEST_10_TUPLE_(T)(f0, f1, f2, f3, f4, f5, f6, f7, f8, f9);
+}
+
+// 6.1.3.3 Tuple helper classes.
+// tuple_size<Tuple>::value is the number of fields in the tuple type Tuple.
+template <typename Tuple> struct tuple_size;
+
+template <GTEST_0_TYPENAMES_(T)>  // the typename list is empty: template <>
+struct tuple_size<GTEST_0_TUPLE_(T) > {
+  static const int value = 0;
+};
+
+template <GTEST_1_TYPENAMES_(T)>
+struct tuple_size<GTEST_1_TUPLE_(T) > {
+  static const int value = 1;
+};
+
+template <GTEST_2_TYPENAMES_(T)>
+struct tuple_size<GTEST_2_TUPLE_(T) > {
+  static const int value = 2;
+};
+
+template <GTEST_3_TYPENAMES_(T)>
+struct tuple_size<GTEST_3_TUPLE_(T) > {
+  static const int value = 3;
+};
+
+template <GTEST_4_TYPENAMES_(T)>
+struct tuple_size<GTEST_4_TUPLE_(T) > {
+  static const int value = 4;
+};
+
+template <GTEST_5_TYPENAMES_(T)>
+struct tuple_size<GTEST_5_TUPLE_(T) > {
+  static const int value = 5;
+};
+
+template <GTEST_6_TYPENAMES_(T)>
+struct tuple_size<GTEST_6_TUPLE_(T) > {
+  static const int value = 6;
+};
+
+template <GTEST_7_TYPENAMES_(T)>
+struct tuple_size<GTEST_7_TUPLE_(T) > {
+  static const int value = 7;
+};
+
+template <GTEST_8_TYPENAMES_(T)>
+struct tuple_size<GTEST_8_TUPLE_(T) > {
+  static const int value = 8;
+};
+
+template <GTEST_9_TYPENAMES_(T)>
+struct tuple_size<GTEST_9_TUPLE_(T) > {
+  static const int value = 9;
+};
+
+template <GTEST_10_TYPENAMES_(T)>
+struct tuple_size<GTEST_10_TUPLE_(T) > {
+  static const int value = 10;
+};
+
+template <int k, class Tuple>
+struct tuple_element {  // ::type is the type of field k of Tuple.
+  typedef typename gtest_internal::TupleElement<
+      k < (tuple_size<Tuple>::value), k, Tuple>::type type;
+};
+// Shorthand for tuple_element<k, Tuple>::type, including the "typename".
+#define GTEST_TUPLE_ELEMENT_(k, Tuple) typename tuple_element<k, Tuple >::type
+
+// 6.1.3.4 Element access.
+
+namespace gtest_internal {
+// Get<k>::Field(t) and Get<k>::ConstField(t) return a reference to t.fk_.
+template <>
+class Get<0> {
+ public:
+  template <class Tuple>
+  static GTEST_ADD_REF_(GTEST_TUPLE_ELEMENT_(0, Tuple))
+  Field(Tuple& t) { return t.f0_; }  // NOLINT
+
+  template <class Tuple>
+  static GTEST_BY_REF_(GTEST_TUPLE_ELEMENT_(0, Tuple))
+  ConstField(const Tuple& t) { return t.f0_; }
+};
+
+template <>
+class Get<1> {
+ public:
+  template <class Tuple>
+  static GTEST_ADD_REF_(GTEST_TUPLE_ELEMENT_(1, Tuple))
+  Field(Tuple& t) { return t.f1_; }  // NOLINT
+
+  template <class Tuple>
+  static GTEST_BY_REF_(GTEST_TUPLE_ELEMENT_(1, Tuple))
+  ConstField(const Tuple& t) { return t.f1_; }
+};
+
+template <>
+class Get<2> {
+ public:
+  template <class Tuple>
+  static GTEST_ADD_REF_(GTEST_TUPLE_ELEMENT_(2, Tuple))
+  Field(Tuple& t) { return t.f2_; }  // NOLINT
+
+  template <class Tuple>
+  static GTEST_BY_REF_(GTEST_TUPLE_ELEMENT_(2, Tuple))
+  ConstField(const Tuple& t) { return t.f2_; }
+};
+
+template <>
+class Get<3> {
+ public:
+  template <class Tuple>
+  static GTEST_ADD_REF_(GTEST_TUPLE_ELEMENT_(3, Tuple))
+  Field(Tuple& t) { return t.f3_; }  // NOLINT
+
+  template <class Tuple>
+  static GTEST_BY_REF_(GTEST_TUPLE_ELEMENT_(3, Tuple))
+  ConstField(const Tuple& t) { return t.f3_; }
+};
+
+template <>
+class Get<4> {
+ public:
+  template <class Tuple>
+  static GTEST_ADD_REF_(GTEST_TUPLE_ELEMENT_(4, Tuple))
+  Field(Tuple& t) { return t.f4_; }  // NOLINT
+
+  template <class Tuple>
+  static GTEST_BY_REF_(GTEST_TUPLE_ELEMENT_(4, Tuple))
+  ConstField(const Tuple& t) { return t.f4_; }
+};
+
+template <>
+class Get<5> {
+ public:
+  template <class Tuple>
+  static GTEST_ADD_REF_(GTEST_TUPLE_ELEMENT_(5, Tuple))
+  Field(Tuple& t) { return t.f5_; }  // NOLINT
+
+  template <class Tuple>
+  static GTEST_BY_REF_(GTEST_TUPLE_ELEMENT_(5, Tuple))
+  ConstField(const Tuple& t) { return t.f5_; }
+};
+
+template <>
+class Get<6> {
+ public:
+  template <class Tuple>
+  static GTEST_ADD_REF_(GTEST_TUPLE_ELEMENT_(6, Tuple))
+  Field(Tuple& t) { return t.f6_; }  // NOLINT
+
+  template <class Tuple>
+  static GTEST_BY_REF_(GTEST_TUPLE_ELEMENT_(6, Tuple))
+  ConstField(const Tuple& t) { return t.f6_; }
+};
+
+template <>
+class Get<7> {
+ public:
+  template <class Tuple>
+  static GTEST_ADD_REF_(GTEST_TUPLE_ELEMENT_(7, Tuple))
+  Field(Tuple& t) { return t.f7_; }  // NOLINT
+
+  template <class Tuple>
+  static GTEST_BY_REF_(GTEST_TUPLE_ELEMENT_(7, Tuple))
+  ConstField(const Tuple& t) { return t.f7_; }
+};
+
+template <>
+class Get<8> {
+ public:
+  template <class Tuple>
+  static GTEST_ADD_REF_(GTEST_TUPLE_ELEMENT_(8, Tuple))
+  Field(Tuple& t) { return t.f8_; }  // NOLINT
+
+  template <class Tuple>
+  static GTEST_BY_REF_(GTEST_TUPLE_ELEMENT_(8, Tuple))
+  ConstField(const Tuple& t) { return t.f8_; }
+};
+
+template <>
+class Get<9> {
+ public:
+  template <class Tuple>
+  static GTEST_ADD_REF_(GTEST_TUPLE_ELEMENT_(9, Tuple))
+  Field(Tuple& t) { return t.f9_; }  // NOLINT
+
+  template <class Tuple>
+  static GTEST_BY_REF_(GTEST_TUPLE_ELEMENT_(9, Tuple))
+  ConstField(const Tuple& t) { return t.f9_; }
+};
+
+}  // namespace gtest_internal
+
+template <int k, GTEST_10_TYPENAMES_(T)>
+GTEST_ADD_REF_(GTEST_TUPLE_ELEMENT_(k, GTEST_10_TUPLE_(T)))
+get(GTEST_10_TUPLE_(T)& t) {  // non-const tuple: forwards to Get<k>::Field
+  return gtest_internal::Get<k>::Field(t);
+}
+
+template <int k, GTEST_10_TYPENAMES_(T)>
+GTEST_BY_REF_(GTEST_TUPLE_ELEMENT_(k,  GTEST_10_TUPLE_(T)))
+get(const GTEST_10_TUPLE_(T)& t) {  // const tuple: uses Get<k>::ConstField
+  return gtest_internal::Get<k>::ConstField(t);
+}
+
+// 6.1.3.5 Relational operators
+
+// We only implement == and !=, as we don't have a need for the rest yet.
+
+namespace gtest_internal {
+
+// SameSizeTuplePrefixComparator<k, k>::Eq(t1, t2) returns true iff the
+// first k fields of t1 equal the first k fields of t2.
+// SameSizeTuplePrefixComparator<k1, k2> is a compiler error if
+// k1 != k2, because only the <0, 0> and <k, k> forms are defined.
+template <int kSize1, int kSize2>
+struct SameSizeTuplePrefixComparator;
+
+template <>
+struct SameSizeTuplePrefixComparator<0, 0> {
+  template <class Tuple1, class Tuple2>
+  static bool Eq(const Tuple1& /* t1 */, const Tuple2& /* t2 */) {
+    return true;  // empty prefixes are always equal; ends the recursion
+  }
+};
+
+template <int k>
+struct SameSizeTuplePrefixComparator<k, k> {
+  template <class Tuple1, class Tuple2>
+  static bool Eq(const Tuple1& t1, const Tuple2& t2) {
+    return SameSizeTuplePrefixComparator<k - 1, k - 1>::Eq(t1, t2) &&
+        ::std::tr1::get<k - 1>(t1) == ::std::tr1::get<k - 1>(t2);
+  }
+};
+
+}  // namespace gtest_internal
+
+template <GTEST_10_TYPENAMES_(T), GTEST_10_TYPENAMES_(U)>
+inline bool operator==(const GTEST_10_TUPLE_(T)& t,
+                       const GTEST_10_TUPLE_(U)& u) {
+  return gtest_internal::SameSizeTuplePrefixComparator<
+      tuple_size<GTEST_10_TUPLE_(T) >::value,
+      tuple_size<GTEST_10_TUPLE_(U) >::value>::Eq(t, u);
+}
+// Inequality is the negation of operator== above.
+template <GTEST_10_TYPENAMES_(T), GTEST_10_TYPENAMES_(U)>
+inline bool operator!=(const GTEST_10_TUPLE_(T)& t,
+                       const GTEST_10_TUPLE_(U)& u) { return !(t == u); }
+
+// 6.1.4 Pairs.
+// Unimplemented.
+
+}  // namespace tr1
+}  // namespace std
+
+#undef GTEST_0_TUPLE_  // the helper macros are removed before the header ends
+#undef GTEST_1_TUPLE_
+#undef GTEST_2_TUPLE_
+#undef GTEST_3_TUPLE_
+#undef GTEST_4_TUPLE_
+#undef GTEST_5_TUPLE_
+#undef GTEST_6_TUPLE_
+#undef GTEST_7_TUPLE_
+#undef GTEST_8_TUPLE_
+#undef GTEST_9_TUPLE_
+#undef GTEST_10_TUPLE_
+
+#undef GTEST_0_TYPENAMES_
+#undef GTEST_1_TYPENAMES_
+#undef GTEST_2_TYPENAMES_
+#undef GTEST_3_TYPENAMES_
+#undef GTEST_4_TYPENAMES_
+#undef GTEST_5_TYPENAMES_
+#undef GTEST_6_TYPENAMES_
+#undef GTEST_7_TYPENAMES_
+#undef GTEST_8_TYPENAMES_
+#undef GTEST_9_TYPENAMES_
+#undef GTEST_10_TYPENAMES_
+
+#undef GTEST_DECLARE_TUPLE_AS_FRIEND_
+#undef GTEST_BY_REF_
+#undef GTEST_ADD_REF_
+#undef GTEST_TUPLE_ELEMENT_
+
+#endif  // GTEST_INCLUDE_GTEST_INTERNAL_GTEST_TUPLE_H_
+# elif GTEST_ENV_HAS_STD_TUPLE_
+#  include <tuple>
+// C++11 puts its tuple into the ::std namespace rather than
+// ::std::tr1.  gtest expects tuple to live in ::std::tr1, so put it there.
+// This causes undefined behavior, but supported compilers react in
+// the way we intend.
+namespace std {
+namespace tr1 {
+using ::std::get;
+using ::std::make_tuple;
+using ::std::tuple;
+using ::std::tuple_element;
+using ::std::tuple_size;
+}  // namespace tr1
+}  // namespace std
+
+# elif GTEST_OS_SYMBIAN
+
+// On Symbian, BOOST_HAS_TR1_TUPLE causes Boost's TR1 tuple library to
+// use STLport's tuple implementation, which unfortunately doesn't
+// work as the copy of STLport distributed with Symbian is incomplete.
+// By making sure BOOST_HAS_TR1_TUPLE is undefined, we force Boost to
+// use its own tuple implementation.
+#  ifdef BOOST_HAS_TR1_TUPLE
+#   undef BOOST_HAS_TR1_TUPLE
+#  endif  // BOOST_HAS_TR1_TUPLE
+
+// This prevents <boost/tr1/detail/config.hpp>, which defines
+// BOOST_HAS_TR1_TUPLE, from being #included by Boost's <tuple>.
+#  define BOOST_TR1_DETAIL_CONFIG_HPP_INCLUDED
+#  include <tuple>
+
+# elif defined(__GNUC__) && (GTEST_GCC_VER_ >= 40000)
+// GCC 4.0+ implements tr1/tuple in the <tr1/tuple> header.  This does
+// not conform to the TR1 spec, which requires the header to be <tuple>.
+
+#  if !GTEST_HAS_RTTI && GTEST_GCC_VER_ < 40302
+// Until version 4.3.2, gcc has a bug that causes <tr1/functional>,
+// which is #included by <tr1/tuple>, to not compile when RTTI is
+// disabled.  _TR1_FUNCTIONAL is the header guard for
+// <tr1/functional>.  Hence the following #define is a hack to prevent
+// <tr1/functional> from being included.
+#   define _TR1_FUNCTIONAL 1
+#   include <tr1/tuple>
+#   undef _TR1_FUNCTIONAL  // Allows the user to #include
+                        // <tr1/functional> if he chooses to.
+#  else
+#   include <tr1/tuple>  // NOLINT
+#  endif  // !GTEST_HAS_RTTI && GTEST_GCC_VER_ < 40302
+
+# else
+// If the compiler is not GCC 4.0+, we assume the user is using a
+// spec-conforming TR1 implementation.
+#  include <tuple>  // NOLINT
+# endif  // GTEST_USE_OWN_TR1_TUPLE
+
+#endif  // GTEST_HAS_TR1_TUPLE
+
+// Determines whether clone(2) is supported.
+// Usually it will only be available on Linux, excluding
+// Linux on the Itanium architecture.
+// Also see http://linux.die.net/man/2/clone.
+#ifndef GTEST_HAS_CLONE
+// The user didn't tell us, so we need to figure it out.
+
+# if GTEST_OS_LINUX && !defined(__ia64__)
+#  if GTEST_OS_LINUX_ANDROID
+// On Android, clone() is only available on ARM starting with Gingerbread.
+#    if defined(__arm__) && __ANDROID_API__ >= 9
+#     define GTEST_HAS_CLONE 1
+#    else
+#     define GTEST_HAS_CLONE 0
+#    endif  // defined(__arm__) && __ANDROID_API__ >= 9
+#  else
+#   define GTEST_HAS_CLONE 1
+#  endif  // GTEST_OS_LINUX_ANDROID
+# else
+#  define GTEST_HAS_CLONE 0
+# endif  // GTEST_OS_LINUX && !defined(__ia64__)
+
+#endif  // GTEST_HAS_CLONE
+
+// Determines whether to support stream redirection. This is used to test
+// output correctness and to implement death tests.
+#ifndef GTEST_HAS_STREAM_REDIRECTION
+// By default, we assume that stream redirection is supported on all
+// platforms except known mobile ones.
+# if GTEST_OS_WINDOWS_MOBILE || GTEST_OS_SYMBIAN
+#  define GTEST_HAS_STREAM_REDIRECTION 0
+# else
+#  define GTEST_HAS_STREAM_REDIRECTION 1
+# endif  // GTEST_OS_WINDOWS_MOBILE || GTEST_OS_SYMBIAN
+#endif  // GTEST_HAS_STREAM_REDIRECTION
+
+// Determines whether to support death tests.
+// Google Test does not support death tests for VC 7.1 and earlier as
+// abort() in a VC 7.1 application compiled as GUI in debug config
+// pops up a dialog window that cannot be suppressed programmatically.
+#if (GTEST_OS_LINUX || GTEST_OS_CYGWIN || GTEST_OS_SOLARIS || \
+     (GTEST_OS_MAC && !GTEST_OS_IOS) || GTEST_OS_IOS_SIMULATOR || \
+     (GTEST_OS_WINDOWS_DESKTOP && _MSC_VER >= 1400) || \
+     GTEST_OS_WINDOWS_MINGW || GTEST_OS_AIX || GTEST_OS_HPUX || \
+     GTEST_OS_OPENBSD || GTEST_OS_QNX)
+# define GTEST_HAS_DEATH_TEST 1
+# include <vector>  // NOLINT
+#endif  // elsewhere GTEST_HAS_DEATH_TEST is not defined by this block
+
+// We don't support MSVC 7.1 with exceptions disabled now.  Therefore
+// all the compilers we care about are adequate for supporting
+// value-parameterized tests.
+#define GTEST_HAS_PARAM_TEST 1  // defined unconditionally
+
+// Determines whether to support type-driven tests.
+
+// Typed tests need <typeinfo> and variadic macros, which GCC, VC++ 8.0,
+// Sun Pro CC, IBM Visual Age, and HP aCC support.
+#if defined(__GNUC__) || (_MSC_VER >= 1400) || defined(__SUNPRO_CC) || \
+    defined(__IBMCPP__) || defined(__HP_aCC)
+# define GTEST_HAS_TYPED_TEST 1
+# define GTEST_HAS_TYPED_TEST_P 1
+#endif  // compilers listed above; otherwise both macros stay undefined
+
+// Determines whether to support Combine(). This only makes sense when
+// value-parameterized tests are enabled.  The implementation doesn't
+// work on Sun Studio since it doesn't understand templated conversion
+// operators.
+#if GTEST_HAS_PARAM_TEST && GTEST_HAS_TR1_TUPLE && !defined(__SUNPRO_CC)
+# define GTEST_HAS_COMBINE 1
+#endif  // GTEST_HAS_PARAM_TEST && GTEST_HAS_TR1_TUPLE && !__SUNPRO_CC
+
+// Determines whether the system compiler uses UTF-16 for encoding wide strings.
+#define GTEST_WIDE_STRING_USES_UTF16_ \
+    (GTEST_OS_WINDOWS || GTEST_OS_CYGWIN || GTEST_OS_SYMBIAN || GTEST_OS_AIX)
+
+// Determines whether test results can be streamed to a socket.
+#if GTEST_OS_LINUX
+# define GTEST_CAN_STREAM_RESULTS_ 1
+#endif  // GTEST_OS_LINUX
+
+// Defines some utility macros.
+
+// The GNU compiler emits a warning if nested "if" statements are followed by
+// an "else" statement and braces are not used to explicitly disambiguate the
+// "else" binding.  This leads to problems with code like:
+//
+//   if (gate)
+//     ASSERT_*(condition) << "Some message";
+//
+// The "switch (0) case 0: default:" idiom suppresses this (empty for Intel).
+#ifdef __INTEL_COMPILER
+# define GTEST_AMBIGUOUS_ELSE_BLOCKER_
+#else
+# define GTEST_AMBIGUOUS_ELSE_BLOCKER_ switch (0) case 0: default:  // NOLINT
+#endif  // __INTEL_COMPILER
+
+// Use this annotation at the end of a struct/class definition to
+// prevent the compiler from optimizing away instances that are never
+// used.  This is useful when all interesting logic happens inside the
+// c'tor and / or d'tor.  Example:
+//
+//   struct Foo {
+//     Foo() { ... }
+//   } GTEST_ATTRIBUTE_UNUSED_;
+//
+// Also use it after a variable or parameter declaration to tell the
+// compiler the variable/parameter does not have to be used.
+#if defined(__GNUC__) && !defined(COMPILER_ICC)
+# define GTEST_ATTRIBUTE_UNUSED_ __attribute__ ((unused))
+#else
+# define GTEST_ATTRIBUTE_UNUSED_
+#endif  // defined(__GNUC__) && !defined(COMPILER_ICC)
+
+// A macro to disallow operator= by declaring it (no ';' is supplied here).
+// This should be used in the private: declarations for a class.
+#define GTEST_DISALLOW_ASSIGN_(type)\
+  void operator=(type const &)
+
+// A macro to disallow copy constructor and operator= by declaring both.
+// This should be used in the private: declarations for a class.
+#define GTEST_DISALLOW_COPY_AND_ASSIGN_(type)\
+  type(type const &);\
+  GTEST_DISALLOW_ASSIGN_(type)
+
+// Tell the compiler to warn about unused return values for functions declared
+// with this macro.  The macro should be used on function declarations
+// following the argument list:
+//
+//   Sprocket* AllocateSprocket() GTEST_MUST_USE_RESULT_;
+#if defined(__GNUC__) && (GTEST_GCC_VER_ >= 30400) && !defined(COMPILER_ICC)
+# define GTEST_MUST_USE_RESULT_ __attribute__ ((warn_unused_result))
+#else
+# define GTEST_MUST_USE_RESULT_  // expands to nothing on other compilers
+#endif  // __GNUC__ && (GTEST_GCC_VER_ >= 30400) && !COMPILER_ICC
+
+// Determine whether the compiler supports Microsoft's Structured Exception
+// Handling.  This is supported by several Windows compilers but generally
+// does not exist on any other system.
+#ifndef GTEST_HAS_SEH
+// The user didn't tell us, so we need to figure it out.
+
+# if defined(_MSC_VER) || defined(__BORLANDC__)
+// These two compilers are known to support SEH.
+#  define GTEST_HAS_SEH 1
+# else
+// Assume no SEH.
+#  define GTEST_HAS_SEH 0
+# endif  // defined(_MSC_VER) || defined(__BORLANDC__)
+
+#endif  // GTEST_HAS_SEH
+
+#ifdef _MSC_VER
+// GTEST_API_ becomes a __declspec import/export marker for shared builds.
+# if GTEST_LINKED_AS_SHARED_LIBRARY
+#  define GTEST_API_ __declspec(dllimport)
+# elif GTEST_CREATE_SHARED_LIBRARY
+#  define GTEST_API_ __declspec(dllexport)
+# endif
+
+#endif  // _MSC_VER
+
+#ifndef GTEST_API_
+# define GTEST_API_  // nothing selected above: expands to nothing
+#endif
+
+#ifdef __GNUC__
+// Ask the compiler to never inline a given function.
+# define GTEST_NO_INLINE_ __attribute__((noinline))
+#else
+# define GTEST_NO_INLINE_  // expands to nothing on other compilers
+#endif  // __GNUC__
+
+// _LIBCPP_VERSION is defined by the libc++ library from the LLVM project.
+#if defined(__GLIBCXX__) || defined(_LIBCPP_VERSION)
+# define GTEST_HAS_CXXABI_H_ 1
+#else
+# define GTEST_HAS_CXXABI_H_ 0
+#endif
+
+namespace testing {
+
+class Message;
+
+namespace internal {
+
+// A secret type that Google Test users don't know about.  It has no
+// definition on purpose.  Therefore it's impossible to create a
+// Secret object, which is what we want.
+class Secret;
+
+// The GTEST_COMPILE_ASSERT_ macro can be used to verify that a compile time
+// expression is true. For example, you could use it to verify the
+// size of a static array:
+//
+//   GTEST_COMPILE_ASSERT_(ARRAYSIZE(content_type_names) == CONTENT_NUM_TYPES,
+//                         content_type_names_incorrect_size);
+//
+// or to make sure a struct is smaller than a certain size:
+//
+//   GTEST_COMPILE_ASSERT_(sizeof(foo) < 128, foo_too_large);
+//
+// The second argument to the macro is the name of the variable. If
+// the expression is false, most compilers will issue a warning/error
+// containing the name of the variable.
+
+template <bool>
+struct CompileAssert {
+};
+
+#define GTEST_COMPILE_ASSERT_(expr, msg) \
+  typedef ::testing::internal::CompileAssert<(static_cast<bool>(expr))> \
+      msg[static_cast<bool>(expr) ? 1 : -1] GTEST_ATTRIBUTE_UNUSED_
+
+// Implementation details of GTEST_COMPILE_ASSERT_:
+//
+// - GTEST_COMPILE_ASSERT_ works by defining an array type that has -1
+//   elements (and thus is invalid) when the expression is false.
+//
+// - The simpler definition
+//
+//    #define GTEST_COMPILE_ASSERT_(expr, msg) typedef char msg[(expr) ? 1 : -1]
+//
+//   does not work, as gcc supports variable-length arrays whose sizes
+//   are determined at run-time (this is gcc's extension and not part
+//   of the C++ standard).  As a result, gcc fails to reject the
+//   following code with the simple definition:
+//
+//     int foo;
+//     GTEST_COMPILE_ASSERT_(foo, msg); // not supposed to compile as foo is
+//                                      // not a compile-time constant.
+//
+// - By using the type CompileAssert<(bool(expr))>, we ensure that
+//   expr is a compile-time constant.  (Template arguments must be
+//   determined at compile-time.)
+//
+// - The outer parentheses in CompileAssert<(bool(expr))> are necessary
+//   to work around a bug in gcc 3.4.4 and 4.0.1.  If we had written
+//
+//     CompileAssert<bool(expr)>
+//
+//   instead, these compilers will refuse to compile
+//
+//     GTEST_COMPILE_ASSERT_(5 > 0, some_message);
+//
+//   (They seem to think the ">" in "5 > 0" marks the end of the
+//   template argument list.)
+//
+// - The array size is (bool(expr) ? 1 : -1), instead of simply
+//
+//     ((expr) ? 1 : -1).
+//
+//   This is to avoid running into a bug in MS VC 7.1, which
+//   causes ((0.0) ? 1 : -1) to incorrectly evaluate to 1.
+
+// StaticAssertTypeEqHelper is used by StaticAssertTypeEq defined in gtest.h.
+//
+// This template is declared, but intentionally undefined.
+template <typename T1, typename T2>
+struct StaticAssertTypeEqHelper;
+
+template <typename T>
+struct StaticAssertTypeEqHelper<T, T> {};
+
+#if GTEST_HAS_GLOBAL_STRING
+typedef ::string string;
+#else
+typedef ::std::string string;
+#endif  // GTEST_HAS_GLOBAL_STRING
+
+#if GTEST_HAS_GLOBAL_WSTRING
+typedef ::wstring wstring;
+#elif GTEST_HAS_STD_WSTRING
+typedef ::std::wstring wstring;
+#endif  // GTEST_HAS_GLOBAL_WSTRING
+
+// A helper for suppressing warnings on constant condition.  It just
+// returns 'condition'.
+GTEST_API_ bool IsTrue(bool condition);
+
+// Defines scoped_ptr.
+
+// A deliberately PARTIAL scoped_ptr: it offers only the operations that
+// Google Test itself needs.  It owns the pointee and deletes it on
+// destruction, or when reset() is handed a different pointer.
+template <typename T>
+class scoped_ptr {
+ public:
+  typedef T element_type;
+
+  explicit scoped_ptr(T* p = NULL) : ptr_(p) {}
+  ~scoped_ptr() { reset(); }
+
+  T& operator*() const { return *ptr_; }
+  T* operator->() const { return ptr_; }
+  T* get() const { return ptr_; }
+
+  // Gives up ownership: hands back the held pointer and leaves this object
+  // holding NULL, so nothing is deleted here.
+  T* release() {
+    T* const released = ptr_;
+    ptr_ = NULL;
+    return released;
+  }
+
+  // Takes ownership of p and deletes the object held so far.  Resetting to
+  // the pointer that is already held does nothing.
+  void reset(T* p = NULL) {
+    if (p == ptr_)
+      return;
+    // sizeof(T) makes sure T is a complete type at the point of deletion.
+    if (IsTrue(sizeof(T) > 0))
+      delete ptr_;
+    ptr_ = p;
+  }
+
+ private:
+  T* ptr_;
+
+  GTEST_DISALLOW_COPY_AND_ASSIGN_(scoped_ptr);
+};
+
+// Defines RE.
+
+// A simple C++ wrapper for <regex.h>.  It uses the POSIX Extended
+// Regular Expression syntax.
+class GTEST_API_ RE {
+ public:
+  // A copy constructor is required by the Standard to initialize object
+  // references from r-values.
+  // The copy is built by re-running Init() on the other object's pattern.
+  RE(const RE& other) { Init(other.pattern()); }
+
+  // Constructs an RE from a string.
+  RE(const ::std::string& regex) { Init(regex.c_str()); }  // NOLINT
+
+#if GTEST_HAS_GLOBAL_STRING
+
+  RE(const ::string& regex) { Init(regex.c_str()); }  // NOLINT
+
+#endif  // GTEST_HAS_GLOBAL_STRING
+
+  // Constructs an RE from a C string.  Like the constructors above it is
+  // intentionally not explicit, hence the NOLINT.
+  RE(const char* regex) { Init(regex); }  // NOLINT
+  ~RE();
+
+  // Returns the string representation of the regex.
+  const char* pattern() const { return pattern_; }
+
+  // FullMatch(str, re) returns true iff regular expression re matches
+  // the entire str.
+  // PartialMatch(str, re) returns true iff regular expression re
+  // matches a substring of str (including str itself).
+  //
+  // TODO(wan@google.com): make FullMatch() and PartialMatch() work
+  // when str contains NUL characters.
+  static bool FullMatch(const ::std::string& str, const RE& re) {
+    return FullMatch(str.c_str(), re);
+  }
+  static bool PartialMatch(const ::std::string& str, const RE& re) {
+    return PartialMatch(str.c_str(), re);
+  }
+
+#if GTEST_HAS_GLOBAL_STRING
+
+  static bool FullMatch(const ::string& str, const RE& re) {
+    return FullMatch(str.c_str(), re);
+  }
+  static bool PartialMatch(const ::string& str, const RE& re) {
+    return PartialMatch(str.c_str(), re);
+  }
+
+#endif  // GTEST_HAS_GLOBAL_STRING
+
+  // The string overloads above forward to these two, which are defined
+  // out of line.
+  static bool FullMatch(const char* str, const RE& re);
+  static bool PartialMatch(const char* str, const RE& re);
+
+ private:
+  // Shared by every constructor; defined out of line.
+  void Init(const char* regex);
+
+  // We use a const char* instead of an std::string, as Google Test used to be
+  // used where std::string is not available.  TODO(wan@google.com): change to
+  // std::string.
+  const char* pattern_;
+  // NOTE(review): presumably records whether Init() accepted the pattern;
+  // confirm against the out-of-line definition.
+  bool is_valid_;
+
+#if GTEST_USES_POSIX_RE
+
+  regex_t full_regex_;     // For FullMatch().
+  regex_t partial_regex_;  // For PartialMatch().
+
+#else  // GTEST_USES_SIMPLE_RE
+
+  const char* full_pattern_;  // For FullMatch();
+
+#endif
+
+  // Copy construction is allowed (see above); assignment is not.
+  GTEST_DISALLOW_ASSIGN_(RE);
+};
+
+// Formats a source file path and a line number as they would appear
+// in an error message from the compiler used to compile this code.
+GTEST_API_ ::std::string FormatFileLocation(const char* file, int line);
+
+// Formats a file location for compiler-independent XML output.
+// Although this function is not platform dependent, we put it next to
+// FormatFileLocation in order to contrast the two functions.
+GTEST_API_ ::std::string FormatCompilerIndependentFileLocation(const char* file,
+                                                               int line);
+
+// Defines logging utilities:
+//   GTEST_LOG_(severity) - logs messages at the specified severity level. The
+//                          message itself is streamed into the macro.
+//   LogToStderr()  - directs all log messages to stderr.
+//   FlushInfoLog() - flushes informational log messages.
+
+enum GTestLogSeverity {
+  GTEST_INFO,
+  GTEST_WARNING,
+  GTEST_ERROR,
+  GTEST_FATAL
+};
+
+// Formats log entry severity, provides a stream object for streaming the
+// log message, and terminates the message with a newline when going out of
+// scope.
+class GTEST_API_ GTestLog {
+ public:
+  // Defined out of line.  Takes the severity plus the source location of
+  // the logging statement (GTEST_LOG_ passes __FILE__ and __LINE__).
+  GTestLog(GTestLogSeverity severity, const char* file, int line);
+
+  // Flushes the buffers and, if severity is GTEST_FATAL, aborts the program.
+  ~GTestLog();
+
+  // Every severity is streamed to std::cerr.
+  ::std::ostream& GetStream() { return ::std::cerr; }
+
+ private:
+  // Severity this entry was created with.
+  const GTestLogSeverity severity_;
+
+  GTEST_DISALLOW_COPY_AND_ASSIGN_(GTestLog);
+};
+
+#define GTEST_LOG_(severity) \
+    ::testing::internal::GTestLog(::testing::internal::GTEST_##severity, \
+                                  __FILE__, __LINE__).GetStream()
+
+// Intentionally a no-op in this implementation.
+inline void LogToStderr() {}
+// fflush(NULL) flushes every open output stream, not only the log.
+inline void FlushInfoLog() { fflush(NULL); }
+
+// INTERNAL IMPLEMENTATION - DO NOT USE.
+//
+// GTEST_CHECK_ is an all-mode assert. It aborts the program if the condition
+// is not satisfied.
+//  Synopsis:
+//    GTEST_CHECK_(boolean_condition);
+//     or
+//    GTEST_CHECK_(boolean_condition) << "Additional message";
+//
+//    This checks the condition and if the condition is not satisfied
+//    it prints message about the condition violation, including the
+//    condition itself, plus additional message streamed into it, if any,
+//    and then it aborts the program. It aborts the program irrespective of
+//    whether it is built in the debug mode or not.
+#define GTEST_CHECK_(condition) \
+    GTEST_AMBIGUOUS_ELSE_BLOCKER_ \
+    if (::testing::internal::IsTrue(condition)) \
+      ; \
+    else \
+      GTEST_LOG_(FATAL) << "Condition " #condition " failed. "
+
+// An all-mode assert to verify that the given POSIX-style function
+// call returns 0 (indicating success).  Known limitation: this
+// doesn't expand to a balanced 'if' statement, so enclose the macro
+// in {} if you need to use it as the only statement in an 'if'
+// branch.
+#define GTEST_CHECK_POSIX_SUCCESS_(posix_call) \
+  if (const int gtest_error = (posix_call)) \
+    GTEST_LOG_(FATAL) << #posix_call << "failed with error " \
+                      << gtest_error
+
+// INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE.
+//
+// Use ImplicitCast_ as a safe version of static_cast for upcasting in
+// the type hierarchy (e.g. casting a Foo* to a SuperclassOfFoo* or a
+// const Foo*).  When you use ImplicitCast_, the compiler checks that
+// the cast is safe.  Such explicit ImplicitCast_s are necessary in
+// surprisingly many situations where C++ demands an exact type match
+// instead of an argument type convertible to a target type.
+//
+// The syntax for using ImplicitCast_ is the same as for static_cast:
+//
+//   ImplicitCast_<ToType>(expr)
+//
+// ImplicitCast_ would have been part of the C++ standard library,
+// but the proposal was submitted too late.  It will probably make
+// its way into the language in the future.
+//
+// This relatively ugly name is intentional. It prevents clashes with
+// similar functions users may have (e.g., implicit_cast). The internal
+// namespace alone is not enough because the function can be found by ADL.
+// Identity function: the conversion happens implicitly at the call site,
+// when the argument is bound to a parameter of the requested type To.
+template <typename To>
+inline To ImplicitCast_(To x) {
+  return x;
+}
+
+// When you upcast (that is, cast a pointer from type Foo to type
+// SuperclassOfFoo), it's fine to use ImplicitCast_<>, since upcasts
+// always succeed.  When you downcast (that is, cast a pointer from
+// type Foo to type SubclassOfFoo), static_cast<> isn't safe, because
+// how do you know the pointer is really of type SubclassOfFoo?  It
+// could be a bare Foo, or of type DifferentSubclassOfFoo.  Thus,
+// when you downcast, you should use this macro.  In debug mode, we
+// use dynamic_cast<> to double-check the downcast is legal (we die
+// if it's not).  In normal mode, we do the efficient static_cast<>
+// instead.  Thus, it's important to test in debug mode to make sure
+// the cast is legal!
+//    This is the only place in the code we should use dynamic_cast<>.
+// In particular, you SHOULDN'T be using dynamic_cast<> in order to
+// do RTTI, e.g. code like this:
+//    if (dynamic_cast<Subclass1>(foo)) HandleASubclass1Object(foo);
+//    if (dynamic_cast<Subclass2>(foo)) HandleASubclass2Object(foo);
+// You should design the code some other way not to need this.
+//
+// This relatively ugly name is intentional. It prevents clashes with
+// similar functions users may have (e.g., down_cast). The internal
+// namespace alone is not enough because the function can be found by ADL.
+template<typename To, typename From>  // use like this: DownCast_<T*>(foo);
+inline To DownCast_(From* f) {  // so we only accept pointers
+  // Ensures that To is a sub-type of From *.  This test is here only
+  // for compile-time type checking, and has no overhead in an
+  // optimized build at run-time, as it will be optimized away
+  // completely.
+  // The branch never runs; it only has to compile, which requires To to
+  // convert implicitly to From*.
+  if (false) {
+    const To to = NULL;
+    ::testing::internal::ImplicitCast_<From*>(to);
+  }
+
+#if GTEST_HAS_RTTI
+  // This check is compiled in whenever GTEST_HAS_RTTI is set; the guard is
+  // not a debug-build flag.  A NULL input passes; any other pointer must
+  // really point to a To.
+  GTEST_CHECK_(f == NULL || dynamic_cast<To>(f) != NULL);
+#endif
+  return static_cast<To>(f);
+}
+
+// Downcasts the pointer of type Base to Derived.
+// Derived must be a subclass of Base. The parameter MUST
+// point to a class of type Derived, not any subclass of it.
+// When RTTI is available, the function performs a runtime
+// check to enforce this.
+template <class Derived, class Base>
+Derived* CheckedDowncastToActualType(Base* base) {
+#if GTEST_HAS_RTTI
+  // With RTTI the dynamic type must be exactly Derived, not a subclass.
+  GTEST_CHECK_(typeid(*base) == typeid(Derived));
+  Derived* const derived = dynamic_cast<Derived*>(base);  // NOLINT
+#else
+  // Without RTTI nothing can be verified; fall back to a plain downcast.
+  Derived* const derived = static_cast<Derived*>(base);
+#endif
+  return derived;
+}
+
+#if GTEST_HAS_STREAM_REDIRECTION
+
+// Defines the stdout/stderr capturers:
+//   CaptureStdout     - starts capturing stdout.
+//   GetCapturedStdout - stops capturing stdout and returns the captured string.
+//   CaptureStderr     - starts capturing stderr.
+//   GetCapturedStderr - stops capturing stderr and returns the captured string.
+//
+GTEST_API_ void CaptureStdout();
+GTEST_API_ std::string GetCapturedStdout();
+GTEST_API_ void CaptureStderr();
+GTEST_API_ std::string GetCapturedStderr();
+
+#endif  // GTEST_HAS_STREAM_REDIRECTION
+
+
+#if GTEST_HAS_DEATH_TEST
+
+const ::std::vector<testing::internal::string>& GetInjectableArgvs();
+void SetInjectableArgvs(const ::std::vector<testing::internal::string>*
+                             new_argvs);
+
+// A copy of all command line arguments.  Set by InitGoogleTest().
+extern ::std::vector<testing::internal::string> g_argvs;
+
+#endif  // GTEST_HAS_DEATH_TEST
+
+// Defines synchronization primitives.
+
+#if GTEST_HAS_PTHREAD
+
+// Sleeps for (roughly) n milli-seconds.  This function is only for
+// testing Google Test's own constructs.  Don't use it in user tests,
+// either directly or indirectly.
+inline void SleepMilliseconds(int n) {
+  // nanosleep() rejects a tv_nsec outside [0, 999999999] with EINVAL and
+  // returns without sleeping, so whole seconds must be carried in tv_sec.
+  const timespec time = {
+    n / 1000,                    // Whole seconds.
+    (n % 1000) * 1000L * 1000L,  // Remaining milliseconds, in nanoseconds.
+  };
+  nanosleep(&time, NULL);
+}
+
+// Lets a controller thread hold newly created threads back until it calls
+// Notify().  Instances must be created and destroyed in the controller
+// thread.
+//
+// This class is only for testing Google Test's own constructs. Do not
+// use it in user tests, either directly or indirectly.
+class Notification {
+ public:
+  Notification() : notified_(false) {
+    GTEST_CHECK_POSIX_SUCCESS_(pthread_mutex_init(&mutex_, NULL));
+  }
+  ~Notification() {
+    pthread_mutex_destroy(&mutex_);
+  }
+
+  // Releases every thread waiting on this notification. Must be called
+  // from the controller thread.
+  void Notify() {
+    pthread_mutex_lock(&mutex_);
+    notified_ = true;
+    pthread_mutex_unlock(&mutex_);
+  }
+
+  // Polls the flag every 10 ms and returns once the controller thread has
+  // called Notify(). Must be called from a test thread.
+  void WaitForNotification() {
+    while (!ReadNotifiedFlag())
+      SleepMilliseconds(10);
+  }
+
+ private:
+  // Returns a snapshot of notified_ taken while holding mutex_.
+  bool ReadNotifiedFlag() {
+    pthread_mutex_lock(&mutex_);
+    const bool snapshot = notified_;
+    pthread_mutex_unlock(&mutex_);
+    return snapshot;
+  }
+
+  pthread_mutex_t mutex_;
+  bool notified_;
+
+  GTEST_DISALLOW_COPY_AND_ASSIGN_(Notification);
+};
+
+// As a C-function, ThreadFuncWithCLinkage cannot be templated itself.
+// Consequently, it cannot select a correct instantiation of ThreadWithParam
+// in order to call its Run(). Introducing ThreadWithParamBase as a
+// non-templated base class for ThreadWithParam allows us to bypass this
+// problem.
+class ThreadWithParamBase {
+ public:
+  virtual ~ThreadWithParamBase() {}
+  virtual void Run() = 0;
+};
+
+// pthread_create() takes a pointer to a function with C linkage.  The
+// Standard (7.5/1) treats function types with different linkages as
+// distinct even when they are otherwise identical, and some compilers
+// (SunStudio, for example) enforce that.  Class methods cannot have C
+// linkage, so this free C function is what gets passed to pthread_create().
+// It runs the ThreadWithParamBase it is given and returns NULL.
+extern "C" inline void* ThreadFuncWithCLinkage(void* thread) {
+  ThreadWithParamBase* const runnable =
+      static_cast<ThreadWithParamBase*>(thread);
+  runnable->Run();
+  return NULL;
+}
+
+// Helper class for testing Google Test's multi-threading constructs.
+// To use it, write:
+//
+//   void ThreadFunc(int param) { /* Do things with param */ }
+//   Notification thread_can_start;
+//   ...
+//   // The thread_can_start parameter is optional; you can supply NULL.
+//   ThreadWithParam<int> thread(&ThreadFunc, 5, &thread_can_start);
+//   thread_can_start.Notify();
+//
+// These classes are only for testing Google Test's own constructs. Do
+// not use them in user tests, either directly or indirectly.
+template <typename T>
+class ThreadWithParam : public ThreadWithParamBase {
+ public:
+  typedef void (*UserThreadFunc)(T);
+
+  // Starts the thread immediately.  func is called with param on the new
+  // thread; if thread_can_start is non-NULL the new thread first waits for
+  // that notification.
+  ThreadWithParam(
+      UserThreadFunc func, T param, Notification* thread_can_start)
+      : func_(func),
+        param_(param),
+        thread_can_start_(thread_can_start),
+        finished_(false) {
+    ThreadWithParamBase* const base = this;
+    // The thread can be created only after all fields except thread_
+    // have been initialized.
+    GTEST_CHECK_POSIX_SUCCESS_(
+        pthread_create(&thread_, 0, &ThreadFuncWithCLinkage, base));
+  }
+  // Joins the thread if Join() has not been called yet.
+  ~ThreadWithParam() { Join(); }
+
+  // Waits for the thread to finish.  Calls after the first are no-ops.
+  void Join() {
+    if (!finished_) {
+      GTEST_CHECK_POSIX_SUCCESS_(pthread_join(thread_, 0));
+      finished_ = true;
+    }
+  }
+
+  // Body of the new thread, reached through ThreadFuncWithCLinkage.
+  virtual void Run() {
+    if (thread_can_start_ != NULL)
+      thread_can_start_->WaitForNotification();
+    func_(param_);
+  }
+
+ private:
+  const UserThreadFunc func_;  // User-supplied thread function.
+  const T param_;  // User-supplied parameter to the thread function.
+  // When non-NULL, used to block execution until the controller thread
+  // notifies.
+  Notification* const thread_can_start_;
+  bool finished_;  // true iff we know that the thread function has finished.
+  pthread_t thread_;  // The native thread object.
+
+  GTEST_DISALLOW_COPY_AND_ASSIGN_(ThreadWithParam);
+};
+
+// MutexBase and Mutex implement mutex on pthreads-based platforms. They
+// are used in conjunction with class MutexLock:
+//
+//   Mutex mutex;
+//   ...
+//   MutexLock lock(&mutex);  // Acquires the mutex and releases it at the end
+//                            // of the current scope.
+//
+// MutexBase implements behavior for both statically and dynamically
+// allocated mutexes.  Do not use MutexBase directly.  Instead, write
+// the following to define a static mutex:
+//
+//   GTEST_DEFINE_STATIC_MUTEX_(g_some_mutex);
+//
+// You can forward declare a static mutex like this:
+//
+//   GTEST_DECLARE_STATIC_MUTEX_(g_some_mutex);
+//
+// To create a dynamic mutex, just define an object of type Mutex.
+class MutexBase {
+ public:
+  // Acquires this mutex.
+  void Lock() {
+    GTEST_CHECK_POSIX_SUCCESS_(pthread_mutex_lock(&mutex_));
+    // The ownership fields are written only after the lock is held.
+    owner_ = pthread_self();
+    has_owner_ = true;
+  }
+
+  // Releases this mutex.
+  void Unlock() {
+    // Since the lock is being released the owner_ field should no longer be
+    // considered valid. We don't protect writing to has_owner_ here, as it's
+    // the caller's responsibility to ensure that the current thread holds the
+    // mutex when this is called.
+    has_owner_ = false;
+    GTEST_CHECK_POSIX_SUCCESS_(pthread_mutex_unlock(&mutex_));
+  }
+
+  // Does nothing if the current thread holds the mutex. Otherwise, crashes
+  // with high probability.
+  void AssertHeld() const {
+    // has_owner_ is tested first so owner_ is only read when it is valid.
+    GTEST_CHECK_(has_owner_ && pthread_equal(owner_, pthread_self()))
+        << "The current thread is not holding the mutex @" << this;
+  }
+
+  // A static mutex may be used before main() is entered.  It may even
+  // be used before the dynamic initialization stage.  Therefore we
+  // must be able to initialize a static mutex object at link time.
+  // This means MutexBase has to be a POD and its member variables
+  // have to be public.
+ public:
+  pthread_mutex_t mutex_;  // The underlying pthread mutex.
+  // has_owner_ indicates whether the owner_ field below contains a valid thread
+  // ID and is therefore safe to inspect (e.g., to use in pthread_equal()). All
+  // accesses to the owner_ field should be protected by a check of this field.
+  // An alternative might be to memset() owner_ to all zeros, but there's no
+  // guarantee that a zero'd pthread_t is necessarily invalid or even different
+  // from pthread_self().
+  bool has_owner_;
+  pthread_t owner_;  // The thread holding the mutex.
+};
+
+// Forward-declares a static mutex.
+# define GTEST_DECLARE_STATIC_MUTEX_(mutex) \
+    extern ::testing::internal::MutexBase mutex
+
+// Defines and statically (i.e. at link time) initializes a static mutex.
+// The initialization list here does not explicitly initialize each field,
+// instead relying on default initialization for the unspecified fields. In
+// particular, the owner_ field (a pthread_t) is not explicitly initialized.
+// This allows initialization to work whether pthread_t is a scalar or struct.
+// The flag -Wmissing-field-initializers must not be specified for this to work.
+# define GTEST_DEFINE_STATIC_MUTEX_(mutex) \
+    ::testing::internal::MutexBase mutex = { PTHREAD_MUTEX_INITIALIZER, false }
+
+// The Mutex class can only be used for mutexes created at runtime. It
+// shares its API with MutexBase otherwise.
+class Mutex : public MutexBase {
+ public:
+  // Initializes the pthread mutex with default attributes.
+  Mutex() {
+    GTEST_CHECK_POSIX_SUCCESS_(pthread_mutex_init(&mutex_, NULL));
+    // MutexBase has no constructor, so the flag is cleared by hand here.
+    has_owner_ = false;
+  }
+  // Destroys the pthread mutex.
+  ~Mutex() {
+    GTEST_CHECK_POSIX_SUCCESS_(pthread_mutex_destroy(&mutex_));
+  }
+
+ private:
+  GTEST_DISALLOW_COPY_AND_ASSIGN_(Mutex);
+};
+
+// We cannot name this class MutexLock as the ctor declaration would
+// conflict with a macro named MutexLock, which is defined on some
+// platforms.  Hence the typedef trick below.
+class GTestMutexLock {
+ public:
+  // Locks the mutex for the lifetime of this object.  The pointer is
+  // dereferenced unconditionally, so it must not be NULL.
+  explicit GTestMutexLock(MutexBase* mutex)
+      : mutex_(mutex) { mutex_->Lock(); }
+
+  // Unlocks the mutex locked by the constructor.
+  ~GTestMutexLock() { mutex_->Unlock(); }
+
+ private:
+  MutexBase* const mutex_;  // Not owned.
+
+  GTEST_DISALLOW_COPY_AND_ASSIGN_(GTestMutexLock);
+};
+
+typedef GTestMutexLock MutexLock;
+
+// Helpers for ThreadLocal.
+
+// pthread_key_create() requires DeleteThreadLocalValue() to have
+// C-linkage.  Therefore it cannot be templatized to access
+// ThreadLocal<T>.  Hence the need for class
+// ThreadLocalValueHolderBase.
+class ThreadLocalValueHolderBase {
+ public:
+  virtual ~ThreadLocalValueHolderBase() {}
+};
+
+// Destructor callback handed to pthread: deletes the thread-local data
+// that was stored with pthread_setspecific().
+extern "C" inline void DeleteThreadLocalValue(void* value_holder) {
+  ThreadLocalValueHolderBase* const holder =
+      static_cast<ThreadLocalValueHolderBase*>(value_holder);
+  delete holder;
+}
+
+// Implements thread-local storage on pthreads-based systems.
+//
+//   // Thread 1
+//   ThreadLocal<int> tl(100);  // 100 is the default value for each thread.
+//
+//   // Thread 2
+//   tl.set(150);  // Changes the value for thread 2 only.
+//   EXPECT_EQ(150, tl.get());
+//
+//   // Thread 1
+//   EXPECT_EQ(100, tl.get());  // In thread 1, tl has the original value.
+//   tl.set(200);
+//   EXPECT_EQ(200, tl.get());
+//
+// The template type argument T must have a public copy constructor.
+// In addition, the default ThreadLocal constructor requires T to have
+// a public default constructor.
+//
+// An object managed for a thread by a ThreadLocal instance is deleted
+// when the thread exits.  Or, if the ThreadLocal instance dies in
+// that thread, when the ThreadLocal dies.  It's the user's
+// responsibility to ensure that all other threads using a ThreadLocal
+// have exited when it dies, or the per-thread objects for those
+// threads will not be deleted.
+//
+// Google Test only uses global ThreadLocal objects.  That means they
+// will die after main() has returned.  Therefore, no per-thread
+// object managed by Google Test will be leaked as long as all threads
+// using Google Test have exited when main() returns.
+template <typename T>
+class ThreadLocal {
+ public:
+  ThreadLocal() : key_(CreateKey()),
+                  default_() {}
+  explicit ThreadLocal(const T& value) : key_(CreateKey()),
+                                         default_(value) {}
+
+  ~ThreadLocal() {
+    // Destroys the managed object for the current thread, if any.
+    DeleteThreadLocalValue(pthread_getspecific(key_));
+
+    // Releases resources associated with the key.  This will *not*
+    // delete managed objects for other threads.
+    GTEST_CHECK_POSIX_SUCCESS

<TRUNCATED>

[03/12] incubator-singa git commit: Transfer code from nusinga repo to singa apache repo. New commuinication framework is implemented to unify the frameworks of existing distributed deep learning systems. Communication is now implmented using ZeroMQ. API

Posted by wa...@apache.org.
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/b2dc51d2/src/proto/model.proto
----------------------------------------------------------------------
diff --git a/src/proto/model.proto b/src/proto/model.proto
new file mode 100644
index 0000000..4ea621d
--- /dev/null
+++ b/src/proto/model.proto
@@ -0,0 +1,382 @@
+package singa;
+enum MsgType{
+  kGet=0;
+  kPut=1;
+  kSync=2;
+  kUpdate=3;
+  kSyncRequest=4;
+  kSyncResponse=5;
+  kStop=6;
+  kData=7;
+  kRGet=8;
+  kRUpdate=9;
+  kConnect=10;
+};
+
+enum EntityType{
+  kWorkerParam=0;
+  kWorkerLayer=1;
+  kServer=2;
+  kStub=3;
+};
+enum Phase {
+  kTrain = 0;
+  kValidation=1;
+  kTest= 2;
+}
+enum ShareOption{
+  kValueOnly=0;
+  kWhole=1;
+};
+// Top-level training job configuration: data folders, display / validation /
+// test / checkpoint schedules, step counts, updater and net definition.
+message ModelProto{
+  optional string name = 1;
+  // relative path to system folder
+  optional string train_folder=2 [default="train"];
+  optional string test_folder=3 [default="test"];
+  optional string validation_folder=4 [default="validation"];
+  // start display after this num steps
+  optional int32 display_after_steps = 6 [default = 0];
+  // frequency of display
+  optional int32 display_frequency = 7 [default = 0];
+
+  // the time of validation
+  //optional int32 validation_step = 9 [default = 0];
+  // start validation after this num steps
+  optional int32 validation_after_steps = 10 [default = 0];
+  // frequency of validation
+  optional int32 validation_frequency = 11 [default = 0];
+
+  // the time of test
+  //optional int32 test_step = 12 [default = 0];
+  // start test after this num steps
+  optional int32 test_after_steps = 13 [default = 0];
+  // frequency of test
+  optional int32 test_frequency = 14 [default = 0];
+  // start checkpoint after this num steps
+  optional int32 checkpoint_after_steps = 15 [default = 0];
+  // frequency of checkpoint
+  optional int32 checkpoint_frequency = 16 [default = 0];
+  // NOTE(review): presumably enables prefetching of input data; confirm
+  // against the code that reads this field.
+  optional bool prefetch=18[default=true];
+
+
+  // total num of steps for training
+  optional int32 train_steps = 20;
+  // total num of steps for validation
+  optional int32 validation_steps=21;
+  // total num of steps for test
+  optional int32 test_steps=22;
+  // last snapshot step
+  optional int32 step=29 [default=0];
+
+  optional UpdaterProto updater=31;
+  // There are two basic algorithms for calculating gradients.
+  // Different deep learning models use different algorithms.
+  enum GradCalcAlg{
+    kBackPropagation = 1;
+    kContrastiveDivergence = 2;
+  }
+  optional GradCalcAlg alg= 32 [default = kBackPropagation];
+  optional bool hogwild=33 [default=false];
+  // the neural net structure (list of layers)
+  optional NetProto neuralnet = 40;
+  optional bool debug=41 [default=false];
+}
+
+message NetProto{
+  repeated LayerProto layer=1;
+  optional PartitionType partition_type=3 [default=kNone];
+}
+
+// Configuration of one parameter (e.g. a weight matrix or bias vector):
+// identity, shape, partitioning, initialization and per-parameter
+// multipliers.
+message ParamProto {
+  // for the program to identify it and share among layers.
+  // e.g., "conv1_weight","fc_bias"
+  optional string name = 1;
+  optional int32 id=2;
+  // in most situations, user do not need to config this,
+  // the program will calculate it
+  repeated int32 shape = 3;
+
+  // split the parameter into multiple DAryProtos for serialization and
+  // transferring (Google Protobuf has size limit)
+  optional int32 split_threshold=4 [default=5000000];
+  // partition dimension, -1 for no partition
+  optional int32 partition_dim=5 [default =-1];
+
+  optional int32 version=6;
+
+  // value of the parameter
+  // (disabled; note that its tag 6 is now used by `version` above)
+  //repeated DAryProto ary = 6;
+
+  enum InitMethod {
+    kConstant = 0;
+    // sample gaussian with std and mean
+    kGaussian = 1;
+    // uniform sampling between low and high
+    kUniform = 2;
+    // copy the content and history which are from previous training
+    kPretrained = 3;
+    // from Toronto Convnet, let a=1/sqrt(fan_in), w*=a after generating from
+    // Gaussian distribution
+    // (the identifier is misspelled "Gaussain"; it is kept as is because
+    // renaming it would break code and configs that reference it)
+    kGaussainSqrtFanIn = 4;
+    // from Toronto Convnet, rectified linear activation, let
+    // a=sqrt(3)/sqrt(fan_in), range is [-a, +a]; no need to set value=sqrt(3),
+    // the program will multiply it.
+    kUniformSqrtFanIn = 5;
+    // from Theano MLP tutorial, let a=1/sqrt(fan_in+fan_out). for tanh
+    // activation, range is [-6a, +6a], for sigmoid activation, range is
+    // [-24a, +24a], put the scale factor to value field.
+    // <a href="http://deeplearning.net/tutorial/mlp.html"> Theano MLP</a>
+    kUniformSqrtFanInOut = 6;
+  }
+  optional InitMethod init_method = 7 [default = kConstant];
+  // constant init
+  optional float value = 8 [default = 1];
+  // for uniform sampling
+  optional float low = 9 [default = -1];
+  optional float high = 10 [default = 1];
+  // for gaussian sampling
+  optional float mean = 11 [default = 0];
+  optional float std = 12 [default = 1];
+  // multiplied on the global learning rate.
+  optional float learning_rate_multiplier =13 [default=1];
+  // multiplied on the global weight decay.
+  optional float weight_decay_multiplier =14 [default=1];
+}
+
+message BlobProtos{
+  repeated BlobProto blobs=1;
+  repeated int32 ids=2;
+  repeated string names=3;
+}
+
+
+
+enum PartitionType{
+  kDataPartition=0;
+  kLayerPartition=1;
+  kNone=2;
+}
+enum ConnectionType{
+  kOneToOne=0;
+  kOneToAll=1;
+}
+
+// Configuration of a single layer: identity, source layers, placement,
+// parameters, and one optional hyper-parameter message per layer kind.
+message LayerProto {
+  optional string name = 1; // the layer name
+  optional string type = 2; // the layer type, given as a string
+  // names of the layers this layer takes its input from
+  repeated string srclayers=3;
+  optional int32 locationid=4 [default=0]; // todo make locationID an array
+  optional int32 partitionid=5 [default=0];
+  optional PartitionType partition_type=6;
+  // can be pos/neg neuron value for CD, neuron value/grad for BP
+  //repeated DAryProto ary = 10;
+  repeated string share_ary =11;
+  // parameters, e.g., weight matrix or bias vector
+  repeated ParamProto param = 12;
+  // names of parameters shared from other layers
+  repeated string share_param=13;
+
+  // All layers are included in the net structure for training phase by default.
+  // Layers, e.g., computing performance metrics for test phase, can be excluded
+  // by this field which defines in which phase this layer should be excluded.
+  repeated Phase exclude = 20;
+
+  // hyper-parameters for layers
+  // (field numbers are not in declaration order; they must not be changed)
+  optional ConvolutionProto convolution_param = 21;
+  optional ConcateProto concate_param = 31;
+  optional DataProto data_param = 22;
+  optional DropoutProto dropout_param = 23;
+  optional InnerProductProto inner_product_param = 24;
+  optional LRNProto lrn_param = 25;
+  optional MnistProto mnist_param= 26;
+  optional PoolingProto pooling_param = 27;
+  optional SliceProto slice_param = 32;
+  optional SplitProto split_param = 33;
+  optional ReLUProto relu_param = 28;
+  optional RGBImage rgbimage_param=34;
+  optional SoftmaxLossProto softmaxloss_param = 29;
+  optional TanhProto tanh_param=30;
+}
+// Parameters referenced by LayerProto.rgbimage_param.
+// NOTE(review): the comments below are read off the field names only --
+// confirm against the layer implementation.
+message RGBImage {
+  // presumably a multiplier applied to pixel values; 1.0 leaves them unchanged
+  optional float scale=1 [default=1.0];
+  // presumably the side length of a crop; default 0
+  optional int32 cropsize=2 [default=0];
+  // presumably enables mirroring of the image; off by default
+  optional bool mirror=3 [default=false];
+  // presumably a path to a file holding the mean image
+  optional string meanfile=4;
+}
+// Parameters referenced by LayerProto.split_param.
+message SplitProto{
+  // number of splits; no default is declared
+  optional int32 num_splits=1;
+}
+// Parameters of the scaled tanh layer (LayerProto.tanh_param).
+// NOTE(review): the original comment read "A*tan(B*x)"; given the message
+// name this is presumably outer_scale * tanh(inner_scale * x) -- confirm.
+message TanhProto{
+  optional float outer_scale=1 [default=1.0];
+  optional float inner_scale=2 [default=1.0];
+}
+
+// Message that stores parameters used by the softmax loss layer
+// (LayerProto.softmaxloss_param).
+message SoftmaxLossProto {
+  // Accuracy is computed only when topk>0.
+  // NOTE(review): the original note said accuracy is "not computed by
+  // default", but the declared default is 1 -- confirm the intended default.
+  // When computing accuracy, count as correct by comparing the true label to
+  // the top k scoring classes.
+  optional int32 topk = 1 [default=1] ;
+  // NOTE(review): presumably a multiplier applied to the loss -- confirm.
+  optional float scale=2 [default=1];
+}
+// Message that stores parameters used by ConvolutionLayer
+message ConvolutionProto {
+  optional uint32 num_filters = 1; // The number of outputs for the layer
+  optional bool bias_term = 2 [default = true]; // whether to have bias terms
+  // Pad, kernel size, and stride are each a single value applied to both
+  // height and width. (The original note also mentioned Y, X pairs, but no
+  // such fields are declared here.)
+  optional uint32 pad = 3 [default = 0]; // The padding size (equal in Y, X)
+  optional uint32 stride = 4 [default = 1]; // The stride (equal in Y, X)
+  // required: a message without it fails protobuf's initialization check
+  required uint32 kernel= 5; // The kernel height/width
+}
+
+// Parameters referenced by LayerProto.concate_param.
+// NOTE(review): presumably the dimension to concatenate along and the number
+// of inputs being concatenated -- confirm against the layer implementation.
+message ConcateProto{
+  optional int32 concate_dimension=1;
+  optional int32 concate_num=2;
+}
+
+// Message that stores parameters used by DataLayer
+// Note: field number 3 is not used by this message.
+message DataProto {
+  // Specify the data source.
+  optional string source = 1;
+  // path to the data file/folder, absolute or relative to the
+  // ClusterProto::workspace
+  optional string path=2;
+  // Specify the batch size.
+  optional uint32 batchsize = 4;
+  // skip [0,random_skip] records
+  // NOTE(review): presumably a random number of leading records in that
+  // range is skipped -- confirm.
+  optional uint32 random_skip=5 [default=0];
+}
+
+// Parameters referenced by LayerProto.mnist_param; the groups below follow
+// the original inline comments (distortion, rotation/shearing, scaling).
+message MnistProto {
+  // elastic distortion
+  optional int32 kernel=1 [default=0];
+  optional float sigma=2 [default=0];
+  optional float alpha=3 [default=0];
+  // rotation or horizontal shearing
+  optional float beta=4 [default=0];
+  // scaling
+  optional float gamma=5 [default=0];
+  // scale to this size as input for deformation
+  optional int32 resize=6 [default=0] ;
+  // NOTE(review): presumably how often the elastic distortion is applied --
+  // confirm.
+  optional int32 elastic_freq=7 [default=0];
+  // NOTE(review): presumably normalization constants applied to pixel values;
+  // the exact formula is not visible here -- confirm.
+  optional float norm_a=8 [default=1];
+  optional float norm_b=9 [default=0];
+}
+// Message that stores parameters used by DropoutLayer
+message DropoutProto {
+  // defaults to 0.5 when not set
+  optional float dropout_ratio = 1 [default = 0.5]; // dropout ratio
+}
+// Message that stores parameters used by InnerProductLayer
+message InnerProductProto {
+  // no default is declared for num_output
+  optional uint32 num_output = 1; // The number of outputs for the layer
+  optional bool bias_term = 2 [default = true]; // whether to have bias terms
+}
+
+// Message that stores parameters used by LRNLayer
+// NOTE(review): presumably local_size is the normalization window and
+// alpha/beta/knorm are the constants of the LRN formula -- confirm against
+// the layer implementation.
+message LRNProto {
+  optional uint32 local_size = 1 [default = 5];
+  optional float alpha = 2 [default = 1.];
+  optional float beta = 3 [default = 0.75];
+  // region the normalization runs over
+  enum NormRegion {
+    ACROSS_CHANNELS = 0;
+    WITHIN_CHANNEL = 1;
+  }
+  optional NormRegion norm_region = 4 [default = ACROSS_CHANNELS];
+  optional float knorm =5 [default=1.0];
+}
+
+// Message that stores parameters used by PoolingLayer
+message PoolingProto {
+  // available pooling methods
+  enum PoolMethod {
+    MAX = 0;
+    AVE = 1;
+  }
+  optional PoolMethod pool = 1 [default = MAX]; // The pooling method
+  // Pad, kernel size, and stride are each a single value applied to both
+  // height and width. (The original note also mentioned Y, X pairs, but no
+  // such fields are declared here.)
+  // required: a message without it fails protobuf's initialization check
+  required uint32 kernel= 2; // The kernel size (square)
+  // note: pad uses field number 4 and stride field number 3
+  optional uint32 pad = 4 [default = 0]; // The padding size (equal in Y, X)
+  optional uint32 stride = 3 [default = 1]; // The stride (equal in Y, X)
+}
+
+// Parameters referenced by LayerProto.slice_param.
+// NOTE(review): presumably the dimension to slice along and the number of
+// slices produced -- confirm against the layer implementation.
+message SliceProto{
+  optional int32 slice_dimension=1;
+  optional int32 slice_num=2;
+}
+// Message that stores parameters used by ReLULayer
+message ReLUProto {
+  // Allow non-zero slope for negative inputs to speed up optimization
+  // Described in:
+  // Maas, A. L., Hannun, A. Y., & Ng, A. Y. (2013). Rectifier nonlinearities
+  // improve neural network acoustic models. In ICML Workshop on Deep Learning
+  // for Audio, Speech, and Language Processing.
+  // The default is 0.
+  optional float negative_slope = 1 [default = 0];
+}
+
+
+
+// A data record tagged with its kind; the only kind declared so far is a
+// single-label image, whose payload is carried in `image`.
+message Record {
+  enum Type{
+    kSingleLabelImage=0;
+  }
+  optional Type type=1 [default=kSingleLabelImage];
+  optional SingleLabelImageRecord image=2;
+}
+
+// Record layout used to import caffe's lmdb dataset
+message Datum {
+  // image dimensions
+  optional int32 channels = 1;
+  optional int32 height = 2;
+  optional int32 width = 3;
+  // the actual image data, in bytes
+  optional bytes data = 4;
+  optional int32 label = 5;
+  // Optionally, the datum could also hold float data.
+  repeated float float_data = 6;
+  // If true data contains an encoded image that need to be decoded
+  optional bool encoded = 7 [default = false];
+}
+// Payload of Record for an image carrying one label.
+// NOTE(review): presumably the image is stored either as raw bytes (`pixel`)
+// or as floats (`data`), with `shape` giving its dimensions -- confirm.
+message SingleLabelImageRecord{
+  repeated int32 shape=1;
+  optional int32 label=2;
+  optional bytes pixel=3;
+  repeated float data=4;
+}
+
+// Settings of the parameter updater: momentum, weight decay, the learning
+// rate and how it changes over time, and synchronization settings.
+// Note: field numbering starts at 4 and has gaps.
+message UpdaterProto {
+  optional float momentum=4 [default=0];
+  optional float weight_decay=5 [default=0];
+  // used in changing learning rate
+  optional float gamma = 6 [default=1];
+  optional float pow=7 [default=0];
+  // NOTE(review): delta and rho are presumably constants of an adaptive
+  // update rule; which rule is not visible here -- confirm.
+  optional float delta=8 [default=0.0000001];
+  optional float rho=9 [default=0.9];
+  // no defaults are declared for the next three fields
+  optional float base_learning_rate=12;
+  optional float final_learning_rate=13;
+  optional int32 learning_rate_change_frequency = 14;
+  // available learning-rate schedules; the formulas live in the updater code
+  enum ChangeProto {
+    kFixed = 0;
+    kInverse_t= 1;
+    kInverse= 2;
+    kExponential = 3;
+    kLinear = 4;
+    kStep = 5;
+    kFixedStep=6;
+  }
+  optional ChangeProto learning_rate_change_method = 16 [default = kFixed];
+  optional int32 sync_frequency=17 [default=1];
+  // warmup the parameters and then send to parameter servers.
+  optional int32 warmup_steps=25 [default=10];
+  optional float moving_rate=26 [default=0];
+  optional string param_type=27[default="Param"];
+  // NOTE(review): presumably step[i] / step_lr[i] pair a step count with the
+  // learning rate to use from that step on (kFixedStep) -- confirm.
+  repeated int32 step=28;
+  repeated float step_lr=29;
+}
+// A blob of floats described by four dimensions (num, channels, height,
+// width), each defaulting to 0, with two packed float arrays.
+// NOTE(review): presumably `data` holds the values and `diff` the gradients,
+// as in Caffe's BlobProto -- confirm.
+message BlobProto {
+  optional int32 num = 1 [default = 0];
+  optional int32 channels = 2 [default = 0];
+  optional int32 height = 3 [default = 0];
+  optional int32 width = 4 [default = 0];
+  repeated float data = 5 [packed = true];
+  repeated float diff = 6 [packed = true];
+}

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/b2dc51d2/src/test/dist_test/test_consistency.cc
----------------------------------------------------------------------
diff --git a/src/test/dist_test/test_consistency.cc b/src/test/dist_test/test_consistency.cc
new file mode 100644
index 0000000..a4ed9b2
--- /dev/null
+++ b/src/test/dist_test/test_consistency.cc
@@ -0,0 +1,406 @@
+//  Copyright © 2014 Anh Dinh. All Rights Reserved.
+
+//  Testing the imbalance in splitting parameter vectors.
+
+#include "core/global-table.h"
+#include "core/common.h"
+#include "core/disk-table.h"
+#include "core/table.h"
+#include "core/table_server.h"
+#include "utils/global_context.h"
+#include <gflags/gflags.h>
+#include "proto/model.pb.h"
+#include "proto/common.pb.h"
+#include "worker.h"
+#include "coordinator.h"
+#include "utils/common.h"
+#include "utils/proto_helper.h"
+
+#include <cmath>
+#include <stdlib.h>
+#include <vector>
+#include <iostream>
+#include <fstream>
+
+
+// Command-line flags (gflags) of the consistency test.
+// restore_mode is read by coordinator_load_data(), HandleShardAssignment() and
+// main() in this file.
+DEFINE_bool(restore_mode, false, "restore from checkpoint file");
+using namespace lapis;
+using std::vector;
+
+//DEFINE_bool(sync_update, false, "Synchronous put/update queue");
+// The next four flags are not read anywhere in this file.
+// NOTE(review): `restore` duplicates the help text of `restore_mode` --
+// confirm whether both are needed.
+DEFINE_int32(checkpoint_frequency, 5000, "frequency for cp");
+DEFINE_int32(checkpoint_after, 1, "cp after this steps");
+DEFINE_string(par_mode, "hybrid",  "time training algorithm");
+DEFINE_bool(restore, false, "restore from checkpoint file");
+
+DEFINE_string(db_backend, "lmdb", "backend db");
+// configuration files read in main()
+DEFINE_string(system_conf, "examples/imagenet12/system.conf", "configuration file for node roles");
+DEFINE_string(model_conf, "examples/imagenet12/model.conf", "DL model configuration file");
+// directory in which HandleShardAssignment() places checkpoint_<shard> files
+DEFINE_string(checkpoint_dir,"/data1/wangwei/lapis/","check point dir");
+// maximum number of values stored under one key; longer vectors are split
+DEFINE_int32(threshold,1000000, "max # of parameters in a vector");
+DEFINE_int32(iterations,5,"numer of get/put iterations");
+DEFINE_int32(workers,2,"numer of workers doing get/put");
+// defined in another translation unit; read in HandleShardAssignment()
+DECLARE_bool(checkpoint_enabled);
+
+// NOTE(review): #ifndef tests for a preprocessor macro named FLAGS_v. If
+// FLAGS_v is only a variable (as flag definitions normally are), this guard
+// is always true -- confirm it does what was intended.
+#ifndef FLAGS_v
+  DEFINE_int32(v, 3, "vlog controller");
+#endif
+
+
+// Update handler installed on the test table (see create_mem_table).
+//  - Update: element-wise adds the first gradient of `b` onto the data of `a`.
+//  - Get: hands back a copy of the stored value.
+//  - is_checkpointable: every entry is checkpointed.
+struct AnhUpdateHandler: BaseUpdateHandler<VKey,SGDValue>{
+  bool Update(SGDValue *a, const SGDValue &b){
+    // destination: the stored values; source: values of b's gradient 0
+    float *dst = a->mutable_data()->mutable_value()->mutable_data();
+    const float *src = b.grad(0).value().data();
+    // the gradient's length drives the loop; `a` is not size-checked
+    int idx = 0;
+    while (idx < b.grad(0).value_size()) {
+      dst[idx] += src[idx];
+      ++idx;
+    }
+    return true;
+  }
+
+  bool Get(const VKey k, const SGDValue &val, SGDValue *ret){
+    // plain copy of the stored value; the key is not consulted
+    *ret = val;
+    return true;
+  }
+
+  bool is_checkpointable(const VKey k, const SGDValue v){
+    // always checkpoint
+    return true;
+  }
+};
+
+// Process-wide state shared by the functions below.
+typedef map<int, GlobalTable*> Map;
+// table id -> table; filled by create_mem_table()
+Map tables;
+// set in main() from NetworkThread::Get() / GlobalContext::Get()
+shared_ptr<NetworkThread> network;
+shared_ptr<GlobalContext> context;
+// one entry per registered table server; filled by coordinator_assign_tables()
+std::vector<ServerState*> server_states;
+// created by worker_table_init()
+TableServer *table_server;
+// not assigned or read anywhere in this file
+TableDelegate *delegate;
+// Build an in-memory VKey -> SGDValue table with `num_shards` shards and
+// register it in the global `tables` map under `id`.
+void create_mem_table(int id, int num_shards){
+  // descriptor wiring: marshalling, sharding, update handler, partition type
+  TableDescriptor *desc = new TableDescriptor(id, num_shards);
+  desc->key_marshal = new Marshal<VKey>();
+  desc->value_marshal = new Marshal<SGDValue>();
+  desc->sharder = new VKeySharder;
+  desc->accum = new AnhUpdateHandler;
+  desc->partition_factory = new typename SparseTable<VKey, SGDValue>::Factory;
+
+  // create the table from the descriptor and publish it
+  TypedGlobalTable<VKey, SGDValue> *mem_table =
+      new TypedGlobalTable<VKey, SGDValue>();
+  mem_table->Init(desc);
+  tables[id] = mem_table;
+}
+
+// Coordinator side of table setup for table `id`:
+//  1. reads one MTYPE_REGISTER_WORKER message per rank that is a table server,
+//  2. assigns the table's shards to the registered servers round-robin,
+//  3. records each shard's owner in the local table,
+//  4. sends the assignment out with SyncBroadcast.
+void coordinator_assign_tables(int id){
+	for (int i = 0; i < context->num_procs() 	; ++i) {
+	    RegisterWorkerRequest req;
+	    int src = 0;
+	    //  adding memory server.
+	    if (context->IsTableServer(i)) {
+	      // the message may come from any rank; the server state is keyed by
+	      // the loop index i, not by the sender reported in src
+	      network->Read(MPI::ANY_SOURCE, MTYPE_REGISTER_WORKER, &req, &src);
+	      server_states.push_back(new ServerState(i));
+	    }
+	  }
+	  LOG(INFO) << " All servers registered and started up. Ready to go";
+	  //  set itself as the current worker for the table
+	  tables[id]->worker_id_ = network->id();
+
+	  // memory servers are specified in global context. Round-robin assignment
+
+	    VLOG(3)<<"num of shards"<<tables[id]->num_shards()<<" for table"<< id;
+
+	    int server_idx = 0;
+	    // NOTE(review): indexes server_states and takes % size(); assumes at
+	    // least one server registered whenever the table has shards.
+	    for (int shard = 0; shard < tables[id]->num_shards(); ++shard) {
+	      ServerState &server = *server_states[server_idx];
+	      LOG(INFO) << "Assigning table ("<<id<<","<<shard<<") to server "
+	                <<server_states[server_idx]->server_id;
+
+	      // TODO(Anh) may overwrite this field if #shards>#table_servers
+	      server.shard_id = shard;
+	      server.local_shards.insert(new TaskId(id, shard));
+	      server_idx = (server_idx + 1) % server_states.size();
+	    }
+
+	  VLOG(3)<<"table assignment";
+	  //  then send table assignment
+	  ShardAssignmentRequest req;
+	  for (size_t i = 0; i < server_states.size(); ++i) {
+	    ServerState &server = *server_states[i];
+	    for (auto * task: server.local_shards) {
+	      ShardAssignment *s  = req.add_assign();
+	      s->set_new_worker(server.server_id);
+	      s->set_table(task->table);
+	      s->set_shard(task->shard);
+	      //  update local tables
+	      CHECK(tables.find(task->table)!=tables.end());
+	      GlobalTable *t = tables.at(task->table);
+	      t->get_partition_info(task->shard)->owner = server.server_id;
+	      // NOTE(review): the TaskId is freed but its pointer stays in
+	      // local_shards; it must not be dereferenced after this point.
+	      delete task;
+	    }
+	  }
+	  VLOG(3)<<"finish table assignment, req size "<<req.assign_size();
+	  // request type / expected reply type / payload
+	  // NOTE(review): presumably blocks until every peer has answered with
+	  // MTYPE_SHARD_ASSIGNMENT_DONE -- confirm in NetworkThread.
+	  network->SyncBroadcast(MTYPE_SHARD_ASSIGNMENT, MTYPE_SHARD_ASSIGNMENT_DONE, req);
+	  VLOG(3)<<"finish table server init";
+}
+
+
+// Create this process's table server, publish it through the global
+// `table_server`, and start it over the registered tables.
+void worker_table_init(){
+  TableServer *server = new TableServer();
+  table_server = server;
+  server->StartTableServer(tables);
+  VLOG(3) << "done starting table server";
+}
+
+// Returns rand() scaled into [0, 1] (both ends included).
+double random_double(){
+  const double draw = static_cast<double>(rand());
+  const double upper = static_cast<double>(RAND_MAX);
+  return draw / upper;
+}
+
+// Populate table 0 with synthetic parameter vectors (run by the coordinator).
+// Each entry of `tuples` is the length of one parameter vector. A vector longer
+// than FLAGS_threshold is split into consecutive chunks of at most
+// FLAGS_threshold values; every chunk is stored under the next integer key
+// (0, 1, 2, ...) with data and gradient both set to 0, 1, 2, ... .
+// Nothing is put when FLAGS_restore_mode is set.
+// NOTE(review): the original comment mentioned FLAGS_large_precentage, which
+// is not among the flags defined in this file.
+void coordinator_load_data(const vector<int>& tuples){
+  auto table = static_cast<TypedGlobalTable<VKey,SGDValue>*>(tables[0]);
+
+  int keyid=0;
+  if (!FLAGS_restore_mode){
+    // A non-positive threshold would yield empty chunks, so `offset` below
+    // would never advance and the loop would spin forever.
+    CHECK_GT(FLAGS_threshold, 0) << "threshold must be positive";
+    for(auto tuple: tuples){
+      for(int offset=0;offset<tuple;){
+        // the chunk length is fixed for this key, so compute it once
+        const int chunk=std::min(FLAGS_threshold, tuple-offset);
+        SGDValue x;
+        DAryProto *data=x.mutable_data();
+        DAryProto *grad=x.add_grad();
+        for(int i=0;i<chunk;i++){
+          data->add_value(i*1.0f);
+          grad->add_value(i*1.0f);
+        }
+        offset+=chunk;
+        VKey key;
+        key.set_key(keyid++);
+        table->put(key,x);
+      }
+    }
+    LOG(ERROR)<<"put "<<keyid<<" tuples";
+  }
+
+  /*
+	LogFile *file = new LogFile("/data1/wangwei/lapis/checkpoint_0","rw",0);
+	VLOG(3) << "Loaded table " << file->file_name();
+	string k,v;
+	int table_size = file->read_latest_table_size();
+	VLOG(3) << "table size = " << table_size;
+	for (int i=0; i<table_size; i++){
+		int tmp;
+		file->previous_entry(&k, &v, &tmp);
+		int *key = reinterpret_cast<int *>((char*)&k[0]);
+		int *val = reinterpret_cast<int *>((char*)&v[0]);
+		VLOG(3) << "k = " << *key << " val = " << *val;
+	}
+	delete file;
+  */
+
+	/*
+	for (int i=0; i<num_keys; i++){
+		table->put(i,0); //loaded again
+	}*/
+	VLOG(3) << "Coordinator done loading ..., from process "<<NetworkThread::Get()->id();
+}
+
+// Fetch every key of `table` once.
+// The key count is derived from `tuples` the same way the loader chunks them:
+// ceil(tuple / FLAGS_threshold) keys per tuple. One async_get is issued per
+// key, then async_get_collect is polled until that many replies have arrived.
+void get(TypedGlobalTable<VKey,SGDValue>* table, const vector<int>& tuples){
+  // guards the division and modulo below
+  CHECK_GT(FLAGS_threshold, 0) << "threshold must be positive";
+  SGDValue v;
+  int num_keys=0;
+  for(auto tuple: tuples){
+    // full chunks plus one more for a partial trailing chunk
+    num_keys+=tuple/FLAGS_threshold+(tuple%FLAGS_threshold!=0);
+  }
+  LOG(ERROR)<<"getting "<<num_keys<<" tuples";
+
+  // phase 1: fire all requests
+  for (int i=0; i<num_keys; i++){
+    VKey key;
+    key.set_key(i);
+    table->async_get(key, &v);
+  }
+
+  // phase 2: collect one reply per request, in whatever order they arrive
+  SGDValue val;
+
+  LOG(INFO)<<"start collect key";
+  for (int i=0; i<num_keys; i++){
+    VKey key;
+    // poll with a short pause until a reply is available
+    while(!table->async_get_collect(&key, &val))
+      Sleep(0.001);
+    //LOG(INFO)<<"collect key "<<key<<" with val "<<val;
+  }
+}
+
+// Send one update per key of `table`.
+// `tuples` is chunked exactly as in the loader (at most FLAGS_threshold values
+// per key, keys numbered 0, 1, 2, ...); each update carries a single gradient
+// holding 0, 1, 2, ... . Rank 0 sleeps 2 seconds before starting.
+void update(TypedGlobalTable<VKey,SGDValue>* table, const vector<int>& tuples){
+  // A non-positive threshold would yield empty chunks, so `offset` below
+  // would never advance and the loop would spin forever.
+  CHECK_GT(FLAGS_threshold, 0) << "threshold must be positive";
+  if(NetworkThread::Get()->id()==0)
+    sleep(2);
+  LOG(INFO)<<"start update";
+  int keyid=0;
+  for(auto tuple: tuples){
+    for(int offset=0;offset<tuple;){
+      // the chunk length is fixed for this key, so compute it once
+      const int chunk=std::min(FLAGS_threshold, tuple-offset);
+      SGDValue x;
+      DAryProto *grad=x.add_grad();
+      for(int i=0;i<chunk;i++){
+        grad->add_value(i*1.0f);
+      }
+      offset+=chunk;
+      VKey key;
+      key.set_key(keyid++);
+      table->update(key,x);
+    }
+  }
+  LOG(ERROR)<<"updated "<<keyid<<" tuples";
+}
+
+// Worker workload on table 0: one full read, three rounds of updates, then a
+// final full read.
+void worker_test_data(const vector<int>& tuples){
+  TypedGlobalTable<VKey,SGDValue>* typed =
+      static_cast<TypedGlobalTable<VKey,SGDValue>*>(tables[0]);
+
+  get(typed, tuples);
+  for (int round = 0; round < 3; ++round)
+    update(typed, tuples);
+  get(typed, tuples);
+}
+
+// Two-phase shutdown handshake.
+// Coordinator: reads num_procs()-1 MTYPE_WORKER_END messages, then sends
+// MTYPE_SHUTDOWN to ranks 0 .. size()-2, flushes, and shuts the network down.
+// Everyone else: flushes, reports MTYPE_WORKER_END to rank num_procs()-1,
+// waits for MTYPE_SHUTDOWN from that rank, stops the table server if this
+// process is one, and shuts the network down.
+void shutdown(){
+	if (context->AmICoordinator()){
+		EmptyMessage msg;
+		// one end-of-work message per non-coordinator process, from any sender
+		for (int i=0; i<context->num_procs()-1; i++)
+			network->Read(MPI::ANY_SOURCE, MTYPE_WORKER_END, &msg);
+		 EmptyMessage shutdown_msg;
+		  for (int i = 0; i < network->size() - 1; i++) {
+		    network->Send(i, MTYPE_SHUTDOWN, shutdown_msg);
+		  }
+		  network->Flush();
+		  network->Shutdown();
+	}
+	else{
+	  network->Flush();
+
+	  // rank num_procs()-1 is addressed as the coordinator here
+	  network->Send(context->num_procs()-1, MTYPE_WORKER_END, EmptyMessage());
+
+	  EmptyMessage msg;
+
+	  network->Read(context->num_procs()-1, MTYPE_SHUTDOWN, &msg);
+
+	  // table_server is only created on table-server processes (see main)
+	  if (context->AmITableServer())
+		  table_server->ShutdownTableServer();
+
+	  network->Shutdown();
+	}
+}
+
+// Non-coordinator side of table setup: reads the shard assignment sent by the
+// coordinator, records the owner of every shard in the local tables, prepares
+// a checkpoint file for each local shard when checkpointing is enabled
+// (restoring from it first in restore mode), and acknowledges with
+// MTYPE_SHARD_ASSIGNMENT_DONE.
+void HandleShardAssignment() {
+
+  ShardAssignmentRequest shard_req;
+  auto mpi=NetworkThread::Get();
+  mpi->Read(GlobalContext::kCoordinator, MTYPE_SHARD_ASSIGNMENT, &shard_req);
+  //  request read from coordinator
+  for (int i = 0; i < shard_req.assign_size(); i++) {
+    const ShardAssignment &a = shard_req.assign(i);
+    // tables.at() throws if the table id is unknown to this process
+    GlobalTable *t = tables.at(a.table());
+    t->get_partition_info(a.shard())->owner = a.new_worker();
+
+
+    //if local shard, create check-point files
+    if (FLAGS_checkpoint_enabled && t->is_local_shard(a.shard())){
+      // one file per shard: <checkpoint_dir>/checkpoint_<shard>
+      string checkpoint_file = StringPrintf("%s/checkpoint_%d",FLAGS_checkpoint_dir.c_str(), a.shard());
+        char hostname[256];
+        gethostname(hostname, sizeof(hostname));
+        VLOG(3) << "try to open for writing *****"<<checkpoint_file<<" "<<string(hostname);
+
+      // probe for existence by trying to open the file for reading
+      FILE *tmp_file = fopen(checkpoint_file.c_str(), "r");
+      if (tmp_file){//exists -> open to reading and writing
+        fclose(tmp_file);
+        auto cp = t->checkpoint_files();
+
+        if (FLAGS_restore_mode){//open in read mode to restore, then close
+          // first pass: only read the latest table size (used in the log below)
+          LogFile *file = new LogFile(checkpoint_file,"rw",0);
+          VLOG(3) << "Loaded table " << file->file_name();
+          int table_size = file->read_latest_table_size();
+          delete file;
+
+          // second pass: timed restore of this shard from the file
+          double start=Now();
+          VLOG(3) << "Open checkpoint file to restore";
+          (*cp)[a.shard()] = new LogFile(checkpoint_file,"r",a.shard());
+          t->Restore(a.shard());
+          // the slot is re-filled with the append-mode file right below
+          delete (*cp)[a.shard()];
+          double end=Now();
+          LOG(ERROR)<<"restore time\t"<<end-start<< "\tfor\t"
+            <<table_size<<"\tthreshold\t"<<FLAGS_threshold;
+        }
+        // shadows the hostname buffer declared in the enclosing scope
+        char hostname[256];
+        gethostname(hostname, sizeof(hostname));
+        VLOG(3) << "open for writing *****"<<checkpoint_file<<" "<<string(hostname);
+
+
+
+        VLOG(3) << "Open checkpoint file for writing";
+        (*cp)[a.shard()] = new LogFile(checkpoint_file,"a",a.shard());
+      }
+      else{// not exist -> open to writing first time
+        auto cp = t->checkpoint_files();
+        (*cp)[a.shard()] = new LogFile(checkpoint_file,"w",a.shard());
+        VLOG(3) << "Added to new checkpoint files for shard "<< a.shard();
+      }
+
+    }
+
+
+  }
+  // acknowledge so the coordinator's SyncBroadcast can complete
+  EmptyMessage empty;
+  mpi->Send(GlobalContext::kCoordinator, MTYPE_SHARD_ASSIGNMENT_DONE, empty);
+  VLOG(3)<<"finish handle shard assignment **";
+
+}
+
+
+// Entry point of the consistency test. Every process initializes MPI, logging
+// and flags, loads the configuration and creates table 0; it then follows the
+// branch for its role:
+//  - coordinator: assigns shards, loads data, waits at the barrier;
+//  - table server: starts its server, handles the assignment, barrier;
+//  - worker: handles the assignment, barrier, then runs the get/update
+//    workload unless restore_mode is set.
+// All processes finish with shutdown().
+int main(int argc, char **argv) {
+	FLAGS_logtostderr = 1;
+	int provided;
+	// NOTE(review): `provided` is not checked against MPI_THREAD_MULTIPLE
+	MPI_Init_thread(&argc, &argv, MPI_THREAD_MULTIPLE, &provided);
+	google::InitGoogleLogging(argv[0]);
+	gflags::ParseCommandLineFlags(&argc, &argv, true);
+
+	context = GlobalContext::Get(FLAGS_system_conf);
+	network = NetworkThread::Get();
+
+	// the model is parsed here but not used further in this function
+	ModelProto model;
+	ReadProtoFromTextFile(FLAGS_model_conf.c_str(), &model);
+
+	// one shard per table server
+	create_mem_table(0,context->num_table_servers());
+
+  // lengths of the parameter vectors used as the test workload
+  vector<int> tuple_size{37448736, 16777216, 4096000, 1327104, 884736, 884736, 614400,14112,4096,4096,1000,384,384,256,256,96};
+  /*
+  vector<int> tuples;
+  for(int i=0;i<3;i++){
+    for(int j=0;j<FLAGS_workers;j++)
+      tuples.push_back(tuple_size[i]/FLAGS_workers);
+  }
+  for(int i=3;i<tuple_size.size();i++)
+    tuples.push_back(tuple_size[i]);
+    */
+
+	if (context->AmICoordinator()){
+		VLOG(3) << "Coordinator process rank = " << NetworkThread::Get()->id();
+		coordinator_assign_tables(0);
+		coordinator_load_data(tuple_size);
+
+		network->barrier();
+	}
+	else{
+		if (context->AmITableServer()){
+			worker_table_init();
+			HandleShardAssignment();
+			network->barrier();
+		}
+		else{
+			VLOG(3) << "Inside worker, waiting for assignemtn";
+			HandleShardAssignment();
+			network->barrier();
+      // in restore mode nothing was loaded by the coordinator, so skip the workload
+      if(!FLAGS_restore_mode)
+        worker_test_data(tuple_size);
+		}
+	}
+	shutdown();
+
+
+	VLOG(3) << "Done ...";
+	return 0;
+}
+
+

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/b2dc51d2/src/test/dist_test/test_core.cc
----------------------------------------------------------------------
diff --git a/src/test/dist_test/test_core.cc b/src/test/dist_test/test_core.cc
new file mode 100644
index 0000000..35d589b
--- /dev/null
+++ b/src/test/dist_test/test_core.cc
@@ -0,0 +1,192 @@
+//  Copyright © 2014 Anh Dinh. All Rights Reserved.
+
+
+
+#include "core/global-table.h"
+#include "core/common.h"
+#include "core/disk-table.h"
+#include "core/table.h"
+#include "core/table_server.h"
+#include "utils/global_context.h"
+#include <gflags/gflags.h>
+#include "proto/model.pb.h"
+#include "worker.h"
+#include "coordinator.h"
+#include "model_controller/myacc.h"
+#include <cmath>
+
+using namespace lapis;
+
+// Command-line flags (gflags) of the core table test.
+// sync_update is not read anywhere in this file.
+DEFINE_bool(sync_update, false, "Synchronous put/update queue");
+// configuration files passed to GlobalContext::Get() in main()
+DEFINE_string(system_conf, "examples/imagenet12/system.conf", "configuration file for node roles");
+DEFINE_string(model_conf, "examples/imagenet12/model.conf", "DL model configuration file");
+// number of keys (1..num_keys) put, read and updated by the test
+DEFINE_int32(num_keys,10,"");
+
+// Process-wide state shared by the functions below.
+typedef map<int, GlobalTable*> Map;
+// table id -> table; filled by create_mem_table()
+Map tables;
+// set in main() from NetworkThread::Get() / GlobalContext::Get()
+shared_ptr<NetworkThread> network;
+shared_ptr<GlobalContext> context;
+// one entry per registered table server; filled by coordinator_assign_tables()
+std::vector<ServerState*> server_states;
+// created by worker_table_init()
+TableServer *table_server;
+
+// Build an in-memory int -> int table with `num_shards` shards and register
+// it in the global `tables` map under `id`.
+void create_mem_table(int id, int num_shards){
+  // descriptor wiring: marshalling, sharding, updater, partition type
+  TableDescriptor *desc = new TableDescriptor(id, num_shards);
+  desc->key_marshal = new Marshal<int>();
+  desc->value_marshal = new Marshal<int>();
+  desc->sharder = new Sharding::Mod;
+  desc->accum = new TestUpdater();
+  desc->partition_factory = new typename SparseTable<int, int>::Factory;
+
+  // create the table from the descriptor and publish it
+  TypedGlobalTable<int, int> *mem_table = new TypedGlobalTable<int, int>();
+  mem_table->Init(desc);
+  tables[id] = mem_table;
+}
+
+// Coordinator side of table setup for table `id`:
+//  1. reads one MTYPE_REGISTER_WORKER message per non-coordinator process and
+//     records a ServerState for every rank that is a table server,
+//  2. assigns the table's shards to those servers round-robin,
+//  3. records each shard's owner in the local table,
+//  4. sends the assignment out with SyncBroadcast.
+void coordinator_assign_tables(int id){
+	for (int i = 0; i < context->num_processes()-1; ++i) {
+	    RegisterWorkerRequest req;
+	    int src = 0;
+	    // the message may come from any rank; the server state below is keyed
+	    // by the loop index i, not by the sender reported in src
+	    network->Read(MPI::ANY_SOURCE, MTYPE_REGISTER_WORKER, &req, &src);
+	    //  adding memory server.
+	    if (context->IsTableServer(i)) {
+	      server_states.push_back(new ServerState(i));
+	    }
+	  }
+	  LOG(INFO) << " All servers registered and started up. Ready to go";
+	  //  set itself as the current worker for the table
+	  tables[id]->worker_id_ = network->id();
+
+	  // memory servers are specified in global context. Round-robin assignment
+
+	    VLOG(3)<<"num of shards"<<tables[id]->num_shards()<<" for table"<< id;
+
+	    int server_idx = 0;
+	    // NOTE(review): indexes server_states and takes % size(); assumes at
+	    // least one server registered whenever the table has shards.
+	    for (int shard = 0; shard < tables[id]->num_shards(); ++shard) {
+	      ServerState &server = *server_states[server_idx];
+	      LOG(INFO) << "Assigning table ("<<id<<","<<shard<<") to server "
+	                <<server_states[server_idx]->server_id;
+
+	      // TODO(Anh) may overwrite this field if #shards>#table_servers
+	      server.shard_id = shard;
+	      server.local_shards.insert(new TaskId(id, shard));
+	      server_idx = (server_idx + 1) % server_states.size();
+	    }
+
+	  VLOG(3)<<"table assignment";
+	  //  then send table assignment
+	  ShardAssignmentRequest req;
+	  for (size_t i = 0; i < server_states.size(); ++i) {
+	    ServerState &server = *server_states[i];
+	    for (auto * task: server.local_shards) {
+	      ShardAssignment *s  = req.add_assign();
+	      s->set_new_worker(server.server_id);
+	      s->set_table(task->table);
+	      s->set_shard(task->shard);
+	      //  update local tables
+	      CHECK(tables.find(task->table)!=tables.end());
+	      GlobalTable *t = tables.at(task->table);
+	      t->get_partition_info(task->shard)->owner = server.server_id;
+	      // NOTE(review): the TaskId is freed but its pointer stays in
+	      // local_shards; it must not be dereferenced after this point.
+	      delete task;
+	    }
+	  }
+	  VLOG(3)<<"finish table assignment, req size "<<req.assign_size();
+	  // request type / expected reply type / payload
+	  // NOTE(review): presumably blocks until every peer has answered with
+	  // MTYPE_SHARD_ASSIGNMENT_DONE -- confirm in NetworkThread.
+	  network->SyncBroadcast(MTYPE_SHARD_ASSIGNMENT, MTYPE_SHARD_ASSIGNMENT_DONE, req);
+	  VLOG(3)<<"finish table server init";
+}
+
+// Create this process's table server, publish it through the global
+// `table_server`, and start it over the registered tables.
+void worker_table_init(){
+  TableServer *server = new TableServer();
+  table_server = server;
+  server->StartTableServer(tables);
+  VLOG(3) << "done starting table server";
+}
+
+
+// Seed table 0 with the pairs (1,1), (2,2), ... (num_keys,num_keys).
+void coordinator_load_data(){
+  TypedGlobalTable<int,int>* kv =
+      static_cast<TypedGlobalTable<int,int>*>(tables[0]);
+  int key = 1;
+  while (key <= FLAGS_num_keys) {
+    kv->put(key, key);
+    ++key;
+  }
+  VLOG(3) << "Loaded data successfully ...";
+}
+
+// Worker workload on table 0: log every key's value once, then do two rounds
+// of "update every key with its own number, log every key's value again".
+void worker_test_data(){
+  TypedGlobalTable<int,int>* kv =
+      static_cast<TypedGlobalTable<int,int>*>(tables[0]);
+
+  // reads keys 1..num_keys and logs them at verbosity 3
+  auto log_all = [kv]() {
+    for (int key = 1; key <= FLAGS_num_keys; ++key)
+      VLOG(3) << StringPrintf("Worker %d got (%d,%d)", NetworkThread::Get()->id(), key, kv->get(key));
+  };
+
+  log_all();
+
+  for (int round = 0; round < 2; ++round) {
+    for (int key = 1; key <= FLAGS_num_keys; ++key)
+      kv->update(key, key);
+    log_all();
+  }
+/*
+	for (int i = 1; i <= FLAGS_num_keys; i++)
+				VLOG(3)
+						<< StringPrintf("Worker %d got (%d,%d)",
+
+							NetworkThread::Get()->id(), i, table->get(i));
+*/
+}
+
+// Two-phase shutdown handshake.
+// Coordinator: reads num_processes()-1 MTYPE_WORKER_END messages, then sends
+// MTYPE_WORKER_SHUTDOWN to ranks 0 .. size()-2, flushes, and shuts the
+// network down.
+// Everyone else: flushes, reports MTYPE_WORKER_END to the coordinator rank,
+// waits for MTYPE_WORKER_SHUTDOWN, stops its table server, and shuts the
+// network down.
+void shutdown(){
+	if (context->AmICoordinator()){
+		VLOG(3) << "Coordinator is shutting down ...";
+		EmptyMessage msg;
+		// one end-of-work message per non-coordinator process, from any sender
+		for (int i=0; i<context->num_processes()-1; i++)
+			network->Read(MPI::ANY_SOURCE, MTYPE_WORKER_END, &msg);
+		 EmptyMessage shutdown_msg;
+		  for (int i = 0; i < network->size() - 1; i++) {
+		    network->Send(i, MTYPE_WORKER_SHUTDOWN, shutdown_msg);
+		  }
+		  network->Flush();
+		  network->Shutdown();
+	}
+	else{
+		VLOG(3) << "Worker " << network->id() << " is shutting down ...";
+	  network->Flush();
+	  VLOG(3) << "Done flushing the network thread";
+	  network->Send(GlobalContext::kCoordinatorRank, MTYPE_WORKER_END, EmptyMessage());
+	  EmptyMessage msg;
+	  network->Read(GlobalContext::kCoordinatorRank, MTYPE_WORKER_SHUTDOWN, &msg);
+	  VLOG(3) << "Worker received MTYPE_WORKER_SHUTDOWN";
+	  // every non-coordinator process created a table server in main()
+	  table_server->ShutdownTableServer();
+	  VLOG(3) << "Flushing node " << network->id();
+	  network->Shutdown();
+	}
+}
+
+
+// Entry point of the core table test. Every process sets up logging and
+// flags, loads the configuration and creates table 0 with one shard per table
+// server. The coordinator then assigns shards and loads the data; every other
+// process starts a table server and, after the barrier, runs the read/update
+// workload. All processes finish with shutdown().
+// NOTE(review): MPI is not initialized explicitly here; presumably
+// NetworkThread::Get() takes care of it -- confirm.
+int main(int argc, char **argv) {
+	FLAGS_logtostderr = 1;
+	google::InitGoogleLogging(argv[0]);
+	gflags::ParseCommandLineFlags(&argc, &argv, true);
+
+	context = GlobalContext::Get(FLAGS_system_conf, FLAGS_model_conf);
+	network = NetworkThread::Get();
+	VLOG(3) << "*** testing memory servers, with "
+			<< context->num_table_servers() << " servers";
+	create_mem_table(0,context->num_table_servers());
+
+	if (context->AmICoordinator()){
+		coordinator_assign_tables(0);
+		coordinator_load_data();
+		network->barrier();
+	}
+	else{
+		worker_table_init();
+		// wait until the coordinator has finished loading before reading
+		network->barrier();
+		VLOG(3) << "passed the barrier";
+		//Sleep(1);
+		worker_test_data();
+	}
+
+	shutdown();
+	return 0;
+}
+
+

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/b2dc51d2/src/test/dist_test/test_da.cc
----------------------------------------------------------------------
diff --git a/src/test/dist_test/test_da.cc b/src/test/dist_test/test_da.cc
new file mode 100644
index 0000000..51aa93e
--- /dev/null
+++ b/src/test/dist_test/test_da.cc
@@ -0,0 +1,700 @@
+#include <glog/logging.h>
+#include <mpi.h>
+#include <utility>
+#include <vector>
+
+#include "da/gary.h"
+#include "da/dary.h"
+#include "da/ary.h"
+
+
+using std::make_pair;
+using std::vector;
+// Park the calling process so a debugger can be attached: prints the PID and
+// host name, then sleeps in 5-second steps until `i` is set to a non-zero
+// value from the debugger. Never returns on its own.
+void Debug() {
+  // volatile: the flag is only ever changed from outside the program (by the
+  // attached debugger), so it must be re-read on every iteration; without it
+  // the compiler may reduce the loop to an unconditional infinite loop.
+  volatile int i = 0;
+  char hostname[256];
+  gethostname(hostname, sizeof(hostname));
+  printf("PID %d on %s ready for attach\n", getpid(), hostname);
+  fflush(stdout);
+  while (0 == i)
+    sleep(5);
+}
+
+
+
+// Checks element-wise Add on two 4x8 distributed arrays partitioned along
+// dimension `pdim`. Rank 0 first fetches both arrays and adds the local
+// copies, then all ranks add the distributed arrays and rank 0 fetches the
+// result, so the two logged results can be compared by eye.
+void TestPar(int pdim, int rank){
+  lapis::DAry lhs, rhs;
+  lapis::DAry lhs_copy, rhs_copy;
+  // the full 4x8 extent
+  vector<lapis::Range> slice{make_pair(0,4), make_pair(0,8)};
+  lhs.SetShape({4,8});
+  rhs.SetShape({4,8});
+  lhs.Setup(pdim);
+  rhs.Setup(pdim);
+  lhs.Random();
+  rhs.Random();
+  ARMCI_Barrier();
+
+
+  // reference result: fetch both operands to rank 0 and add there
+  if(0==rank){
+    //Debug();
+    LOG(ERROR)<<"test simple partition along "<< pdim<<" dim";
+    lhs_copy=lhs.Fetch(slice);
+    rhs_copy=rhs.Fetch(slice);
+    LOG(ERROR)<<"fetch a";
+    LOG(ERROR)<<lhs_copy.ToString();
+    LOG(ERROR)<<"fetch b";
+    LOG(ERROR)<<rhs_copy.ToString();
+    lhs_copy.Add(rhs_copy);
+    LOG(ERROR)<<"a<- a+b";
+    LOG(ERROR)<<lhs_copy.ToString();
+  }
+  ARMCI_Barrier();
+  // distributed result: add in place on every rank, then fetch
+  lhs.Add(rhs);
+  ARMCI_Barrier();
+  if(0==rank){
+    lapis::DAry summed;
+    summed=lhs.Fetch(slice);
+    LOG(ERROR)<<"add then fetch";
+    LOG(ERROR)<<summed.ToString();
+  }
+}
+
+
+
+// Exercises the element-wise DAry operations on three 3x6x2 distributed
+// arrays a1, a2, a3 partitioned along dimensions pa, pb, pc respectively.
+// For most operations the same pattern is used so the results can be compared
+// in the log:
+//   "fetch op": rank 0 fetches the operands and applies the op locally;
+//   "op fetch": all ranks apply the op on the distributed arrays, then rank 0
+//               fetches the result.
+// Every distributed op is bracketed by ARMCI_Barrier() calls so that rank 0
+// never fetches a result other ranks are still writing. All ranks must call
+// this function, since the barriers are executed unconditionally.
+void TestMixedParElt(int pa, int pb, int pc, int rank){
+  LOG(ERROR)<<" p dim for a,b,c is "<<pa<<" "<<pb<<" "<<pc;
+  // the full 3x6x2 extent
+  vector<lapis::Range> slice{make_pair(0,3),make_pair(0,6), make_pair(0,2)};
+  lapis::DAry a1, a2, a3;
+  a1.SetShape({3,6,2});
+  a2.SetShape({3,6,2});
+  a3.SetShape({3,6,2});
+  a1.Setup(pa);
+  a2.Setup(pb);
+  a3.Setup(pc);
+  a1.Random();
+  a2.Random();
+  a3.Random();
+
+  // ---- Copy ----
+  ARMCI_Barrier();
+  if(rank==0){
+    LOG(ERROR)<<"test elementwise ops with mixed partition";
+    lapis::DAry a5, a4;
+//    Debug();
+    a5=a1.Fetch(slice);
+    a4=a2.Fetch(slice);
+    LOG(ERROR)<<"fetch a";
+    LOG(ERROR)<<a5.ToString();
+    LOG(ERROR)<<"fetch b";
+    LOG(ERROR)<<a4.ToString();
+    a5.Copy(a4);
+    LOG(ERROR)<<"fetch op a.Copy(b)";
+    LOG(ERROR)<<a5.ToString();
+  }
+  ARMCI_Barrier();
+  a1.Copy(a2);
+  ARMCI_Barrier();
+  if(rank==0){
+    lapis::DAry a5;
+    a5=a1.Fetch(slice);
+    LOG(ERROR)<<"op fetch a.Copy(b)";
+    LOG(ERROR)<<a5.ToString();
+  }
+
+//////////////////////////////////////////////////
+  // ---- Mult (array, array) ----
+  ARMCI_Barrier();
+  if(rank==0){
+    lapis::DAry a8, a4, a5({3,6,2});
+    //Debug();
+    a8=a1.Fetch(slice);
+    a4=a2.Fetch(slice);
+    LOG(ERROR)<<"fetch a";
+    LOG(ERROR)<<a8.ToString();
+    LOG(ERROR)<<"fetch b";
+    LOG(ERROR)<<a4.ToString();
+    a5.Mult(a8,a4);
+    LOG(ERROR)<<"fetch op c.mult(a,b)";
+    LOG(ERROR)<<a5.ToString();
+  }
+  ARMCI_Barrier();
+  a3.Mult(a1,a2);
+  ARMCI_Barrier();
+  if(rank==0){
+    lapis::DAry a5;
+    a5=a3.Fetch(slice);
+    LOG(ERROR)<<"op fetch a.Mult(b,c)";
+    LOG(ERROR)<<a5.ToString();
+  }
+//////////////////////////////////////////////////
+  // ---- Div ----
+  ARMCI_Barrier();
+  if(rank==0){
+    lapis::DAry a8, a4, a5({3,6,2});
+    //Debug();
+    a8=a1.Fetch(slice);
+    a4=a2.Fetch(slice);
+    LOG(ERROR)<<"fetch a";
+    LOG(ERROR)<<a8.ToString();
+    LOG(ERROR)<<"fetch b";
+    LOG(ERROR)<<a4.ToString();
+    a5.Div(a8,a4);
+    LOG(ERROR)<<"fetch op c.div(a,b)";
+    LOG(ERROR)<<a5.ToString();
+  }
+  ARMCI_Barrier();
+  a3.Div(a1,a2);
+  ARMCI_Barrier();
+  if(rank==0){
+    lapis::DAry a5;
+    a5=a3.Fetch(slice);
+    LOG(ERROR)<<"op fetch a.div(b,c)";
+    LOG(ERROR)<<a5.ToString();
+  }
+//////////////////////////////////////////////////
+  // ---- Mult (array, scalar) ----
+  ARMCI_Barrier();
+  if(rank==0){
+    lapis::DAry a8, a4, a5({3,6,2});
+    //Debug();
+    a8=a1.Fetch(slice);
+    LOG(ERROR)<<"fetch a";
+    LOG(ERROR)<<a8.ToString();
+    a5.Mult(a8, 3.0);
+    LOG(ERROR)<<"fetch op c.mult(a,3)";
+    LOG(ERROR)<<a5.ToString();
+  }
+  ARMCI_Barrier();
+  a3.Mult(a1,3.0);
+  ARMCI_Barrier();
+  if(rank==0){
+    lapis::DAry a5;
+    a5=a3.Fetch(slice);
+    LOG(ERROR)<<"op fetch a.mult(b,3)";
+    LOG(ERROR)<<a5.ToString();
+  }
+
+//////////////////////////////////////////////////
+  // ---- Square ----
+  ARMCI_Barrier();
+  if(rank==0){
+    lapis::DAry a8, a4, a5({3,6,2});
+    //Debug();
+    a8=a1.Fetch(slice);
+    LOG(ERROR)<<"fetch a";
+    LOG(ERROR)<<a8.ToString();
+    a5.Square(a8);
+    LOG(ERROR)<<"fetch op c.square(a)";
+    LOG(ERROR)<<a5.ToString();
+  }
+  ARMCI_Barrier();
+  a3.Square(a1);
+  ARMCI_Barrier();
+  if(rank==0){
+    lapis::DAry a5;
+    a5=a3.Fetch(slice);
+    LOG(ERROR)<<"op fetch a.sqaure(b)";
+    LOG(ERROR)<<a5.ToString();
+  }
+
+
+//////////////////////////////////////////////////
+  // ---- Pow ----
+  ARMCI_Barrier();
+  if(rank==0){
+    lapis::DAry a8, a4, a5({3,6,2});
+    //Debug();
+    a8=a1.Fetch(slice);
+    LOG(ERROR)<<"fetch a";
+    LOG(ERROR)<<a8.ToString();
+    a5.Pow(a8,3.0);
+    LOG(ERROR)<<"fetch op c.pow(a, 3)";
+    LOG(ERROR)<<a5.ToString();
+  }
+  ARMCI_Barrier();
+  a3.Pow(a1,3.0);
+  ARMCI_Barrier();
+  if(rank==0){
+    lapis::DAry a5;
+    a5=a3.Fetch(slice);
+    LOG(ERROR)<<"op fetch a.pow(b,3)";
+    LOG(ERROR)<<a5.ToString();
+  }
+
+
+//////////////////////////////////////////////////
+  // ---- SampleUniform (distributed only; no local reference) ----
+  ARMCI_Barrier();
+  a3.SampleUniform(0.0,3.0);
+  ARMCI_Barrier();
+  if(rank==0){
+    lapis::DAry a5;
+    a5=a3.Fetch(slice);
+    LOG(ERROR)<<"op fetch a.uniform(0,3)";
+    LOG(ERROR)<<a5.ToString();
+  }
+//////////////////////////////////////////////////
+  // ---- SampleGaussian (distributed only) ----
+  ARMCI_Barrier();
+  a3.SampleGaussian(0.0,1.0);
+  ARMCI_Barrier();
+  if(rank==0){
+    lapis::DAry a5;
+    a5=a3.Fetch(slice);
+    LOG(ERROR)<<"op fetch a.norm(0,1)";
+    LOG(ERROR)<<a5.ToString();
+  }
+
+//////////////////////////////////////////////////
+  // ---- Fill (distributed only) ----
+  ARMCI_Barrier();
+  a3.Fill(1.43);
+  ARMCI_Barrier();
+  if(rank==0){
+    lapis::DAry a5;
+    a5=a3.Fetch(slice);
+    LOG(ERROR)<<"op fetch a.fill(1.43)";
+    LOG(ERROR)<<a5.ToString();
+  }
+
+
+//////////////////////////////////////////////////
+  // ---- Threshold (a1 is re-randomized first) ----
+  ARMCI_Barrier();
+  a1.Random();
+  ARMCI_Barrier();
+  if(rank==0){
+    lapis::DAry a8, a4, a5({3,6,2});
+    a4=a1.Fetch(slice);
+    a5.Threshold(a4,0.3);
+    LOG(ERROR)<<"fetch op b=threshold(a,0.3)";
+    LOG(ERROR)<<a4.ToString();
+    LOG(ERROR)<<a5.ToString();
+  }
+
+  ARMCI_Barrier();
+  a3.Threshold(a1, .30f);
+  ARMCI_Barrier();
+  if(rank==0){
+    lapis::DAry a5;
+    a5=a3.Fetch(slice);
+    LOG(ERROR)<<"op fetch b=threshold(a,0.3)";
+    LOG(ERROR)<<a5.ToString();
+  }
+
+//////////////////////////////////////////////////
+  // ---- Max (a1 is re-randomized first) ----
+  ARMCI_Barrier();
+  a1.Random();
+  ARMCI_Barrier();
+  if(rank==0){
+    lapis::DAry a8, a4, a5({3,6,2});
+    a4=a1.Fetch(slice);
+    a5.Max(a4,0.3);
+    LOG(ERROR)<<"fetch op b=max(a,0.3)";
+    LOG(ERROR)<<a4.ToString();
+    LOG(ERROR)<<a5.ToString();
+  }
+
+  ARMCI_Barrier();
+  a3.Max(a1, .30f);
+  ARMCI_Barrier();
+  if(rank==0){
+    lapis::DAry a5;
+    a5=a3.Fetch(slice);
+    LOG(ERROR)<<"op fetch b=max(a,0.3)";
+    LOG(ERROR)<<a5.ToString();
+  }
+
+
+//////////////////////////////////////////////////
+  // ---- Map with a binary lambda (a + 2b) ----
+  ARMCI_Barrier();
+  if(rank==0){
+    lapis::DAry a6, a4, a5({3,6,2});
+    a6=a1.Fetch(slice);
+    a4=a2.Fetch(slice);
+    a5.Map([](float a, float b) {return a+2*b;}, a6,a4);
+    LOG(ERROR)<<"fetch op b=map(a+2b)";
+    LOG(ERROR)<<a6.ToString();
+    LOG(ERROR)<<a4.ToString();
+    LOG(ERROR)<<a5.ToString();
+  }
+  ARMCI_Barrier();
+  a3.Map([](float a, float b) {return a+2*b;}, a1,a2);
+  // wait for every rank to finish Map before rank 0 reads the result, as is
+  // done after every other distributed op in this function
+  ARMCI_Barrier();
+  if(rank==0){
+    lapis::DAry a5;
+    a5=a3.Fetch(slice);
+    LOG(ERROR)<<"op fetch b=map(a+2b)";
+    LOG(ERROR)<<a5.ToString();
+  }
+  LOG(ERROR)<<"finish elementwise ops";
+}
+
+
+// Times product = lhs*rhs for a 256x9216 by 9216x4096 multiplication on
+// DAry objects. pa/pb/pc are handed to DAry::Setup for the two operands and
+// the result; every rank logs its own setup, dot and barrier-wait durations.
+void TestLargeDot(int pa, int pb, int pc, int rank){
+  if(rank==0){
+    LOG(ERROR)<<"test Dot, partition for a, b, c : "
+      << pa<<" "<<pb<<" "<<pc<<" dim";
+  }
+
+  const double setup_begin=MPI_Wtime();
+  lapis::DAry lhs, rhs, product;
+  lhs.SetShape({256,9216});
+  rhs.SetShape({9216,4096});
+  product.SetShape({256,4096});
+  lhs.Setup(pa);
+  rhs.Setup(pb);
+  product.Setup(pc);
+  lhs.Random();
+  rhs.Random();
+  product.Random();
+  ARMCI_Barrier();
+
+  // the dot itself is timed between the two barriers
+  const double dot_begin=MPI_Wtime();
+  product.Dot(lhs,rhs);
+  const double dot_end=MPI_Wtime();
+  ARMCI_Barrier();
+
+  // "wait time" is the time spent in the barrier after the local Dot returned
+  LOG(ERROR)<<"setup time: "<<dot_begin-setup_begin<<" dot time: "
+    <<dot_end-dot_begin<<" wait time:"<<MPI_Wtime()-dot_end;
+}
+
+void TestDot(int pa, int pb, int pc, int rank){  // logs Dot() on fetched copies next to Dot() on the a/b/c arrays; pa/pb/pc go to Setup()
+  vector<lapis::Range> slicea{make_pair(0,4), make_pair(0,8)};  // presumably the full extent of a (shape 4x8)
+  vector<lapis::Range> sliceb{make_pair(0,8), make_pair(0,4)};  // presumably the full extent of b (shape 8x4)
+  vector<lapis::Range> slicec{make_pair(0,4), make_pair(0,4)};  // presumably the full extent of c (shape 4x4)
+  lapis::DAry a,b,c;
+  a.SetShape({4,8});
+  b.SetShape({8,4});
+  c.SetShape({4,4});
+  a.Setup(pa);
+  b.Setup(pb);
+  c.Setup(pc);
+  a.Random();
+  b.Random();
+  c.Random();
+  // ---- case 1: c = a*b; rank 0 first computes it on fetched copies ----
+  ARMCI_Barrier();
+  if(rank==0){
+    LOG(ERROR)<<"test Dot, partition for a, b, c : "
+      << pa<<" "<<pb<<" "<<pc<<" dim";
+    LOG(ERROR)<<"c=a*b";
+    lapis::DAry x,y,z;
+    x=a.Fetch(slicea);
+    y=b.Fetch(sliceb);
+    z=c.Fetch(slicec);
+    z.Dot(x,y);
+    LOG(ERROR)<<"fetch dot ";
+    LOG(ERROR)<<z.ToString();
+  }
+  ARMCI_Barrier();
+  //Debug();
+  c.Dot(a,b);  // the same product, called by every rank on a, b, c themselves
+  ARMCI_Barrier();
+  if(rank==0){
+    lapis::DAry z;
+    z=c.Fetch(slicec);
+    LOG(ERROR)<<"dot fetch";
+    LOG(ERROR)<<z.ToString();
+  }
+  // ---- case 2: a = c*b^T; c now holds the result of case 1 ----
+  ARMCI_Barrier();
+  // per the log label, the two trailing bools request transposition of the 1st / 2nd operand
+  if(rank==0){
+    LOG(ERROR)<<"a=c*b^T";
+    lapis::DAry x,y,z;
+    x=a.Fetch(slicea);
+    y=b.Fetch(sliceb);
+    z=c.Fetch(slicec);
+    x.Dot(z,y, false, true);
+    LOG(ERROR)<<"fetch dot ";
+    LOG(ERROR)<<x.ToString();
+  }
+  ARMCI_Barrier();
+  //Debug();
+  a.Dot(c,b, false, true);
+  ARMCI_Barrier();
+  if(rank==0){
+    lapis::DAry z;
+    z=a.Fetch(slicea);
+    LOG(ERROR)<<"dot fetch";
+    LOG(ERROR)<<z.ToString();
+  }
+  // a was overwritten by case 2, so the cases below operate on the updated a
+  // ---- case 3: b = a^T*c ----
+  ARMCI_Barrier();
+  if(rank==0){
+    LOG(ERROR)<<"b=a^T*c";
+    lapis::DAry x,y,z;
+    x=a.Fetch(slicea);
+    y=b.Fetch(sliceb);
+    z=c.Fetch(slicec);
+    y.Dot(x,z, true, false);
+    LOG(ERROR)<<"fetch dot ";
+    LOG(ERROR)<<y.ToString();
+  }
+  ARMCI_Barrier();
+  //Debug();
+  b.Dot(a,c, true, false);
+  ARMCI_Barrier();
+  if(rank==0){
+    lapis::DAry z;
+    z=b.Fetch(sliceb);
+    LOG(ERROR)<<"dot fetch";
+    LOG(ERROR)<<z.ToString();
+  }
+  ARMCI_Barrier();  // NOTE(review): immediately followed by another barrier below — looks redundant, confirm
+  // ---- case 4: b = a^T*c^T ----
+  ARMCI_Barrier();
+  if(rank==0){
+    LOG(ERROR)<<"b=a^T*c^T";
+    lapis::DAry x,y,z;
+    x=a.Fetch(slicea);
+    y=b.Fetch(sliceb);
+    z=c.Fetch(slicec);
+    y.Dot(x,z, true, true);
+    LOG(ERROR)<<"fetch dot ";
+    LOG(ERROR)<<y.ToString();
+  }
+  ARMCI_Barrier();
+  //Debug();
+  b.Dot(a,c, true, true);
+  ARMCI_Barrier();
+  if(rank==0){
+    lapis::DAry z;
+    z=b.Fetch(sliceb);
+    LOG(ERROR)<<"dot fetch";
+    LOG(ERROR)<<z.ToString();
+  }
+  ARMCI_Barrier();
+}
+
+
+// Exercises element-wise Add on sub-arrays: a = b[2] + c[3], where b has
+// shape 8x4, c has shape 4x4 and a has shape 4. pa/pb/pc are passed to
+// DAry::Setup for a, b and c. Rank 0 logs the inputs and the result; the
+// other ranks only take part in the barriers and in the Add itself.
+void TestSubarray(int pa, int pb, int pc, int rank){
+  vector<lapis::Range> sliceb{make_pair(0,8), make_pair(0,4)};
+  vector<lapis::Range> slicec{make_pair(0,4), make_pair(0,4)};
+  // 1-d range used for a and for the two sub-arrays
+  vector<lapis::Range> slice{make_pair(0,4)};
+  lapis::DAry a,b,c;
+  a.SetShape({4});
+  b.SetShape({8,4});
+  c.SetShape({4,4});
+  a.Setup(pa);
+  b.Setup(pb);
+  c.Setup(pc);
+  b.Random();
+  c.Random();
+
+  //Debug();
+  lapis::DAry sb=b[2];
+  lapis::DAry sc=c[3];
+
+  ARMCI_Barrier();
+  if(rank==0){
+    LOG(ERROR)<<"test subary, partition for a, b, c : "
+      << pa<<" "<<pb<<" "<<pc<<" dim";
+    lapis::DAry y,z;
+    LOG(ERROR)<<"fetch full b, c";
+    y=b.Fetch(sliceb);
+    z=c.Fetch(slicec);
+    LOG(ERROR)<<y.ToString();
+    LOG(ERROR)<<z.ToString();
+    // sb and sc ARE b[2] and c[3]; the labels name the source indices
+    LOG(ERROR)<<"fetch sub, b[2], c[3]";
+    y=sb.Fetch(slice);
+    z=sc.Fetch(slice);
+    LOG(ERROR)<<y.ToString();
+    LOG(ERROR)<<z.ToString();
+  }
+  ARMCI_Barrier();
+  a.Add(sb,sc);
+  ARMCI_Barrier();
+  //Debug();
+  if(rank==0){
+    lapis::DAry z;
+    z=a.Fetch(slice);
+    LOG(ERROR)<<"sub add fetch, b[2]+c[3]";
+    LOG(ERROR)<<z.ToString();
+  }
+}
+
+void TestReshape(int pa, int pb, int pc, int rank){  // Reshape + sub-array indexing combined with Add and Dot; pa/pb/pc go to Setup()
+  vector<lapis::Range> sliceb3{make_pair(0,2),make_pair(0,4), make_pair(0,4)};  // not referenced below
+  vector<lapis::Range> sliceb{make_pair(0,8), make_pair(0,4)};
+  vector<lapis::Range> slicec{make_pair(0,4), make_pair(0,4)};
+  vector<lapis::Range> slicea{make_pair(0,4)};
+  lapis::DAry a,b,c,b3,b2,b1;
+  a.SetShape({4});
+  b.SetShape({8,4});
+  c.SetShape({4,4});
+  a.Setup(pa);
+  b.Setup(pb);
+  c.Setup(pc);
+  b.Random();
+  c.Random();
+  // b3: b reshaped to 2x4x4; b2: its sub-array at index 1 (fetched below with the 4x4 slice)
+  b3=b.Reshape({2,4,4});
+  //Debug() ;
+  b2=b3[1];
+  if(rank==0){  // NOTE(review): no ARMCI_Barrier() between Random() above and this fetch — confirm that is safe
+    LOG(ERROR)<<"test reshape+subary, partition for a, b, c : "
+      << pa<<" "<<pb<<" "<<pc<<" dim";
+    lapis::DAry y,z,x;
+    LOG(ERROR)<<"fetch b, b2, c";
+    y=b.Fetch(sliceb);
+    z=b2.Fetch(slicec);
+    x=c.Fetch(slicec);
+    LOG(ERROR)<<y.ToString();
+    LOG(ERROR)<<z.ToString();
+    LOG(ERROR)<<x.ToString();
+    LOG(ERROR)<<"fetch sub, b2+c";
+    z.Add(x);
+    LOG(ERROR)<<z.ToString();
+  }
+  // all ranks: c.Add(b2) (logged as "c+b2"), then b2.Add(c) (logged as "b2+c"); rank 0 logs after each
+  ARMCI_Barrier();
+  c.Add(b2);
+  ARMCI_Barrier();
+  if(rank==0){
+    lapis::DAry y,z,x;
+    x=c.Fetch(slicec);
+    LOG(ERROR)<<"sub add,fetch c+b2";
+    LOG(ERROR)<<x.ToString();
+  }
+  ARMCI_Barrier();
+  b2.Add(c);
+  ARMCI_Barrier();
+  if(rank==0){
+    lapis::DAry y,z,x;
+    x=b2.Fetch(slicec);
+    LOG(ERROR)<<"sub add,fetch b2+c";
+    LOG(ERROR)<<x.ToString();
+  }
+  ARMCI_Barrier();
+  b1=b2[2];  // sub-array of the sub-array; fetched below with the 1-d slice
+  if(rank==0){
+    lapis::DAry y,z,x;
+    x=b1.Fetch(slicea);
+    LOG(ERROR)<<"fetch b1";
+    LOG(ERROR)<<x.ToString();
+  }
+  // NOTE(review): a is never filled before this Add — confirm Setup() leaves it in a defined state
+  a.Add(b1);
+  ARMCI_Barrier();
+  if(rank==0){
+    lapis::DAry y,z,x;
+    x=a.Fetch(slicea);
+    LOG(ERROR)<<"add fetch a+b1";
+    LOG(ERROR)<<x.ToString();
+  }
+  ARMCI_Barrier();
+  b1.Add(a);
+  ARMCI_Barrier();
+  if(rank==0){
+    lapis::DAry y,z,x;
+    x=b1.Fetch(slicea);
+    LOG(ERROR)<<"add fetch b1+a";
+    LOG(ERROR)<<x.ToString();
+  }
+  // second scenario in its own scope: the locals declared below shadow the outer a, c, b1, b2, b3
+  ARMCI_Barrier();
+  {
+    lapis::DAry b3=b.Reshape({4,2,4});
+    lapis::DAry a;
+    a.SetShape({2,4});
+    a.Setup(pa);
+    a.Random();
+    lapis::DAry b1=b3[1];  // not used afterwards
+    lapis::DAry b2=b3[3];
+    lapis::DAry c;
+    c.SetShape({2,2});
+    c.Setup(pc);
+    ARMCI_Barrier();
+    c.Dot(a,b2,false, true);  // logged below as c=a*b[3]^T
+    ARMCI_Barrier();
+    if(rank==0){
+      lapis::DAry x,y,z,zz({2,2});
+      y=b3.Fetch({make_pair(0,4), make_pair(0,2), make_pair(0,4)});
+      x=a.Fetch({make_pair(0,2), make_pair(0,4)});
+      LOG(ERROR)<<"fetch b,a";
+      LOG(ERROR)<<y.ToString();
+      LOG(ERROR)<<x.ToString();
+      z=y[3];
+      zz.Dot(x,z,false, true);
+      LOG(ERROR)<<"fetch dot c=a*b[3]^T";
+      LOG(ERROR)<<zz.ToString();
+      // now log the operands and the result of the c.Dot() call made above by all ranks
+      x=a.Fetch({make_pair(0,2), make_pair(0,4)});
+      y=b2.Fetch({make_pair(0,2), make_pair(0,4)});
+      z=c.Fetch({make_pair(0,2), make_pair(0,2)});
+      LOG(ERROR)<<"op fetch c=a*b[3]^T";
+      LOG(ERROR)<<x.ToString();
+      LOG(ERROR)<<y.ToString();
+      LOG(ERROR)<<z.ToString();
+
+    }
+    ARMCI_Barrier();
+  }
+}
+
+
+
+// Entry point of the distributed DAry test driver. Initializes MPI, hands the
+// list of all ranks to GAry::Init, runs TestLargeDot for the eight 0/1
+// combinations of its three partition arguments and logs the total wall time
+// on rank 0. The other test suites are kept below, commented out.
+int main(int argc, char**argv){
+ // MPI_Init_thread(&argc, &argv, MPI_THREAD_MULTIPLE, &provided);
+  MPI_Init(&argc, &argv);
+  int rank, nprocs;
+  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+  MPI_Comm_size(MPI_COMM_WORLD, &nprocs);
+  // every MPI rank takes part in the global-array group
+  vector<int> procs;
+  for (int i = 0; i < nprocs; i++) {
+    procs.push_back(i);
+  }
+  //Debug();
+  lapis::GAry::Init(rank,procs);
+  google::InitGoogleLogging(argv[0]);
+  /*
+  if(nprocs%3==0){
+    TestMixedParElt(0,0,0,rank);
+    TestMixedParElt(0,0,1,rank);
+    TestMixedParElt(0,1,0,rank);
+    TestMixedParElt(1,0,0,rank);
+    TestMixedParElt(1,1,0,rank);
+    TestMixedParElt(1,1,1,rank);
+    TestMixedParElt(0,1,1,rank);
+  }
+  if(nprocs%2==0){
+    TestMixedParElt(1,1,1,rank);
+    TestMixedParElt(1,2,1,rank);
+    TestMixedParElt(2,1,1,rank);
+    TestMixedParElt(1,1,2,rank);
+    TestMixedParElt(2,2,2,rank);
+  }
+  TestDot(0,0,0,rank);
+  TestDot(0,0,1,rank);
+  TestDot(0,1,0,rank);
+  TestDot(0,1,1,rank);
+  TestDot(1,0,0,rank);
+  TestDot(1,0,1,rank);
+  TestDot(1,1,0,rank);
+  TestDot(1,1,1,rank);
+
+  TestPar(0, rank);
+  TestPar(1, rank);
+  */
+  const double start=MPI_Wtime();
+  TestLargeDot(0,0,0,rank);
+  TestLargeDot(0,0,1,rank);
+  TestLargeDot(0,1,0,rank);
+  TestLargeDot(0,1,1,rank);
+  TestLargeDot(1,0,0,rank);
+  TestLargeDot(1,0,1,rank);
+  TestLargeDot(1,1,0,rank);
+  TestLargeDot(1,1,1,rank);
+  const double end=MPI_Wtime();
+  // The label states the shapes TestLargeDot really multiplies
+  // (256x9216 by 9216x4096); the value covers all eight runs incl. setup.
+  if(rank==0)
+    LOG(ERROR)<<"dot time for 256*9216 9216*4096 matrix, "<<end-start;
+  /*
+  TestSubarray(0,0,0,rank);
+  TestSubarray(0,0,1,rank);
+  TestSubarray(0,1,0,rank);
+  TestSubarray(0,1,1,rank);
+  TestReshape(0,0,0,rank);
+  TestReshape(0,0,1,rank);
+  TestReshape(0,1,0,rank);
+  TestReshape(0,1,1,rank);
+  */
+
+  LOG(ERROR)<<"finish";
+  lapis::GAry::Finalize();
+  MPI_Finalize();
+  return 0;
+}
+

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/b2dc51d2/src/test/dist_test/test_dary.cc
----------------------------------------------------------------------
diff --git a/src/test/dist_test/test_dary.cc b/src/test/dist_test/test_dary.cc
new file mode 100644
index 0000000..ce605e6
--- /dev/null
+++ b/src/test/dist_test/test_dary.cc
@@ -0,0 +1,85 @@
+#include <iostream>
+#include "darray/dary.h"
+#include "utils/timer.h"
+
+
+// Smoke test / micro benchmark for the single-process DAry API. First times
+// an element-wise multiply written as a raw-pointer loop against DAry::Mult
+// on arrays of the same length, then prints the result of a series of DAry
+// operations to stdout for visual inspection.
+int main() {
+  const int kNumElems=1000000;
+  lapis::DAry x({kNumElems});
+  lapis::DAry y({kNumElems});
+  x.Random();
+  y.Random();
+  lapis::Timer t;
+  for(int i=0;i<100;i++){
+    float *dptrx=x.dptr();
+    float *dptry=y.dptr();
+    // cover every element so the work matches m.Mult(m,n) below; the loop
+    // previously stopped after the first 10000 of the 1000000 elements
+    for(int k=0;k<kNumElems;k++)
+      dptrx[k]*=dptry[k];
+  }
+  // NOTE(review): 100 iterations are divided by 10 here and below — confirm
+  // the intended unit of the printed number
+  std::cout<<"arymath: "<<t.elapsed()/10<<std::endl;
+  lapis::DAry m({kNumElems});
+  lapis::DAry n({kNumElems});
+  m.Random();
+  n.Random();
+  t.Reset();
+  for(int i=0;i<100;i++)
+    m.Mult(m,n);
+  std::cout<<"arymath: "<<t.elapsed()/10<<std::endl;
+
+
+  // 2x2 matrices: dot product and element-wise arithmetic
+  lapis::DAry a({2,2});
+  lapis::DAry b,c;
+  b.InitLike(a);
+  c.InitLike(a);
+  a.Random();
+  b.Random();
+  std::cout<<a.ToString()<<std::endl;
+  std::cout<<b.ToString()<<std::endl;
+  c.Dot(a,b);
+  std::cout<<"c=a.b"<<c.ToString()<<std::endl;
+  a.Add(b);
+  std::cout<<"a=a+b"<<a.ToString()<<std::endl;
+  a.Mult(a,b);
+  std::cout<<"a=a*b"<<a.ToString()<<std::endl;
+  a.Minus(a,b);
+  std::cout<<"a=a-b"<<a.ToString()<<std::endl;
+
+  c.Random();
+  std::cout<<"random c "<<c.ToString()<<std::endl;
+  a.Threshold(c, 0.3);
+  std::cout<<"a=threshold(c,0.3) "<<a.ToString()<<std::endl;
+
+  a.Pow(c, 0.4);
+  std::cout<<"a=Pow(c,0.4) "<<a.ToString()<<std::endl;
+
+  c.Set(0.5);
+  std::cout<<"c=set(0.5) "<<c.ToString()<<std::endl;
+  a.Square(c);
+  std::cout<<"a=square(c) "<<a.ToString()<<std::endl;
+
+  c.Copy(a);
+  std::cout<<"c=Copy(a) "<<c.ToString()<<std::endl;
+
+  // row/column reductions and broadcasts against a length-2 vector
+  lapis::DAry d({2});
+  d.SumRow(b);
+  std::cout<<"d=SumRow(b) "<<d.ToString()<<std::endl;
+  d.SumCol(b);
+  std::cout<<"d=SumCol(b) "<<d.ToString()<<std::endl;
+  b.AddRow(d);
+  std::cout<<"b=AddRow(d) "<<b.ToString()<<std::endl;
+  b.AddCol(d);
+  std::cout<<"b=AddCol(d) "<<b.ToString()<<std::endl;
+
+  std::cout<<"max(b) "<<b.Max()<<std::endl;
+  std::cout<<"Sum(b) "<<b.Sum()<<std::endl;
+
+  lapis::DAry e({3,3,3});
+  e.SampleGaussian(0.0f,1.0f);
+  std::cout<<"Gaussain e "<<e.ToString()<<std::endl;
+
+  lapis::DAry f({9});
+  f.Sum(e, 0, {0,2});
+  std::cout<<"f.sum  "<<f.ToString()<<std::endl;
+
+  return 0;
+}
+

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/b2dc51d2/src/test/dist_test/test_disk_table.cc
----------------------------------------------------------------------
diff --git a/src/test/dist_test/test_disk_table.cc b/src/test/dist_test/test_disk_table.cc
new file mode 100644
index 0000000..99987bb
--- /dev/null
+++ b/src/test/dist_test/test_disk_table.cc
@@ -0,0 +1,188 @@
+//  Copyright © 2014 Anh Dinh. All Rights Reserved.
+//  main class for testing distributed memory layer
+//
+//  the command to run this should be:
+//		mpirun -hostfile <host> -bycore -nooversubscribe
+//				-n <num_servers> test -sync_update
+
+
+#include "core/global-table.h"
+#include "core/common.h"
+#include "core/disk-table.h"
+#include "core/table.h"
+#include "core/table_server.h"
+#include "utils/global_context.h"
+#include <gflags/gflags.h>
+#include "proto/model.pb.h"
+#include "worker.h"
+#include <cmath>
+
+DEFINE_int32(record_size,100, "# elements per float vector");
+DECLARE_int32(block_size);
+DEFINE_int32(table_size, 1000, "# records per table");
+DEFINE_string(system_conf, "examples/imagenet12/system.conf", "configuration file for node roles");
+DEFINE_string(model_conf, "examples/imagenet12/model.conf", "DL model configuration file");
+DEFINE_bool(is_testing_put,true, "data put vs. data get");
+DECLARE_int32(debug_index);
+DECLARE_int32(table_buffer);
+using namespace lapis;
+
+typedef map<int, GlobalTable*> Map;
+Map tables;
+
+//  put random message to the pointers
+//  Appends FLAGS_record_size consecutive values to *message, the first one
+//  being count*FLAGS_record_size (the content is deterministic, not random).
+void create_random_message(FloatVector* message, const int count){
+	const int first_value = count*FLAGS_record_size;
+	for (int offset=0; offset<FLAGS_record_size; ++offset)
+		message->add_data(first_value+offset);
+}
+
+//  Builds an int -> FloatVector disk table named "disk_test" with block size
+//  FLAGS_block_size and registers it in the global `tables` map under `id`.
+void create_disk_table(int id){
+	DiskTableDescriptor *descriptor =
+			new DiskTableDescriptor(id, "disk_test", FLAGS_block_size);
+	descriptor->key_marshal = new Marshal<int>();
+	descriptor->value_marshal = new Marshal<FloatVector>();
+	tables[id] = new TypedDiskTable<int,FloatVector>(descriptor);
+}
+
+
+//  if testing put, write and send data. Else do nothing
+//  Coordinator side of the disk-table test.
+//  Waits for one registration message per worker, optionally writes
+//  FLAGS_table_size records into table `tid` (only when FLAGS_is_testing_put),
+//  then waits for every worker's end message, tells them to shut down and
+//  finally checks the table's send statistics.
+void run_coordinator(shared_ptr<NetworkThread> network, int tid){
+	// every process except this one is a worker
+	const int num_workers = network->size()-1;
+
+	// wait for workers to be up
+	RegisterWorkerRequest req;
+	for (int i=0; i<num_workers; i++)
+		network->Read(MPI::ANY_SOURCE, MTYPE_REGISTER_WORKER, &req);
+
+	// put data in
+	TypedDiskTable<int, FloatVector>* table = static_cast<TypedDiskTable<int,
+			FloatVector>*>(tables[tid]);
+
+	//  if testing put()
+	if (FLAGS_is_testing_put) {
+		for (int i = 0; i < FLAGS_table_size; i++) {
+			FloatVector message;
+			create_random_message(&message, i);
+			table->put(i, message);
+		}
+		table->finish_put();
+	}
+
+	VLOG(3) << "Coordinator about to shut down";
+	for (int i=0; i<num_workers; i++){
+		EmptyMessage end_msg;
+		network->Read(i,MTYPE_WORKER_END, &end_msg);
+	}
+
+	EmptyMessage shutdown_msg;
+	for (int i = 0; i < num_workers; i++) {
+		network->Send(i, MTYPE_WORKER_SHUTDOWN, shutdown_msg);
+	}
+	network->Flush();
+	network->Shutdown();
+	table->PrintStats();
+
+	if (FLAGS_is_testing_put) {
+		// records are expected to leave in sub blocks of FLAGS_table_buffer records
+		int sub_blocks = ceil(((double) FLAGS_table_size / FLAGS_table_buffer));
+		CHECK_EQ(table->stats()["total sub block sent"], sub_blocks);
+		CHECK_EQ(table->stats()["total record sent"], FLAGS_table_size);
+		VLOG(3) << "test coordinator sending: successful";
+	}
+
+}
+
+//  if testing put(), do nothing. Else read() until done()
+//  Worker side of the disk-table test.
+//  Starts a table server over the global `tables` map. When NOT testing put,
+//  it loads table `tid` and reads it until done(). It then reports the end of
+//  its work to the coordinator, waits for the shutdown message and verifies
+//  the table statistics (the exact-count checks only run with 2 processes).
+void run_worker(shared_ptr<NetworkThread> network, int tid){
+	// NOTE(review): never deleted — confirm whether the server has to outlive
+	// this function or whether this is a leak
+	TableServer* ts = new TableServer();
+	ts->StartTableServer(tables);
+
+	// the table under test
+	TypedDiskTable<int, FloatVector>* table = static_cast<TypedDiskTable<int,
+			FloatVector>*>(tables[tid]);
+	if (!FLAGS_is_testing_put){
+		VLOG(3) << "testing read from table ...";
+		table->Load();
+		while (!table->done()){
+			int k;
+			FloatVector v;
+			table->get(&k,&v);
+			table->Next();
+		}
+
+		// one more get() after done() reports true, as in the original test
+		int k;
+		FloatVector v;
+		table->get(&k, &v);
+	}
+
+	int size = network->size();
+
+	network->Flush();
+	network->Send(GlobalContext::kCoordinatorRank, MTYPE_WORKER_END,
+			EmptyMessage());
+	EmptyMessage msg;
+
+	int src = 0;
+	network->Read(GlobalContext::kCoordinatorRank, MTYPE_WORKER_SHUTDOWN, &msg,
+			&src);
+	network->Flush();
+	network->Shutdown();
+
+	// use the table selected by `tid`; this used to read tables[0] regardless
+	// of `tid`, which only matched because the sole caller passes 0
+	Stats stats = table->stats();
+
+	if (FLAGS_is_testing_put) {
+		int sub_blocks = ceil(((double) FLAGS_table_size / FLAGS_table_buffer));
+		if (size == 2) {
+			CHECK_EQ(stats["total sub block received"], sub_blocks);
+			CHECK_EQ(stats["total record stored"], FLAGS_table_size);
+		}
+		VLOG(3) << "test table-server writing: successful";
+		VLOG(3) << "number of sub blocks = " << sub_blocks;
+		VLOG(3) << "total data stored = " << stats["total byte stored"];
+	}
+	else{
+		if (size==2)
+			CHECK_EQ(stats["total record read"], FLAGS_table_size);
+		VLOG(3) << "test table-server reading: successful";
+		VLOG(3) << "read bandwidth = "
+				<< (stats["total byte read"]
+						/ (stats["last byte read"] - stats["first byte read"]));
+		//VLOG(3) << "total number of record read = " << stats["total record read"];
+	}
+
+	network->PrintStats();
+	table->PrintStats();
+}
+
+//  check all the records have been stored to disk
+//  Initializes the global context from the two configuration flags, obtains
+//  the network thread and runs the coordinator role on the process with the
+//  highest id and the worker role on all others. Always returns 0.
+int test_disk(int tid) {
+	// the returned handle is not used further; the call initializes GlobalContext
+	auto context = lapis::GlobalContext::Get(FLAGS_system_conf, FLAGS_model_conf);
+	shared_ptr<NetworkThread> net = NetworkThread::Get();
+
+	const bool is_coordinator = (net->id() == net->size() - 1);
+	if (!is_coordinator) {
+		run_worker(net,tid);
+		return 0;
+	}
+	run_coordinator(net, tid);
+	return 0;
+}
+
+// for debugging use
+//#ifndef FLAGS_v
+//  DEFINE_int32(v, 3, "vlog controller");
+//#endif
+
+//  Sets up logging and flags, creates disk table 0 and runs the disk test on it.
+int main(int argc, char **argv) {
+	const int table_id = 0;
+	FLAGS_logtostderr = 1;
+	google::InitGoogleLogging(argv[0]);
+	gflags::ParseCommandLineFlags(&argc, &argv, true);
+	create_disk_table(table_id);
+	const int status = test_disk(table_id);
+	return status;
+}
+
+

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/b2dc51d2/src/test/dist_test/test_mnistlayer.cc
----------------------------------------------------------------------
diff --git a/src/test/dist_test/test_mnistlayer.cc b/src/test/dist_test/test_mnistlayer.cc
new file mode 100644
index 0000000..882e121
--- /dev/null
+++ b/src/test/dist_test/test_mnistlayer.cc
@@ -0,0 +1,165 @@
+#include <gtest/gtest.h>
+#include <sys/stat.h>
+#include <cstdint>
+#include "opencv2/highgui/highgui.hpp"
+#include "opencv2/imgproc/imgproc.hpp"
+
+#include "model/layer.h"
+#include "proto/model.pb.h"
+#include "utils/shard.h"
+using namespace singa;
+// Feeds one MNIST image through an MnistImageLayer configured with size 55
+// only and writes the converted image to src/test/data/mnist_scale.png for
+// visual inspection. Fails early when the input image cannot be read.
+TEST(MnistLayerTest, SingleScale){
+  LayerProto proto;
+  MnistProto *mnist=proto.mutable_mnist_param();
+  mnist->set_size(55);
+  MnistImageLayer layer;
+  layer.FromProto(proto);
+  cv::Mat image;
+  image=cv::imread("src/test/data/mnist.png", 0);
+  // imread returns an empty matrix when the file is missing or unreadable;
+  // stop here instead of pushing an empty record through the layer
+  ASSERT_FALSE(image.empty())<<"cannot read src/test/data/mnist.png";
+  // copy the pixels row by row into the record's byte string
+  string pixel;
+  pixel.resize(image.rows*image.cols);
+  for(int i=0,k=0;i<image.rows;i++)
+    for(int j=0; j<image.cols;j++)
+      pixel[k++]=static_cast<char>(image.at<uint8_t>(i,j));
+  Record rec;
+  rec.set_type(Record_Type_kMnist);
+  MnistRecord *mrec=rec.mutable_mnist();
+  mrec->set_pixel(pixel);
+  layer.Setup(1, rec, kNone);
+  layer.AddInputRecord(rec);
+
+  // the converted data is treated as a square image of side sqrt(size)
+  const vector<uint8_t>& dat=layer.Convert2Image(0);
+  int s=static_cast<int>(sqrt(dat.size()));
+  cv::Mat newimg(s,s,CV_8UC1);
+  int count=0;  // number of non-zero pixels; only used by the debug log below
+  for(int i=0,k=0;i<newimg.rows;i++)
+    for(int j=0; j<newimg.cols;j++){
+      count+=dat[k]>0;
+      newimg.at<uint8_t>(i,j)=dat[k++];
+    }
+  //LOG(ERROR)<<"image positive "<<count<<" size "<<s;
+  cv::imwrite("src/test/data/mnist_scale.png", newimg);
+}
+
+// Same flow as SingleScale but with beta=15 and gamma=16 set on the layer;
+// the converted image is written to src/test/data/mnist_affine.png.
+// Fails early when the input image cannot be read.
+TEST(MnistLayerTest, SingleAffineTransform){
+  LayerProto proto;
+  MnistProto *mnist=proto.mutable_mnist_param();
+  mnist->set_beta(15);
+  mnist->set_gamma(16);
+  mnist->set_size(55);
+  MnistImageLayer layer;
+  layer.FromProto(proto);
+  cv::Mat image;
+  image=cv::imread("src/test/data/mnist.png", 0);
+  // imread returns an empty matrix when the file is missing or unreadable;
+  // stop here instead of pushing an empty record through the layer
+  ASSERT_FALSE(image.empty())<<"cannot read src/test/data/mnist.png";
+  // copy the pixels row by row into the record's byte string
+  string pixel;
+  pixel.resize(image.rows*image.cols);
+  for(int i=0,k=0;i<image.rows;i++)
+    for(int j=0; j<image.cols;j++)
+      pixel[k++]=static_cast<char>(image.at<uint8_t>(i,j));
+  Record rec;
+  rec.set_type(Record_Type_kMnist);
+  MnistRecord *mrec=rec.mutable_mnist();
+  mrec->set_pixel(pixel);
+  layer.Setup(1, rec, kNone);
+  layer.AddInputRecord(rec);
+
+  // the converted data is treated as a square image of side sqrt(size)
+  const vector<uint8_t>& dat=layer.Convert2Image(0);
+  int s=static_cast<int>(sqrt(dat.size()));
+  cv::Mat newimg(s,s,CV_8UC1);
+  int count=0;  // number of non-zero pixels; only used by the debug log below
+  for(int i=0,k=0;i<newimg.rows;i++)
+    for(int j=0; j<newimg.cols;j++){
+      count+=dat[k]>0;
+      newimg.at<uint8_t>(i,j)=dat[k++];
+    }
+  //LOG(ERROR)<<"image positive "<<count<<" size "<<s;
+
+  cv::imwrite("src/test/data/mnist_affine.png", newimg);
+}
+// Same flow as SingleScale with the elastic-distortion parameters set as well
+// (elastic_freq=1, sigma=6, alpha=36, kernel=21, beta=15, gamma=16); the
+// converted image is written to src/test/data/mnist_elastic.png.
+// Fails early when the input image cannot be read.
+TEST(MnistLayerTest, SingleElasticDistortion){
+  LayerProto proto;
+  MnistProto *mnist=proto.mutable_mnist_param();
+  mnist->set_elastic_freq(1);
+  mnist->set_sigma(6);
+  mnist->set_alpha(36);
+  mnist->set_beta(15);
+  mnist->set_gamma(16);
+  mnist->set_size(55);
+  mnist->set_kernel(21);
+  MnistImageLayer layer;
+  layer.FromProto(proto);
+  cv::Mat image;
+  image=cv::imread("src/test/data/mnist.png", 0);
+  // imread returns an empty matrix when the file is missing or unreadable;
+  // stop here instead of pushing an empty record through the layer
+  ASSERT_FALSE(image.empty())<<"cannot read src/test/data/mnist.png";
+  // copy the pixels row by row into the record's byte string
+  string pixel;
+  pixel.resize(image.rows*image.cols);
+  for(int i=0,k=0;i<image.rows;i++)
+    for(int j=0; j<image.cols;j++)
+      pixel[k++]=static_cast<char>(image.at<uint8_t>(i,j));
+  Record rec;
+  rec.set_type(Record_Type_kMnist);
+  MnistRecord *mrec=rec.mutable_mnist();
+  mrec->set_pixel(pixel);
+  layer.Setup(1, rec, kNone);
+  layer.AddInputRecord(rec);
+
+  // the converted data is treated as a square image of side sqrt(size)
+  const vector<uint8_t>& dat=layer.Convert2Image(0);
+  int s=static_cast<int>(sqrt(dat.size()));
+  cv::Mat newimg(s,s,CV_8UC1);
+  int count=0;  // number of non-zero pixels; not reported anywhere
+  for(int i=0,k=0;i<newimg.rows;i++)
+    for(int j=0; j<newimg.cols;j++){
+      count+=dat[k]>0;
+      newimg.at<uint8_t>(i,j)=dat[k++];
+    }
+  cv::imwrite("src/test/data/mnist_elastic.png", newimg);
+}
+// Reads kTotal records from the MNIST test shard, tiles the originals into an
+// n x n mosaic (mnist_big.png), runs them through an MnistImageLayer with
+// elastic distortion enabled and tiles the converted images into a second
+// mosaic (mnist_bigout.png). The labels are logged in mosaic layout.
+TEST(MnistLayerTest, MultElasticDistortion){
+  LayerProto proto;
+  MnistProto *mnist=proto.mutable_mnist_param();
+  int kTotal=100;
+  int kSize=29;
+  mnist->set_elastic_freq(kTotal);
+  mnist->set_sigma(6);
+  mnist->set_alpha(36);
+  mnist->set_beta(15);
+  mnist->set_gamma(16);
+  mnist->set_size(kSize);
+  mnist->set_kernel(21);
+  MnistImageLayer layer;
+  layer.FromProto(proto);
+  vector<vector<int>> shapes{{kTotal, kSize,kSize}};
+  layer.Setup(shapes, kNone);
+  shard::Shard source("/data1/wangwei/singa/data/mnist/test/",shard::Shard::kRead);
+  int n=static_cast<int>(sqrt(kTotal));
+  cv::Mat origin(n*28,n*28, CV_8UC1);
+  // must start out as an empty C string: every sprintf below appends at
+  // disp+strlen(disp), which is undefined on an uninitialized buffer
+  char disp[1024]="";
+  for(int x=0;x<n;x++){
+    sprintf(disp+strlen(disp), "\n");
+    for(int y=0;y<n;y++){
+      Record rec;
+      string key;
+      CHECK(source.Next(&key, &rec));
+      const string pixel=rec.mnist().pixel();
+      // view onto the 28x28 tile of the mosaic at row x, column y
+      cv::Mat img=origin(cv::Rect(y*28, x*28, 28, 28));
+      for(int i=0,k=0;i<28;i++)
+        for(int j=0;j<28;j++)
+          img.at<uint8_t>(i,j)=static_cast<uint8_t>(pixel[k++]);
+      layer.AddInputRecord(rec);
+      sprintf(disp+strlen(disp), "%d ", rec.mnist().label());
+    }
+  }
+  LOG(ERROR)<<disp;
+  cv::imwrite("src/test/data/mnist_big.png", origin);
+
+  cv::Mat output(n*kSize,n*kSize, CV_8UC1);
+  for(int i=0;i<kTotal;i++){
+    const vector<uint8_t>& dat=layer.Convert2Image(i);
+    int x=(i/n);
+    int y=i%n;
+    cv::Mat img=output(cv::Rect(y*kSize, x*kSize, kSize, kSize));
+    // separate index name for the tile rows so the image index i is not shadowed
+    for(int r=0,k=0;r<kSize;r++)
+      for(int j=0;j<kSize;j++)
+        img.at<uint8_t>(r,j)=dat[k++];
+  }
+  cv::imwrite("src/test/data/mnist_bigout.png", output);
+}

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/b2dc51d2/src/test/dist_test/test_model.cc
----------------------------------------------------------------------
diff --git a/src/test/dist_test/test_model.cc b/src/test/dist_test/test_model.cc
new file mode 100644
index 0000000..c3f98b9
--- /dev/null
+++ b/src/test/dist_test/test_model.cc
@@ -0,0 +1,25 @@
+// Copyright © 2014 Wei Wang. All Rights Reserved.
+// 2014-08-02 14:13
+#include <glog/logging.h>
+#include <gflags/gflags.h>
+
+
+#include "model/sgd_trainer.h"
+#include "model/net.h"
+#include "proto/model.pb.h"
+#include "utils/proto_helper.h"
+
+DEFINE_int32(v, 1, "vlog");
+
+// Loads the model configuration from examples/imagenet12/model.conf,
+// initializes an SGD trainer and a net from it and runs the trainer.
+int main(int argc, char** argv) {
+  FLAGS_logtostderr=1;
+  google::InitGoogleLogging(argv[0]);
+  gflags::ParseCommandLineFlags(&argc, &argv, true);
+
+  lapis::ModelProto model;
+  lapis::ReadProtoFromTextFile("examples/imagenet12/model.conf", &model);
+
+  lapis::SGDTrainer sgd;
+  sgd.Init(model.trainer());
+
+  lapis::Net network;
+  network.Init(model.net());
+
+  sgd.Run(&network);
+  return 0;
+}

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/b2dc51d2/src/test/dist_test/test_neuralnet.cc
----------------------------------------------------------------------
diff --git a/src/test/dist_test/test_neuralnet.cc b/src/test/dist_test/test_neuralnet.cc
new file mode 100644
index 0000000..a857124
--- /dev/null
+++ b/src/test/dist_test/test_neuralnet.cc
@@ -0,0 +1,141 @@
+#include <gtest/gtest.h>
+#include <model/neuralnet.h>
+#include "proto/model.pb.h"
+#include "utils/common.h"
+#include "utils/param_updater.h"
+
+using namespace singa;
+// Returns the neuralnet section of the model stored in examples/mnist/mlp.conf.
+NetProto CreateMLPProto(){
+  ModelProto mlp_model;
+  ReadProtoFromTextFile("examples/mnist/mlp.conf", &mlp_model);
+  NetProto net_proto=mlp_model.neuralnet();
+  return net_proto;
+}
+// Runs three forward/backward passes over the net described in
+// examples/mnist/mlp.conf, applying the AdaGrad updater to every parameter
+// of a layer right after that layer's gradient has been computed.
+TEST(NeuralnetTest, BP){
+  ModelProto model;
+  ReadProtoFromTextFile("examples/mnist/mlp.conf", &model);
+
+  AdaGradUpdater updater;
+  updater.Init(model.solver().updater());
+
+  NeuralNet net(model.neuralnet());
+  auto layers=net.layers();
+  for(int step=0;step<3;step++){
+    // forward pass; the first layer is cast to DataLayer and has to complete
+    // its prefetch right after computing its features
+    bool at_data_layer=true;
+    for(auto& layer: layers){
+      layer->ComputeFeature();
+      if(!at_data_layer)
+        continue;
+      static_cast<DataLayer*>(layer.get())->CompletePrefetch();
+      at_data_layer=false;
+    }
+
+    // backward pass, last layer first
+    for(int k=static_cast<int>(layers.size());k-->0;){
+      layers[k]->ComputeGradient();
+      for(Param* param: layers[k]->GetParams())
+        updater.Update(step, param);
+    }
+  }
+}
+// Builds, in code, the description of an 8-layer network:
+// data -> {mnist, label}; mnist -> conv1 -> relu1 -> pool1 -> fc1;
+// {fc1, label} -> loss.
+NetProto CreateConvNetProto(){
+  NetProto net;
+
+  // source layer: batches of 8 records from the training shard
+  LayerProto *data_layer=net.add_layer();
+  data_layer->set_name("data");
+  data_layer->set_type("kShardData");
+  DataProto *data_conf=data_layer->mutable_data_param();
+  data_conf->set_batchsize(8);
+  data_conf->set_path("/data1/wangwei/singa/data/mnist/train/");
+
+  // image parser fed by the data layer
+  LayerProto *image_layer=net.add_layer();
+  image_layer->set_name("mnist");
+  image_layer->set_type("kMnistImage");
+  image_layer->add_srclayers("data");
+
+  // label parser fed by the data layer
+  LayerProto *label_layer=net.add_layer();
+  label_layer->set_name("label");
+  label_layer->set_type("kLabel");
+  label_layer->add_srclayers("data");
+
+  // convolution: 8 filters, kernel 2, two parameter entries
+  LayerProto *conv_layer=net.add_layer();
+  conv_layer->set_name("conv1");
+  conv_layer->set_type("kConvolution");
+  conv_layer->add_srclayers("mnist");
+  conv_layer->add_param();
+  conv_layer->add_param();
+  ConvolutionProto *conv_conf=conv_layer->mutable_convolution_param();
+  conv_conf->set_num_filters(8);
+  conv_conf->set_kernel(2);
+
+  // ReLU on top of the convolution
+  LayerProto *relu_layer=net.add_layer();
+  relu_layer->set_name("relu1");
+  relu_layer->set_type("kReLU");
+  relu_layer->add_srclayers("conv1");
+
+  // pooling: kernel 4, stride 2
+  LayerProto *pool_layer=net.add_layer();
+  pool_layer->set_name("pool1");
+  pool_layer->set_type("kPooling");
+  pool_layer->add_srclayers("relu1");
+  PoolingProto *pool_conf=pool_layer->mutable_pooling_param();
+  pool_conf->set_kernel(4);
+  pool_conf->set_stride(2);
+
+  // inner product with 10 outputs, two parameter entries
+  LayerProto *fc_layer=net.add_layer();
+  fc_layer->set_name("fc1");
+  fc_layer->set_type("kInnerProduct");
+  fc_layer->add_srclayers("pool1");
+  fc_layer->add_param();
+  fc_layer->add_param();
+  InnerProductProto *fc_conf=fc_layer->mutable_inner_product_param();
+  fc_conf->set_num_output(10);
+
+  // softmax loss over the fc output and the labels
+  LayerProto *loss_layer=net.add_layer();
+  loss_layer->set_name("loss");
+  loss_layer->set_type("kSoftmaxLoss");
+  loss_layer->add_srclayers("fc1");
+  loss_layer->add_srclayers("label");
+
+  return net;
+}
+
+// Without partitioning, the net must contain exactly the 8 configured layers,
+// starting with "data" and ending with "loss".
+TEST(NeuralNetTest, NoPartition){
+  NetProto conf=CreateConvNetProto();
+  NeuralNet network(conf);
+  const auto& built=network.layers();
+  ASSERT_EQ(8, built.size());
+  ASSERT_EQ("data", built.at(0)->name());
+  ASSERT_EQ("loss", built.at(7)->name());
+}
+
+// With kDataPartition and a second constructor argument of 3, the net is
+// expected to hold 28 layers, the first one still being named "data".
+TEST(NeuralNetTest, DataPartition){
+  NetProto conf=CreateConvNetProto();
+  conf.set_partition_type(kDataPartition);
+  NeuralNet network(conf, 3);
+  const auto& built=network.layers();
+  ASSERT_EQ(28, built.size());
+  ASSERT_EQ("data", built.at(0)->name());
+}
+// Only checks that a net can be constructed with kLayerPartition and a second
+// constructor argument of 2; nothing is asserted about the result.
+TEST(NeuralNetTest, LayerPartition){
+  NetProto conf=CreateConvNetProto();
+  conf.set_partition_type(kLayerPartition);
+  NeuralNet network(conf, 2);
+ // const auto& layers=net.layers();
+}
+// Mixed setting: the last two layers are marked kDataPartition while the net
+// as a whole is kLayerPartition. Only construction is exercised.
+TEST(NeuralNetTest, HyridPartition){
+  NetProto conf=CreateConvNetProto();
+  const int layer_count=conf.layer_size();
+  conf.mutable_layer(layer_count-2)->set_partition_type(kDataPartition);
+  conf.mutable_layer(layer_count-1)->set_partition_type(kDataPartition);
+  conf.set_partition_type(kLayerPartition);
+  NeuralNet network(conf, 2);
+}
+
+

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/b2dc51d2/src/test/dist_test/test_pm.cc
----------------------------------------------------------------------
diff --git a/src/test/dist_test/test_pm.cc b/src/test/dist_test/test_pm.cc
new file mode 100644
index 0000000..67c210a
--- /dev/null
+++ b/src/test/dist_test/test_pm.cc
@@ -0,0 +1,88 @@
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+
+#include <iostream>
+#include <fstream>
+
+#include <gflags/gflags.h>
+#include <glog/logging.h>
+#include "utils/cluster.h"
+#include "utils/common.h"
+#include "proto/model.pb.h"
+#include "proto/cluster.pb.h"
+#include "server/server.h"
+#include "server/pm_server.h"
+#include "worker/pm_client.h"
+#include "worker/worker.h"
+#include "proto/topology.pb.h"
+#include <string.h>
+#include <google/protobuf/text_format.h>
+#include <google/protobuf/io/zero_copy_stream_impl.h>
+
+using namespace google::protobuf::io;
+using google::protobuf::TextFormat;
+
+using std::ifstream;
+
+/**
+ * Testing put/get/update performance of the new zeromq-based parameter
+ * servers.
+ */
+// Command-line flags. Within this file main() reads only topology_config,
+// hostfile and node_id; the remaining flags are defined here but not
+// referenced in this translation unit.
+DEFINE_int32(procsID, 0, "global process ID");
+DEFINE_string(hostfile, "examples/imagenet12/hostfile", "hostfile");
+DEFINE_string(cluster_conf, "examples/imagenet12/cluster.conf",
+    "configuration file for the cluster");
+DEFINE_string(model_conf, "examples/imagenet12/model.conf",
+    "Deep learning model configuration file");
+
+// Text-format singa::Topology message parsed by main().
+DEFINE_string(topology_config,"examples/imagenet12/topology.conf", "Network of servers");
+DEFINE_int32(server_threads,1,"Number of server's worker threads per process");
+DEFINE_int32(client_threads,1,"Number of client's worker threads per process");
+
+DEFINE_string(mode, "client", "client or server mode");
+// main() starts a server when node_id < topology.nservers(), otherwise a
+// client.
+DEFINE_int32(node_id, 0, "ID of the node, client or server");
+DEFINE_int32(primary_set, 0, "ID of the primary server set (for client mode only)");
+
+/**
+ * Reads the topology file and starts either the client or the server,
+ * depending on the node ID.
+ *
+ * Usage: test_pm --node_id <id>
+ */
+
+
+// NOTE(review): gflags exposes FLAGS_v as a variable, not a preprocessor
+// macro, so this guard is presumably always true - confirm it does not
+// clash with the "v" flag that glog declares.
+#ifndef FLAGS_v
+  DEFINE_int32(v, 3, "vlog controller");
+#endif
+
+/**
+ * Entry point of the parameter-server test.
+ *
+ * Parses the topology (text-format protobuf) named by --topology_config and
+ * the host list named by --hostfile, then starts a SingaServer when
+ * --node_id is smaller than topology.nservers() and a SingaClient otherwise.
+ *
+ * Aborts through glog CHECK if the topology file cannot be opened or parsed.
+ */
+int main(int argc, char **argv) {
+	google::InitGoogleLogging(argv[0]);
+	gflags::ParseCommandLineFlags(&argc, &argv, true);
+	FLAGS_logtostderr = 1;
+
+
+	//Read in the topology file. CHECK is used instead of assert() so that a
+	//missing file is still detected when NDEBUG is defined.
+	int fd = open(FLAGS_topology_config.c_str(), O_RDONLY);
+	CHECK_NE(fd, -1) << "cannot open topology file " << FLAGS_topology_config;
+	singa::Topology topology;
+	{
+		//The stream lives on the stack (it used to be leaked through new) and
+		//closes fd when it goes out of scope.
+		FileInputStream topology_stream(fd);
+		topology_stream.SetCloseOnDelete(true);
+		CHECK(TextFormat::Parse(&topology_stream, &topology))
+			<< "cannot parse topology file " << FLAGS_topology_config;
+	}
+
+
+	//read host file, one host per line
+	ifstream hostfile(FLAGS_hostfile.c_str());
+	LOG_IF(WARNING, !hostfile.is_open())
+		<< "cannot open hostfile " << FLAGS_hostfile << "; host list is empty";
+	string host;
+	vector<string> hosts;
+	while (getline(hostfile, host))
+		hosts.push_back(host);
+
+	//NOTE(review): the server/client objects are never deleted; presumably
+	//harmless because the process exits right after Start*() returns -
+	//confirm that no destructor side effects are required.
+	if (FLAGS_node_id < topology.nservers()) {
+		singa::SingaServer *server = new singa::SingaServer(FLAGS_node_id, topology, hosts);
+		server->StartServer();
+	} else {
+		singa::SingaClient *client = new singa::SingaClient(FLAGS_node_id, topology, hosts);
+		client->StartClient();
+	}
+
+	return 0;
+}

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/b2dc51d2/src/test/dist_test/test_router.cc
----------------------------------------------------------------------
diff --git a/src/test/dist_test/test_router.cc b/src/test/dist_test/test_router.cc
new file mode 100644
index 0000000..bed3d99
--- /dev/null
+++ b/src/test/dist_test/test_router.cc
@@ -0,0 +1,27 @@
+#include <gflags/gflags.h>
+#include <gtest/gtest.h>
+#include "utils/router.h"
+#include "utils/common.h"
+#include "utils/cluster.h"
+// Command-line flags; all three are passed to the cluster setup in main().
+DEFINE_string(hostfile, "examples/imagenet12/hostfile", "hostfile");
+// Text-format ClusterProto read with ReadProtoFromTextFile() in main().
+DEFINE_string(cluster_conf, "examples/imagenet12/cluster.conf",
+    "configuration file for the cluster");
+DEFINE_int32(procsID, 0, "global process ID");
+
+// Router connectivity test: loads the cluster configuration, then either
+// binds a Router (server role) or connects one to the first server address
+// (worker role). Both roles use port 5732 and CHECK-fail on error.
+int main(int argc, char** argv){
+  google::InitGoogleLogging(argv[0]);
+  gflags::ParseCommandLineFlags(&argc, &argv, true);
+
+  // Load the cluster description and obtain the cluster handle.
+  // NOTE(review): the Cluster::Get definition shown in cluster.cc takes
+  // (cluster, procs_id) only - confirm a three-argument overload exists.
+  singa::ClusterProto cluster_proto;
+  singa::ReadProtoFromTextFile(FLAGS_cluster_conf.c_str(), &cluster_proto);
+  auto cluster_info=singa::Cluster::Get(cluster_proto, FLAGS_hostfile, FLAGS_procsID);
+
+  if(!cluster_info->AmIServer()){
+    // Worker role: connect to the first server.
+    singa::Router worker(5732);
+    CHECK(worker.Connect(cluster_info->server_addr(0)));
+    return 0;
+  }
+
+  // Server role: bind, passing the number of workers.
+  singa::Router server(5732);
+  CHECK(server.Bind(cluster_info->server_addr(0), cluster_info->nworkers()));
+  return 0;
+}

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/b2dc51d2/src/test/dist_test/test_split.cc
----------------------------------------------------------------------
diff --git a/src/test/dist_test/test_split.cc b/src/test/dist_test/test_split.cc
new file mode 100644
index 0000000..674d546
--- /dev/null
+++ b/src/test/dist_test/test_split.cc
@@ -0,0 +1,304 @@
+//  Copyright © 2014 Anh Dinh. All Rights Reserved.
+
+
+//  Testing the imbalance in splitting parameter vectors.
+
+#include "core/global-table.h"
+#include "core/common.h"
+#include "core/disk-table.h"
+#include "core/table.h"
+#include "core/table_server.h"
+#include "utils/global_context.h"
+#include <gflags/gflags.h>
+#include "proto/model.pb.h"
+#include "worker.h"
+#include "coordinator.h"
+//#include "model_controller/myacc.h"
+#include "utils/common.h"
+
+#include <stdlib.h>
+#include <unistd.h>
+
+#include <cmath>
+#include <fstream>
+#include <iostream>
+#include <vector>
+
+using namespace lapis;
+using std::vector;
+
+//DEFINE_bool(sync_update, false, "Synchronous put/update queue");
+// Configuration files handed to GlobalContext::Get() in main().
+DEFINE_string(system_conf, "examples/imagenet12/system.conf", "configuration file for node roles");
+DEFINE_string(model_conf, "examples/imagenet12/model.conf", "DL model configuration file");
+// Upper bound on the number of floats per FloatVector chunk when the entries
+// of sizes[] are split in init_messages() / coordinator_load_data().
+DEFINE_int64(threshold,1000000, "max # of parameters in a vector");
+// Number of get+update rounds executed by worker_test_data().
+DEFINE_int32(iterations,5,"numer of get/put iterations");
+// Only processes whose network id is below this value run worker_test_data().
+DEFINE_int32(workers,2,"numer of workers doing get/put");
+// NOTE(review): gflags exposes FLAGS_v as a variable, not a preprocessor
+// macro, so this guard is presumably always true - confirm it does not
+// clash with the "v" flag that glog declares.
+#ifndef FLAGS_v
+  DEFINE_int32(v, 3, "vlog controller");
+#endif
+
+// Tables keyed by table id; this test only ever creates table 0.
+typedef map<int, GlobalTable*> Map;
+Map tables;
+// Both handles are assigned in main().
+shared_ptr<NetworkThread> network;
+shared_ptr<GlobalContext> context;
+// Filled by coordinator_assign_tables(), one entry per table server.
+std::vector<ServerState*> server_states;
+// Created by worker_table_init() on non-coordinator processes.
+TableServer *table_server;
+
+// Not referenced anywhere else in this file.
+FloatVector large_msg, small_msg;
+// Number of entries in sizes[].
+const int SIZE=16;
+
+// Number of floats in each parameter vector that gets split into chunks.
+long sizes[] = { 37448736, 16777216, 4096000, 1327104, 884736, 884736, 614400,
+		14112, 4096, 4096, 1000, 384, 384, 256, 256, 96 };
+
+// Chunks built by init_messages(); update() sends value_msg[i] for key i.
+vector<FloatVector*> value_msg;
+
+// Number of chunks (= table keys); set by init_messages() and
+// coordinator_load_data().
+int num_keys;
+
+// Builds the update payloads used by update(): every entry of sizes[] is cut
+// into chunks of at most FLAGS_threshold random floats in [0, 1]. The chunks
+// are appended to value_msg and num_keys is set to the number of chunks
+// created by this call.
+void init_messages(){
+	// With a non-positive threshold `total` would never advance and the loop
+	// below would allocate chunks forever, so reject it up front.
+	CHECK_GT(FLAGS_threshold, 0) << "--threshold must be positive";
+	num_keys = 0;
+	for (int i=0; i<SIZE; i++){
+		// Kept as long (the type of sizes[]) so that a large --threshold is
+		// not truncated by narrowing to int.
+		const long threshold=FLAGS_threshold;
+		VLOG(3)<<"worker: "<<threshold;
+		long total=0;
+		while (total<sizes[i]){
+			FloatVector* fv = new FloatVector();
+			// The last chunk of a vector may be shorter than threshold.
+			for (long j=0; j+total<sizes[i] && j<threshold; j++)
+				fv->add_data(static_cast<float>(rand())/static_cast<float>(RAND_MAX));
+			value_msg.push_back(fv);
+			total+=threshold;
+			num_keys++;
+		}
+	}
+}
+
+// Creates a TypedGlobalTable<int, FloatVector> with `num_shards` shards and
+// registers it in the global `tables` map under `id`.
+void create_mem_table(int id, int num_shards){
+	TableDescriptor *desc = new TableDescriptor(id, num_shards);
+
+	// (De)serialisation of keys and values.
+	desc->key_marshal = new Marshal<int>();
+	desc->value_marshal = new Marshal<FloatVector>();
+
+	// Sharding, accumulation and partition storage.
+	desc->sharder = new Sharding::Mod;
+	desc->accum = new MyAcc();
+	desc->partition_factory = new typename SparseTable<int, FloatVector>::Factory;
+
+	TypedGlobalTable<int, FloatVector> *new_table =
+		new TypedGlobalTable<int, FloatVector>();
+	new_table->Init(desc);
+	tables[id] = new_table;
+}
+
+// Coordinator-side setup for table `id`: waits for the other processes to
+// register, assigns the table's shards to the table servers round-robin,
+// records the owners in the local table and broadcasts the assignment.
+void coordinator_assign_tables(int id){
+	// Read one MTYPE_REGISTER_WORKER message per non-coordinator process.
+	for (int i = 0; i < context->num_processes()-1; ++i) {
+	    RegisterWorkerRequest req;
+	    int src = 0;
+	    network->Read(MPI::ANY_SOURCE, MTYPE_REGISTER_WORKER, &req, &src);
+	    //  adding memory server.
+	    // NOTE(review): the check uses the loop index i, not the sender rank
+	    // src - confirm this is intended.
+	    if (context->IsTableServer(i)) {
+	      server_states.push_back(new ServerState(i));
+	    }
+	  }
+	  LOG(INFO) << " All servers registered and started up. Ready to go";
+	  //  set itself as the current worker for the table
+	  tables[id]->worker_id_ = network->id();
+
+	  // memory servers are specified in global context. Round-robin assignment
+
+	    VLOG(3)<<"num of shards"<<tables[id]->num_shards()<<" for table"<< id;
+
+	    // NOTE(review): if server_states is empty while the table has shards,
+	    // the indexing and the modulo below are undefined - confirm that at
+	    // least one table server is always registered.
+	    int server_idx = 0;
+	    for (int shard = 0; shard < tables[id]->num_shards(); ++shard) {
+	      ServerState &server = *server_states[server_idx];
+	      LOG(INFO) << "Assigning table ("<<id<<","<<shard<<") to server "
+	                <<server_states[server_idx]->server_id;
+
+	      // TODO(Anh) may overwrite this field if #shards>#table_servers
+	      server.shard_id = shard;
+	      server.local_shards.insert(new TaskId(id, shard));
+	      server_idx = (server_idx + 1) % server_states.size();
+	    }
+
+	  VLOG(3)<<"table assignment";
+	  //  then send table assignment
+	  ShardAssignmentRequest req;
+	  for (size_t i = 0; i < server_states.size(); ++i) {
+	    ServerState &server = *server_states[i];
+	    for (auto * task: server.local_shards) {
+	      ShardAssignment *s  = req.add_assign();
+	      s->set_new_worker(server.server_id);
+	      s->set_table(task->table);
+	      s->set_shard(task->shard);
+	      //  update local tables
+	      CHECK(tables.find(task->table)!=tables.end());
+	      GlobalTable *t = tables.at(task->table);
+	      t->get_partition_info(task->shard)->owner = server.server_id;
+	      // NOTE(review): the TaskId is freed here but its pointer stays in
+	      // server.local_shards - confirm nothing dereferences it later.
+	      delete task;
+	    }
+	  }
+	  VLOG(3)<<"finish table assignment, req size "<<req.assign_size();
+	  network->SyncBroadcast(MTYPE_SHARD_ASSIGNMENT, MTYPE_SHARD_ASSIGNMENT_DONE, req);
+	  VLOG(3)<<"finish table server init";
+}
+
+// Creates the process-wide TableServer and starts it on the global `tables`.
+void worker_table_init(){
+	table_server = new TableServer();
+	TableServer &server = *table_server;
+	server.StartTableServer(tables);
+	VLOG(3) << "done starting table server";
+}
+
+// Returns rand() divided by RAND_MAX as a double, i.e. a value in [0, 1].
+double random_double(){
+	const double numerator = static_cast<double>(rand());
+	const double denominator = static_cast<double>(RAND_MAX);
+	return numerator/denominator;
+}
+
+// Populates table 0 with random data: every entry of sizes[] is cut into
+// chunks of at most FLAGS_threshold random floats in [0, 1], and each chunk
+// is put into the table under consecutive keys starting at 0. num_keys ends
+// up as the number of chunks written.
+void coordinator_load_data(){
+	auto table = static_cast<TypedGlobalTable<int,FloatVector>*>(tables[0]);
+
+	// With a non-positive threshold `total` would never advance and the loop
+	// below would keep allocating and putting chunks forever.
+	CHECK_GT(FLAGS_threshold, 0) << "--threshold must be positive";
+	// Kept as long (the type of sizes[]) so that a large --threshold is not
+	// truncated by narrowing to int.
+	const long threshold=FLAGS_threshold;
+
+	num_keys = 0;
+	for (int i = 0; i < SIZE; i++) {
+		long total = 0;
+		while (total < sizes[i]) {
+			// NOTE(review): fv is never freed in this function; presumably
+			// put() copies or serialises the value - confirm before changing
+			// this to a stack object.
+			FloatVector* fv = new FloatVector();
+			// The last chunk of a vector may be shorter than threshold.
+			for (long j = 0; j + total < sizes[i] && j < threshold; j++)
+				fv->add_data(
+						static_cast<float>(rand())
+						/ static_cast<float>(RAND_MAX));
+			table->put(num_keys,*fv);
+			total += threshold;
+			num_keys++;
+		}
+	}
+	VLOG(3) << "Loaded data successfully ... " << num_keys << " messages";
+}
+
+// Issues an asynchronous get for every key in [0, num_keys) and then polls
+// until no key is left pending. Two timings are written to `latency`: the
+// time to issue all requests ("send get") and the time to collect the
+// outstanding replies ("collect get").
+void get(TypedGlobalTable<int,FloatVector>* table, ofstream &latency){
+	StateQueue<int> state(num_keys);
+	FloatVector v;
+	/* Synchronous per-key variant, kept for reference:
+	for (int i=0; i<num_keys; i++){
+		start = Now();
+		table->get(i);
+		end=Now();
+		latency << "get: " << (end - start) << endl;
+	}
+	*/
+	double start=Now();
+	for (int i=0; i<num_keys; i++){
+		// A true return marks the key as no longer pending (presumably the
+		// value was already available - confirm against async_get).
+		if(table->async_get(i, &v))
+			state.Invalid(i);
+	}
+	latency << "send get: " << (Now() - start) << endl;
+	start=Now();
+	while(state.HasValid()){
+		int key=state.Next();
+		if(table->async_get_collect(&key, &v))
+			state.Invalid(key);
+		// Back off for 1 ms between polls. The previous sleep(0.001) took an
+		// unsigned int, so the argument truncated to 0 and the loop spun.
+		usleep(1000);
+	}
+	latency << "collect get: " << (Now() - start) << endl;
+}
+
+// Sends value_msg[key] as an update for every key in [0, num_keys) and
+// writes the duration of each individual call to `latency`.
+void update(TypedGlobalTable<int,FloatVector>* table, ofstream &latency){
+	for (int key=0; key<num_keys; ++key){
+		const double begin = Now();
+		table->update(key,*value_msg[key]);
+		const double finish = Now();
+		latency << "update: " << (finish - begin) << endl;
+	}
+}
+
+// Worker benchmark loop: builds the update payloads, then runs
+// FLAGS_iterations rounds of get() followed by update() against table 0.
+// Per-operation timings go to "latency_<id>", per-round timings to
+// "throughput_<id>", where <id> is this process's network id. Each round is
+// followed by a 10 second pause.
+void worker_test_data(){
+	init_messages();
+	auto param_table = static_cast<TypedGlobalTable<int,FloatVector>*>(tables[0]);
+
+	ofstream latency_log(StringPrintf("latency_%d",NetworkThread::Get()->id()));
+	ofstream throughput_log(StringPrintf("throughput_%d", NetworkThread::Get()->id()));
+
+	for (int round=0; round<FLAGS_iterations; ++round){
+		const double get_begin = Now();
+		get(param_table, latency_log);
+		const double get_end = Now();
+		throughput_log << "get: " << (get_end - get_begin) << " over " << num_keys << " ops " << endl;
+
+		const double update_begin = Now();
+		update(param_table, latency_log);
+		const double update_end = Now();
+		throughput_log << "update: " << (update_end - update_begin) << " over " << num_keys << " ops " << endl;
+
+		sleep(10);
+	}
+
+	latency_log.close();
+	throughput_log.close();
+}
+
+// Writes the "TABLE_SIZE" statistic of table 0 to the file
+// "log_variance_<id>", where <id> is this process's network id.
+void print_table_stats(){
+	auto param_table = static_cast<TypedGlobalTable<int,FloatVector>*>(tables[0]);
+	ofstream stats_log(StringPrintf("log_variance_%d", NetworkThread::Get()->id()));
+	stats_log << "table size at process "<< NetworkThread::Get()->id()
+		<<" = " << param_table->stats()["TABLE_SIZE"] << endl;
+	stats_log.close();
+}
+
+// Shutdown handshake.
+// Worker: flush, send MTYPE_WORKER_END to the coordinator, wait for
+// MTYPE_WORKER_SHUTDOWN, stop the table server, shut the network down.
+// Coordinator: read one MTYPE_WORKER_END per other process, send
+// MTYPE_WORKER_SHUTDOWN to ranks 0..size-2, flush, shut the network down.
+void shutdown(){
+	if (!context->AmICoordinator()){
+		VLOG(3) << "Worker " << network->id() << " is shutting down ...";
+		network->Flush();
+		VLOG(3) << "Done flushing the network thread";
+		network->Send(GlobalContext::kCoordinatorRank, MTYPE_WORKER_END, EmptyMessage());
+		EmptyMessage reply;
+		network->Read(GlobalContext::kCoordinatorRank, MTYPE_WORKER_SHUTDOWN, &reply);
+		VLOG(3) << "Worker received MTYPE_WORKER_SHUTDOWN";
+
+		table_server->ShutdownTableServer();
+		VLOG(3) << "Flushing node " << network->id();
+		network->Shutdown();
+		return;
+	}
+
+	VLOG(3) << "Coordinator is shutting down ...";
+	// Wait until every other process has reported that it is done.
+	EmptyMessage end_msg;
+	for (int received=0; received<context->num_processes()-1; ++received)
+		network->Read(MPI::ANY_SOURCE, MTYPE_WORKER_END, &end_msg);
+
+	// Then release them all.
+	EmptyMessage shutdown_msg;
+	for (int rank = 0; rank < network->size() - 1; ++rank) {
+		network->Send(rank, MTYPE_WORKER_SHUTDOWN, shutdown_msg);
+	}
+	network->Flush();
+	network->Shutdown();
+}
+
+
+// Entry point of the split test. Every process creates table 0 with one
+// shard per table server. The coordinator assigns shards and loads data;
+// the other processes start a table server. After the barrier, the
+// non-coordinators dump table statistics, and those with a network id below
+// --workers run the get/update benchmark. Finally all processes perform the
+// shutdown handshake.
+int main(int argc, char **argv) {
+	FLAGS_logtostderr = 1;
+	google::InitGoogleLogging(argv[0]);
+	gflags::ParseCommandLineFlags(&argc, &argv, true);
+
+	context = GlobalContext::Get(FLAGS_system_conf, FLAGS_model_conf);
+	network = NetworkThread::Get();
+	VLOG(3) << "*** testing memory servers, with "
+			<< context->num_table_servers() << " servers";
+
+	create_mem_table(0,context->num_table_servers());
+
+	LOG(INFO)<<"threshold: "<<FLAGS_threshold<<" nworkers: "<<FLAGS_workers;
+
+	// Role-specific setup before the barrier.
+	const bool is_coordinator = context->AmICoordinator();
+	if (is_coordinator){
+		coordinator_assign_tables(0);
+		coordinator_load_data();
+	}
+	else{
+		worker_table_init();
+	}
+	network->barrier();
+
+	// Only non-coordinator processes do work after the barrier.
+	if (!is_coordinator){
+		VLOG(3) << "passed the barrier";
+		print_table_stats();
+
+		//Sleep(1);
+		if(network->id()<FLAGS_workers)
+			worker_test_data();
+	}
+
+	shutdown();
+	return 0;
+}
+
+


[04/12] incubator-singa git commit: Transfer code from nusinga repo to singa apache repo. New commuinication framework is implemented to unify the frameworks of existing distributed deep learning systems. Communication is now implmented using ZeroMQ. API

Posted by wa...@apache.org.
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/b2dc51d2/src/proto/model.pb.h
----------------------------------------------------------------------
diff --git a/src/proto/model.pb.h b/src/proto/model.pb.h
new file mode 100644
index 0000000..bc4d952
--- /dev/null
+++ b/src/proto/model.pb.h
@@ -0,0 +1,8167 @@
+// Generated by the protocol buffer compiler.  DO NOT EDIT!
+// source: model.proto
+
+#ifndef PROTOBUF_model_2eproto__INCLUDED
+#define PROTOBUF_model_2eproto__INCLUDED
+
+#include <string>
+
+#include <google/protobuf/stubs/common.h>
+
+#if GOOGLE_PROTOBUF_VERSION < 2005000
+#error This file was generated by a newer version of protoc which is
+#error incompatible with your Protocol Buffer headers.  Please update
+#error your headers.
+#endif
+#if 2005000 < GOOGLE_PROTOBUF_MIN_PROTOC_VERSION
+#error This file was generated by an older version of protoc which is
+#error incompatible with your Protocol Buffer headers.  Please
+#error regenerate this file with a newer version of protoc.
+#endif
+
+#include <google/protobuf/generated_message_util.h>
+#include <google/protobuf/message.h>
+#include <google/protobuf/repeated_field.h>
+#include <google/protobuf/extension_set.h>
+#include <google/protobuf/generated_enum_reflection.h>
+#include <google/protobuf/unknown_field_set.h>
+// @@protoc_insertion_point(includes)
+
+namespace singa {
+
+// Internal implementation detail -- do not call these.
+void  protobuf_AddDesc_model_2eproto();
+void protobuf_AssignDesc_model_2eproto();
+void protobuf_ShutdownFile_model_2eproto();
+
+class ModelProto;
+class NetProto;
+class ParamProto;
+class BlobProtos;
+class LayerProto;
+class RGBImage;
+class SplitProto;
+class TanhProto;
+class SoftmaxLossProto;
+class ConvolutionProto;
+class ConcateProto;
+class DataProto;
+class MnistProto;
+class DropoutProto;
+class InnerProductProto;
+class LRNProto;
+class PoolingProto;
+class SliceProto;
+class ReLUProto;
+class Record;
+class Datum;
+class SingleLabelImageRecord;
+class UpdaterProto;
+class BlobProto;
+
+enum ModelProto_GradCalcAlg {
+  ModelProto_GradCalcAlg_kBackPropagation = 1,
+  ModelProto_GradCalcAlg_kContrastiveDivergence = 2
+};
+bool ModelProto_GradCalcAlg_IsValid(int value);
+const ModelProto_GradCalcAlg ModelProto_GradCalcAlg_GradCalcAlg_MIN = ModelProto_GradCalcAlg_kBackPropagation;
+const ModelProto_GradCalcAlg ModelProto_GradCalcAlg_GradCalcAlg_MAX = ModelProto_GradCalcAlg_kContrastiveDivergence;
+const int ModelProto_GradCalcAlg_GradCalcAlg_ARRAYSIZE = ModelProto_GradCalcAlg_GradCalcAlg_MAX + 1;
+
+const ::google::protobuf::EnumDescriptor* ModelProto_GradCalcAlg_descriptor();
+inline const ::std::string& ModelProto_GradCalcAlg_Name(ModelProto_GradCalcAlg value) {
+  return ::google::protobuf::internal::NameOfEnum(
+    ModelProto_GradCalcAlg_descriptor(), value);
+}
+inline bool ModelProto_GradCalcAlg_Parse(
+    const ::std::string& name, ModelProto_GradCalcAlg* value) {
+  return ::google::protobuf::internal::ParseNamedEnum<ModelProto_GradCalcAlg>(
+    ModelProto_GradCalcAlg_descriptor(), name, value);
+}
+enum ParamProto_InitMethod {
+  ParamProto_InitMethod_kConstant = 0,
+  ParamProto_InitMethod_kGaussian = 1,
+  ParamProto_InitMethod_kUniform = 2,
+  ParamProto_InitMethod_kPretrained = 3,
+  ParamProto_InitMethod_kGaussainSqrtFanIn = 4,
+  ParamProto_InitMethod_kUniformSqrtFanIn = 5,
+  ParamProto_InitMethod_kUniformSqrtFanInOut = 6
+};
+bool ParamProto_InitMethod_IsValid(int value);
+const ParamProto_InitMethod ParamProto_InitMethod_InitMethod_MIN = ParamProto_InitMethod_kConstant;
+const ParamProto_InitMethod ParamProto_InitMethod_InitMethod_MAX = ParamProto_InitMethod_kUniformSqrtFanInOut;
+const int ParamProto_InitMethod_InitMethod_ARRAYSIZE = ParamProto_InitMethod_InitMethod_MAX + 1;
+
+const ::google::protobuf::EnumDescriptor* ParamProto_InitMethod_descriptor();
+inline const ::std::string& ParamProto_InitMethod_Name(ParamProto_InitMethod value) {
+  return ::google::protobuf::internal::NameOfEnum(
+    ParamProto_InitMethod_descriptor(), value);
+}
+inline bool ParamProto_InitMethod_Parse(
+    const ::std::string& name, ParamProto_InitMethod* value) {
+  return ::google::protobuf::internal::ParseNamedEnum<ParamProto_InitMethod>(
+    ParamProto_InitMethod_descriptor(), name, value);
+}
+enum LRNProto_NormRegion {
+  LRNProto_NormRegion_ACROSS_CHANNELS = 0,
+  LRNProto_NormRegion_WITHIN_CHANNEL = 1
+};
+bool LRNProto_NormRegion_IsValid(int value);
+const LRNProto_NormRegion LRNProto_NormRegion_NormRegion_MIN = LRNProto_NormRegion_ACROSS_CHANNELS;
+const LRNProto_NormRegion LRNProto_NormRegion_NormRegion_MAX = LRNProto_NormRegion_WITHIN_CHANNEL;
+const int LRNProto_NormRegion_NormRegion_ARRAYSIZE = LRNProto_NormRegion_NormRegion_MAX + 1;
+
+const ::google::protobuf::EnumDescriptor* LRNProto_NormRegion_descriptor();
+inline const ::std::string& LRNProto_NormRegion_Name(LRNProto_NormRegion value) {
+  return ::google::protobuf::internal::NameOfEnum(
+    LRNProto_NormRegion_descriptor(), value);
+}
+inline bool LRNProto_NormRegion_Parse(
+    const ::std::string& name, LRNProto_NormRegion* value) {
+  return ::google::protobuf::internal::ParseNamedEnum<LRNProto_NormRegion>(
+    LRNProto_NormRegion_descriptor(), name, value);
+}
+enum PoolingProto_PoolMethod {
+  PoolingProto_PoolMethod_MAX = 0,
+  PoolingProto_PoolMethod_AVE = 1
+};
+bool PoolingProto_PoolMethod_IsValid(int value);
+const PoolingProto_PoolMethod PoolingProto_PoolMethod_PoolMethod_MIN = PoolingProto_PoolMethod_MAX;
+const PoolingProto_PoolMethod PoolingProto_PoolMethod_PoolMethod_MAX = PoolingProto_PoolMethod_AVE;
+const int PoolingProto_PoolMethod_PoolMethod_ARRAYSIZE = PoolingProto_PoolMethod_PoolMethod_MAX + 1;
+
+const ::google::protobuf::EnumDescriptor* PoolingProto_PoolMethod_descriptor();
+inline const ::std::string& PoolingProto_PoolMethod_Name(PoolingProto_PoolMethod value) {
+  return ::google::protobuf::internal::NameOfEnum(
+    PoolingProto_PoolMethod_descriptor(), value);
+}
+inline bool PoolingProto_PoolMethod_Parse(
+    const ::std::string& name, PoolingProto_PoolMethod* value) {
+  return ::google::protobuf::internal::ParseNamedEnum<PoolingProto_PoolMethod>(
+    PoolingProto_PoolMethod_descriptor(), name, value);
+}
+enum Record_Type {
+  Record_Type_kSingleLabelImage = 0
+};
+bool Record_Type_IsValid(int value);
+const Record_Type Record_Type_Type_MIN = Record_Type_kSingleLabelImage;
+const Record_Type Record_Type_Type_MAX = Record_Type_kSingleLabelImage;
+const int Record_Type_Type_ARRAYSIZE = Record_Type_Type_MAX + 1;
+
+const ::google::protobuf::EnumDescriptor* Record_Type_descriptor();
+inline const ::std::string& Record_Type_Name(Record_Type value) {
+  return ::google::protobuf::internal::NameOfEnum(
+    Record_Type_descriptor(), value);
+}
+inline bool Record_Type_Parse(
+    const ::std::string& name, Record_Type* value) {
+  return ::google::protobuf::internal::ParseNamedEnum<Record_Type>(
+    Record_Type_descriptor(), name, value);
+}
+enum UpdaterProto_ChangeProto {
+  UpdaterProto_ChangeProto_kFixed = 0,
+  UpdaterProto_ChangeProto_kInverse_t = 1,
+  UpdaterProto_ChangeProto_kInverse = 2,
+  UpdaterProto_ChangeProto_kExponential = 3,
+  UpdaterProto_ChangeProto_kLinear = 4,
+  UpdaterProto_ChangeProto_kStep = 5,
+  UpdaterProto_ChangeProto_kFixedStep = 6
+};
+bool UpdaterProto_ChangeProto_IsValid(int value);
+const UpdaterProto_ChangeProto UpdaterProto_ChangeProto_ChangeProto_MIN = UpdaterProto_ChangeProto_kFixed;
+const UpdaterProto_ChangeProto UpdaterProto_ChangeProto_ChangeProto_MAX = UpdaterProto_ChangeProto_kFixedStep;
+const int UpdaterProto_ChangeProto_ChangeProto_ARRAYSIZE = UpdaterProto_ChangeProto_ChangeProto_MAX + 1;
+
+const ::google::protobuf::EnumDescriptor* UpdaterProto_ChangeProto_descriptor();
+inline const ::std::string& UpdaterProto_ChangeProto_Name(UpdaterProto_ChangeProto value) {
+  return ::google::protobuf::internal::NameOfEnum(
+    UpdaterProto_ChangeProto_descriptor(), value);
+}
+inline bool UpdaterProto_ChangeProto_Parse(
+    const ::std::string& name, UpdaterProto_ChangeProto* value) {
+  return ::google::protobuf::internal::ParseNamedEnum<UpdaterProto_ChangeProto>(
+    UpdaterProto_ChangeProto_descriptor(), name, value);
+}
+enum MsgType {
+  kGet = 0,
+  kPut = 1,
+  kSync = 2,
+  kUpdate = 3,
+  kSyncRequest = 4,
+  kSyncResponse = 5,
+  kStop = 6,
+  kData = 7,
+  kRGet = 8,
+  kRUpdate = 9,
+  kConnect = 10
+};
+bool MsgType_IsValid(int value);
+const MsgType MsgType_MIN = kGet;
+const MsgType MsgType_MAX = kConnect;
+const int MsgType_ARRAYSIZE = MsgType_MAX + 1;
+
+const ::google::protobuf::EnumDescriptor* MsgType_descriptor();
+inline const ::std::string& MsgType_Name(MsgType value) {
+  return ::google::protobuf::internal::NameOfEnum(
+    MsgType_descriptor(), value);
+}
+inline bool MsgType_Parse(
+    const ::std::string& name, MsgType* value) {
+  return ::google::protobuf::internal::ParseNamedEnum<MsgType>(
+    MsgType_descriptor(), name, value);
+}
+enum EntityType {
+  kWorkerParam = 0,
+  kWorkerLayer = 1,
+  kServer = 2,
+  kStub = 3
+};
+bool EntityType_IsValid(int value);
+const EntityType EntityType_MIN = kWorkerParam;
+const EntityType EntityType_MAX = kStub;
+const int EntityType_ARRAYSIZE = EntityType_MAX + 1;
+
+const ::google::protobuf::EnumDescriptor* EntityType_descriptor();
+inline const ::std::string& EntityType_Name(EntityType value) {
+  return ::google::protobuf::internal::NameOfEnum(
+    EntityType_descriptor(), value);
+}
+inline bool EntityType_Parse(
+    const ::std::string& name, EntityType* value) {
+  return ::google::protobuf::internal::ParseNamedEnum<EntityType>(
+    EntityType_descriptor(), name, value);
+}
+enum Phase {
+  kTrain = 0,
+  kValidation = 1,
+  kTest = 2
+};
+bool Phase_IsValid(int value);
+const Phase Phase_MIN = kTrain;
+const Phase Phase_MAX = kTest;
+const int Phase_ARRAYSIZE = Phase_MAX + 1;
+
+const ::google::protobuf::EnumDescriptor* Phase_descriptor();
+inline const ::std::string& Phase_Name(Phase value) {
+  return ::google::protobuf::internal::NameOfEnum(
+    Phase_descriptor(), value);
+}
+inline bool Phase_Parse(
+    const ::std::string& name, Phase* value) {
+  return ::google::protobuf::internal::ParseNamedEnum<Phase>(
+    Phase_descriptor(), name, value);
+}
+enum ShareOption {
+  kValueOnly = 0,
+  kWhole = 1
+};
+bool ShareOption_IsValid(int value);
+const ShareOption ShareOption_MIN = kValueOnly;
+const ShareOption ShareOption_MAX = kWhole;
+const int ShareOption_ARRAYSIZE = ShareOption_MAX + 1;
+
+const ::google::protobuf::EnumDescriptor* ShareOption_descriptor();
+inline const ::std::string& ShareOption_Name(ShareOption value) {
+  return ::google::protobuf::internal::NameOfEnum(
+    ShareOption_descriptor(), value);
+}
+inline bool ShareOption_Parse(
+    const ::std::string& name, ShareOption* value) {
+  return ::google::protobuf::internal::ParseNamedEnum<ShareOption>(
+    ShareOption_descriptor(), name, value);
+}
+enum PartitionType {
+  kDataPartition = 0,
+  kLayerPartition = 1,
+  kNone = 2
+};
+bool PartitionType_IsValid(int value);
+const PartitionType PartitionType_MIN = kDataPartition;
+const PartitionType PartitionType_MAX = kNone;
+const int PartitionType_ARRAYSIZE = PartitionType_MAX + 1;
+
+const ::google::protobuf::EnumDescriptor* PartitionType_descriptor();
+inline const ::std::string& PartitionType_Name(PartitionType value) {
+  return ::google::protobuf::internal::NameOfEnum(
+    PartitionType_descriptor(), value);
+}
+inline bool PartitionType_Parse(
+    const ::std::string& name, PartitionType* value) {
+  return ::google::protobuf::internal::ParseNamedEnum<PartitionType>(
+    PartitionType_descriptor(), name, value);
+}
+enum ConnectionType {
+  kOneToOne = 0,
+  kOneToAll = 1
+};
+bool ConnectionType_IsValid(int value);
+const ConnectionType ConnectionType_MIN = kOneToOne;
+const ConnectionType ConnectionType_MAX = kOneToAll;
+const int ConnectionType_ARRAYSIZE = ConnectionType_MAX + 1;
+
+const ::google::protobuf::EnumDescriptor* ConnectionType_descriptor();
+inline const ::std::string& ConnectionType_Name(ConnectionType value) {
+  return ::google::protobuf::internal::NameOfEnum(
+    ConnectionType_descriptor(), value);
+}
+inline bool ConnectionType_Parse(
+    const ::std::string& name, ConnectionType* value) {
+  return ::google::protobuf::internal::ParseNamedEnum<ConnectionType>(
+    ConnectionType_descriptor(), name, value);
+}
+// ===================================================================
+
+class ModelProto : public ::google::protobuf::Message {
+ public:
+  ModelProto();
+  virtual ~ModelProto();
+
+  ModelProto(const ModelProto& from);
+
+  inline ModelProto& operator=(const ModelProto& from) {
+    CopyFrom(from);
+    return *this;
+  }
+
+  inline const ::google::protobuf::UnknownFieldSet& unknown_fields() const {
+    return _unknown_fields_;
+  }
+
+  inline ::google::protobuf::UnknownFieldSet* mutable_unknown_fields() {
+    return &_unknown_fields_;
+  }
+
+  static const ::google::protobuf::Descriptor* descriptor();
+  static const ModelProto& default_instance();
+
+  void Swap(ModelProto* other);
+
+  // implements Message ----------------------------------------------
+
+  ModelProto* New() const;
+  void CopyFrom(const ::google::protobuf::Message& from);
+  void MergeFrom(const ::google::protobuf::Message& from);
+  void CopyFrom(const ModelProto& from);
+  void MergeFrom(const ModelProto& from);
+  void Clear();
+  bool IsInitialized() const;
+
+  int ByteSize() const;
+  bool MergePartialFromCodedStream(
+      ::google::protobuf::io::CodedInputStream* input);
+  void SerializeWithCachedSizes(
+      ::google::protobuf::io::CodedOutputStream* output) const;
+  ::google::protobuf::uint8* SerializeWithCachedSizesToArray(::google::protobuf::uint8* output) const;
+  int GetCachedSize() const { return _cached_size_; }
+  private:
+  void SharedCtor();
+  void SharedDtor();
+  void SetCachedSize(int size) const;
+  public:
+
+  ::google::protobuf::Metadata GetMetadata() const;
+
+  // nested types ----------------------------------------------------
+
+  typedef ModelProto_GradCalcAlg GradCalcAlg;
+  static const GradCalcAlg kBackPropagation = ModelProto_GradCalcAlg_kBackPropagation;
+  static const GradCalcAlg kContrastiveDivergence = ModelProto_GradCalcAlg_kContrastiveDivergence;
+  static inline bool GradCalcAlg_IsValid(int value) {
+    return ModelProto_GradCalcAlg_IsValid(value);
+  }
+  static const GradCalcAlg GradCalcAlg_MIN =
+    ModelProto_GradCalcAlg_GradCalcAlg_MIN;
+  static const GradCalcAlg GradCalcAlg_MAX =
+    ModelProto_GradCalcAlg_GradCalcAlg_MAX;
+  static const int GradCalcAlg_ARRAYSIZE =
+    ModelProto_GradCalcAlg_GradCalcAlg_ARRAYSIZE;
+  static inline const ::google::protobuf::EnumDescriptor*
+  GradCalcAlg_descriptor() {
+    return ModelProto_GradCalcAlg_descriptor();
+  }
+  static inline const ::std::string& GradCalcAlg_Name(GradCalcAlg value) {
+    return ModelProto_GradCalcAlg_Name(value);
+  }
+  static inline bool GradCalcAlg_Parse(const ::std::string& name,
+      GradCalcAlg* value) {
+    return ModelProto_GradCalcAlg_Parse(name, value);
+  }
+
+  // accessors -------------------------------------------------------
+
+  // optional string name = 1;
+  inline bool has_name() const;
+  inline void clear_name();
+  static const int kNameFieldNumber = 1;
+  inline const ::std::string& name() const;
+  inline void set_name(const ::std::string& value);
+  inline void set_name(const char* value);
+  inline void set_name(const char* value, size_t size);
+  inline ::std::string* mutable_name();
+  inline ::std::string* release_name();
+  inline void set_allocated_name(::std::string* name);
+
+  // optional string train_folder = 2 [default = "train"];
+  inline bool has_train_folder() const;
+  inline void clear_train_folder();
+  static const int kTrainFolderFieldNumber = 2;
+  inline const ::std::string& train_folder() const;
+  inline void set_train_folder(const ::std::string& value);
+  inline void set_train_folder(const char* value);
+  inline void set_train_folder(const char* value, size_t size);
+  inline ::std::string* mutable_train_folder();
+  inline ::std::string* release_train_folder();
+  inline void set_allocated_train_folder(::std::string* train_folder);
+
+  // optional string test_folder = 3 [default = "test"];
+  inline bool has_test_folder() const;
+  inline void clear_test_folder();
+  static const int kTestFolderFieldNumber = 3;
+  inline const ::std::string& test_folder() const;
+  inline void set_test_folder(const ::std::string& value);
+  inline void set_test_folder(const char* value);
+  inline void set_test_folder(const char* value, size_t size);
+  inline ::std::string* mutable_test_folder();
+  inline ::std::string* release_test_folder();
+  inline void set_allocated_test_folder(::std::string* test_folder);
+
+  // optional string validation_folder = 4 [default = "validation"];
+  inline bool has_validation_folder() const;
+  inline void clear_validation_folder();
+  static const int kValidationFolderFieldNumber = 4;
+  inline const ::std::string& validation_folder() const;
+  inline void set_validation_folder(const ::std::string& value);
+  inline void set_validation_folder(const char* value);
+  inline void set_validation_folder(const char* value, size_t size);
+  inline ::std::string* mutable_validation_folder();
+  inline ::std::string* release_validation_folder();
+  inline void set_allocated_validation_folder(::std::string* validation_folder);
+
+  // optional int32 display_after_steps = 6 [default = 0];
+  inline bool has_display_after_steps() const;
+  inline void clear_display_after_steps();
+  static const int kDisplayAfterStepsFieldNumber = 6;
+  inline ::google::protobuf::int32 display_after_steps() const;
+  inline void set_display_after_steps(::google::protobuf::int32 value);
+
+  // optional int32 display_frequency = 7 [default = 0];
+  inline bool has_display_frequency() const;
+  inline void clear_display_frequency();
+  static const int kDisplayFrequencyFieldNumber = 7;
+  inline ::google::protobuf::int32 display_frequency() const;
+  inline void set_display_frequency(::google::protobuf::int32 value);
+
+  // optional int32 validation_after_steps = 10 [default = 0];
+  inline bool has_validation_after_steps() const;
+  inline void clear_validation_after_steps();
+  static const int kValidationAfterStepsFieldNumber = 10;
+  inline ::google::protobuf::int32 validation_after_steps() const;
+  inline void set_validation_after_steps(::google::protobuf::int32 value);
+
+  // optional int32 validation_frequency = 11 [default = 0];
+  inline bool has_validation_frequency() const;
+  inline void clear_validation_frequency();
+  static const int kValidationFrequencyFieldNumber = 11;
+  inline ::google::protobuf::int32 validation_frequency() const;
+  inline void set_validation_frequency(::google::protobuf::int32 value);
+
+  // optional int32 test_after_steps = 13 [default = 0];
+  inline bool has_test_after_steps() const;
+  inline void clear_test_after_steps();
+  static const int kTestAfterStepsFieldNumber = 13;
+  inline ::google::protobuf::int32 test_after_steps() const;
+  inline void set_test_after_steps(::google::protobuf::int32 value);
+
+  // optional int32 test_frequency = 14 [default = 0];
+  inline bool has_test_frequency() const;
+  inline void clear_test_frequency();
+  static const int kTestFrequencyFieldNumber = 14;
+  inline ::google::protobuf::int32 test_frequency() const;
+  inline void set_test_frequency(::google::protobuf::int32 value);
+
+  // optional int32 checkpoint_after_steps = 15 [default = 0];
+  inline bool has_checkpoint_after_steps() const;
+  inline void clear_checkpoint_after_steps();
+  static const int kCheckpointAfterStepsFieldNumber = 15;
+  inline ::google::protobuf::int32 checkpoint_after_steps() const;
+  inline void set_checkpoint_after_steps(::google::protobuf::int32 value);
+
+  // optional int32 checkpoint_frequency = 16 [default = 0];
+  inline bool has_checkpoint_frequency() const;
+  inline void clear_checkpoint_frequency();
+  static const int kCheckpointFrequencyFieldNumber = 16;
+  inline ::google::protobuf::int32 checkpoint_frequency() const;
+  inline void set_checkpoint_frequency(::google::protobuf::int32 value);
+
+  // optional bool prefetch = 18 [default = true];
+  inline bool has_prefetch() const;
+  inline void clear_prefetch();
+  static const int kPrefetchFieldNumber = 18;
+  inline bool prefetch() const;
+  inline void set_prefetch(bool value);
+
+  // optional int32 train_steps = 20;
+  inline bool has_train_steps() const;
+  inline void clear_train_steps();
+  static const int kTrainStepsFieldNumber = 20;
+  inline ::google::protobuf::int32 train_steps() const;
+  inline void set_train_steps(::google::protobuf::int32 value);
+
+  // optional int32 validation_steps = 21;
+  inline bool has_validation_steps() const;
+  inline void clear_validation_steps();
+  static const int kValidationStepsFieldNumber = 21;
+  inline ::google::protobuf::int32 validation_steps() const;
+  inline void set_validation_steps(::google::protobuf::int32 value);
+
+  // optional int32 test_steps = 22;
+  inline bool has_test_steps() const;
+  inline void clear_test_steps();
+  static const int kTestStepsFieldNumber = 22;
+  inline ::google::protobuf::int32 test_steps() const;
+  inline void set_test_steps(::google::protobuf::int32 value);
+
+  // optional int32 step = 29 [default = 0];
+  inline bool has_step() const;
+  inline void clear_step();
+  static const int kStepFieldNumber = 29;
+  inline ::google::protobuf::int32 step() const;
+  inline void set_step(::google::protobuf::int32 value);
+
+  // optional .singa.UpdaterProto updater = 31;
+  inline bool has_updater() const;
+  inline void clear_updater();
+  static const int kUpdaterFieldNumber = 31;
+  inline const ::singa::UpdaterProto& updater() const;
+  inline ::singa::UpdaterProto* mutable_updater();
+  inline ::singa::UpdaterProto* release_updater();
+  inline void set_allocated_updater(::singa::UpdaterProto* updater);
+
+  // optional .singa.ModelProto.GradCalcAlg alg = 32 [default = kBackPropagation];
+  inline bool has_alg() const;
+  inline void clear_alg();
+  static const int kAlgFieldNumber = 32;
+  inline ::singa::ModelProto_GradCalcAlg alg() const;
+  inline void set_alg(::singa::ModelProto_GradCalcAlg value);
+
+  // optional bool hogwild = 33 [default = false];
+  inline bool has_hogwild() const;
+  inline void clear_hogwild();
+  static const int kHogwildFieldNumber = 33;
+  inline bool hogwild() const;
+  inline void set_hogwild(bool value);
+
+  // optional .singa.NetProto neuralnet = 40;
+  inline bool has_neuralnet() const;
+  inline void clear_neuralnet();
+  static const int kNeuralnetFieldNumber = 40;
+  inline const ::singa::NetProto& neuralnet() const;
+  inline ::singa::NetProto* mutable_neuralnet();
+  inline ::singa::NetProto* release_neuralnet();
+  inline void set_allocated_neuralnet(::singa::NetProto* neuralnet);
+
+  // optional bool debug = 41 [default = false];
+  inline bool has_debug() const;
+  inline void clear_debug();
+  static const int kDebugFieldNumber = 41;
+  inline bool debug() const;
+  inline void set_debug(bool value);
+
+  // @@protoc_insertion_point(class_scope:singa.ModelProto)
+ private:
+  inline void set_has_name();
+  inline void clear_has_name();
+  inline void set_has_train_folder();
+  inline void clear_has_train_folder();
+  inline void set_has_test_folder();
+  inline void clear_has_test_folder();
+  inline void set_has_validation_folder();
+  inline void clear_has_validation_folder();
+  inline void set_has_display_after_steps();
+  inline void clear_has_display_after_steps();
+  inline void set_has_display_frequency();
+  inline void clear_has_display_frequency();
+  inline void set_has_validation_after_steps();
+  inline void clear_has_validation_after_steps();
+  inline void set_has_validation_frequency();
+  inline void clear_has_validation_frequency();
+  inline void set_has_test_after_steps();
+  inline void clear_has_test_after_steps();
+  inline void set_has_test_frequency();
+  inline void clear_has_test_frequency();
+  inline void set_has_checkpoint_after_steps();
+  inline void clear_has_checkpoint_after_steps();
+  inline void set_has_checkpoint_frequency();
+  inline void clear_has_checkpoint_frequency();
+  inline void set_has_prefetch();
+  inline void clear_has_prefetch();
+  inline void set_has_train_steps();
+  inline void clear_has_train_steps();
+  inline void set_has_validation_steps();
+  inline void clear_has_validation_steps();
+  inline void set_has_test_steps();
+  inline void clear_has_test_steps();
+  inline void set_has_step();
+  inline void clear_has_step();
+  inline void set_has_updater();
+  inline void clear_has_updater();
+  inline void set_has_alg();
+  inline void clear_has_alg();
+  inline void set_has_hogwild();
+  inline void clear_has_hogwild();
+  inline void set_has_neuralnet();
+  inline void clear_has_neuralnet();
+  inline void set_has_debug();
+  inline void clear_has_debug();
+
+  ::google::protobuf::UnknownFieldSet _unknown_fields_;
+
+  ::std::string* name_;
+  ::std::string* train_folder_;
+  static ::std::string* _default_train_folder_;
+  ::std::string* test_folder_;
+  static ::std::string* _default_test_folder_;
+  ::std::string* validation_folder_;
+  static ::std::string* _default_validation_folder_;
+  ::google::protobuf::int32 display_after_steps_;
+  ::google::protobuf::int32 display_frequency_;
+  ::google::protobuf::int32 validation_after_steps_;
+  ::google::protobuf::int32 validation_frequency_;
+  ::google::protobuf::int32 test_after_steps_;
+  ::google::protobuf::int32 test_frequency_;
+  ::google::protobuf::int32 checkpoint_after_steps_;
+  ::google::protobuf::int32 checkpoint_frequency_;
+  ::google::protobuf::int32 train_steps_;
+  ::google::protobuf::int32 validation_steps_;
+  ::google::protobuf::int32 test_steps_;
+  ::google::protobuf::int32 step_;
+  ::singa::UpdaterProto* updater_;
+  bool prefetch_;
+  bool hogwild_;
+  bool debug_;
+  int alg_;
+  ::singa::NetProto* neuralnet_;
+
+  mutable int _cached_size_;
+  ::google::protobuf::uint32 _has_bits_[(22 + 31) / 32];
+
+  friend void  protobuf_AddDesc_model_2eproto();
+  friend void protobuf_AssignDesc_model_2eproto();
+  friend void protobuf_ShutdownFile_model_2eproto();
+
+  void InitAsDefaultInstance();
+  static ModelProto* default_instance_;
+};
+// -------------------------------------------------------------------
+
+class NetProto : public ::google::protobuf::Message {  // protoc-generated message class for singa.NetProto (do not hand-edit the code)
+ public:
+  NetProto();
+  virtual ~NetProto();
+
+  NetProto(const NetProto& from);
+
+  inline NetProto& operator=(const NetProto& from) {
+    CopyFrom(from);  // assignment is implemented by delegating to CopyFrom()
+    return *this;
+  }
+
+  inline const ::google::protobuf::UnknownFieldSet& unknown_fields() const {
+    return _unknown_fields_;
+  }
+
+  inline ::google::protobuf::UnknownFieldSet* mutable_unknown_fields() {
+    return &_unknown_fields_;
+  }
+
+  static const ::google::protobuf::Descriptor* descriptor();
+  static const NetProto& default_instance();
+
+  void Swap(NetProto* other);
+
+  // implements Message ----------------------------------------------
+
+  NetProto* New() const;
+  void CopyFrom(const ::google::protobuf::Message& from);
+  void MergeFrom(const ::google::protobuf::Message& from);
+  void CopyFrom(const NetProto& from);
+  void MergeFrom(const NetProto& from);
+  void Clear();
+  bool IsInitialized() const;
+
+  int ByteSize() const;
+  bool MergePartialFromCodedStream(
+      ::google::protobuf::io::CodedInputStream* input);
+  void SerializeWithCachedSizes(
+      ::google::protobuf::io::CodedOutputStream* output) const;
+  ::google::protobuf::uint8* SerializeWithCachedSizesToArray(::google::protobuf::uint8* output) const;
+  int GetCachedSize() const { return _cached_size_; }
+  private:
+  void SharedCtor();
+  void SharedDtor();
+  void SetCachedSize(int size) const;
+  public:
+
+  ::google::protobuf::Metadata GetMetadata() const;
+
+  // nested types (none are declared for this message) ---------------
+
+  // accessors (declared here; their inline definitions are not in this class body) ---
+
+  // repeated .singa.LayerProto layer = 1;
+  inline int layer_size() const;
+  inline void clear_layer();
+  static const int kLayerFieldNumber = 1;
+  inline const ::singa::LayerProto& layer(int index) const;
+  inline ::singa::LayerProto* mutable_layer(int index);
+  inline ::singa::LayerProto* add_layer();
+  inline const ::google::protobuf::RepeatedPtrField< ::singa::LayerProto >&
+      layer() const;
+  inline ::google::protobuf::RepeatedPtrField< ::singa::LayerProto >*
+      mutable_layer();
+
+  // optional .singa.PartitionType partition_type = 3 [default = kNone];
+  inline bool has_partition_type() const;
+  inline void clear_partition_type();
+  static const int kPartitionTypeFieldNumber = 3;
+  inline ::singa::PartitionType partition_type() const;
+  inline void set_partition_type(::singa::PartitionType value);
+
+  // @@protoc_insertion_point(class_scope:singa.NetProto)
+ private:
+  inline void set_has_partition_type();
+  inline void clear_has_partition_type();
+
+  ::google::protobuf::UnknownFieldSet _unknown_fields_;
+
+  ::google::protobuf::RepeatedPtrField< ::singa::LayerProto > layer_;
+  int partition_type_;  // held as a plain int; the accessors above expose it as ::singa::PartitionType
+
+  mutable int _cached_size_;  // mutable: presumably written by the const SetCachedSize() - confirm in the generated .cc
+  ::google::protobuf::uint32 _has_bits_[(2 + 31) / 32];  // (2 + 31) / 32 == 1 word; presumably field-presence bits - TODO confirm
+
+  friend void  protobuf_AddDesc_model_2eproto();  // friends: file-level generated functions (names suggest model.proto) get private access
+  friend void protobuf_AssignDesc_model_2eproto();
+  friend void protobuf_ShutdownFile_model_2eproto();
+
+  void InitAsDefaultInstance();
+  static NetProto* default_instance_;
+};
+// -------------------------------------------------------------------
+
+class ParamProto : public ::google::protobuf::Message {  // protoc-generated message class for singa.ParamProto (do not hand-edit the code)
+ public:
+  ParamProto();
+  virtual ~ParamProto();
+
+  ParamProto(const ParamProto& from);
+
+  inline ParamProto& operator=(const ParamProto& from) {
+    CopyFrom(from);  // assignment is implemented by delegating to CopyFrom()
+    return *this;
+  }
+
+  inline const ::google::protobuf::UnknownFieldSet& unknown_fields() const {
+    return _unknown_fields_;
+  }
+
+  inline ::google::protobuf::UnknownFieldSet* mutable_unknown_fields() {
+    return &_unknown_fields_;
+  }
+
+  static const ::google::protobuf::Descriptor* descriptor();
+  static const ParamProto& default_instance();
+
+  void Swap(ParamProto* other);
+
+  // implements Message ----------------------------------------------
+
+  ParamProto* New() const;
+  void CopyFrom(const ::google::protobuf::Message& from);
+  void MergeFrom(const ::google::protobuf::Message& from);
+  void CopyFrom(const ParamProto& from);
+  void MergeFrom(const ParamProto& from);
+  void Clear();
+  bool IsInitialized() const;
+
+  int ByteSize() const;
+  bool MergePartialFromCodedStream(
+      ::google::protobuf::io::CodedInputStream* input);
+  void SerializeWithCachedSizes(
+      ::google::protobuf::io::CodedOutputStream* output) const;
+  ::google::protobuf::uint8* SerializeWithCachedSizesToArray(::google::protobuf::uint8* output) const;
+  int GetCachedSize() const { return _cached_size_; }
+  private:
+  void SharedCtor();
+  void SharedDtor();
+  void SetCachedSize(int size) const;
+  public:
+
+  ::google::protobuf::Metadata GetMetadata() const;
+
+  // nested types: class-scope aliases that forward to the ParamProto_InitMethod enum and its helper functions ---
+
+  typedef ParamProto_InitMethod InitMethod;
+  static const InitMethod kConstant = ParamProto_InitMethod_kConstant;
+  static const InitMethod kGaussian = ParamProto_InitMethod_kGaussian;
+  static const InitMethod kUniform = ParamProto_InitMethod_kUniform;
+  static const InitMethod kPretrained = ParamProto_InitMethod_kPretrained;
+  static const InitMethod kGaussainSqrtFanIn = ParamProto_InitMethod_kGaussainSqrtFanIn;  // NOTE(review): "Gaussain" looks like a misspelling of "Gaussian"; the name mirrors the generated enum, so a rename presumably belongs in the .proto
+  static const InitMethod kUniformSqrtFanIn = ParamProto_InitMethod_kUniformSqrtFanIn;
+  static const InitMethod kUniformSqrtFanInOut = ParamProto_InitMethod_kUniformSqrtFanInOut;
+  static inline bool InitMethod_IsValid(int value) {
+    return ParamProto_InitMethod_IsValid(value);
+  }
+  static const InitMethod InitMethod_MIN =
+    ParamProto_InitMethod_InitMethod_MIN;
+  static const InitMethod InitMethod_MAX =
+    ParamProto_InitMethod_InitMethod_MAX;
+  static const int InitMethod_ARRAYSIZE =
+    ParamProto_InitMethod_InitMethod_ARRAYSIZE;
+  static inline const ::google::protobuf::EnumDescriptor*
+  InitMethod_descriptor() {
+    return ParamProto_InitMethod_descriptor();
+  }
+  static inline const ::std::string& InitMethod_Name(InitMethod value) {
+    return ParamProto_InitMethod_Name(value);
+  }
+  static inline bool InitMethod_Parse(const ::std::string& name,
+      InitMethod* value) {
+    return ParamProto_InitMethod_Parse(name, value);
+  }
+
+  // accessors (declared here; their inline definitions are not in this class body) ---
+
+  // optional string name = 1;
+  inline bool has_name() const;
+  inline void clear_name();
+  static const int kNameFieldNumber = 1;
+  inline const ::std::string& name() const;
+  inline void set_name(const ::std::string& value);
+  inline void set_name(const char* value);
+  inline void set_name(const char* value, size_t size);
+  inline ::std::string* mutable_name();
+  inline ::std::string* release_name();
+  inline void set_allocated_name(::std::string* name);
+
+  // optional int32 id = 2;
+  inline bool has_id() const;
+  inline void clear_id();
+  static const int kIdFieldNumber = 2;
+  inline ::google::protobuf::int32 id() const;
+  inline void set_id(::google::protobuf::int32 value);
+
+  // repeated int32 shape = 3;
+  inline int shape_size() const;
+  inline void clear_shape();
+  static const int kShapeFieldNumber = 3;
+  inline ::google::protobuf::int32 shape(int index) const;
+  inline void set_shape(int index, ::google::protobuf::int32 value);
+  inline void add_shape(::google::protobuf::int32 value);
+  inline const ::google::protobuf::RepeatedField< ::google::protobuf::int32 >&
+      shape() const;
+  inline ::google::protobuf::RepeatedField< ::google::protobuf::int32 >*
+      mutable_shape();
+
+  // optional int32 split_threshold = 4 [default = 5000000];
+  inline bool has_split_threshold() const;
+  inline void clear_split_threshold();
+  static const int kSplitThresholdFieldNumber = 4;
+  inline ::google::protobuf::int32 split_threshold() const;
+  inline void set_split_threshold(::google::protobuf::int32 value);
+
+  // optional int32 partition_dim = 5 [default = -1];
+  inline bool has_partition_dim() const;
+  inline void clear_partition_dim();
+  static const int kPartitionDimFieldNumber = 5;
+  inline ::google::protobuf::int32 partition_dim() const;
+  inline void set_partition_dim(::google::protobuf::int32 value);
+
+  // optional int32 version = 6;
+  inline bool has_version() const;
+  inline void clear_version();
+  static const int kVersionFieldNumber = 6;
+  inline ::google::protobuf::int32 version() const;
+  inline void set_version(::google::protobuf::int32 value);
+
+  // optional .singa.ParamProto.InitMethod init_method = 7 [default = kConstant];
+  inline bool has_init_method() const;
+  inline void clear_init_method();
+  static const int kInitMethodFieldNumber = 7;
+  inline ::singa::ParamProto_InitMethod init_method() const;
+  inline void set_init_method(::singa::ParamProto_InitMethod value);
+
+  // optional float value = 8 [default = 1];
+  inline bool has_value() const;
+  inline void clear_value();
+  static const int kValueFieldNumber = 8;
+  inline float value() const;
+  inline void set_value(float value);
+
+  // optional float low = 9 [default = -1];
+  inline bool has_low() const;
+  inline void clear_low();
+  static const int kLowFieldNumber = 9;
+  inline float low() const;
+  inline void set_low(float value);
+
+  // optional float high = 10 [default = 1];
+  inline bool has_high() const;
+  inline void clear_high();
+  static const int kHighFieldNumber = 10;
+  inline float high() const;
+  inline void set_high(float value);
+
+  // optional float mean = 11 [default = 0];
+  inline bool has_mean() const;
+  inline void clear_mean();
+  static const int kMeanFieldNumber = 11;
+  inline float mean() const;
+  inline void set_mean(float value);
+
+  // optional float std = 12 [default = 1];
+  inline bool has_std() const;
+  inline void clear_std();
+  static const int kStdFieldNumber = 12;
+  inline float std() const;
+  inline void set_std(float value);
+
+  // optional float learning_rate_multiplier = 13 [default = 1];
+  inline bool has_learning_rate_multiplier() const;
+  inline void clear_learning_rate_multiplier();
+  static const int kLearningRateMultiplierFieldNumber = 13;
+  inline float learning_rate_multiplier() const;
+  inline void set_learning_rate_multiplier(float value);
+
+  // optional float weight_decay_multiplier = 14 [default = 1];
+  inline bool has_weight_decay_multiplier() const;
+  inline void clear_weight_decay_multiplier();
+  static const int kWeightDecayMultiplierFieldNumber = 14;
+  inline float weight_decay_multiplier() const;
+  inline void set_weight_decay_multiplier(float value);
+
+  // @@protoc_insertion_point(class_scope:singa.ParamProto)
+ private:
+  inline void set_has_name();
+  inline void clear_has_name();
+  inline void set_has_id();
+  inline void clear_has_id();
+  inline void set_has_split_threshold();
+  inline void clear_has_split_threshold();
+  inline void set_has_partition_dim();
+  inline void clear_has_partition_dim();
+  inline void set_has_version();
+  inline void clear_has_version();
+  inline void set_has_init_method();
+  inline void clear_has_init_method();
+  inline void set_has_value();
+  inline void clear_has_value();
+  inline void set_has_low();
+  inline void clear_has_low();
+  inline void set_has_high();
+  inline void clear_has_high();
+  inline void set_has_mean();
+  inline void clear_has_mean();
+  inline void set_has_std();
+  inline void clear_has_std();
+  inline void set_has_learning_rate_multiplier();
+  inline void clear_has_learning_rate_multiplier();
+  inline void set_has_weight_decay_multiplier();
+  inline void clear_has_weight_decay_multiplier();
+
+  ::google::protobuf::UnknownFieldSet _unknown_fields_;
+
+  ::std::string* name_;
+  ::google::protobuf::RepeatedField< ::google::protobuf::int32 > shape_;  // repeated field: no set_has_/clear_has_ helper is declared for it above
+  ::google::protobuf::int32 id_;
+  ::google::protobuf::int32 split_threshold_;
+  ::google::protobuf::int32 partition_dim_;
+  ::google::protobuf::int32 version_;
+  int init_method_;  // held as a plain int; the accessors above expose it as ::singa::ParamProto_InitMethod
+  float value_;
+  float low_;
+  float high_;
+  float mean_;
+  float std_;
+  float learning_rate_multiplier_;
+  float weight_decay_multiplier_;
+
+  mutable int _cached_size_;  // mutable: presumably written by the const SetCachedSize() - confirm in the generated .cc
+  ::google::protobuf::uint32 _has_bits_[(14 + 31) / 32];  // (14 + 31) / 32 == 1 word; presumably field-presence bits - TODO confirm
+
+  friend void  protobuf_AddDesc_model_2eproto();  // friends: file-level generated functions (names suggest model.proto) get private access
+  friend void protobuf_AssignDesc_model_2eproto();
+  friend void protobuf_ShutdownFile_model_2eproto();
+
+  void InitAsDefaultInstance();
+  static ParamProto* default_instance_;
+};
+// -------------------------------------------------------------------
+
+class BlobProtos : public ::google::protobuf::Message {  // protoc-generated message class for singa.BlobProtos (do not hand-edit the code)
+ public:
+  BlobProtos();
+  virtual ~BlobProtos();
+
+  BlobProtos(const BlobProtos& from);
+
+  inline BlobProtos& operator=(const BlobProtos& from) {
+    CopyFrom(from);  // assignment is implemented by delegating to CopyFrom()
+    return *this;
+  }
+
+  inline const ::google::protobuf::UnknownFieldSet& unknown_fields() const {
+    return _unknown_fields_;
+  }
+
+  inline ::google::protobuf::UnknownFieldSet* mutable_unknown_fields() {
+    return &_unknown_fields_;
+  }
+
+  static const ::google::protobuf::Descriptor* descriptor();
+  static const BlobProtos& default_instance();
+
+  void Swap(BlobProtos* other);
+
+  // implements Message ----------------------------------------------
+
+  BlobProtos* New() const;
+  void CopyFrom(const ::google::protobuf::Message& from);
+  void MergeFrom(const ::google::protobuf::Message& from);
+  void CopyFrom(const BlobProtos& from);
+  void MergeFrom(const BlobProtos& from);
+  void Clear();
+  bool IsInitialized() const;
+
+  int ByteSize() const;
+  bool MergePartialFromCodedStream(
+      ::google::protobuf::io::CodedInputStream* input);
+  void SerializeWithCachedSizes(
+      ::google::protobuf::io::CodedOutputStream* output) const;
+  ::google::protobuf::uint8* SerializeWithCachedSizesToArray(::google::protobuf::uint8* output) const;
+  int GetCachedSize() const { return _cached_size_; }
+  private:
+  void SharedCtor();
+  void SharedDtor();
+  void SetCachedSize(int size) const;
+  public:
+
+  ::google::protobuf::Metadata GetMetadata() const;
+
+  // nested types (none are declared for this message) ---------------
+
+  // accessors (declared here; their inline definitions are not in this class body) ---
+
+  // repeated .singa.BlobProto blobs = 1;
+  inline int blobs_size() const;
+  inline void clear_blobs();
+  static const int kBlobsFieldNumber = 1;
+  inline const ::singa::BlobProto& blobs(int index) const;
+  inline ::singa::BlobProto* mutable_blobs(int index);
+  inline ::singa::BlobProto* add_blobs();
+  inline const ::google::protobuf::RepeatedPtrField< ::singa::BlobProto >&
+      blobs() const;
+  inline ::google::protobuf::RepeatedPtrField< ::singa::BlobProto >*
+      mutable_blobs();
+
+  // repeated int32 ids = 2;
+  inline int ids_size() const;
+  inline void clear_ids();
+  static const int kIdsFieldNumber = 2;
+  inline ::google::protobuf::int32 ids(int index) const;
+  inline void set_ids(int index, ::google::protobuf::int32 value);
+  inline void add_ids(::google::protobuf::int32 value);
+  inline const ::google::protobuf::RepeatedField< ::google::protobuf::int32 >&
+      ids() const;
+  inline ::google::protobuf::RepeatedField< ::google::protobuf::int32 >*
+      mutable_ids();
+
+  // repeated string names = 3;
+  inline int names_size() const;
+  inline void clear_names();
+  static const int kNamesFieldNumber = 3;
+  inline const ::std::string& names(int index) const;
+  inline ::std::string* mutable_names(int index);
+  inline void set_names(int index, const ::std::string& value);
+  inline void set_names(int index, const char* value);
+  inline void set_names(int index, const char* value, size_t size);
+  inline ::std::string* add_names();
+  inline void add_names(const ::std::string& value);
+  inline void add_names(const char* value);
+  inline void add_names(const char* value, size_t size);
+  inline const ::google::protobuf::RepeatedPtrField< ::std::string>& names() const;
+  inline ::google::protobuf::RepeatedPtrField< ::std::string>* mutable_names();
+
+  // @@protoc_insertion_point(class_scope:singa.BlobProtos)
+ private:  // all three fields are repeated; no set_has_*/clear_has_* helpers are declared for this message
+
+  ::google::protobuf::UnknownFieldSet _unknown_fields_;
+
+  ::google::protobuf::RepeatedPtrField< ::singa::BlobProto > blobs_;  // NOTE(review): blobs/ids/names look like parallel lists - confirm against callers
+  ::google::protobuf::RepeatedField< ::google::protobuf::int32 > ids_;
+  ::google::protobuf::RepeatedPtrField< ::std::string> names_;
+
+  mutable int _cached_size_;  // mutable: presumably written by the const SetCachedSize() - confirm in the generated .cc
+  ::google::protobuf::uint32 _has_bits_[(3 + 31) / 32];  // (3 + 31) / 32 == 1 word; presumably field-presence bits - TODO confirm
+
+  friend void  protobuf_AddDesc_model_2eproto();  // friends: file-level generated functions (names suggest model.proto) get private access
+  friend void protobuf_AssignDesc_model_2eproto();
+  friend void protobuf_ShutdownFile_model_2eproto();
+
+  void InitAsDefaultInstance();
+  static BlobProtos* default_instance_;
+};
+// -------------------------------------------------------------------
+
+class LayerProto : public ::google::protobuf::Message {
+ public:
+  LayerProto();
+  virtual ~LayerProto();
+
+  LayerProto(const LayerProto& from);
+
+  inline LayerProto& operator=(const LayerProto& from) {
+    CopyFrom(from);
+    return *this;
+  }
+
+  inline const ::google::protobuf::UnknownFieldSet& unknown_fields() const {
+    return _unknown_fields_;
+  }
+
+  inline ::google::protobuf::UnknownFieldSet* mutable_unknown_fields() {
+    return &_unknown_fields_;
+  }
+
+  static const ::google::protobuf::Descriptor* descriptor();
+  static const LayerProto& default_instance();
+
+  void Swap(LayerProto* other);
+
+  // implements Message ----------------------------------------------
+
+  LayerProto* New() const;
+  void CopyFrom(const ::google::protobuf::Message& from);
+  void MergeFrom(const ::google::protobuf::Message& from);
+  void CopyFrom(const LayerProto& from);
+  void MergeFrom(const LayerProto& from);
+  void Clear();
+  bool IsInitialized() const;
+
+  int ByteSize() const;
+  bool MergePartialFromCodedStream(
+      ::google::protobuf::io::CodedInputStream* input);
+  void SerializeWithCachedSizes(
+      ::google::protobuf::io::CodedOutputStream* output) const;
+  ::google::protobuf::uint8* SerializeWithCachedSizesToArray(::google::protobuf::uint8* output) const;
+  int GetCachedSize() const { return _cached_size_; }
+  private:
+  void SharedCtor();
+  void SharedDtor();
+  void SetCachedSize(int size) const;
+  public:
+
+  ::google::protobuf::Metadata GetMetadata() const;
+
+  // nested types ----------------------------------------------------
+
+  // accessors -------------------------------------------------------
+
+  // optional string name = 1;
+  inline bool has_name() const;
+  inline void clear_name();
+  static const int kNameFieldNumber = 1;
+  inline const ::std::string& name() const;
+  inline void set_name(const ::std::string& value);
+  inline void set_name(const char* value);
+  inline void set_name(const char* value, size_t size);
+  inline ::std::string* mutable_name();
+  inline ::std::string* release_name();
+  inline void set_allocated_name(::std::string* name);
+
+  // optional string type = 2;
+  inline bool has_type() const;
+  inline void clear_type();
+  static const int kTypeFieldNumber = 2;
+  inline const ::std::string& type() const;
+  inline void set_type(const ::std::string& value);
+  inline void set_type(const char* value);
+  inline void set_type(const char* value, size_t size);
+  inline ::std::string* mutable_type();
+  inline ::std::string* release_type();
+  inline void set_allocated_type(::std::string* type);
+
+  // repeated string srclayers = 3;
+  inline int srclayers_size() const;
+  inline void clear_srclayers();
+  static const int kSrclayersFieldNumber = 3;
+  inline const ::std::string& srclayers(int index) const;
+  inline ::std::string* mutable_srclayers(int index);
+  inline void set_srclayers(int index, const ::std::string& value);
+  inline void set_srclayers(int index, const char* value);
+  inline void set_srclayers(int index, const char* value, size_t size);
+  inline ::std::string* add_srclayers();
+  inline void add_srclayers(const ::std::string& value);
+  inline void add_srclayers(const char* value);
+  inline void add_srclayers(const char* value, size_t size);
+  inline const ::google::protobuf::RepeatedPtrField< ::std::string>& srclayers() const;
+  inline ::google::protobuf::RepeatedPtrField< ::std::string>* mutable_srclayers();
+
+  // optional int32 locationid = 4 [default = 0];
+  inline bool has_locationid() const;
+  inline void clear_locationid();
+  static const int kLocationidFieldNumber = 4;
+  inline ::google::protobuf::int32 locationid() const;
+  inline void set_locationid(::google::protobuf::int32 value);
+
+  // optional int32 partitionid = 5 [default = 0];
+  inline bool has_partitionid() const;
+  inline void clear_partitionid();
+  static const int kPartitionidFieldNumber = 5;
+  inline ::google::protobuf::int32 partitionid() const;
+  inline void set_partitionid(::google::protobuf::int32 value);
+
+  // optional .singa.PartitionType partition_type = 6;
+  inline bool has_partition_type() const;
+  inline void clear_partition_type();
+  static const int kPartitionTypeFieldNumber = 6;
+  inline ::singa::PartitionType partition_type() const;
+  inline void set_partition_type(::singa::PartitionType value);
+
+  // repeated string share_ary = 11;
+  inline int share_ary_size() const;
+  inline void clear_share_ary();
+  static const int kShareAryFieldNumber = 11;
+  inline const ::std::string& share_ary(int index) const;
+  inline ::std::string* mutable_share_ary(int index);
+  inline void set_share_ary(int index, const ::std::string& value);
+  inline void set_share_ary(int index, const char* value);
+  inline void set_share_ary(int index, const char* value, size_t size);
+  inline ::std::string* add_share_ary();
+  inline void add_share_ary(const ::std::string& value);
+  inline void add_share_ary(const char* value);
+  inline void add_share_ary(const char* value, size_t size);
+  inline const ::google::protobuf::RepeatedPtrField< ::std::string>& share_ary() const;
+  inline ::google::protobuf::RepeatedPtrField< ::std::string>* mutable_share_ary();
+
+  // repeated .singa.ParamProto param = 12;
+  inline int param_size() const;
+  inline void clear_param();
+  static const int kParamFieldNumber = 12;
+  inline const ::singa::ParamProto& param(int index) const;
+  inline ::singa::ParamProto* mutable_param(int index);
+  inline ::singa::ParamProto* add_param();
+  inline const ::google::protobuf::RepeatedPtrField< ::singa::ParamProto >&
+      param() const;
+  inline ::google::protobuf::RepeatedPtrField< ::singa::ParamProto >*
+      mutable_param();
+
+  // repeated string share_param = 13;
+  inline int share_param_size() const;
+  inline void clear_share_param();
+  static const int kShareParamFieldNumber = 13;
+  inline const ::std::string& share_param(int index) const;
+  inline ::std::string* mutable_share_param(int index);
+  inline void set_share_param(int index, const ::std::string& value);
+  inline void set_share_param(int index, const char* value);
+  inline void set_share_param(int index, const char* value, size_t size);
+  inline ::std::string* add_share_param();
+  inline void add_share_param(const ::std::string& value);
+  inline void add_share_param(const char* value);
+  inline void add_share_param(const char* value, size_t size);
+  inline const ::google::protobuf::RepeatedPtrField< ::std::string>& share_param() const;
+  inline ::google::protobuf::RepeatedPtrField< ::std::string>* mutable_share_param();
+
+  // repeated .singa.Phase exclude = 20;
+  inline int exclude_size() const;
+  inline void clear_exclude();
+  static const int kExcludeFieldNumber = 20;
+  inline ::singa::Phase exclude(int index) const;
+  inline void set_exclude(int index, ::singa::Phase value);
+  inline void add_exclude(::singa::Phase value);
+  inline const ::google::protobuf::RepeatedField<int>& exclude() const;
+  inline ::google::protobuf::RepeatedField<int>* mutable_exclude();
+
+  // optional .singa.ConvolutionProto convolution_param = 21;
+  inline bool has_convolution_param() const;
+  inline void clear_convolution_param();
+  static const int kConvolutionParamFieldNumber = 21;
+  inline const ::singa::ConvolutionProto& convolution_param() const;
+  inline ::singa::ConvolutionProto* mutable_convolution_param();
+  inline ::singa::ConvolutionProto* release_convolution_param();
+  inline void set_allocated_convolution_param(::singa::ConvolutionProto* convolution_param);
+
+  // optional .singa.ConcateProto concate_param = 31;
+  inline bool has_concate_param() const;
+  inline void clear_concate_param();
+  static const int kConcateParamFieldNumber = 31;
+  inline const ::singa::ConcateProto& concate_param() const;
+  inline ::singa::ConcateProto* mutable_concate_param();
+  inline ::singa::ConcateProto* release_concate_param();
+  inline void set_allocated_concate_param(::singa::ConcateProto* concate_param);
+
+  // optional .singa.DataProto data_param = 22;
+  inline bool has_data_param() const;
+  inline void clear_data_param();
+  static const int kDataParamFieldNumber = 22;
+  inline const ::singa::DataProto& data_param() const;
+  inline ::singa::DataProto* mutable_data_param();
+  inline ::singa::DataProto* release_data_param();
+  inline void set_allocated_data_param(::singa::DataProto* data_param);
+
+  // optional .singa.DropoutProto dropout_param = 23;
+  inline bool has_dropout_param() const;
+  inline void clear_dropout_param();
+  static const int kDropoutParamFieldNumber = 23;
+  inline const ::singa::DropoutProto& dropout_param() const;
+  inline ::singa::DropoutProto* mutable_dropout_param();
+  inline ::singa::DropoutProto* release_dropout_param();
+  inline void set_allocated_dropout_param(::singa::DropoutProto* dropout_param);
+
+  // optional .singa.InnerProductProto inner_product_param = 24;
+  inline bool has_inner_product_param() const;
+  inline void clear_inner_product_param();
+  static const int kInnerProductParamFieldNumber = 24;
+  inline const ::singa::InnerProductProto& inner_product_param() const;
+  inline ::singa::InnerProductProto* mutable_inner_product_param();
+  inline ::singa::InnerProductProto* release_inner_product_param();
+  inline void set_allocated_inner_product_param(::singa::InnerProductProto* inner_product_param);
+
+  // optional .singa.LRNProto lrn_param = 25;
+  inline bool has_lrn_param() const;
+  inline void clear_lrn_param();
+  static const int kLrnParamFieldNumber = 25;
+  inline const ::singa::LRNProto& lrn_param() const;
+  inline ::singa::LRNProto* mutable_lrn_param();
+  inline ::singa::LRNProto* release_lrn_param();
+  inline void set_allocated_lrn_param(::singa::LRNProto* lrn_param);
+
+  // optional .singa.MnistProto mnist_param = 26;
+  inline bool has_mnist_param() const;
+  inline void clear_mnist_param();
+  static const int kMnistParamFieldNumber = 26;
+  inline const ::singa::MnistProto& mnist_param() const;
+  inline ::singa::MnistProto* mutable_mnist_param();
+  inline ::singa::MnistProto* release_mnist_param();
+  inline void set_allocated_mnist_param(::singa::MnistProto* mnist_param);
+
+  // optional .singa.PoolingProto pooling_param = 27;
+  inline bool has_pooling_param() const;
+  inline void clear_pooling_param();
+  static const int kPoolingParamFieldNumber = 27;
+  inline const ::singa::PoolingProto& pooling_param() const;
+  inline ::singa::PoolingProto* mutable_pooling_param();
+  inline ::singa::PoolingProto* release_pooling_param();
+  inline void set_allocated_pooling_param(::singa::PoolingProto* pooling_param);
+
+  // optional .singa.SliceProto slice_param = 32;
+  inline bool has_slice_param() const;
+  inline void clear_slice_param();
+  static const int kSliceParamFieldNumber = 32;
+  inline const ::singa::SliceProto& slice_param() const;
+  inline ::singa::SliceProto* mutable_slice_param();
+  inline ::singa::SliceProto* release_slice_param();
+  inline void set_allocated_slice_param(::singa::SliceProto* slice_param);
+
+  // optional .singa.SplitProto split_param = 33;
+  inline bool has_split_param() const;
+  inline void clear_split_param();
+  static const int kSplitParamFieldNumber = 33;
+  inline const ::singa::SplitProto& split_param() const;
+  inline ::singa::SplitProto* mutable_split_param();
+  inline ::singa::SplitProto* release_split_param();
+  inline void set_allocated_split_param(::singa::SplitProto* split_param);
+
+  // optional .singa.ReLUProto relu_param = 28;
+  inline bool has_relu_param() const;
+  inline void clear_relu_param();
+  static const int kReluParamFieldNumber = 28;
+  inline const ::singa::ReLUProto& relu_param() const;
+  inline ::singa::ReLUProto* mutable_relu_param();
+  inline ::singa::ReLUProto* release_relu_param();
+  inline void set_allocated_relu_param(::singa::ReLUProto* relu_param);
+
+  // optional .singa.RGBImage rgbimage_param = 34;
+  inline bool has_rgbimage_param() const;
+  inline void clear_rgbimage_param();
+  static const int kRgbimageParamFieldNumber = 34;
+  inline const ::singa::RGBImage& rgbimage_param() const;
+  inline ::singa::RGBImage* mutable_rgbimage_param();
+  inline ::singa::RGBImage* release_rgbimage_param();
+  inline void set_allocated_rgbimage_param(::singa::RGBImage* rgbimage_param);
+
+  // optional .singa.SoftmaxLossProto softmaxloss_param = 29;
+  inline bool has_softmaxloss_param() const;
+  inline void clear_softmaxloss_param();
+  static const int kSoftmaxlossParamFieldNumber = 29;
+  inline const ::singa::SoftmaxLossProto& softmaxloss_param() const;
+  inline ::singa::SoftmaxLossProto* mutable_softmaxloss_param();
+  inline ::singa::SoftmaxLossProto* release_softmaxloss_param();
+  inline void set_allocated_softmaxloss_param(::singa::SoftmaxLossProto* softmaxloss_param);
+
+  // optional .singa.TanhProto tanh_param = 30;
+  inline bool has_tanh_param() const;
+  inline void clear_tanh_param();
+  static const int kTanhParamFieldNumber = 30;
+  inline const ::singa::TanhProto& tanh_param() const;
+  inline ::singa::TanhProto* mutable_tanh_param();
+  inline ::singa::TanhProto* release_tanh_param();
+  inline void set_allocated_tanh_param(::singa::TanhProto* tanh_param);
+
+  // @@protoc_insertion_point(class_scope:singa.LayerProto)
+ private:
+  inline void set_has_name();
+  inline void clear_has_name();
+  inline void set_has_type();
+  inline void clear_has_type();
+  inline void set_has_locationid();
+  inline void clear_has_locationid();
+  inline void set_has_partitionid();
+  inline void clear_has_partitionid();
+  inline void set_has_partition_type();
+  inline void clear_has_partition_type();
+  inline void set_has_convolution_param();
+  inline void clear_has_convolution_param();
+  inline void set_has_concate_param();
+  inline void clear_has_concate_param();
+  inline void set_has_data_param();
+  inline void clear_has_data_param();
+  inline void set_has_dropout_param();
+  inline void clear_has_dropout_param();
+  inline void set_has_inner_product_param();
+  inline void clear_has_inner_product_param();
+  inline void set_has_lrn_param();
+  inline void clear_has_lrn_param();
+  inline void set_has_mnist_param();
+  inline void clear_has_mnist_param();
+  inline void set_has_pooling_param();
+  inline void clear_has_pooling_param();
+  inline void set_has_slice_param();
+  inline void clear_has_slice_param();
+  inline void set_has_split_param();
+  inline void clear_has_split_param();
+  inline void set_has_relu_param();
+  inline void clear_has_relu_param();
+  inline void set_has_rgbimage_param();
+  inline void clear_has_rgbimage_param();
+  inline void set_has_softmaxloss_param();
+  inline void clear_has_softmaxloss_param();
+  inline void set_has_tanh_param();
+  inline void clear_has_tanh_param();
+
+  ::google::protobuf::UnknownFieldSet _unknown_fields_;
+
+  ::std::string* name_;
+  ::std::string* type_;
+  ::google::protobuf::RepeatedPtrField< ::std::string> srclayers_;
+  ::google::protobuf::int32 locationid_;
+  ::google::protobuf::int32 partitionid_;
+  ::google::protobuf::RepeatedPtrField< ::std::string> share_ary_;
+  ::google::protobuf::RepeatedPtrField< ::singa::ParamProto > param_;
+  ::google::protobuf::RepeatedPtrField< ::std::string> share_param_;
+  ::google::protobuf::RepeatedField<int> exclude_;
+  ::singa::ConvolutionProto* convolution_param_;
+  ::singa::ConcateProto* concate_param_;
+  ::singa::DataProto* data_param_;
+  ::singa::DropoutProto* dropout_param_;
+  ::singa::InnerProductProto* inner_product_param_;
+  ::singa::LRNProto* lrn_param_;
+  ::singa::MnistProto* mnist_param_;
+  ::singa::PoolingProto* pooling_param_;
+  ::singa::SliceProto* slice_param_;
+  ::singa::SplitProto* split_param_;
+  ::singa::ReLUProto* relu_param_;
+  ::singa::RGBImage* rgbimage_param_;
+  ::singa::SoftmaxLossProto* softmaxloss_param_;
+  ::singa::TanhProto* tanh_param_;
+  int partition_type_;
+
+  mutable int _cached_size_;
+  ::google::protobuf::uint32 _has_bits_[(24 + 31) / 32];
+
+  friend void  protobuf_AddDesc_model_2eproto();
+  friend void protobuf_AssignDesc_model_2eproto();
+  friend void protobuf_ShutdownFile_model_2eproto();
+
+  void InitAsDefaultInstance();
+  static LayerProto* default_instance_;
+};
+// -------------------------------------------------------------------
+// RGBImage: message class for singa.RGBImage with fields scale, cropsize, mirror and meanfile.
+class RGBImage : public ::google::protobuf::Message {
+ public:
+  RGBImage();
+  virtual ~RGBImage();
+  // Copy constructor.
+  RGBImage(const RGBImage& from);
+  // Copy assignment: delegates to CopyFrom(from) and returns *this.
+  inline RGBImage& operator=(const RGBImage& from) {
+    CopyFrom(from);
+    return *this;
+  }
+  // Read-only access to the _unknown_fields_ member.
+  inline const ::google::protobuf::UnknownFieldSet& unknown_fields() const {
+    return _unknown_fields_;
+  }
+  // Mutable pointer to the same _unknown_fields_ member.
+  inline ::google::protobuf::UnknownFieldSet* mutable_unknown_fields() {
+    return &_unknown_fields_;
+  }
+  // Static accessors; only declared here, their definitions are outside this class body.
+  static const ::google::protobuf::Descriptor* descriptor();
+  static const RGBImage& default_instance();
+
+  void Swap(RGBImage* other);
+
+  // implements Message ----------------------------------------------
+
+  RGBImage* New() const;
+  void CopyFrom(const ::google::protobuf::Message& from);
+  void MergeFrom(const ::google::protobuf::Message& from);
+  void CopyFrom(const RGBImage& from);
+  void MergeFrom(const RGBImage& from);
+  void Clear();
+  bool IsInitialized() const;
+  // Sizing, parsing from a CodedInputStream, and serializing to a CodedOutputStream / byte array (declarations only).
+  int ByteSize() const;
+  bool MergePartialFromCodedStream(
+      ::google::protobuf::io::CodedInputStream* input);
+  void SerializeWithCachedSizes(
+      ::google::protobuf::io::CodedOutputStream* output) const;
+  ::google::protobuf::uint8* SerializeWithCachedSizesToArray(::google::protobuf::uint8* output) const;
+  int GetCachedSize() const { return _cached_size_; }
+  private:
+  void SharedCtor();
+  void SharedDtor();
+  void SetCachedSize(int size) const;
+  public:
+
+  ::google::protobuf::Metadata GetMetadata() const;
+
+  // nested types ----------------------------------------------------
+
+  // accessors -------------------------------------------------------
+
+  // optional float scale = 1 [default = 1];
+  inline bool has_scale() const;
+  inline void clear_scale();
+  static const int kScaleFieldNumber = 1;
+  inline float scale() const;
+  inline void set_scale(float value);
+
+  // optional int32 cropsize = 2 [default = 0];
+  inline bool has_cropsize() const;
+  inline void clear_cropsize();
+  static const int kCropsizeFieldNumber = 2;
+  inline ::google::protobuf::int32 cropsize() const;
+  inline void set_cropsize(::google::protobuf::int32 value);
+
+  // optional bool mirror = 3 [default = false];
+  inline bool has_mirror() const;
+  inline void clear_mirror();
+  static const int kMirrorFieldNumber = 3;
+  inline bool mirror() const;
+  inline void set_mirror(bool value);
+
+  // optional string meanfile = 4;
+  inline bool has_meanfile() const;
+  inline void clear_meanfile();
+  static const int kMeanfileFieldNumber = 4;
+  inline const ::std::string& meanfile() const;
+  inline void set_meanfile(const ::std::string& value);
+  inline void set_meanfile(const char* value);
+  inline void set_meanfile(const char* value, size_t size);
+  inline ::std::string* mutable_meanfile();
+  inline ::std::string* release_meanfile();
+  inline void set_allocated_meanfile(::std::string* meanfile);
+  // NOTE(review): the marker below suggests this class is protoc output -- confirm before hand-editing.
+  // @@protoc_insertion_point(class_scope:singa.RGBImage)
+ private:
+  inline void set_has_scale();
+  inline void clear_has_scale();
+  inline void set_has_cropsize();
+  inline void clear_has_cropsize();
+  inline void set_has_mirror();
+  inline void clear_has_mirror();
+  inline void set_has_meanfile();
+  inline void clear_has_meanfile();
+  // Backing store returned by unknown_fields() / mutable_unknown_fields().
+  ::google::protobuf::UnknownFieldSet _unknown_fields_;
+  // One data member per field declared above; meanfile_ is held as a ::std::string pointer.
+  float scale_;
+  ::google::protobuf::int32 cropsize_;
+  ::std::string* meanfile_;
+  bool mirror_;
+  // _cached_size_ is what GetCachedSize() returns; _has_bits_ is (4 + 31) / 32 == 1 word (presumably presence bits -- TODO confirm).
+  mutable int _cached_size_;
+  ::google::protobuf::uint32 _has_bits_[(4 + 31) / 32];
+  // The protobuf_*_model_2eproto() functions are granted access to private members.
+  friend void  protobuf_AddDesc_model_2eproto();
+  friend void protobuf_AssignDesc_model_2eproto();
+  friend void protobuf_ShutdownFile_model_2eproto();
+
+  void InitAsDefaultInstance();
+  static RGBImage* default_instance_;
+};
+// -------------------------------------------------------------------
+// SplitProto: message class for singa.SplitProto with a single int32 field, num_splits.
+class SplitProto : public ::google::protobuf::Message {
+ public:
+  SplitProto();
+  virtual ~SplitProto();
+  // Copy constructor.
+  SplitProto(const SplitProto& from);
+  // Copy assignment: delegates to CopyFrom(from) and returns *this.
+  inline SplitProto& operator=(const SplitProto& from) {
+    CopyFrom(from);
+    return *this;
+  }
+  // Read-only access to the _unknown_fields_ member.
+  inline const ::google::protobuf::UnknownFieldSet& unknown_fields() const {
+    return _unknown_fields_;
+  }
+  // Mutable pointer to the same _unknown_fields_ member.
+  inline ::google::protobuf::UnknownFieldSet* mutable_unknown_fields() {
+    return &_unknown_fields_;
+  }
+  // Static accessors; only declared here, their definitions are outside this class body.
+  static const ::google::protobuf::Descriptor* descriptor();
+  static const SplitProto& default_instance();
+
+  void Swap(SplitProto* other);
+
+  // implements Message ----------------------------------------------
+
+  SplitProto* New() const;
+  void CopyFrom(const ::google::protobuf::Message& from);
+  void MergeFrom(const ::google::protobuf::Message& from);
+  void CopyFrom(const SplitProto& from);
+  void MergeFrom(const SplitProto& from);
+  void Clear();
+  bool IsInitialized() const;
+  // Sizing, parsing from a CodedInputStream, and serializing to a CodedOutputStream / byte array (declarations only).
+  int ByteSize() const;
+  bool MergePartialFromCodedStream(
+      ::google::protobuf::io::CodedInputStream* input);
+  void SerializeWithCachedSizes(
+      ::google::protobuf::io::CodedOutputStream* output) const;
+  ::google::protobuf::uint8* SerializeWithCachedSizesToArray(::google::protobuf::uint8* output) const;
+  int GetCachedSize() const { return _cached_size_; }
+  private:
+  void SharedCtor();
+  void SharedDtor();
+  void SetCachedSize(int size) const;
+  public:
+
+  ::google::protobuf::Metadata GetMetadata() const;
+
+  // nested types ----------------------------------------------------
+
+  // accessors -------------------------------------------------------
+
+  // optional int32 num_splits = 1;
+  inline bool has_num_splits() const;
+  inline void clear_num_splits();
+  static const int kNumSplitsFieldNumber = 1;
+  inline ::google::protobuf::int32 num_splits() const;
+  inline void set_num_splits(::google::protobuf::int32 value);
+  // NOTE(review): the marker below suggests this class is protoc output -- confirm before hand-editing.
+  // @@protoc_insertion_point(class_scope:singa.SplitProto)
+ private:
+  inline void set_has_num_splits();
+  inline void clear_has_num_splits();
+  // Backing store returned by unknown_fields() / mutable_unknown_fields().
+  ::google::protobuf::UnknownFieldSet _unknown_fields_;
+  // Data member for the num_splits field declared above.
+  ::google::protobuf::int32 num_splits_;
+  // _cached_size_ is what GetCachedSize() returns; _has_bits_ is (1 + 31) / 32 == 1 word (presumably presence bits -- TODO confirm).
+  mutable int _cached_size_;
+  ::google::protobuf::uint32 _has_bits_[(1 + 31) / 32];
+  // The protobuf_*_model_2eproto() functions are granted access to private members.
+  friend void  protobuf_AddDesc_model_2eproto();
+  friend void protobuf_AssignDesc_model_2eproto();
+  friend void protobuf_ShutdownFile_model_2eproto();
+
+  void InitAsDefaultInstance();
+  static SplitProto* default_instance_;
+};
+// -------------------------------------------------------------------
+// TanhProto: message class for singa.TanhProto with two float fields, outer_scale and inner_scale.
+class TanhProto : public ::google::protobuf::Message {
+ public:
+  TanhProto();
+  virtual ~TanhProto();
+  // Copy constructor.
+  TanhProto(const TanhProto& from);
+  // Copy assignment: delegates to CopyFrom(from) and returns *this.
+  inline TanhProto& operator=(const TanhProto& from) {
+    CopyFrom(from);
+    return *this;
+  }
+  // Read-only access to the _unknown_fields_ member.
+  inline const ::google::protobuf::UnknownFieldSet& unknown_fields() const {
+    return _unknown_fields_;
+  }
+  // Mutable pointer to the same _unknown_fields_ member.
+  inline ::google::protobuf::UnknownFieldSet* mutable_unknown_fields() {
+    return &_unknown_fields_;
+  }
+  // Static accessors; only declared here, their definitions are outside this class body.
+  static const ::google::protobuf::Descriptor* descriptor();
+  static const TanhProto& default_instance();
+
+  void Swap(TanhProto* other);
+
+  // implements Message ----------------------------------------------
+
+  TanhProto* New() const;
+  void CopyFrom(const ::google::protobuf::Message& from);
+  void MergeFrom(const ::google::protobuf::Message& from);
+  void CopyFrom(const TanhProto& from);
+  void MergeFrom(const TanhProto& from);
+  void Clear();
+  bool IsInitialized() const;
+  // Sizing, parsing from a CodedInputStream, and serializing to a CodedOutputStream / byte array (declarations only).
+  int ByteSize() const;
+  bool MergePartialFromCodedStream(
+      ::google::protobuf::io::CodedInputStream* input);
+  void SerializeWithCachedSizes(
+      ::google::protobuf::io::CodedOutputStream* output) const;
+  ::google::protobuf::uint8* SerializeWithCachedSizesToArray(::google::protobuf::uint8* output) const;
+  int GetCachedSize() const { return _cached_size_; }
+  private:
+  void SharedCtor();
+  void SharedDtor();
+  void SetCachedSize(int size) const;
+  public:
+
+  ::google::protobuf::Metadata GetMetadata() const;
+
+  // nested types ----------------------------------------------------
+
+  // accessors -------------------------------------------------------
+
+  // optional float outer_scale = 1 [default = 1];
+  inline bool has_outer_scale() const;
+  inline void clear_outer_scale();
+  static const int kOuterScaleFieldNumber = 1;
+  inline float outer_scale() const;
+  inline void set_outer_scale(float value);
+
+  // optional float inner_scale = 2 [default = 1];
+  inline bool has_inner_scale() const;
+  inline void clear_inner_scale();
+  static const int kInnerScaleFieldNumber = 2;
+  inline float inner_scale() const;
+  inline void set_inner_scale(float value);
+  // NOTE(review): the marker below suggests this class is protoc output -- confirm before hand-editing.
+  // @@protoc_insertion_point(class_scope:singa.TanhProto)
+ private:
+  inline void set_has_outer_scale();
+  inline void clear_has_outer_scale();
+  inline void set_has_inner_scale();
+  inline void clear_has_inner_scale();
+  // Backing store returned by unknown_fields() / mutable_unknown_fields().
+  ::google::protobuf::UnknownFieldSet _unknown_fields_;
+  // One data member per field declared above.
+  float outer_scale_;
+  float inner_scale_;
+  // _cached_size_ is what GetCachedSize() returns; _has_bits_ is (2 + 31) / 32 == 1 word (presumably presence bits -- TODO confirm).
+  mutable int _cached_size_;
+  ::google::protobuf::uint32 _has_bits_[(2 + 31) / 32];
+  // The protobuf_*_model_2eproto() functions are granted access to private members.
+  friend void  protobuf_AddDesc_model_2eproto();
+  friend void protobuf_AssignDesc_model_2eproto();
+  friend void protobuf_ShutdownFile_model_2eproto();
+
+  void InitAsDefaultInstance();
+  static TanhProto* default_instance_;
+};
+// -------------------------------------------------------------------
+// SoftmaxLossProto: message class for singa.SoftmaxLossProto with fields topk (int32) and scale (float).
+class SoftmaxLossProto : public ::google::protobuf::Message {
+ public:
+  SoftmaxLossProto();
+  virtual ~SoftmaxLossProto();
+  // Copy constructor.
+  SoftmaxLossProto(const SoftmaxLossProto& from);
+  // Copy assignment: delegates to CopyFrom(from) and returns *this.
+  inline SoftmaxLossProto& operator=(const SoftmaxLossProto& from) {
+    CopyFrom(from);
+    return *this;
+  }
+  // Read-only access to the _unknown_fields_ member.
+  inline const ::google::protobuf::UnknownFieldSet& unknown_fields() const {
+    return _unknown_fields_;
+  }
+  // Mutable pointer to the same _unknown_fields_ member.
+  inline ::google::protobuf::UnknownFieldSet* mutable_unknown_fields() {
+    return &_unknown_fields_;
+  }
+  // Static accessors; only declared here, their definitions are outside this class body.
+  static const ::google::protobuf::Descriptor* descriptor();
+  static const SoftmaxLossProto& default_instance();
+
+  void Swap(SoftmaxLossProto* other);
+
+  // implements Message ----------------------------------------------
+
+  SoftmaxLossProto* New() const;
+  void CopyFrom(const ::google::protobuf::Message& from);
+  void MergeFrom(const ::google::protobuf::Message& from);
+  void CopyFrom(const SoftmaxLossProto& from);
+  void MergeFrom(const SoftmaxLossProto& from);
+  void Clear();
+  bool IsInitialized() const;
+  // Sizing, parsing from a CodedInputStream, and serializing to a CodedOutputStream / byte array (declarations only).
+  int ByteSize() const;
+  bool MergePartialFromCodedStream(
+      ::google::protobuf::io::CodedInputStream* input);
+  void SerializeWithCachedSizes(
+      ::google::protobuf::io::CodedOutputStream* output) const;
+  ::google::protobuf::uint8* SerializeWithCachedSizesToArray(::google::protobuf::uint8* output) const;
+  int GetCachedSize() const { return _cached_size_; }
+  private:
+  void SharedCtor();
+  void SharedDtor();
+  void SetCachedSize(int size) const;
+  public:
+
+  ::google::protobuf::Metadata GetMetadata() const;
+
+  // nested types ----------------------------------------------------
+
+  // accessors -------------------------------------------------------
+
+  // optional int32 topk = 1 [default = 1];
+  inline bool has_topk() const;
+  inline void clear_topk();
+  static const int kTopkFieldNumber = 1;
+  inline ::google::protobuf::int32 topk() const;
+  inline void set_topk(::google::protobuf::int32 value);
+
+  // optional float scale = 2 [default = 1];
+  inline bool has_scale() const;
+  inline void clear_scale();
+  static const int kScaleFieldNumber = 2;
+  inline float scale() const;
+  inline void set_scale(float value);
+  // NOTE(review): the marker below suggests this class is protoc output -- confirm before hand-editing.
+  // @@protoc_insertion_point(class_scope:singa.SoftmaxLossProto)
+ private:
+  inline void set_has_topk();
+  inline void clear_has_topk();
+  inline void set_has_scale();
+  inline void clear_has_scale();
+  // Backing store returned by unknown_fields() / mutable_unknown_fields().
+  ::google::protobuf::UnknownFieldSet _unknown_fields_;
+  // One data member per field declared above.
+  ::google::protobuf::int32 topk_;
+  float scale_;
+  // _cached_size_ is what GetCachedSize() returns; _has_bits_ is (2 + 31) / 32 == 1 word (presumably presence bits -- TODO confirm).
+  mutable int _cached_size_;
+  ::google::protobuf::uint32 _has_bits_[(2 + 31) / 32];
+  // The protobuf_*_model_2eproto() functions are granted access to private members.
+  friend void  protobuf_AddDesc_model_2eproto();
+  friend void protobuf_AssignDesc_model_2eproto();
+  friend void protobuf_ShutdownFile_model_2eproto();
+
+  void InitAsDefaultInstance();
+  static SoftmaxLossProto* default_instance_;
+};
+// -------------------------------------------------------------------
+// ConvolutionProto: message class for singa.ConvolutionProto (num_filters, bias_term, pad, stride, kernel).
+class ConvolutionProto : public ::google::protobuf::Message {
+ public:
+  ConvolutionProto();
+  virtual ~ConvolutionProto();
+  // Copy constructor.
+  ConvolutionProto(const ConvolutionProto& from);
+  // Copy assignment: delegates to CopyFrom(from) and returns *this.
+  inline ConvolutionProto& operator=(const ConvolutionProto& from) {
+    CopyFrom(from);
+    return *this;
+  }
+  // Read-only access to the _unknown_fields_ member.
+  inline const ::google::protobuf::UnknownFieldSet& unknown_fields() const {
+    return _unknown_fields_;
+  }
+  // Mutable pointer to the same _unknown_fields_ member.
+  inline ::google::protobuf::UnknownFieldSet* mutable_unknown_fields() {
+    return &_unknown_fields_;
+  }
+  // Static accessors; only declared here, their definitions are outside this class body.
+  static const ::google::protobuf::Descriptor* descriptor();
+  static const ConvolutionProto& default_instance();
+
+  void Swap(ConvolutionProto* other);
+
+  // implements Message ----------------------------------------------
+
+  ConvolutionProto* New() const;
+  void CopyFrom(const ::google::protobuf::Message& from);
+  void MergeFrom(const ::google::protobuf::Message& from);
+  void CopyFrom(const ConvolutionProto& from);
+  void MergeFrom(const ConvolutionProto& from);
+  void Clear();
+  bool IsInitialized() const;
+  // Sizing, parsing from a CodedInputStream, and serializing to a CodedOutputStream / byte array (declarations only).
+  int ByteSize() const;
+  bool MergePartialFromCodedStream(
+      ::google::protobuf::io::CodedInputStream* input);
+  void SerializeWithCachedSizes(
+      ::google::protobuf::io::CodedOutputStream* output) const;
+  ::google::protobuf::uint8* SerializeWithCachedSizesToArray(::google::protobuf::uint8* output) const;
+  int GetCachedSize() const { return _cached_size_; }
+  private:
+  void SharedCtor();
+  void SharedDtor();
+  void SetCachedSize(int size) const;
+  public:
+
+  ::google::protobuf::Metadata GetMetadata() const;
+
+  // nested types ----------------------------------------------------
+
+  // accessors -------------------------------------------------------
+
+  // optional uint32 num_filters = 1;
+  inline bool has_num_filters() const;
+  inline void clear_num_filters();
+  static const int kNumFiltersFieldNumber = 1;
+  inline ::google::protobuf::uint32 num_filters() const;
+  inline void set_num_filters(::google::protobuf::uint32 value);
+
+  // optional bool bias_term = 2 [default = true];
+  inline bool has_bias_term() const;
+  inline void clear_bias_term();
+  static const int kBiasTermFieldNumber = 2;
+  inline bool bias_term() const;
+  inline void set_bias_term(bool value);
+
+  // optional uint32 pad = 3 [default = 0];
+  inline bool has_pad() const;
+  inline void clear_pad();
+  static const int kPadFieldNumber = 3;
+  inline ::google::protobuf::uint32 pad() const;
+  inline void set_pad(::google::protobuf::uint32 value);
+
+  // optional uint32 stride = 4 [default = 1];
+  inline bool has_stride() const;
+  inline void clear_stride();
+  static const int kStrideFieldNumber = 4;
+  inline ::google::protobuf::uint32 stride() const;
+  inline void set_stride(::google::protobuf::uint32 value);
+  // kernel is the only field of this message declared `required`; the others are `optional`.
+  // required uint32 kernel = 5;
+  inline bool has_kernel() const;
+  inline void clear_kernel();
+  static const int kKernelFieldNumber = 5;
+  inline ::google::protobuf::uint32 kernel() const;
+  inline void set_kernel(::google::protobuf::uint32 value);
+  // NOTE(review): the marker below suggests this class is protoc output -- confirm before hand-editing.
+  // @@protoc_insertion_point(class_scope:singa.ConvolutionProto)
+ private:
+  inline void set_has_num_filters();
+  inline void clear_has_num_filters();
+  inline void set_has_bias_term();
+  inline void clear_has_bias_term();
+  inline void set_has_pad();
+  inline void clear_has_pad();
+  inline void set_has_stride();
+  inline void clear_has_stride();
+  inline void set_has_kernel();
+  inline void clear_has_kernel();
+  // Backing store returned by unknown_fields() / mutable_unknown_fields().
+  ::google::protobuf::UnknownFieldSet _unknown_fields_;
+  // One data member per field declared above.
+  ::google::protobuf::uint32 num_filters_;
+  bool bias_term_;
+  ::google::protobuf::uint32 pad_;
+  ::google::protobuf::uint32 stride_;
+  ::google::protobuf::uint32 kernel_;
+  // _cached_size_ is what GetCachedSize() returns; _has_bits_ is (5 + 31) / 32 == 1 word (presumably presence bits -- TODO confirm).
+  mutable int _cached_size_;
+  ::google::protobuf::uint32 _has_bits_[(5 + 31) / 32];
+  // The protobuf_*_model_2eproto() functions are granted access to private members.
+  friend void  protobuf_AddDesc_model_2eproto();
+  friend void protobuf_AssignDesc_model_2eproto();
+  friend void protobuf_ShutdownFile_model_2eproto();
+
+  void InitAsDefaultInstance();
+  static ConvolutionProto* default_instance_;
+};
+// -------------------------------------------------------------------
+// ConcateProto: message class for singa.ConcateProto with int32 fields concate_dimension and concate_num.
+class ConcateProto : public ::google::protobuf::Message {
+ public:
+  ConcateProto();
+  virtual ~ConcateProto();
+  // Copy constructor.
+  ConcateProto(const ConcateProto& from);
+  // Copy assignment: delegates to CopyFrom(from) and returns *this.
+  inline ConcateProto& operator=(const ConcateProto& from) {
+    CopyFrom(from);
+    return *this;
+  }
+  // Read-only access to the _unknown_fields_ member.
+  inline const ::google::protobuf::UnknownFieldSet& unknown_fields() const {
+    return _unknown_fields_;
+  }
+  // Mutable pointer to the same _unknown_fields_ member.
+  inline ::google::protobuf::UnknownFieldSet* mutable_unknown_fields() {
+    return &_unknown_fields_;
+  }
+  // Static accessors; only declared here, their definitions are outside this class body.
+  static const ::google::protobuf::Descriptor* descriptor();
+  static const ConcateProto& default_instance();
+
+  void Swap(ConcateProto* other);
+
+  // implements Message ----------------------------------------------
+
+  ConcateProto* New() const;
+  void CopyFrom(const ::google::protobuf::Message& from);
+  void MergeFrom(const ::google::protobuf::Message& from);
+  void CopyFrom(const ConcateProto& from);
+  void MergeFrom(const ConcateProto& from);
+  void Clear();
+  bool IsInitialized() const;
+  // Sizing, parsing from a CodedInputStream, and serializing to a CodedOutputStream / byte array (declarations only).
+  int ByteSize() const;
+  bool MergePartialFromCodedStream(
+      ::google::protobuf::io::CodedInputStream* input);
+  void SerializeWithCachedSizes(
+      ::google::protobuf::io::CodedOutputStream* output) const;
+  ::google::protobuf::uint8* SerializeWithCachedSizesToArray(::google::protobuf::uint8* output) const;
+  int GetCachedSize() const { return _cached_size_; }
+  private:
+  void SharedCtor();
+  void SharedDtor();
+  void SetCachedSize(int size) const;
+  public:
+
+  ::google::protobuf::Metadata GetMetadata() const;
+
+  // nested types ----------------------------------------------------
+
+  // accessors -------------------------------------------------------
+
+  // optional int32 concate_dimension = 1;
+  inline bool has_concate_dimension() const;
+  inline void clear_concate_dimension();
+  static const int kConcateDimensionFieldNumber = 1;
+  inline ::google::protobuf::int32 concate_dimension() const;
+  inline void set_concate_dimension(::google::protobuf::int32 value);
+
+  // optional int32 concate_num = 2;
+  inline bool has_concate_num() const;
+  inline void clear_concate_num();
+  static const int kConcateNumFieldNumber = 2;
+  inline ::google::protobuf::int32 concate_num() const;
+  inline void set_concate_num(::google::protobuf::int32 value);
+  // NOTE(review): the marker below suggests this class is protoc output -- confirm before hand-editing.
+  // @@protoc_insertion_point(class_scope:singa.ConcateProto)
+ private:
+  inline void set_has_concate_dimension();
+  inline void clear_has_concate_dimension();
+  inline void set_has_concate_num();
+  inline void clear_has_concate_num();
+  // Backing store returned by unknown_fields() / mutable_unknown_fields().
+  ::google::protobuf::UnknownFieldSet _unknown_fields_;
+  // One data member per field declared above.
+  ::google::protobuf::int32 concate_dimension_;
+  ::google::protobuf::int32 concate_num_;
+  // _cached_size_ is what GetCachedSize() returns; _has_bits_ is (2 + 31) / 32 == 1 word (presumably presence bits -- TODO confirm).
+  mutable int _cached_size_;
+  ::google::protobuf::uint32 _has_bits_[(2 + 31) / 32];
+  // The protobuf_*_model_2eproto() functions are granted access to private members.
+  friend void  protobuf_AddDesc_model_2eproto();
+  friend void protobuf_AssignDesc_model_2eproto();
+  friend void protobuf_ShutdownFile_model_2eproto();
+
+  void InitAsDefaultInstance();
+  static ConcateProto* default_instance_;
+};
+// -------------------------------------------------------------------
+// DataProto: message class for singa.DataProto with fields source, path, batchsize and random_skip.
+class DataProto : public ::google::protobuf::Message {
+ public:
+  DataProto();
+  virtual ~DataProto();
+  // Copy constructor.
+  DataProto(const DataProto& from);
+  // Copy assignment: delegates to CopyFrom(from) and returns *this.
+  inline DataProto& operator=(const DataProto& from) {
+    CopyFrom(from);
+    return *this;
+  }
+  // Read-only access to the _unknown_fields_ member.
+  inline const ::google::protobuf::UnknownFieldSet& unknown_fields() const {
+    return _unknown_fields_;
+  }
+  // Mutable pointer to the same _unknown_fields_ member.
+  inline ::google::protobuf::UnknownFieldSet* mutable_unknown_fields() {
+    return &_unknown_fields_;
+  }
+  // Static accessors; only declared here, their definitions are outside this class body.
+  static const ::google::protobuf::Descriptor* descriptor();
+  static const DataProto& default_instance();
+
+  void Swap(DataProto* other);
+
+  // implements Message ----------------------------------------------
+
+  DataProto* New() const;
+  void CopyFrom(const ::google::protobuf::Message& from);
+  void MergeFrom(const ::google::protobuf::Message& from);
+  void CopyFrom(const DataProto& from);
+  void MergeFrom(const DataProto& from);
+  void Clear();
+  bool IsInitialized() const;
+  // Sizing, parsing from a CodedInputStream, and serializing to a CodedOutputStream / byte array (declarations only).
+  int ByteSize() const;
+  bool MergePartialFromCodedStream(
+      ::google::protobuf::io::CodedInputStream* input);
+  void SerializeWithCachedSizes(
+      ::google::protobuf::io::CodedOutputStream* output) const;
+  ::google::protobuf::uint8* SerializeWithCachedSizesToArray(::google::protobuf::uint8* output) const;
+  int GetCachedSize() const { return _cached_size_; }
+  private:
+  void SharedCtor();
+  void SharedDtor();
+  void SetCachedSize(int size) const;
+  public:
+
+  ::google::protobuf::Metadata GetMetadata() const;
+
+  // nested types ----------------------------------------------------
+
+  // accessors -------------------------------------------------------
+
+  // optional string source = 1;
+  inline bool has_source() const;
+  inline void clear_source();
+  static const int kSourceFieldNumber = 1;
+  inline const ::std::string& source() const;
+  inline void set_source(const ::std::string& value);
+  inline void set_source(const char* value);
+  inline void set_source(const char* value, size_t size);
+  inline ::std::string* mutable_source();
+  inline ::std::string* release_source();
+  inline void set_allocated_source(::std::string* source);
+
+  // optional string path = 2;
+  inline bool has_path() const;
+  inline void clear_path();
+  static const int kPathFieldNumber = 2;
+  inline const ::std::string& path() const;
+  inline void set_path(const ::std::string& value);
+  inline void set_path(const char* value);
+  inline void set_path(const char* value, size_t size);
+  inline ::std::string* mutable_path();
+  inline ::std::string* release_path();
+  inline void set_allocated_path(::std::string* path);
+  // Field numbers jump from 2 to 4 here; no field numbered 3 is declared in this class.
+  // optional uint32 batchsize = 4;
+  inline bool has_batchsize() const;
+  inline void clear_batchsize();
+  static const int kBatchsizeFieldNumber = 4;
+  inline ::google::protobuf::uint32 batchsize() const;
+  inline void set_batchsize(::google::protobuf::uint32 value);
+
+  // optional uint32 random_skip = 5 [default = 0];
+  inline bool has_random_skip() const;
+  inline void clear_random_skip();
+  static const int kRandomSkipFieldNumber = 5;
+  inline ::google::protobuf::uint32 random_skip() const;
+  inline void set_random_skip(::google::protobuf::uint32 value);
+  // NOTE(review): the marker below suggests this class is protoc output -- confirm before hand-editing.
+  // @@protoc_insertion_point(class_scope:singa.DataProto)
+ private:
+  inline void set_has_source();
+  inline void clear_has_source();
+  inline void set_has_path();
+  inline void clear_has_path();
+  inline void set_has_batchsize();
+  inline void clear_has_batchsize();
+  inline void set_has_random_skip();
+  inline void clear_has_random_skip();
+  // Backing store returned by unknown_fields() / mutable_unknown_fields().
+  ::google::protobuf::UnknownFieldSet _unknown_fields_;
+  // One data member per field declared above; source_ and path_ are held as ::std::string pointers.
+  ::std::string* source_;
+  ::std::string* path_;
+  ::google::protobuf::uint32 batchsize_;
+  ::google::protobuf::uint32 random_skip_;
+  // _cached_size_ is what GetCachedSize() returns; _has_bits_ is (4 + 31) / 32 == 1 word (presumably presence bits -- TODO confirm).
+  mutable int _cached_size_;
+  ::google::protobuf::uint32 _has_bits_[(4 + 31) / 32];
+  // The protobuf_*_model_2eproto() functions are granted access to private members.
+  friend void  protobuf_AddDesc_model_2eproto();
+  friend void protobuf_AssignDesc_model_2eproto();
+  friend void protobuf_ShutdownFile_model_2eproto();
+
+  void InitAsDefaultInstance();
+  static DataProto* default_instance_;
+};
+// -------------------------------------------------------------------
+
+// Message class for singa.MnistProto. This header appears to be produced by
+// protoc (note the protoc_insertion_point marker below), so changes normally
+// belong in the .proto definition rather than here.
+//
+// All nine fields are optional scalars; the field number and default value of
+// each one are given in the comment that precedes its accessors:
+//   kernel, resize, elastic_freq        (int32)
+//   sigma, alpha, beta, gamma,
+//   norm_a, norm_b                      (float)
+// Presence of each field is tracked in _has_bits_.
+class MnistProto : public ::google::protobuf::Message {
+ public:
+  MnistProto();
+  virtual ~MnistProto();
+
+  MnistProto(const MnistProto& from);
+
+  inline MnistProto& operator=(const MnistProto& from) {
+    CopyFrom(from);
+    return *this;
+  }
+
+  inline const ::google::protobuf::UnknownFieldSet& unknown_fields() const {
+    return _unknown_fields_;
+  }
+
+  inline ::google::protobuf::UnknownFieldSet* mutable_unknown_fields() {
+    return &_unknown_fields_;
+  }
+
+  static const ::google::protobuf::Descriptor* descriptor();
+  static const MnistProto& default_instance();
+
+  void Swap(MnistProto* other);
+
+  // implements Message ----------------------------------------------
+
+  MnistProto* New() const;
+  void CopyFrom(const ::google::protobuf::Message& from);
+  void MergeFrom(const ::google::protobuf::Message& from);
+  void CopyFrom(const MnistProto& from);
+  void MergeFrom(const MnistProto& from);
+  void Clear();
+  bool IsInitialized() const;
+
+  int ByteSize() const;
+  bool MergePartialFromCodedStream(
+      ::google::protobuf::io::CodedInputStream* input);
+  void SerializeWithCachedSizes(
+      ::google::protobuf::io::CodedOutputStream* output) const;
+  ::google::protobuf::uint8* SerializeWithCachedSizesToArray(::google::protobuf::uint8* output) const;
+  int GetCachedSize() const { return _cached_size_; }
+  private:
+  void SharedCtor();
+  void SharedDtor();
+  void SetCachedSize(int size) const;
+  public:
+
+  ::google::protobuf::Metadata GetMetadata() const;
+
+  // nested types ----------------------------------------------------
+
+  // accessors -------------------------------------------------------
+
+  // optional int32 kernel = 1 [default = 0];
+  inline bool has_kernel() const;
+  inline void clear_kernel();
+  static const int kKernelFieldNumber = 1;
+  inline ::google::protobuf::int32 kernel() const;
+  inline void set_kernel(::google::protobuf::int32 value);
+
+  // optional float sigma = 2 [default = 0];
+  inline bool has_sigma() const;
+  inline void clear_sigma();
+  static const int kSigmaFieldNumber = 2;
+  inline float sigma() const;
+  inline void set_sigma(float value);
+
+  // optional float alpha = 3 [default = 0];
+  inline bool has_alpha() const;
+  inline void clear_alpha();
+  static const int kAlphaFieldNumber = 3;
+  inline float alpha() const;
+  inline void set_alpha(float value);
+
+  // optional float beta = 4 [default = 0];
+  inline bool has_beta() const;
+  inline void clear_beta();
+  static const int kBetaFieldNumber = 4;
+  inline float beta() const;
+  inline void set_beta(float value);
+
+  // optional float gamma = 5 [default = 0];
+  inline bool has_gamma() const;
+  inline void clear_gamma();
+  static const int kGammaFieldNumber = 5;
+  inline float gamma() const;
+  inline void set_gamma(float value);
+
+  // optional int32 resize = 6 [default = 0];
+  inline bool has_resize() const;
+  inline void clear_resize();
+  static const int kResizeFieldNumber = 6;
+  inline ::google::protobuf::int32 resize() const;
+  inline void set_resize(::google::protobuf::int32 value);
+
+  // optional int32 elastic_freq = 7 [default = 0];
+  inline bool has_elastic_freq() const;
+  inline void clear_elastic_freq();
+  static const int kElasticFreqFieldNumber = 7;
+  inline ::google::protobuf::int32 elastic_freq() const;
+  inline void set_elastic_freq(::google::protobuf::int32 value);
+
+  // optional float norm_a = 8 [default = 1];
+  inline bool has_norm_a() const;
+  inline void clear_norm_a();
+  static const int kNormAFieldNumber = 8;
+  inline float norm_a() const;
+  inline void set_norm_a(float value);
+
+  // optional float norm_b = 9 [default = 0];
+  inline bool has_norm_b() const;
+  inline void clear_norm_b();
+  static const int kNormBFieldNumber = 9;
+  inline float norm_b() const;
+  inline void set_norm_b(float value);
+
+  // @@protoc_insertion_point(class_scope:singa.MnistProto)
+ private:
+  inline void set_has_kernel();
+  inline void clear_has_kernel();
+  inline void set_has_sigma();
+  inline void clear_has_sigma();
+  inline void set_has_alpha();
+  inline void clear_has_alpha();
+  inline void set_has_beta();
+  inline void clear_has_beta();
+  inline void set_has_gamma();
+  inline void clear_has_gamma();
+  inline void set_has_resize();
+  inline void clear_has_resize();
+  inline void set_has_elastic_freq();
+  inline void clear_has_elastic_freq();
+  inline void set_has_norm_a();
+  inline void clear_has_norm_a();
+  inline void set_has_norm_b();
+  inline void clear_has_norm_b();
+
+  ::google::protobuf::UnknownFieldSet _unknown_fields_;
+
+  ::google::protobuf::int32 kernel_;
+  float sigma_;
+  float alpha_;
+  float beta_;
+  float gamma_;
+  ::google::protobuf::int32 resize_;
+  ::google::protobuf::int32 elastic_freq_;
+  float norm_a_;
+  float norm_b_;
+
+  mutable int _cached_size_;
+  // one presence bit per field: 9 fields fit in a single 32-bit word
+  ::google::protobuf::uint32 _has_bits_[(9 + 31) / 32];
+
+  friend void  protobuf_AddDesc_model_2eproto();
+  friend void protobuf_AssignDesc_model_2eproto();
+  friend void protobuf_ShutdownFile_model_2eproto();
+
+  void InitAsDefaultInstance();
+  static MnistProto* default_instance_;
+};
+// -------------------------------------------------------------------
+
+class DropoutProto : public ::google::protobuf::Message {
+ public:
+  DropoutProto();
+  virtual ~DropoutProto();
+
+  DropoutProto(const DropoutProto& from);
+
+  inline DropoutProto& operator=(const DropoutProto& from) {
+    CopyFrom(from);
+    return *this;
+  }
+
+  inline const ::google::protobuf::UnknownFieldSet& unknown_fields() const {
+    return _unknown_fields_;
+  }
+
+  inline ::google::protobuf::UnknownFieldSet* mutable_unknown_fields() {
+    return &_unknown_fields_;
+  }
+
+  static const ::google::protobuf::Descriptor* descriptor();
+  static const DropoutProto& default_instance();
+
+  void Swap(DropoutProto* other);
+
+  // implements Message ----------------------------------------------
+
+  DropoutProto* New() const;
+  void CopyFrom(const ::google::protobuf::Message& from

<TRUNCATED>

[09/12] incubator-singa git commit: Transfer code from nusinga repo to singa apache repo. New commuinication framework is implemented to unify the frameworks of existing distributed deep learning systems. Communication is now implmented using ZeroMQ. API

Posted by wa...@apache.org.
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/b2dc51d2/include/gtest/gtest_main.cc
----------------------------------------------------------------------
diff --git a/include/gtest/gtest_main.cc b/include/gtest/gtest_main.cc
new file mode 100644
index 0000000..f302822
--- /dev/null
+++ b/include/gtest/gtest_main.cc
@@ -0,0 +1,38 @@
+// Copyright 2006, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+#include <stdio.h>
+
+#include "gtest/gtest.h"
+
+// Default test-runner entry point: prints a banner, hands the command line
+// to Google Test, runs every registered test and returns the result of
+// RUN_ALL_TESTS() as the process exit status.
+GTEST_API_ int main(int argc, char **argv) {
+  printf("Running main() from gtest_main.cc\n");
+  testing::InitGoogleTest(&argc, argv);
+  const int exit_code = RUN_ALL_TESTS();
+  return exit_code;
+}

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/b2dc51d2/include/mshadow/cuda/cuda_reduce.cuh
----------------------------------------------------------------------
diff --git a/include/mshadow/cuda/cuda_reduce.cuh b/include/mshadow/cuda/cuda_reduce.cuh
new file mode 100644
index 0000000..b7808a6
--- /dev/null
+++ b/include/mshadow/cuda/cuda_reduce.cuh
@@ -0,0 +1,117 @@
+#ifndef MSHADOW_CUDA_REDUCE_CUH
+#define MSHADOW_CUDA_REDUCE_CUH
+/*!
+ * \file cuda_reduce.cuh
+ * \brief helper functions to do reduction
+ * \author Tianqi Chen
+ */
+namespace mshadow{
+    namespace cuda{
+        /*!
+         * \brief reduce over the dimension x
+         *
+         *  The calling thread's threadIdx.x is used as its position in buf
+         *  (see the definition below). The callers in tensor_gpu-inl.cuh read
+         *  the reduced value from buf[0] after a __syncthreads().
+         * \tparam Reducer reducer; must provide a static Reduce( dst, src )
+         * \tparam x_bits dimension = 1<<x_bits
+         * \param buf buffer of 1<<x_bits values that is reduced in place
+         */
+        template<typename Reducer,int x_bits>
+        inline __device__ void Reduce1D( volatile real_t buf[1<<x_bits] );
+        /*!
+         * \brief reduce over the dimension x
+         * \tparam Reducer reducer; must provide a static Reduce( dst, src )
+         * \tparam xmax_bits log2 of the maximum size of buffer (buf holds 1<<xmax_bits values)
+         * \param buf buffer that is reduced in place
+         * \param xsize size of x dimension, not sure if aligned
+         */
+        template<typename Reducer, int xmax_bits>
+        inline __device__ void Reduce1DNotAlign( volatile real_t buf[1<<xmax_bits], int xsize );
+    };
+};
+
+// ==================================================
+//  implementations follow,
+//  no need to read them if you only use the functions
+// --------------------------------------------------
+// __MSHADOW_EMUSYNC__ is the statement placed between the final reduction
+// steps of ReduceX (the ones that only involve tid < 16 and below).
+// When __DEVICE_EMULATION__ is defined it is a full __syncthreads();
+// otherwise it expands to nothing and those steps run without a barrier.
+#ifdef  __DEVICE_EMULATION__
+#define __MSHADOW_EMUSYNC__ __syncthreads()
+#else
+#define __MSHADOW_EMUSYNC__ 
+#endif
+
+namespace mshadow{
+    namespace cuda{        
+        template<typename Reducer, int x_bits>
+        inline __device__ void ReduceX( volatile real_t buf[], int tid ){
+            if( x_bits >= 10 ){
+                if( tid < 512 ) Reducer::Reduce( buf[tid] , buf[tid + 512] );
+                __syncthreads(); 
+            }
+            if( x_bits >= 9 ){
+                if( tid < 256 ) Reducer::Reduce( buf[tid] , buf[tid + 256] );
+                __syncthreads(); 
+            }
+            if( x_bits >= 8 ){
+                if( tid < 128 ) Reducer::Reduce( buf[tid] , buf[tid + 128] );
+                __syncthreads(); 
+            }
+            if( x_bits >= 7 ){
+                if( tid < 64  ) Reducer::Reduce( buf[tid] , buf[tid + 64 ] );
+                __syncthreads(); 
+            }            
+            if( x_bits >= 6 ){
+                if( tid < 32 ) Reducer::Reduce( buf[tid] , buf[tid + 32] );
+                __syncthreads();
+            }
+            // in warp optimization
+            if( x_bits >= 5 ){
+                if( tid < 16 ) Reducer::Reduce( buf[tid] , buf[tid + 16] );
+                __MSHADOW_EMUSYNC__;
+            }
+            if( x_bits >= 4 ){
+                if( tid < 8 ) Reducer::Reduce( buf[tid] , buf[tid + 8 ] );
+                __MSHADOW_EMUSYNC__;            
+            }
+            if( x_bits >= 3 ){
+                if( tid < 4 ) Reducer::Reduce( buf[tid] , buf[tid + 4 ] );
+                __MSHADOW_EMUSYNC__;
+            }
+            if( x_bits >= 2 ){
+                if( tid < 2 ) Reducer::Reduce( buf[tid] , buf[tid + 2 ] );
+                __MSHADOW_EMUSYNC__;
+            }
+            if( x_bits >= 1 ){
+                if( tid < 1 ) Reducer::Reduce( buf[tid] , buf[tid + 1 ] );
+                __MSHADOW_EMUSYNC__;
+            }  
+        };
+        
+        /*!
+         * \brief reduce buf in place, each thread taking the slot given by its threadIdx.x
+         * \tparam Reducer reducer passed on to ReduceX
+         * \tparam x_bits dimension = 1<<x_bits
+         * \param buf buffer of 1<<x_bits values
+         */
+        template<typename Reducer,int x_bits>
+        inline __device__ void Reduce1D( volatile real_t buf[1<<x_bits] ){
+            const int lane = threadIdx.x;
+            ReduceX<Reducer,x_bits>( buf, lane );
+        }
+
+        // reduce with a upper bound
+        //
+        // __RD_NON_ALIGN(els, x_bits) expands to one branch of an if / else-if chain
+        // (els is either empty or the keyword `else`). The branch is taken when the
+        // buffer can hold 1<<x_bits entries (xmax_bits >= x_bits) and x_size is at
+        // least 1<<x_bits. It then
+        //   1. folds the entries in [1<<x_bits, x_size) onto the front of the buffer,
+        //      limited to offsets below 1<<x_bits,
+        //   2. synchronizes the block,
+        //   3. runs the power-of-two reduction ReduceX over the first 1<<x_bits entries.
+        // The macro uses the names xmax_bits, x_size, tid, buf and Reducer from the
+        // scope in which it is expanded.
+        #define __RD_NON_ALIGN(els,x_bits)                              \
+            els                                                         \
+            if( xmax_bits >= x_bits && x_size >= (1 << x_bits) ){       \
+                if( tid < (1 << x_bits) && tid + (1<<x_bits) < x_size ){ \
+                    Reducer::Reduce( buf[tid] , buf[tid + (1<<x_bits)] ); \
+                }                                                       \
+                __syncthreads();                                        \
+                ReduceX<Reducer, x_bits>( buf, tid );                   \
+            }                                                           \
+            
+        /*!
+         * \brief reduce the first x_size entries of buf, x_size need not be a power of two
+         *
+         *  The chain below tries x_bits = 8 down to 1 and takes the first branch that
+         *  matches, i.e. the largest power of two that is <= x_size and fits the buffer.
+         *  If x_size < 2 no branch is taken and buf is left untouched.
+         *  NOTE(review): the largest branch is 8 bits and folds only one extra block of
+         *  256 entries, so entries at index 512 and above look like they would be
+         *  ignored; the caller visible here uses xmax_bits = warp_bits. Confirm before
+         *  using larger buffers.
+         * \tparam Reducer reducer; must provide a static Reduce( dst, src )
+         * \tparam xmax_bits log2 of the buffer capacity
+         * \param buf buffer that is reduced in place
+         * \param x_size number of valid entries in buf
+         */
+        template<typename Reducer, int xmax_bits>
+        inline __device__ void Reduce1DNotAlign( volatile real_t buf[], int x_size ){
+            int tid = threadIdx.x;
+            __RD_NON_ALIGN(, 8)
+            __RD_NON_ALIGN(else, 7)
+            __RD_NON_ALIGN(else, 6)
+            __RD_NON_ALIGN(else, 5) 
+            __RD_NON_ALIGN(else, 4) 
+            __RD_NON_ALIGN(else, 3) 
+            __RD_NON_ALIGN(else, 2) 
+            __RD_NON_ALIGN(else, 1)                     
+        }
+    };
+};
+
+#endif // MSHADOW_CUDA_REDUCE_CUH
+

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/b2dc51d2/include/mshadow/cuda/tensor_gpu-inl.cuh
----------------------------------------------------------------------
diff --git a/include/mshadow/cuda/tensor_gpu-inl.cuh b/include/mshadow/cuda/tensor_gpu-inl.cuh
new file mode 100644
index 0000000..61e477c
--- /dev/null
+++ b/include/mshadow/cuda/tensor_gpu-inl.cuh
@@ -0,0 +1,231 @@
+#ifndef MSHADOW_TENSOR_GPU_INL_CUH
+#define MSHADOW_TENSOR_GPU_INL_CUH
+/*!
+ * \file tensor_gpu-inl.cuh
+ * \brief implementation of GPU code using CUDA
+ * \author Bing Xu, Tianqi Chen
+ */
+#include "../tensor.h"
+#include "cuda_reduce.cuh"
+
+namespace mshadow{
+    namespace cuda{
+        // __CUDA_ARCH__ is not defined in every compilation pass; when it is missing
+        // the constants below take the same values as for __CUDA_ARCH__ >= 200.
+        #ifndef __CUDA_ARCH__
+        #warning "__CUDA_ARCH__ is not defined, I will assume compiling with CUDA verion greater than 2.0"
+        #endif
+        /* load unit for memory access */
+        #if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 200
+        const int kMemUnitBits = 5;
+        const int kMaxThreadsPerBlock = 1024;
+        #else
+        const int kMemUnitBits = 4;
+        const int kMaxThreadsPerBlock = 512;
+        #endif
+        /*!
+         * \brief number of units that can do synchronized update:
+         *        32 in the first branch above, 16 (half warp size) in the second
+         */
+        const int kMemUnit     = 1 << kMemUnitBits;
+        /*! \brief bit mask selecting the offset inside one memory unit (kMemUnit - 1) */
+        const int kMemUnitMask = kMemUnit - 1;
+        /*! \brief suggested thread number(logscale) for mapping kernel */
+        const int kBaseThreadBits = 8;
+        /*! \brief suggested thread number for mapping kernel */
+        const int kBaseThreadNum  = 1 << kBaseThreadBits;
+        /*! \brief maximum value of grid */
+        const int kMaxGridNum     = 65535;
+        /*! \brief suggested grid number for mapping kernel */
+        const int kBaseGridNum    = 1024;
+        
+        /*!
+         * \brief get the thread stride to use along the x dimension
+         * \param xsize number of elements in the x dimension
+         * \param xstride memory stride of the x dimension
+         * \return xsize rounded up to a multiple of kMemUnit when xstride is a
+         *         multiple of kMemUnit, otherwise xsize itself
+         */
+        inline index_t GetAlignStride( index_t xsize, index_t xstride ){
+            const bool stride_aligned = ( xstride & (kMemUnit-1) ) == 0;
+            if( !stride_aligned ){
+                // the storage is not aligned, so an aligned thread layout gains nothing
+                return xsize;
+            }
+            const index_t num_unit = (xsize + kMemUnit - 1) >> kMemUnitBits;
+            return num_unit << kMemUnitBits;
+        }
+        /*!
+         * \brief report an error when a launch configuration exceeds the supported limits
+         *
+         *  Checked limits: threads per block against kMaxThreadsPerBlock, and the
+         *  x and y grid dimensions against 65535.
+         * \param dimGrid grid dimensions of the launch
+         * \param dimBlock block dimensions of the launch
+         * \param estr text printed in front of the block dimensions on failure
+         */
+        inline void CheckLaunchParam( dim3 dimGrid, dim3 dimBlock, const char *estr = "" ){
+            const bool block_fits = !( dimBlock.x*dimBlock.y*dimBlock.z > (unsigned)kMaxThreadsPerBlock );
+            const bool grid_fits  = !( dimGrid.x > 65535 || dimGrid.y > 65535 );
+            if( block_fits && grid_fits ) return;
+            fprintf( stderr, "%s[%u,%u,%u]:", estr, dimBlock.x, dimBlock.y, dimBlock.z );
+            utils::Error( "too large launch parameter\n");
+        }
+    };
+
+    namespace cuda {
+        /*!
+         * \brief evaluate exp for the one element owned by the calling thread and save it to dst
+         *
+         *  The flat thread id is split into a row and a column using xstride. Threads
+         *  that land outside dst (past the last row, or in the padding between
+         *  dst.shape[0] and xstride) do nothing.
+         * \tparam Saver provides a static Save( dst_element, value )
+         * \tparam Plan expression plan, provides Eval( y, x )
+         * \tparam block_dim_bits log2 of the number of threads per block
+         * \param dst destination tensor
+         * \param xstride number of threads assigned to one row
+         * \param exp expression to evaluate
+         * \param block_idx logical block index of the calling thread
+         */
+        template<typename Saver, typename Plan, int block_dim_bits>
+        __device__ void MapPlanProc( Tensor<gpu,2> dst, const index_t xstride, const Plan exp, int block_idx ){
+            const index_t flat = (block_idx << block_dim_bits) + threadIdx.x;
+            const int row = flat / xstride;
+            const int col = flat % xstride;
+            if( !(row < dst.shape[1]) || !(col < dst.shape[0]) ) return;
+            Saver::Save( dst[row][col], exp.Eval( row, col ) );
+        }
+        /*!
+         * \brief kernel in which every block processes exactly one logical block:
+         *        forwards to MapPlanProc with blockIdx.x as the block index
+         */
+        template<typename Saver, typename Plan, int block_dim_bits>
+        __global__ void MapPlanKernel( Tensor<gpu,2> dst, const index_t xstride, const Plan exp ){
+            MapPlanProc<Saver, Plan,block_dim_bits>( dst, xstride, exp, blockIdx.x );
+        }
+        /*!
+         * \brief kernel for destinations that need more logical blocks than one grid provides
+         *
+         *  Each block handles the logical blocks blockIdx.x, blockIdx.x + grid_size, ...
+         *  for repeat rounds. Bounds are checked inside MapPlanProc.
+         * \tparam grid_size number of blocks in the launched grid
+         * \param repeat number of rounds every block performs
+         */
+        template<typename Saver, typename Plan, int block_dim_bits, int grid_size>
+        __global__ void MapPlanLargeKernel( Tensor<gpu,2> dst, const index_t xstride, const Plan exp, int repeat ){
+            for( int round = 0; round < repeat; ++round ){
+                const int logical_block = blockIdx.x + round*grid_size;
+                MapPlanProc<Saver, Plan,block_dim_bits>( dst, xstride, exp, logical_block );
+            }
+        }
+        
+        /*!
+         * \brief evaluate plan for every element of dst on the GPU and store it through Saver
+         *
+         *  One thread is assigned per element of an (aligned) row. If the required
+         *  number of blocks is below kMaxGridNum a plain one-block-per-chunk kernel is
+         *  launched, otherwise a fixed grid of kBaseGridNum blocks loops over the chunks.
+         *  An empty destination is a no-op.
+         * \tparam Saver provides a static Save( dst_element, value )
+         * \tparam E expression type of the plan
+         * \param dst destination tensor
+         * \param plan expression plan to evaluate
+         */
+        template<typename Saver, typename E>
+        inline void MapPlan( Tensor<gpu,2> dst, const expr::Plan<E> &plan ){
+            const index_t xstride = GetAlignStride( dst.shape[0], dst.shape.stride_ );
+            const int num_block = ( dst.shape[1]*xstride + kBaseThreadNum-1) / kBaseThreadNum;
+            // nothing to compute; a launch with a zero-sized grid is an invalid configuration
+            if( num_block == 0 ) return;
+            dim3 dimBlock(kBaseThreadNum, 1, 1);
+
+            if (num_block < kMaxGridNum) {
+                dim3 dimGrid(num_block, 1, 1);
+                MapPlanKernel<Saver, expr::Plan<E>, kBaseThreadBits>
+                    <<<dimGrid,dimBlock>>>(dst, xstride, plan);
+            } else {
+                // round up so that repeat * kBaseGridNum covers all num_block chunks
+                int repeat = (num_block + kBaseGridNum-1) / kBaseGridNum;
+                dim3 dimGrid( kBaseGridNum, 1 , 1 );
+                MapPlanLargeKernel<Saver,expr::Plan<E>, kBaseThreadBits, kBaseGridNum>
+                    <<<dimGrid,dimBlock>>>(dst, xstride, plan, repeat );
+            }
+        }
+    }; // namespace cuda
+    
+    namespace cuda{
+        /*!
+         * \brief reduce a 2D expression over its higher dimension (y), keeping the lowest one (x)
+         *
+         *  The host wrapper MapReduceKeepLowest launches this with a
+         *  (1<<warp_bits) x (1<<warp_bits) thread block. Each block covers
+         *  1<<warp_bits consecutive x positions. threadIdx.x selects the x position
+         *  and threadIdx.y walks over y in steps of warp_size.
+         * \tparam Saver provides a static Save( dst_element, value )
+         * \tparam Reducer provides a static Reduce( dst, src )
+         * \tparam warp_bits log2 of the block edge length
+         * \tparam Plan expression plan, provides Eval( y, x )
+         * \param dst result, one value per x
+         * \param plan expression to reduce
+         * \param scale factor applied to each reduced value before saving
+         * \param eshape shape of the expression: eshape[0] is the x size, eshape[1] the y size
+         */
+        template<typename Saver,typename Reducer, int warp_bits, typename Plan>
+        __global__ void MapRedKeepLowestKernel( Tensor<gpu,1> dst, Plan plan, real_t scale, Shape<2> eshape ){
+            const unsigned warp_size = 1 << warp_bits;
+            const unsigned x = (blockIdx.x<<warp_bits) + threadIdx.x;
+            // to avoid bank conflict
+            __shared__ real_t s_res[ warp_size ][ warp_size + 1 ];
+
+            // note: reverse store [y][x], so that we can reduce over threadIdx.x, use warp optimization
+            // slots whose guard fails stay unwritten: either x is out of range (never saved below)
+            // or threadIdx.y >= eshape[1] (excluded by Reduce1DNotAlign below)
+            if( threadIdx.y < eshape[1] && x < eshape[0] ){
+                s_res[ threadIdx.x ][ threadIdx.y ] = plan.Eval( threadIdx.y, x );
+            }
+            // accumulate the remaining rows y = threadIdx.y + warp_size, + 2*warp_size, ...
+            for( unsigned y = warp_size; y < eshape[1]; y += warp_size ){
+                if( threadIdx.y + y < eshape[1] && x < eshape[0] ){
+                    Reducer::Reduce( s_res[ threadIdx.x ][ threadIdx.y ], plan.Eval( threadIdx.y + y, x ) );
+                }
+            } 
+            __syncthreads();
+            // the threads sharing one threadIdx.y now reduce row s_res[ threadIdx.y ],
+            // which holds the partial results of the x position with offset threadIdx.y
+            if( eshape[1] >= warp_size ){
+                Reduce1D<Reducer,warp_bits>( s_res[ threadIdx.y ] );
+            }else{
+                Reduce1DNotAlign<Reducer,warp_bits>( s_res[ threadIdx.y ], eshape[1] );
+            }
+            __syncthreads();            
+            
+            // slot 0 of every row holds the final value; one thread per x writes it out
+            if( threadIdx.y == 0 && x < eshape[0] ){
+                Saver::Save( dst[x],  s_res[ threadIdx.x ][ 0 ] * scale );
+            } 
+        }        
+        
+        /*!
+         * \brief host wrapper that launches MapRedKeepLowestKernel
+         *
+         *  Uses square blocks of kMemUnit x kMemUnit threads and one block per
+         *  kMemUnit positions of the lowest dimension.
+         * \param dst result, one value per position of the lowest dimension
+         * \param plan expression to reduce
+         * \param scale factor applied to each reduced value
+         * \param eshape shape of the expression
+         */
+        template<typename Saver, typename Reducer, typename E>
+        inline void MapReduceKeepLowest( Tensor<gpu,1> dst, const expr::Plan<E> &plan, real_t scale, Shape<2> eshape ){
+            dim3 block_dim( kMemUnit, kMemUnit );
+            // ceil( eshape[0] / kMemUnit ) blocks
+            dim3 grid_dim ( (eshape[0]+kMemUnit-1) >> kMemUnitBits );
+            CheckLaunchParam( grid_dim, block_dim, "MapRedKeepLowestKernel" );
+            MapRedKeepLowestKernel<Saver,Reducer,kMemUnitBits>
+                <<<grid_dim,block_dim>>>( dst, plan, scale, eshape );
+        }
+    }; // namespace cuda
+    
+    namespace cuda{
+        /*!
+         * \brief reduce a 4D expression over dimensions 0, 1 and 3, keeping dimension 2
+         *
+         *  One block is used per index c = blockIdx.x of dimension 2. The threads of
+         *  the block stride over the pshape[0]*pshape[1]*pshape[3] remaining elements,
+         *  each keeping a private partial result, and the partial results are then
+         *  combined through shared memory.
+         * \tparam Saver provides a static Save( dst_element, value )
+         * \tparam Reducer provides kInitV and a static Reduce( dst, src )
+         * \tparam block_dim_bits log2 of the number of threads per block
+         * \tparam Plan expression plan, provides Eval( y, x )
+         * \param dst result, one value per index of dimension 2
+         * \param plan expression to reduce, addressed as a 2D (row, x) plan
+         * \param scale factor applied to each reduced value before saving
+         * \param pshape 4D shape of the expression, pshape[0] being the lowest dimension
+         */
+        template<typename Saver,typename Reducer, int block_dim_bits, typename Plan>
+        __global__ void MapReduceKeepDim2Kernel( Tensor<gpu,1> dst, Plan plan, real_t scale, Shape<4> pshape ){
+            const int block_size = 1 << block_dim_bits;
+            __shared__ real_t s_rec[ block_size ];
+            const int c = blockIdx.x;            
+            const index_t tot = pshape[0]*pshape[1]*pshape[3];
+
+            real_t res = Reducer::kInitV;
+            for( index_t i_offset = 0; i_offset < tot; i_offset += block_size ){
+                index_t i = i_offset + threadIdx.x;
+                if( i< tot ){
+                    // decompose the flat index i into (n, y, x) with x varying fastest
+                    const index_t x = i % pshape[0];
+                    i /= pshape[0]; 
+                    const index_t y = i % pshape[1];
+                    const index_t n = i / pshape[1];
+                    // row of the flattened plan that belongs to (n, c, y)
+                    Reducer::Reduce( res, plan.Eval( (n*pshape[2] + c) * pshape[1] + y, x ) );
+                }
+            }                
+            s_rec[ threadIdx.x ] = res;
+            __syncthreads();
+            Reduce1D<Reducer,block_dim_bits>( s_rec );
+            // NOTE(review): s_rec[0] is read right after Reduce1D without a further
+            // barrier; it is written last by the thread with threadIdx.x == 0 itself
+            if( threadIdx.x == 0 ){
+                Saver::Save( dst[c], s_rec[0]*scale );
+            }
+        }
+
+        /*!
+         * \brief host wrapper that launches MapReduceKeepDim2Kernel
+         *
+         *  Uses kBaseThreadNum threads per block and one block per element of dst.
+         * \param dst result, one value per index of the kept dimension
+         * \param plan expression to reduce
+         * \param scale factor applied to each reduced value
+         * \param pshape 4D shape of the expression
+         */
+        template<typename Saver, typename Reducer, typename Plan>
+        inline void MapReduceKeepDim2( Tensor<gpu,1> dst, const Plan &plan, real_t scale, Shape<4> pshape ){
+            dim3 threads( kBaseThreadNum );
+            dim3 blocks ( dst.shape[0] );
+            CheckLaunchParam( blocks, threads, "MapReduceKeepDim2" );
+            MapReduceKeepDim2Kernel<Saver,Reducer,kBaseThreadBits><<<blocks,threads>>>( dst, plan, scale, pshape );
+        }
+    };
+    
+    namespace cuda{
+        /*!
+         * \brief row-wise softmax: dst[y][x] = exp( src[y][x] - max_y ) / sum_x exp( src[y][x] - max_y )
+         *
+         *  One block handles row y = blockIdx.x. The host wrapper Softmax launches it
+         *  with 1<<x_bits threads per block. Rows longer than the block are processed
+         *  in strides of x_size.
+         * \tparam x_bits log2 of the number of threads per block
+         * \param dst destination, must have the same shape as src (checked by the host wrapper)
+         * \param src source
+         */
+        template<int x_bits>        
+        __global__ void SoftmaxKernel( Tensor<gpu,2> dst, Tensor<gpu,2> src ){
+            const unsigned x_size = 1 << x_bits;  
+            const int y = blockIdx.x;
+            __shared__ real_t s_rec[ x_size ];
+            
+            // step 1: get max
+            // every thread first collects the maximum of the columns it owns
+            if( threadIdx.x < dst.shape[ 0 ] ){
+                s_rec[ threadIdx.x ] = src[ y ][ threadIdx.x ] ; 
+            }
+            for( unsigned x = x_size; x < dst.shape[0]; x += x_size ){
+                if( x + threadIdx.x < dst.shape[0] ){
+                    real_t a = src[ y ][ x + threadIdx.x ];
+                    s_rec[ threadIdx.x ] = max( a, s_rec[ threadIdx.x] );
+                }
+            }
+            __syncthreads();
+            // threads without a column copy slot 0, so that the full-width
+            // reduction below only sees valid values
+            if( threadIdx.x >= dst.shape[0] ){
+                s_rec[ threadIdx.x ] = s_rec[0];
+            }
+            __syncthreads();
+            Reduce1D<red::maximum,x_bits>( s_rec );
+            __syncthreads();
+            real_t smax = s_rec[0];            
+            // all threads must have read smax before s_rec is reused for the sum
+            __syncthreads();
+            s_rec[ threadIdx.x ] = 0.0f;
+            __syncthreads();
+
+            // calculate normalizer, with writeback
+            for( unsigned x = 0; x < dst.shape[0]; x += x_size ){
+                if( x + threadIdx.x < dst.shape[0] ){
+                    real_t p = expf( src[ y ][ x + threadIdx.x ] - smax );
+                    s_rec[ threadIdx.x ] += p;
+                    // write back first, will fetch later
+                    dst[ y ][ x + threadIdx.x ] = p;
+                }
+            }
+            // calculate normalizer
+            __syncthreads();
+            Reduce1D<red::sum,x_bits>( s_rec );
+            __syncthreads();
+            real_t ssum = s_rec[0];
+
+            // every thread normalizes the entries it wrote itself above
+            for( unsigned x = 0; x < dst.shape[0]; x += x_size ){
+                if( x + threadIdx.x < dst.shape[0] ){
+                    dst[ y ][ x + threadIdx.x ] /= ssum;
+                }
+            }
+        }
+    
+        /*!
+         * \brief host wrapper that launches SoftmaxKernel, one block per row of dst
+         * \param dst destination, must have the same shape as src
+         * \param src source
+         */
+        inline void Softmax( Tensor<gpu,2> &dst, const Tensor<gpu,2> &src ){
+            utils::Assert( dst.shape == src.shape, "Softmax: shape mismatch" );
+            dim3 threads( kBaseThreadNum );
+            dim3 blocks ( dst.shape[1] );
+            CheckLaunchParam( blocks, threads, "Softmax" );
+            SoftmaxKernel<kBaseThreadBits><<<blocks,threads>>>( dst, src );
+        }
+    }; // namespace cuda
+}; // namespace mshadow
+#endif // TENSOR_GPU_INL_H

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/b2dc51d2/include/mshadow/cxxnet_op.h
----------------------------------------------------------------------
diff --git a/include/mshadow/cxxnet_op.h b/include/mshadow/cxxnet_op.h
new file mode 100644
index 0000000..930caf2
--- /dev/null
+++ b/include/mshadow/cxxnet_op.h
@@ -0,0 +1,116 @@
+#ifndef CXXNET_OP_H
+#define CXXNET_OP_H
+#pragma once
+/*!
+ * \file cxxnet_op.h
+ * \brief extra mshadow operation for cxxnet
+ * \author Bing Xu
+ */
+#include "mshadow/tensor.h"
+
+namespace mshadow {
+    /*! \brief operations for algorithm */
+    namespace op {
+        /*! \brief logistic sigmoid: 1 / (1 + exp(-a)) */
+        struct sigmoid {
+            MSHADOW_XINLINE static real_t Map(real_t a) {
+                return 1.0f / (1.0f + expf(-a));
+            }
+        };
+        /*!
+         * \brief computes a * (1 - a)
+         *  NOTE(review): this equals the sigmoid derivative when a is the sigmoid
+         *  output rather than its input; confirm against the callers
+         */
+        struct sigmoid_grad {
+            MSHADOW_XINLINE static real_t Map(real_t a) {
+                return a * ( 1.0f - a );
+            }
+        };
+
+        /*! \brief Rectified Linear Operation: max(a, 0) */
+        struct relu {
+            MSHADOW_XINLINE static real_t Map(real_t a) {
+                // makes std::max visible for host compilation
+                using namespace std;
+                return max( a, 0.0f );
+            }
+        };
+        /*! \brief gradient of the rectified linear operation: 1 for a > 0, otherwise 0 */
+        struct relu_grad {
+            MSHADOW_XINLINE static real_t Map(real_t a) {
+                if (a > 0.0f) {
+                    return 1.0f;
+                }
+                return 0.0f;
+            }
+        };
+
+        /*! \brief hyperbolic tangent, computed with the single precision tanhf */
+        struct tanh {
+            MSHADOW_XINLINE static real_t Map(real_t a) {
+                return tanhf( a );
+            }
+        };
+        /*!
+         * \brief computes 1 - a * a
+         *  NOTE(review): this equals the tanh derivative when a is the tanh
+         *  output rather than its input; confirm against the callers
+         */
+        struct tanh_grad {
+            MSHADOW_XINLINE static real_t Map(real_t a) {
+                return 1.0f - a * a;
+            }
+        };
+        /*!
+         * \brief softplus: log(1 + exp(a))
+         *
+         *  For positive a the identity log(1 + e^a) = a + log(1 + e^-a) is used, so
+         *  the argument of expf is never positive and cannot overflow to infinity
+         *  (the direct form returns inf once a exceeds roughly 88).
+         */
+        struct softplus {
+            MSHADOW_XINLINE static real_t Map(real_t a) {
+                return a > 0.0f ? a + logf(1.0f + expf(-a)) : logf(1.0f + expf(a));
+            }
+        };
+        /*!
+         * \brief computes 1 / (1 + exp(-a)), the derivative of log(1 + exp(a)) with respect to a
+         *  NOTE(review): unlike sigmoid_grad and tanh_grad this formula is in terms of
+         *  the input of softplus, not its output; confirm what the callers pass
+         */
+        struct softplus_grad {
+            MSHADOW_XINLINE static real_t Map(real_t a) {
+                return 1.0f / (1.0f + expf(-a));
+            }
+        };
+        /*!
+         * \brief log(1 + exp(a)), evaluated so that the argument of expf is never positive:
+         *        a + log(1 + exp(-a)) for a > 0, log(1 + exp(a)) otherwise
+         */
+        struct bnll {
+            MSHADOW_XINLINE static real_t Map(real_t a) {
+                return a > 0.0f ? a + logf(1.0f + expf(-a)) : logf(1.0f + expf(a));
+            }
+        };
+        /*! \brief computes 1 / (1 + exp(-min(a, 50))) */
+        struct bnll_grad {
+            MSHADOW_XINLINE static real_t Map(real_t a) {
+                // clamp the argument from above at the threshold
+                real_t expval = a > 50.0f ? 50.0f : a; // kBNLL_THRESHOLD = 50.0f
+                expval = expf(-expval);
+                return 1.0f / (1.0f + expval);
+            }
+        };
+
+        /*! \brief element square: a * a */
+        struct square {
+            MSHADOW_XINLINE static real_t Map(real_t a) {
+                return a * a;
+            }
+        };
+        /*!
+         * \brief scaled tanh, hard code the scale factor: 1.7159047 * tanh(0.66666667 * a)
+         *
+         *  The constants carry the f suffix so that the whole expression stays in
+         *  single precision, like the other operations in this file.
+         */
+        struct stanh {
+            MSHADOW_XINLINE static real_t Map(real_t a) {
+                return 1.7159047f * tanhf( 0.66666667f * a );
+            }
+        };
+        /*!
+         * \brief back prop for scaled tanh
+         *
+         *  With A = 1.7159047 and B = 0.66666667 this computes A*B - (B/A) * a * a,
+         *  which is the derivative of A * tanh(B * x) when a is that function's output.
+         *  NOTE(review): confirm that callers pass the stanh output.
+         *  The constants carry the f suffix so that the expression stays in single precision.
+         */
+        struct stanh_grad {
+            MSHADOW_XINLINE static real_t Map(real_t a) {
+                return 0.66666667f * 1.7159047f - 0.66666667f / 1.7159047f * a * a;
+            }
+        };
+
+    }; //namespace op
+
+}; //namespace mshadow
+
+namespace mshadow {
+    namespace op {
+        /*! \brief used to generate a Bernoulli mask: 1 where a < b, otherwise 0 */
+        struct threshold {
+            MSHADOW_XINLINE static real_t Map(real_t a, real_t b) {
+                if (a < b) {
+                    return 1.0f;
+                }
+                return 0.0f;
+            }
+        };
+
+        /*! \brief element-wise power: a raised to the power b, computed with powf */
+        struct power {
+            MSHADOW_XINLINE static real_t Map(real_t a, real_t b) {
+                return powf( a, b );
+            }
+        };
+        /*!
+         * \brief square root of the sum of both operands: sqrt(a + b)
+         *
+         *  Uses the single precision sqrtf, in line with the expf / powf / tanhf calls
+         *  of the other operations, instead of the double precision sqrt.
+         */
+        struct sqrtop {
+            MSHADOW_XINLINE static real_t Map(real_t a, real_t b) {
+                return sqrtf( a + b );
+            }
+        };
+    }; // namespace op
+}; // namespace mshadow
+
+#endif // CXXNET_OP_H

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/b2dc51d2/include/mshadow/tensor.h
----------------------------------------------------------------------
diff --git a/include/mshadow/tensor.h b/include/mshadow/tensor.h
new file mode 100644
index 0000000..42d13d3
--- /dev/null
+++ b/include/mshadow/tensor.h
@@ -0,0 +1,472 @@
+#ifndef MSHADOW_TENSOR_H
+#define MSHADOW_TENSOR_H
+/*!
+ * \file tensor.h
+ * \brief header file of tensor data structure and functions
+ *        convention: this lib requires explicit memory allocation and de-allocation
+ *                   all the data structure Tensor<cpu,1>, Tensor<gpu,1> are like handles(pointers),
+ *                   no memory allocation is happening during calculation
+ * \author Bing Xu, Tianqi Chen
+ */
+#include "tensor_base.h"
+#include "tensor_expr.h"
+
+namespace mshadow {
+    /*!
+     * \brief shape of a tensor
+     *       IMPORTANT NOTE: this shape is different from numpy.shape
+     *       shape[0] gives the lowest dimension, shape[dimension-1] gives the highest dimension
+     *       shape[k] corresponds to k-th dimension of tensor
+     * \tparam dimension dimension of tensor
+     */
+    template<int dimension>
+    struct Shape {
+    public:
+        /*! \brief maximum dimension of tensor */
+        const static int kMaxShape = dimension;
+        /*! \brief maximum dimension minus 1 */
+        const static int kSubShape = dimension - 1;
+    public:
+        /*! \brief default constructor, does nothing: shape_ and stride_ are left uninitialized */
+        MSHADOW_XINLINE Shape(void) {}
+        /*! \brief copy constructor, copies every dimension size and the stride */
+        MSHADOW_XINLINE Shape( const Shape<dimension> &s ){
+            #pragma unroll
+            for( int i = 0; i < kMaxShape; ++i ){
+                this->shape_[i] = s[i];
+            }
+            this->stride_ = s.stride_;
+        }
+        /*!
+         * \brief get a mutable reference to the size of one dimension
+         * \param idx dimension index
+         * \return the corresponding dimension size
+         */
+        MSHADOW_XINLINE index_t& operator[](index_t idx) {
+            return shape_[ idx ];
+        }
+        /*!
+         * \brief get a read-only reference to the size of one dimension
+         * \param idx dimension index
+         * \return the corresponding dimension size
+         */
+        MSHADOW_XINLINE const index_t& operator[](index_t idx) const {
+            return shape_[ idx ];
+        }
+        /*! \return whether two shapes are equal; only shape_ is compared, stride_ is ignored */
+        MSHADOW_XINLINE bool operator==(const Shape<kMaxShape> &s) const {
+            #pragma unroll
+            for ( int i = 0; i < kMaxShape; ++i ) {
+                if (s.shape_[i] != this->shape_[i]) return false;
+            }
+            return true;
+        }
+        /*!
+         * \brief flatten all dimensions above dimension 0 into the second dimension; shape_[0] and stride_ are kept
+         * \return the flat 2d shape
+         */
+        MSHADOW_XINLINE Shape<2> FlatTo2D(void) const {
+            Shape<2> s;
+            s.stride_ = this->stride_;
+            s.shape_[ 0 ] = this->shape_[ 0 ];
+            index_t ymax = 1;
+
+            #pragma unroll
+            for (int i = 1; i < kMaxShape; ++i) {
+                ymax *= this->shape_[ i ];
+            }
+            s.shape_[1] = ymax;
+            return s;
+        }
+        /*! \return number of valid elements: the product of all dimension sizes, stride_ is not used */
+        MSHADOW_XINLINE size_t Size(void) const{
+            size_t memsz = this->shape_[ 0 ];
+            #pragma unroll
+            for (int i = 1; i < kMaxShape; ++i) {
+                memsz *= this->shape_[ i ];
+            }
+            return memsz;
+        }
+        /*! \return memory size in elements: stride_ (the aligned x dimension) times the sizes of dimensions 1 and above */
+        MSHADOW_XINLINE size_t MSize(void) const {
+            size_t memsz = this->stride_;
+            #pragma unroll
+            for (int i = 1; i < kMaxShape; ++i) {
+                memsz *= this->shape_[ i ];
+            }
+            return memsz;
+        }
+        /*!
+         * \return product shape in [dimstart,dimend)
+         * \param dimstart start dimension
+         * \param dimend   end dimension
+         */
+        MSHADOW_XINLINE index_t ProdShape( int dimstart, int dimend ) const{
+            index_t num = 1;
+            #pragma unroll
+            for (int i = dimstart; i < dimend; ++i) {
+                num *= this->shape_[ i ];
+            }
+            return num;
+        }
+        /*!
+         * \brief get subshape: the lowest kSubShape dimensions plus stride_, dropping the highest dimension
+         * \return subshape
+         */
+        MSHADOW_XINLINE Shape<kSubShape> SubShape(void) const {
+            Shape<kSubShape> s;
+            s.stride_ = this->stride_;
+            // for cuda
+            #pragma unroll
+            for (int i = 0; i < kSubShape; ++i) {
+                s.shape_[ i ] = this->shape_[ i ];
+            }
+            return s;
+        }
+
+    public:
+        /*! \brief storing the dimension information */
+        index_t shape_[ kMaxShape ];
+        /*!
+         * \brief storing the stride information in x dimension
+         *    this is used to deal with pitch allocation in gpu or sse(align x dimension to 64bit) for efficiency
+         */
+        index_t stride_;
+    };
+    // useful construction functions to generate shape
+    /*!
+     * \brief construct a one dimension shape, stride will equal s0
+     * \param s0 size of dimension 0
+     * \return the constructed shape
+     */
+    MSHADOW_XINLINE Shape<1> Shape1( index_t s0 ){
+        Shape<1> s; s[0] = s0; s.stride_ = s0;
+        return s;
+    }
+    /*!
+     * \brief construct a two dimension shape, stride will equal s0; arguments run from the highest dimension (s1) down to s0
+     * \param s1 size of dimension 1
+     * \param s0 size of dimension 0
+     * \return the constructed shape
+     */
+    MSHADOW_XINLINE Shape<2> Shape2( index_t s1, index_t s0 ){
+        Shape<2> s; s[0] = s0; s[1] = s1; s.stride_ = s0;
+        return s;
+    }
+    /*!
+     * \brief construct a three dimension shape, stride will equal s0; arguments run from the highest dimension (s2) down to s0
+     * \param s2 size of dimension 2
+     * \param s1 size of dimension 1
+     * \param s0 size of dimension 0
+     * \return the constructed shape
+     */
+    MSHADOW_XINLINE Shape<3> Shape3( index_t s2, index_t s1, index_t s0 ){
+        Shape<3> s;
+        s[0] = s0; s[1] = s1; s[2] = s2; s.stride_ = s0;
+        return s;
+    }
+    /*!
+     * \brief construct a four dimension shape, stride will equal s0; arguments run from the highest dimension (s3) down to s0
+     * \param s3 size of dimension 3
+     * \param s2 size of dimension 2
+     * \param s1 size of dimension 1
+     * \param s0 size of dimension 0
+     * \return the constructed shape
+     */
+    MSHADOW_XINLINE Shape<4> Shape4( index_t s3, index_t s2, index_t s1, index_t s0 ){
+        Shape<4> s;
+        s[0] = s0; s[1] = s1; s[2] = s2; s[3] = s3; s.stride_ = s0;
+        return s;
+    }
+}; // namespace mshadow
+
+namespace mshadow {
+    /*! \brief device name CPU */
+    struct cpu {
+        /*! \brief whether this device is CPU or not (true for the CPU) */
+        const static bool kDevCPU = true;
+        /*! \brief device flag number (bit 0), identifies this device */
+        const static int kDevMask = 1<<0;
+    };
+    /*! \brief device name GPU */
+    struct gpu {
+        /*! \brief whether this device is CPU or not (false for the GPU) */
+        const static bool kDevCPU = false;
+        /*! \brief device flag number (bit 1), identifies this device */
+        const static int kDevMask = 1<<1;
+    };
+
+    // more compact template
+    /*!
+     * \brief general tensor
+     * \tparam Device which device the tensor is on
+     * \tparam dimension dimension of the tensor
+     */
+    template<typename Device, int dimension>
+    struct Tensor: public expr::ContainerExp< Tensor<Device,dimension> >{
+    public:
+        /*! \brief whether current type lies in cpu */
+        const static bool kDevCPU = Device::kDevCPU;
+        /*! \brief dimension of subtype */
+        const static int  kSubdim = dimension - 1;
+
+    public:
+        /*! \brief pointer to the data */
+        real_t *dptr;
+        /*! \brief shape of the tensor */
+        Shape<dimension> shape;
+    public:
+        /*! \brief default constructor, leaves dptr and shape uninitialized */
+        MSHADOW_XINLINE Tensor(void) {}
+        /*! \brief constructor from shape, dptr is left uninitialized */
+        MSHADOW_XINLINE Tensor(const Shape<dimension> &shape): shape(shape) {}
+        /*! \brief constructor from data pointer and shape, the pointer is stored as is (nothing is allocated or copied) */
+        MSHADOW_XINLINE Tensor(real_t *dptr, const Shape<dimension> &shape): dptr((real_t*)dptr), shape(shape) {}
+        /*!
+         * \brief flatten the tensor to 2 dimension, collapse the higher dimensions together
+         * \return tensor after flatten, pointing at the same dptr
+         */
+        MSHADOW_XINLINE Tensor<Device, 2> FlatTo2D(void) const {
+            return Tensor<Device, 2>(reinterpret_cast<real_t*> \
+                                     (dptr), shape.FlatTo2D());
+        }
+        /*!
+         * \brief get the idx-th sub-tensor of dimension - 1, indexing along the highest dimension
+         * \param idx index
+         * \return the result tensor, whose dptr is offset by idx times the memory size of the sub-shape
+         */
+        MSHADOW_XINLINE Tensor<Device, kSubdim> operator[](index_t idx) const {
+            Shape<kSubdim> s = shape.SubShape();
+            return Tensor<Device, kSubdim>(reinterpret_cast<real_t*> \
+                                           (dptr) + s.MSize() * idx, s);
+        }
+        /*!
+         * \brief slice the tensor in highest dimension [begin,end)
+         * \param begin begin position of slice
+         * \param end end position of slice
+         * \return tensor after slice, its dptr is an offset into this tensor's dptr
+         */
+        MSHADOW_XINLINE Tensor<Device, dimension> Slice(index_t begin, index_t end) const {
+            Shape<dimension> s = this->shape;
+            s[ dimension - 1 ] = end - begin;
+            return Tensor<Device, dimension>(reinterpret_cast<real_t*>\
+                                             (dptr) + s.SubShape().MSize() * begin, s);
+        }
+    public:
+        /*!\brief functions to fit expression template: forwards the scalar s to __assign */
+        inline Tensor<Device,dimension>& operator=( real_t s ){
+            return this->__assign( s );
+        }
+        /*!\brief functions to fit expression template: forwards a kMapper expression to __assign */
+        template<typename E>
+        inline Tensor<Device,dimension>& operator=( const expr::Exp<E,expr::type::kMapper> &exp ){
+            return this->__assign( exp );
+        }
+        /*!\brief functions to fit expression template: forwards a kComplex expression to __assign */
+        template<typename E>
+        inline Tensor<Device,dimension>& operator=( const expr::Exp<E,expr::type::kComplex> &exp ){
+            return this->__assign( exp );
+        }
+    };
+
+    /*
+     *  specialization of Tensor for one dimension; it is needed because operator[] is implemented differently
+     */
+    template<typename Device>
+    struct Tensor<Device,1>: public expr::ContainerExp< Tensor<Device,1> >{
+    public:
+        real_t *dptr; // pointer to the data
+        Shape<1> shape; // shape of the tensor
+    public:
+        MSHADOW_XINLINE Tensor(void) {}
+        MSHADOW_XINLINE Tensor(const Shape<1> &shape): shape(shape) {} // dptr is left uninitialized
+        MSHADOW_XINLINE Tensor(real_t *dptr, Shape<1> shape) :dptr(dptr), shape(shape) {}
+
+        MSHADOW_XINLINE Tensor<Device, 2> FlatTo2D(void) const {
+            return Tensor<Device, 2>(reinterpret_cast<real_t*> \
+                                     (dptr), shape.FlatTo2D());
+        }
+        MSHADOW_XINLINE Tensor<Device, 1> Slice(index_t begin, index_t end) const {
+            Shape<1> s; // shape of the slice [begin,end)
+            s[0] = s.stride_ = end  - begin; // length and stride are both end - begin
+            return Tensor<Device, 1>(reinterpret_cast<real_t*> \
+                                     (dptr) + begin, s);
+        }
+        MSHADOW_XINLINE real_t &operator[](index_t idx) { return dptr[ idx ]; } // element access, idx is not range checked
+        MSHADOW_XINLINE const real_t &operator[](index_t idx)const { return dptr[ idx ]; } // read-only element access
+    public:
+        // functions to fit expression template; NOTE(review): the scalar overload takes double, the general Tensor template takes real_t
+        inline Tensor<Device,1>& operator=( double s ){
+            return this->__assign( s );
+        }
+        template<typename E>
+        inline Tensor<Device,1>& operator=( const expr::Exp<E,expr::type::kMapper> &exp ){
+            return this->__assign( exp );
+        }
+        template<typename E>
+        inline Tensor<Device,1>& operator=( const expr::Exp<E,expr::type::kComplex> &exp ){
+            return this->__assign( exp );
+        }
+    };
+}; // namespace mshadow
+
+// add unroll loops for the shape
+namespace mshadow {
+    // function declarations
+    /*!
+     * \brief initialize tensor engine, used to call initialization functions of dependent libs
+     *        this function should be called before all GPU tensor operations,
+     *        for using tensors in CPU, this call is actually not needed
+     * \param device_id GPU device id to be chosen
+     */
+    inline void InitTensorEngine( int device_id=0 );
+    /*!
+     * \brief Shutdown tensor engine,
+     *        this function should be called after all GPU tensor operations,
+     *        for using tensors in CPU, this call is actually not needed
+     */
+    inline void ShutdownTensorEngine( void );
+
+    /*!
+     * \brief CPU/GPU: allocate space for CTensor, according to the shape in the obj
+     *        this function is responsible to set the stride_ in each obj.shape
+     * \tparam dim specify the dim of tensor
+     * \param obj the tensor object, with shape specified
+     * \param pad whether padding dimension 0, to make last dimension aligned,
+     *            padding may help improve efficiency of matrix multiplications
+     *            if true, will allocate space with stride_ that may not equal shape[0]
+     *            if false, will allocate continuous space
+     */
+    template<int dim>
+    inline void AllocSpace(Tensor<cpu,dim> &obj, bool pad = MSHADOW_ALLOC_PAD);
+    /*! \brief refer to comment of cpu ver \sa AllocSpace */
+    template<int dim>
+    inline void AllocSpace(Tensor<gpu,dim> &obj, bool pad = MSHADOW_ALLOC_PAD);
+
+    /*!
+     * \brief CPU/GPU: free the space of tensor, will set obj.dptr to NULL
+     * \tparam dim specify the dim of tensor
+     * \param obj the tensor object
+     */
+    template<int dim>
+    inline void FreeSpace(Tensor<cpu,dim> &obj);
+    /*! \brief refer to comment of cpu ver \sa FreeSpace */
+    template<int dim>
+    inline void FreeSpace(Tensor<gpu,dim> &obj);
+
+    /*!
+     * \brief CPU/GPU: short cut to allocate and initialize a Tensor
+     * \tparam Device device of tensor
+     * \tparam dim dimension of tensor
+     * \param shape: shape of tensor
+     * \param initv: initialization value
+     * \param pad : padding option
+     * \sa AllocSpace
+     */
+    template<typename Device, int dim>
+    inline Tensor<Device,dim> NewTensor(const Shape<dim> &shape, real_t initv, bool pad = MSHADOW_ALLOC_PAD);
+
+    /*!
+     * \brief copy data from one tensor to another, with same shape
+     * \tparam dim specify the dim of tensor
+     * \param dst target tensor
+     * \param src source tensor
+     */
+    template<int dim>
+    inline void Copy(Tensor<cpu,dim> dst, const Tensor<cpu,dim> &src );
+    /*! \brief refer to comment of cpu ver \sa Copy */
+    template<int dim>
+    inline void Copy(Tensor<cpu,dim> dst, const Tensor<gpu,dim> &src );
+    /*! \brief refer to comment of cpu ver \sa Copy */
+    template<int dim>
+    inline void Copy(Tensor<gpu,dim> dst, const Tensor<cpu,dim> &src );
+    /*! \brief refer to comment of cpu ver \sa Copy */
+    template<int dim>
+    inline void Copy(Tensor<gpu,dim> dst, const Tensor<gpu,dim> &src );
+
+
+    /*!
+     * \brief CPU/GPU: normalize softmax: dst[i][j] = exp( energy[i][j] ) /( sum_j exp( energy[i][j] ) )
+     * \param dst destination
+     * \param energy input energy
+     */
+    inline void Softmax( Tensor<cpu,2> dst, const Tensor<cpu,2> &energy );
+    /*! \brief refer to comment of cpu ver \sa Softmax */
+    inline void Softmax( Tensor<gpu,2> dst, const Tensor<gpu,2> &energy );
+
+}; // namespace mshadow
+
+
+namespace mshadow{
+    // function declarations to support expression, no need to understand them
+    // these functions do not need to be directly used
+
+    /*!
+     * \brief CPU/GPU: map an expression to a tensor, this function calls MapPlan
+     * \tparam Saver specify storage method
+     * \tparam dim dim of the tensor, during usage, there is no need to specify this parameter
+     * \tparam E specifies the expression type, not need to specify this parameter during usage
+     * \tparam etype expression type
+     * \param dst destination
+     * \param exp expression
+     * \sa namespace mshadow::sv, mshadow::op, mshadow::expr
+     */
+    template<typename Saver, int dim, typename E, int etype>
+    inline void MapExp(Tensor<cpu,dim> dst, const expr::Exp<E,etype> &exp );
+    /*! \brief refer to comment of cpu ver \sa MapExp */
+    template<typename Saver, int dim, typename E, int etype>
+    inline void MapExp(Tensor<gpu,dim> dst, const expr::Exp<E,etype> &exp );
+
+    /*!
+     * \brief CPU/GPU: map an expression, do reduction to 1D Tensor in lowest dimension (dimension 0)
+     * \tparam Saver specify storage method
+     * \tparam Reducer specify a reducer method
+     * \tparam E specifies the expression type, not need to specify this parameter during usage
+     * \tparam etype expression type
+     * \param dst destination
+     * \param exp expression
+     * \param scale scale the result before save
+     * \sa namespace mshadow::sv, mshadow::op, mshadow::red, mshadow::expr
+     */
+    template<typename Saver, typename Reducer, typename E, int etype>
+    inline void MapReduceKeepLowest( Tensor<cpu,1> dst, const expr::Exp<E,etype> &exp, real_t scale = 1.0f );
+    /*! \brief refer to comment of cpu ver \sa MapReduceKeepLowest */
+    template<typename Saver, typename Reducer, typename E, int etype>
+    inline void MapReduceKeepLowest( Tensor<gpu,1> dst, const expr::Exp<E,etype> &exp, real_t scale = 1.0f );
+
+
+    /*!
+     * \brief CPU/GPU: map an expression, do reduction to 1D Tensor in a higher dimension selected by dimkeep (e.g. the third dimension, dimension 2)
+     * \tparam Saver specify storage method
+     * \tparam Reducer specify a reducer method
+     * \tparam E specifies the expression type, not need to specify this parameter during usage
+     * \tparam dimkeep the target dimension to be kept, should be larger than 0, for 0, use MapReduceKeepLowest
+     * \tparam etype expression type
+     * \param dst destination
+     * \param exp expression
+     * \param scale scale the result before save
+     * \sa namespace mshadow::sv, mshadow::op, mshadow::red, mshadow::expr
+     */
+    template<typename Saver, typename Reducer, int dimkeep, typename E, int etype>
+    inline void MapReduceKeepHighDim( Tensor<cpu,1> dst, const expr::Exp<E,etype> &exp, real_t scale = 1.0f );
+    /*! \brief refer to comment of cpu ver \sa MapReduceKeepHighDim */
+    template<typename Saver, typename Reducer, int dimkeep, typename E, int etype>
+    inline void MapReduceKeepHighDim( Tensor<gpu,1> dst, const expr::Exp<E,etype> &exp, real_t scale = 1.0f );
+
+};// namespace mshadow
+
+// execution implementation of expression evaluations
+#include "tensor_expr_engine-inl.hpp"
+// cpu implementation of functions
+#include "tensor_cpu-inl.hpp"
+// gpu implementation of functions
+#include "tensor_gpu-inl.hpp"
+// extension of expressions
+#include "tensor_expr_ext.h"
+// io 
+#include "tensor_io.h"
+// container
+#include "tensor_container.h"
+// random number generator
+#include "tensor_random.h"
+#endif // MSHADOW_TENSOR_H

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/b2dc51d2/include/mshadow/tensor_base.h
----------------------------------------------------------------------
diff --git a/include/mshadow/tensor_base.h b/include/mshadow/tensor_base.h
new file mode 100644
index 0000000..b251cba
--- /dev/null
+++ b/include/mshadow/tensor_base.h
@@ -0,0 +1,298 @@
+#ifndef MSHADOW_TENSOR_BASE_H
+#define MSHADOW_TENSOR_BASE_H
+/*!
+ * \file tensor_base.h
+ * \brief definitions of base types, macros functions
+ *
+ * \author Bing Xu, Tianqi Chen
+ */
+#include <cmath>
+#include <cstdio>
+#include <cfloat>
+#include <climits>
+#include <algorithm>
+// macro definitions
+
+/*!\brief if this macro is defined to be 1, mshadow should compile without any other libs */
+#ifndef MSHADOW_STAND_ALONE
+    #define MSHADOW_STAND_ALONE 0
+#endif
+
+/*! \brief whether do padding during allocation */
+#ifndef MSHADOW_ALLOC_PAD
+    #define MSHADOW_ALLOC_PAD true
+#endif
+
+/*! 
+ * \brief x dimension of data must be bigger than pad_size * ratio to be allocated padded memory, otherwise tight allocation is used
+ *        for example, if pad_ratio=2, GPU memory alignment size is 32, then we will only allocate padded memory if x dimension > 64
+ *        set it to 0 then we will always allocate padded memory
+ */
+#ifndef MSHADOW_MIN_PAD_RATIO
+    #define MSHADOW_MIN_PAD_RATIO 2
+#endif
+
+#if MSHADOW_STAND_ALONE
+   #define MSHADOW_USE_CBLAS 0
+   #define MSHADOW_USE_MKL   0
+   #define MSHADOW_USE_CUDA  0
+#endif
+
+/*! \brief use CBLAS for BLAS */
+#ifndef MSHADOW_USE_CBLAS
+   #define MSHADOW_USE_CBLAS 0
+#endif
+/*! \brief use MKL for BLAS */
+#ifndef MSHADOW_USE_MKL
+   #define MSHADOW_USE_MKL   1
+#endif
+/*! \brief use CUDA support, must ensure that the cuda include path is correct, or directly compile using nvcc */
+#ifndef MSHADOW_USE_CUDA
+  #define MSHADOW_USE_CUDA   1
+#endif
+/*! \brief use single precision float */
+#ifndef MSHADOW_SINGLE_PRECISION
+  #define MSHADOW_SINGLE_PRECISION 1
+#endif
+/*! \brief whether use SSE */
+#ifndef MSHADOW_USE_SSE
+  #define MSHADOW_USE_SSE 1
+#endif
+/*! \brief whether use NVML to get dynamic info */
+#ifndef MSHADOW_USE_NVML
+  #define MSHADOW_USE_NVML 0
+#endif
+// SSE conflicts with CUDA compilation (__CUDACC__), so it is disabled there
+#ifdef __CUDACC__
+  #undef MSHADOW_USE_SSE
+  #define MSHADOW_USE_SSE 0
+#endif
+
+#if MSHADOW_USE_CBLAS
+extern "C"{
+    #include <cblas.h>
+}
+#elif MSHADOW_USE_MKL
+  #include <mkl.h>
+  #include <mkl_cblas.h>
+  #include <mkl_vsl.h>
+  #include <mkl_vsl_functions.h>
+#endif
+
+#if MSHADOW_USE_CUDA
+  #include <cublas.h>
+  #include <curand.h>
+#endif
+
+#if MSHADOW_USE_NVML
+  #include <nvml.h>
+#endif
+// --------------------------------
+// MSHADOW_XINLINE is used for inlining template code for both CUDA and CPU code.
+#ifdef MSHADOW_XINLINE
+  #error "MSHADOW_XINLINE must not be defined"
+#endif
+#ifdef __CUDACC__
+  #define MSHADOW_XINLINE inline __attribute__((always_inline)) __device__ __host__
+#else
+  #define MSHADOW_XINLINE inline __attribute__((always_inline))
+#endif
+/*! \brief cpu force inline */
+#define MSHADOW_CINLINE inline __attribute__((always_inline))
+
+#if defined(__GXX_EXPERIMENTAL_CXX0X) || defined(__GXX_EXPERIMENTAL_CXX0X__) || __cplusplus >= 201103L
+  #define MSHADOW_CONSTEXPR constexpr
+#else
+  #define MSHADOW_CONSTEXPR const
+#endif
+
+/*! \brief namespace for mshadow */
+namespace mshadow {
+    /*! \brief buffer size for each random number generator */
+    const unsigned kRandBufferSize = 1000000;
+    /*! \brief pi, as a single precision constant */
+    const float kPi = 3.1415926f;
+
+#if MSHADOW_SINGLE_PRECISION
+    /*! \brief type that will be used for content: float when MSHADOW_SINGLE_PRECISION is non-zero, double otherwise */
+    typedef float real_t;
+#else
+    typedef double real_t;
+#endif
+    /*! \brief type that will be used for index */
+    typedef unsigned index_t;
+}; // namespace mshadow
+
+namespace mshadow {
+    /*! \brief namespace for operators */
+    namespace op {
+        // binary operator
+        /*! \brief mul operator */
+        struct mul{
+            /*! \brief map a, b to result using defined operation */
+            MSHADOW_XINLINE static real_t Map(real_t a, real_t b) {
+                return a * b;
+            }
+        };
+        /*! \brief plus operator */
+        struct plus {
+            /*! \brief map a, b to result using defined operation */
+            MSHADOW_XINLINE static real_t Map(real_t a, real_t b) {
+                return a + b;
+            }
+        };
+        /*! \brief minus operator */
+        struct minus {
+            /*! \brief map a, b to result using defined operation */
+            MSHADOW_XINLINE static real_t Map(real_t a, real_t b) {
+                return a - b;
+            }
+        };
+        /*! \brief divide operator */
+        struct div {
+            /*! \brief map a, b to result using defined operation */
+            MSHADOW_XINLINE static real_t Map(real_t a, real_t b) {
+                return a / b;
+            }
+        };
+        /*! \brief get rhs: the result is b, a is ignored */
+        struct right {
+            /*! \brief map a, b to result using defined operation */
+            MSHADOW_XINLINE static real_t Map(real_t a, real_t b) {
+                return b;
+            }
+        };
+    }; // namespace op
+
+    /*! \brief namespace for savers */
+    namespace sv {
+        /*! \brief save to saver: = */
+        struct saveto {
+            /*! \brief save b to a using save method */
+            MSHADOW_XINLINE static void Save(real_t& a, real_t b) {
+                a  = b;
+            }
+            /*! \brief helper constant to use BLAS, alpha */
+            MSHADOW_CONSTEXPR static real_t kAlphaBLAS = 1.0f;
+            /*! \brief helper constant to use BLAS, beta */
+            MSHADOW_CONSTEXPR static real_t kBetaBLAS  = 0.0f;
+            /*! \brief corresponding binary operator type */
+            typedef op::right OPType;
+        };
+        /*! \brief save to saver: += */
+        struct plusto {
+            /*! \brief save b to a using save method */
+            MSHADOW_XINLINE static void Save(real_t& a, real_t b) {
+                a += b;
+            }
+            /*! \brief helper constant to use BLAS, alpha */
+            MSHADOW_CONSTEXPR static real_t kAlphaBLAS = 1.0f;
+            /*! \brief helper constant to use BLAS, beta */
+            MSHADOW_CONSTEXPR static real_t kBetaBLAS  = 1.0f;
+            /*! \brief corresponding binary operator type */
+            typedef op::plus OPType;
+        };
+        /*! \brief minus to saver: -= */
+        struct minusto {
+            /*! \brief save b to a using save method */
+            MSHADOW_XINLINE static void Save(real_t& a, real_t b) {
+                a -= b;
+            }
+            /*! \brief helper constant to use BLAS, alpha */
+            MSHADOW_CONSTEXPR static real_t kAlphaBLAS = -1.0f;
+            /*! \brief helper constant to use BLAS, beta */
+            MSHADOW_CONSTEXPR static real_t kBetaBLAS  = 1.0f;
+            /*! \brief corresponding binary operator type */
+            typedef op::minus OPType;
+        };
+        /*! \brief multiply to saver: *= (this saver defines no BLAS alpha/beta constants) */
+        struct multo {
+            /*! \brief save b to a using save method */
+            MSHADOW_XINLINE static void Save(real_t& a, real_t b) {
+                a *= b;
+            }
+            /*! \brief corresponding binary operator type */
+            typedef op::mul OPType;
+        };
+        /*! \brief divide to saver: /= (this saver defines no BLAS alpha/beta constants) */
+        struct divto {
+            /*! \brief save b to a using save method */
+            MSHADOW_XINLINE static void Save(real_t& a, real_t b) {
+                a /= b;
+            }
+            /*! \brief corresponding binary operator type */
+            typedef op::div OPType;
+        };
+    }; // namespace sv
+
+
+    namespace op {
+        // unary operator/ function: example
+        // these operators can be defined by the user, in the same style as the binary operators
+        // to use, simply write F<op::identity>( src )
+        /*! \brief identity function that maps a real number to itself */
+        struct identity{
+            /*! \brief map a to result using defined operation */
+            MSHADOW_XINLINE static real_t Map(real_t a) {
+                return a;
+            }
+        };
+    }; // namespace op
+
+    /*! \brief namespace for potential reducer operations */
+    namespace red {
+        /*! \brief sum reducer */
+        struct sum {
+            /*! \brief do reduction into dst: adds src to dst */
+            MSHADOW_XINLINE static void Reduce( volatile real_t& dst,  volatile real_t src ) {
+                dst += src;
+            }
+            /*! \brief calculate gradient of redres with respect to redsrc,  redres: reduced result, redsrc: one of reduction element */
+            MSHADOW_XINLINE static real_t PartialGrad( real_t redres, real_t redsrc ) {
+                return 1.0f;
+            }
+            /*! \brief an initial value of reducer */
+            MSHADOW_CONSTEXPR static real_t kInitV = 0.0f;
+        };
+        /*! \brief maximum reducer */
+        struct maximum {
+            /*! \brief do reduction into dst: dst becomes the larger of dst and src */
+            MSHADOW_XINLINE static void Reduce( volatile real_t& dst,  volatile real_t src ) {
+                using namespace std;
+                dst = max( dst, src );
+            }
+            /*! \brief calculate gradient of redres with respect to redsrc,  redres: reduced result, redsrc: one of reduction element */
+            MSHADOW_XINLINE static real_t PartialGrad( real_t redres, real_t redsrc ) {
+                return redres == redsrc ? 1.0f: 0.0f;
+            }
+            /*! \brief an initial value of reducer: the lowest finite value of real_t */
+#if MSHADOW_SINGLE_PRECISION
+            MSHADOW_CONSTEXPR static real_t kInitV = -FLT_MAX;
+#else
+            MSHADOW_CONSTEXPR static real_t kInitV = -DBL_MAX;
+#endif
+        };
+    }; // namespace red
+
+    /*! \brief namespace for helper utils of the project */
+    namespace utils{
+        /*! \brief print the error message to stderr, then exit with status -1; NOTE(review): exit() is declared in <cstdlib>, which this header does not include directly */
+        inline void Error( const char *msg ){
+            fprintf( stderr, "Error:%s\n",msg );
+            exit( -1 );
+        }
+        /*! \brief assert an expression is true, otherwise call Error with the message "AssertError" */
+        inline void Assert( bool exp ){
+            if( !exp ) Error( "AssertError" );
+        }
+        /*! \brief assert an expression is true, otherwise call Error with msg */
+        inline void Assert( bool exp, const char *msg ){
+            if( !exp ) Error( msg );
+        }
+        /*! \brief print a warning message to stderr, execution continues */
+        inline void Warning( const char *msg ){
+            fprintf( stderr, "warning:%s\n",msg );
+        }
+    }; // namespace utils
+}; // namespace mshadow
+#endif // MSHADOW_TENSOR_BASE_H

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/b2dc51d2/include/mshadow/tensor_container.h
----------------------------------------------------------------------
diff --git a/include/mshadow/tensor_container.h b/include/mshadow/tensor_container.h
new file mode 100644
index 0000000..f0699e7
--- /dev/null
+++ b/include/mshadow/tensor_container.h
@@ -0,0 +1,152 @@
+#ifndef MSHADOW_TENSOR_CONTAINER_H
+#define MSHADOW_TENSOR_CONTAINER_H
+/*!
+ * \file tensor_container.h
+ * \brief tensor container that does memory allocation and resize like STL
+ * \author Tianqi Chen
+ */
+#include "tensor.h"
+#include "tensor_io.h"
+
+namespace mshadow{
+    /*!
+     * \brief tensor container that does memory allocation and resize like STL,
+     *        use it to save the lines of FreeSpace in class.
+     *        Do not abuse it, efficiency can come from pre-allocation and no re-allocation
+     *        NOTE(review): no copy constructor or copy assignment is declared while the destructor frees data_ - confirm instances are never copied
+     * \tparam Device which device the tensor is on
+     * \tparam dimension dimension of the tensor
+     */
+    template<typename Device, int dimension>
+    class TensorContainer: public Tensor<Device,dimension>{
+    public:
+        /*! 
+         * \brief constructor of an empty container: dptr is NULL and nothing is allocated here
+         * \param pad whether use padding alignment in space allocation
+         */
+        TensorContainer( bool pad = MSHADOW_ALLOC_PAD ){
+            this->pad_ = pad;
+            this->dptr = data_.dptr = NULL;
+            this->shape[0] = 0;
+            this->shape.stride_ = 0;
+            this->data_.shape.stride_ = 0; // zero capacity, so the first non-empty Resize takes the allocation branch
+            this->data_.shape[1] = 0;
+        }
+        /*! 
+         * \brief constructor that allocates space for the given shape, padding follows MSHADOW_ALLOC_PAD
+         * \param shape initial shape
+         */
+        TensorContainer( const Shape<dimension> &shape ){
+            this->pad_ = MSHADOW_ALLOC_PAD;
+            data_.dptr = NULL; // must be NULL before AllocByShape, which frees any non-NULL data_.dptr
+            this->AllocByShape( shape );
+        }
+        /*! 
+         * \brief constructor that allocates space for the given shape and assigns initv to the tensor
+         * \param shape initial shape
+         * \param initv initial value
+         */
+        TensorContainer( const Shape<dimension> &shape, real_t initv ){
+            this->pad_ = MSHADOW_ALLOC_PAD;
+            data_.dptr = NULL; // must be NULL before AllocByShape, which frees any non-NULL data_.dptr
+            this->AllocByShape( shape );
+            (*this) = initv;
+        }
+        ~TensorContainer( void ){ // releases the owned storage, if any
+            this->FreeSpace();
+        }
+        /*! 
+         * \brief resize the container to given shape, content is NOT preserved; re-allocates only when the flattened 2D shape exceeds the space recorded in data_
+         * \param shape target shape
+         */
+        inline void Resize( const Shape<dimension> &shape ){
+            Shape<2> s2 = shape.FlatTo2D();            // compared below against the allocated space described by data_.shape
+            if( s2.shape_[0] > data_.shape.stride_ || s2.shape_[1] > data_.shape[1] ){
+                this->AllocByShape( shape );
+            }else{
+                this->shape = shape; // existing storage is large enough: only the shape metadata changes
+                if( this->pad_ ){
+                    this->shape.stride_ = data_.shape.stride_;
+                }else{
+                    this->shape.stride_ = this->shape[ 0 ];
+                }
+            }
+        }
+        /*! 
+         * \brief resize the container to given shape, and initialize, content is NOT preserved
+         * \param shape target shape
+         * \param initv initialization value
+         */
+        inline void Resize( const Shape<dimension> &shape, real_t initv ){
+            this->Resize( shape );
+            (*this) = initv;
+        }
+        /*! \brief set whether padding is allowed in tensor; only stores the flag, existing storage is not touched here */
+        inline void set_pad( bool pad ){
+            this->pad_ = pad;
+        }
+        /*! 
+         * \brief save by binary format, forwards to mshadow::SaveBinary
+         * \param fo output binary stream
+         * \tparam TStream type of stream, need to support Read, Write, one example is utils::IStream.
+         */
+        template<typename TStream>
+        inline void SaveBinary( TStream &fo ) const{
+            mshadow::SaveBinary( fo, *this );
+        }
+        /*! 
+         * \brief load by binary format, a temp Tensor<cpu,dim> storage will be allocated, copied into this container and freed again
+         * \param fi input binary stream
+         * \tparam TStream type of stream, need to support Read, Write, one example is utils::IStream.
+         */
+        template<typename TStream>
+        inline void LoadBinary( TStream &fi ) {
+            Tensor<cpu,dimension> tmp;
+            mshadow::LoadBinary( fi, tmp, false ); // NOTE(review): presumably allocates tmp; the meaning of the false argument is defined in tensor_io.h - confirm
+            this->Resize( tmp.shape );
+            Copy( *this, tmp );
+            mshadow::FreeSpace( tmp );
+        }
+    public:
+        // assignment operators forwarded to __assign, needed to fit the expression template
+        inline Tensor<Device,dimension>& operator=( real_t s ){
+            return this->__assign( s );
+        }
+        template<typename E>
+        inline Tensor<Device,dimension>& operator=( const expr::Exp<E,expr::type::kMapper> &exp ){
+            return this->__assign( exp );
+        }
+        template<typename E>
+        inline Tensor<Device,dimension>& operator=( const expr::Exp<E,expr::type::kComplex> &exp ){
+            return this->__assign( exp );
+        }
+    private:
+        /*! \brief whether we do padding in the space */
+        bool pad_;
+        /*! \brief the shape of data_ is actually current data space; data_.dptr is the pointer that gets freed */
+        Tensor<Device, 2> data_;
+    private:
+        inline void FreeSpace (void){ // free the owned storage once and reset both pointers to NULL
+            if( data_.dptr != NULL ){
+                mshadow::FreeSpace( data_ );
+                data_.dptr = this->dptr = NULL;
+            }
+        }
+        inline void AllocByShape (const Shape<dimension>& shape){ // drop any old storage, allocate for shape, then point the base tensor at it
+            if( data_.dptr != NULL ){
+                this->FreeSpace();
+            }
+            data_.shape = shape.FlatTo2D();
+            mshadow::AllocSpace( data_, pad_ );
+            this->dptr  = data_.dptr;
+            this->shape = shape;
+            if( this->pad_ ){
+                this->shape.stride_ = data_.shape.stride_; // use the stride chosen by AllocSpace
+            }else{
+                this->shape.stride_ = shape[0];
+            }
+        }
+    };
+};// namespace mshadow
+
+#endif

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/b2dc51d2/include/mshadow/tensor_cpu-inl.hpp
----------------------------------------------------------------------
diff --git a/include/mshadow/tensor_cpu-inl.hpp b/include/mshadow/tensor_cpu-inl.hpp
new file mode 100644
index 0000000..0fa3cfa
--- /dev/null
+++ b/include/mshadow/tensor_cpu-inl.hpp
@@ -0,0 +1,168 @@
+#ifndef MSHADOW_TENSOR_CPU_INL_HPP
+#define MSHADOW_TENSOR_CPU_INL_HPP
+/*!
+ * \file tensor_cpu-inl.hpp
+ * \brief implementation of CPU host code
+ * \author Bing Xu, Tianqi Chen
+ */
+#include <cstring>
+#include "tensor_base.h"
+#include "tensor_sse-inl.hpp"
+
+namespace mshadow {
+    template<int dim> // allocate CPU storage for obj according to obj.shape; writes obj.dptr and obj.shape.stride_
+    inline void AllocSpace(Tensor<cpu,dim> &obj, bool pad ){
+        size_t pitch; // output argument of AlignedMallocPitch; only read in the padded branch
+        if( pad ){
+            obj.dptr = (real_t*)sse2::AlignedMallocPitch // padded: shape[0]*sizeof(real_t) bytes requested for each of FlatTo2D().shape[1] rows
+                ( pitch, obj.shape[0] * sizeof(real_t), obj.FlatTo2D().shape[1] );
+            obj.shape.stride_ = static_cast<index_t>( pitch / sizeof(real_t) ); // stride in elements; pitch is presumably the row size in bytes - confirm in tensor_sse-inl.hpp
+        }else{
+            obj.shape.stride_ = obj.shape[0]; // unpadded: stride equals the lowest dimension
+            obj.dptr = (real_t*)sse2::AlignedMallocPitch // one block holding shape.Size() elements
+                ( pitch, obj.shape.Size() * sizeof(real_t), 1 );
+        }
+    }
+
+    template<typename Device, int dim> // build a tensor of the given shape, allocate its space and evaluate the scalar initv into it
+    inline Tensor<Device,dim> NewTensor(const Shape<dim> &shape, real_t initv, bool pad ){
+        Tensor<Device, dim> obj( shape );
+        AllocSpace( obj, pad );
+        MapExp<sv::saveto>( obj, expr::ScalarExp( initv ) ); // store ScalarExp( initv ) into obj through the sv::saveto saver
+        return obj; // NOTE(review): nothing here frees the space; presumably the caller must call FreeSpace - confirm
+    }
+
+    template<int dim> // release the CPU storage held by obj and reset obj.dptr to NULL
+    inline void FreeSpace(Tensor<cpu,dim> &obj){
+        sse2::AlignedFree( obj.dptr );
+        obj.dptr = NULL; // do not leave a dangling pointer in the handle
+    }
+
+    template<int dim> // copy the elements of _src into _dst row by row; exits through utils::Assert when the shapes differ
+    inline void Copy(Tensor<cpu,dim> _dst, const Tensor<cpu,dim> &_src ){
+        utils::Assert( _dst.shape == _src.shape, "Copy:shape mismatch" );
+        Tensor<cpu,2> dst = _dst.FlatTo2D();
+        Tensor<cpu,2> src = _src.FlatTo2D();
+        for (index_t y = 0; y < dst.shape[1]; ++y ) {
+            memcpy( dst[y].dptr, src[y].dptr, sizeof(real_t) * dst.shape[0] ); // shape[0] elements per row; presumably dst[y]/src[y] apply each tensor's own stride - confirm
+        }
+    }
+
+    template<typename Saver, typename E, int dim> // evaluate plan at every ( y, x ) of the 2D-flattened _dst and store the value through Saver::Save
+    inline void MapPlan(Tensor<cpu,dim> _dst, const expr::Plan<E> &plan){
+        Tensor<cpu,2> dst = _dst.FlatTo2D();
+        for (index_t y = 0; y < dst.shape[1]; ++y ) {
+            for (index_t x = 0; x < dst.shape[0]; ++x ) {
+                // the per-element Save/Eval calls are left to the compiler to inline and optimize
+                Saver::Save(dst[y][x], plan.Eval( y, x ) );
+            }
+        }
+    }
+
+    // code to handle SSE optimization: MapExpCPUEngine is specialized on pass_check to choose between the plain and the SSE evaluation path
+    template<bool pass_check,typename Saver, int dim, typename E, int etype>
+    struct MapExpCPUEngine;
+    template<typename SV, int dim, typename E, int etype>
+    struct MapExpCPUEngine<false,SV,dim,E,etype>{ // pass_check == false: always use the element-wise MapPlan
+        inline static void Map(Tensor<cpu,dim> dst, const expr::Exp<E,etype> &exp ){
+            MapPlan<SV>( dst, MakePlan( exp.self() ) );
+        }
+    };
+
+    #if MSHADOW_USE_SSE
+    template<typename SV, int dim, typename E, int etype>
+    struct MapExpCPUEngine<true,SV,dim,E,etype>{ // pass_check == true: use the SSE plan when both alignment checks succeed
+        inline static void Map(Tensor<cpu,dim> dst, const expr::Exp<E,etype> &exp ){
+            using namespace expr;
+            if( SSEAlignCheck<dim,E>::Check( exp.self() ) && SSEAlignCheck< dim,Tensor<cpu,dim> >::Check(dst) ){
+                MapSSEPlan<SV>( dst, MakeSSEPlan( exp.self() ) );
+            }else{
+                MapPlan<SV>( dst, MakePlan( exp.self() ) ); // fall back to the plain plan when either check fails
+            }
+        }
+    };
+    #endif
+
+    template<typename Saver, int dim, typename E, int etype> // check types and shapes of exp against dst, then evaluate exp into dst through Saver
+    inline void MapExp(Tensor<cpu,dim> dst, const expr::Exp<E,etype> &exp ){
+        using namespace expr;
+        TypeCheckPass< TypeCheck<cpu,dim,E>::kMapPass >::Error_All_Tensor_in_Exp_Must_Have_Same_Type(); // presumably a compile-time check: the member only exists when the type check passes - confirm
+        Shape<dim> eshape = ShapeCheck<dim,E>::Check( exp.self() );
+        utils::Assert( eshape[0] == 0 || eshape == dst.shape, "Assignment: Shape of Tensors in expression is not consistent with target" ); // eshape[0] == 0 is accepted without comparing to dst.shape
+        #if MSHADOW_USE_SSE
+        MapExpCPUEngine< SSECheck<E>::kPass,Saver,dim,E,etype >::Map( dst, exp );
+        #else
+        MapExpCPUEngine< false,Saver,dim,E,etype >::Map( dst, exp );
+        #endif
+    }
+
+    template<typename Saver, typename Reducer, typename E, int etype> // reduce the 2D-flattened exp over y for every x, scale the result and save it to dst[x]
+    inline void MapReduceKeepLowest( Tensor<cpu,1> dst, const expr::Exp<E,etype> &exp, real_t scale ){
+        using namespace expr;
+        TypeCheckPass< TypeCheck<cpu,1,E>::kRedPass >::Error_TypeCheck_Not_Pass_For_Reduce_Exp();
+        Shape<2> eshape = ShapeCheck< ExpInfo<E>::kDim, E >::Check( exp.self() ).FlatTo2D();
+
+        utils::Assert( eshape[0] == dst.shape[0], "reduction dimension do not match" );
+        utils::Assert( eshape[1] != 0, "can not reduce over empty tensor" ); // required because the loop below reads row 0 unconditionally
+        // execution
+        expr::Plan<E> plan = MakePlan( exp.self() );
+        for( index_t x = 0; x < eshape[0]; ++x ){
+            real_t res = plan.Eval( 0, x ); // seed with row 0 instead of Reducer::kInitV
+            for( index_t y = 1; y < eshape[1]; ++y ){
+                Reducer::Reduce( res, plan.Eval( y, x ) );
+            }
+            Saver::Save( dst[x], res*scale );
+        }
+    }
+
+    template<typename Saver, typename Reducer, int dimkeep, typename E, int etype> // reduce exp over every dimension except dimkeep, scale the result and save it to dst
+    inline void MapReduceKeepHighDim( Tensor<cpu,1> dst, const expr::Exp<E,etype> &exp, real_t scale ){
+        using namespace expr;
+        TypeCheckPass< TypeCheck<cpu,dimkeep,E>::kRedPass >::Error_TypeCheck_Not_Pass_For_Reduce_Exp();
+        typedef Shape< ExpInfo<E>::kDim > EShape;
+        EShape eshape = ShapeCheck< ExpInfo<E>::kDim, E >::Check( exp.self() );
+        utils::Assert( eshape[dimkeep] == dst.shape[0], "reduction dimension do not match" );
+        // use equivalent 4D form of the expression shape
+        Shape<4> pshape = Shape4( eshape.ProdShape(dimkeep+1,EShape::kMaxShape), eshape[dimkeep], // presumably listed highest dimension first, making pshape[2] the kept dimension - confirm against Shape4
+                                  eshape.ProdShape(1,dimkeep), eshape[0] );
+
+        // execution
+        expr::Plan<E> plan = MakePlan( exp.self() );
+
+        for( index_t c = 0; c < pshape[2]; ++c ){
+            real_t res = Reducer::kInitV;
+            for( index_t n = 0; n < pshape[3]; ++n ){
+                real_t tres = Reducer::kInitV; // partial result for this n, folded into res below
+                for( index_t y = 0; y < pshape[1]; ++y ){
+                    for( index_t x = 0; x < pshape[0]; ++x ){
+                        Reducer::Reduce( tres, plan.Eval( (n*pshape[2] + c) * pshape[1] + y, x ) );
+                    }
+                }
+                Reducer::Reduce( res, tres );
+            }
+            Saver::Save( dst[c], res*scale );
+        }
+    }
+
+    inline void Softmax( Tensor<cpu,1> dst, const Tensor<cpu,1>& energy ){ // dst[x] = exp( energy[x] - max ) / sum over dst.shape[0] elements; reads energy[0], so the tensors must be non-empty
+        real_t mmax = energy[0]; // running maximum, subtracted from every element before std::exp
+        for( index_t x = 1; x < dst.shape[0]; ++x ) // index with index_t like the loops below; a floating-point counter stops advancing once ++x is no longer representable
+            if( mmax < energy[x] ) mmax = energy[x];
+        real_t sum = 0.0f;
+        for( index_t x = 0; x < dst.shape[0]; ++x ){
+            dst[x] = std::exp( energy[x] - mmax );
+            sum += dst[x];
+        }
+        for( index_t x = 0; x < dst.shape[0]; ++x ){
+            dst[x] /= sum; // normalize by the accumulated sum
+        }
+    }
+    inline void Softmax( Tensor<cpu,2> dst, const Tensor<cpu,2>& energy ){ // apply the 1D Softmax to each of the dst.shape[1] rows
+        utils::Assert( dst.shape == energy.shape, "Softmax: shape mismatch" ); // exits through utils::Assert when the shapes differ
+        for( index_t y = 0; y < dst.shape[1]; ++y ){
+            Softmax( dst[y], energy[y] );
+        }
+    }
+}; // namespace mshadow
+
+#endif // TENSOR_CPU_INL_HPP

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/b2dc51d2/include/mshadow/tensor_expr.h
----------------------------------------------------------------------
diff --git a/include/mshadow/tensor_expr.h b/include/mshadow/tensor_expr.h
new file mode 100644
index 0000000..ac8fde7
--- /dev/null
+++ b/include/mshadow/tensor_expr.h
@@ -0,0 +1,367 @@
+#ifndef MSHADOW_TENSOR_EXPR_H
+#define MSHADOW_TENSOR_EXPR_H
+/*!
+ * \file tensor_expr.h
+ * \brief definitions of abstract expressions and expressions template
+ * \author Tianqi Chen, Bing Xu
+ */
+#include "tensor_base.h"
+
+namespace mshadow{
+    /*!
+     * \brief namespace for abstract expressions and expressions template,
+     *        have no dependency on tensor.h,
+     *        These data structures take no part in computations,
+     *        they are only used to define operations and represent expression in a symbolic way
+     */
+    namespace expr{
+
+        /*! \brief type of expressions; the values are combined with bitwise OR when expressions are composed */
+        namespace type{
+            /*! \brief this expression directly corresponds to a data class */
+            const int kContainer = 0;
+            /*! \brief this only contains element-wise vector operations */
+            const int kMapper    = 1;
+            /*! \brief other case: e.g dot product; 3 contains the kMapper bit, so OR-ing it with any other type stays kComplex */
+            const int kComplex   = 3;
+        };
+
+        /*!
+         * \brief expression engine that actually interprets these expressions
+         *        this is a template that needs to be implemented (specialized) for specific containers; only Eval is declared here
+         */
+        template<typename Saver,typename Container>
+        struct ExpEngine{
+            template<typename EType>
+            inline static void Eval( Container& dst, const EType &exp );
+        };
+
+        template<typename Container>
+        class ContainerExp;
+        class ScalarExp;
+
+        /*!
+         * \brief base class for expression
+         * \tparam SubType inherited class must put their type into this parameter, it is the target of the static_cast in self/refself
+         * \tparam exp_type expression type, see namespace type
+         */
+        template<typename SubType, int exp_type>
+        struct Exp{
+        public:
+            /*! \return const reference to the subtype instance of current class */
+            inline const SubType& self( void ) const{
+                return *static_cast<const SubType*>(this);
+            }
+            /*! \return mutable reference of subtype instance of current class */
+            inline SubType& refself( void ){
+                return *static_cast<SubType*>(this);
+            }
+        };
+
+        /*! \brief scalar expression, wraps one real_t value as a kMapper expression */
+        struct ScalarExp: public Exp<ScalarExp, type::kMapper>{
+            /*! \brief scalar value */
+            real_t scalar_;
+            /*! \brief constructor; not explicit, so a real_t converts implicitly to ScalarExp */
+            ScalarExp( real_t scalar ):scalar_(scalar){}
+        };
+
+        /*! \brief represent a transpose expression of a container; marked kComplex */
+        template<typename EType>
+        struct TransposeExp: public Exp< TransposeExp<EType>, type::kComplex >{
+        public:
+            /*! \brief expression to be transposed, held by reference: it must outlive this object */
+            const EType &exp;
+            /*! \brief constructor */
+            TransposeExp( const EType &e ):exp(e){}
+            /*! \brief transpose of the transpose, i.e. the original expression */
+            inline const EType & T( void ) const{
+                return exp;
+            }
+        };
+        
+        /*!
+         * \brief base class of all variables, that can be assigned to values
+         * \tparam Container the actual class of data container, e.g. CTensor1D
+         */
+        template<typename Container>
+        class ContainerExp: public Exp< Container, type::kContainer >{
+        public:
+            /*!
+             *\brief transpose of a matrix
+             *\return transpose of current expression, it refers to this container and does not copy it
+             */
+            inline const TransposeExp<Container> T( void ) const{
+                return TransposeExp<Container>( this->self() );
+            }
+        public:
+            /*! \brief operator overload: evaluate ScalarExp(s) into this container with the sv::plusto saver */
+            inline Container &operator+=( real_t s ){
+                ExpEngine<sv::plusto,Container>::Eval( this->refself(), ScalarExp(s) );
+                return this->refself();
+            }
+            /*! \brief operator overload: evaluate ScalarExp(s) into this container with the sv::minusto saver */
+            inline Container &operator-=( real_t s ){
+                ExpEngine<sv::minusto,Container>::Eval( this->refself(), ScalarExp(s) );
+                return this->refself();
+            }
+            /*! \brief operator overload: evaluate ScalarExp(s) into this container with the sv::multo saver */
+            inline Container &operator*=( real_t s ){
+                ExpEngine<sv::multo,Container>::Eval( this->refself(), ScalarExp(s) );
+                return this->refself();
+            }
+            /*! \brief operator overload: evaluate ScalarExp(s) into this container with the sv::divto saver */
+            inline Container &operator/=( real_t s ){
+                ExpEngine<sv::divto,Container>::Eval( this->refself(), ScalarExp(s) );
+                return this->refself();
+            }
+            /*! \brief implementation of operator= for a scalar: evaluate ScalarExp(s) into this container with the sv::saveto saver */
+            inline Container &__assign( real_t s ){
+                ExpEngine<sv::saveto,Container>::Eval( this->refself(), ScalarExp(s) );
+                return this->refself();
+            }
+        public:
+            /*! \brief implementation of operator= for kMapper expressions, note that we can not define container = container */
+            template<typename E>
+            inline Container &__assign( const Exp<E,type::kMapper> &exp ){
+                ExpEngine<sv::saveto,Container>::Eval( this->refself(), exp.self() );
+                return this->refself();
+            }
+            /*! \brief implementation of operator= for kComplex expressions, note that we can not define container = container */
+            template<typename E>
+            inline Container &__assign( const Exp<E,type::kComplex> &exp ){
+                ExpEngine<sv::saveto,Container>::Eval( this->refself(), exp.self() );
+                return this->refself();
+            }
+            /*! \brief implementation of operator+= for an expression of any type */
+            template<typename E,int etype>
+            inline Container &operator+=( const Exp<E,etype> &exp ){
+                ExpEngine<sv::plusto,Container>::Eval( this->refself(), exp.self() );
+                return this->refself();
+            }
+            /*! \brief implementation of operator-= for an expression of any type */
+            template<typename E,int etype>
+            inline Container &operator-=( const Exp<E,etype> &exp ){
+                ExpEngine<sv::minusto,Container>::Eval( this->refself(), exp.self() );
+                return this->refself();
+            }
+            /*! \brief implementation of operator*= for an expression of any type */
+            template<typename E,int etype>
+            inline Container &operator*=( const Exp<E,etype> &exp ){
+                ExpEngine<sv::multo,Container>::Eval( this->refself(), exp.self() );
+                return this->refself();
+            }
+            /*! \brief implementation of operator/= for an expression of any type */
+            template<typename E,int etype>
+            inline Container &operator/=( const Exp<E,etype> &exp ){
+                ExpEngine<sv::divto,Container>::Eval( this->refself(), exp.self() );
+                return this->refself();
+            }
+        };
+    }; // namespace expr
+
+    namespace expr{
+        /*!
+         * \brief matrix multiplication expression dot( lhs[.T], rhs[.T] ), carrying a scalar scale for the result
+         * \tparam TA type of lhs
+         * \tparam TB type of rhs
+         * \tparam ltrans whether lhs is transposed
+         * \tparam rtrans whether rhs is transposed
+         */
+        template<typename TA,typename TB,bool ltrans,bool rtrans>
+        struct DotExp: public Exp< DotExp<TA,TB,ltrans,rtrans>, type::kComplex >{
+            /*! \brief left operand, held by reference: it must outlive this expression */
+            const TA& lhs_;
+            /*! \brief right operand, held by reference: it must outlive this expression */
+            const TB& rhs_;
+            /*! \brief scale over result */
+            real_t scale_;
+            /*! \brief constructor */
+            DotExp( const TA &lhs, const TB &rhs, real_t scale )
+                :lhs_(lhs),rhs_(rhs),scale_(scale){}
+        };
+
+        /*! \brief dot operator def: dot( lhs, rhs ), neither operand transposed, scale 1 */
+        template<typename TA, typename TB>
+        inline DotExp<TA,TB,false,false> dot( const ContainerExp<TA> &lhs, const ContainerExp<TB> &rhs ){
+            return DotExp<TA,TB,false,false>( lhs.self(), rhs.self(), 1.0f );
+        }
+        /*! \brief dot operator def: dot( lhs.T(), rhs ), lhs transposed, scale 1 */
+        template<typename TA, typename TB>
+        inline DotExp<TA,TB,true,false> dot( const TransposeExp<TA> &lhs, const ContainerExp<TB> &rhs ){
+            return DotExp<TA,TB,true,false>( lhs.exp, rhs.self(), 1.0f );
+        }
+        /*! \brief dot operator def: dot( lhs, rhs.T() ), rhs transposed, scale 1 */
+        template<typename TA, typename TB>
+        inline DotExp<TA,TB,false,true> dot( const ContainerExp<TA> &lhs, const TransposeExp<TB> &rhs ){
+            return DotExp<TA,TB,false,true>( lhs.self(), rhs.exp, 1.0f );
+        }
+        /*! \brief dot operator def: dot( lhs.T(), rhs.T() ), both operands transposed, scale 1 */
+        template<typename TA, typename TB>
+        inline DotExp<TA,TB,true,true> dot( const TransposeExp<TA> &lhs, const TransposeExp<TB> &rhs ){
+            return DotExp<TA,TB,true,true>( lhs.exp, rhs.exp, 1.0f );
+        }
+        /*! \brief scale of dot operation: dot expression * scalar, the scalar is folded into scale_ */
+        template<typename TA, typename TB, bool ltrans, bool rtrans >
+        inline DotExp<TA,TB,ltrans,rtrans> operator*( const DotExp<TA,TB,ltrans,rtrans> &lhs, real_t rhs ){
+            return DotExp<TA,TB,ltrans,rtrans>( lhs.lhs_, lhs.rhs_, lhs.scale_ * rhs );
+        }
+        /*! \brief scale of dot operation: scalar * dot expression, the scalar is folded into scale_ */
+        template<typename TA, typename TB, bool ltrans, bool rtrans >
+        inline DotExp<TA,TB,ltrans,rtrans> operator*( real_t lhs, const DotExp<TA,TB,ltrans,rtrans> &rhs ){
+            return DotExp<TA,TB,ltrans,rtrans>( rhs.lhs_, rhs.rhs_, rhs.scale_ * lhs );
+        }
+    }; // namespace expr
+
+    namespace expr{
+        /*!
+         * \brief binary map expression lhs [op] rhs
+         * \tparam OP operator
+         * \tparam TA type of lhs
+         * \tparam TB type of rhs
+         * \tparam etype expression type, see namespace type
+         */
+        template<typename OP, typename TA, typename TB, int etype >
+        struct BinaryMapExp: public Exp< BinaryMapExp<OP,TA,TB,etype>, etype >{
+            /*! \brief left operand, held by reference: it must outlive this expression */
+            const TA& lhs_;
+            /*! \brief right operand, held by reference: it must outlive this expression */
+            const TB& rhs_;
+            /*! \brief constructor */
+            BinaryMapExp( const TA &lhs, const TB &rhs )
+                :lhs_(lhs), rhs_(rhs){}
+        };
+
+        /*! \brief make binary expression; the result type is the bitwise OR of both operand types and kMapper */
+        template<typename OP,typename TA, typename TB, int ta, int tb>
+        inline BinaryMapExp<OP,TA,TB, (ta|tb|type::kMapper) > MakeExp( const Exp<TA,ta> &lhs, const Exp<TB,tb> &rhs ){
+            return BinaryMapExp<OP,TA,TB, (ta|tb|type::kMapper) >( lhs.self(), rhs.self() );
+        }
+
+        /*! 
+         * \brief short hand for MakeExp, usage F<op>(lhs, rhs). create a binary operation expression
+         * \param lhs left operand
+         * \param rhs right operand
+         * \tparam OP binary operator
+         * \tparam TA lhs expression
+         * \tparam ta lhs expression type
+         * \tparam TB rhs expression
+         * \tparam tb rhs expression type
+         * \sa mshadow::op
+         */
+        template<typename OP,typename TA, typename TB, int ta, int tb>
+        inline BinaryMapExp<OP,TA,TB, (ta|tb|type::kMapper) > F( const Exp<TA,ta> &lhs, const Exp<TB,tb> &rhs ){
+            return MakeExp<OP>( lhs, rhs );
+        }
+        /*! \brief F<op>(lhs, rhs) overload for a scalar (constant) right operand */
+        template<typename OP,typename TA, int ta>
+        inline BinaryMapExp<OP,TA,ScalarExp, (ta|type::kMapper) > F( const Exp<TA,ta> &lhs, const ScalarExp &rhs ){
+            return MakeExp<OP>( lhs, rhs );
+        }
+        /*! \brief F<op>(lhs, rhs) overload for a scalar (constant) left operand */
+        template<typename OP,typename TB, int tb>
+        inline BinaryMapExp<OP,ScalarExp,TB, (tb|type::kMapper) > F( const ScalarExp &lhs, const Exp<TB,tb>& rhs ){
+            return MakeExp<OP>( lhs, rhs );
+        }
+
+        // operator rules: expression [op] expression
+        /*! \brief operator overload: lhs + rhs builds an op::plus expression */
+        template<typename TA, typename TB, int ta, int tb>
+        inline BinaryMapExp<op::plus,TA,TB, (ta|tb|type::kMapper) > operator+( const Exp<TA,ta> &lhs, const Exp<TB,tb> &rhs ){
+            return MakeExp<op::plus>( lhs, rhs );
+        }
+        /*! \brief operator overload: lhs - rhs builds an op::minus expression */
+        template<typename TA, typename TB, int ta, int tb>
+        inline BinaryMapExp<op::minus,TA,TB, (ta|tb|type::kMapper) > operator-( const Exp<TA,ta> &lhs, const Exp<TB,tb> &rhs ){
+            return MakeExp<op::minus>( lhs, rhs );
+        }
+        /*! \brief operator overload: lhs * rhs builds an op::mul expression */
+        template<typename TA, typename TB, int ta, int tb>
+        inline BinaryMapExp<op::mul,TA,TB, (ta|tb|type::kMapper) > operator*( const Exp<TA,ta> &lhs, const Exp<TB,tb> &rhs ){
+            return MakeExp<op::mul>( lhs, rhs );
+        }
+        /*! \brief operator overload: lhs / rhs builds an op::div expression */
+        template<typename TA, typename TB, int ta, int tb>
+        inline BinaryMapExp<op::div,TA,TB, (ta|tb|type::kMapper) > operator/( const Exp<TA,ta> &lhs, const Exp<TB,tb> &rhs ){
+            return MakeExp<op::div>( lhs, rhs );
+        }
+        // constant operators: expression [op] scalar
+        /*! \brief operator overload: lhs + scalar builds an op::plus expression */
+        template<typename TA, int ta>
+        inline BinaryMapExp<op::plus, TA, ScalarExp, (ta|type::kMapper) > operator+( const Exp<TA,ta>& lhs,  const ScalarExp& rhs ){
+            return MakeExp<op::plus>( lhs, rhs );
+        }
+        /*! \brief operator overload: lhs - scalar builds an op::minus expression */
+        template<typename TA, int ta>
+        inline BinaryMapExp<op::minus, TA, ScalarExp, (ta|type::kMapper) > operator-( const Exp<TA,ta>& lhs,  const ScalarExp& rhs ){
+            return MakeExp<op::minus>( lhs, rhs );
+        }
+        /*! \brief operator overload: lhs * scalar builds an op::mul expression */
+        template<typename TA, int ta>
+        inline BinaryMapExp<op::mul, TA, ScalarExp, (ta|type::kMapper) > operator*( const Exp<TA,ta>& lhs,  const ScalarExp& rhs ){
+            return MakeExp<op::mul>( lhs, rhs );
+        }
+        /*! \brief operator overload: lhs / scalar builds an op::div expression */
+        template<typename TA, int ta>
+        inline BinaryMapExp<op::div, TA, ScalarExp, (ta|type::kMapper) > operator/( const Exp<TA,ta>& lhs,  const ScalarExp& rhs ){
+            return MakeExp<op::div>( lhs, rhs );
+        }
+        // constant operators 2: scalar [op] expression
+        /*! \brief operator overload: scalar + rhs builds an op::plus expression */
+        template<typename TB, int tb>
+        inline BinaryMapExp<op::plus, ScalarExp, TB, (tb|type::kMapper) > operator+( const ScalarExp& lhs, const Exp<TB,tb>& rhs ){
+            return MakeExp<op::plus>( lhs, rhs );
+        }
+        /*! \brief operator overload: scalar - rhs builds an op::minus expression */
+        template<typename TB, int tb>
+        inline BinaryMapExp<op::minus, ScalarExp, TB, (tb|type::kMapper) > operator-( const ScalarExp& lhs, const Exp<TB,tb>& rhs ){
+            return MakeExp<op::minus>( lhs, rhs );
+        }
+        /*! \brief operator overload: scalar * rhs builds an op::mul expression */
+        template<typename TB, int tb>
+        inline BinaryMapExp<op::mul, ScalarExp, TB, (tb|type::kMapper) > operator*( const ScalarExp& lhs, const Exp<TB,tb>& rhs ){
+            return MakeExp<op::mul>( lhs, rhs );
+        }
+        /*! \brief operator overload: scalar / rhs builds an op::div expression */
+        template<typename TB, int tb>
+        inline BinaryMapExp<op::div, ScalarExp, TB, (tb|type::kMapper) > operator/( const ScalarExp& lhs, const Exp<TB,tb>& rhs ){
+            return MakeExp<op::div>( lhs, rhs );
+        }
+    };
+
+    namespace expr{
+        /*!
+         * \brief unary map expression op(src)
+         * \tparam OP operator
+         * \tparam TA type of src
+         * \tparam etype expression type, see namespace type
+         */
+        template<typename OP, typename TA, int etype >
+        struct UnaryMapExp: public Exp< UnaryMapExp<OP,TA,etype>, etype >{
+            /*! \brief source expression, held by reference: it must outlive this expression */
+            const TA& src_;
+            /*! \brief constructor */
+            UnaryMapExp( const TA &src ):src_(src){}
+        };
+
+        /*! \brief make unary expression; the result type is the bitwise OR of the source type and kMapper */
+        template<typename OP,typename TA, int ta>
+        inline UnaryMapExp<OP,TA,(ta|type::kMapper) > MakeExp( const Exp<TA,ta> &src ){
+            return UnaryMapExp<OP,TA, (ta|type::kMapper) >( src.self() );
+        }
+
+        /*! 
+         * \brief short hand for MakeExp, usage F<op>(src), create a unary operation expression
+         * \param src source expression
+         * \tparam OP unary operator
+         * \tparam TA source expression
+         * \tparam ta source expression type
+         * \sa mshadow::op
+         */
+        template<typename OP,typename TA, int ta>
+        inline UnaryMapExp<OP,TA,(ta|type::kMapper) > F( const Exp<TA,ta> &src ){
+            return MakeExp<OP>(src);
+        }
+    };
+};
+#endif


[12/12] incubator-singa git commit: Transfer code from nusinga repo to singa apache repo. New commuinication framework is implemented to unify the frameworks of existing distributed deep learning systems. Communication is now implmented using ZeroMQ. API

Posted by wa...@apache.org.
Transfer code from nusinga repo to singa apache repo.
A new communication framework is implemented to unify the frameworks of existing distributed deep learning systems.
Communication is now implemented using ZeroMQ. The APIs are general enough to replace the MPI-based implementation.
Tested on a single node using the examples in the examples/ folder. TODO: test with other cluster configurations (e.g., multiple nodes).


Project: http://git-wip-us.apache.org/repos/asf/incubator-singa/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-singa/commit/b2dc51d2
Tree: http://git-wip-us.apache.org/repos/asf/incubator-singa/tree/b2dc51d2
Diff: http://git-wip-us.apache.org/repos/asf/incubator-singa/diff/b2dc51d2

Branch: refs/heads/master
Commit: b2dc51d2364593611e06ded3aaeadab8274fae13
Parents: 95b1e6d
Author: wang wei <wa...@comp.nus.edu.sg>
Authored: Sun May 3 21:57:08 2015 +0800
Committer: wang wei <wa...@comp.nus.edu.sg>
Committed: Sun May 3 21:57:08 2015 +0800

----------------------------------------------------------------------
 .gitignore                                 |    28 +
 LICENSE.md                                 |   864 +
 Makefile                                   |    90 +
 README.md                                  |     4 +-
 examples/cifar10/cluster.conf              |     3 +
 examples/cifar10/hostfile                  |    20 +
 examples/cifar10/model.conf                |   219 +
 examples/cifar10/run.sh                    |    50 +
 examples/mnist/batch.sh                    |    41 +
 examples/mnist/cluster.conf                |     3 +
 examples/mnist/conv.conf                   |   175 +
 examples/mnist/hostfile                    |     8 +
 examples/mnist/mlp.conf                    |   223 +
 examples/mnist/run.sh                      |    40 +
 include/communication/msg.h                |   187 +
 include/communication/socket.h             |   144 +
 include/gtest/gtest-all.cc                 |  9592 +++++++++++
 include/gtest/gtest.h                      | 20061 ++++++++++++++++++++++
 include/gtest/gtest_main.cc                |    38 +
 include/mshadow/cuda/cuda_reduce.cuh       |   117 +
 include/mshadow/cuda/tensor_gpu-inl.cuh    |   231 +
 include/mshadow/cxxnet_op.h                |   116 +
 include/mshadow/tensor.h                   |   472 +
 include/mshadow/tensor_base.h              |   298 +
 include/mshadow/tensor_container.h         |   152 +
 include/mshadow/tensor_cpu-inl.hpp         |   168 +
 include/mshadow/tensor_expr.h              |   367 +
 include/mshadow/tensor_expr_engine-inl.hpp |   416 +
 include/mshadow/tensor_expr_ext.h          |   978 ++
 include/mshadow/tensor_gpu-inl.hpp         |   148 +
 include/mshadow/tensor_io.h                |   137 +
 include/mshadow/tensor_random.h            |   299 +
 include/mshadow/tensor_sse-inl.hpp         |   431 +
 include/neuralnet/base_layer.h             |   563 +
 include/neuralnet/layer.h                  |   297 +
 include/neuralnet/neuralnet.h              |   156 +
 include/trainer/pm_server.h                |    91 +
 include/trainer/pm_worker.h                |   171 +
 include/trainer/server.h                   |    22 +
 include/trainer/trainer.h                  |    50 +
 include/trainer/worker.h                   |   218 +
 include/utils/blob.h                       |   166 +
 include/utils/cluster.h                    |   125 +
 include/utils/common.h                     |    51 +
 include/utils/data_shard.h                 |   145 +
 include/utils/factory.h                    |    57 +
 include/utils/graph.h                      |   150 +
 include/utils/param.h                      |   172 +
 include/utils/singleton.h                  |    41 +
 include/utils/updater.h                    |    78 +
 script/node.sh                             |    71 +
 src/communication/msg.cc                   |     5 +
 src/communication/socket.cc                |   118 +
 src/main.cc                                |    49 +
 src/neuralnet/base_layer.cc                |   194 +
 src/neuralnet/layer.cc                     |   781 +
 src/neuralnet/neuralnet.cc                 |   401 +
 src/proto/cluster.pb.h                     |   989 ++
 src/proto/cluster.proto                    |    45 +
 src/proto/model.pb.h                       |  8167 +++++++++
 src/proto/model.proto                      |   382 +
 src/test/dist_test/test_consistency.cc     |   406 +
 src/test/dist_test/test_core.cc            |   192 +
 src/test/dist_test/test_da.cc              |   700 +
 src/test/dist_test/test_dary.cc            |    85 +
 src/test/dist_test/test_disk_table.cc      |   188 +
 src/test/dist_test/test_mnistlayer.cc      |   165 +
 src/test/dist_test/test_model.cc           |    25 +
 src/test/dist_test/test_neuralnet.cc       |   141 +
 src/test/dist_test/test_pm.cc              |    88 +
 src/test/dist_test/test_router.cc          |    27 +
 src/test/dist_test/test_split.cc           |   304 +
 src/test/dist_test/test_table_server.cc    |   357 +
 src/test/dist_test/test_tuple.cc           |   258 +
 src/test/model/test_blob.cc                |    58 +
 src/test/model/test_data_layer.cc          |   178 +
 src/test/model/test_label_source.cc        |    59 +
 src/test/model/test_param.cc               |   138 +
 src/test/model/test_proto.cc               |    67 +
 src/test/model/test_rgb_dir_source.cc      |    63 +
 src/test/test_cluster.cc                   |    95 +
 src/test/test_communication.cc             |   158 +
 src/test/test_shard.cc                     |    56 +
 src/trainer/pm_server.cc                   |    99 +
 src/trainer/pm_worker.cc                   |   344 +
 src/trainer/server.cc                      |    68 +
 src/trainer/trainer.cc                     |   206 +
 src/trainer/worker.cc                      |   299 +
 src/utils/blob.cc                          |   330 +
 src/utils/cluster.cc                       |    52 +
 src/utils/common.cc                        |    89 +
 src/utils/data_shard.cc                    |   207 +
 src/utils/graph.cc                         |   148 +
 src/utils/param.cc                         |   345 +
 src/utils/updater.cc                       |   192 +
 95 files changed, 56061 insertions(+), 1 deletion(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/b2dc51d2/.gitignore
----------------------------------------------------------------------
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..2d070a5
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,28 @@
+*.orig
+*.swp
+*.o
+*.bin
+*.a
+*.so
+*.dat
+*~
+*.bak
+*.P
+*.odp
+*.project
+*.cproject
+*.log
+*.nfs*
+script/*
+!script/*.sh
+!script/*.awk
+src/test/data/*
+tmp
+log*
+build/
+tmp/
+include/proto/*.h
+src/proto/*.cc
+.sync
+*lmdb
+*.binaryproto

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/b2dc51d2/LICENSE.md
----------------------------------------------------------------------
diff --git a/LICENSE.md b/LICENSE.md
new file mode 100644
index 0000000..5f43a73
--- /dev/null
+++ b/LICENSE.md
@@ -0,0 +1,864 @@
+Apache License
+                           Version 2.0, January 2004
+                        http://www.apache.org/licenses/
+
+   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+   1. Definitions.
+
+      "License" shall mean the terms and conditions for use, reproduction,
+      and distribution as defined by Sections 1 through 9 of this document.
+
+      "Licensor" shall mean the copyright owner or entity authorized by
+      the copyright owner that is granting the License.
+
+      "Legal Entity" shall mean the union of the acting entity and all
+      other entities that control, are controlled by, or are under common
+      control with that entity. For the purposes of this definition,
+      "control" means (i) the power, direct or indirect, to cause the
+      direction or management of such entity, whether by contract or
+      otherwise, or (ii) ownership of fifty percent (50%) or more of the
+      outstanding shares, or (iii) beneficial ownership of such entity.
+
+      "You" (or "Your") shall mean an individual or Legal Entity
+      exercising permissions granted by this License.
+
+      "Source" form shall mean the preferred form for making modifications,
+      including but not limited to software source code, documentation
+      source, and configuration files.
+
+      "Object" form shall mean any form resulting from mechanical
+      transformation or translation of a Source form, including but
+      not limited to compiled object code, generated documentation,
+      and conversions to other media types.
+
+      "Work" shall mean the work of authorship, whether in Source or
+      Object form, made available under the License, as indicated by a
+      copyright notice that is included in or attached to the work
+      (an example is provided in the Appendix below).
+
+      "Derivative Works" shall mean any work, whether in Source or Object
+      form, that is based on (or derived from) the Work and for which the
+      editorial revisions, annotations, elaborations, or other modifications
+      represent, as a whole, an original work of authorship. For the purposes
+      of this License, Derivative Works shall not include works that remain
+      separable from, or merely link (or bind by name) to the interfaces of,
+      the Work and Derivative Works thereof.
+
+      "Contribution" shall mean any work of authorship, including
+      the original version of the Work and any modifications or additions
+      to that Work or Derivative Works thereof, that is intentionally
+      submitted to Licensor for inclusion in the Work by the copyright owner
+      or by an individual or Legal Entity authorized to submit on behalf of
+      the copyright owner. For the purposes of this definition, "submitted"
+      means any form of electronic, verbal, or written communication sent
+      to the Licensor or its representatives, including but not limited to
+      communication on electronic mailing lists, source code control systems,
+      and issue tracking systems that are managed by, or on behalf of, the
+      Licensor for the purpose of discussing and improving the Work, but
+      excluding communication that is conspicuously marked or otherwise
+      designated in writing by the copyright owner as "Not a Contribution."
+
+      "Contributor" shall mean Licensor and any individual or Legal Entity
+      on behalf of whom a Contribution has been received by Licensor and
+      subsequently incorporated within the Work.
+
+   2. Grant of Copyright License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      copyright license to reproduce, prepare Derivative Works of,
+      publicly display, publicly perform, sublicense, and distribute the
+      Work and such Derivative Works in Source or Object form.
+
+   3. Grant of Patent License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      (except as stated in this section) patent license to make, have made,
+      use, offer to sell, sell, import, and otherwise transfer the Work,
+      where such license applies only to those patent claims licensable
+      by such Contributor that are necessarily infringed by their
+      Contribution(s) alone or by combination of their Contribution(s)
+      with the Work to which such Contribution(s) was submitted. If You
+      institute patent litigation against any entity (including a
+      cross-claim or counterclaim in a lawsuit) alleging that the Work
+      or a Contribution incorporated within the Work constitutes direct
+      or contributory patent infringement, then any patent licenses
+      granted to You under this License for that Work shall terminate
+      as of the date such litigation is filed.
+
+   4. Redistribution. You may reproduce and distribute copies of the
+      Work or Derivative Works thereof in any medium, with or without
+      modifications, and in Source or Object form, provided that You
+      meet the following conditions:
+
+      (a) You must give any other recipients of the Work or
+          Derivative Works a copy of this License; and
+
+      (b) You must cause any modified files to carry prominent notices
+          stating that You changed the files; and
+
+      (c) You must retain, in the Source form of any Derivative Works
+          that You distribute, all copyright, patent, trademark, and
+          attribution notices from the Source form of the Work,
+          excluding those notices that do not pertain to any part of
+          the Derivative Works; and
+
+      (d) If the Work includes a "NOTICE" text file as part of its
+          distribution, then any Derivative Works that You distribute must
+          include a readable copy of the attribution notices contained
+          within such NOTICE file, excluding those notices that do not
+          pertain to any part of the Derivative Works, in at least one
+          of the following places: within a NOTICE text file distributed
+          as part of the Derivative Works; within the Source form or
+          documentation, if provided along with the Derivative Works; or,
+          within a display generated by the Derivative Works, if and
+          wherever such third-party notices normally appear. The contents
+          of the NOTICE file are for informational purposes only and
+          do not modify the License. You may add Your own attribution
+          notices within Derivative Works that You distribute, alongside
+          or as an addendum to the NOTICE text from the Work, provided
+          that such additional attribution notices cannot be construed
+          as modifying the License.
+
+      You may add Your own copyright statement to Your modifications and
+      may provide additional or different license terms and conditions
+      for use, reproduction, or distribution of Your modifications, or
+      for any such Derivative Works as a whole, provided Your use,
+      reproduction, and distribution of the Work otherwise complies with
+      the conditions stated in this License.
+
+   5. Submission of Contributions. Unless You explicitly state otherwise,
+      any Contribution intentionally submitted for inclusion in the Work
+      by You to the Licensor shall be under the terms and conditions of
+      this License, without any additional terms or conditions.
+      Notwithstanding the above, nothing herein shall supersede or modify
+      the terms of any separate license agreement you may have executed
+      with Licensor regarding such Contributions.
+
+   6. Trademarks. This License does not grant permission to use the trade
+      names, trademarks, service marks, or product names of the Licensor,
+      except as required for reasonable and customary use in describing the
+      origin of the Work and reproducing the content of the NOTICE file.
+
+   7. Disclaimer of Warranty. Unless required by applicable law or
+      agreed to in writing, Licensor provides the Work (and each
+      Contributor provides its Contributions) on an "AS IS" BASIS,
+      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+      implied, including, without limitation, any warranties or conditions
+      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+      PARTICULAR PURPOSE. You are solely responsible for determining the
+      appropriateness of using or redistributing the Work and assume any
+      risks associated with Your exercise of permissions under this License.
+
+   8. Limitation of Liability. In no event and under no legal theory,
+      whether in tort (including negligence), contract, or otherwise,
+      unless required by applicable law (such as deliberate and grossly
+      negligent acts) or agreed to in writing, shall any Contributor be
+      liable to You for damages, including any direct, indirect, special,
+      incidental, or consequential damages of any character arising as a
+      result of this License or out of the use or inability to use the
+      Work (including but not limited to damages for loss of goodwill,
+      work stoppage, computer failure or malfunction, or any and all
+      other commercial damages or losses), even if such Contributor
+      has been advised of the possibility of such damages.
+
+   9. Accepting Warranty or Additional Liability. While redistributing
+      the Work or Derivative Works thereof, You may choose to offer,
+      and charge a fee for, acceptance of support, warranty, indemnity,
+      or other liability obligations and/or rights consistent with this
+      License. However, in accepting such obligations, You may act only
+      on Your own behalf and on Your sole responsibility, not on behalf
+      of any other Contributor, and only if You agree to indemnify,
+      defend, and hold each Contributor harmless for any liability
+      incurred by, or claims asserted against, such Contributor by reason
+      of your accepting any such warranty or additional liability.
+
+   END OF TERMS AND CONDITIONS
+
+   APPENDIX: How to apply the Apache License to your work.
+
+      To apply the Apache License to your work, attach the following
+      boilerplate notice, with the fields enclosed by brackets "{}"
+      replaced with your own identifying information. (Don't include
+      the brackets!)  The text should be enclosed in the appropriate
+      comment syntax for the file format. We also recommend that a
+      file or class name and description of purpose be included on the
+      same "printed page" as the copyright notice for easier
+      identification within third-party archives.
+
+   Copyright [yyyy] [name of copyright owner]
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+============================================================================
+The SINGA project contains subcomponents with separate copyright
+notices and license terms. Your use of the source code for these
+subcomponents is subject to the terms and conditions of the following
+licenses.
+
+============================================================================
+For Caffe (see comments in src/neuralnet/layer.cc)
+===========================================================================
+COPYRIGHT
+All contributions by the University of California:
+Copyright (c) 2014, The Regents of the University of California (Regents)
+All rights reserved.
+All other contributions:
+Copyright (c) 2014, the respective contributors
+All rights reserved.
+Caffe uses a shared copyright model: each contributor holds copyright over
+their contributions to Caffe. The project versioning records all such
+contribution and copyright details. If a contributor wants to further mark
+their specific copyright on a particular contribution, they should indicate
+their copyright solely in the commit message of the change when it is
+committed.
+LICENSE
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+1. Redistributions of source code must retain the above copyright notice, this
+list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright notice,
+this list of conditions and the following disclaimer in the documentation
+and/or other materials provided with the distribution.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+CONTRIBUTION AGREEMENT
+By contributing to the BVLC/caffe repository through pull-request, comment,
+or otherwise, the contributor releases their content to the
+license and copyright terms herein.
+
+=====================================================================
+For czmq
+=====================================================================
+Mozilla Public License Version 2.0
+==================================
+
+1. Definitions
+--------------
+
+1.1. "Contributor"
+    means each individual or legal entity that creates, contributes to
+    the creation of, or owns Covered Software.
+
+1.2. "Contributor Version"
+    means the combination of the Contributions of others (if any) used
+    by a Contributor and that particular Contributor's Contribution.
+
+1.3. "Contribution"
+    means Covered Software of a particular Contributor.
+
+1.4. "Covered Software"
+    means Source Code Form to which the initial Contributor has attached
+    the notice in Exhibit A, the Executable Form of such Source Code
+    Form, and Modifications of such Source Code Form, in each case
+    including portions thereof.
+
+1.5. "Incompatible With Secondary Licenses"
+    means
+
+    (a) that the initial Contributor has attached the notice described
+        in Exhibit B to the Covered Software; or
+
+    (b) that the Covered Software was made available under the terms of
+        version 1.1 or earlier of the License, but not also under the
+        terms of a Secondary License.
+
+1.6. "Executable Form"
+    means any form of the work other than Source Code Form.
+
+1.7. "Larger Work"
+    means a work that combines Covered Software with other material, in
+    a separate file or files, that is not Covered Software.
+
+1.8. "License"
+    means this document.
+
+1.9. "Licensable"
+    means having the right to grant, to the maximum extent possible,
+    whether at the time of the initial grant or subsequently, any and
+    all of the rights conveyed by this License.
+
+1.10. "Modifications"
+    means any of the following:
+
+    (a) any file in Source Code Form that results from an addition to,
+        deletion from, or modification of the contents of Covered
+        Software; or
+
+    (b) any new file in Source Code Form that contains any Covered
+        Software.
+
+1.11. "Patent Claims" of a Contributor
+    means any patent claim(s), including without limitation, method,
+    process, and apparatus claims, in any patent Licensable by such
+    Contributor that would be infringed, but for the grant of the
+    License, by the making, using, selling, offering for sale, having
+    made, import, or transfer of either its Contributions or its
+    Contributor Version.
+
+1.12. "Secondary License"
+    means either the GNU General Public License, Version 2.0, the GNU
+    Lesser General Public License, Version 2.1, the GNU Affero General
+    Public License, Version 3.0, or any later versions of those
+    licenses.
+
+1.13. "Source Code Form"
+    means the form of the work preferred for making modifications.
+
+1.14. "You" (or "Your")
+    means an individual or a legal entity exercising rights under this
+    License. For legal entities, "You" includes any entity that
+    controls, is controlled by, or is under common control with You. For
+    purposes of this definition, "control" means (a) the power, direct
+    or indirect, to cause the direction or management of such entity,
+    whether by contract or otherwise, or (b) ownership of more than
+    fifty percent (50%) of the outstanding shares or beneficial
+    ownership of such entity.
+
+2. License Grants and Conditions
+--------------------------------
+
+2.1. Grants
+
+Each Contributor hereby grants You a world-wide, royalty-free,
+non-exclusive license:
+
+(a) under intellectual property rights (other than patent or trademark)
+    Licensable by such Contributor to use, reproduce, make available,
+    modify, display, perform, distribute, and otherwise exploit its
+    Contributions, either on an unmodified basis, with Modifications, or
+    as part of a Larger Work; and
+
+(b) under Patent Claims of such Contributor to make, use, sell, offer
+    for sale, have made, import, and otherwise transfer either its
+    Contributions or its Contributor Version.
+
+2.2. Effective Date
+
+The licenses granted in Section 2.1 with respect to any Contribution
+become effective for each Contribution on the date the Contributor first
+distributes such Contribution.
+
+2.3. Limitations on Grant Scope
+
+The licenses granted in this Section 2 are the only rights granted under
+this License. No additional rights or licenses will be implied from the
+distribution or licensing of Covered Software under this License.
+Notwithstanding Section 2.1(b) above, no patent license is granted by a
+Contributor:
+
+(a) for any code that a Contributor has removed from Covered Software;
+    or
+
+(b) for infringements caused by: (i) Your and any other third party's
+    modifications of Covered Software, or (ii) the combination of its
+    Contributions with other software (except as part of its Contributor
+    Version); or
+
+(c) under Patent Claims infringed by Covered Software in the absence of
+    its Contributions.
+
+This License does not grant any rights in the trademarks, service marks,
+or logos of any Contributor (except as may be necessary to comply with
+the notice requirements in Section 3.4).
+
+2.4. Subsequent Licenses
+
+No Contributor makes additional grants as a result of Your choice to
+distribute the Covered Software under a subsequent version of this
+License (see Section 10.2) or under the terms of a Secondary License (if
+permitted under the terms of Section 3.3).
+
+2.5. Representation
+
+Each Contributor represents that the Contributor believes its
+Contributions are its original creation(s) or it has sufficient rights
+to grant the rights to its Contributions conveyed by this License.
+
+2.6. Fair Use
+
+This License is not intended to limit any rights You have under
+applicable copyright doctrines of fair use, fair dealing, or other
+equivalents.
+
+2.7. Conditions
+
+Sections 3.1, 3.2, 3.3, and 3.4 are conditions of the licenses granted
+in Section 2.1.
+
+3. Responsibilities
+-------------------
+
+3.1. Distribution of Source Form
+
+All distribution of Covered Software in Source Code Form, including any
+Modifications that You create or to which You contribute, must be under
+the terms of this License. You must inform recipients that the Source
+Code Form of the Covered Software is governed by the terms of this
+License, and how they can obtain a copy of this License. You may not
+attempt to alter or restrict the recipients' rights in the Source Code
+Form.
+
+3.2. Distribution of Executable Form
+
+If You distribute Covered Software in Executable Form then:
+
+(a) such Covered Software must also be made available in Source Code
+    Form, as described in Section 3.1, and You must inform recipients of
+    the Executable Form how they can obtain a copy of such Source Code
+    Form by reasonable means in a timely manner, at a charge no more
+    than the cost of distribution to the recipient; and
+
+(b) You may distribute such Executable Form under the terms of this
+    License, or sublicense it under different terms, provided that the
+    license for the Executable Form does not attempt to limit or alter
+    the recipients' rights in the Source Code Form under this License.
+
+3.3. Distribution of a Larger Work
+
+You may create and distribute a Larger Work under terms of Your choice,
+provided that You also comply with the requirements of this License for
+the Covered Software. If the Larger Work is a combination of Covered
+Software with a work governed by one or more Secondary Licenses, and the
+Covered Software is not Incompatible With Secondary Licenses, this
+License permits You to additionally distribute such Covered Software
+under the terms of such Secondary License(s), so that the recipient of
+the Larger Work may, at their option, further distribute the Covered
+Software under the terms of either this License or such Secondary
+License(s).
+
+3.4. Notices
+
+You may not remove or alter the substance of any license notices
+(including copyright notices, patent notices, disclaimers of warranty,
+or limitations of liability) contained within the Source Code Form of
+the Covered Software, except that You may alter any license notices to
+the extent required to remedy known factual inaccuracies.
+
+3.5. Application of Additional Terms
+
+You may choose to offer, and to charge a fee for, warranty, support,
+indemnity or liability obligations to one or more recipients of Covered
+Software. However, You may do so only on Your own behalf, and not on
+behalf of any Contributor. You must make it absolutely clear that any
+such warranty, support, indemnity, or liability obligation is offered by
+You alone, and You hereby agree to indemnify every Contributor for any
+liability incurred by such Contributor as a result of warranty, support,
+indemnity or liability terms You offer. You may include additional
+disclaimers of warranty and limitations of liability specific to any
+jurisdiction.
+
+4. Inability to Comply Due to Statute or Regulation
+---------------------------------------------------
+
+If it is impossible for You to comply with any of the terms of this
+License with respect to some or all of the Covered Software due to
+statute, judicial order, or regulation then You must: (a) comply with
+the terms of this License to the maximum extent possible; and (b)
+describe the limitations and the code they affect. Such description must
+be placed in a text file included with all distributions of the Covered
+Software under this License. Except to the extent prohibited by statute
+or regulation, such description must be sufficiently detailed for a
+recipient of ordinary skill to be able to understand it.
+
+5. Termination
+--------------
+
+5.1. The rights granted under this License will terminate automatically
+if You fail to comply with any of its terms. However, if You become
+compliant, then the rights granted under this License from a particular
+Contributor are reinstated (a) provisionally, unless and until such
+Contributor explicitly and finally terminates Your grants, and (b) on an
+ongoing basis, if such Contributor fails to notify You of the
+non-compliance by some reasonable means prior to 60 days after You have
+come back into compliance. Moreover, Your grants from a particular
+Contributor are reinstated on an ongoing basis if such Contributor
+notifies You of the non-compliance by some reasonable means, this is the
+first time You have received notice of non-compliance with this License
+from such Contributor, and You become compliant prior to 30 days after
+Your receipt of the notice.
+
+5.2. If You initiate litigation against any entity by asserting a patent
+infringement claim (excluding declaratory judgment actions,
+counter-claims, and cross-claims) alleging that a Contributor Version
+directly or indirectly infringes any patent, then the rights granted to
+You by any and all Contributors for the Covered Software under Section
+2.1 of this License shall terminate.
+
+5.3. In the event of termination under Sections 5.1 or 5.2 above, all
+end user license agreements (excluding distributors and resellers) which
+have been validly granted by You or Your distributors under this License
+prior to termination shall survive termination.
+
+************************************************************************
+*                                                                      *
+*  6. Disclaimer of Warranty                                           *
+*  -------------------------                                           *
+*                                                                      *
+*  Covered Software is provided under this License on an "as is"       *
+*  basis, without warranty of any kind, either expressed, implied, or  *
+*  statutory, including, without limitation, warranties that the       *
+*  Covered Software is free of defects, merchantable, fit for a        *
+*  particular purpose or non-infringing. The entire risk as to the     *
+*  quality and performance of the Covered Software is with You.        *
+*  Should any Covered Software prove defective in any respect, You     *
+*  (not any Contributor) assume the cost of any necessary servicing,   *
+*  repair, or correction. This disclaimer of warranty constitutes an   *
+*  essential part of this License. No use of any Covered Software is   *
+*  authorized under this License except under this disclaimer.         *
+*                                                                      *
+************************************************************************
+
+************************************************************************
+*                                                                      *
+*  7. Limitation of Liability                                          *
+*  --------------------------                                          *
+*                                                                      *
+*  Under no circumstances and under no legal theory, whether tort      *
+*  (including negligence), contract, or otherwise, shall any           *
+*  Contributor, or anyone who distributes Covered Software as          *
+*  permitted above, be liable to You for any direct, indirect,         *
+*  special, incidental, or consequential damages of any character      *
+*  including, without limitation, damages for lost profits, loss of    *
+*  goodwill, work stoppage, computer failure or malfunction, or any    *
+*  and all other commercial damages or losses, even if such party      *
+*  shall have been informed of the possibility of such damages. This   *
+*  limitation of liability shall not apply to liability for death or   *
+*  personal injury resulting from such party's negligence to the       *
+*  extent applicable law prohibits such limitation. Some               *
+*  jurisdictions do not allow the exclusion or limitation of           *
+*  incidental or consequential damages, so this exclusion and          *
+*  limitation may not apply to You.                                    *
+*                                                                      *
+************************************************************************
+
+8. Litigation
+-------------
+
+Any litigation relating to this License may be brought only in the
+courts of a jurisdiction where the defendant maintains its principal
+place of business and such litigation shall be governed by laws of that
+jurisdiction, without reference to its conflict-of-law provisions.
+Nothing in this Section shall prevent a party's ability to bring
+cross-claims or counter-claims.
+
+9. Miscellaneous
+----------------
+
+This License represents the complete agreement concerning the subject
+matter hereof. If any provision of this License is held to be
+unenforceable, such provision shall be reformed only to the extent
+necessary to make it enforceable. Any law or regulation which provides
+that the language of a contract shall be construed against the drafter
+shall not be used to construe this License against a Contributor.
+
+10. Versions of the License
+---------------------------
+
+10.1. New Versions
+
+Mozilla Foundation is the license steward. Except as provided in Section
+10.3, no one other than the license steward has the right to modify or
+publish new versions of this License. Each version will be given a
+distinguishing version number.
+
+10.2. Effect of New Versions
+
+You may distribute the Covered Software under the terms of the version
+of the License under which You originally received the Covered Software,
+or under the terms of any subsequent version published by the license
+steward.
+
+10.3. Modified Versions
+
+If you create software not governed by this License, and you want to
+create a new license for such software, you may create and use a
+modified version of this License if you rename the license and remove
+any references to the name of the license steward (except to note that
+such modified license differs from this License).
+
+10.4. Distributing Source Code Form that is Incompatible With Secondary
+Licenses
+
+If You choose to distribute Source Code Form that is Incompatible With
+Secondary Licenses under the terms of this version of the License, the
+notice described in Exhibit B of this License must be attached.
+
+Exhibit A - Source Code Form License Notice
+-------------------------------------------
+
+  This Source Code Form is subject to the terms of the Mozilla Public
+  License, v. 2.0. If a copy of the MPL was not distributed with this
+  file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+If it is not possible or desirable to put the notice in a particular
+file, then You may include the notice in a location (such as a LICENSE
+file in a relevant directory) where a recipient would be likely to look
+for such a notice.
+
+You may add additional accurate notices of copyright ownership.
+
+Exhibit B - "Incompatible With Secondary Licenses" Notice
+---------------------------------------------------------
+
+  This Source Code Form is "Incompatible With Secondary Licenses", as
+  defined by the Mozilla Public License, v. 2.0.
+
+======================================================================
+For gflags:
+======================================================================
+Copyright (c) 2006, Google Inc.
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+    * Redistributions of source code must retain the above copyright
+    notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above
+    copyright notice, this list of conditions and the following disclaimer
+    in the documentation and/or other materials provided with the
+    distribution.
+    * Neither the name of Google Inc. nor the names of its
+    contributors may be used to endorse or promote products derived from this
+    software without specific prior written permission.
+
+    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+    AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+    IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+    ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+    LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+    CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+    SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+    INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+    CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+    ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+    POSSIBILITY OF SUCH DAMAGE.
+
+======================================================================
+For glog:
+======================================================================
+Copyright (c) 2008, Google Inc.
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+* Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+* Redistributions in binary form must reproduce the above
+copyright notice, this list of conditions and the following disclaimer
+in the documentation and/or other materials provided with the
+distribution.
+* Neither the name of Google Inc. nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+A function gettimeofday in utilities.cc is based on
+http://www.google.com/codesearch/p?hl=en#dR3YEbitojA/COPYING&q=GetSystemTimeAsFileTime%20license:bsd
+The license of this code is:
+Copyright (c) 2003-2008, Jouni Malinen <j...@w1.fi> and contributors
+All Rights Reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+3. Neither the name(s) of the above-listed copyright holder(s) nor the
+names of its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+=====================================================================
+For lmdb
+=====================================================================
+The OpenLDAP Public License
+  Version 2.8, 17 August 2003
+
+  Redistribution and use of this software and associated documentation
+  ("Software"), with or without modification, are permitted provided
+  that the following conditions are met:
+
+  1. Redistributions in source form must retain copyright statements
+     and notices,
+
+  2. Redistributions in binary form must reproduce applicable copyright
+  statements and notices, this list of conditions, and the following disclaimer
+  in the documentation and/or other materials provided with the distribution,
+  and
+
+  3. Redistributions must contain a verbatim copy of this document.
+
+  The OpenLDAP Foundation may revise this license from time to
+  time.
+  Each revision is distinguished by a version number.  You may use
+  this Software under terms of this license revision or under the
+  terms of any subsequent revision of the license.
+
+  THIS SOFTWARE IS PROVIDED BY THE OPENLDAP FOUNDATION AND ITS
+  CONTRIBUTORS ``AS IS'' AND ANY EXPRESSED OR IMPLIED WARRANTIES,
+  INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+  MERCHANTABILITY
+  AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT
+  SHALL THE OPENLDAP FOUNDATION, ITS CONTRIBUTORS, OR THE AUTHOR(S)
+  OR OWNER(S) OF THE SOFTWARE BE LIABLE FOR ANY DIRECT, INDIRECT,
+  INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+  (INCLUDING,
+  BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+  LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+  CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+  STRICT
+  LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+  ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+  POSSIBILITY OF SUCH DAMAGE.
+
+  The names of the authors and copyright holders must not be used
+  in
+  advertising or otherwise to promote the sale, use or other
+  dealing
+  in this Software without specific, written prior permission.
+  Title
+  to copyright in this Software shall at all times remain with
+  copyright
+  holders.
+
+  OpenLDAP is a registered trademark of the OpenLDAP Foundation.
+
+  Copyright 1999-2003 The OpenLDAP Foundation, Redwood City,
+  California, USA.  All Rights Reserved.  Permission to copy and
+  distribute verbatim copies of this document is granted.
+
+======================================================================
+For protobuf:
+=====================================================================
+Copyright 2008, Google Inc.
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+* Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+* Redistributions in binary form must reproduce the above
+copyright notice, this list of conditions and the following disclaimer
+in the documentation and/or other materials provided with the
+distribution.
+* Neither the name of Google Inc. nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+Code generated by the Protocol Buffer compiler is owned by the owner
+of the input file used when generating it. This code is not
+standalone and requires a support library to be linked with it. This
+support library is itself covered by the above license.
+
+========================================================================
+For OpenCV used in preprocessing images
+=======================================================================
+License Agreement
+For Open Source Computer Vision Library
+(3-clause BSD License)
+
+Redistribution and use in source and binary forms, with or without modification,
+are permitted provided that the following conditions are met:
+Redistributions of source code must retain the above copyright notice,
+this list of conditions and the following disclaimer.
+Redistributions in binary form must reproduce the above copyright notice,
+this list of conditions and the following disclaimer in the documentation
+and/or other materials provided with the distribution.
+Neither the names of the copyright holders nor the names of the contributors
+may be used to endorse or promote products derived from this software without
+specific prior written permission.
+
+This software is provided by the copyright holders and contributors “as is”
+and any express or implied warranties, including, but not limited to, the
+implied warranties of merchantability and fitness for a particular purpose are
+disclaimed. In no event shall copyright holders or contributors be liable for
+any direct, indirect, incidental, special, exemplary, or consequential damages
+(including, but not limited to, procurement of substitute goods or services;
+loss of use, data, or profits; or business interruption) however caused and on
+any theory of liability, whether in contract, strict liability, or tort
+(including negligence or otherwise) arising in any way out of the use of this
+software, even if advised of the possibility of such damage.
+
+
+========================================================================
+For Openblas
+=======================================================================
+Copyright (c) 2011-2014, The OpenBLAS Project
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+  1. Redistributions of source code must retain the above copyright
+         notice, this list of conditions and the following disclaimer.
+
+  2. Redistributions in binary form must reproduce the above copyright notice,
+  this list of conditions and the following disclaimer in the documentation
+  and/or other materials provided with the distribution.
+  3. Neither the name of the OpenBLAS project nor the names of its contributors
+  may be used to endorse or promote products derived from this software without
+  specific prior written permission.
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+  AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+  IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+  ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+  LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+  CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+  SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+  INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+  CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+  ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+  POSSIBILITY OF SUCH DAMAGE.

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/b2dc51d2/Makefile
----------------------------------------------------------------------
diff --git a/Makefile b/Makefile
new file mode 100644
index 0000000..d29c933
--- /dev/null
+++ b/Makefile
@@ -0,0 +1,90 @@
+###################User Config Variables #############################
+HOME_DIR := /usr/
+# Lib folder for system and external libs. You may need to change it.
+LIBRARY_DIRS := $(HOME_DIR)/lib64 $(HOME_DIR)/lib $(HOME_DIR)/local/lib
+# Header folder for system and external libs. You may need to change it.
+INCLUDE_DIRS := $(HOME_DIR)/include ./include
+# g++ location, should support c++11, tested with 4.8.1
+CXX := g++
+
+######################Setting Variables#######################################
+LIBRARIES := glog gflags protobuf rt opencv_highgui opencv_imgproc opencv_core\
+	lmdb openblas zmq czmq
+
+LDFLAGS := $(foreach librarydir, $(LIBRARY_DIRS), -L$(librarydir))\
+	$(foreach library, $(LIBRARIES), -l$(library))
+# Folder to store compiled files
+BUILD_DIR := build
+MSHADOW_FLAGS :=-DMSHADOW_USE_CUDA=0 -DMSHADOW_USE_CBLAS=1 -DMSHADOW_USE_MKL=0
+CXXFLAGS := -O2 -Wall -pthread -fPIC -std=c++11 -Wno-unknown-pragmas \
+	$(MSHADOW_FLAGS) -DCPU_ONLY=1 \
+	-funroll-loops $(foreach includedir, $(INCLUDE_DIRS), -I$(includedir))
+
+# Find the user-defined .proto files, then derive the names of the matching
+# .pb.h/.pb.cc files; those cannot be located with shell find because they
+# have not been generated yet.
+PROTOS := $(shell find src/proto/ -name "*.proto")
+PROTO_SRCS :=$(PROTOS:.proto=.pb.cc)
+PROTO_HDRS :=$(patsubst src%, include%, $(PROTOS:.proto=.pb.h))
+PROTO_OBJS :=$(addprefix $(BUILD_DIR)/, $(PROTO_SRCS:.cc=.o))
+
+# each singa src file will generate a .o file
+SINGA_SRCS := $(shell find src/ \( -path "src/test" -o -path "src/main.cc" \) \
+	-prune -o \( -name "*.cc" -type f \) -print )
+SINGA_OBJS := $(sort $(addprefix $(BUILD_DIR)/, $(SINGA_SRCS:.cc=.o)) \
+	$(PROTO_OBJS) )
+-include $(SINGA_OBJS:%.o=%.P)
+
+TEST_SRCS :=$(shell find src/test/ -maxdepth 1 -name "*.cc")
+TEST_OBJS := $(sort $(addprefix $(BUILD_DIR)/, $(TEST_SRCS:.cc=.o)))
+-include $(TEST_OBJS:%.o=%.P)
+
+GTEST_SRC := include/gtest/gtest-all.cc
+GTEST_HDR := include/gtest/gtest.h
+GTEST_LIB := $(BUILD_DIR)/libgtest.a
+
+OBJS := $(sort $(SINGA_OBJS) $(TEST_OBJS) )
+
+########################Compilation Section###################################
+.PHONY: singa loader test proto clean  # none of these names is a file its recipe creates
+
+singa: $(PROTO_OBJS) $(SINGA_OBJS)
+	$(CXX) $(SINGA_OBJS) src/main.cc -o $(BUILD_DIR)/singa $(CXXFLAGS) $(LDFLAGS)
+	@echo
+
+loader: proto $(LOADER_OBJS)
+	$(CXX) $(LOADER_OBJS) -o $(BUILD_DIR)/loader $(CXXFLAGS) $(LDFLAGS)
+	@echo
+
+test:  proto $(GTEST_LIB) $(TEST_OBJS) $(SINGA_OBJS)
+	$(CXX) $(TEST_OBJS) include/gtest/gtest_main.cc $(GTEST_LIB) \
+		$(SINGA_OBJS) -o $(BUILD_DIR)/test $(CXXFLAGS) $(LDFLAGS)
+	@echo
+
+$(GTEST_LIB): $(GTEST_HDR) $(GTEST_SRC)
+	$(CXX) $(GTEST_SRC) -c -o $(BUILD_DIR)/gtest-all.o $(CXXFLAGS)
+	ar -rv $(GTEST_LIB) $(BUILD_DIR)/gtest-all.o
+
+# compile every object into $(BUILD_DIR); the compiler's .d output is rewritten into a .P dependency file
+$(OBJS):$(BUILD_DIR)/%.o : %.cc
+	@mkdir -p $(dir $@)
+	$(CXX) $<  $(CXXFLAGS) -MMD -c -o $@
+	cp $(BUILD_DIR)/$*.d $(BUILD_DIR)/$*.P; \
+	sed -e 's/#.*//' -e 's/^[^:]*: *//' -e 's/ *\\$$//' \
+		-e '/^$$/ d' -e 's/$$/ :/' < $(BUILD_DIR)/$*.d >> $(BUILD_DIR)/$*.P; \
+	rm -f $(BUILD_DIR)/$*.d
+
+proto: $(PROTO_OBJS)
+
+$(PROTO_SRCS): $(PROTOS)
+	protoc --proto_path=src/proto --cpp_out=src/proto $(PROTOS)
+	mkdir -p include/proto/
+	cp src/proto/*.pb.h include/proto/
+	@echo
+
+clean:
+	rm -rf *.a *.so
+	rm -rf include/proto/*
+	rm -rf src/proto/*.pb.h src/proto/*.pb.cc
+	rm -rf $(BUILD_DIR)
+	@echo

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/b2dc51d2/README.md
----------------------------------------------------------------------
diff --git a/README.md b/README.md
index 9d763f6..31ec68c 100644
--- a/README.md
+++ b/README.md
@@ -1,4 +1,6 @@
-SINGA
+Apache SINGA
 =====
 
 Distributed deep learning system
+
+[Project Page](http://singa.incubator.apache.org)

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/b2dc51d2/examples/cifar10/cluster.conf
----------------------------------------------------------------------
diff --git a/examples/cifar10/cluster.conf b/examples/cifar10/cluster.conf
new file mode 100644
index 0000000..1953d1d
--- /dev/null
+++ b/examples/cifar10/cluster.conf
@@ -0,0 +1,3 @@
+nworker_groups: 1
+nserver_groups: 1
+workspace: "/data1/wangwei/singa/data/mnist"

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/b2dc51d2/examples/cifar10/hostfile
----------------------------------------------------------------------
diff --git a/examples/cifar10/hostfile b/examples/cifar10/hostfile
new file mode 100644
index 0000000..83e06e5
--- /dev/null
+++ b/examples/cifar10/hostfile
@@ -0,0 +1,20 @@
+awan-2-26-0
+awan-2-27-0
+awan-2-28-0
+awan-2-29-0
+awan-2-30-0
+awan-2-31-0
+awan-2-32-0
+awan-2-33-0
+awan-2-34-0
+awan-2-35-0
+awan-2-36-0
+awan-2-37-0
+awan-2-38-0
+awan-2-39-0
+awan-2-40-0
+awan-2-41-0
+awan-2-42-0
+awan-2-43-0
+awan-2-44-0
+awan-2-45-0

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/b2dc51d2/examples/cifar10/model.conf
----------------------------------------------------------------------
diff --git a/examples/cifar10/model.conf b/examples/cifar10/model.conf
new file mode 100644
index 0000000..9979c3b
--- /dev/null
+++ b/examples/cifar10/model.conf
@@ -0,0 +1,219 @@
+name: "cifar10-convnet"
+train_steps: 70000
+test_steps:100
+test_frequency:1000
+display_frequency:50
+updater{
+  momentum:0.9
+  weight_decay:0.004
+  learning_rate_change_method:kFixedStep
+  step:0
+  step:60000
+  step:65000
+  step_lr:0.001
+  step_lr:0.0001
+  step_lr:0.00001
+}
+neuralnet {
+layer {
+  name: "data"
+  type: "kLMDBData"
+  data_param {
+    path: "/home/wangwei/program/singa/examples/cifar10/cifar10_train_lmdb"
+    batchsize: 100
+    random_skip:10000
+  }
+  exclude: kTest
+}
+
+layer {
+  name: "data"
+  type: "kLMDBData"
+  data_param {
+    path: "/home/wangwei/program/singa/examples/cifar10/cifar10_test_lmdb"
+    batchsize: 100
+  }
+  exclude: kTrain
+}
+
+layer{
+  name:"rgb"
+  type: "kRGBImage"
+  srclayers: "data"
+  rgbimage_param {
+    meanfile: "/home/wangwei/program/singa/examples/cifar10/mean.binaryproto"
+  }
+}
+
+layer{
+  name: "label"
+  type: "kLabel"
+  srclayers: "data"
+}
+layer {
+  name: "conv1"
+  type: "kConvolution"
+  srclayers: "rgb"
+  convolution_param {
+    num_filters: 32
+    kernel: 5
+    stride: 1
+    pad:2
+  }
+  param{
+      name: "weight"
+      init_method:kGaussian
+      std:0.0001
+      learning_rate_multiplier:1.0
+    }
+  param{
+      name: "bias"
+      init_method: kConstant
+      learning_rate_multiplier:2.0
+      value:0
+    }
+}
+layer {
+  name: "pool1"
+  type: "kPooling"
+  srclayers: "conv1"
+  pooling_param {
+    pool: MAX
+    kernel: 3
+    stride: 2
+  }
+}
+layer {
+  name: "relu1"
+  type: "kReLU"
+  srclayers:"pool1"
+}
+layer {
+  name: "norm1"
+  type: "kLRN"
+  lrn_param {
+    norm_region: WITHIN_CHANNEL
+    local_size: 3
+    alpha: 5e-05
+    beta: 0.75
+  }
+  srclayers:"relu1"
+}
+layer {
+  name: "conv2"
+  type: "kConvolution"
+  srclayers: "norm1"
+  convolution_param {
+    num_filters: 32
+    kernel: 5
+    stride: 1
+    pad:2
+  }
+  param{
+      name: "weight"
+      init_method:kGaussian
+      std:0.01
+      learning_rate_multiplier:1.0
+    }
+  param{
+      name: "bias"
+      init_method: kConstant
+      learning_rate_multiplier:2.0
+      value:0
+    }
+}
+layer {
+  name: "relu2"
+  type: "kReLU"
+  srclayers:"conv2"
+}
+layer {
+  name: "pool2"
+  type: "kPooling"
+  srclayers: "relu2"
+  pooling_param {
+    pool: MAX
+    kernel: 3
+    stride: 2
+  }
+}
+layer {
+  name: "norm2"
+  type: "kLRN"
+  lrn_param {
+    norm_region: WITHIN_CHANNEL
+    local_size: 3
+    alpha: 5e-05
+    beta: 0.75
+  }
+  srclayers:"pool2"
+}
+layer {
+  name: "conv3"
+  type: "kConvolution"
+  srclayers: "norm2"
+  convolution_param {
+    num_filters: 64
+    kernel: 5
+    stride: 1
+    pad:2
+  }
+  param{
+      name: "weight"
+      init_method:kGaussian
+      std:0.01
+    }
+  param{
+      name: "bias"
+      init_method: kConstant
+      value:0
+    }
+}
+layer {
+  name: "relu3"
+  type: "kReLU"
+  srclayers:"conv3"
+}
+layer {
+  name: "pool3"
+  type: "kPooling"
+  srclayers: "relu3"
+  pooling_param {
+    pool: AVE
+    kernel: 3
+    stride: 2
+  }
+}
+layer {
+  name: "ip1"
+  type: "kInnerProduct"
+  srclayers:"pool3"
+  inner_product_param {
+    num_output: 10
+  }
+  param{
+      name: "weight"
+      init_method:kGaussian
+      std:0.01
+      learning_rate_multiplier:1.0
+      weight_decay_multiplier:250
+    }
+  param{
+      name: "bias"
+      init_method: kConstant
+      learning_rate_multiplier:2.0
+      weight_decay_multiplier:0
+      value:0
+  }
+}
+
+layer{
+  name: "loss"
+  type:"kSoftmaxLoss"
+  softmaxloss_param{
+    topk:1
+  }
+  srclayers:"ip1"
+  srclayers:"label"
+}
+}

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/b2dc51d2/examples/cifar10/run.sh
----------------------------------------------------------------------
diff --git a/examples/cifar10/run.sh b/examples/cifar10/run.sh
new file mode 100755
index 0000000..eb64047
--- /dev/null
+++ b/examples/cifar10/run.sh
@@ -0,0 +1,50 @@
+#!/bin/bash
+
+if [ $# -ne 2 ];then
+  echo "Usage: run.sh [start|stop] num_procs"
+  exit
+fi
+
+netconf=conv.conf
+
+script_path=`readlink -f $0`
+script_dir=`dirname $script_path`
+example_dir=`dirname $script_dir`
+singa_dir=`dirname $example_dir`
+exec_path=${singa_dir}/build/singa
+host_path=$script_dir/hostfile
+ssh_options="-oStrictHostKeyChecking=no \
+-oUserKnownHostsFile=/dev/null \
+-oLogLevel=quiet"
+
+hosts=(`cat $host_path |cut -d ' ' -f 1`)
+if [ $1 == "start" ]
+then
+  count=0
+  for i in ${hosts[@]}
+  do
+    cmd="touch $singa_dir/$count.lock;\
+      $exec_path \
+      -procsID=$count \
+      -hostfile=$host_path \
+      -cluster_conf=$script_dir/cluster.conf \
+      -model_conf=$script_dir/$netconf; rm -f $singa_dir/$count.lock"
+    echo $cmd
+    ssh $ssh_options $i $cmd &
+    count=$(($count+1))
+    if [ $count -eq $2 ]
+    then
+      exit
+    fi
+  done
+elif [ $1 == "stop" ]
+then
+  for (( idx=$2-1 ; idx>=0 ; idx-- ))
+  do
+    echo "ssh ${hosts[$idx]} \"kill singa\""
+    ssh $ssh_options ${hosts[$idx]} "killall -q singa"
+    sleep 1
+  done
+fi
+
+

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/b2dc51d2/examples/mnist/batch.sh
----------------------------------------------------------------------
diff --git a/examples/mnist/batch.sh b/examples/mnist/batch.sh
new file mode 100644
index 0000000..ff6b8b4
--- /dev/null
+++ b/examples/mnist/batch.sh
@@ -0,0 +1,41 @@
+#!/bin/bash
+
+for nservers in 1
+do
+  for nthreads in 2 4
+  do
+    for nworkers in 1 2 4 8 16
+    do
+      echo "nworkers: $nworkers" >examples/mnist/cluster.conf
+      echo "nservers: $nservers" >>examples/mnist/cluster.conf
+      echo "nthreads_per_server: $nthreads" >>examples/mnist/cluster.conf
+      echo  "workspace:\" /data1/wangwei/singa\"">>examples/mnist/cluster.conf
+      cat examples/mnist/cluster.conf
+      nprocs=$(($nworkers+$nservers))
+      log=log1k/${nworkers}w${nservers}s${nthreads}t
+      echo  $log $nprocs
+      ./examples/mnist/run.sh start $nprocs >$log 2>&1
+      sleep 4
+
+      while true
+      do
+        nstopped=0
+        to=$(($nprocs-1))
+        for worker in $(eval echo "{0..$to}")
+        do
+          if [ ! -e /home/wangwei/program/singa/$worker.lock ]
+          then
+            echo "$worker.lock is free"
+            nstopped=$(($nstopped+1))
+          fi
+        done
+        if [ $nstopped -eq $(($nprocs)) ]
+        then
+          break
+        else
+          sleep 5
+        fi
+      done
+    done
+  done
+done

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/b2dc51d2/examples/mnist/cluster.conf
----------------------------------------------------------------------
diff --git a/examples/mnist/cluster.conf b/examples/mnist/cluster.conf
new file mode 100644
index 0000000..1953d1d
--- /dev/null
+++ b/examples/mnist/cluster.conf
@@ -0,0 +1,3 @@
+nworker_groups: 1
+nserver_groups: 1
+workspace: "/data1/wangwei/singa/data/mnist"

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/b2dc51d2/examples/mnist/conv.conf
----------------------------------------------------------------------
diff --git a/examples/mnist/conv.conf b/examples/mnist/conv.conf
new file mode 100644
index 0000000..5f3bf58
--- /dev/null
+++ b/examples/mnist/conv.conf
@@ -0,0 +1,175 @@
+name: "mnist-conv"
+train_steps: 10000
+test_steps:100
+test_frequency:500
+display_frequency:50
+debug: false
+updater{
+  base_learning_rate:0.01
+  momentum:0.9
+  weight_decay:0.0005
+  gamma:0.0001
+  pow:0.75
+  learning_rate_change_method:kInverse
+}
+neuralnet {
+layer {
+  name: "data"
+  type: "kLMDBData"
+  data_param {
+    path: "/home/wangwei/program/singa/examples/mnist/mnist_train_lmdb"
+    batchsize: 64
+  }
+  exclude: kTest
+}
+
+layer {
+  name: "data"
+  type: "kLMDBData"
+  data_param {
+    path: "/home/wangwei/program/singa/examples/mnist/mnist_test_lmdb"
+    batchsize: 100
+  }
+  exclude: kTrain
+}
+
+layer{
+  name:"mnist"
+  type: "kMnistImage"
+  srclayers: "data"
+  mnist_param {
+#    sigma: 6
+#    alpha: 38
+#    gamma: 15
+#    kernel: 21
+#    elastic_freq:100
+#    beta:15
+#    resize: 29
+    norm_a:255
+  }
+}
+
+
+layer{
+  name: "label"
+  type: "kLabel"
+  srclayers: "data"
+}
+layer {
+  name: "conv1"
+  type: "kConvolution"
+  srclayers: "mnist"
+  convolution_param {
+    num_filters: 20
+    kernel: 5
+    stride: 1
+  }
+  param{
+      name: "weight"
+      init_method:kUniformSqrtFanIn
+      learning_rate_multiplier:1.0
+    }
+  param{
+      name: "bias"
+      init_method: kConstant
+      learning_rate_multiplier:2.0
+      value:0
+    }
+}
+layer {
+  name: "pool1"
+  type: "kPooling"
+  srclayers: "conv1"
+  pooling_param {
+    pool: MAX
+    kernel: 2
+    stride: 2
+  }
+}
+layer {
+  name: "conv2"
+  type: "kConvolution"
+  srclayers: "pool1"
+  convolution_param {
+    num_filters: 50
+    kernel: 5
+    stride: 1
+  }
+  param{
+      name: "weight"
+      init_method:kUniformSqrtFanIn
+      learning_rate_multiplier:1.0
+    }
+  param{
+      name: "bias"
+      init_method: kConstant
+      learning_rate_multiplier:2.0
+      value:0
+    }
+}
+layer {
+  name: "pool2"
+  type: "kPooling"
+  srclayers: "conv2"
+  pooling_param {
+    pool: MAX
+    kernel: 2
+    stride: 2
+  }
+}
+layer {
+  name: "ip1"
+  type: "kInnerProduct"
+  srclayers:"pool2"
+  inner_product_param {
+    num_output: 500
+  }
+  param{
+      name: "weight"
+      init_method:kUniformSqrtFanIn
+      learning_rate_multiplier:1.0
+    }
+  param{
+      name: "bias"
+      init_method: kConstant
+      learning_rate_multiplier:2.0
+      value:0
+  }
+
+}
+
+layer {
+  name: "relu1"
+  type: "kReLU"
+  srclayers:"ip1"
+}
+
+layer {
+  name: "ip2"
+  type: "kInnerProduct"
+  srclayers:"relu1"
+  inner_product_param {
+    num_output: 10
+  }
+  param{
+      name: "weight"
+      init_method:kUniformSqrtFanIn
+      learning_rate_multiplier:1
+    }
+  param{
+      name: "bias"
+      init_method: kConstant
+      learning_rate_multiplier:2
+      value:0
+    }
+}
+layer{
+  name: "loss"
+  type:"kSoftmaxLoss"
+  softmaxloss_param{
+    topk:1
+  }
+  srclayers:"ip2"
+  srclayers:"label"
+}
+}

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/b2dc51d2/examples/mnist/hostfile
----------------------------------------------------------------------
diff --git a/examples/mnist/hostfile b/examples/mnist/hostfile
new file mode 100644
index 0000000..1781444
--- /dev/null
+++ b/examples/mnist/hostfile
@@ -0,0 +1,8 @@
+192.168.26.10
+192.168.26.11
+192.168.26.12
+192.168.26.13
+192.168.26.15
+192.168.26.16
+192.168.26.17
+192.168.26.18

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/b2dc51d2/examples/mnist/mlp.conf
----------------------------------------------------------------------
diff --git a/examples/mnist/mlp.conf b/examples/mnist/mlp.conf
new file mode 100644
index 0000000..d0ed08f
--- /dev/null
+++ b/examples/mnist/mlp.conf
@@ -0,0 +1,223 @@
+name: "deep-big-simple-mlp"
+train_steps: 10000
+test_steps:10
+test_frequency:60
+display_frequency:30
+checkpoint_frequency:120
+updater{
+  base_learning_rate: 0.001
+  learning_rate_change_method: kStep
+  learning_rate_change_frequency: 60
+  gamma: 0.997
+  param_type: "Param"
+}
+
+neuralnet {
+layer {
+  name: "data"
+  type: "kLMDBData"
+  data_param {
+    path: "/home/wangwei/program/singa/examples/mnist/mnist_train_lmdb"
+    batchsize: 1000
+    random_skip: 10000
+  }
+  exclude: kTest
+}
+
+layer {
+  name: "data"
+  type: "kLMDBData"
+  data_param {
+    path: "/home/wangwei/program/singa/examples/mnist/mnist_test_lmdb"
+    batchsize: 1000
+  }
+  exclude: kTrain
+}
+
+layer{
+  name:"mnist"
+  type: "kMnistImage"
+  srclayers: "data"
+  mnist_param {
+#    sigma: 6
+#    alpha: 38
+#    gamma: 15
+#    kernel: 21
+#    elastic_freq:100
+#    beta:15
+#    resize: 29
+    norm_a: 127.5
+    norm_b: 1
+  }
+}
+
+
+layer{
+  name: "label"
+  type: "kLabel"
+  srclayers: "data"
+}
+
+layer{
+  name: "fc1"
+  type: "kInnerProduct"
+  srclayers:"mnist"
+  inner_product_param{
+    num_output: 2500
+  }
+  param{
+    name: "weight"
+    init_method: kUniform
+    low:-0.05
+    high:0.05
+  }
+  param{
+    name: "bias"
+    init_method: kUniform
+    low: -0.05
+    high:0.05
+  }
+}
+
+layer{
+  name: "tanh1"
+  type:"kTanh"
+  srclayers:"fc1"
+}
+layer{
+  name: "fc2"
+  type: "kInnerProduct"
+  srclayers:"tanh1"
+  inner_product_param{
+    num_output: 2000
+  }
+  param{
+    name: "weight"
+    init_method: kUniform
+    low:-0.05
+    high:0.05
+  }
+  param{
+    name: "bias"
+    init_method: kUniform
+    low: -0.05
+    high:0.05
+  }
+}
+
+layer{
+  name: "tanh2"
+  type:"kTanh"
+  srclayers:"fc2"
+}
+layer{
+  name: "fc3"
+  type: "kInnerProduct"
+  srclayers:"tanh2"
+  inner_product_param{
+    num_output: 1500
+  }
+  param{
+    name: "weight"
+    init_method: kUniform
+    low:-0.05
+    high:0.05
+  }
+  param{
+    name: "bias"
+    init_method: kUniform
+    low: -0.05
+    high:0.05
+  }
+
+}
+
+layer{
+  name: "tanh3"
+  type:"kTanh"
+  srclayers:"fc3"
+}
+layer{
+  name: "fc4"
+  type: "kInnerProduct"
+  srclayers:"tanh3"
+  inner_product_param{
+    num_output: 1000
+  }
+  param{
+    name: "weight"
+    init_method: kUniform
+    low:-0.05
+    high:0.05
+  }
+  param{
+    name: "bias"
+    init_method: kUniform
+    low: -0.05
+    high:0.05
+  }
+
+}
+
+layer{
+  name: "tanh4"
+  type:"kTanh"
+  srclayers:"fc4"
+}
+layer{
+  name: "fc5"
+  type: "kInnerProduct"
+  srclayers:"tanh4"
+  inner_product_param{
+    num_output: 500
+  }
+  param{
+    name: "weight"
+    init_method: kUniform
+    low:-0.05
+    high:0.05
+  }
+  param{
+    name: "bias"
+    init_method: kUniform
+    low: -0.05
+    high:0.05
+  }
+
+}
+
+layer{
+  name: "tanh5"
+  type:"kTanh"
+  srclayers:"fc5"
+}
+layer{
+  name: "fc6"
+  type: "kInnerProduct"
+  srclayers:"tanh5"
+  inner_product_param{
+    num_output: 10
+  }
+  param{
+    name: "weight"
+    init_method: kUniform
+    low:-0.05
+    high:0.05
+  }
+  param{
+    name: "bias"
+    init_method: kUniform
+    low: -0.05
+    high:0.05
+  }
+}
+layer{
+  name: "loss"
+  type:"kSoftmaxLoss"
+  softmaxloss_param{
+    topk:1
+  }
+  srclayers:"fc6"
+  srclayers:"label"
+}
+}

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/b2dc51d2/examples/mnist/run.sh
----------------------------------------------------------------------
diff --git a/examples/mnist/run.sh b/examples/mnist/run.sh
new file mode 100755
index 0000000..1b16ca9
--- /dev/null
+++ b/examples/mnist/run.sh
@@ -0,0 +1,40 @@
+#!/bin/bash
+# Start or stop the SINGA "pm" process on the first num_procs hosts of hostfile.
+if [ $# -ne 2 ]; then
+  echo "Usage: run.sh [start|stop] num_procs"
+  exit 1
+fi
+
+# All paths are derived from this script's location (<singa>/examples/mnist).
+script_dir=$(dirname "$(readlink -f "$0")")
+singa_dir=$(dirname "$(dirname "$script_dir")")
+exec_path=${singa_dir}/build/pm
+host_path=$script_dir/hostfile
+model_path=$script_dir/mlp.conf
+cluster_path=$script_dir/cluster.conf
+ssh_options="-oStrictHostKeyChecking=no -oUserKnownHostsFile=/dev/null"
+
+# One host per line; only the first space-separated field is used.
+hosts=($(cut -d ' ' -f 1 "$host_path"))
+if [ "$2" -gt "${#hosts[@]}" ]; then
+  echo "num_procs ($2) exceeds the ${#hosts[@]} hosts in $host_path"
+  exit 1
+fi
+
+if [ "$1" == "start" ]; then
+  rm -rf "$singa_dir"/log*
+  for (( i=0; i<$2; i++ )); do
+    # Process i runs on hosts[i] and gets the model/cluster configs set above.
+    cmd="source ~/.bash_profile; touch $singa_dir/$i.lock;\
+      $exec_path --hostfile=$host_path --procs_id=$i\
+      --model=${model_path} --cluster=${cluster_path}"
+    echo ${hosts[$i]} $ssh_options $cmd
+    ssh $ssh_options ${hosts[$i]} "$cmd" &
+  done
+elif [ "$1" == "stop" ]; then
+  for (( idx=0; idx<$2; idx++ )); do
+    echo "ssh ${hosts[$idx]} \"kill pm\""
+    ssh $ssh_options ${hosts[$idx]} "killall -q pm"
+    sleep 1
+  done
+fi

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/b2dc51d2/include/communication/msg.h
----------------------------------------------------------------------
diff --git a/include/communication/msg.h b/include/communication/msg.h
new file mode 100644
index 0000000..54b4601
--- /dev/null
+++ b/include/communication/msg.h
@@ -0,0 +1,187 @@
+#ifndef INCLUDE_COMMUNICATION_MSG_H_
+#define INCLUDE_COMMUNICATION_MSG_H_
+#include <string>
+#include <czmq.h>
+#include <glog/logging.h>
+
+using std::string;
+namespace singa {
+class BaseMsg{
+ public:
+  /** Virtual destructor so that derived messages can free their memory. */
+  virtual ~BaseMsg() {}
+  /**
+   * Set the source / destination address.
+   * @param group_id worker/server group id
+   * @param id worker/server id within the group
+   * @param flag 0 for server, 1 for worker, 2 for stub
+   */
+  virtual void set_src(int group_id, int id, int flag) = 0;
+  virtual void set_dst(int group_id, int id, int flag) = 0;
+  /** Overloads that take a procs_id in place of (group_id, id). */
+  virtual void set_src(int procs_id, int flag) = 0;
+  virtual void set_dst(int procs_id, int flag) = 0;
+
+  /** Getters for the individual address fields. */
+  virtual int src_group_id() const = 0;
+  virtual int dst_group_id() const = 0;
+  virtual int src_id() const = 0;
+  virtual int dst_id() const = 0;
+  virtual int src_flag() const = 0;
+  virtual int dst_flag() const = 0;
+
+  /** Accessors for the message type and target fields. */
+  virtual void set_type(int type) = 0;
+  virtual int type() const = 0;
+  virtual void set_target(int target) = 0;
+  virtual int target() const = 0;
+
+  /** Copy src and dst address, including group_id, id, flag. */
+  virtual BaseMsg* CopyAddr() = 0;
+  virtual void SetAddr(BaseMsg* msg) = 0;
+
+  /** Add a frame (a chunk of bytes) into the message. */
+  virtual void add_frame(const void*, int nBytes) = 0;
+  virtual int frame_size() = 0;
+  virtual void* frame_data() = 0;
+  /**
+   * Move the cursor to the next frame.
+   * @return true if the next frame is not NULL; otherwise false
+   */
+  virtual bool next_frame() = 0;
+};
+
+// TODO make it a compiler argument
+#define USE_ZMQ
+
+#ifdef USE_ZMQ
+/**
+ * Message implemented on top of a czmq zmsg_t.
+ *
+ * src_ and dst_ pack an address as group_id (bits 16 and up) | id (bits 4-15)
+ * | flag (bits 0-3); target_ packs type (bits 24 and up) | target (bits 0-23).
+ * The address setters do not range-check their arguments, so callers must
+ * keep each value within its bit field.
+ *
+ * NOTE: msg_ is owned through a raw pointer and the class is copyable, so a
+ * copied Msg would destroy the same zmsg_t twice; handle Msg by pointer.
+ */
+class Msg : public BaseMsg{
+ public:
+  /** Start with an empty zmsg_t, zeroed address/type fields and no frame. */
+  Msg() : src_(0), dst_(0), target_(0), msg_(zmsg_new()), frame_(NULL) {}
+  /** Free the zmsg_t unless DumpToZmsg() already handed it to the caller. */
+  virtual ~Msg(){
+    if(msg_!=NULL)
+      zmsg_destroy(&msg_);
+  }
+
+  virtual void set_src(int group_id, int id, int flag){
+    src_=(group_id<<kOff1)|(id<<kOff2)|flag;
+  }
+  virtual void set_dst(int group_id, int id, int flag){
+    dst_=(group_id<<kOff1)|(id<<kOff2)|flag;
+  }
+  /** Two-argument form: procs_id goes into the group_id field and id is 0. */
+  virtual void set_src(int procs_id, int flag){ set_src(procs_id, 0, flag); }
+  virtual void set_dst(int procs_id, int flag){ set_dst(procs_id, 0, flag); }
+
+  /** Raw packed source / destination address. */
+  int src() const { return src_; }
+  int dst() const { return dst_; }
+
+  virtual int src_group_id() const { return src_>>kOff1; }
+  virtual int dst_group_id() const { return dst_>>kOff1; }
+  virtual int src_id() const { return (src_&kMask1)>>kOff2; }
+  virtual int dst_id() const { return (dst_&kMask1)>>kOff2; }
+  virtual int src_flag() const { return src_&kMask2; }
+  virtual int dst_flag() const { return dst_&kMask2; }
+
+  /** Exchange the source and destination addresses. */
+  void SwapAddr(){
+    std::swap(src_,dst_);
+  }
+
+  /** Set the type (bits kOff3 and up of target_), keeping the target. */
+  virtual void set_type(int type){
+    target_=(type<<kOff3)|(target_&kMask3);
+  }
+  /**
+   * Set the target (low kOff3 bits of target_), keeping the type. The value
+   * is masked so that an oversized target cannot overwrite the type bits.
+   */
+  virtual void set_target(int target){
+    target_=((target_>>kOff3)<<kOff3)|(target&kMask3);
+  }
+  virtual int type() const { return target_>>kOff3; }
+  virtual int target() const { return target_&kMask3; }
+
+  /** @return a new Msg (allocated with new) carrying the same src/dst. */
+  virtual BaseMsg* CopyAddr(){
+    Msg* msg=new Msg();
+    msg->src_=src_;
+    msg->dst_=dst_;
+    return msg;
+  }
+
+  /** Copy src/dst from msg, which must really be a Msg (unchecked cast). */
+  virtual void SetAddr(BaseMsg* msg){
+    const Msg* other=static_cast<Msg*>(msg);
+    src_=other->src_;
+    dst_=other->dst_;
+  }
+
+  /** Add nBytes bytes starting at addr to the end of the message as a frame. */
+  virtual void add_frame(const void* addr, int nBytes){
+    zmsg_addmem(msg_, addr, nBytes);
+  }
+  /** Size / data of the current frame; frame_ must not be NULL. */
+  virtual int frame_size(){ return zframe_size(frame_); }
+  virtual void* frame_data(){ return zframe_data(frame_); }
+
+  virtual bool next_frame(){
+    frame_=zmsg_next(msg_);
+    return frame_!=NULL;
+  }
+
+  /**
+   * Take ownership of msg, whose first frame must be the "src dst target"
+   * header string written by DumpToZmsg(); that frame is removed from msg.
+   */
+  void ParseFromZmsg(zmsg_t* msg){
+    char* tmp=zmsg_popstr(msg);
+    CHECK(tmp!=NULL)<<"message has no header frame";
+    sscanf(tmp, "%u %u %u", &src_, &dst_, &target_);
+    zstr_free(&tmp);  // the popped string is owned by the caller
+    //LOG(ERROR)<<"recv "<<src_<<" "<<dst_<<" "<<target_;
+    if(msg_!=NULL && msg_!=msg)
+      zmsg_destroy(&msg_);  // free the zmsg_t held so far, e.g. from Msg()
+    frame_=zmsg_next(msg);
+    msg_=msg;
+  }
+
+  /**
+   * Prepend the "src dst target" header and hand the zmsg_t over to the
+   * caller; afterwards this Msg holds no message (msg_ is NULL).
+   */
+  zmsg_t* DumpToZmsg(){
+    zmsg_pushstrf(msg_, "%u %u %u",src_, dst_,target_);
+    //LOG(ERROR)<<"send "<<src_<<" "<<dst_<<" "<<target_;
+    zmsg_t* tmp=msg_;
+    msg_=NULL;
+    return tmp;
+  }
+
+ protected:
+  static const unsigned int kOff1=16, kOff2=4, kOff3=24;
+  static const unsigned int kMask1=(1<<kOff1)-1, kMask2=(1<<kOff2)-1,
+               kMask3=(1<<kOff3)-1;
+  unsigned int src_, dst_, target_;
+  zmsg_t* msg_;
+  zframe_t *frame_;
+};
+#endif
+
+} /* singa */
+
+#endif // INCLUDE_COMMUNICATION_MSG_H_

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/b2dc51d2/include/communication/socket.h
----------------------------------------------------------------------
diff --git a/include/communication/socket.h b/include/communication/socket.h
new file mode 100644
index 0000000..771c3da
--- /dev/null
+++ b/include/communication/socket.h
@@ -0,0 +1,144 @@
+#ifndef INCLUDE_COMMUNICATION_SOCKET_H_
+#define INCLUDE_COMMUNICATION_SOCKET_H_
+#include <map>
+#include <vector>
+#include "communication/msg.h"
+namespace singa {
+
+const string kInprocRouterEndpoint="inproc://router";
+class Socket{
+ public:
+  /** local_id_ starts at 0 instead of being left uninitialized. */
+  Socket() : local_id_(0) {}
+  virtual ~Socket(){}
+  /**
+   * Send a message to connected socket(s), non-blocking. The message will
+   * be deallocated after sending, thus must not be used after calling Send().
+   * @param msg the message to be sent
+   * @return 1 for success queuing the message for sending, 0 for failure
+   */
+  virtual int Send(Msg* msg)=0;
+  /**
+   * Receive a message from any connected socket.
+   * @return a message pointer if success; nullptr if failure
+   */
+  virtual Msg* Receive()=0;
+  /**
+   * @return Identifier of the implementation dependent socket. E.g., zsock_t*
+   * for ZeroMQ implementation and rank for MPI implementation.
+   */
+  virtual void* InternalID() const=0;
+
+ protected:
+  int local_id_;
+};
+
+/** Interface for waiting on several sockets at once. */
+class BasePoller{
+ public:
+  /** Virtual so a poller can be deleted safely through a BasePoller*. */
+  virtual ~BasePoller() {}
+  /** Add a socket; all sockets added to one poller are polled together. */
+  virtual void Add(Socket* socket)=0;
+  /**
+   * Poll all sockets added to this poller.
+   * @param timeout stop waiting after this number of milliseconds
+   * @return the socket that has a message in its receiving queue; nullptr
+   * if no socket received a message
+   */
+  virtual Socket* Wait(int timeout)=0;
+};
+
+#define USE_ZMQ
+#include <czmq.h>
+
+#ifdef USE_ZMQ
+class Poller: public BasePoller{  // BasePoller implementation holding a czmq zpoller_t
+ public:
+  Poller();
+  virtual void Add(Socket* socket);
+  virtual Socket* Wait(int duration);  // duration: the timeout argument of BasePoller::Wait
+ protected:
+  zpoller_t *poller_;  // NOTE(review): no destructor is declared here — confirm who frees it
+  std::map<zsock_t*, Socket*> zsock2Socket_;  // presumably czmq socket -> added Socket; verify in the .cc
+};
+
+class Dealer : public Socket{
+ public:
+  /**
+   * @param id local dealer ID within a procs if the dealer belongs to a worker
+   * or server thread, starting from 1 (0 is used by the router); or the ID of
+   * the connected remote procs for inter-process dealers of the stub thread.
+   */
+  Dealer(int id=-1);
+  virtual ~Dealer();
+  /**
+   * Set up the connection with the router.
+   *
+   * @param endpoint identifier of the router, in ZeroMQ's endpoint format.
+   * For an intra-process connection it starts with "inproc://"; since each
+   * process has one router, the endpoint can be fixed to "inproc://router"
+   * for intra-process connections. For an inter-process connection it has
+   * the form IP:port, where IP is the address of the process to connect
+   * to.
+   * @return 1 if the connection is set up successfully; 0 otherwise
+   */
+  virtual int Connect(string endpoint);
+  virtual int Send(Msg* msg);
+  virtual Msg* Receive();
+  virtual void* InternalID() const{
+    return dealer_;  // the zsock_t* held by this dealer
+  }
+ protected:
+  int id_;
+  zsock_t* dealer_;
+  zpoller_t* poller_;
+};
+
+class Router : public Socket{
+ public:
+  virtual ~Router();
+  /**
+   * Constructor.
+   *
+   * There is only one router per procs, hence its local id is 0 and is not set
+   * explicitly.
+   *
+   * @param bufsize buffer at most this number of messages
+   */
+  Router(int bufsize=100);
+  /**
+   * Set up the connection with dealers.
+   *
+   * It automatically binds to the endpoint for intra-process communication,
+   * i.e., "inproc://router".
+   *
+   * @param endpoint the address that Dealer sockets in other processes
+   * connect to. It has the format IP:Port, where IP is the host machine.
+   * An empty endpoint means that all connections are intra-process
+   * connections.
+   * @return number of connected dealers.
+   */
+  virtual int Bind(string endpoint);
+  /**
+   * If the destination socket has not connected yet, buffer the message.
+   */
+  virtual int Send(Msg* msg);
+  virtual Msg* Receive();
+  virtual void* InternalID() const{
+    return router_;  // the zsock_t* held by this router
+  }
+ protected:
+  zsock_t* router_;
+  zpoller_t* poller_;
+  std::map<int, zframe_t*> id2addr_;  // presumably dealer id -> its address frame; verify in the .cc
+  std::map<int, std::vector<zmsg_t*>> bufmsg_;  // presumably messages buffered per destination id
+  int nBufmsg_, bufsize_;
+};
+
+#elif USE_MPI
+vector<shared_ptr<SafeQueue>> MPIQueues;
+#endif
+} /* singa */
+
+#endif // INCLUDE_COMMUNICATION_SOCKET_H_


[05/12] incubator-singa git commit: Transfer code from nusinga repo to singa apache repo. New commuinication framework is implemented to unify the frameworks of existing distributed deep learning systems. Communication is now implmented using ZeroMQ. API

Posted by wa...@apache.org.
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/b2dc51d2/src/proto/cluster.pb.h
----------------------------------------------------------------------
diff --git a/src/proto/cluster.pb.h b/src/proto/cluster.pb.h
new file mode 100644
index 0000000..fce32b8
--- /dev/null
+++ b/src/proto/cluster.pb.h
@@ -0,0 +1,989 @@
+// Generated by the protocol buffer compiler.  DO NOT EDIT!
+// source: cluster.proto
+
+#ifndef PROTOBUF_cluster_2eproto__INCLUDED
+#define PROTOBUF_cluster_2eproto__INCLUDED
+
+#include <string>
+
+#include <google/protobuf/stubs/common.h>
+
+#if GOOGLE_PROTOBUF_VERSION < 2005000
+#error This file was generated by a newer version of protoc which is
+#error incompatible with your Protocol Buffer headers.  Please update
+#error your headers.
+#endif
+#if 2005000 < GOOGLE_PROTOBUF_MIN_PROTOC_VERSION
+#error This file was generated by an older version of protoc which is
+#error incompatible with your Protocol Buffer headers.  Please
+#error regenerate this file with a newer version of protoc.
+#endif
+
+#include <google/protobuf/generated_message_util.h>
+#include <google/protobuf/message.h>
+#include <google/protobuf/repeated_field.h>
+#include <google/protobuf/extension_set.h>
+#include <google/protobuf/unknown_field_set.h>
+// @@protoc_insertion_point(includes)
+
+namespace singa {
+
+// Internal implementation detail -- do not call these.
+void  protobuf_AddDesc_cluster_2eproto();
+void protobuf_AssignDesc_cluster_2eproto();
+void protobuf_ShutdownFile_cluster_2eproto();
+
+class ClusterProto;
+class ServerTopology;
+
+// ===================================================================
+
+class ClusterProto : public ::google::protobuf::Message {
+ public:
+  ClusterProto();
+  virtual ~ClusterProto();
+
+  ClusterProto(const ClusterProto& from);
+
+  inline ClusterProto& operator=(const ClusterProto& from) {
+    CopyFrom(from);
+    return *this;
+  }
+
+  inline const ::google::protobuf::UnknownFieldSet& unknown_fields() const {
+    return _unknown_fields_;
+  }
+
+  inline ::google::protobuf::UnknownFieldSet* mutable_unknown_fields() {
+    return &_unknown_fields_;
+  }
+
+  static const ::google::protobuf::Descriptor* descriptor();
+  static const ClusterProto& default_instance();
+
+  void Swap(ClusterProto* other);
+
+  // implements Message ----------------------------------------------
+
+  ClusterProto* New() const;
+  void CopyFrom(const ::google::protobuf::Message& from);
+  void MergeFrom(const ::google::protobuf::Message& from);
+  void CopyFrom(const ClusterProto& from);
+  void MergeFrom(const ClusterProto& from);
+  void Clear();
+  bool IsInitialized() const;
+
+  int ByteSize() const;
+  bool MergePartialFromCodedStream(
+      ::google::protobuf::io::CodedInputStream* input);
+  void SerializeWithCachedSizes(
+      ::google::protobuf::io::CodedOutputStream* output) const;
+  ::google::protobuf::uint8* SerializeWithCachedSizesToArray(::google::protobuf::uint8* output) const;
+  int GetCachedSize() const { return _cached_size_; }
+  private:
+  void SharedCtor();
+  void SharedDtor();
+  void SetCachedSize(int size) const;
+  public:
+
+  ::google::protobuf::Metadata GetMetadata() const;
+
+  // nested types ----------------------------------------------------
+
+  // accessors -------------------------------------------------------
+
+  // optional int32 nworker_groups = 1;
+  inline bool has_nworker_groups() const;
+  inline void clear_nworker_groups();
+  static const int kNworkerGroupsFieldNumber = 1;
+  inline ::google::protobuf::int32 nworker_groups() const;
+  inline void set_nworker_groups(::google::protobuf::int32 value);
+
+  // optional int32 nserver_groups = 2;
+  inline bool has_nserver_groups() const;
+  inline void clear_nserver_groups();
+  static const int kNserverGroupsFieldNumber = 2;
+  inline ::google::protobuf::int32 nserver_groups() const;
+  inline void set_nserver_groups(::google::protobuf::int32 value);
+
+  // optional int32 nworkers_per_group = 3 [default = 1];
+  inline bool has_nworkers_per_group() const;
+  inline void clear_nworkers_per_group();
+  static const int kNworkersPerGroupFieldNumber = 3;
+  inline ::google::protobuf::int32 nworkers_per_group() const;
+  inline void set_nworkers_per_group(::google::protobuf::int32 value);
+
+  // optional int32 nservers_per_group = 4 [default = 1];
+  inline bool has_nservers_per_group() const;
+  inline void clear_nservers_per_group();
+  static const int kNserversPerGroupFieldNumber = 4;
+  inline ::google::protobuf::int32 nservers_per_group() const;
+  inline void set_nservers_per_group(::google::protobuf::int32 value);
+
+  // optional int32 nworkers_per_procs = 5 [default = 1];
+  inline bool has_nworkers_per_procs() const;
+  inline void clear_nworkers_per_procs();
+  static const int kNworkersPerProcsFieldNumber = 5;
+  inline ::google::protobuf::int32 nworkers_per_procs() const;
+  inline void set_nworkers_per_procs(::google::protobuf::int32 value);
+
+  // optional int32 nservers_per_procs = 6 [default = 1];
+  inline bool has_nservers_per_procs() const;
+  inline void clear_nservers_per_procs();
+  static const int kNserversPerProcsFieldNumber = 6;
+  inline ::google::protobuf::int32 nservers_per_procs() const;
+  inline void set_nservers_per_procs(::google::protobuf::int32 value);
+
+  // optional string hostfile = 10;
+  inline bool has_hostfile() const;
+  inline void clear_hostfile();
+  static const int kHostfileFieldNumber = 10;
+  inline const ::std::string& hostfile() const;
+  inline void set_hostfile(const ::std::string& value);
+  inline void set_hostfile(const char* value);
+  inline void set_hostfile(const char* value, size_t size);
+  inline ::std::string* mutable_hostfile();
+  inline ::std::string* release_hostfile();
+  inline void set_allocated_hostfile(::std::string* hostfile);
+
+  // optional bool server_worker_separate = 11 [default = false];
+  inline bool has_server_worker_separate() const;
+  inline void clear_server_worker_separate();
+  static const int kServerWorkerSeparateFieldNumber = 11;
+  inline bool server_worker_separate() const;
+  inline void set_server_worker_separate(bool value);
+
+  // optional int32 nprocs = 12;
+  inline bool has_nprocs() const;
+  inline void clear_nprocs();
+  static const int kNprocsFieldNumber = 12;
+  inline ::google::protobuf::int32 nprocs() const;
+  inline void set_nprocs(::google::protobuf::int32 value);
+
+  // optional int32 start_port = 13 [default = 6723];
+  inline bool has_start_port() const;
+  inline void clear_start_port();
+  static const int kStartPortFieldNumber = 13;
+  inline ::google::protobuf::int32 start_port() const;
+  inline void set_start_port(::google::protobuf::int32 value);
+
+  // required string workspace = 14;
+  inline bool has_workspace() const;
+  inline void clear_workspace();
+  static const int kWorkspaceFieldNumber = 14;
+  inline const ::std::string& workspace() const;
+  inline void set_workspace(const ::std::string& value);
+  inline void set_workspace(const char* value);
+  inline void set_workspace(const char* value, size_t size);
+  inline ::std::string* mutable_workspace();
+  inline ::std::string* release_workspace();
+  inline void set_allocated_workspace(::std::string* workspace);
+
+  // optional string log_dir = 15;
+  inline bool has_log_dir() const;
+  inline void clear_log_dir();
+  static const int kLogDirFieldNumber = 15;
+  inline const ::std::string& log_dir() const;
+  inline void set_log_dir(const ::std::string& value);
+  inline void set_log_dir(const char* value);
+  inline void set_log_dir(const char* value, size_t size);
+  inline ::std::string* mutable_log_dir();
+  inline ::std::string* release_log_dir();
+  inline void set_allocated_log_dir(::std::string* log_dir);
+
+  // repeated .singa.ServerTopology server_group = 20;
+  inline int server_group_size() const;
+  inline void clear_server_group();
+  static const int kServerGroupFieldNumber = 20;
+  inline const ::singa::ServerTopology& server_group(int index) const;
+  inline ::singa::ServerTopology* mutable_server_group(int index);
+  inline ::singa::ServerTopology* add_server_group();
+  inline const ::google::protobuf::RepeatedPtrField< ::singa::ServerTopology >&
+      server_group() const;
+  inline ::google::protobuf::RepeatedPtrField< ::singa::ServerTopology >*
+      mutable_server_group();
+
+  // optional int32 stub_timeout = 30 [default = 5000];
+  inline bool has_stub_timeout() const;
+  inline void clear_stub_timeout();
+  static const int kStubTimeoutFieldNumber = 30;
+  inline ::google::protobuf::int32 stub_timeout() const;
+  inline void set_stub_timeout(::google::protobuf::int32 value);
+
+  // optional int32 worker_timeout = 31 [default = 5000];
+  inline bool has_worker_timeout() const;
+  inline void clear_worker_timeout();
+  static const int kWorkerTimeoutFieldNumber = 31;
+  inline ::google::protobuf::int32 worker_timeout() const;
+  inline void set_worker_timeout(::google::protobuf::int32 value);
+
+  // optional int32 server_timeout = 32 [default = 5000];
+  inline bool has_server_timeout() const;
+  inline void clear_server_timeout();
+  static const int kServerTimeoutFieldNumber = 32;
+  inline ::google::protobuf::int32 server_timeout() const;
+  inline void set_server_timeout(::google::protobuf::int32 value);
+
+  // @@protoc_insertion_point(class_scope:singa.ClusterProto)
+ private:
+  inline void set_has_nworker_groups();
+  inline void clear_has_nworker_groups();
+  inline void set_has_nserver_groups();
+  inline void clear_has_nserver_groups();
+  inline void set_has_nworkers_per_group();
+  inline void clear_has_nworkers_per_group();
+  inline void set_has_nservers_per_group();
+  inline void clear_has_nservers_per_group();
+  inline void set_has_nworkers_per_procs();
+  inline void clear_has_nworkers_per_procs();
+  inline void set_has_nservers_per_procs();
+  inline void clear_has_nservers_per_procs();
+  inline void set_has_hostfile();
+  inline void clear_has_hostfile();
+  inline void set_has_server_worker_separate();
+  inline void clear_has_server_worker_separate();
+  inline void set_has_nprocs();
+  inline void clear_has_nprocs();
+  inline void set_has_start_port();
+  inline void clear_has_start_port();
+  inline void set_has_workspace();
+  inline void clear_has_workspace();
+  inline void set_has_log_dir();
+  inline void clear_has_log_dir();
+  inline void set_has_stub_timeout();
+  inline void clear_has_stub_timeout();
+  inline void set_has_worker_timeout();
+  inline void clear_has_worker_timeout();
+  inline void set_has_server_timeout();
+  inline void clear_has_server_timeout();
+
+  ::google::protobuf::UnknownFieldSet _unknown_fields_;
+
+  ::google::protobuf::int32 nworker_groups_;
+  ::google::protobuf::int32 nserver_groups_;
+  ::google::protobuf::int32 nworkers_per_group_;
+  ::google::protobuf::int32 nservers_per_group_;
+  ::google::protobuf::int32 nworkers_per_procs_;
+  ::google::protobuf::int32 nservers_per_procs_;
+  ::std::string* hostfile_;
+  bool server_worker_separate_;
+  ::google::protobuf::int32 nprocs_;
+  ::std::string* workspace_;
+  ::std::string* log_dir_;
+  ::google::protobuf::int32 start_port_;
+  ::google::protobuf::int32 stub_timeout_;
+  ::google::protobuf::RepeatedPtrField< ::singa::ServerTopology > server_group_;
+  ::google::protobuf::int32 worker_timeout_;
+  ::google::protobuf::int32 server_timeout_;
+
+  mutable int _cached_size_;
+  ::google::protobuf::uint32 _has_bits_[(16 + 31) / 32];
+
+  friend void  protobuf_AddDesc_cluster_2eproto();
+  friend void protobuf_AssignDesc_cluster_2eproto();
+  friend void protobuf_ShutdownFile_cluster_2eproto();
+
+  void InitAsDefaultInstance();
+  static ClusterProto* default_instance_;
+};
+// -------------------------------------------------------------------
+
+class ServerTopology : public ::google::protobuf::Message {
+ public:
+  ServerTopology();
+  virtual ~ServerTopology();
+
+  ServerTopology(const ServerTopology& from);
+
+  inline ServerTopology& operator=(const ServerTopology& from) {
+    CopyFrom(from);
+    return *this;
+  }
+
+  inline const ::google::protobuf::UnknownFieldSet& unknown_fields() const {
+    return _unknown_fields_;
+  }
+
+  inline ::google::protobuf::UnknownFieldSet* mutable_unknown_fields() {
+    return &_unknown_fields_;
+  }
+
+  static const ::google::protobuf::Descriptor* descriptor();
+  static const ServerTopology& default_instance();
+
+  void Swap(ServerTopology* other);
+
+  // implements Message ----------------------------------------------
+
+  ServerTopology* New() const;
+  void CopyFrom(const ::google::protobuf::Message& from);
+  void MergeFrom(const ::google::protobuf::Message& from);
+  void CopyFrom(const ServerTopology& from);
+  void MergeFrom(const ServerTopology& from);
+  void Clear();
+  bool IsInitialized() const;
+
+  int ByteSize() const;
+  bool MergePartialFromCodedStream(
+      ::google::protobuf::io::CodedInputStream* input);
+  void SerializeWithCachedSizes(
+      ::google::protobuf::io::CodedOutputStream* output) const;
+  ::google::protobuf::uint8* SerializeWithCachedSizesToArray(::google::protobuf::uint8* output) const;
+  int GetCachedSize() const { return _cached_size_; }
+  private:
+  void SharedCtor();
+  void SharedDtor();
+  void SetCachedSize(int size) const;
+  public:
+
+  ::google::protobuf::Metadata GetMetadata() const;
+
+  // nested types ----------------------------------------------------
+
+  // accessors -------------------------------------------------------
+
+  // required int32 id = 1;
+  inline bool has_id() const;
+  inline void clear_id();
+  static const int kIdFieldNumber = 1;
+  inline ::google::protobuf::int32 id() const;
+  inline void set_id(::google::protobuf::int32 value);
+
+  // optional int32 sync_interval = 2;
+  inline bool has_sync_interval() const;
+  inline void clear_sync_interval();
+  static const int kSyncIntervalFieldNumber = 2;
+  inline ::google::protobuf::int32 sync_interval() const;
+  inline void set_sync_interval(::google::protobuf::int32 value);
+
+  // repeated int32 neighbor = 3;
+  inline int neighbor_size() const;
+  inline void clear_neighbor();
+  static const int kNeighborFieldNumber = 3;
+  inline ::google::protobuf::int32 neighbor(int index) const;
+  inline void set_neighbor(int index, ::google::protobuf::int32 value);
+  inline void add_neighbor(::google::protobuf::int32 value);
+  inline const ::google::protobuf::RepeatedField< ::google::protobuf::int32 >&
+      neighbor() const;
+  inline ::google::protobuf::RepeatedField< ::google::protobuf::int32 >*
+      mutable_neighbor();
+
+  // @@protoc_insertion_point(class_scope:singa.ServerTopology)
+ private:
+  inline void set_has_id();
+  inline void clear_has_id();
+  inline void set_has_sync_interval();
+  inline void clear_has_sync_interval();
+
+  ::google::protobuf::UnknownFieldSet _unknown_fields_;
+
+  ::google::protobuf::int32 id_;
+  ::google::protobuf::int32 sync_interval_;
+  ::google::protobuf::RepeatedField< ::google::protobuf::int32 > neighbor_;
+
+  mutable int _cached_size_;
+  ::google::protobuf::uint32 _has_bits_[(3 + 31) / 32];
+
+  friend void  protobuf_AddDesc_cluster_2eproto();
+  friend void protobuf_AssignDesc_cluster_2eproto();
+  friend void protobuf_ShutdownFile_cluster_2eproto();
+
+  void InitAsDefaultInstance();
+  static ServerTopology* default_instance_;
+};
+// ===================================================================
+
+
+// ===================================================================
+
+// ClusterProto
+
+// optional int32 nworker_groups = 1;
+inline bool ClusterProto::has_nworker_groups() const {
+  return (_has_bits_[0] & 0x00000001u) != 0;
+}
+inline void ClusterProto::set_has_nworker_groups() {
+  _has_bits_[0] |= 0x00000001u;
+}
+inline void ClusterProto::clear_has_nworker_groups() {
+  _has_bits_[0] &= ~0x00000001u;
+}
+inline void ClusterProto::clear_nworker_groups() {
+  nworker_groups_ = 0;
+  clear_has_nworker_groups();
+}
+inline ::google::protobuf::int32 ClusterProto::nworker_groups() const {
+  return nworker_groups_;
+}
+inline void ClusterProto::set_nworker_groups(::google::protobuf::int32 value) {
+  set_has_nworker_groups();
+  nworker_groups_ = value;
+}
+
+// optional int32 nserver_groups = 2;
+inline bool ClusterProto::has_nserver_groups() const {
+  return (_has_bits_[0] & 0x00000002u) != 0;
+}
+inline void ClusterProto::set_has_nserver_groups() {
+  _has_bits_[0] |= 0x00000002u;
+}
+inline void ClusterProto::clear_has_nserver_groups() {
+  _has_bits_[0] &= ~0x00000002u;
+}
+inline void ClusterProto::clear_nserver_groups() {
+  nserver_groups_ = 0;
+  clear_has_nserver_groups();
+}
+inline ::google::protobuf::int32 ClusterProto::nserver_groups() const {
+  return nserver_groups_;
+}
+inline void ClusterProto::set_nserver_groups(::google::protobuf::int32 value) {
+  set_has_nserver_groups();
+  nserver_groups_ = value;
+}
+
+// optional int32 nworkers_per_group = 3 [default = 1];
+inline bool ClusterProto::has_nworkers_per_group() const {
+  return (_has_bits_[0] & 0x00000004u) != 0;
+}
+inline void ClusterProto::set_has_nworkers_per_group() {
+  _has_bits_[0] |= 0x00000004u;
+}
+inline void ClusterProto::clear_has_nworkers_per_group() {
+  _has_bits_[0] &= ~0x00000004u;
+}
+inline void ClusterProto::clear_nworkers_per_group() {
+  nworkers_per_group_ = 1;
+  clear_has_nworkers_per_group();
+}
+inline ::google::protobuf::int32 ClusterProto::nworkers_per_group() const {
+  return nworkers_per_group_;
+}
+inline void ClusterProto::set_nworkers_per_group(::google::protobuf::int32 value) {
+  set_has_nworkers_per_group();
+  nworkers_per_group_ = value;
+}
+
+// optional int32 nservers_per_group = 4 [default = 1];
+inline bool ClusterProto::has_nservers_per_group() const {
+  return (_has_bits_[0] & 0x00000008u) != 0;
+}
+inline void ClusterProto::set_has_nservers_per_group() {
+  _has_bits_[0] |= 0x00000008u;
+}
+inline void ClusterProto::clear_has_nservers_per_group() {
+  _has_bits_[0] &= ~0x00000008u;
+}
+inline void ClusterProto::clear_nservers_per_group() {
+  nservers_per_group_ = 1;
+  clear_has_nservers_per_group();
+}
+inline ::google::protobuf::int32 ClusterProto::nservers_per_group() const {
+  return nservers_per_group_;
+}
+inline void ClusterProto::set_nservers_per_group(::google::protobuf::int32 value) {
+  set_has_nservers_per_group();
+  nservers_per_group_ = value;
+}
+
+// optional int32 nworkers_per_procs = 5 [default = 1];
+inline bool ClusterProto::has_nworkers_per_procs() const {
+  return (_has_bits_[0] & 0x00000010u) != 0;
+}
+inline void ClusterProto::set_has_nworkers_per_procs() {
+  _has_bits_[0] |= 0x00000010u;
+}
+inline void ClusterProto::clear_has_nworkers_per_procs() {
+  _has_bits_[0] &= ~0x00000010u;
+}
+inline void ClusterProto::clear_nworkers_per_procs() {
+  nworkers_per_procs_ = 1;
+  clear_has_nworkers_per_procs();
+}
+inline ::google::protobuf::int32 ClusterProto::nworkers_per_procs() const {
+  return nworkers_per_procs_;
+}
+inline void ClusterProto::set_nworkers_per_procs(::google::protobuf::int32 value) {
+  set_has_nworkers_per_procs();
+  nworkers_per_procs_ = value;
+}
+
+// optional int32 nservers_per_procs = 6 [default = 1];
+inline bool ClusterProto::has_nservers_per_procs() const {
+  return (_has_bits_[0] & 0x00000020u) != 0;
+}
+inline void ClusterProto::set_has_nservers_per_procs() {
+  _has_bits_[0] |= 0x00000020u;
+}
+inline void ClusterProto::clear_has_nservers_per_procs() {
+  _has_bits_[0] &= ~0x00000020u;
+}
+inline void ClusterProto::clear_nservers_per_procs() {
+  nservers_per_procs_ = 1;
+  clear_has_nservers_per_procs();
+}
+inline ::google::protobuf::int32 ClusterProto::nservers_per_procs() const {
+  return nservers_per_procs_;
+}
+inline void ClusterProto::set_nservers_per_procs(::google::protobuf::int32 value) {
+  set_has_nservers_per_procs();
+  nservers_per_procs_ = value;
+}
+
+// optional string hostfile = 10;
+inline bool ClusterProto::has_hostfile() const {
+  return (_has_bits_[0] & 0x00000040u) != 0;
+}
+inline void ClusterProto::set_has_hostfile() {
+  _has_bits_[0] |= 0x00000040u;
+}
+inline void ClusterProto::clear_has_hostfile() {
+  _has_bits_[0] &= ~0x00000040u;
+}
+inline void ClusterProto::clear_hostfile() {
+  if (hostfile_ != &::google::protobuf::internal::kEmptyString) {
+    hostfile_->clear();
+  }
+  clear_has_hostfile();
+}
+inline const ::std::string& ClusterProto::hostfile() const {
+  return *hostfile_;
+}
+inline void ClusterProto::set_hostfile(const ::std::string& value) {
+  set_has_hostfile();
+  if (hostfile_ == &::google::protobuf::internal::kEmptyString) {
+    hostfile_ = new ::std::string;
+  }
+  hostfile_->assign(value);
+}
+inline void ClusterProto::set_hostfile(const char* value) {
+  set_has_hostfile();
+  if (hostfile_ == &::google::protobuf::internal::kEmptyString) {
+    hostfile_ = new ::std::string;
+  }
+  hostfile_->assign(value);
+}
+inline void ClusterProto::set_hostfile(const char* value, size_t size) {
+  set_has_hostfile();
+  if (hostfile_ == &::google::protobuf::internal::kEmptyString) {
+    hostfile_ = new ::std::string;
+  }
+  hostfile_->assign(reinterpret_cast<const char*>(value), size);
+}
+inline ::std::string* ClusterProto::mutable_hostfile() {
+  set_has_hostfile();
+  if (hostfile_ == &::google::protobuf::internal::kEmptyString) {
+    hostfile_ = new ::std::string;
+  }
+  return hostfile_;
+}
+inline ::std::string* ClusterProto::release_hostfile() {
+  clear_has_hostfile();
+  if (hostfile_ == &::google::protobuf::internal::kEmptyString) {
+    return NULL;
+  } else {
+    ::std::string* temp = hostfile_;
+    hostfile_ = const_cast< ::std::string*>(&::google::protobuf::internal::kEmptyString);
+    return temp;
+  }
+}
+inline void ClusterProto::set_allocated_hostfile(::std::string* hostfile) {
+  if (hostfile_ != &::google::protobuf::internal::kEmptyString) {
+    delete hostfile_;
+  }
+  if (hostfile) {
+    set_has_hostfile();
+    hostfile_ = hostfile;
+  } else {
+    clear_has_hostfile();
+    hostfile_ = const_cast< ::std::string*>(&::google::protobuf::internal::kEmptyString);
+  }
+}
+
+// optional bool server_worker_separate = 11 [default = false];
+inline bool ClusterProto::has_server_worker_separate() const {
+  return (_has_bits_[0] & 0x00000080u) != 0;
+}
+inline void ClusterProto::set_has_server_worker_separate() {
+  _has_bits_[0] |= 0x00000080u;
+}
+inline void ClusterProto::clear_has_server_worker_separate() {
+  _has_bits_[0] &= ~0x00000080u;
+}
+inline void ClusterProto::clear_server_worker_separate() {
+  server_worker_separate_ = false;
+  clear_has_server_worker_separate();
+}
+inline bool ClusterProto::server_worker_separate() const {
+  return server_worker_separate_;
+}
+inline void ClusterProto::set_server_worker_separate(bool value) {
+  set_has_server_worker_separate();
+  server_worker_separate_ = value;
+}
+
+// optional int32 nprocs = 12;
+inline bool ClusterProto::has_nprocs() const {
+  return (_has_bits_[0] & 0x00000100u) != 0;
+}
+inline void ClusterProto::set_has_nprocs() {
+  _has_bits_[0] |= 0x00000100u;
+}
+inline void ClusterProto::clear_has_nprocs() {
+  _has_bits_[0] &= ~0x00000100u;
+}
+inline void ClusterProto::clear_nprocs() {
+  nprocs_ = 0;
+  clear_has_nprocs();
+}
+inline ::google::protobuf::int32 ClusterProto::nprocs() const {
+  return nprocs_;
+}
+inline void ClusterProto::set_nprocs(::google::protobuf::int32 value) {
+  set_has_nprocs();
+  nprocs_ = value;
+}
+
+// optional int32 start_port = 13 [default = 6723];
+inline bool ClusterProto::has_start_port() const {
+  return (_has_bits_[0] & 0x00000200u) != 0;
+}
+inline void ClusterProto::set_has_start_port() {
+  _has_bits_[0] |= 0x00000200u;
+}
+inline void ClusterProto::clear_has_start_port() {
+  _has_bits_[0] &= ~0x00000200u;
+}
+inline void ClusterProto::clear_start_port() {
+  start_port_ = 6723;
+  clear_has_start_port();
+}
+inline ::google::protobuf::int32 ClusterProto::start_port() const {
+  return start_port_;
+}
+inline void ClusterProto::set_start_port(::google::protobuf::int32 value) {
+  set_has_start_port();
+  start_port_ = value;
+}
+
+// required string workspace = 14;
+inline bool ClusterProto::has_workspace() const {
+  return (_has_bits_[0] & 0x00000400u) != 0;
+}
+inline void ClusterProto::set_has_workspace() {
+  _has_bits_[0] |= 0x00000400u;
+}
+inline void ClusterProto::clear_has_workspace() {
+  _has_bits_[0] &= ~0x00000400u;
+}
+inline void ClusterProto::clear_workspace() {
+  if (workspace_ != &::google::protobuf::internal::kEmptyString) {
+    workspace_->clear();
+  }
+  clear_has_workspace();
+}
+inline const ::std::string& ClusterProto::workspace() const {
+  return *workspace_;
+}
+inline void ClusterProto::set_workspace(const ::std::string& value) {
+  set_has_workspace();
+  if (workspace_ == &::google::protobuf::internal::kEmptyString) {
+    workspace_ = new ::std::string;
+  }
+  workspace_->assign(value);
+}
+inline void ClusterProto::set_workspace(const char* value) {
+  set_has_workspace();
+  if (workspace_ == &::google::protobuf::internal::kEmptyString) {
+    workspace_ = new ::std::string;
+  }
+  workspace_->assign(value);
+}
+inline void ClusterProto::set_workspace(const char* value, size_t size) {
+  set_has_workspace();
+  if (workspace_ == &::google::protobuf::internal::kEmptyString) {
+    workspace_ = new ::std::string;
+  }
+  workspace_->assign(reinterpret_cast<const char*>(value), size);
+}
+inline ::std::string* ClusterProto::mutable_workspace() {
+  set_has_workspace();
+  if (workspace_ == &::google::protobuf::internal::kEmptyString) {
+    workspace_ = new ::std::string;
+  }
+  return workspace_;
+}
+inline ::std::string* ClusterProto::release_workspace() {
+  clear_has_workspace();
+  if (workspace_ == &::google::protobuf::internal::kEmptyString) {
+    return NULL;
+  } else {
+    ::std::string* temp = workspace_;
+    workspace_ = const_cast< ::std::string*>(&::google::protobuf::internal::kEmptyString);
+    return temp;
+  }
+}
+inline void ClusterProto::set_allocated_workspace(::std::string* workspace) {
+  if (workspace_ != &::google::protobuf::internal::kEmptyString) {
+    delete workspace_;
+  }
+  if (workspace) {
+    set_has_workspace();
+    workspace_ = workspace;
+  } else {
+    clear_has_workspace();
+    workspace_ = const_cast< ::std::string*>(&::google::protobuf::internal::kEmptyString);
+  }
+}
+
+// optional string log_dir = 15;
+inline bool ClusterProto::has_log_dir() const {
+  return (_has_bits_[0] & 0x00000800u) != 0;
+}
+inline void ClusterProto::set_has_log_dir() {
+  _has_bits_[0] |= 0x00000800u;
+}
+inline void ClusterProto::clear_has_log_dir() {
+  _has_bits_[0] &= ~0x00000800u;
+}
+inline void ClusterProto::clear_log_dir() {
+  if (log_dir_ != &::google::protobuf::internal::kEmptyString) {
+    log_dir_->clear();
+  }
+  clear_has_log_dir();
+}
+inline const ::std::string& ClusterProto::log_dir() const {
+  return *log_dir_;
+}
+inline void ClusterProto::set_log_dir(const ::std::string& value) {
+  set_has_log_dir();
+  if (log_dir_ == &::google::protobuf::internal::kEmptyString) {
+    log_dir_ = new ::std::string;
+  }
+  log_dir_->assign(value);
+}
+inline void ClusterProto::set_log_dir(const char* value) {
+  set_has_log_dir();
+  if (log_dir_ == &::google::protobuf::internal::kEmptyString) {
+    log_dir_ = new ::std::string;
+  }
+  log_dir_->assign(value);
+}
+inline void ClusterProto::set_log_dir(const char* value, size_t size) {
+  set_has_log_dir();
+  if (log_dir_ == &::google::protobuf::internal::kEmptyString) {
+    log_dir_ = new ::std::string;
+  }
+  log_dir_->assign(reinterpret_cast<const char*>(value), size);
+}
+inline ::std::string* ClusterProto::mutable_log_dir() {
+  set_has_log_dir();
+  if (log_dir_ == &::google::protobuf::internal::kEmptyString) {
+    log_dir_ = new ::std::string;
+  }
+  return log_dir_;
+}
+inline ::std::string* ClusterProto::release_log_dir() {
+  clear_has_log_dir();
+  if (log_dir_ == &::google::protobuf::internal::kEmptyString) {
+    return NULL;
+  } else {
+    ::std::string* temp = log_dir_;
+    log_dir_ = const_cast< ::std::string*>(&::google::protobuf::internal::kEmptyString);
+    return temp;
+  }
+}
+inline void ClusterProto::set_allocated_log_dir(::std::string* log_dir) {
+  if (log_dir_ != &::google::protobuf::internal::kEmptyString) {
+    delete log_dir_;
+  }
+  if (log_dir) {
+    set_has_log_dir();
+    log_dir_ = log_dir;
+  } else {
+    clear_has_log_dir();
+    log_dir_ = const_cast< ::std::string*>(&::google::protobuf::internal::kEmptyString);
+  }
+}
+
+// repeated .singa.ServerTopology server_group = 20;
+inline int ClusterProto::server_group_size() const {
+  return server_group_.size();
+}
+inline void ClusterProto::clear_server_group() {
+  server_group_.Clear();
+}
+inline const ::singa::ServerTopology& ClusterProto::server_group(int index) const {
+  return server_group_.Get(index);
+}
+inline ::singa::ServerTopology* ClusterProto::mutable_server_group(int index) {
+  return server_group_.Mutable(index);
+}
+inline ::singa::ServerTopology* ClusterProto::add_server_group() {
+  return server_group_.Add();
+}
+inline const ::google::protobuf::RepeatedPtrField< ::singa::ServerTopology >&
+ClusterProto::server_group() const {
+  return server_group_;
+}
+inline ::google::protobuf::RepeatedPtrField< ::singa::ServerTopology >*
+ClusterProto::mutable_server_group() {
+  return &server_group_;
+}
+
+// optional int32 stub_timeout = 30 [default = 5000];
+inline bool ClusterProto::has_stub_timeout() const {
+  return (_has_bits_[0] & 0x00002000u) != 0;
+}
+inline void ClusterProto::set_has_stub_timeout() {
+  _has_bits_[0] |= 0x00002000u;
+}
+inline void ClusterProto::clear_has_stub_timeout() {
+  _has_bits_[0] &= ~0x00002000u;
+}
+inline void ClusterProto::clear_stub_timeout() {
+  stub_timeout_ = 5000;
+  clear_has_stub_timeout();
+}
+inline ::google::protobuf::int32 ClusterProto::stub_timeout() const {
+  return stub_timeout_;
+}
+inline void ClusterProto::set_stub_timeout(::google::protobuf::int32 value) {
+  set_has_stub_timeout();
+  stub_timeout_ = value;
+}
+
+// optional int32 worker_timeout = 31 [default = 5000];
+inline bool ClusterProto::has_worker_timeout() const {
+  return (_has_bits_[0] & 0x00004000u) != 0;
+}
+inline void ClusterProto::set_has_worker_timeout() {
+  _has_bits_[0] |= 0x00004000u;
+}
+inline void ClusterProto::clear_has_worker_timeout() {
+  _has_bits_[0] &= ~0x00004000u;
+}
+inline void ClusterProto::clear_worker_timeout() {
+  worker_timeout_ = 5000;
+  clear_has_worker_timeout();
+}
+inline ::google::protobuf::int32 ClusterProto::worker_timeout() const {
+  return worker_timeout_;
+}
+inline void ClusterProto::set_worker_timeout(::google::protobuf::int32 value) {
+  set_has_worker_timeout();
+  worker_timeout_ = value;
+}
+
+// optional int32 server_timeout = 32 [default = 5000];
+inline bool ClusterProto::has_server_timeout() const {
+  return (_has_bits_[0] & 0x00008000u) != 0;
+}
+inline void ClusterProto::set_has_server_timeout() {
+  _has_bits_[0] |= 0x00008000u;
+}
+inline void ClusterProto::clear_has_server_timeout() {
+  _has_bits_[0] &= ~0x00008000u;
+}
+inline void ClusterProto::clear_server_timeout() {
+  server_timeout_ = 5000;
+  clear_has_server_timeout();
+}
+inline ::google::protobuf::int32 ClusterProto::server_timeout() const {
+  return server_timeout_;
+}
+inline void ClusterProto::set_server_timeout(::google::protobuf::int32 value) {
+  set_has_server_timeout();
+  server_timeout_ = value;
+}
+
+// -------------------------------------------------------------------
+
+// ServerTopology
+
+// required int32 id = 1;
+inline bool ServerTopology::has_id() const {
+  return (_has_bits_[0] & 0x00000001u) != 0;
+}
+inline void ServerTopology::set_has_id() {
+  _has_bits_[0] |= 0x00000001u;
+}
+inline void ServerTopology::clear_has_id() {
+  _has_bits_[0] &= ~0x00000001u;
+}
+inline void ServerTopology::clear_id() {
+  id_ = 0;
+  clear_has_id();
+}
+inline ::google::protobuf::int32 ServerTopology::id() const {
+  return id_;
+}
+inline void ServerTopology::set_id(::google::protobuf::int32 value) {
+  set_has_id();
+  id_ = value;
+}
+
+// optional int32 sync_interval = 2;
+inline bool ServerTopology::has_sync_interval() const {
+  return (_has_bits_[0] & 0x00000002u) != 0;
+}
+inline void ServerTopology::set_has_sync_interval() {
+  _has_bits_[0] |= 0x00000002u;
+}
+inline void ServerTopology::clear_has_sync_interval() {
+  _has_bits_[0] &= ~0x00000002u;
+}
+inline void ServerTopology::clear_sync_interval() {
+  sync_interval_ = 0;
+  clear_has_sync_interval();
+}
+inline ::google::protobuf::int32 ServerTopology::sync_interval() const {
+  return sync_interval_;
+}
+inline void ServerTopology::set_sync_interval(::google::protobuf::int32 value) {
+  set_has_sync_interval();
+  sync_interval_ = value;
+}
+
+// repeated int32 neighbor = 3;
+inline int ServerTopology::neighbor_size() const {
+  return neighbor_.size();
+}
+inline void ServerTopology::clear_neighbor() {
+  neighbor_.Clear();
+}
+inline ::google::protobuf::int32 ServerTopology::neighbor(int index) const {
+  return neighbor_.Get(index);
+}
+inline void ServerTopology::set_neighbor(int index, ::google::protobuf::int32 value) {
+  neighbor_.Set(index, value);
+}
+inline void ServerTopology::add_neighbor(::google::protobuf::int32 value) {
+  neighbor_.Add(value);
+}
+inline const ::google::protobuf::RepeatedField< ::google::protobuf::int32 >&
+ServerTopology::neighbor() const {
+  return neighbor_;
+}
+inline ::google::protobuf::RepeatedField< ::google::protobuf::int32 >*
+ServerTopology::mutable_neighbor() {
+  return &neighbor_;
+}
+
+
+// @@protoc_insertion_point(namespace_scope)
+
+}  // namespace singa
+
+#ifndef SWIG
+namespace google {
+namespace protobuf {
+
+
+}  // namespace google
+}  // namespace protobuf
+#endif  // SWIG
+
+// @@protoc_insertion_point(global_scope)
+
+#endif  // PROTOBUF_cluster_2eproto__INCLUDED

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/b2dc51d2/src/proto/cluster.proto
----------------------------------------------------------------------
diff --git a/src/proto/cluster.proto b/src/proto/cluster.proto
new file mode 100644
index 0000000..05e1e70
--- /dev/null
+++ b/src/proto/cluster.proto
@@ -0,0 +1,45 @@
+package singa;
+
+message ClusterProto{
+  optional int32 nworker_groups=1;
+  optional int32 nserver_groups=2;
+  optional int32 nworkers_per_group=3 [default=1];
+  optional int32 nservers_per_group=4 [default=1];
+  optional int32 nworkers_per_procs=5 [default=1];
+  optional int32 nservers_per_procs=6 [default=1];
+
+  // Used in standalone mode, one ip or hostname per line
+  // For YARN or Mesos version, the processes are allocated dynamically,
+  // hence no need to specify the hosts statically
+  optional string hostfile=10;
+
+  // servers and workers in different processes?
+  optional bool server_worker_separate=11 [default=false];
+
+  // if configured, must be consistent with the one computed from 1-6
+  optional int32 nprocs=12;
+
+  // port number is used by ZeroMQ
+  optional int32 start_port=13 [default=6723];
+  // local workspace, train/val/test shards, checkpoint files
+  required string workspace=14;
+  // relative path to workspace. if not set, use the default dir of glog
+  optional string log_dir=15;
+  // message size limit, default 1MB
+  // optional int32 largest_message=20 [default=1048576];
+  // optional float bandwidth=21 [default=100];//MB/s
+
+	repeated ServerTopology server_group = 20;
+  // NOTE(review): timeout unit is not stated here (presumably ms) - confirm
+  optional int32 stub_timeout=30 [default=5000];
+  optional int32 worker_timeout=31 [default=5000];
+  optional int32 server_timeout=32 [default=5000];
+}
+
+message ServerTopology{
+  // id of the server group described by this entry
+	required int32 id = 1;
+	optional int32 sync_interval = 2;  // NOTE(review): unit not stated - confirm
+  // ids of the neighbor groups
+	repeated int32 neighbor = 3;
+}


[06/12] incubator-singa git commit: Transfer code from nusinga repo to singa apache repo. New commuinication framework is implemented to unify the frameworks of existing distributed deep learning systems. Communication is now implmented using ZeroMQ. API

Posted by wa...@apache.org.
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/b2dc51d2/include/utils/common.h
----------------------------------------------------------------------
diff --git a/include/utils/common.h b/include/utils/common.h
new file mode 100644
index 0000000..993c153
--- /dev/null
+++ b/include/utils/common.h
@@ -0,0 +1,51 @@
+#ifndef INCLUDE_UTILS_COMMON_H_
+#define INCLUDE_UTILS_COMMON_H_
+#pragma once
+#include <glog/logging.h>
+#include <gflags/gflags.h>
+#include <google/protobuf/message.h>
+#include <stdarg.h>
+#include <thread>         // std::this_thread::sleep_for
+#include <chrono>
+#include <string>
+#include <vector>
+#include <mutex>
+#include <queue>
+#include <sys/stat.h>
+#include <map>
+
+using std::vector;
+using std::string;
+using std::map;
+using google::protobuf::Message;
+
+#ifndef GFLAGS_GFLAGS_H_
+namespace gflags = google;
+#endif  // GFLAGS_GFLAGS_H_
+
+
+namespace singa {
+
+void ReadProtoFromTextFile(const char* filename, Message* proto) ;
+void WriteProtoToTextFile(const Message& proto, const char* filename) ;
+void ReadProtoFromBinaryFile(const char* filename, Message* proto) ;
+void WriteProtoToBinaryFile(const Message& proto, const char* filename);
+
+std::string IntVecToString(const vector<int>& vec) ;
+string StringPrintf(string fmt, ...) ;
+void Debug() ;
+inline bool check_exists(const std::string& name) {
+  struct stat info;  // filled in by stat(); its contents are not inspected
+  return 0 == stat(name.c_str(), &info);  // stat() reports success with 0
+}
+
+inline void Sleep(int millisec=1){  // suspend the calling thread for millisec ms
+  std::this_thread::sleep_for(std::chrono::duration<int, std::milli>(millisec));
+}
+
+inline float rand_real(){  // pseudo-random float: rand() divided by RAND_MAX+1
+  return  static_cast<float>(rand())/(RAND_MAX+1.0f);
+}  // NOTE(review): with a 31-bit RAND_MAX float rounding may yield 1.0f; confirm
+
+} /* singa */
+#endif  // INCLUDE_UTILS_COMMON_H_

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/b2dc51d2/include/utils/data_shard.h
----------------------------------------------------------------------
diff --git a/include/utils/data_shard.h b/include/utils/data_shard.h
new file mode 100644
index 0000000..2ebade9
--- /dev/null
+++ b/include/utils/data_shard.h
@@ -0,0 +1,145 @@
+#ifndef INCLUDE_UTILS_SHARD_H_
+#define INCLUDE_UTILS_SHARD_H_
+
+#include <google/protobuf/message.h>
+#include <fstream>
+#include <string>
+#include <unordered_set>
+
+
+using google::protobuf::Message;
+
+namespace singa {
+
+/**
+ * Data shard stores training/validation/test tuples.
+ * Every worker node should have a training shard (validation/test shard
+ * is optional). The shard file for training is
+ * singa::Cluster::workspace()/train/shard.dat; validation and test shards use
+ * similar paths (NOTE(review): confirm their folder names against the loader).
+ *
+ * shard.dat consists of a set of unordered tuples. Each tuple is
+ * encoded as [key_len key record_len val] (key_len and record_len are of type
+ * uint32, which indicate the bytes of key and record respectively).
+ *
+ * When Shard obj is created, it will remove the last key if the record size and
+ * key size do not match because the last write of tuple crashed.
+ *
+ * TODO
+ * 1. split one shard into multiple shards.
+ * 2. add threading to prefetch and parse records
+ *
+ */
+class DataShard {
+ public:
+  enum {
+    //! read only mode used in training
+    kRead=0,
+    //! write mode used in creating shard (will overwrite previous one)
+    kCreate=1,
+    //! append mode, e.g. used when previous creating crashes
+    kAppend=2
+  };
+
+ public:
+  /**
+   * Init the shard obj.
+   * @param folder shard folder (path excluding shard.dat) on worker node
+   * @param mode shard open mode: kRead, kCreate or kAppend
+   * @param capacity batch capacity bytes data for every disk op (read or
+   * write), default is 100MB
+   */
+  DataShard(std::string folder, char mode, int capacity=104857600);
+  ~DataShard();
+
+  /**
+   * read next tuple from the shard.
+   * @param key tuple key
+   * @param val record of type Message
+   * @return true if read success otherwise false, e.g., the tuple was not
+   * inserted completely.
+   */
+  bool Next(std::string *key, Message* val);
+  /**
+   * read next tuple from the shard.
+   * @param key tuple key
+   * @param val record of type string
+   * @return true if read success otherwise false, e.g., the tuple was not
+   * inserted completely.
+   */
+  bool Next(std::string *key, std::string* val);
+
+  /**
+   * Append one tuple to the shard.
+   * @param key e.g., image path
+   * @param tuple record of type Message
+   * @return true if success, otherwise false, e.g., inserted before
+   */
+  bool Insert(const std::string& key, const Message& tuple);
+  /**
+   * Append one tuple to the shard.
+   * @param key e.g., image path
+   * @param tuple record of type string
+   * @return true if success, otherwise false, e.g., inserted before
+   */
+  bool Insert(const std::string& key, const std::string& tuple);
+  /**
+   * Move the read pointer to the head of the shard file.
+   * Used for repeated reading.
+   */
+  void SeekToFirst();
+  /**
+   * Flush buffered data to disk.
+   * Used only for kCreate or kAppend.
+   */
+  void Flush() ;
+  /**
+   * Iterate through all tuples to get the num of all tuples.
+   * @return num of tuples
+   */
+  const int Count();
+  /**
+   * @return path to shard file (returned by value, i.e. a copy of path_)
+   */
+  const std::string path(){
+    return path_;
+  }
+
+ protected:
+  /**
+   * Read the next key and prepare buffer for reading value.
+   * @param key
+   * @return length (i.e., bytes) of value field.
+   */
+  int Next(std::string *key);
+  /**
+   * Setup the disk pointer to the right position for append in case that
+   * the previous write crashes.
+   * @param path shard path.
+   * @return offset (end pos) of the last success written record.
+   */
+  int PrepareForAppend(std::string path);
+  /**
+   * Read data from disk if the current data in the buffer is not a full field.
+   * @param size size of the next field.
+   */
+  bool PrepareNextField(int size);
+
+ private:
+  char mode_;
+  std::string path_;
+  // either ifstream or ofstream
+  std::fstream fdat_;
+  // to avoid replicated record
+  std::unordered_set<std::string> keys_;
+  // internal buffer
+  char* buf_;
+  // offset inside the buf_
+  int offset_;
+  // allocated bytes for the buf_
+  int capacity_;
+  // bytes in buf_, used in reading
+  int bufsize_;
+};
+} /* singa */
+#endif  // INCLUDE_UTILS_SHARD_H_

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/b2dc51d2/include/utils/factory.h
----------------------------------------------------------------------
diff --git a/include/utils/factory.h b/include/utils/factory.h
new file mode 100644
index 0000000..c8fef32
--- /dev/null
+++ b/include/utils/factory.h
@@ -0,0 +1,57 @@
+#ifndef INCLUDE_UTILS_FACTORY_H_
+#define INCLUDE_UTILS_FACTORY_H_
+#include <glog/logging.h>
+
+#include <functional>
+#include <utility>
+#include <map>
+/**
+ * Macro that expands to a lambda which instantiates a SubClass object with
+ * new and returns it as a pointer to BaseClass.
+ */
+#define CreateInstance(SubClass, BaseClass) \
+  [](void)->BaseClass* {return new SubClass();}
+
+/**
+ * Factory template to generate a class (or a sub-class) object based on id.
+ * 1. register a class creation function that generates a class
+ * object under an id.
+ * 2. call Create() func to call the creation function and return
+ * a pointer to the base class.
+ */
+
+template<typename T>
+class Factory{
+ //template<Factory<T>> friend class Singleton;
+ public:
+  /**
+   * Register functions to create user defined classes.
+   * This function is called by the REGISTER_FACTORY macro.
+   * @param id identifier of the creating function/class
+   * @param func a function that creates an instance and returns it as T*
+   */
+  void Register(const std::string id, std::function<T*(void)> func);
+  /**
+   * create an instance by calling the creation function registered under id
+   * @param id the identifier of the class to be created
+   */
+  T *Create(const std::string id);
+
+ private:
+  //! Map that stores the registered creation functions
+  std::map<std::string, std::function<T*(void)>> str2func_;
+};
+
+// Store func as the creation function for id, replacing any earlier entry.
+template<typename T> void Factory<T>::Register(
+    const std::string id, std::function<T*(void)> func) {
+  str2func_[id] = func;
+}
+
+// Call the creation function registered under id; CHECK-fails if none exists.
+template<typename T> T *Factory<T>::Create(const std::string id) {
+  auto iter = str2func_.find(id);  // one lookup serves both check and call
+  CHECK(iter != str2func_.end())
+      << "The creation function for " << id << " has not been registered";
+  return iter->second();
+}
+#endif // INCLUDE_UTILS_FACTORY_H_

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/b2dc51d2/include/utils/graph.h
----------------------------------------------------------------------
diff --git a/include/utils/graph.h b/include/utils/graph.h
new file mode 100644
index 0000000..ca582b5
--- /dev/null
+++ b/include/utils/graph.h
@@ -0,0 +1,150 @@
+#ifndef INCLUDE_UTILS_GRAPH_H_
+#define INCLUDE_UTILS_GRAPH_H_
+#include <glog/logging.h>
+#include <vector>
+#include <string>
+#include <map>
+#include <stack>
+#include <memory>
+
+using std::vector;
+using std::string;
+using std::map;
+using std::pair;
+using std::shared_ptr;
+using std::make_shared;
+
+
+typedef struct _LayerInfo{
+  // origin identifies the origin of this node, i.e., the corresponding layer
+  string origin;
+  int locationid;// location id
+  int partitionid;  // NOTE(review): the int fields have no initializer here
+  int slice_dimension;
+  int concate_dimension;
+}LayerInfo;
+typedef LayerInfo V;  // V is the value type stored in each Node (Node::val_)
+
+
+class Node;
+typedef shared_ptr<Node> SNode;
+
+class Node{  // a named graph vertex: a V payload plus src/dst neighbor lists
+ public:
+  typedef shared_ptr<Node> SNode;
+  Node(string name): name_(name), val_(){}  // val_() zero-fills V's int fields
+  Node(string name, const V& v):
+    name_(name), val_(v){}
+  // Append a neighbor to the dst (outgoing) / src (incoming) list.
+  void AddDstNode(SNode dstnode){
+    dstnodes_.push_back(dstnode);
+  }
+  void AddSrcNode(SNode srcnode){
+    srcnodes_.push_back(srcnode);
+  }
+  // Erase the first neighbor whose name matches; CHECK-fails if none does.
+  void RemoveDstNode(SNode dst){
+    auto iter=dstnodes_.begin();  // test end() first: never dereference it
+    while(iter!=dstnodes_.end()&&(*iter)->name_!=dst->name_) ++iter;
+    CHECK(iter!=dstnodes_.end())<<"can't find dst node "<<dst->name_;
+    dstnodes_.erase(iter);
+  }
+  void RemoveSrcNode(SNode src){
+    auto iter=srcnodes_.begin();  // test end() first: never dereference it
+    while(iter!=srcnodes_.end()&&(*iter)->name_!=src->name_) ++iter;
+    CHECK(iter!=srcnodes_.end())<<"can't find src node "<<src->name_;
+    srcnodes_.erase(iter);
+  }
+  const string& name() const {return name_;}
+  const V& val() const {return val_;}
+  const SNode srcnodes(int k) const {return srcnodes_[k]; }
+  const SNode dstnodes(int k) const {return dstnodes_[k]; }
+  const vector<SNode>& srcnodes() const {return srcnodes_; }
+  const vector<SNode>& dstnodes() const {return dstnodes_; }
+  int  dstnodes_size() const {return dstnodes_.size(); }
+  int  srcnodes_size() const {return srcnodes_.size(); }
+
+ private:
+  string name_;
+  vector<SNode> srcnodes_;
+  vector<SNode> dstnodes_;
+  // payload attached to this node (layer info)
+  V val_;
+    // properties
+  string color_, weight_, shape_;
+};
+
+
+/**
+ * Graph of named nodes; for partition neuralnet and displaying its structure
+ */
+class Graph{
+ public:
+  Graph(){}
+  void Sort();
+  const SNode& AddNode(string name, V origin){
+    nodes_.push_back(make_shared<Node>(name, origin));
+    // hand back the map slot: unlike nodes_.back() it survives later growth
+    return name2node_[name]=nodes_.back();
+  }
+  const SNode& AddNode(string name){
+    nodes_.push_back(make_shared<Node>(name));
+    // a std::map reference is not invalidated by later insertions
+    return name2node_[name]=nodes_.back();
+  }
+  // Link srcnode->dstnode by recording each node in the other's edge list.
+  void AddEdge(SNode srcnode, SNode dstnode){
+    srcnode->AddDstNode(dstnode);
+    dstnode->AddSrcNode(srcnode);
+  }
+  // Name-based overload; CHECK-fails if either name is unknown.
+  void AddEdge(const string& src, const string& dst){
+    auto srciter=name2node_.find(src), dstiter=name2node_.find(dst);
+    CHECK(srciter!=name2node_.end())<<"can't find src node "<<src;
+    CHECK(dstiter!=name2node_.end())<<"can't find dst node "<<dst;
+    // reuse the iterators instead of looking the names up a second time
+    AddEdge(srciter->second, dstiter->second);
+  }
+  // Name-based overload; CHECK-fails if either name is unknown.
+  void RemoveEdge(const string &src, const string& dst){
+    auto srciter=name2node_.find(src), dstiter=name2node_.find(dst);
+    CHECK(srciter!=name2node_.end())<<"can't find src node "<<src;
+    CHECK(dstiter!=name2node_.end())<<"can't find dst node "<<dst;
+    // reuse the iterators instead of looking the names up a second time
+    RemoveEdge(srciter->second, dstiter->second);
+  }
+  // Unlink src->dst from both nodes' edge lists.
+  void RemoveEdge(SNode src, SNode dst){
+    src->RemoveDstNode(dst);
+    dst->RemoveSrcNode(src);
+  }
+
+  const vector<SNode>& nodes() const{
+    return nodes_;
+  }
+  // Look up a node by name; CHECK-fails if no node has that name.
+  const SNode& node(string name) const{
+    auto iter=name2node_.find(name);
+    CHECK(iter!=name2node_.end())<<"can't find node "<<name;
+    return iter->second;
+  }
+
+  const string ToString() const;
+  const string ToString(const map<string, string>& info) const ;
+
+  bool Check() const;
+
+  SNode InsertSliceNode(SNode srcnode, const vector<SNode>& dstnodes,
+      const V& info, bool connect_dst=true);
+  SNode InsertConcateNode(const vector<SNode>&srcnodes, SNode dstnode,
+      const V& info);
+  SNode InsertSplitNode(SNode srcnode, const vector<SNode>& dstnodes);
+  std::pair<SNode, SNode> InsertBridgeNode(SNode srcnode, SNode dstnode);
+  void topology_sort_inner(SNode node, map<string, bool> *visited,
+    std::stack<string> *stack);
+
+ private:
+  vector<SNode> nodes_;
+  map<string, SNode> name2node_;
+};
+#endif // INCLUDE_UTILS_GRAPH_H_

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/b2dc51d2/include/utils/param.h
----------------------------------------------------------------------
diff --git a/include/utils/param.h b/include/utils/param.h
new file mode 100644
index 0000000..907ef8c
--- /dev/null
+++ b/include/utils/param.h
@@ -0,0 +1,172 @@
+#ifndef INCLUDE_UTILS_PARAM_H_
+#define INCLUDE_UTILS_PARAM_H_
+#include <vector>
+#include <string>
+#include <map>
+#include <functional>
+#include "proto/model.pb.h"
+#include "utils/blob.h"
+#include "communication/msg.h"
+// Base parameter class.
+namespace singa {
+/**
+ * One model parameter: three float blobs (value, gradient, history) plus the
+ * ParamProto that configures it, and a set of virtual message hooks.
+ */
+class Param {
+ public:
+  Param();
+  virtual ~Param();
+
+  // Hooks that build outgoing request messages. Declared only; the meaning of
+  // arg is up to the implementation.
+  // NOTE(review): get/put/update/sync semantics are inferred from the names.
+  virtual Msg* GenGetMsg(void* arg=nullptr);
+  virtual Msg* GenPutMsg(void* arg=nullptr);
+  virtual Msg* GenUpdateMsg(void* arg=nullptr);
+  virtual Msg* GenSyncMsg(void* arg=nullptr);
+
+  // Hooks for incoming requests (declared only).
+  virtual Msg* HandleGetMsg(Msg** msg);
+  virtual Msg* HandlePutMsg(Msg** msg);
+  virtual int ParseUpdateMsg(Msg** msg);
+  virtual Msg* GenUpdateResponseMsg(void* arg=nullptr);
+  virtual Msg* HandleSyncMsg(Msg** msg);
+
+  // Hooks for incoming responses (declared only).
+  virtual int ParseGetResponseMsg(Msg** msg);
+  virtual int ParsePutResponseMsg(Msg** msg);
+  virtual int ParseUpdateResponseMsg(Msg** msg);
+  virtual int ParseSyncResponseMsg(Msg** msg);
+
+  /**
+   * setup param shape
+   */
+  virtual void Setup(const ParamProto& proto, const std::vector<int>& shape, int fan_in);
+  /*
+   * fill the data according to initmethod, i.e., random/gaussian/fixed value
+   */
+  virtual void Init(int v=0);
+  /**
+   * Make this Param use the value blob of other and record other's id as the
+   * owner. Every dimension of this Param's shape must match the leading
+   * dimensions of other's shape; the CHECKs abort otherwise.
+   */
+  void ShareData(shared_ptr<Param> other){
+    owner_=other->id();
+    const auto& my_shape=data_.shape();
+    const auto& other_shape=other->data_.shape();
+    // The three-iterator std::equal reads my_shape.size() elements from
+    // other_shape, so make sure that many elements exist before comparing.
+    CHECK_LE(my_shape.size(), other_shape.size());
+    CHECK(std::equal(my_shape.begin(), my_shape.end(), other_shape.begin()));
+    data_.ShareData(other->data_);
+  }
+  /** @return learning_rate_multiplier field of the ParamProto. */
+  float learning_rate_multiplier() const {
+    return proto_.learning_rate_multiplier();
+  }
+  /** @return weight_decay_multiplier field of the ParamProto. */
+  float weight_decay_multiplier() const {
+    return proto_.weight_decay_multiplier();
+  }
+  /*
+  const int split_threshold(){
+    return proto_.split_threshold();
+  }
+  */
+  /**
+   * if the Param shares data with others, then point to the owner.
+   * otherwise points to itself.
+   */
+  int owner() const{
+    return owner_;
+  }
+  /** @return name stored in the ParamProto (not the name_ member). */
+  const std::string& name() const {
+    return proto_.name();
+  }
+
+  int id() const{
+    return proto_.id();
+  }
+  void set_id(int id){
+    proto_.set_id(id);
+  }
+
+  int version() const {
+    return proto_.version(); // TODO store version in data blob
+  }
+  void set_version(int v) {
+    proto_.set_version(v); // TODO read version from data blob
+  }
+   /**
+    * @return num of floats.
+    */
+  int size() const {
+    return data_.count();
+  }
+  /**
+   * @return read-only reference to the value blob of this parameter
+   */
+  const Blob<float> &data() const {
+    return data_;
+  }
+  Blob<float> *mutable_data() {
+    return &data_;
+  }
+  /**
+   * Return gradient of this parameter
+   */
+  const Blob<float> &grad() const {
+    return grad_;
+  }
+  Blob<float> *mutable_grad() {
+    return &grad_;
+  }
+
+  /**
+   * Return the history blob of this parameter
+   */
+  const Blob<float> &history() const {
+    return history_;
+  }
+  Blob<float> *mutable_history() {
+    return &history_;
+  }
+
+  // Raw writable CPU pointers into the three blobs.
+  float* mutable_cpu_data(){
+    return data_.mutable_cpu_data();
+  }
+  float* mutable_cpu_grad(){
+    return grad_.mutable_cpu_data();
+  }
+  float* mutable_cpu_history(){
+    return history_.mutable_cpu_data();
+  }
+ protected:
+  /**
+   * name of the parameter used to share weights between neuralnets
+   */
+  std::string name_;
+  //! content, gradient, history gradient of this parameter
+  Blob<float> data_, grad_, history_;
+  //! id of the Param whose data blob is used; written by ShareData()
+  int owner_;
+
+  ParamProto proto_;
+  int fan_in_;
+};
+/**
+ * Sync with server by randomly sampling some parameters for every sync.
+class RandomSyncParam: public Param{
+ public:
+  virtual zmsg_t* HandleSyncMsg(zmsg_t** msg);
+  virtual zmsg_t *GenSyncMsgFromWorker(float sample_ratio);
+  virtual void ParseSyncMsgFromPS(zmsg_t** msg);
+  virtual void Setup(const ParamProto& proto, const vector<int>& shape, int fan_in);
+  virtual void Init();
+
+  float* mutable_cpu_snapshot(){
+    return snapshot_.mutable_cpu_data();
+  }
+  const float* cpu_snapshot(){
+    return snapshot_.cpu_data();
+  }
+
+ protected:
+  const vector<int> RandomSample(int seed, int m, int n);
+
+
+  Blob<float> snapshot_;
+};
+ */
+/**
+ * Sync with server by elastic SGD see http://arxiv.org/abs/1412.6651.
+class ElasticParam: public Param{
+ public:
+  virtual zmsg_t* HandleSyncMsg(zmsg_t** msg);
+  virtual zmsg_t *GenSyncMsgFromWorker(float moving_rate);
+  virtual void ParseSyncMsgFromPS(zmsg_t** msg);
+};
+ */
+
+
+}  // namespace singa
+
+#endif  // INCLUDE_UTILS_PARAM_H_

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/b2dc51d2/include/utils/singleton.h
----------------------------------------------------------------------
diff --git a/include/utils/singleton.h b/include/utils/singleton.h
new file mode 100644
index 0000000..2e2bdfb
--- /dev/null
+++ b/include/utils/singleton.h
@@ -0,0 +1,41 @@
+#ifndef INCLUDE_UTILS_SINGLETON_H_
+#define INCLUDE_UTILS_SINGLETON_H_
+
+/**
+ * Lazily created shared instance of T.
+ * The first call to Instance() default-constructs T on the heap; later calls
+ * return the same pointer. The object is never deleted here and no locking is
+ * done around the creation.
+ */
+template<typename T>
+class Singleton {
+ public:
+  static T* Instance() {
+    if (data_!=nullptr)
+      return data_;
+    data_ = new T();
+    return data_;
+  }
+ private:
+  static T* data_;
+};
+
+template<typename T> T* Singleton<T>::data_ = nullptr;
+
+
+/**
+ * Singleton whose instance is constructed from one argument.
+ * Instance(x) builds T(x) on the first call only; the argument of later calls
+ * is ignored. Instance() just returns the current pointer, which is nullptr
+ * until Instance(x) has been called. No locking is done here.
+ */
+template<typename T, typename X=int>
+class ASingleton {
+ public:
+  static T* Instance(){
+    return data_;
+  }
+  static T* Instance(X x) {
+    if (data_!=nullptr)
+      return data_;
+    data_ = new T(x);
+    return data_;
+  }
+ private:
+  static T* data_;
+};
+
+template<typename T, typename X> T* ASingleton<T,X>::data_ = nullptr;
+
+#endif // INCLUDE_UTILS_SINGLETON_H_

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/b2dc51d2/include/utils/updater.h
----------------------------------------------------------------------
diff --git a/include/utils/updater.h b/include/utils/updater.h
new file mode 100644
index 0000000..2a6dd43
--- /dev/null
+++ b/include/utils/updater.h
@@ -0,0 +1,78 @@
+#ifndef INCLUDE_UTILS_UPDATER_H_
+#define INCLUDE_UTILS_UPDATER_H_
+#include "proto/model.pb.h"
+#include "utils/param.h"
+
+namespace singa{
+/**
+ * Updater for Param.
+ *
+ * Abstract base: subclasses implement Update(); Init() stores the
+ * UpdaterProto for later use.
+ */
+class Updater{
+ public:
+  // Subclasses are handled through Updater pointers, so deleting through the
+  // base type must run the derived destructor.
+  virtual ~Updater(){}
+  /** Keep a copy of the updater configuration. */
+  virtual void Init(const UpdaterProto &proto){
+    proto_=proto;
+  }
+  /**
+   * Apply one update to param.
+   * @param step current step number
+   * @param param parameter to update
+   * @param grad_scale factor supplied by the caller, default 1.0
+   */
+  virtual void Update(int step, shared_ptr<Param> param, float grad_scale=1.0f)=0;
+
+  /** Learning rate for the given step (defined out of line). */
+  float GetLearningRate(int step);
+ protected:
+  UpdaterProto proto_;
+};
+/**
+ * Updater subclass keeping a base learning rate, momentum and weight decay.
+ * Init() and Update() are defined out of line.
+ */
+class SGDUpdater : public Updater{
+ public:
+  void Init(const UpdaterProto& proto) override;
+  void Update(int step, shared_ptr<Param> param, float grad_scale=1.0f) override;
+
+ protected:
+  float base_lr_;
+  float momentum_;
+  float weight_decay_;
+};
+/**
+ * Updater subclass keeping a base learning rate, momentum and weight decay.
+ * Init() and Update() are defined out of line.
+ */
+class NesterovUpdater : public Updater{
+ public:
+  void Init(const UpdaterProto& proto) override;
+  void Update(int step, shared_ptr<Param> param, float grad_scale=1.0f) override;
+
+ protected:
+  float base_lr_;
+  float momentum_;
+  float weight_decay_;
+};
+/**
+ * Updater subclass keeping a base learning rate, a delta term and weight
+ * decay. Init() and Update() are defined out of line.
+ */
+class AdaGradUpdater : public Updater{
+ public:
+  void Init(const UpdaterProto& proto) override;
+  void Update(int step, shared_ptr<Param> param, float grad_scale=1.0f) override;
+
+ protected:
+  float base_lr_;
+  float delta_;
+  float weight_decay_;
+};
+
+/**
+ * Updater subclass keeping a base learning rate, delta, rho and weight decay.
+ * Init() and Update() are defined out of line.
+ */
+class RMSPropUpdater : public Updater{
+ public:
+  void Init(const UpdaterProto& proto) override;
+  void Update(int step, shared_ptr<Param> param, float grad_scale=1.0f) override;
+
+ protected:
+  float base_lr_;
+  float delta_;
+  float rho_;
+  float weight_decay_;
+};
+
+/*
+class AdaDeltaUpdater : public Updater{
+ public:
+  virtual void Init(const UpdaterProto& proto);
+  virtual void Update(int step, shared_ptr<Param> param, float grad_scale=1.0f);
+
+ protected:
+  float rho_;
+  float delta_;
+  float weight_decay_;
+};
+*/
+}
+
+#endif // INCLUDE_UTILS_UPDATER_H_

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/b2dc51d2/script/node.sh
----------------------------------------------------------------------
diff --git a/script/node.sh b/script/node.sh
new file mode 100755
index 0000000..74e0d8a
--- /dev/null
+++ b/script/node.sh
@@ -0,0 +1,71 @@
+#!/bin/bash
+if [[ $# < 2 || ! -f $2 ]]
+then
+  echo "Usage: process/folder management"
+  echo "[cat, create, delete, kill, ls, ps, reset, scp, ssh] hostfile [args]"
+  echo "   cat hostfile file--- cat the file on every node in hostfile"
+  echo "   create hostfile folder--- create the folder on every node in hostfile"
+  echo "   delete hostfile folder--- delete the folder on every node in hostfile"
+  echo "   kill hostfile job_name---  kill the job on every node in hostfile"
+  echo "   ls hostfile folder--- list the folder on every node in hostfile"
+  echo "   ps hostfile job_name---  ps aux|grep job_name on every node in hostfile"
+  echo "   reset hostfile folder--- delete and create the folder on every node in hostfile"
+  echo "   scp hostfile local_dir [remote_dir]--- copy the local_dir to remote_dir on every node in hostfile, if remote_dir is omitted, remote_dir=local_dir"
+  echo "   ssh hostfile--- test whether the nodes in hostfile are alive"
+  echo "each line in hostfile is a node name followed by a space and other fields"
+  exit
+fi
+
+ssh_options="-oStrictHostKeyChecking=no \
+-oUserKnownHostsFile=/dev/null \
+-oLogLevel=quiet"
+
+hosts=(`cat $2 |cut -d ' ' -f 1`)
+
+for i in ${hosts[@]}
+do
+  if [ $1 == "cat" ]
+  then
+    cmd="cat $3"
+  elif [ $1 == "create" -o $1 == "reset" ]
+  then
+    cmd="mkdir -p $3"
+  elif [ $1 == "delete" -o $1 == "reset" ]
+  then
+    cmd="rm -rf $3"
+  elif [ $1 == "kill" ]
+  then
+    cmd="ps ax|pgrep $3 |xargs kill"
+  elif [ $1 == "ls" ]
+  then
+    cmd="ls -l $3"
+  elif [ $1 == "scp" ]
+  then
+    local_dir=$3
+    remote_dir=$3
+    if [ $# -eq 4 ]
+    then
+      remote_dir=$4
+    fi
+    r=''
+    if [[ -d $3 ]]
+    then
+      r='-r'
+    fi
+    echo "scp $r $local_dir $i:$remote_dir"
+    scp $r $local_dir $i:$remote_dir
+  elif [ $1 == "ssh" ]
+  then
+    cmd="exit"
+  elif [ $1 == "ps" ]
+  then
+    cmd="ps ax|pgrep $3"
+  else
+    echo "Incorrect commands:" $1
+  fi
+  if [ $1 != "scp" ]
+  then
+    echo $cmd
+    ssh $i $cmd
+  fi
+done

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/b2dc51d2/src/communication/msg.cc
----------------------------------------------------------------------
diff --git a/src/communication/msg.cc b/src/communication/msg.cc
new file mode 100644
index 0000000..80f2304
--- /dev/null
+++ b/src/communication/msg.cc
@@ -0,0 +1,5 @@
+#include "communication/msg.h"
+
+namespace singa {
+} /* singa */
+

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/b2dc51d2/src/communication/socket.cc
----------------------------------------------------------------------
diff --git a/src/communication/socket.cc b/src/communication/socket.cc
new file mode 100644
index 0000000..279d758
--- /dev/null
+++ b/src/communication/socket.cc
@@ -0,0 +1,118 @@
+#include "communication/socket.h"
+
+namespace singa {
+// Create a zpoller with no sockets attached; sockets are registered through
+// Add().
+Poller::Poller(){
+  poller_=zpoller_new(NULL);
+}
+
+// Register socket with the poller and remember which Socket wraps the
+// underlying zsock so Wait() can map it back.
+void Poller::Add(Socket* socket){
+  zsock_t* zsock=static_cast<zsock_t*>(socket->InternalID());
+  zpoller_add(poller_, zsock);
+  zsock2Socket_[zsock]=socket;
+}
+
+// Block on the poller for up to timeout and return the Socket registered for
+// the zsock that zpoller_wait reports, or nullptr when it reports none.
+Socket* Poller::Wait(int timeout){
+  zsock_t* ready=static_cast<zsock_t*>(zpoller_wait(poller_, timeout));
+  if(ready==NULL)
+    return nullptr;
+  return zsock2Socket_[ready];
+}
+
+// Create a ZMQ_DEALER socket (aborting if creation fails) and a poller that
+// watches it.
+Dealer::Dealer(int id):id_(id){
+  dealer_=zsock_new(ZMQ_DEALER);
+  CHECK_NOTNULL(dealer_);
+  poller_=zpoller_new(dealer_);
+}
+
+// Connect the dealer socket to endpoint; an empty endpoint is a no-op.
+// Aborts if zsock_connect fails. Always returns 1.
+int Dealer::Connect(string endpoint){
+  if(!endpoint.empty()){
+    CHECK_EQ(zsock_connect(dealer_,endpoint.c_str()),0);
+  }
+  return 1;
+}
+// Serialise msg into a zmsg, send it on the dealer socket and delete msg
+// (the caller gives up ownership). Always returns 1.
+int Dealer::Send(Msg *msg){
+  zmsg_t* frames=msg->DumpToZmsg();
+  zmsg_send(&frames, dealer_);
+  delete msg;
+  return 1;
+}
+
+// Receive one zmsg from the dealer socket and wrap it in a newly allocated
+// Msg that the caller owns. Returns nullptr when zmsg_recv returns NULL.
+Msg* Dealer::Receive(){
+  zmsg_t* incoming=zmsg_recv(dealer_);
+  if(incoming!=NULL){
+    Msg* parsed=new Msg();
+    parsed->ParseFromZmsg(incoming);
+    return parsed;
+  }
+  return nullptr;
+}
+// Release the poller created in the constructor, then the socket it watches.
+Dealer::~Dealer(){
+  // zpoller_destroy nulls the pointer, so a second destroy is harmless.
+  zpoller_destroy(&poller_);
+  zsock_destroy(&dealer_);
+}
+
+// Create a ZMQ_ROUTER socket (aborting if creation fails) and a poller that
+// watches it. bufsize is the upper bound enforced on nBufmsg_ in Send().
+Router::Router(int bufsize){
+  nBufmsg_=0;
+  bufsize_=bufsize;
+  router_=zsock_new(ZMQ_ROUTER);
+  CHECK_NOTNULL(router_);
+  poller_=zpoller_new(router_);
+}
+// Bind the router socket to endpoint; an empty endpoint is a no-op.
+// Aborts if zsock_bind does not return 0. Always returns 1.
+int Router::Bind(string endpoint){
+  if(!endpoint.empty()){
+    CHECK_EQ(zsock_bind(router_, endpoint.c_str()),0);
+  }
+  return 1;
+}
+
+// Send msg to the dealer identified by msg->dst(). If that dealer has not
+// been seen by Receive() yet, the serialised message is buffered instead;
+// the buffered count must stay within bufsize_. msg is deleted in both
+// cases. Always returns 1.
+int Router::Send(Msg *msg){
+  zmsg_t* zmsg=msg->DumpToZmsg();
+  int dstid=msg->dst();
+  auto known=id2addr_.find(dstid);
+  if(known==id2addr_.end()){
+    // the connection is not ready, buffer the message
+    if(bufmsg_.empty())
+      nBufmsg_=0;
+    bufmsg_[dstid].push_back(zmsg);
+    nBufmsg_++;
+    CHECK_LE(nBufmsg_, bufsize_);
+  }else{
+    // the connection has already been set up: prepend a copy of the stored
+    // address frame and send
+    zframe_t* addr=zframe_dup(known->second);
+    zmsg_prepend(zmsg, &addr);
+    zmsg_send(&zmsg, router_);
+  }
+  delete msg;
+  return 1;
+}
+
+// Receive one message from the router socket and return it as a newly
+// allocated Msg owned by the caller (nullptr when zmsg_recv returns NULL).
+// The first frame is popped and kept as the sender's address. The first time
+// a sender is seen, every message buffered for it by Send() is flushed.
+Msg* Router::Receive(){
+  zmsg_t* zmsg=zmsg_recv(router_);
+  if(zmsg==NULL)
+    return nullptr;
+  zframe_t* dealer=zmsg_pop(zmsg);
+  Msg* msg=new Msg();
+  msg->ParseFromZmsg(zmsg);
+  const auto src=msg->src();
+  if (id2addr_.find(src)==id2addr_.end()){
+    // new connection, store the sender's identifier and send buffered messages
+    // for it
+    id2addr_[src]=dealer;
+    auto pending=bufmsg_.find(src);
+    if(pending!=bufmsg_.end()){
+      for(auto& buffered: pending->second){
+        zframe_t* addr=zframe_dup(dealer);
+        zmsg_prepend(buffered, &addr);
+        zmsg_send(&buffered, router_);
+      }
+      // Flushed messages no longer count against bufsize_; without this the
+      // counter only drops when every destination's buffer is empty.
+      nBufmsg_-=pending->second.size();
+      bufmsg_.erase(pending);
+    }
+  }
+  else
+    // address already known: the popped frame is not stored, so free it
+    zframe_destroy(&dealer);
+  return msg;
+}
+
+// Release the poller, the socket, every stored address frame and every
+// message still buffered for dealers that never connected.
+Router::~Router(){
+  // The poller was created in the constructor and references router_, so it
+  // goes first. zpoller_destroy nulls the pointer.
+  zpoller_destroy(&poller_);
+  zsock_destroy(&router_);
+  for(auto& entry: id2addr_)
+    zframe_destroy(&entry.second);
+  for(auto& entry: bufmsg_){
+    for(auto& msg: entry.second)
+      zmsg_destroy(&msg);
+  }
+}
+} /* singa */

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/b2dc51d2/src/main.cc
----------------------------------------------------------------------
diff --git a/src/main.cc b/src/main.cc
new file mode 100644
index 0000000..89306d8
--- /dev/null
+++ b/src/main.cc
@@ -0,0 +1,49 @@
+#include <gflags/gflags.h>
+#include <glog/logging.h>
+#include "trainer/trainer.h"
+
+/**
+ * \file main.cc is the main entry of SINGA, like the driver program for Hadoop.
+ *
+ * 1. Users register their own implemented classes, e.g., layer, updater, etc.
+ * 2. Users prepare the google protobuf object for the model configuration and
+ * the cluster configuration.
+ * 3. Users call trainer to start the training.
+ *
+ * TODO
+ * 1. Add the resume function to continue training from a previously stopped
+ * point.
+ * 2. Add helper functions for users to configure their model and cluster
+ * easily, e.g., AddLayer(layer_type, source_layers, meta_data).
+ */
+
+DEFINE_int32(procsID, 0, "Global process ID");
+DEFINE_string(cluster, "examples/mnist/cluster.conf", "Cluster config file");
+DEFINE_string(model, "examples/mnist/conv.conf", "Model config file");
+
+/**
+ * Register layers, and other customizable classes.
+ *
+ * If users want to use their own implemented classes, they should register
+ * them here. Refer to the Worker::RegisterDefaultClasses()
+ *
+ * The body is intentionally empty and proto is currently unused.
+ */
+void RegisterClasses(const singa::ModelProto& proto){
+}
+
+/**
+ * Program entry: initialise logging and flags, read the cluster and model
+ * configurations from the text files named by --cluster and --model, log
+ * both, register user classes and hand over to the Trainer with the process
+ * id given by --procsID.
+ */
+int main(int argc, char **argv) {
+  // TODO set log dir
+  google::InitGoogleLogging(argv[0]);
+  gflags::ParseCommandLineFlags(&argc, &argv, true);
+
+  // NOTE(review): the results of ReadProtoFromTextFile are not checked here;
+  // confirm the helper itself aborts on a missing or malformed file.
+  singa::ClusterProto cluster;
+  singa::ReadProtoFromTextFile(FLAGS_cluster.c_str(), &cluster);
+  singa::ModelProto model;
+  singa::ReadProtoFromTextFile(FLAGS_model.c_str(), &model);
+  LOG(INFO)<<"The cluster config is\n"<<cluster.DebugString();
+  LOG(INFO)<<"The model config is\n"<<model.DebugString();
+
+  RegisterClasses(model);
+  singa::Trainer trainer;
+  trainer.Start(model, cluster, FLAGS_procsID);
+  return 0;
+}

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/b2dc51d2/src/neuralnet/base_layer.cc
----------------------------------------------------------------------
diff --git a/src/neuralnet/base_layer.cc b/src/neuralnet/base_layer.cc
new file mode 100644
index 0000000..50fc396
--- /dev/null
+++ b/src/neuralnet/base_layer.cc
@@ -0,0 +1,194 @@
+#include <opencv2/highgui/highgui.hpp>
+#include <opencv2/imgproc/imgproc.hpp>
+#include <cblas.h>
+#include <math.h>
+#include <cfloat>
+#include "neuralnet/base_layer.h"
+namespace singa {
+/*****************************************************************************
+ * Implementation for Layer
+ *****************************************************************************/
+// Store a copy of the layer configuration.
+void Layer::Init(const LayerProto &proto) {
+  layer_proto_=proto;
+}
+
+// Initialise from another layer: copy its configuration and reshape the data
+// and gradient blobs to shape.
+void Layer::Init(const Layer& other, const vector<int>& shape){
+  data_.Reshape(shape);
+  grad_.Reshape(shape);
+  layer_proto_=other.layer_proto_;
+}
+// Convenience overload: set up with the stored configuration and source
+// layers.
+void Layer::Setup(){
+  Setup(layer_proto_, srclayers_);
+}
+// Re-run setup after partitioning using the current data shape, then verify
+// that the old shape still matches the leading dimensions of the new one.
+// Aborts with the layer name and both shapes otherwise.
+void Layer::SetupAfterPartition(){
+  vector<int> shape=data_.shape();
+  SetupAfterPartition(layer_proto_, shape, srclayers_);
+  //LOG(ERROR)<<name()<<":"<<IntVecToString(shape_);
+  const vector<int>& newshape=data_.shape();
+  // The three-iterator std::equal reads shape.size() elements from newshape,
+  // so it must be at least that long.
+  CHECK_GE(newshape.size(), shape.size())<<name()
+    <<IntVecToString(shape)<<"--"<<IntVecToString(newshape);
+  CHECK(std::equal(shape.begin(), shape.end(), newshape.begin()))<<name()
+    <<IntVecToString(shape)<<"--"<<IntVecToString(newshape);
+}
+// Convenience overload: compute features from the stored source layers.
+void Layer::ComputeFeature(bool training){
+  ComputeFeature(training, srclayers_);
+}
+// Convenience overload: compute gradients using the stored source layers.
+void Layer::ComputeGradient(){
+  ComputeGradient(srclayers_);
+}
+
+// Not implemented: the body is empty, proto is left untouched.
+void Layer::ToProto(LayerProto *proto, bool copyData) {
+}
+// Requires exactly one source layer; data and gradient blobs take the shape
+// of that layer's data. proto is not read.
+void BridgeSrcLayer::Setup(const LayerProto& proto,
+    const vector<SLayer>& srclayers){
+  CHECK_EQ(srclayers.size(),1);
+  data_.Reshape(srclayers[0]->data(this).shape());
+  grad_.ReshapeLike(data_);
+}
+// Partitioning needs no special handling here: just run Setup again.
+void BridgeSrcLayer::SetupAfterPartition(){
+  Setup(layer_proto_, srclayers_);
+  //LOG(ERROR)<<name()<<":"<<IntVecToString(shape_);
+}
+
+// Only updates the ready_ flag: cleared in training mode, set otherwise.
+// srclayers is not read.
+void BridgeSrcLayer::ComputeFeature(bool training,
+    const vector<SLayer>& srclayers){
+  ready_=training?false:true;
+}
+// Intentionally empty: no gradient is computed here.
+void BridgeSrcLayer::ComputeGradient(const vector<SLayer>& srclayers){
+
+}
+// Requires exactly one source layer; data and gradient blobs take the shape
+// of that layer's data. proto is not read.
+void BridgeDstLayer::Setup(const LayerProto& proto,
+    const vector<SLayer>& srclayers){
+  CHECK_EQ(srclayers.size(),1);
+  data_.Reshape(srclayers[0]->data(this).shape());
+  grad_.ReshapeLike(data_);
+}
+// Partitioning needs no special handling here: just run Setup again.
+void BridgeDstLayer::SetupAfterPartition(){
+  Setup(layer_proto_, srclayers_);
+  //LOG(ERROR)<<name()<<":"<<IntVecToString(shape_);
+}
+
+// Only updates the ready_ flag: set in training mode, cleared otherwise.
+// srclayers is not read.
+void BridgeDstLayer::ComputeFeature(bool training,
+    const vector<SLayer>& srclayers){
+  ready_=training?true:false;
+}
+// Intentionally empty: no gradient is computed here.
+void BridgeDstLayer::ComputeGradient(const vector<shared_ptr<Layer>>& srclayers){
+
+}
+
+/*******************************
+ * Implementation for ConcateLayer
+ *******************************/
+// Compute the output shape for concatenating all source layers along the
+// configured dimension: that dimension is the sum over the sources, every
+// other dimension must be identical across sources. Needs at least two
+// source layers. Data and gradient blobs are reshaped to the result.
+void ConcateLayer::Setup(const LayerProto& proto,
+    const vector<SLayer>& srclayers){
+  // Validate the signed value first; converting to size_t before the check
+  // would make a negative dimension look like a huge valid one.
+  const int raw_dim=proto.concate_param().concate_dimension();
+  CHECK_GE(raw_dim,0);
+  CHECK_GT(srclayers.size(),1);
+  vector<int> shape=srclayers[0]->data(this).shape();
+  const size_t concate_dim=static_cast<size_t>(raw_dim);
+  // The dimension must exist, otherwise nothing would be concatenated.
+  CHECK_LT(concate_dim, shape.size());
+  for(size_t i=1;i<srclayers.size();i++){
+    const vector<int>& srcshape=srclayers[i]->data(this).shape();
+    // Same rank is required before indexing srcshape with j below.
+    CHECK_EQ(srcshape.size(), shape.size());
+    for(size_t j=0;j<shape.size();j++){
+      if(j==concate_dim)
+        shape[j]+=srcshape[j];
+      else
+        CHECK_EQ(shape[j], srcshape[j]);
+    }
+  }
+  data_.Reshape(shape);
+  grad_.Reshape(shape);
+}
+
+// Partitioning needs no special handling here: just run Setup again.
+void ConcateLayer::SetupAfterPartition(){
+  Setup(layer_proto_, srclayers_);
+//  LOG(ERROR)<<name()<<":"<<IntVecToString(shape_);
+}
+
+// Intentionally empty: no data is copied here.
+void ConcateLayer::ComputeFeature(bool training, const vector<SLayer>& srclayers){}
+
+// Intentionally empty: no gradient is computed here.
+void ConcateLayer::ComputeGradient(const vector<shared_ptr<Layer>>& srclayers){}
+/*****************************************************************************
+ * Implementation for SliceLayer
+ *****************************************************************************/
+// Shape the layer like its first source and allocate one data/gradient blob
+// per slice. The slice dimension is divided evenly among slice_num slices;
+// the last slice additionally takes the remainder. slice_num must equal the
+// number of destination layers.
+void SliceLayer::Setup(const LayerProto& proto,
+    const vector<SLayer>& srclayers){
+  int slice_dim=proto.slice_param().slice_dimension();
+  int slice_num=proto.slice_param().slice_num();
+  CHECK_GE(slice_dim,0);
+  CHECK_EQ(slice_num, dstlayers_.size());
+  // srclayers[0] is dereferenced below.
+  CHECK(!srclayers.empty());
+  data_.Reshape(srclayers[0]->data(this).shape());
+  grad_.ReshapeLike(data_);
+  // slice_dim indexes the shape vector below, so it must be a valid axis.
+  CHECK_LT(static_cast<size_t>(slice_dim), data_.shape().size());
+  datavec_.resize(slice_num);
+  gradvec_.resize(slice_num);
+  //LOG(ERROR)<<"slice dim "<<slice_dim<<" slice num "<<slice_num;
+  for(int i=0;i<slice_num;i++){
+    vector<int> newshape(data_.shape());
+    const int full=newshape[slice_dim];
+    // the last slice absorbs the remainder of the division
+    const int extra=(i==slice_num-1)?full%slice_num:0;
+    newshape[slice_dim]=full/slice_num+extra;
+    datavec_[i].Reshape(newshape);
+    gradvec_[i].Reshape(newshape);
+    //LOG(ERROR)<<"slice "<<IntVecToString(newshape);
+  }
+}
+
+// Partitioning needs no special handling here: just run Setup again.
+void SliceLayer::SetupAfterPartition(){
+  Setup(layer_proto_, srclayers_);
+  //LOG(ERROR)<<name()<<":"<<IntVecToString(shape_);
+}
+
+
+// Index of the slice that belongs to the given destination layer, i.e. the
+// position of layer in dstlayers_. Aborts if layer is null or not found.
+int SliceLayer::SliceID(const Layer* layer) const {
+  CHECK(layer!= nullptr);
+  size_t idx=0;
+  while(idx<datavec_.size()){
+    if(dstlayers_[idx].get() == layer)
+      return idx;
+    idx++;
+  }
+  CHECK(false);
+  return -1;
+}
+
+// Whole data blob when layer is null, otherwise the slice assigned to that
+// destination layer.
+const Blob<float>& SliceLayer::data(const Layer* layer) const {
+  if(layer!=nullptr)
+    return datavec_[SliceID(layer)];
+  return data_;
+}
+// Whole gradient blob when layer is null, otherwise the gradient slice
+// assigned to that destination layer.
+const Blob<float>& SliceLayer::grad(const Layer* layer) const {
+  if(layer!=nullptr)
+    return gradvec_[SliceID(layer)];
+  return grad_;
+}
+// Writable counterpart of data(): pointer to the whole blob when layer is
+// null, otherwise to the slice of that destination layer.
+Blob<float>* SliceLayer::mutable_data(const Layer* layer) {
+  if(layer!=nullptr)
+    return &datavec_[SliceID(layer)];
+  return &data_;
+}
+// Writable counterpart of grad(): pointer to the whole gradient blob when
+// layer is null, otherwise to the gradient slice of that destination layer.
+Blob<float>* SliceLayer::mutable_grad(const Layer* layer){
+  if(layer!=nullptr)
+    return &gradvec_[SliceID(layer)];
+  return &grad_;
+}
+// Intentionally empty: no data is copied into the slices here.
+void SliceLayer::ComputeFeature(bool training, const vector<shared_ptr<Layer>>& srclayers){}
+// Intentionally empty: no gradient is computed here.
+void SliceLayer::ComputeGradient(const vector<shared_ptr<Layer>>& srclayers){}
+
+// Requires exactly one source layer; both the data and the gradient blob take
+// the shape of that layer's data. proto is not read.
+void SplitLayer::Setup(const LayerProto& proto,
+    const vector<SLayer>& srclayers){
+  CHECK_EQ(srclayers.size(),1);
+  data_.Reshape(srclayers[0]->data(this).shape());
+  grad_.Reshape(srclayers[0]->data(this).shape());
+}
+
+// Partitioning needs no special handling here: just run Setup again.
+void SplitLayer::SetupAfterPartition(){
+  Setup(layer_proto_, srclayers_);
+  //LOG(ERROR)<<name()<<":"<<IntVecToString(shape_);
+}
+// Intentionally empty: no data is copied here.
+void SplitLayer::ComputeFeature(bool training, const vector<shared_ptr<Layer>>& srclayers){
+
+}
+// Intentionally empty: no gradient is computed here.
+void SplitLayer::ComputeGradient(const vector<shared_ptr<Layer>>& srclayers){
+
+}
+
+}  // namespace singa
+

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/b2dc51d2/src/neuralnet/layer.cc
----------------------------------------------------------------------
diff --git a/src/neuralnet/layer.cc b/src/neuralnet/layer.cc
new file mode 100644
index 0000000..d45bcc0
--- /dev/null
+++ b/src/neuralnet/layer.cc
@@ -0,0 +1,781 @@
+#include <glog/logging.h>
+#include <memory>
+#include <algorithm>
+#include <opencv2/highgui/highgui.hpp>
+#include <opencv2/imgproc/imgproc.hpp>
+#include "mshadow/tensor.h"
+#include "mshadow/cxxnet_op.h"
+#include "neuralnet/layer.h"
+#include "utils/singleton.h"
+#include "utils/factory.h"
+
+using namespace mshadow;
+using namespace mshadow::expr;
+
+namespace singa {
+
+/************ Implementation for ConvolutionLayer*************************/
+// Read kernel/pad/stride/num_filters from the convolution config, derive the
+// output and column-buffer sizes from the single source layer's shape, and
+// create the weight (num_filters x channels*kernel*kernel) and bias
+// (num_filters) parameters. The source shape must have more than two
+// dimensions; its last two are taken as height and width, its first as the
+// batch size.
+void ConvolutionLayer::Setup(const LayerProto& proto,
+      const vector<SLayer>& srclayers){
+  CHECK_EQ(srclayers.size(),1);
+  ConvolutionProto conv_param=proto.convolution_param();
+  kernel_=conv_param.kernel();
+  CHECK_GT(kernel_, 0) << "Filter size cannot be zero.";
+  pad_=conv_param.pad();
+  stride_=conv_param.stride();
+  // stride_ is the divisor in the output-size formula below.
+  CHECK_GT(stride_, 0) << "Stride must be positive.";
+  num_filters_=conv_param.num_filters();
+  const vector<int>& srcshape=srclayers[0]->data(this).shape();
+  int dim=srcshape.size();
+  CHECK_GT(dim, 2);
+  width_=srcshape[dim-1];
+  height_=srcshape[dim-2];
+  // a 3-d source has no channel axis: treat it as a single channel
+  channels_=(dim>3)?srcshape[dim-3]:1;
+  batchsize_=srcshape[0];
+  conv_height_=(height_ + 2 * pad_ - kernel_) / stride_ + 1;
+  conv_width_= (width_ + 2 * pad_ - kernel_) / stride_ + 1;
+  col_height_=channels_*kernel_*kernel_;
+  col_width_=conv_height_*conv_width_;
+  vector<int> shape{batchsize_, num_filters_, conv_height_, conv_width_};
+  data_.Reshape(shape);
+  grad_.Reshape(shape);
+  col_data_.Reshape(vector<int>{col_height_, col_width_});
+  col_grad_.Reshape(vector<int>{col_height_, col_width_});
+
+  Factory<Param>* factory=Singleton<Factory<Param>>::Instance();
+  weight_=shared_ptr<Param>(factory->Create("Param"));
+  weight_->Setup(proto.param(0), vector<int>{num_filters_, col_height_}, col_height_);
+  bias_=shared_ptr<Param>(factory->Create("Param"));
+  bias_->Setup(proto.param(1), vector<int>{num_filters_},0);
+}
+
+// Run Setup on a copy of proto whose num_filters is replaced by shape[1].
+void ConvolutionLayer::SetupAfterPartition(const LayerProto& proto,
+      const vector<int> &shape,
+      const vector<SLayer>& srclayers){
+  LayerProto partitioned(proto);
+  partitioned.mutable_convolution_param()->set_num_filters(shape[1]);
+  Setup(partitioned, srclayers);
+}
+
+// Forward pass. For every image in the batch the (optionally padded) input is
+// unpacked into the column buffer and multiplied by the weight matrix; the
+// bias is then added per filter. The training flag is not read.
+void ConvolutionLayer::ComputeFeature(bool training, const vector<SLayer>& srclayers){
+  // Tensor views over the existing blobs; no data is copied here.
+  Tensor<cpu, 4> src(srclayers[0]->mutable_data(this)->mutable_cpu_data(),
+      Shape4(batchsize_, channels_, height_, width_));
+  Tensor<cpu, 3> data(data_.mutable_cpu_data(),
+      Shape3(batchsize_, num_filters_, conv_height_* conv_width_));
+  Tensor<cpu, 2> col(col_data_.mutable_cpu_data(),
+      Shape2(col_height_, col_width_));
+  Tensor<cpu, 2> weight(weight_->mutable_cpu_data(),
+      Shape2(num_filters_, col_height_));
+  Tensor<cpu, 1> bias(bias_->mutable_cpu_data(),
+      Shape1(num_filters_));
+
+  for(int n=0;n<batchsize_;n++){
+    if(pad_>0)
+      col=unpack_patch2col(pad(src[n], pad_), kernel_, stride_);
+    else
+      col=unpack_patch2col(src[n], kernel_, stride_);
+    data[n]=dot(weight, col);
+  }
+  // bias is indexed by dimension 1 of data, i.e. by filter
+  data+=broadcast<1>(bias, data.shape);
+}
+
+// Backward pass. Computes the bias gradient (sum of grad_ over everything but
+// the filter dimension) and the weight gradient (accumulated over the batch),
+// and, when the source layer exposes a gradient blob, the gradient with
+// respect to the input.
+void ConvolutionLayer::ComputeGradient(const vector<SLayer>& srclayers) {
+  Tensor<cpu, 4> src(srclayers[0]->mutable_data(this)->mutable_cpu_data(),
+      Shape4(batchsize_, channels_, height_, width_));
+  Tensor<cpu, 2> col(col_data_.mutable_cpu_data(),
+      Shape2(col_height_, col_width_));
+  Tensor<cpu, 2> weight(weight_->mutable_cpu_data(),
+      Shape2(num_filters_, col_height_));
+
+  // gsrc only gets a data pointer when the source has a gradient blob; it is
+  // not written otherwise (see the nullptr test inside the loop).
+  Blob<float>* gsrcblob=srclayers[0]->mutable_grad(this);
+  Tensor<cpu, 4> gsrc(Shape4(batchsize_, channels_, height_, width_));
+  if(gsrcblob!=nullptr)
+    gsrc.dptr=gsrcblob->mutable_cpu_data();
+  Tensor<cpu, 3> grad(grad_.mutable_cpu_data(),
+      Shape3(batchsize_, num_filters_, conv_height_* conv_width_));
+  Tensor<cpu, 2> gcol(col_grad_.mutable_cpu_data(),
+      Shape2(col_height_, col_width_));
+  Tensor<cpu, 2> gweight(weight_->mutable_cpu_grad(),
+      Shape2(num_filters_, col_height_));
+  Tensor<cpu, 1> gbias(bias_->mutable_cpu_grad(),
+      Shape1(num_filters_));
+
+  // the weight gradient is accumulated with += below, so clear it first
+  gweight=0.0f;
+  gbias=sumall_except_dim<1>(grad);
+  // shape of one padded input image, used to fold columns back into an image
+  Shape<3> padshape(gsrc.shape.SubShape());
+  padshape[0]+=2*pad_;padshape[1]+=2*pad_;
+  Shape<2> imgshape=Shape2(height_, width_);
+  for(int n=0;n<batchsize_;n++){
+    // rebuild the column buffer for image n, as in the forward pass
+    if(pad_>0)
+      col=unpack_patch2col(pad(src[n], pad_), kernel_, stride_);
+    else
+      col=unpack_patch2col(src[n], kernel_, stride_);
+    gweight+=dot(grad[n], col.T());
+
+    if(gsrcblob!=nullptr){
+      gcol=dot(weight.T(), grad[n]);
+      // fold the columns back to a padded image and crop the padding away
+      gsrc[n]=crop(pack_col2patch(gcol, padshape, kernel_, stride_), imgshape);
+    }
+  }
+}
+
+/****************** Implementation for DropoutLayer ***********************/
+// Shape data, gradient and mask like the first source layer, read the dropout
+// ratio from the config and make sure the shared Random<cpu> instance exists.
+void DropoutLayer::Setup(const LayerProto& proto,
+      const vector<SLayer>& srclayers){
+  data_.ReshapeLike(srclayers[0]->data(this));
+  // NOTE(review): mutable_grad(this) is dereferenced without a null check,
+  // while other layers in this file test it against nullptr -- confirm the
+  // source of a dropout layer always has a gradient blob.
+  grad_.ReshapeLike(*srclayers[0]->mutable_grad(this));
+  mask_.Reshape(srclayers[0]->data(this).shape());
+  pdrop_=proto.dropout_param().dropout_ratio();
+  // Time-based seed; ASingleton only constructs the generator on the first
+  // Instance(seed) call, so later seeds are ignored.
+  unsigned seed = std::chrono::system_clock::now().time_since_epoch().count();
+  ASingleton<Random<cpu>>::Instance(seed);
+}
+
+// Partitioning needs no special handling here: shape is ignored and Setup is
+// run again.
+void DropoutLayer::SetupAfterPartition(const LayerProto& proto,
+      const vector<int> &shape,
+      const vector<SLayer>& srclayers){
+  Setup(proto, srclayers);
+}
+
+// Forward pass. Outside training the source data is copied unchanged. In
+// training a fresh mask is drawn and the output is source * mask.
+void DropoutLayer::ComputeFeature(bool training, const vector<SLayer>& srclayers) {
+  // check training
+  if(!training){
+    data_.CopyFrom(srclayers[0]->data());
+    return;
+  }
+  float pkeep=1-pdrop_;
+  Tensor<cpu, 1> mask(mask_.mutable_cpu_data(), Shape1(mask_.count()));
+  // Mask entries are the thresholded uniform samples scaled by 1/pkeep.
+  // NOTE(review): presumably op::threshold yields 1 where the sample is
+  // below pkeep and 0 otherwise -- confirm in cxxnet_op.h.
+  mask = F<op::threshold>(ASingleton<Random<cpu>>::Instance()\
+      ->uniform(mask.shape), pkeep ) * (1.0f/pkeep);
+  Tensor<cpu, 1> data(data_.mutable_cpu_data(), Shape1(data_.count()));
+  Blob<float>* srcblob=srclayers[0]->mutable_data();
+  Tensor<cpu, 1> src(srcblob->mutable_cpu_data(), Shape1(srcblob->count()));
+  data=src*mask;
+}
+
+// Backward pass: the source gradient is this layer's gradient multiplied
+// element-wise by the mask stored in mask_.
+void DropoutLayer::ComputeGradient(const vector<SLayer>& srclayers)  {
+  Tensor<cpu, 1> grad(grad_.mutable_cpu_data(), Shape1(data_.count()));
+  Tensor<cpu, 1> mask(mask_.mutable_cpu_data(), Shape1(mask_.count()));
+  Blob<float>* gsrcblob=srclayers[0]->mutable_grad();
+  Tensor<cpu, 1> gsrc(gsrcblob->mutable_cpu_data(), Shape1(gsrcblob->count()));
+  gsrc=grad*mask;
+}
+/**************** Implementation for InnerProductLayer********************/
+// Derive batch size and input width (element count / batch size) from the
+// single source layer, take the output width from the config, and create the
+// weight (vdim x hdim) and bias (hdim) parameters.
+void InnerProductLayer::Setup(const LayerProto& proto,
+      const vector<SLayer>& srclayers){
+  CHECK_EQ(srclayers.size(),1);
+  const auto& input=srclayers[0]->data(this);
+  batchsize_=input.shape()[0];
+  vdim_=input.count()/batchsize_;
+  hdim_=proto.inner_product_param().num_output();
+
+  const vector<int> outshape{batchsize_, hdim_};
+  data_.Reshape(outshape);
+  grad_.ReshapeLike(data_);
+
+  Factory<Param>* param_factory=Singleton<Factory<Param>>::Instance();
+  weight_=shared_ptr<Param>(param_factory->Create("Param"));
+  bias_=shared_ptr<Param>(param_factory->Create("Param"));
+  weight_->Setup(proto.param(0), vector<int>{vdim_, hdim_}, vdim_*hdim_);
+  bias_->Setup(proto.param(1), vector<int>{hdim_},0);
+}
+// Run Setup on a copy of proto whose num_output is replaced by shape[1].
+void InnerProductLayer::SetupAfterPartition(const LayerProto& proto,
+      const vector<int> &shape,
+      const vector<SLayer>& srclayers){
+  LayerProto partitioned(proto);
+  partitioned.mutable_inner_product_param()->set_num_output(shape[1]);
+  Setup(partitioned, srclayers);
+}
+
+// Forward pass: data = src * weight + bias, with src viewed as a
+// (batchsize x vdim) matrix. The training flag is not read.
+void InnerProductLayer::ComputeFeature(bool training, const vector<SLayer>& srclayers) {
+  Tensor<cpu, 2> data(data_.mutable_cpu_data(), Shape2(batchsize_,hdim_));
+  // the source must hold exactly batchsize*vdim floats for the view below
+  CHECK_EQ(srclayers[0]->data().count(), batchsize_*vdim_);
+  Tensor<cpu, 2> src(srclayers[0]->mutable_data()->mutable_cpu_data(),
+      Shape2(batchsize_,vdim_));
+  Tensor<cpu, 2> weight(weight_->mutable_cpu_data(), Shape2(vdim_,hdim_));
+  Tensor<cpu, 1> bias(bias_->mutable_cpu_data(), Shape1(hdim_));
+  data=dot(src, weight);
+  // repmat: repeat bias vector into batchsize rows
+  data+=repmat(bias, batchsize_);
+}
+
+// Backward pass: bias gradient = column sums of grad, weight gradient =
+// src^T * grad, and, when the source layer exposes a gradient blob, source
+// gradient = grad * weight^T.
+void InnerProductLayer::ComputeGradient(const vector<SLayer>& srclayers) {
+  Tensor<cpu, 2> src(srclayers[0]->mutable_data()->mutable_cpu_data(),
+      Shape2(batchsize_,vdim_));
+  Tensor<cpu, 2> grad(grad_.mutable_cpu_data(),Shape2(batchsize_,hdim_));
+  Tensor<cpu, 2> weight(weight_->mutable_cpu_data(), Shape2(vdim_,hdim_));
+  Tensor<cpu, 2> gweight(weight_->mutable_cpu_grad(), Shape2(vdim_,hdim_));
+  Tensor<cpu, 1> gbias(bias_->mutable_cpu_grad(), Shape1(hdim_));
+
+  gbias=sum_rows(grad);
+  gweight=dot(src.T(), grad);
+  // skip the input gradient when the source has no gradient blob
+  if(srclayers[0]->mutable_grad(this)!=nullptr){
+    Tensor<cpu, 2> gsrc(srclayers[0]->mutable_grad(this)->mutable_cpu_data(),
+        Shape2(batchsize_,vdim_));
+    gsrc=dot(grad, weight.T());
+  }
+}
+/*****************************************************************************
+ * Implementation for LabelLayer
+ *****************************************************************************/
+// Requires exactly one source layer, which is cast to DataLayer without a
+// runtime type check; the data blob gets one entry per record in the batch.
+void LabelLayer::Setup(const LayerProto& proto,
+    const vector<SLayer>& srclayers){
+  CHECK_EQ(srclayers.size(),1);
+  int batchsize=static_cast<DataLayer*>(srclayers[0].get())->batchsize();
+  data_.Reshape(vector<int>{batchsize});
+}
+
+// Copy the image label of every record into blob, one float per record.
+// The number of records must equal the first dimension of blob and every
+// label must be below 10; both are enforced with CHECKs. The training flag
+// is not read.
+void LabelLayer::ParseRecords(bool training, const vector<Record>& records, Blob<float>* blob){
+  LOG_IF(ERROR, records.size()==0)<<"Empty records to parse";
+  // Verify the size before writing so that a mismatch aborts instead of
+  // running past the end of the blob.
+  CHECK_EQ(static_cast<int>(records.size()), blob->shape()[0]);
+  float *label= blob->mutable_cpu_data() ;
+  int rid=0;
+  for(const Record& record: records){
+    // NOTE(review): the bound of 10 is hard-coded -- confirm it is meant to
+    // be the number of classes of the dataset in use.
+    CHECK_LT(record.image().label(),10);
+    label[rid++]=record.image().label();
+  }
+}
+
+
+/*********************LMDBDataLayer**********************************/
+// Fills records_ with the next mini-batch read through the LMDB cursor,
+// wrapping around to the first record when the end of the database is hit.
+// If random_skip_ is non-zero, a random number of records in
+// [0, random_skip_) is skipped first, starting from the first record;
+// random_skip_ is then reset to 0 so this happens only once.
+// Both parameters are unused.
+void LMDBDataLayer::ComputeFeature(bool training, const vector<SLayer>& srclayers){
+  // Positions the cursor on the first record; aborts if that fails.
+  auto seek_first=[this](){
+    CHECK_EQ(mdb_cursor_get(mdb_cursor_, &mdb_key_,
+          &mdb_value_, MDB_FIRST), MDB_SUCCESS);
+  };
+  // Moves the cursor one record forward; restarts from the first record once
+  // the end has been reached.
+  auto seek_next=[this, &seek_first](){
+    if (mdb_cursor_get(mdb_cursor_, &mdb_key_,
+          &mdb_value_, MDB_NEXT) != MDB_SUCCESS) {
+      DLOG(INFO) << "Restarting data prefetching from start.";
+      seek_first();
+    }
+  };
+
+  if(random_skip_){
+    int nskip=rand()%random_skip_;
+    // Count the records (used for the log message only). The cursor sits on
+    // the first record after seek_first(), hence counting starts at 1.
+    seek_first();
+    int n=1;
+    while (mdb_cursor_get(mdb_cursor_, &mdb_key_,
+          &mdb_value_, MDB_NEXT) == MDB_SUCCESS)
+      n++;
+    LOG(INFO)<<"Random Skip "<<nskip<<" records of total "<<n<<" records";
+    // The scan ran to the end. Restart from the first record.
+    seek_first();
+    for(int i=0;i<nskip;i++)
+      seek_next();
+    random_skip_=0;
+  }
+  Datum datum;
+  for(auto& record: records_){
+    SingleLabelImageRecord* image=record.mutable_image();
+    CHECK_EQ(mdb_cursor_get(mdb_cursor_, &mdb_key_,
+          &mdb_value_, MDB_GET_CURRENT), MDB_SUCCESS);
+    // a record that cannot be parsed would otherwise be used silently
+    CHECK(datum.ParseFromArray(mdb_value_.mv_data, mdb_value_.mv_size))
+      <<"Failed to parse Datum from lmdb record";
+    ConvertDatumToSingleLableImageRecord(datum, image);
+    seek_next();
+  }
+}
+
+// Copies a Datum into a SingleLabelImageRecord: the label, the shape
+// (channels, height, width -- only those fields that are set, in that order),
+// the raw pixel bytes when present and the float data when non-empty.
+void LMDBDataLayer::ConvertDatumToSingleLableImageRecord(const Datum& datum,
+    SingleLabelImageRecord* record){
+  record->set_label(datum.label());
+
+  record->clear_shape();
+  if(datum.has_channels()){
+    record->add_shape(datum.channels());
+  }
+  if(datum.has_height()){
+    record->add_shape(datum.height());
+  }
+  if(datum.has_width()){
+    record->add_shape(datum.width());
+  }
+
+  // pixel bytes already stored in record are kept when the datum has none
+  if(datum.has_data()){
+    record->set_pixel(datum.data());
+  }
+
+  // float data already stored in record is kept when the datum has none
+  const int nfloats=datum.float_data_size();
+  if(nfloats>0){
+    record->clear_data();
+    for(int i=0;i<nfloats;++i){
+      record->add_data(datum.float_data(i));
+    }
+  }
+}
+
+// Opens the LMDB database at proto.data_param().path() read-only, creates a
+// read-only transaction and a cursor, parses the record under the cursor
+// into sample_ and reads batchsize / random_skip from data_param.
+// Every LMDB call is wrapped in a CHECK, so a failure aborts the process.
+// srclayers is unused.
+void LMDBDataLayer::Setup(const LayerProto& proto,
+    const vector<SLayer>& srclayers){
+  const auto& conf=proto.data_param();
+
+  // environment
+  CHECK_EQ(mdb_env_create(&mdb_env_), MDB_SUCCESS) << "mdb_env_create failed";
+  CHECK_EQ(mdb_env_set_mapsize(mdb_env_, 1099511627776), MDB_SUCCESS); // 1TB
+  CHECK_EQ(mdb_env_open(mdb_env_, conf.path().c_str(), MDB_RDONLY, 0664),
+      MDB_SUCCESS) << "cannot open lmdb " << conf.path();
+
+  // transaction, database handle and cursor
+  CHECK_EQ(mdb_txn_begin(mdb_env_, NULL, MDB_RDONLY, &mdb_txn_), MDB_SUCCESS)
+    << "mdb_txn_begin failed";
+  CHECK_EQ(mdb_open(mdb_txn_, NULL, 0, &mdb_dbi_), MDB_SUCCESS)
+    << "mdb_open failed";
+  CHECK_EQ(mdb_cursor_open(mdb_txn_, mdb_dbi_, &mdb_cursor_), MDB_SUCCESS)
+    << "mdb_cursor_open failed";
+  LOG(INFO) << "Opening lmdb " << conf.path();
+
+  CHECK_EQ(mdb_cursor_get(mdb_cursor_, &mdb_key_, &mdb_value_, MDB_FIRST),
+      MDB_SUCCESS) << "mdb_cursor_get failed";
+  // NOTE(review): the cursor is advanced once before the sample is parsed, so
+  // the sample comes from the second record unless the database holds only
+  // one -- confirm this is intended.
+  const int next_status=mdb_cursor_get(mdb_cursor_, &mdb_key_, &mdb_value_,
+      MDB_NEXT);
+  if (next_status != MDB_SUCCESS) {
+    CHECK_EQ(mdb_cursor_get(mdb_cursor_, &mdb_key_, &mdb_value_,
+          MDB_FIRST), MDB_SUCCESS);
+  }
+
+  // sample record, used by parser layers to learn the record shape
+  Datum datum;
+  datum.ParseFromArray(mdb_value_.mv_data, mdb_value_.mv_size);
+  ConvertDatumToSingleLableImageRecord(datum, sample_.mutable_image());
+
+  batchsize_=conf.batchsize();
+  records_.resize(batchsize_);
+  random_skip_=conf.random_skip();
+}
+
+/***************** Implementation for LRNLayer *************************/
+// Reads the LRN hyper-parameters (local_size, knorm, alpha, beta) from
+// proto.lrn_param() and shapes data_, grad_ and norm_ like the data blob of
+// the single source layer. The source shape is required to be 4-D; its
+// entries are stored as batchsize_, channels_, height_ and width_.
+void LRNLayer::Setup(const LayerProto& proto,
+      const vector<SLayer>& srclayers){
+  CHECK_EQ(srclayers.size(),1);
+  lsize_ = proto.lrn_param().local_size();
+  CHECK_EQ(lsize_ % 2, 1) << "LRN only supports odd values for Localvol";
+  knorm_=proto.lrn_param().knorm();
+  alpha_ = proto.lrn_param().alpha();
+  beta_ = proto.lrn_param().beta();
+
+  const vector<int>& s=srclayers[0]->data(this).shape();
+  // s[0..3] are read below; fail loudly instead of indexing out of range
+  CHECK_EQ(s.size(), 4)<<"LRN expects a 4-D source blob";
+  data_.Reshape(s);
+  grad_.Reshape(s);
+  norm_.Reshape(s);
+  batchsize_=s[0];
+  channels_=s[1];
+  height_=s[2];
+  width_=s[3];
+}
+
+// Setup hook invoked after the net has been partitioned. It simply re-runs
+// Setup(); the shape argument is not used.
+void LRNLayer::SetupAfterPartition(
+    const LayerProto& proto,
+    const vector<int> &shape,
+    const vector<SLayer>& srclayers){
+  Setup(proto, srclayers);
+}
+
+// Forward pass of local response normalization across channels:
+//   norm_ = knorm_ + (alpha_/lsize_) * sum over lsize_ channels of src^2
+//   data_ = src * norm_^(-beta_)
+// norm_ keeps the normalizer without the power applied.
+void LRNLayer::ComputeFeature(bool training, const vector<SLayer>& srclayers){
+  Shape<4> shape4=Shape4(batchsize_,channels_, height_, width_);
+  const float salpha = alpha_ / lsize_;
+  Tensor<cpu, 4> input(
+      srclayers[0]->mutable_data(this)->mutable_cpu_data(), shape4);
+  Tensor<cpu, 4> output(data_.mutable_cpu_data(), shape4);
+  Tensor<cpu, 4> normalizer(norm_.mutable_cpu_data(), shape4);
+
+  normalizer = chpool<red::sum>(F<op::square>(input), lsize_) * salpha + knorm_;
+  output = input * F<op::power>(normalizer, -beta_);
+}
+
+// Backward pass of local response normalization. Uses the normalizer kept in
+// norm_ by the forward pass and writes the gradient w.r.t. the input into the
+// gradient blob of the source layer:
+//   gsrc  = grad * norm^(-beta)
+//   gsrc += (-2*beta*alpha/lsize) * chpool_sum(grad*src*norm^(-beta-1)) * src
+void LRNLayer::ComputeGradient(const vector<SLayer>& srclayers) {
+  Shape<4> shape4=Shape4(batchsize_,channels_, height_, width_);
+  const float salpha = alpha_ / lsize_;
+  Tensor<cpu, 4> input(srclayers[0]->mutable_data()->mutable_cpu_data(), shape4);
+  Tensor<cpu, 4> normalizer(norm_.mutable_cpu_data(), shape4);
+  Tensor<cpu, 4> out_grad(grad_.mutable_cpu_data(), shape4);
+  Tensor<cpu, 4> in_grad(
+      srclayers[0]->mutable_grad(this)->mutable_cpu_data(), shape4);
+
+  in_grad = out_grad * F<op::power>( normalizer, -beta_ );
+  in_grad += ( - 2.0f * beta_ * salpha ) * chpool<red::sum>(
+      out_grad * input * F<op::power>( normalizer, -beta_-1.0f ), lsize_ )
+      * input;
+}
+
+/**************** Implementation for MnistImageLayer******************/
+
+// Converts the image of every record into floats, normalizes each value as
+// v/norm_a_-norm_b_ and stores the result consecutively in blob.
+//   training - unused here
+//   records  - mini-batch to parse; the side length of the (square) source
+//              image is taken from the last shape entry of the first record
+//   blob     - destination; blob->shape()[1] is the side length of one output
+//              image, and the blob must be filled exactly
+// Pixel data is read from the byte string when present, otherwise from the
+// float data field.
+void MnistImageLayer::ParseRecords(bool training, const vector<Record>& records,
+    Blob<float>* blob){
+  LOG_IF(ERROR, records.size()==0)<<"Empty records to parse";
+  int ndim=records.at(0).image().shape_size();
+  int inputsize =records.at(0).image().shape(ndim-1);
+  const int size=blob->shape()[1];
+  // The copy loop below reads a size x size window out of the
+  // inputsize x inputsize source image; a larger window would read out of
+  // bounds.
+  CHECK_LE(size, inputsize);
+
+  float* dptr=blob->mutable_cpu_data();
+  // scratch image, fully overwritten for every record
+  cv::Mat input(inputsize, inputsize, CV_32FC1);
+  for(const Record& record: records){
+    // copy from record to cv::Mat
+    const SingleLabelImageRecord& imagerecord=record.image();
+    if(imagerecord.pixel().size()){
+      const string& pixel=imagerecord.pixel();
+      for(int i=0,k=0;i<inputsize;i++)
+        for(int j=0;j<inputsize;j++)
+          // NOTE: the byte must be cast to uint8_t before the cast to float,
+          // otherwise values above 127 turn negative where char is signed.
+          input.at<float>(i,j)=static_cast<float>(static_cast<uint8_t>(pixel[k++]));
+    }else{
+      for(int i=0,k=0;i<inputsize;i++)
+        for(int j=0;j<inputsize;j++)
+          input.at<float>(i,j)=imagerecord.data(k++);
+    }
+    /*
+    cv::Mat resizeMat=input;
+    // affine transform, scaling, rotation and shearing
+    if(gamma_){
+      float r1=rand_real()*2-1;
+      float r2=rand_real()*2-1;
+      int h=static_cast<int>(inputsize*(1.+r1*gamma_/100.0));
+      int w=static_cast<int>(inputsize*(1.+r2*gamma_/100.0));
+      cv::resize(input, resizeMat, cv::Size(h,w));
+    }
+    cv::Mat betaMat=resizeMat;
+    cv::Mat warpmat(2,3, CV_32FC1);
+    warpmat.at<float>(0,0)=1.0;
+    warpmat.at<float>(0,1)=0.0;
+    warpmat.at<float>(0,2)=0.0;
+    warpmat.at<float>(1,0)=0.0;
+    warpmat.at<float>(1,1)=1.0;
+    warpmat.at<float>(1,2)=0.0;
+
+    if(beta_){
+      float r=rand_real()*2-1;
+      if(rand() % 2){ // rotation
+        cv::Point center(resizeMat.rows/2, resizeMat.cols/2);
+        warpmat=cv::getRotationMatrix2D(center, r*beta_, 1.0);
+      }else{
+        //shearing
+        warpmat.at<float>(0,1)=r*beta_/90;
+        if(imagerecord.label()==1 ||imagerecord.label()==7)
+          warpmat.at<float>(0,1)/=2.0;
+      }
+    }
+    cv::warpAffine(resizeMat, betaMat, warpmat, cv::Size(size, size));
+    */
+
+    for(int i=0;i<size;i++){
+      for(int j=0;j<size;j++){
+        *dptr=input.at<float>(i,j)/norm_a_-norm_b_;
+        dptr++;
+      }
+    }
+  }
+  CHECK_EQ(dptr, blob->mutable_cpu_data()+blob->count());
+}
+// Reads the mnist_param settings and shapes data_ as
+// {batchsize, side, side}. side is resize_ when that is non-zero; otherwise
+// it is the last shape entry of the sample record, which must equal the
+// second to last one (square image). Batch size and sample record come from
+// the single source layer, which is cast to DataLayer.
+void MnistImageLayer::Setup(const LayerProto& proto,
+    const vector<SLayer>& srclayers){
+  CHECK_EQ(srclayers.size(),1);
+  DataLayer* datalayer=static_cast<DataLayer*>(srclayers[0].get());
+  int batchsize=datalayer->batchsize();
+  Record sample=datalayer->sample();
+
+  const auto& conf=proto.mnist_param();
+  kernel_=conf.kernel();
+  sigma_=conf.sigma();
+  alpha_=conf.alpha();
+  beta_=conf.beta();
+  gamma_=conf.gamma();
+  resize_=conf.resize();
+  norm_a_=conf.norm_a();
+  norm_b_=conf.norm_b();
+  elastic_freq_=conf.elastic_freq();
+
+  int ndim=sample.image().shape_size();
+  CHECK_GE(ndim,2);
+  if(resize_){
+    data_.Reshape(vector<int>{batchsize, resize_, resize_});
+    return;
+  }
+  // no resizing: keep the side length of the (square) input image
+  int side=sample.image().shape(ndim-1);
+  CHECK_EQ(side,sample.image().shape(ndim-2));
+  data_.Reshape(vector<int>{batchsize, side, side });
+}
+
+/******************** Implementation for PoolingLayer******************/
+// Reads kernel, stride and pooling method from proto.pooling_param() and
+// derives the output shape from the data blob of the single source layer.
+// The source shape must have more than 2 dimensions: the last two are
+// height and width, the one before them (if any) is the channel count,
+// otherwise channels_ is 1; the first entry is the batch size.
+// data_ and grad_ are shaped {batchsize, channels, pooled_h, pooled_w} with
+//   pooled = (input - kernel) / stride + 1   (integer division).
+void PoolingLayer::Setup(const LayerProto& proto,
+      const vector<SLayer>& srclayers){
+  CHECK_EQ(srclayers.size(),1);
+  const PoolingProto& pool_param = proto.pooling_param();
+  kernel_=pool_param.kernel();
+  stride_=pool_param.stride();
+  // stride_ is used as a divisor below
+  CHECK_GT(stride_, 0)<<"Pooling stride must be positive";
+  // NOTE(review): pad_ is not assigned anywhere in this function -- confirm it
+  // is initialized elsewhere before this check runs.
+  CHECK_LT(pad_, kernel_);
+  pool_=pool_param.pool();
+  CHECK(pool_ == PoolingProto_PoolMethod_AVE
+        || pool_ == PoolingProto_PoolMethod_MAX)
+      << "Padding implemented only for average and max pooling.";
+
+  const auto& srcshape=srclayers[0]->data(this).shape();
+  int dim=srcshape.size();
+  CHECK_GT(dim,2);
+  width_ = srcshape[dim-1];
+  height_ = srcshape[dim-2];
+  if(dim>3)
+    channels_ = srcshape[dim-3];
+  else
+    channels_=1;
+  batchsize_=srcshape[0];
+  pooled_height_ = static_cast<int>((height_ - kernel_) / stride_) + 1;
+  pooled_width_ = static_cast<int>(( width_ - kernel_) / stride_) + 1;
+  data_.Reshape(vector<int>{batchsize_, channels_, pooled_height_, pooled_width_});
+  grad_.ReshapeLike(data_);
+}
+
+// Setup hook invoked after the net has been partitioned. It simply re-runs
+// Setup(); the shape argument is not used.
+void PoolingLayer::SetupAfterPartition(
+    const LayerProto& proto,
+    const vector<int> &shape,
+    const vector<SLayer>& srclayers){
+  Setup(proto, srclayers);
+}
+
+// Forward pass: pools the source data with a kernel_ x kernel_ window and
+// step stride_. MAX takes the window maximum; AVE takes the window sum
+// scaled by 1/(kernel_*kernel_). Any other method leaves data_ untouched.
+void PoolingLayer::ComputeFeature(bool training, const vector<SLayer>& srclayers){
+  Shape<4> in_shape=Shape4(batchsize_, channels_, height_, width_);
+  Shape<4> out_shape=
+    Shape4(batchsize_, channels_, pooled_height_, pooled_width_);
+  Tensor<cpu, 4> input(
+      srclayers[0]->mutable_data(this)->mutable_cpu_data(), in_shape);
+  Tensor<cpu, 4> output(data_.mutable_cpu_data(), out_shape);
+  switch(pool_){
+    case PoolingProto_PoolMethod_MAX:
+      output=pool<red::maximum>(input, kernel_, stride_);
+      break;
+    case PoolingProto_PoolMethod_AVE:
+      output=pool<red::sum>(input, kernel_, stride_)
+        *(1.0f/(kernel_*kernel_));
+      break;
+    default:
+      break;
+  }
+}
+
+/*
+ * Backward pass: un-pools grad_ into the gradient blob of the source layer.
+ * MAX uses unpool<red::maximum>; AVE uses unpool<red::sum> scaled by
+ * 1/(kernel_*kernel_). Any other method leaves the source gradient untouched.
+ *
+ * Partitioning is supported only on the num/channel dimension; grad and data
+ * are assumed to have the same partition.
+ */
+void PoolingLayer::ComputeGradient(const vector<SLayer>& srclayers) {
+  Shape<4> in_shape= Shape4(batchsize_, channels_, height_, width_);
+  Tensor<cpu, 4> input(
+      srclayers[0]->mutable_data(this)->mutable_cpu_data(),in_shape);
+  Tensor<cpu, 4> in_grad(
+      srclayers[0]->mutable_grad(this)->mutable_cpu_data(),in_shape);
+  Shape<4> out_shape=
+    Shape4(batchsize_, channels_, pooled_height_, pooled_width_);
+  Tensor<cpu, 4> output(data_.mutable_cpu_data(), out_shape);
+  Tensor<cpu, 4> out_grad(grad_.mutable_cpu_data(), out_shape);
+  switch(pool_){
+    case PoolingProto_PoolMethod_MAX:
+      in_grad = unpool<red::maximum>(input, output, out_grad, kernel_, stride_);
+      break;
+    case PoolingProto_PoolMethod_AVE:
+      in_grad = unpool<red::sum>(input, output, out_grad, kernel_, stride_)
+        *(1.0f/(kernel_*kernel_));
+      break;
+    default:
+      break;
+  }
+}
+
+/***************** Implementation for ReLULayer *****************************/
+
+// Shapes data_ like the data blob and grad_ like the gradient blob of the
+// first source layer. proto is unused.
+void ReLULayer::Setup(const LayerProto& proto,
+      const vector<SLayer>& srclayers){
+  const SLayer& below=srclayers[0];
+  data_.ReshapeLike(below->data());
+  grad_.ReshapeLike(*(below->mutable_grad()));
+}
+
+// Setup hook invoked after the net has been partitioned. It simply re-runs
+// Setup(); the shape argument is not used.
+void ReLULayer::SetupAfterPartition(
+    const LayerProto& proto,
+    const vector<int> &shape,
+    const vector<SLayer>& srclayers){
+  Setup(proto, srclayers);
+}
+
+// Forward pass: applies op::relu element-wise to the source data, treating
+// both blobs as flat vectors of data_.count() elements.
+void ReLULayer::ComputeFeature(bool training, const vector<SLayer>& srclayers){
+  Shape<1> flat=Shape1(data_.count());
+  Tensor<cpu, 1> output(data_.mutable_cpu_data(), flat);
+  Tensor<cpu, 1> input(
+      srclayers[0]->mutable_data(this)->mutable_cpu_data(), flat);
+  output=F<op::relu>(input);
+}
+
+// Backward pass: the source gradient is op::relu_grad evaluated on this
+// layer's output, multiplied element-wise by grad_. All blobs are treated as
+// flat vectors.
+void ReLULayer::ComputeGradient(const vector<SLayer>& srclayers) {
+  Shape<1> flat=Shape1(data_.count());
+  Tensor<cpu, 1> out_grad(grad_.mutable_cpu_data(), Shape1(grad_.count()));
+  Tensor<cpu, 1> output(data_.mutable_cpu_data(), flat);
+  Tensor<cpu, 1> in_grad(
+      srclayers[0]->mutable_grad(this)->mutable_cpu_data(), flat);
+  in_grad=F<op::relu_grad>(output)*out_grad;
+}
+
+/*************** Implementation for RGBImageLayer *************************/
+
+// Converts the image of every record into floats, subtracts the mean image,
+// optionally crops and mirrors it (training only) and stores it in blob;
+// finally the whole blob is multiplied by scale_ when scale_ is non-zero.
+//   training - enables random cropping (cropsize_>0) and random mirroring
+//              (mirror_ set)
+//   records  - mini-batch to parse; the raw image shape is taken from the
+//              first record
+//   blob     - 4-D destination, indexed by record in its first dimension
+// Pixel data is read from the byte string when present, otherwise from the
+// float data field.
+void RGBImageLayer::ParseRecords(bool training, const vector<Record>& records,
+    Blob<float>* blob){
+  LOG_IF(ERROR, records.size()==0)<<"Empty records to parse";
+  const vector<int>& s=blob->shape();
+  Tensor<cpu, 4> images(blob->mutable_cpu_data(), Shape4(s[0],s[1],s[2],s[3]));
+  const SingleLabelImageRecord& r=records.at(0).image();
+  Tensor<cpu, 3> raw_image(Shape3(r.shape(0),r.shape(1),r.shape(2)));
+  AllocSpace(raw_image);
+  Tensor<cpu, 3> croped_image(Shape3(s[1],s[2],s[3]));
+  if(cropsize_)
+    AllocSpace(croped_image);
+    //CHECK(std::equal(croped_image.shape(), raw_image.shape());
+
+  // Number of valid crop offsets per axis: offsets 0..dim-cropsize_ all keep
+  // the crop window inside the image, which makes dim-cropsize_+1 choices.
+  // (Taking the modulus by dim-cropsize_ divides by zero when the image is
+  // exactly cropsize_ wide/high.)
+  int hrange=0, wrange=0;
+  if(cropsize_>0&&training){
+    hrange=r.shape(1)-cropsize_+1;
+    wrange=r.shape(2)-cropsize_+1;
+    CHECK_GT(hrange, 0)<<"cropsize larger than image height";
+    CHECK_GT(wrange, 0)<<"cropsize larger than image width";
+  }
+
+  int rid=0;
+  const float* meandptr=mean_.cpu_data();
+  for(const Record& record: records){
+    auto image=images[rid];
+    bool do_crop=cropsize_>0&&training;
+    bool do_mirror=mirror_&&rand()%2&&training;
+    // Decode into the scratch buffer when a transformation follows, otherwise
+    // straight into the destination slot.
+    // NOTE(review): with cropsize_>0 and training==false the record is
+    // decoded at full size into the destination slot -- confirm the slot is
+    // large enough on that path.
+    float* dptr=nullptr;
+    if(do_crop||do_mirror)
+      dptr=raw_image.dptr;
+    else
+      dptr=image.dptr;
+    if(record.image().pixel().size()){
+      const string& pixel=record.image().pixel();
+      for(size_t i=0;i<pixel.size();i++)
+        dptr[i]=static_cast<float>(static_cast<uint8_t>(pixel[i]));
+    }else {
+      memcpy(dptr, record.image().data().data(),
+          sizeof(float)*record.image().data_size());
+    }
+    // NOTE(review): only the first mean_.count() values are mean-subtracted;
+    // confirm mean_ has the same size as the decoded image.
+    for(int i=0;i<mean_.count();i++)
+      dptr[i]-=meandptr[i];
+
+    if(do_crop){
+      int hoff=rand()%hrange;
+      int woff=rand()%wrange;
+      Shape<2> cropshape=Shape2(cropsize_, cropsize_);
+      if(do_mirror){
+        croped_image=crop(raw_image, cropshape, hoff, woff);
+        image=mirror(croped_image);
+      }else{
+        image=crop(raw_image, cropshape, hoff, woff);
+      }
+    }else if(do_mirror){
+      image=mirror(raw_image);
+    }
+    rid++;
+  }
+  if(scale_)
+    images=images*scale_;
+
+  FreeSpace(raw_image);
+  if(cropsize_)
+    FreeSpace(croped_image);
+}
+// Reads scale, cropsize and mirror from proto.rgbimage_param() and shapes
+// data_ as {batchsize, <sample image shape>}, which must be 4-D. When
+// cropsize_ is non-zero the last two dimensions are replaced by cropsize_.
+// mean_ gets the per-image shape and is loaded from the mean file when one
+// is configured, otherwise it is zero-filled.
+void RGBImageLayer::Setup(const LayerProto& proto,
+    const vector<SLayer>& srclayers){
+  CHECK_EQ(srclayers.size(),1);
+  const auto& conf=proto.rgbimage_param();
+  scale_=conf.scale();
+  cropsize_=conf.cropsize();
+  mirror_=conf.mirror();
+
+  DataLayer* datalayer=static_cast<DataLayer*>(srclayers[0].get());
+  int batchsize=datalayer->batchsize();
+  Record sample=datalayer->sample();
+
+  vector<int> shape;
+  shape.push_back(batchsize);
+  for(int i=0;i<sample.image().shape_size();i++)
+    shape.push_back(sample.image().shape(i));
+  CHECK_EQ(shape.size(),4);
+  if(cropsize_){
+    shape[2]=cropsize_;
+    shape[3]=cropsize_;
+  }
+  data_.Reshape(shape);
+  mean_.Reshape({shape[1],shape[2],shape[3]});
+
+  if(!conf.has_meanfile()){
+    // no mean image configured: subtract zero
+    memset(mean_.mutable_cpu_data(),0,sizeof(float)*mean_.count());
+    return;
+  }
+  BlobProto meanproto;
+  ReadProtoFromBinaryFile(conf.meanfile().c_str(), &meanproto);
+  CHECK_EQ(mean_.count(), meanproto.data_size());
+  memcpy(mean_.mutable_cpu_data(), meanproto.data().data(),
+      sizeof(float)*meanproto.data_size());
+}
+
+/***************Implementation for ShardDataLayer**************************/
+// Reads the next mini-batch from the data shard into records_. If
+// random_skip_ is non-zero, a random number of records in [0, random_skip_)
+// is read into sample_ and thereby skipped first; random_skip_ is then reset
+// to 0 so this happens only once. Both parameters are unused.
+// NOTE(review): the result of shard_->Next() is not inspected -- confirm how
+// the end of the shard is handled.
+void ShardDataLayer::ComputeFeature(bool training, const vector<SLayer>& srclayers){
+  if(random_skip_){
+    int nskip=rand()%random_skip_;
+    LOG(INFO)<<"Random Skip "<<nskip<<" records, there are "<<shard_->Count()
+      <<" records in total";
+    string skipped_key;
+    int remaining=nskip;
+    while(remaining>0){
+      shard_->Next(&skipped_key, &sample_);
+      remaining--;
+    }
+    random_skip_=0;
+  }
+  for(size_t i=0;i<records_.size();++i){
+    string key;
+    shard_->Next(&key, &records_[i]);
+  }
+}
+
+// Opens the data shard at proto.data_param().path() for reading, loads one
+// record into sample_ and reads batchsize / random_skip from data_param.
+// records_ is sized to hold one mini-batch. srclayers is unused.
+void ShardDataLayer::Setup(const LayerProto& proto,
+    const vector<SLayer>& srclayers){
+  const auto& conf=proto.data_param();
+  shard_= std::make_shared<DataShard>(conf.path(), DataShard::kRead);
+
+  // sample record, read from the freshly opened shard
+  string sample_key;
+  shard_->Next(&sample_key, &sample_);
+
+  batchsize_=conf.batchsize();
+  records_.resize(batchsize_);
+  random_skip_=conf.random_skip();
+}
+/*******************Implementation of TanhLayer**************************/
+// Shapes data_ like the data blob and grad_ like the gradient blob that the
+// first source layer exposes to this layer. proto is unused.
+void TanhLayer::Setup(const LayerProto& proto,
+      const vector<SLayer>& srclayers){
+  const SLayer& below=srclayers[0];
+  data_.ReshapeLike(below->data(this));
+  grad_.ReshapeLike(below->grad(this));
+}
+
+// Setup hook invoked after the net has been partitioned. It simply re-runs
+// Setup(); the shape argument is not used.
+void TanhLayer::SetupAfterPartition(
+    const LayerProto& proto,
+    const vector<int> &shape,
+    const vector<SLayer>& srclayers){
+  Setup(proto, srclayers);
+}
+
+
+// Forward pass: applies op::stanh element-wise to the source data, treating
+// both blobs as flat vectors of data_.count() elements.
+void TanhLayer::ComputeFeature(bool training, const vector<SLayer>& srclayers){
+  Shape<1> flat=Shape1(data_.count());
+  Tensor<cpu, 1> output(data_.mutable_cpu_data(), flat);
+  Tensor<cpu, 1> input(
+      srclayers[0]->mutable_data(this)->mutable_cpu_data(), flat);
+  output=F<op::stanh>(input);
+}
+
+// Backward pass: the source gradient is op::stanh_grad evaluated on this
+// layer's output, multiplied element-wise by grad_. All blobs are treated as
+// flat vectors.
+void TanhLayer::ComputeGradient(const vector<SLayer>& srclayers) {
+  Shape<1> flat=Shape1(data_.count());
+  Tensor<cpu, 1> output(data_.mutable_cpu_data(), flat);
+  Tensor<cpu, 1> out_grad(grad_.mutable_cpu_data(), Shape1(grad_.count()));
+  Tensor<cpu, 1> in_grad(
+      srclayers[0]->mutable_grad(this)->mutable_cpu_data(), flat);
+  in_grad=F<op::stanh_grad>(output)*out_grad;
+}
+/*********** Implementation for SoftmaxLossLayer *************************/
+// Expects exactly two source layers. data_ takes the shape of the data blob
+// of the first one; its first dimension is the batch size and the remaining
+// elements per sample form dim_. topk and scale are read from
+// proto.softmaxloss_param(); metric_ holds two values.
+void SoftmaxLossLayer::Setup(const LayerProto& proto,
+    const vector<SLayer>& srclayers){
+  CHECK_EQ(srclayers.size(),2);
+  const auto& conf=proto.softmaxloss_param();
+
+  data_.Reshape(srclayers[0]->data(this).shape());
+  batchsize_=data_.shape()[0];
+  dim_=data_.count()/batchsize_;
+  metric_.Reshape(vector<int>{2});
+
+  topk_=conf.topk();
+  scale_=conf.scale();
+}
+// Setup hook invoked after the net has been partitioned. It simply re-runs
+// Setup(); the shape argument is not used.
+void SoftmaxLossLayer::SetupAfterPartition(
+    const LayerProto& proto,
+    const vector<int> &shape,
+    const vector<SLayer>& srclayers){
+  Setup(proto, srclayers);
+}
+// Forward pass: computes the softmax of the first source layer's data into
+// data_, then the cross-entropy loss and the top-k precision against the
+// labels provided by the second source layer.
+//   metric_[0] = loss      * scale_ / batchsize_
+//   metric_[1] = precision * scale_ / batchsize_
+// Every label must lie in [0, dim_), since it is used to index one row of
+// dim_ probabilities. topk_ is clamped to [0, dim_] because partial_sort
+// requires its middle iterator to lie inside the sorted range.
+void SoftmaxLossLayer::ComputeFeature(bool training, const vector<SLayer>& srclayers) {
+  Shape<2> s=Shape2(batchsize_, dim_);
+  Tensor<cpu, 2> prob(data_.mutable_cpu_data(), s);
+  Tensor<cpu, 2> src(srclayers[0]->mutable_data()->mutable_cpu_data(), s);
+  Softmax(prob, src);
+  const float* label=srclayers[1]->data().cpu_data();
+  const float* probptr=prob.dptr;
+  const int topk=std::max<int>(0, std::min<int>(topk_, dim_));
+  float loss=0, precision=0;
+  // (probability, class index) pairs of one sample; reused across samples
+  vector<std::pair<float, int> > probvec(dim_);
+  for(int n=0;n<batchsize_;n++){
+    int ilabel=static_cast<int>(label[n]);
+    CHECK_LT(ilabel,dim_);
+    CHECK_GE(ilabel,0);
+    float prob_of_truth=probptr[ilabel];
+    // FLT_MIN keeps log() finite when the probability underflows to 0
+    loss-=log(std::max(prob_of_truth, FLT_MIN));
+    for (int j = 0; j < dim_; ++j) {
+      probvec[j]=std::make_pair(probptr[j], j);
+    }
+    std::partial_sort(
+        probvec.begin(), probvec.begin() + topk,
+        probvec.end(), std::greater<std::pair<float, int> >());
+    // check if true label is in top k predictions
+    for (int k = 0; k < topk; k++) {
+      if (probvec[k].second == ilabel) {
+        precision++;
+        break;
+      }
+    }
+    probptr+=dim_;
+  }
+  CHECK_EQ(probptr, prob.dptr+prob.shape.Size());
+  float *metric=metric_.mutable_cpu_data();
+  metric[0]=loss*scale_/(1.0f*batchsize_);
+  metric[1]=precision*scale_/(1.0f*batchsize_);
+}
+
+// Backward pass: the gradient w.r.t. the first source layer is the softmax
+// output (data_) with 1 subtracted at the position of the true label of each
+// sample, multiplied by scale_/batchsize_. Labels come from the second
+// source layer.
+void SoftmaxLossLayer::ComputeGradient(const vector<SLayer>& srclayers) {
+  const float* label=srclayers[1]->data().cpu_data();
+  Blob<float>* in_grad_blob=srclayers[0]->mutable_grad();
+  in_grad_blob->CopyFrom(data_);
+  float* in_grad_ptr=in_grad_blob->mutable_cpu_data();
+
+  // walk over the samples, one row of dim_ values each
+  float* row=in_grad_ptr;
+  for(int n=0;n<batchsize_;n++, row+=dim_){
+    row[static_cast<int>(label[n])]-=1.0f;
+  }
+
+  Tensor<cpu, 1> in_grad(in_grad_ptr, Shape1(in_grad_blob->count()));
+  in_grad*=scale_/(1.0f*batchsize_);
+}
+
+}  // namespace singa

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/b2dc51d2/src/neuralnet/neuralnet.cc
----------------------------------------------------------------------
diff --git a/src/neuralnet/neuralnet.cc b/src/neuralnet/neuralnet.cc
new file mode 100644
index 0000000..0bca26e
--- /dev/null
+++ b/src/neuralnet/neuralnet.cc
@@ -0,0 +1,401 @@
+#include <algorithm>
+#include <queue>
+
+#include "neuralnet/neuralnet.h"
+#include "utils/singleton.h"
+#include "utils/factory.h"
+#include "utils/graph.h"
+
+
+namespace singa {
+#define CreateLayer(id) CreateInstance(id, Layer)  // shorthand for CreateInstance with base type Layer
+
+// Registers every built-in layer class with the singleton layer factory
+// under its type name, so that layers can later be created from the type
+// string stored in a layer configuration.
+void NeuralNet::RegisterLayers(){
+  auto* registry=Singleton<Factory<Layer>>::Instance();
+  registry->Register("kConvolution", CreateLayer(ConvolutionLayer));
+  registry->Register("kConcate", CreateLayer(ConcateLayer));
+  registry->Register("kDropout", CreateLayer(DropoutLayer));
+  registry->Register("kInnerProduct", CreateLayer(InnerProductLayer));
+  registry->Register("kRGBImage", CreateLayer(RGBImageLayer));
+  registry->Register("kLabel", CreateLayer(LabelLayer));
+  registry->Register("kLMDBData", CreateLayer(LMDBDataLayer));
+  registry->Register("kLRN", CreateLayer(LRNLayer));
+  registry->Register("kMnistImage", CreateLayer(MnistImageLayer));
+  registry->Register("kBridgeDst", CreateLayer(BridgeDstLayer));
+  registry->Register("kBridgeSrc", CreateLayer(BridgeSrcLayer));
+  registry->Register("kPooling", CreateLayer(PoolingLayer));
+  registry->Register("kReLU", CreateLayer(ReLULayer));
+  registry->Register("kShardData", CreateLayer(ShardDataLayer));
+  registry->Register("kSlice", CreateLayer(SliceLayer));
+  registry->Register("kSoftmaxLoss", CreateLayer(SoftmaxLossLayer));
+  registry->Register("kSplit", CreateLayer(SplitLayer));
+  registry->Register("kTanh", CreateLayer(TanhLayer));
+}
+// Builds a NeuralNet for the given phase: copies the partition type and all
+// layers of np except those whose exclude list contains phase, logs the
+// resulting configuration and constructs the net from it.
+shared_ptr<NeuralNet> NeuralNet::SetupNeuralNet(const NetProto& np, Phase phase){
+  NetProto proto;
+  proto.set_partition_type(np.partition_type());
+  for(int i=0;i<np.layer_size();i++){
+    const auto& layer=np.layer(i);
+    // skip layers that are excluded from this phase
+    bool excluded=false;
+    for(int x: layer.exclude()){
+      if(x==phase){
+        excluded=true;
+        break;
+      }
+    }
+    if(excluded)
+      continue;
+    proto.add_layer()->CopyFrom(layer);
+  }
+  LOG(INFO)<<"NeuralNet config is "<<proto.DebugString();
+  return shared_ptr<NeuralNet>(new NeuralNet(proto));
+}
+// Constructs the net from net_proto. Layers without their own partition type
+// inherit the net-level one. The layer graph is built first and, when
+// group_size is larger than 1, partitioned. Finally every parameter of every
+// layer is collected into params_ and given a consecutive id starting at 0.
+NeuralNet::NeuralNet(NetProto net_proto, int group_size) {
+  group_size_=group_size;
+  // net_proto is a private copy, so it can be patched in place
+  for(LayerProto& layer_proto: *net_proto.mutable_layer()){
+    if(layer_proto.has_partition_type())
+      continue;
+    layer_proto.set_partition_type(net_proto.partition_type());
+  }
+
+  LOG(INFO)<<"Construct Neural Net...";
+  ConstructNeuralNet(net_proto);
+  if(group_size_>1){
+    PartitionNeuralNet();
+  }
+  for(auto layer: layers_)
+    DLOG(INFO)<<layer->name();
+
+  // number the params in layer order
+  int next_id=0;
+  for(auto& layer: layers_){
+    for(shared_ptr<Param> param: layer->GetParams()){
+      param->set_id(next_id++);
+      params_.push_back(param);
+    }
+  }
+
+  LOG(INFO)<<"Neural Net constructed";
+}
+
+// Builds the un-partitioned net:
+//   1. one graph node per layer (identified by layer name) and one edge per
+//      (source layer, layer) pair,
+//   2. topological sort of the graph,
+//   3. creation and Init() of the layers in that order through the factory,
+//   4. wiring of destination and source layers,
+//   5. Setup() of every layer.
+void NeuralNet::ConstructNeuralNet(const NetProto& net_proto){
+  map<string, LayerProto> protos;
+  // nodes
+  for(int i=0;i<net_proto.layer_size();i++){
+    const LayerProto& lp=net_proto.layer(i);
+    graph_.AddNode(lp.name());
+    protos[lp.name()]=lp;
+  }
+  // edges, from each source layer to the layer consuming it
+  for(int i=0;i<net_proto.layer_size();i++){
+    const LayerProto& lp=net_proto.layer(i);
+    for(int k=0;k<lp.srclayers_size();k++)
+      graph_.AddEdge(lp.srclayers(k), lp.name());
+  }
+
+  graph_.Sort();
+  //DLOG(INFO)<<"pure graph without partition\n"<< graph_.ToString();
+
+  // instantiate the layers following the sorted node order
+  Factory<Layer>* layer_factory=Singleton<Factory<Layer>>::Instance();
+  for(SNode node: graph_.nodes()){
+    LayerProto& conf=protos[node->name()];
+    shared_ptr<Layer> layer(layer_factory->Create(conf.type()));
+    layer->Init(conf);
+    name2layer_[node->name()]=layer;
+    layers_.push_back(layer);
+  }
+
+  // wire every layer to its neighbours in the graph
+  for(SNode node: graph_.nodes()){
+    auto layer=name2layer_[node->name()];
+    for(SNode dst: node->dstnodes()){
+      layer->AddDstLayer(name2layer_[dst->name()]);
+    }
+    for(SNode src: node->srcnodes()){
+      layer->AddSrcLayer(name2layer_[src->name()]);
+    }
+  }
+
+  // shapes and other layer properties
+  for(auto& layer: layers_)
+    layer->Setup();
+  LOG(INFO)<<"network graph witout partition\n"<<ToString();
+}
+
+// Replaces the current net by its partitioned version.
+// A partitioned graph is built from the existing layers; then a layer is
+// created for every node of that graph:
+//   - nodes added by the partitioning (slice, concate, split, bridge) get a
+//     fresh layer configured from the node information,
+//   - nodes stemming from an original layer either reuse that layer
+//     (partition type kNone) or get a new layer of the same type whose shape
+//     is divided by the group size along the partition dimension; the last
+//     partition also takes the remainder.
+// Afterwards the layers are re-wired and SetupAfterPartition() is called on
+// each of them.
+void NeuralNet::PartitionNeuralNet(){
+  graph_=CreatePartitonedGraph(layers_, name2layer_);
+  //DLOG(ERROR)<<"pure graph after partition\n"<<graph_.ToString();
+  // keep the original layers reachable by name; the members are rebuilt below
+  map<string, shared_ptr<Layer>> name2layer(name2layer_);
+  name2layer_.clear();
+  layers_.clear();
+  int gsize=group_size_;
+  auto* factory=Singleton<Factory<Layer>>::Instance();
+  // create Layers according to topology order
+  for(SNode node: graph_.nodes()){
+    LayerProto proto;
+    proto.set_name(node->name());
+    proto.set_locationid(node->val().locationid);
+    proto.set_partitionid(node->val().partitionid);
+    const string& origin=node->val().origin;
+    if (origin=="kSlice"){
+      proto.set_type(origin);
+      SliceProto *slice=proto.mutable_slice_param();
+      slice->set_slice_dimension(node->val().slice_dimension);
+      slice->set_slice_num(node->dstnodes().size());
+    }else if(origin== "kConcate"){
+      proto.set_type(origin);
+      ConcateProto *concate=proto.mutable_concate_param();
+      concate->set_concate_dimension(node->val().concate_dimension);
+      concate->set_concate_num(node->srcnodes().size());
+    }else if(origin=="kSplit"){
+      proto.set_type(origin);
+      SplitProto *split=proto.mutable_split_param();
+      split->set_num_splits(node->dstnodes().size());
+    }else if(origin=="kBridgeSrc" || origin== "kBridgeDst"){
+      proto.set_type(origin);
+    }else{
+      // The origin must name one of the original layers. The lookup and the
+      // end iterator have to come from the same map (the local copy);
+      // name2layer_ has been cleared above.
+      CHECK(name2layer.find(node->val().origin)!=name2layer.end())
+        <<"Unkown origin for node "<<node->val().origin;
+    }
+    shared_ptr<Layer> newlayer;
+    if(proto.has_type()){
+      // layers added due to partition
+      shared_ptr<Layer> layer(factory->Create(proto.type()));
+      layer->Init(proto);
+      newlayer=layer;
+    }else{
+      // partitioned layers from origin neuralnet
+      auto oldlayer=name2layer.at(node->val().origin);
+      vector<int> shape=oldlayer->shape(nullptr);
+      if(oldlayer->partition_type()==kNone){
+        newlayer=oldlayer;
+      } else{
+        // split the partition dimension evenly; the last partition also
+        // receives the remainder
+        int pdim=oldlayer->partition_dimension();
+        shape[pdim]=shape[pdim]/gsize+
+          ((node->val().partitionid==gsize-1)?shape[pdim]%gsize:0);
+        shared_ptr<Layer> layer(factory->Create(oldlayer->type()));
+        layer->Init(*oldlayer, shape);
+        layer->set_name(node->name());
+        newlayer=layer;
+      }
+    }
+    layers_.push_back(newlayer);
+    name2layer_[node->name()]=newlayer;
+  }
+
+  // connect Layers.
+  for(SNode node: graph_.nodes()){
+    auto layer=name2layer_[node->name()];
+    layer->ClearDstLayers();
+    for(SNode dst: node->dstnodes())
+      layer->AddDstLayer(name2layer_[dst->name()]);
+    layer->ClearSrcLayers();
+    for(SNode src: node->srcnodes())
+      layer->AddSrcLayer(name2layer_[src->name()]);
+  }
+
+  LOG(INFO)<<"Adjacency matrix\n"<<ToAdjacency();
+
+  // set up layers after
+  for(shared_ptr<Layer> layer: layers_){
+    // NOTE(review): if shape() returns a reference to storage owned by the
+    // layer, shape and newshape refer to the same vector and the check below
+    // cannot fail -- confirm against the declaration of Layer::shape().
+    const vector<int>& shape=layer->shape(nullptr);
+    layer->SetupAfterPartition();
+    const vector<int>& newshape=layer->shape(nullptr);
+    if(shape.size())
+      CHECK(std::equal(shape.begin(),shape.end(),newshape.begin()));
+  }
+
+  LOG(INFO)<<"network graph after partition layers\n"<<ToString();
+}
+
+// Builds the graph of the partitioned net from the original layers.
+//   layers     - original layers
+//   name2layer - original layers by name
+// Steps:
+//   1. Every layer with partition type kDataPartition or kLayerPartition is
+//      replaced by group_size_ nodes named "<layer>-NN"; a kNone layer keeps
+//      a single node.
+//   2. The nodes are connected according to the partition types of both ends
+//      and the connection type, inserting slice, concate and split nodes
+//      where the partitioning of the two ends differs.
+//   3. A split node is inserted behind every node with more than one
+//      destination, except slice/split nodes and the first node in sorted
+//      order.
+//   4. Every edge between nodes with different location ids is replaced by a
+//      bridge.
+// Returns the topologically sorted graph.
+Graph NeuralNet::CreatePartitonedGraph(const vector<shared_ptr<Layer>>& layers,
+    const map<string, shared_ptr<Layer>>& name2layer){
+  Graph graph;
+  // partition origin nodes/layers
+  map<string, vector<SNode>> layer2nodes; //from name of original layer to nodes
+  int gsize=group_size_;
+  for(const auto& layer: layers){
+    vector<SNode> nodes;
+    if(layer->partition_type()==kDataPartition||
+        layer->partition_type()==kLayerPartition){
+      char suffix[4];
+      for(int i=0;i<gsize;i++){
+        // bounded write: an index with more than 3 digits must not overrun
+        // the buffer
+        snprintf(suffix, sizeof(suffix), "%02d", i);
+        // differentiate partitions
+        string nodename=layer->name()+"-"+string(suffix);
+        auto node=graph.AddNode(nodename, LayerInfo{layer->name(),i, i,-1,-1});
+        nodes.push_back(node);
+      }
+    }else if(layer->partition_type()==kNone){
+      auto node=graph.AddNode(layer->name(),
+          LayerInfo{layer->name(), layer->locationid(), 0,-1,-1});
+      nodes.push_back(node);
+    }else{
+      LOG(FATAL)<<"Unknown partition type "<<layer->partition_type();
+    }
+    layer2nodes[layer->name()]=nodes;
+  }
+
+
+  // connect nodes, nodes for ConcateLayer and SliceLayer are added.
+  for(shared_ptr<Layer> layer: layers){
+    string name=layer->name();
+    PartitionType type=layer->partition_type();
+    const vector<SNode>& nodes=layer2nodes.at(name);
+    for(int srcid=0;srcid<layer->srclayers_size();srcid++){
+      shared_ptr<Layer> srclayer=layer->srclayers()[srcid];
+      string srcname=srclayer->name();
+      const vector<SNode> srcnodes=layer2nodes.at(srcname);
+      PartitionType srctype=srclayer->partition_type();
+      ConnectionType connection=layer->connection_type(srcid);
+      if(srctype==kNone){
+        CHECK_EQ(srcnodes.size(),1)
+          <<"local layer "<<srcname<<" should not be partitioned";
+        SNode srcnode=srcnodes[0];
+        if(type==kDataPartition||(type==kLayerPartition&&connection==kOneToOne)){
+          // slice the un-partitioned source among the partitions
+          LayerInfo info=srcnode->val();
+          info.slice_dimension=name2layer.at(name)->partition_dimension();
+          graph.InsertSliceNode(srcnode, nodes, info);
+        } else if(type==kNone){
+          CHECK_EQ(nodes.size(),1)
+            <<"local layer "<<name<<" should not be nodeed";
+          graph.AddEdge(srcnode, nodes[0]);
+        } else { // type==kLayerPartition&&connection==kOneToAll
+          graph.InsertSplitNode(srcnode, nodes);
+        }
+      }else if((type==kNone
+                &&(srctype==kDataPartition||srctype==kLayerPartition))
+               ||(type==kLayerPartition&&connection==kOneToAll&&
+                  (srctype==kDataPartition||srctype==kLayerPartition))){
+        // copy/concate the whole srclayer for every dst partition
+        for(SNode node:nodes){
+          LayerInfo info=node->val();
+          info.concate_dimension=name2layer.at(srcname)->partition_dimension();
+          CHECK_GE(info.concate_dimension,0);
+          graph.InsertConcateNode(srcnodes, node, info);
+        }
+      }else if((srctype==kLayerPartition&&type==kDataPartition)
+          || (srctype==kDataPartition&&type==kLayerPartition)){
+        // the most complex scenario: slice every source partition, then
+        // concatenate the slices for every destination partition
+        vector<SNode> slicenodes;
+        for(SNode srcnode: srcnodes){
+          LayerInfo info=srcnode->val();
+          info.slice_dimension=name2layer.at(name)->partition_dimension();
+          slicenodes.push_back(graph.InsertSliceNode(srcnode, nodes,
+              info, false));
+        }
+        for(SNode node: nodes){
+          LayerInfo info=node->val();
+          info.concate_dimension=name2layer.at(srcname)->partition_dimension();
+          CHECK_GE(info.concate_dimension,0);
+          graph.InsertConcateNode(slicenodes, node, info);
+        }
+      }else if((srctype==kDataPartition&&type==kDataPartition)||
+          (srctype==kLayerPartition&&type==kLayerPartition&&
+           layer->connection_type(srcid)==kOneToOne)){
+        // same partitioning on both ends: connect partition i to partition i
+        CHECK_EQ(srcnodes.size(), nodes.size());
+        for(size_t i=0;i<srcnodes.size();i++){
+          graph.AddEdge(srcnodes[i], nodes[i]);
+        }
+      }
+    }
+  }
+  // must do topology sort, because we have added new nodes.
+  graph.Sort();
+  //LOG(ERROR)<<graph.ToString();
+
+  // add node for split layer
+  // data_node is true only for the first node visited, which is thereby
+  // exempt from getting a split node
+  bool data_node=true;
+  // iterate over a copy: the graph is modified inside the loop
+  vector<SNode> oldnodes=graph.nodes();
+  for(SNode node: oldnodes){
+    if(node->dstnodes_size()>1&&node->val().origin!="kSlice"
+        &&node->val().origin!="kSplit"&&!data_node){
+      vector<SNode> dstnodes=node->dstnodes();
+      for(SNode dst: dstnodes)
+        graph.RemoveEdge(node, dst);
+      graph.InsertSplitNode(node, dstnodes);
+    }
+    data_node=false;
+  }
+
+  // add bridge
+  oldnodes=graph.nodes();
+  for(SNode node: oldnodes){
+    vector<SNode> dstnodes=node->dstnodes();
+    for(size_t i=0;i<dstnodes.size();i++){
+      SNode dstnode=dstnodes.at(i);
+      if(node->val().locationid!=dstnode->val().locationid){
+        graph.RemoveEdge(node, dstnode);
+        graph.InsertBridgeNode(node, dstnode);
+      }
+    }
+  }
+  graph.Sort();
+  return graph;
+}
+
+// Returns the string form of the layer graph, annotating every layer with
+// its shape (as produced by IntVecToString).
+std::string NeuralNet::ToString(){
+  map<string, string> info;
+  for(const auto& layer: layers_){
+    info[layer->name()]=IntVecToString(layer->shape(nullptr));
+  }
+  return graph_.ToString(info);
+}
+
+// Returns one line per layer of the form "<layer>: <dst>, <dst>, \n",
+// listing the names of its destination layers.
+std::string NeuralNet::ToAdjacency(){
+  string text;
+  for(auto& layer: layers_){
+    text+=layer->name()+": ";
+    for(const auto& dst: layer->dstlayers()){
+      text+=dst->name()+", ";
+    }
+    text+="\n";
+  }
+  return text;
+}
+
+
+// Removes all layer entries from proto.
+// NOTE(review): nothing is written back after the clear and copyData is not
+// read -- this looks like an unfinished serialization routine; confirm.
+void NeuralNet::ToProto(NetProto *proto, bool copyData)
+{
+  proto->clear_layer();
+}
+
+// Returns a multi-line report for debugging:
+//   - asum_data() of the data blob of every layer that is not a data layer,
+//     in forward order,
+//   - asum_data() of the gradient blob of every layer that is neither a data,
+//     loss nor parser layer, in backward order,
+//   - asum_data() of the value and the gradient of every parameter.
+// Lines are formatted with snprintf so that an unexpectedly long name is
+// truncated instead of overrunning the buffer.
+string NeuralNet::DebugInfo(){
+  string ret;
+  char display[4096];
+  for(auto& layer: layers_){
+    if(!layer->is_datalayer()){
+      snprintf(display, sizeof(display),
+          "Forward layer  %10s data norm1 %13.9f\n",
+          layer->name().c_str(), layer->data().asum_data());
+      ret+=string(display);
+    }
+  }
+  for (auto it = layers_.rbegin(); it != layers_.rend(); it++){
+    shared_ptr<Layer> layer=*it;
+    if(!(layer->is_datalayer()||layer->is_losslayer()||layer->is_parserlayer())){
+      snprintf(display, sizeof(display),
+          "Backward layer %10s grad norm1 %13.9f\n",
+          layer->name().c_str(), layer->grad().asum_data());
+      ret+=string(display);
+    }
+  }
+  for(auto& layer: layers_){
+    for(auto param: layer->GetParams()){
+      // Adjacent literals produce the same text as the former backslash
+      // continuation, including the run of ten spaces after the comma.
+      snprintf(display, sizeof(display),
+          "Layer %10s, param id %2d, name %10s,"
+          "          value norm1 %13.9f, grad norm1 %13.9f\n",
+          layer->name().c_str(), param->id(), param->name().c_str(),
+          param->data().asum_data(), param->grad().asum_data());
+      ret+=string(display);
+    }
+  }
+  return ret;
+}
+// Makes the parameters of this net share data with those of other: for every
+// layer that also exists (by name) in other, the i-th parameter shares the
+// data of the i-th parameter of the matching layer. Both layers must have
+// the same number of parameters. flag is not used.
+void NeuralNet::ShareParams(shared_ptr<NeuralNet> other, int flag){
+  for(auto& layer: layers_){
+    auto peer=other->name2layer(layer->name());
+    if(peer==nullptr)
+      continue;  // no layer of that name in the other net
+    const auto& peer_params=peer->GetParams();
+    const auto& own_params=layer->GetParams();
+    CHECK_EQ(own_params.size(), peer_params.size());
+    for(size_t k=0;k<own_params.size();++k)
+      own_params[k]->ShareData(peer_params[k]);
+  }
+}
+
+}  // namespace singa


[11/12] incubator-singa git commit: Transfer code from nusinga repo to singa apache repo. New commuinication framework is implemented to unify the frameworks of existing distributed deep learning systems. Communication is now implmented using ZeroMQ. API

Posted by wa...@apache.org.
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/b2dc51d2/include/gtest/gtest-all.cc
----------------------------------------------------------------------
diff --git a/include/gtest/gtest-all.cc b/include/gtest/gtest-all.cc
new file mode 100644
index 0000000..a9a03b2
--- /dev/null
+++ b/include/gtest/gtest-all.cc
@@ -0,0 +1,9592 @@
+// Copyright 2008, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Author: mheule@google.com (Markus Heule)
+//
+// Google C++ Testing Framework (Google Test)
+//
+// Sometimes it's desirable to build Google Test by compiling a single file.
+// This file serves this purpose.
+
+// This line ensures that gtest.h can be compiled on its own, even
+// when it's fused.
+#include "gtest/gtest.h"
+
+// The following lines pull in the real gtest *.cc files.
+// Copyright 2005, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Author: wan@google.com (Zhanyong Wan)
+//
+// The Google C++ Testing Framework (Google Test)
+
+// Copyright 2007, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Author: wan@google.com (Zhanyong Wan)
+//
+// Utilities for testing Google Test itself and code that uses Google Test
+// (e.g. frameworks built on top of Google Test).
+
+#ifndef GTEST_INCLUDE_GTEST_GTEST_SPI_H_
+#define GTEST_INCLUDE_GTEST_GTEST_SPI_H_
+
+
+namespace testing {
+
+// This helper class can be used to mock out Google Test failure reporting
+// so that we can test Google Test or code that builds on Google Test.
+//
+// An object of this class appends a TestPartResult object to the
+// TestPartResultArray object given in the constructor whenever a Google Test
+// failure is reported. It can either intercept only failures that are
+// generated in the same thread that created this object or it can intercept
+// all generated failures. The scope of this mock object can be controlled with
+// the second argument to the two arguments constructor.
+class GTEST_API_ ScopedFakeTestPartResultReporter
+    : public TestPartResultReporterInterface {
+ public:
+  // The two possible mocking modes of this object.
+  enum InterceptMode {
+    INTERCEPT_ONLY_CURRENT_THREAD,  // Intercepts only thread local failures.
+    INTERCEPT_ALL_THREADS           // Intercepts all failures.
+  };
+
+  // The c'tor sets this object as the test part result reporter used
+  // by Google Test.  The 'result' parameter specifies where to report the
+  // results. This reporter will only catch failures generated in the current
+  // thread. DEPRECATED
+  explicit ScopedFakeTestPartResultReporter(TestPartResultArray* result);
+
+  // Same as above, but you can choose the interception scope of this object.
+  ScopedFakeTestPartResultReporter(InterceptMode intercept_mode,
+                                   TestPartResultArray* result);
+
+  // The d'tor restores the previous test part result reporter.
+  virtual ~ScopedFakeTestPartResultReporter();
+
+  // Appends the TestPartResult object to the TestPartResultArray
+  // received in the constructor.
+  //
+  // This method is from the TestPartResultReporterInterface
+  // interface.
+  virtual void ReportTestPartResult(const TestPartResult& result);
+ private:
+  void Init();  // NOTE(review): defined elsewhere; presumably shared by both c'tors - confirm
+
+  const InterceptMode intercept_mode_;  // const, so fixed when the object is constructed
+  TestPartResultReporterInterface* old_reporter_;  // presumably the reporter the d'tor restores - confirm
+  TestPartResultArray* const result_;  // presumably the array passed to the c'tor - confirm
+
+  GTEST_DISALLOW_COPY_AND_ASSIGN_(ScopedFakeTestPartResultReporter);
+};
+
+namespace internal {
+
+// A helper class for implementing EXPECT_FATAL_FAILURE() and
+// EXPECT_NONFATAL_FAILURE().  Its destructor verifies that the given
+// TestPartResultArray contains exactly one failure that has the given
+// type and contains the given substring.  If that's not the case, a
+// non-fatal failure will be generated.
+class GTEST_API_ SingleFailureChecker {
+ public:
+  // The constructor remembers the arguments.
+  SingleFailureChecker(const TestPartResultArray* results,
+                       TestPartResult::Type type,
+                       const string& substr);
+  ~SingleFailureChecker();
+ private:
+  const TestPartResultArray* const results_;  // presumably the 'results' c'tor argument - confirm
+  const TestPartResult::Type type_;  // presumably the 'type' c'tor argument - confirm
+  const string substr_;  // held by value, not by reference to the c'tor argument
+
+  GTEST_DISALLOW_COPY_AND_ASSIGN_(SingleFailureChecker);
+};
+
+}  // namespace internal
+
+}  // namespace testing
+
+// A set of macros for testing Google Test assertions or code that's expected
+// to generate Google Test fatal failures.  It verifies that the given
+// statement will cause exactly one fatal Google Test failure with 'substr'
+// being part of the failure message.
+//
+// There are two different versions of this macro. EXPECT_FATAL_FAILURE only
+// affects and considers failures generated in the current thread and
+// EXPECT_FATAL_FAILURE_ON_ALL_THREADS does the same but for all threads.
+//
+// The verification of the assertion is done correctly even when the statement
+// throws an exception or aborts the current function.
+//
+// Known restrictions:
+//   - 'statement' cannot reference local non-static variables or
+//     non-static members of the current object.
+//   - 'statement' cannot return a value.
+//   - You cannot stream a failure message to this macro.
+//
+// Note that even though the implementations of the following two
+// macros are much alike, we cannot refactor them to use a common
+// helper macro, due to some peculiarity in how the preprocessor
+// works.  The AcceptsMacroThatExpandsToUnprotectedComma test in
+// gtest_unittest.cc will fail to compile if we do that.
+#define EXPECT_FATAL_FAILURE(statement, substr) \
+  do { \
+    class GTestExpectFatalFailureHelper {\
+     public:\
+      static void Execute() { statement; }\
+    };\
+    ::testing::TestPartResultArray gtest_failures;\
+    ::testing::internal::SingleFailureChecker gtest_checker(\
+        &gtest_failures, ::testing::TestPartResult::kFatalFailure, (substr));\
+    {\
+      ::testing::ScopedFakeTestPartResultReporter gtest_reporter(\
+          ::testing::ScopedFakeTestPartResultReporter:: \
+          INTERCEPT_ONLY_CURRENT_THREAD, &gtest_failures);\
+      GTestExpectFatalFailureHelper::Execute();\
+    }\
+  } while (::testing::internal::AlwaysFalse())
+// Same expansion as EXPECT_FATAL_FAILURE, except the reporter is created with INTERCEPT_ALL_THREADS.
+#define EXPECT_FATAL_FAILURE_ON_ALL_THREADS(statement, substr) \
+  do { \
+    class GTestExpectFatalFailureHelper {\
+     public:\
+      static void Execute() { statement; }\
+    };\
+    ::testing::TestPartResultArray gtest_failures;\
+    ::testing::internal::SingleFailureChecker gtest_checker(\
+        &gtest_failures, ::testing::TestPartResult::kFatalFailure, (substr));\
+    {\
+      ::testing::ScopedFakeTestPartResultReporter gtest_reporter(\
+          ::testing::ScopedFakeTestPartResultReporter:: \
+          INTERCEPT_ALL_THREADS, &gtest_failures);\
+      GTestExpectFatalFailureHelper::Execute();\
+    }\
+  } while (::testing::internal::AlwaysFalse())
+
+// A macro for testing Google Test assertions or code that's expected to
+// generate Google Test non-fatal failures.  It asserts that the given
+// statement will cause exactly one non-fatal Google Test failure with 'substr'
+// being part of the failure message.
+//
+// There are two different versions of this macro. EXPECT_NONFATAL_FAILURE only
+// affects and considers failures generated in the current thread and
+// EXPECT_NONFATAL_FAILURE_ON_ALL_THREADS does the same but for all threads.
+//
+// 'statement' is allowed to reference local variables and members of
+// the current object.
+//
+// The verification of the assertion is done correctly even when the statement
+// throws an exception or aborts the current function.
+//
+// Known restrictions:
+//   - You cannot stream a failure message to this macro.
+//
+// Note that even though the implementations of the following two
+// macros are much alike, we cannot refactor them to use a common
+// helper macro, due to some peculiarity in how the preprocessor
+// works.  If we do that, the code won't compile when the user gives
+// EXPECT_NONFATAL_FAILURE() a statement that contains a macro that
+// expands to code containing an unprotected comma.  The
+// AcceptsMacroThatExpandsToUnprotectedComma test in gtest_unittest.cc
+// catches that.
+//
+// For the same reason, we have to write
+//   if (::testing::internal::AlwaysTrue()) { statement; }
+// instead of
+//   GTEST_SUPPRESS_UNREACHABLE_CODE_WARNING_BELOW_(statement)
+// to avoid an MSVC warning on unreachable code.
+#define EXPECT_NONFATAL_FAILURE(statement, substr) \
+  do {\
+    ::testing::TestPartResultArray gtest_failures;\
+    ::testing::internal::SingleFailureChecker gtest_checker(\
+        &gtest_failures, ::testing::TestPartResult::kNonFatalFailure, \
+        (substr));\
+    {\
+      ::testing::ScopedFakeTestPartResultReporter gtest_reporter(\
+          ::testing::ScopedFakeTestPartResultReporter:: \
+          INTERCEPT_ONLY_CURRENT_THREAD, &gtest_failures);\
+      if (::testing::internal::AlwaysTrue()) { statement; }\
+    }\
+  } while (::testing::internal::AlwaysFalse())
+// Same expansion as EXPECT_NONFATAL_FAILURE, except the reporter is created with INTERCEPT_ALL_THREADS.
+#define EXPECT_NONFATAL_FAILURE_ON_ALL_THREADS(statement, substr) \
+  do {\
+    ::testing::TestPartResultArray gtest_failures;\
+    ::testing::internal::SingleFailureChecker gtest_checker(\
+        &gtest_failures, ::testing::TestPartResult::kNonFatalFailure, \
+        (substr));\
+    {\
+      ::testing::ScopedFakeTestPartResultReporter gtest_reporter(\
+          ::testing::ScopedFakeTestPartResultReporter::INTERCEPT_ALL_THREADS, \
+          &gtest_failures);\
+      if (::testing::internal::AlwaysTrue()) { statement; }\
+    }\
+  } while (::testing::internal::AlwaysFalse())
+
+#endif  // GTEST_INCLUDE_GTEST_GTEST_SPI_H_
+
+#include <ctype.h>
+#include <math.h>
+#include <stdarg.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <time.h>
+#include <wchar.h>
+#include <wctype.h>
+
+#include <algorithm>
+#include <iomanip>
+#include <limits>
+#include <ostream>  // NOLINT
+#include <sstream>
+#include <vector>
+
+#if GTEST_OS_LINUX
+
+// TODO(kenton@google.com): Use autoconf to detect availability of
+// gettimeofday().
+# define GTEST_HAS_GETTIMEOFDAY_ 1
+
+# include <fcntl.h>  // NOLINT
+# include <limits.h>  // NOLINT
+# include <sched.h>  // NOLINT
+// Declares vsnprintf().  This header is not available on Windows.
+# include <strings.h>  // NOLINT
+# include <sys/mman.h>  // NOLINT
+# include <sys/time.h>  // NOLINT
+# include <unistd.h>  // NOLINT
+# include <string>
+
+#elif GTEST_OS_SYMBIAN
+# define GTEST_HAS_GETTIMEOFDAY_ 1
+# include <sys/time.h>  // NOLINT
+
+#elif GTEST_OS_ZOS
+# define GTEST_HAS_GETTIMEOFDAY_ 1
+# include <sys/time.h>  // NOLINT
+
+// On z/OS we additionally need strings.h for strcasecmp.
+# include <strings.h>  // NOLINT
+
+#elif GTEST_OS_WINDOWS_MOBILE  // We are on Windows CE.
+
+# include <windows.h>  // NOLINT
+
+#elif GTEST_OS_WINDOWS  // We are on Windows proper.
+
+# include <io.h>  // NOLINT
+# include <sys/timeb.h>  // NOLINT
+# include <sys/types.h>  // NOLINT
+# include <sys/stat.h>  // NOLINT
+
+# if GTEST_OS_WINDOWS_MINGW
+// MinGW has gettimeofday() but not _ftime64().
+// TODO(kenton@google.com): Use autoconf to detect availability of
+//   gettimeofday().
+// TODO(kenton@google.com): There are other ways to get the time on
+//   Windows, like GetTickCount() or GetSystemTimeAsFileTime().  MinGW
+//   supports these.  consider using them instead.
+#  define GTEST_HAS_GETTIMEOFDAY_ 1
+#  include <sys/time.h>  // NOLINT
+# endif  // GTEST_OS_WINDOWS_MINGW
+
+// cpplint thinks that the header is already included, so we want to
+// silence it.
+# include <windows.h>  // NOLINT
+
+#else
+
+// Assume other platforms have gettimeofday().
+// TODO(kenton@google.com): Use autoconf to detect availability of
+//   gettimeofday().
+# define GTEST_HAS_GETTIMEOFDAY_ 1
+
+// cpplint thinks that the header is already included, so we want to
+// silence it.
+# include <sys/time.h>  // NOLINT
+# include <unistd.h>  // NOLINT
+
+#endif  // GTEST_OS_LINUX
+
+#if GTEST_HAS_EXCEPTIONS
+# include <stdexcept>
+#endif
+
+#if GTEST_CAN_STREAM_RESULTS_
+# include <arpa/inet.h>  // NOLINT
+# include <netdb.h>  // NOLINT
+#endif
+
+// Indicates that this translation unit is part of Google Test's
+// implementation.  It must come before gtest-internal-inl.h is
+// included, or there will be a compiler error.  This trick is to
+// prevent a user from accidentally including gtest-internal-inl.h in
+// his code.
+#define GTEST_IMPLEMENTATION_ 1
+// Copyright 2005, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+// Utility functions and classes used by the Google C++ testing framework.
+//
+// Author: wan@google.com (Zhanyong Wan)
+//
+// This file contains purely Google Test's internal implementation.  Please
+// DO NOT #INCLUDE IT IN A USER PROGRAM.
+
+#ifndef GTEST_SRC_GTEST_INTERNAL_INL_H_
+#define GTEST_SRC_GTEST_INTERNAL_INL_H_
+
+// GTEST_IMPLEMENTATION_ is defined to 1 iff the current translation unit is
+// part of Google Test's implementation; otherwise it's undefined.
+#if !GTEST_IMPLEMENTATION_
+// A user is trying to include this from his code - just say no.
+# error "gtest-internal-inl.h is part of Google Test's internal implementation."
+# error "It must not be included except by Google Test itself."
+#endif  // GTEST_IMPLEMENTATION_
+
+#ifndef _WIN32_WCE
+# include <errno.h>
+#endif  // !_WIN32_WCE
+#include <stddef.h>
+#include <stdlib.h>  // For strtoll/_strtoul64/malloc/free.
+#include <string.h>  // For memmove.
+
+#include <algorithm>
+#include <string>
+#include <vector>
+
+
+#if GTEST_CAN_STREAM_RESULTS_
+# include <arpa/inet.h>  // NOLINT
+# include <netdb.h>  // NOLINT
+#endif
+
+#if GTEST_OS_WINDOWS
+# include <windows.h>  // NOLINT
+#endif  // GTEST_OS_WINDOWS
+
+
+namespace testing {
+
+// Declares the flags.
+//
+// We don't want the users to modify this flag in the code, but want
+// Google Test's own unit tests to be able to access it. Therefore we
+// declare it here as opposed to in gtest.h.
+GTEST_DECLARE_bool_(death_test_use_fork);
+
+namespace internal {
+
+// The value of GetTestTypeId() as seen from within the Google Test
+// library.  This is solely for testing GetTestTypeId().
+GTEST_API_ extern const TypeId kTestTypeIdInGoogleTest;
+
+// Names of the flags (needed for parsing Google Test flags).
+const char kAlsoRunDisabledTestsFlag[] = "also_run_disabled_tests";
+const char kBreakOnFailureFlag[] = "break_on_failure";
+const char kCatchExceptionsFlag[] = "catch_exceptions";
+const char kColorFlag[] = "color";
+const char kFilterFlag[] = "filter";
+const char kListTestsFlag[] = "list_tests";
+const char kOutputFlag[] = "output";
+const char kPrintTimeFlag[] = "print_time";
+const char kRandomSeedFlag[] = "random_seed";
+const char kRepeatFlag[] = "repeat";
+const char kShuffleFlag[] = "shuffle";
+const char kStackTraceDepthFlag[] = "stack_trace_depth";
+const char kStreamResultToFlag[] = "stream_result_to";
+const char kThrowOnFailureFlag[] = "throw_on_failure";
+
+// A valid random seed must be in [1, kMaxRandomSeed].
+const int kMaxRandomSeed = 99999;
+
+// g_help_flag is true iff the --help flag or an equivalent form is
+// specified on the command line.
+GTEST_API_ extern bool g_help_flag;
+
+// Returns the current time in milliseconds.
+GTEST_API_ TimeInMillis GetTimeInMillis();
+
+// Returns true iff Google Test should use colors in the output.
+GTEST_API_ bool ShouldUseColor(bool stdout_is_tty);
+
+// Formats the given time in milliseconds as seconds.
+GTEST_API_ std::string FormatTimeInMillisAsSeconds(TimeInMillis ms);
+
+// Converts the given time in milliseconds to a date string in the ISO 8601
+// format, without the timezone information.  N.B.: due to the use of the
+// non-reentrant localtime() function, this function is not thread safe.  Do
+// not use it in any code that can be called from multiple threads.
+GTEST_API_ std::string FormatEpochTimeInMillisAsIso8601(TimeInMillis ms);
+
+// Parses a string for an Int32 flag, in the form of "--flag=value".
+//
+// On success, stores the value of the flag in *value, and returns
+// true.  On failure, returns false without changing *value.
+GTEST_API_ bool ParseInt32Flag(
+    const char* str, const char* flag, Int32* value);
+
+// Returns a random seed in range [1, kMaxRandomSeed] based on the
+// given --gtest_random_seed flag value.
+inline int GetRandomSeedFromFlag(Int32 random_seed_flag) {
+  const unsigned int raw_seed = (random_seed_flag == 0) ?  // a flag value of 0 selects the clock as the seed
+      static_cast<unsigned int>(GetTimeInMillis()) :
+      static_cast<unsigned int>(random_seed_flag);
+
+  // Normalizes the actual seed to range [1, kMaxRandomSeed] such that
+  // it's easy to type.
+  const int normalized_seed =
+      static_cast<int>((raw_seed - 1U) %  // unsigned arithmetic: a raw_seed of 0 wraps around, no UB
+                       static_cast<unsigned int>(kMaxRandomSeed)) + 1;  // remainder + 1 is in [1, kMaxRandomSeed]
+  return normalized_seed;
+}
+
+// Returns the first valid random seed after 'seed'.  The behavior is
+// undefined if 'seed' is invalid.  The seed after kMaxRandomSeed is
+// considered to be 1.
+inline int GetNextRandomSeed(int seed) {
+  GTEST_CHECK_(1 <= seed && seed <= kMaxRandomSeed)  // checks the documented precondition on 'seed'
+      << "Invalid random seed " << seed << " - must be in [1, "
+      << kMaxRandomSeed << "].";
+  const int next_seed = seed + 1;
+  return (next_seed > kMaxRandomSeed) ? 1 : next_seed;  // wraps from kMaxRandomSeed back to 1
+}
+
+// This class saves the values of all Google Test flags in its c'tor, and
+// restores them in its d'tor.
+class GTestFlagSaver {
+ public:
+  // The c'tor: copies the current value of each flag below into a member.
+  GTestFlagSaver() {
+    also_run_disabled_tests_ = GTEST_FLAG(also_run_disabled_tests);
+    break_on_failure_ = GTEST_FLAG(break_on_failure);
+    catch_exceptions_ = GTEST_FLAG(catch_exceptions);
+    color_ = GTEST_FLAG(color);
+    death_test_style_ = GTEST_FLAG(death_test_style);
+    death_test_use_fork_ = GTEST_FLAG(death_test_use_fork);
+    filter_ = GTEST_FLAG(filter);
+    internal_run_death_test_ = GTEST_FLAG(internal_run_death_test);
+    list_tests_ = GTEST_FLAG(list_tests);
+    output_ = GTEST_FLAG(output);
+    print_time_ = GTEST_FLAG(print_time);
+    random_seed_ = GTEST_FLAG(random_seed);
+    repeat_ = GTEST_FLAG(repeat);
+    shuffle_ = GTEST_FLAG(shuffle);
+    stack_trace_depth_ = GTEST_FLAG(stack_trace_depth);
+    stream_result_to_ = GTEST_FLAG(stream_result_to);
+    throw_on_failure_ = GTEST_FLAG(throw_on_failure);
+  }
+
+  // The d'tor is not virtual.  DO NOT INHERIT FROM THIS CLASS.
+  ~GTestFlagSaver() {  // assigns every saved member back to its flag
+    GTEST_FLAG(also_run_disabled_tests) = also_run_disabled_tests_;
+    GTEST_FLAG(break_on_failure) = break_on_failure_;
+    GTEST_FLAG(catch_exceptions) = catch_exceptions_;
+    GTEST_FLAG(color) = color_;
+    GTEST_FLAG(death_test_style) = death_test_style_;
+    GTEST_FLAG(death_test_use_fork) = death_test_use_fork_;
+    GTEST_FLAG(filter) = filter_;
+    GTEST_FLAG(internal_run_death_test) = internal_run_death_test_;
+    GTEST_FLAG(list_tests) = list_tests_;
+    GTEST_FLAG(output) = output_;
+    GTEST_FLAG(print_time) = print_time_;
+    GTEST_FLAG(random_seed) = random_seed_;
+    GTEST_FLAG(repeat) = repeat_;
+    GTEST_FLAG(shuffle) = shuffle_;
+    GTEST_FLAG(stack_trace_depth) = stack_trace_depth_;
+    GTEST_FLAG(stream_result_to) = stream_result_to_;
+    GTEST_FLAG(throw_on_failure) = throw_on_failure_;
+  }
+
+ private:
+  // Fields for saving the original values of flags.
+  bool also_run_disabled_tests_;
+  bool break_on_failure_;
+  bool catch_exceptions_;
+  std::string color_;
+  std::string death_test_style_;
+  bool death_test_use_fork_;
+  std::string filter_;
+  std::string internal_run_death_test_;
+  bool list_tests_;
+  std::string output_;
+  bool print_time_;
+  internal::Int32 random_seed_;
+  internal::Int32 repeat_;
+  bool shuffle_;
+  internal::Int32 stack_trace_depth_;
+  std::string stream_result_to_;
+  bool throw_on_failure_;
+} GTEST_ATTRIBUTE_UNUSED_;
+
+// Converts a Unicode code point to a narrow string in UTF-8 encoding.
+// code_point parameter is of type UInt32 because wchar_t may not be
+// wide enough to contain a code point.
+// If the code_point is not a valid Unicode code point
+// (i.e. outside of Unicode range U+0 to U+10FFFF) it will be converted
+// to "(Invalid Unicode 0xXXXXXXXX)".
+GTEST_API_ std::string CodePointToUtf8(UInt32 code_point);
+
+// Converts a wide string to a narrow string in UTF-8 encoding.
+// The wide string is assumed to have the following encoding:
+//   UTF-16 if sizeof(wchar_t) == 2 (on Windows, Cygwin, Symbian OS)
+//   UTF-32 if sizeof(wchar_t) == 4 (on Linux)
+// Parameter str points to a null-terminated wide string.
+// Parameter num_chars may additionally limit the number
+// of wchar_t characters processed. -1 is used when the entire string
+// should be processed.
+// If the string contains code points that are not valid Unicode code points
+// (i.e. outside of Unicode range U+0 to U+10FFFF) they will be output
+// as '(Invalid Unicode 0xXXXXXXXX)'. If the string is in UTF16 encoding
+// and contains invalid UTF-16 surrogate pairs, values in those pairs
+// will be encoded as individual Unicode characters from Basic Multilingual Plane.
+GTEST_API_ std::string WideStringToUtf8(const wchar_t* str, int num_chars);
+
+// Reads the GTEST_SHARD_STATUS_FILE environment variable, and creates the file
+// if the variable is present. If a file already exists at this location, this
+// function will write over it. If the variable is present, but the file cannot
+// be created, prints an error and exits.
+void WriteToShardStatusFileIfNeeded();
+
+// Checks whether sharding is enabled by examining the relevant
+// environment variable values. If the variables are present,
+// but inconsistent (e.g., shard_index >= total_shards), prints
+// an error and exits. If in_subprocess_for_death_test, sharding is
+// disabled because it must only be applied to the original test
+// process. Otherwise, we could filter out death tests we intended to execute.
+GTEST_API_ bool ShouldShard(const char* total_shards_str,
+                            const char* shard_index_str,
+                            bool in_subprocess_for_death_test);
+
+// Parses the environment variable var as an Int32. If it is unset,
+// returns default_val. If it is not an Int32, prints an error and
+// aborts.
+GTEST_API_ Int32 Int32FromEnvOrDie(const char* env_var, Int32 default_val);
+
+// Given the total number of shards, the shard index, and the test id,
+// returns true iff the test should be run on this shard. The test id is
+// some arbitrary but unique non-negative integer assigned to each test
+// method. Assumes that 0 <= shard_index < total_shards.
+GTEST_API_ bool ShouldRunTestOnShard(
+    int total_shards, int shard_index, int test_id);
+
+// STL container utilities.
+
+// Returns the number of elements in the given container that satisfy
+// the given predicate.
+template <class Container, typename Predicate>
+inline int CountIf(const Container& c, Predicate predicate) {  // predicate is taken by value
+  // Implemented as an explicit loop since std::count_if() in libCstd on
+  // Solaris has a non-standard signature.
+  int count = 0;  // the result is a plain int, not the container's size_type
+  for (typename Container::const_iterator it = c.begin(); it != c.end(); ++it) {
+    if (predicate(*it))  // called once per element, in iteration order
+      ++count;
+  }
+  return count;
+}
+
+// Applies a function/functor to each element in the container.
+template <class Container, typename Functor>
+void ForEach(const Container& c, Functor functor) {  // elements are visited through a const container
+  std::for_each(c.begin(), c.end(), functor);  // the functor copy returned by for_each is discarded
+}
+
+// Returns the i-th element of the vector, or default_value if i is not
+// in range [0, v.size()).
+template <typename E>
+inline E GetElementOr(const std::vector<E>& v, int i, E default_value) {  // returns a copy, never a reference
+  return (i < 0 || i >= static_cast<int>(v.size())) ? default_value : v[i];  // v[i] is only read when i is in range
+}
+
+// Performs an in-place shuffle of a range of the vector's elements.
+// 'begin' and 'end' are element indices as an STL-style range;
+// i.e. [begin, end) are shuffled, where 'end' == size() means to
+// shuffle to the end of the vector.
+template <typename E>
+void ShuffleRange(internal::Random* random, int begin, int end,
+                  std::vector<E>* v) {
+  const int size = static_cast<int>(v->size());
+  GTEST_CHECK_(0 <= begin && begin <= size)
+      << "Invalid shuffle range start " << begin << ": must be in range [0, "
+      << size << "].";
+  GTEST_CHECK_(begin <= end && end <= size)
+      << "Invalid shuffle range finish " << end << ": must be in range ["
+      << begin << ", " << size << "].";
+
+  // Fisher-Yates shuffle, from
+  // http://en.wikipedia.org/wiki/Fisher-Yates_shuffle
+  for (int range_width = end - begin; range_width >= 2; range_width--) {  // ranges of width 0 or 1 are left as is
+    const int last_in_range = begin + range_width - 1;  // last index of the part still being shuffled
+    const int selected = begin + random->Generate(range_width);  // assumes Generate(n) yields a value in [0, n) - confirm
+    std::swap((*v)[selected], (*v)[last_in_range]);
+  }
+}
+
+// Performs an in-place shuffle of the vector's elements.
+template <typename E>
+inline void Shuffle(internal::Random* random, std::vector<E>* v) {
+  ShuffleRange(random, 0, static_cast<int>(v->size()), v);  // the whole vector: [0, size())
+}
+
+// A function for deleting an object.  Handy for being used as a
+// functor.
+template <typename T>
+static void Delete(T* x) {  // 'static': internal linkage in each translation unit that sees this
+  delete x;  // scalar delete; a null x is a no-op
+}
+
+// A predicate that checks the key of a TestProperty against a known key.
+//
+// TestPropertyKeyIs is copyable.
+class TestPropertyKeyIs {
+ public:
+  // Constructor.
+  //
+  // TestPropertyKeyIs has NO default constructor.
+  explicit TestPropertyKeyIs(const std::string& key) : key_(key) {}
+
+  // Returns true iff the key of the given test property equals key_.
+  bool operator()(const TestProperty& test_property) const {
+    return test_property.key() == key_;
+  }
+
+ private:
+  std::string key_;  // copy of the key given to the constructor
+};
+
+// Class UnitTestOptions.
+//
+// This class contains functions for processing options the user
+// specifies when running the tests.  It has only static members.
+//
+// In most cases, the user can specify an option using either an
+// environment variable or a command line flag.  E.g. you can set the
+// test filter using either GTEST_FILTER or --gtest_filter.  If both
+// the variable and the flag are present, the latter overrides the
+// former.
+class GTEST_API_ UnitTestOptions {
+ public:
+  // Functions for processing the gtest_output flag.
+
+  // Returns the output format, or "" for normal printed output.
+  static std::string GetOutputFormat();
+
+  // Returns the absolute path of the requested output file, or the
+  // default (test_detail.xml in the original working directory) if
+  // none was explicitly specified.
+  static std::string GetAbsolutePathToOutputFile();
+
+  // Functions for processing the gtest_filter flag.
+
+  // Returns true iff the wildcard pattern matches the string.  The
+  // first ':' or '\0' character in pattern marks the end of it.
+  //
+  // This recursive algorithm isn't very efficient, but is clear and
+  // works well enough for matching test names, which are short.
+  static bool PatternMatchesString(const char *pattern, const char *str);
+
+  // Returns true iff the user-specified filter matches the test case
+  // name and the test name.
+  static bool FilterMatchesTest(const std::string &test_case_name,
+                                const std::string &test_name);
+
+#if GTEST_OS_WINDOWS
+  // Function for supporting the gtest_catch_exceptions flag.
+
+  // Returns EXCEPTION_EXECUTE_HANDLER if Google Test should handle the
+  // given SEH exception, or EXCEPTION_CONTINUE_SEARCH otherwise.
+  // This function is useful as an __except condition.
+  static int GTestShouldProcessSEH(DWORD exception_code);
+#endif  // GTEST_OS_WINDOWS
+
+  // Returns true if "name" matches the ':' separated list of glob-style
+  // filters in "filter".
+  static bool MatchesFilter(const std::string& name, const char* filter);
+};
+
+// Returns the current application's name, removing the directory path if
+// one is present.  Used by UnitTestOptions::GetAbsolutePathToOutputFile.
+GTEST_API_ FilePath GetCurrentExecutableName();
+
+// The role interface for getting the OS stack trace as a string.
+class OsStackTraceGetterInterface {
+ public:
+  OsStackTraceGetterInterface() {}
+  virtual ~OsStackTraceGetterInterface() {}
+
+  // Returns the current OS stack trace as an std::string.  Parameters:
+  //
+  //   max_depth  - the maximum number of stack frames to be included
+  //                in the trace.
+  //   skip_count - the number of top frames to be skipped; doesn't count
+  //                against max_depth.
+  virtual string CurrentStackTrace(int max_depth, int skip_count) = 0;
+
+  // UponLeavingGTest() should be called immediately before Google Test calls
+  // user code. It saves some information about the current stack that
+  // CurrentStackTrace() will use to find and hide Google Test stack frames.
+  virtual void UponLeavingGTest() = 0;
+
+ private:
+  GTEST_DISALLOW_COPY_AND_ASSIGN_(OsStackTraceGetterInterface);
+};
+
+// A working implementation of the OsStackTraceGetterInterface interface.
+class OsStackTraceGetter : public OsStackTraceGetterInterface {
+ public:
+  OsStackTraceGetter() : caller_frame_(NULL) {}
+
+  virtual string CurrentStackTrace(int max_depth, int skip_count)
+      GTEST_LOCK_EXCLUDED_(mutex_);
+
+  virtual void UponLeavingGTest() GTEST_LOCK_EXCLUDED_(mutex_);
+
+  // This string is inserted in place of stack frames that are part of
+  // Google Test's implementation.
+  static const char* const kElidedFramesMarker;
+
+ private:
+  Mutex mutex_;  // protects all internal state
+
+  // We save the stack frame below the frame that calls user code.
+  // We do this because the address of the frame immediately below
+  // the user code changes between the call to UponLeavingGTest()
+  // and any calls to CurrentStackTrace() from within the user code.
+  void* caller_frame_;
+
+  GTEST_DISALLOW_COPY_AND_ASSIGN_(OsStackTraceGetter);
+};
+
+// Information about a Google Test trace point.
+struct TraceInfo {
+  const char* file;
+  int line;
+  std::string message;
+};
+
+// This is the default global test part result reporter used in UnitTestImpl.
+// This class should only be used by UnitTestImpl.
+class DefaultGlobalTestPartResultReporter
+  : public TestPartResultReporterInterface {
+ public:
+  explicit DefaultGlobalTestPartResultReporter(UnitTestImpl* unit_test);
+  // Implements the TestPartResultReporterInterface. Reports the test part
+  // result in the current test.
+  virtual void ReportTestPartResult(const TestPartResult& result);
+
+ private:
+  UnitTestImpl* const unit_test_;
+
+  GTEST_DISALLOW_COPY_AND_ASSIGN_(DefaultGlobalTestPartResultReporter);
+};
+
+// This is the default per thread test part result reporter used in
+// UnitTestImpl. This class should only be used by UnitTestImpl.
+class DefaultPerThreadTestPartResultReporter
+    : public TestPartResultReporterInterface {
+ public:
+  explicit DefaultPerThreadTestPartResultReporter(UnitTestImpl* unit_test);
+  // Implements the TestPartResultReporterInterface. The implementation just
+  // delegates to the current global test part result reporter of *unit_test_.
+  virtual void ReportTestPartResult(const TestPartResult& result);
+
+ private:
+  UnitTestImpl* const unit_test_;
+
+  GTEST_DISALLOW_COPY_AND_ASSIGN_(DefaultPerThreadTestPartResultReporter);
+};
+
+// The private implementation of the UnitTest class.  We don't protect
+// the methods under a mutex, as this class is not accessible by a
+// user and the UnitTest class that delegates work to this class does
+// proper locking.
+class GTEST_API_ UnitTestImpl {
+ public:
+  explicit UnitTestImpl(UnitTest* parent);
+  virtual ~UnitTestImpl();
+
+  // There are two different ways to register your own TestPartResultReporter.
+  // You can register your own reporter to listen either only for test results
+  // from the current thread or for results from all threads.
+  // By default, each per-thread test result reporter just passes a new
+  // TestPartResult to the global test result reporter, which registers the
+  // test part result for the currently running test.
+
+  // Returns the global test part result reporter.
+  TestPartResultReporterInterface* GetGlobalTestPartResultReporter();
+
+  // Sets the global test part result reporter.
+  void SetGlobalTestPartResultReporter(
+      TestPartResultReporterInterface* reporter);
+
+  // Returns the test part result reporter for the current thread.
+  TestPartResultReporterInterface* GetTestPartResultReporterForCurrentThread();
+
+  // Sets the test part result reporter for the current thread.
+  void SetTestPartResultReporterForCurrentThread(
+      TestPartResultReporterInterface* reporter);
+
+  // Gets the number of successful test cases.
+  int successful_test_case_count() const;
+
+  // Gets the number of failed test cases.
+  int failed_test_case_count() const;
+
+  // Gets the number of all test cases.
+  int total_test_case_count() const;
+
+  // Gets the number of all test cases that contain at least one test
+  // that should run.
+  int test_case_to_run_count() const;
+
+  // Gets the number of successful tests.
+  int successful_test_count() const;
+
+  // Gets the number of failed tests.
+  int failed_test_count() const;
+
+  // Gets the number of disabled tests that will be reported in the XML report.
+  int reportable_disabled_test_count() const;
+
+  // Gets the number of disabled tests.
+  int disabled_test_count() const;
+
+  // Gets the number of tests to be printed in the XML report.
+  int reportable_test_count() const;
+
+  // Gets the number of all tests.
+  int total_test_count() const;
+
+  // Gets the number of tests that should run.
+  int test_to_run_count() const;
+
+  // Gets the time of the test program start, in ms from the start of the
+  // UNIX epoch.
+  TimeInMillis start_timestamp() const { return start_timestamp_; }
+
+  // Gets the elapsed time, in milliseconds.
+  TimeInMillis elapsed_time() const { return elapsed_time_; }
+
+  // Returns true iff the unit test passed (i.e. all test cases passed).
+  bool Passed() const { return !Failed(); }
+
+  // Returns true iff the unit test failed, i.e. at least one test case
+  // failed or the ad hoc test result (assertions made outside of any
+  // test) recorded a failure.
+  bool Failed() const {
+    if (failed_test_case_count() > 0) return true;
+    return ad_hoc_test_result()->Failed();
+  }
+
+  // Gets the i-th test case among all the test cases, in the current
+  // (possibly shuffled) order. i can range from 0 to
+  // total_test_case_count() - 1. If i is not in that range, returns NULL.
+  const TestCase* GetTestCase(int i) const {
+    // test_case_indices_ maps a position in the current order to a
+    // position in test_cases_; -1 signals that i is out of range.
+    const int index = GetElementOr(test_case_indices_, i, -1);
+    // Subscript test_cases_ with the mapped index (not with i), so that
+    // this accessor agrees with GetMutableTestCase().
+    return index < 0 ? NULL : test_cases_[index];
+  }
+
+  // Returns a mutable pointer to the i-th test case, where i is looked up
+  // through test_case_indices_.  Valid values of i are 0 through
+  // total_test_case_count() - 1; NULL is returned for any other i.
+  TestCase* GetMutableTestCase(int i) {
+    const int mapped_index = GetElementOr(test_case_indices_, i, -1);
+    if (mapped_index < 0) return NULL;
+    return test_cases_[mapped_index];
+  }
+
+  // Provides access to the event listener list.
+  TestEventListeners* listeners() { return &listeners_; }
+
+  // Returns the TestResult for the test that's currently running, or
+  // the TestResult for the ad hoc test if no test is running.
+  TestResult* current_test_result();
+
+  // Returns the TestResult for the ad hoc test.
+  const TestResult* ad_hoc_test_result() const { return &ad_hoc_test_result_; }
+
+  // Sets the OS stack trace getter.
+  //
+  // Does nothing if the input and the current OS stack trace getter
+  // are the same; otherwise, deletes the old getter and makes the
+  // input the current getter.
+  void set_os_stack_trace_getter(OsStackTraceGetterInterface* getter);
+
+  // Returns the current OS stack trace getter if it is not NULL;
+  // otherwise, creates an OsStackTraceGetter, makes it the current
+  // getter, and returns it.
+  OsStackTraceGetterInterface* os_stack_trace_getter();
+
+  // Returns the current OS stack trace as an std::string.
+  //
+  // The maximum number of stack frames to be included is specified by
+  // the gtest_stack_trace_depth flag.  The skip_count parameter
+  // specifies the number of top frames to be skipped, which doesn't
+  // count against the number of frames to be included.
+  //
+  // For example, if Foo() calls Bar(), which in turn calls
+  // CurrentOsStackTraceExceptTop(1), Foo() will be included in the
+  // trace but Bar() and CurrentOsStackTraceExceptTop() won't.
+  std::string CurrentOsStackTraceExceptTop(int skip_count) GTEST_NO_INLINE_;
+
+  // Finds and returns a TestCase with the given name.  If one doesn't
+  // exist, creates one and returns it.
+  //
+  // Arguments:
+  //
+  //   test_case_name: name of the test case
+  //   type_param:     the name of the test's type parameter, or NULL if
+  //                   this is not a typed or a type-parameterized test.
+  //   set_up_tc:      pointer to the function that sets up the test case
+  //   tear_down_tc:   pointer to the function that tears down the test case
+  TestCase* GetTestCase(const char* test_case_name,
+                        const char* type_param,
+                        Test::SetUpTestCaseFunc set_up_tc,
+                        Test::TearDownTestCaseFunc tear_down_tc);
+
+  // Registers a TestInfo with the unit test by adding it to the TestCase
+  // it names.
+  //
+  // Arguments:
+  //
+  //   set_up_tc:    pointer to the function that sets up the test case
+  //   tear_down_tc: pointer to the function that tears down the test case
+  //   test_info:    the TestInfo object
+  void AddTestInfo(Test::SetUpTestCaseFunc set_up_tc,
+                   Test::TearDownTestCaseFunc tear_down_tc,
+                   TestInfo* test_info) {
+    // Thread-safe death tests need the working directory the test
+    // program was originally started in.  RUN_ALL_TESTS() is too late
+    // to record it, because the user may already have changed the
+    // current directory by then.  AddTestInfo() runs when a TEST or
+    // TEST_F is registered, before main() is reached, so the directory
+    // is captured here the first time through.
+    if (original_working_dir_.IsEmpty()) {
+      original_working_dir_.Set(FilePath::GetCurrentDir());
+      GTEST_CHECK_(!original_working_dir_.IsEmpty())
+          << "Failed to get the current working directory.";
+    }
+
+    TestCase* const owning_test_case = GetTestCase(
+        test_info->test_case_name(), test_info->type_param(),
+        set_up_tc, tear_down_tc);
+    owning_test_case->AddTestInfo(test_info);
+  }
+
+#if GTEST_HAS_PARAM_TEST
+  // Returns ParameterizedTestCaseRegistry object used to keep track of
+  // value-parameterized tests and instantiate and register them.
+  internal::ParameterizedTestCaseRegistry& parameterized_test_registry() {
+    return parameterized_test_registry_;
+  }
+#endif  // GTEST_HAS_PARAM_TEST
+
+  // Sets the TestCase object for the test that's currently running.
+  void set_current_test_case(TestCase* a_current_test_case) {
+    current_test_case_ = a_current_test_case;
+  }
+
+  // Sets the TestInfo object for the test that's currently running.  If
+  // current_test_info is NULL, the assertion results will be stored in
+  // ad_hoc_test_result_.
+  void set_current_test_info(TestInfo* a_current_test_info) {
+    current_test_info_ = a_current_test_info;
+  }
+
+  // Registers all parameterized tests defined using TEST_P and
+  // INSTANTIATE_TEST_CASE_P, creating regular tests for each test/parameter
+  // combination. This method can be called more than once; it has guards
+  // protecting from registering the tests more than once.  If
+  // value-parameterized tests are disabled, RegisterParameterizedTests is
+  // present but does nothing.
+  void RegisterParameterizedTests();
+
+  // Runs all tests in this UnitTest object, prints the result, and
+  // returns true if all tests are successful.  If any exception is
+  // thrown during a test, this test is considered to be failed, but
+  // the rest of the tests will still be run.
+  bool RunAllTests();
+
+  // Clears the results of all tests, except the ad hoc tests.
+  void ClearNonAdHocTestResult() {
+    ForEach(test_cases_, TestCase::ClearTestCaseResult);
+  }
+
+  // Clears the results of ad-hoc test assertions.
+  void ClearAdHocTestResult() {
+    ad_hoc_test_result_.Clear();
+  }
+
+  // Adds a TestProperty to the current TestResult object when invoked in a
+  // context of a test or a test case, or to the global property set. If the
+  // result already contains a property with the same key, the value will be
+  // updated.
+  void RecordProperty(const TestProperty& test_property);
+
+  enum ReactionToSharding {
+    HONOR_SHARDING_PROTOCOL,
+    IGNORE_SHARDING_PROTOCOL
+  };
+
+  // Matches the full name of each test against the user-specified
+  // filter to decide whether the test should run, then records the
+  // result in each TestCase and TestInfo object.
+  // If shard_tests == HONOR_SHARDING_PROTOCOL, further filters tests
+  // based on sharding variables in the environment.
+  // Returns the number of tests that should run.
+  int FilterTests(ReactionToSharding shard_tests);
+
+  // Prints the names of the tests matching the user-specified filter flag.
+  void ListTestsMatchingFilter();
+
+  const TestCase* current_test_case() const { return current_test_case_; }
+  TestInfo* current_test_info() { return current_test_info_; }
+  const TestInfo* current_test_info() const { return current_test_info_; }
+
+  // Returns the vector of environments that need to be set-up/torn-down
+  // before/after the tests are run.
+  std::vector<Environment*>& environments() { return environments_; }
+
+  // Getters for the per-thread Google Test trace stack.
+  std::vector<TraceInfo>& gtest_trace_stack() {
+    return *(gtest_trace_stack_.pointer());
+  }
+  const std::vector<TraceInfo>& gtest_trace_stack() const {
+    return gtest_trace_stack_.get();
+  }
+
+#if GTEST_HAS_DEATH_TEST
+  void InitDeathTestSubprocessControlInfo() {
+    internal_run_death_test_flag_.reset(ParseInternalRunDeathTestFlag());
+  }
+  // Returns a pointer to the parsed --gtest_internal_run_death_test
+  // flag, or NULL if that flag was not specified.
+  // This information is useful only in a death test child process.
+  // Must not be called before a call to InitGoogleTest.
+  const InternalRunDeathTestFlag* internal_run_death_test_flag() const {
+    return internal_run_death_test_flag_.get();
+  }
+
+  // Returns a pointer to the current death test factory.
+  internal::DeathTestFactory* death_test_factory() {
+    return death_test_factory_.get();
+  }
+
+  void SuppressTestEventsIfInSubprocess();
+
+  friend class ReplaceDeathTestFactory;
+#endif  // GTEST_HAS_DEATH_TEST
+
+  // Initializes the event listener performing XML output as specified by
+  // UnitTestOptions. Must not be called before InitGoogleTest.
+  void ConfigureXmlOutput();
+
+#if GTEST_CAN_STREAM_RESULTS_
+  // Initializes the event listener for streaming test results to a socket.
+  // Must not be called before InitGoogleTest.
+  void ConfigureStreamingOutput();
+#endif
+
+  // Performs initialization dependent upon flag values obtained in
+  // ParseGoogleTestFlagsOnly.  Is called from InitGoogleTest after the call to
+  // ParseGoogleTestFlagsOnly.  In case a user neglects to call InitGoogleTest
+  // this function is also called from RunAllTests.  Since this function can be
+  // called more than once, it has to be idempotent.
+  void PostFlagParsingInit();
+
+  // Gets the random seed used at the start of the current test iteration.
+  int random_seed() const { return random_seed_; }
+
+  // Gets the random number generator.
+  internal::Random* random() { return &random_; }
+
+  // Shuffles all test cases, and the tests within each test case,
+  // making sure that death tests are still run first.
+  void ShuffleTests();
+
+  // Restores the test cases and tests to their order before the first shuffle.
+  void UnshuffleTests();
+
+  // Returns the value of GTEST_FLAG(catch_exceptions) at the moment
+  // UnitTest::Run() starts.
+  bool catch_exceptions() const { return catch_exceptions_; }
+
+ private:
+  friend class ::testing::UnitTest;
+
+  // Used by UnitTest::Run() to capture the state of
+  // GTEST_FLAG(catch_exceptions) at the moment it starts.
+  void set_catch_exceptions(bool value) { catch_exceptions_ = value; }
+
+  // The UnitTest object that owns this implementation object.
+  UnitTest* const parent_;
+
+  // The working directory when the first TEST() or TEST_F() was
+  // executed.
+  internal::FilePath original_working_dir_;
+
+  // The default test part result reporters.
+  DefaultGlobalTestPartResultReporter default_global_test_part_result_reporter_;
+  DefaultPerThreadTestPartResultReporter
+      default_per_thread_test_part_result_reporter_;
+
+  // Points to (but doesn't own) the global test part result reporter.
+  TestPartResultReporterInterface* global_test_part_result_repoter_;
+
+  // Protects read and write access to global_test_part_result_repoter_ (sic).
+  internal::Mutex global_test_part_result_reporter_mutex_;
+
+  // Points to (but doesn't own) the per-thread test part result reporter.
+  internal::ThreadLocal<TestPartResultReporterInterface*>
+      per_thread_test_part_result_reporter_;
+
+  // The vector of environments that need to be set-up/torn-down
+  // before/after the tests are run.
+  std::vector<Environment*> environments_;
+
+  // The vector of TestCases in their original order.  It owns the
+  // elements in the vector.
+  std::vector<TestCase*> test_cases_;
+
+  // Provides a level of indirection for the test case list to allow
+  // easy shuffling and restoring the test case order.  The i-th
+  // element of this vector is the index of the i-th test case in the
+  // shuffled order.
+  std::vector<int> test_case_indices_;
+
+#if GTEST_HAS_PARAM_TEST
+  // ParameterizedTestCaseRegistry object used to register
+  // value-parameterized tests.
+  internal::ParameterizedTestCaseRegistry parameterized_test_registry_;
+
+  // Indicates whether RegisterParameterizedTests() has been called already.
+  bool parameterized_tests_registered_;
+#endif  // GTEST_HAS_PARAM_TEST
+
+  // Index of the last death test case registered.  Initially -1.
+  int last_death_test_case_;
+
+  // This points to the TestCase for the currently running test.  It
+  // changes as Google Test goes through one test case after another.
+  // When no test is running, this is set to NULL and Google Test
+  // stores assertion results in ad_hoc_test_result_.  Initially NULL.
+  TestCase* current_test_case_;
+
+  // This points to the TestInfo for the currently running test.  It
+  // changes as Google Test goes through one test after another.  When
+  // no test is running, this is set to NULL and Google Test stores
+  // assertion results in ad_hoc_test_result_.  Initially NULL.
+  TestInfo* current_test_info_;
+
+  // Normally, a user only writes assertions inside a TEST or TEST_F,
+  // or inside a function called by a TEST or TEST_F.  Since Google
+  // Test keeps track of which test is currently running, it can
+  // associate such an assertion with the test it belongs to.
+  //
+  // If an assertion is encountered when no TEST or TEST_F is running,
+  // Google Test attributes the assertion result to an imaginary "ad hoc"
+  // test, and records the result in ad_hoc_test_result_.
+  TestResult ad_hoc_test_result_;
+
+  // The list of event listeners that can be used to track events inside
+  // Google Test.
+  TestEventListeners listeners_;
+
+  // The OS stack trace getter.  Will be deleted when the UnitTest
+  // object is destructed.  By default, an OsStackTraceGetter is used,
+  // but the user can set this field to use a custom getter if that is
+  // desired.
+  OsStackTraceGetterInterface* os_stack_trace_getter_;
+
+  // True iff PostFlagParsingInit() has been called.
+  bool post_flag_parse_init_performed_;
+
+  // The random number seed used at the beginning of the test run.
+  int random_seed_;
+
+  // Our random number generator.
+  internal::Random random_;
+
+  // The time of the test program start, in ms from the start of the
+  // UNIX epoch.
+  TimeInMillis start_timestamp_;
+
+  // How long the test took to run, in milliseconds.
+  TimeInMillis elapsed_time_;
+
+#if GTEST_HAS_DEATH_TEST
+  // The decomposed components of the gtest_internal_run_death_test flag,
+  // parsed when RUN_ALL_TESTS is called.
+  internal::scoped_ptr<InternalRunDeathTestFlag> internal_run_death_test_flag_;
+  internal::scoped_ptr<internal::DeathTestFactory> death_test_factory_;
+#endif  // GTEST_HAS_DEATH_TEST
+
+  // A per-thread stack of traces created by the SCOPED_TRACE() macro.
+  internal::ThreadLocal<std::vector<TraceInfo> > gtest_trace_stack_;
+
+  // The value of GTEST_FLAG(catch_exceptions) at the moment UnitTest::Run()
+  // starts.
+  bool catch_exceptions_;
+
+  GTEST_DISALLOW_COPY_AND_ASSIGN_(UnitTestImpl);
+};  // class UnitTestImpl
+
+// Shortcut for reaching the implementation object behind the global
+// UnitTest instance.
+inline UnitTestImpl* GetUnitTestImpl() {
+  UnitTest* const unit_test = UnitTest::GetInstance();
+  return unit_test->impl();
+}
+
+#if GTEST_USES_SIMPLE_RE
+
+// Internal helper functions for implementing the simple regular
+// expression matcher.
+GTEST_API_ bool IsInSet(char ch, const char* str);
+GTEST_API_ bool IsAsciiDigit(char ch);
+GTEST_API_ bool IsAsciiPunct(char ch);
+GTEST_API_ bool IsRepeat(char ch);
+GTEST_API_ bool IsAsciiWhiteSpace(char ch);
+GTEST_API_ bool IsAsciiWordChar(char ch);
+GTEST_API_ bool IsValidEscape(char ch);
+GTEST_API_ bool AtomMatchesChar(bool escaped, char pattern, char ch);
+GTEST_API_ bool ValidateRegex(const char* regex);
+GTEST_API_ bool MatchRegexAtHead(const char* regex, const char* str);
+GTEST_API_ bool MatchRepetitionAndRegexAtHead(
+    bool escaped, char ch, char repeat, const char* regex, const char* str);
+GTEST_API_ bool MatchRegexAnywhere(const char* regex, const char* str);
+
+#endif  // GTEST_USES_SIMPLE_RE
+
+// Parses the command line for Google Test flags, without initializing
+// other parts of Google Test.
+GTEST_API_ void ParseGoogleTestFlagsOnly(int* argc, char** argv);
+GTEST_API_ void ParseGoogleTestFlagsOnly(int* argc, wchar_t** argv);
+
+#if GTEST_HAS_DEATH_TEST
+
+// Returns the message describing the last system error, regardless of the
+// platform.
+GTEST_API_ std::string GetLastErrnoDescription();
+
+# if GTEST_OS_WINDOWS
+// Owns a Windows kernel handle and closes it on destruction or when it is
+// replaced, so the handle cannot leak.
+class AutoHandle {
+ public:
+  AutoHandle() : handle_(INVALID_HANDLE_VALUE) {}
+  explicit AutoHandle(HANDLE handle) : handle_(handle) {}
+
+  ~AutoHandle() { Reset(); }
+
+  // Returns the handle currently held, without giving up ownership.
+  HANDLE Get() const { return handle_; }
+
+  // Closes the held handle, if any, leaving INVALID_HANDLE_VALUE behind.
+  void Reset() { Reset(INVALID_HANDLE_VALUE); }
+
+  // Replaces the held handle with the given one.  The old handle is closed
+  // unless it is INVALID_HANDLE_VALUE; nothing happens when the given
+  // handle is the one already held.
+  void Reset(HANDLE handle) {
+    if (handle == handle_) return;
+    if (handle_ != INVALID_HANDLE_VALUE) {
+      ::CloseHandle(handle_);
+    }
+    handle_ = handle;
+  }
+
+ private:
+  HANDLE handle_;
+
+  GTEST_DISALLOW_COPY_AND_ASSIGN_(AutoHandle);
+};
+# endif  // GTEST_OS_WINDOWS
+
+// Tries to interpret str as a non-negative decimal integer and, on
+// success, stores it in *number and returns true.  Returns false, leaving
+// *number untouched, if str does not start with a digit, is not consumed
+// completely by the conversion, or holds a value that does not fit in
+// Integer.
+// GTEST_HAS_DEATH_TEST implies that we have ::std::string, so we can use
+// it here.
+template <typename Integer>
+bool ParseNaturalNumber(const ::std::string& str, Integer* number) {
+  // strtoXXX accepts optional leading whitespace and a plus or minus
+  // sign, which is undesirable here, so insist on a leading digit up
+  // front.
+  const bool starts_with_digit = !str.empty() && IsDigit(str[0]);
+  if (!starts_with_digit) {
+    return false;
+  }
+  errno = 0;
+
+  char* end;
+  // BiggestConvertible is the largest integer type that system-provided
+  // string-to-number conversion routines can return.
+
+# if GTEST_OS_WINDOWS && !defined(__GNUC__)
+
+  // MSVC and C++ Builder define __int64 instead of the standard long long.
+  typedef unsigned __int64 BiggestConvertible;
+  const BiggestConvertible parsed = _strtoui64(str.c_str(), &end, 10);
+
+# else
+
+  typedef unsigned long long BiggestConvertible;  // NOLINT
+  const BiggestConvertible parsed = strtoull(str.c_str(), &end, 10);
+
+# endif  // GTEST_OS_WINDOWS && !defined(__GNUC__)
+
+  // The conversion must have reached the terminating '\0' and must not
+  // have set errno.
+  const bool parse_success = *end == '\0' && errno == 0;
+
+  // TODO(vladl@google.com): Convert this to compile time assertion when it is
+  // available.
+  GTEST_CHECK_(sizeof(Integer) <= sizeof(parsed));
+
+  if (!parse_success) {
+    return false;
+  }
+
+  // Reject values that do not survive the round trip through Integer.
+  const Integer result = static_cast<Integer>(parsed);
+  if (static_cast<BiggestConvertible>(result) != parsed) {
+    return false;
+  }
+  *number = result;
+  return true;
+}
+#endif  // GTEST_HAS_DEATH_TEST
+
+// TestResult contains some private methods that should be hidden from
+// Google Test users but are required for testing. This class allows our
+// tests to access them.
+//
+// This class is supplied only for the purpose of testing Google Test's own
+// constructs. Do not use it in user tests, either directly or indirectly.
+class TestResultAccessor {
+ public:
+  static void RecordProperty(TestResult* test_result,
+                             const std::string& xml_element,
+                             const TestProperty& property) {
+    test_result->RecordProperty(xml_element, property);
+  }
+
+  static void ClearTestPartResults(TestResult* test_result) {
+    test_result->ClearTestPartResults();
+  }
+
+  static const std::vector<testing::TestPartResult>& test_part_results(
+      const TestResult& test_result) {
+    return test_result.test_part_results();
+  }
+};
+
+#if GTEST_CAN_STREAM_RESULTS_
+
+// Streams test results to the given port on the given host machine.
+class StreamingListener : public EmptyTestEventListener {
+ public:
+  // Abstract base class for writing strings to a socket.
+  class AbstractSocketWriter {
+   public:
+    virtual ~AbstractSocketWriter() {}
+
+    // Sends a string to the socket.
+    virtual void Send(const string& message) = 0;
+
+    // Closes the socket.
+    virtual void CloseConnection() {}
+
+    // Sends a string and a newline to the socket.
+    void SendLn(const string& message) {
+      Send(message + "\n");
+    }
+  };
+
+  // Concrete class for actually writing strings to a socket.
+  class SocketWriter : public AbstractSocketWriter {
+   public:
+    SocketWriter(const string& host, const string& port)
+        : sockfd_(-1), host_name_(host), port_num_(port) {
+      MakeConnection();
+    }
+
+    virtual ~SocketWriter() {
+      if (sockfd_ != -1)
+        CloseConnection();
+    }
+
+    // Writes the whole message to the socket.  A failed or short write
+    // is reported with a warning; it is not retried.
+    virtual void Send(const string& message) {
+      GTEST_CHECK_(sockfd_ != -1)
+          << "Send() can be called only when there is a connection.";
+
+      const int bytes_to_send = static_cast<int>(message.length());
+      const bool sent_everything =
+          write(sockfd_, message.c_str(), bytes_to_send) == bytes_to_send;
+      if (!sent_everything) {
+        GTEST_LOG_(WARNING)
+            << "stream_result_to: failed to stream to "
+            << host_name_ << ":" << port_num_;
+      }
+    }
+
+   private:
+    // Creates a client socket and connects to the server.
+    void MakeConnection();
+
+    // Closes the socket.
+    void CloseConnection() {
+      GTEST_CHECK_(sockfd_ != -1)
+          << "CloseConnection() can be called only when there is a connection.";
+
+      close(sockfd_);
+      sockfd_ = -1;
+    }
+
+    int sockfd_;  // socket file descriptor
+    const string host_name_;
+    const string port_num_;
+
+    GTEST_DISALLOW_COPY_AND_ASSIGN_(SocketWriter);
+  };  // class SocketWriter
+
+  // Escapes '=', '&', '%', and '\n' characters in str as "%xx".
+  static string UrlEncode(const char* str);
+
+  StreamingListener(const string& host, const string& port)
+      : socket_writer_(new SocketWriter(host, port)) { Start(); }
+
+  explicit StreamingListener(AbstractSocketWriter* socket_writer)
+      : socket_writer_(socket_writer) { Start(); }
+
+  void OnTestProgramStart(const UnitTest& /* unit_test */) {
+    SendLn("event=TestProgramStart");
+  }
+
+  void OnTestProgramEnd(const UnitTest& unit_test) {
+    // Note that Google Test currently only reports elapsed time for each
+    // test iteration, not for the entire test program.
+    SendLn("event=TestProgramEnd&passed=" + FormatBool(unit_test.Passed()));
+
+    // Notify the streaming server to stop.
+    socket_writer_->CloseConnection();
+  }
+
+  void OnTestIterationStart(const UnitTest& /* unit_test */, int iteration) {
+    SendLn("event=TestIterationStart&iteration=" +
+           StreamableToString(iteration));
+  }
+
+  void OnTestIterationEnd(const UnitTest& unit_test, int /* iteration */) {
+    SendLn("event=TestIterationEnd&passed=" +
+           FormatBool(unit_test.Passed()) + "&elapsed_time=" +
+           StreamableToString(unit_test.elapsed_time()) + "ms");
+  }
+
+  void OnTestCaseStart(const TestCase& test_case) {
+    SendLn(std::string("event=TestCaseStart&name=") + test_case.name());
+  }
+
+  void OnTestCaseEnd(const TestCase& test_case) {
+    SendLn("event=TestCaseEnd&passed=" + FormatBool(test_case.Passed())
+           + "&elapsed_time=" + StreamableToString(test_case.elapsed_time())
+           + "ms");
+  }
+
+  void OnTestStart(const TestInfo& test_info) {
+    SendLn(std::string("event=TestStart&name=") + test_info.name());
+  }
+
+  void OnTestEnd(const TestInfo& test_info) {
+    SendLn("event=TestEnd&passed=" +
+           FormatBool((test_info.result())->Passed()) +
+           "&elapsed_time=" +
+           StreamableToString((test_info.result())->elapsed_time()) + "ms");
+  }
+
+  // Reports one test part result.  The file name and the message are
+  // URL-encoded; a NULL file name is sent as the empty string.
+  void OnTestPartResult(const TestPartResult& test_part_result) {
+    const char* const reported_name = test_part_result.file_name();
+    const char* const file_name = reported_name == NULL ? "" : reported_name;
+    SendLn("event=TestPartResult&file=" + UrlEncode(file_name) +
+           "&line=" + StreamableToString(test_part_result.line_number()) +
+           "&message=" + UrlEncode(test_part_result.message()));
+  }
+
+ private:
+  // Sends the given message and a newline to the socket.
+  void SendLn(const string& message) { socket_writer_->SendLn(message); }
+
+  // Called at the start of streaming to notify the receiver what
+  // protocol we are using.
+  void Start() { SendLn("gtest_streaming_protocol_version=1.0"); }
+
+  string FormatBool(bool value) { return value ? "1" : "0"; }
+
+  const scoped_ptr<AbstractSocketWriter> socket_writer_;
+
+  GTEST_DISALLOW_COPY_AND_ASSIGN_(StreamingListener);
+};  // class StreamingListener
+
+#endif  // GTEST_CAN_STREAM_RESULTS_
+
+}  // namespace internal
+}  // namespace testing
+
+#endif  // GTEST_SRC_GTEST_INTERNAL_INL_H_
+#undef GTEST_IMPLEMENTATION_
+
+#if GTEST_OS_WINDOWS
+# define vsnprintf _vsnprintf
+#endif  // GTEST_OS_WINDOWS
+
+namespace testing {
+
+using internal::CountIf;
+using internal::ForEach;
+using internal::GetElementOr;
+using internal::Shuffle;
+
+// Constants.
+
+// A test whose test case name or test name matches this filter is
+// disabled and not run.  The value is a colon-separated pattern list.
+static const char kDisableTestFilter[] = "DISABLED_*:*/DISABLED_*";
+
+// A test case whose name matches this filter is considered a death
+// test case and will be run before test cases whose name doesn't
+// match this filter.
+static const char kDeathTestCaseFilter[] = "*DeathTest:*DeathTest/*";
+
+// A test filter that matches everything.
+static const char kUniversalFilter[] = "*";
+
+// The default output file for XML output.
+static const char kDefaultOutputFile[] = "test_detail.xml";
+
+// The environment variable name for the test shard index.
+static const char kTestShardIndex[] = "GTEST_SHARD_INDEX";
+// The environment variable name for the total number of test shards.
+static const char kTestTotalShards[] = "GTEST_TOTAL_SHARDS";
+// The environment variable name for the test shard status file.
+static const char kTestShardStatusFile[] = "GTEST_SHARD_STATUS_FILE";
+
+namespace internal {
+
+// The text used in failure messages to indicate the start of the
+// stack trace.
+const char kStackTraceMarker[] = "\nStack trace:\n";
+
+// g_help_flag is true iff the --help flag or an equivalent form is
+// specified on the command line.  It starts out false.
+bool g_help_flag = false;
+
+}  // namespace internal
+
+// Returns the default value of the filter flag: the universal filter,
+// which matches every test.
+static const char* GetDefaultFilter() {
+  const char* const default_filter = kUniversalFilter;
+  return default_filter;
+}
+
+// Flag definitions.  Each GTEST_DEFINE_*_ invocation below takes the flag
+// name, its default value and its help text.  Most defaults are obtained
+// through the internal::*FromGTestEnv helpers, which receive the flag name
+// and a fallback value; list_tests and show_internal_stack_frames use a
+// plain 'false' default instead.
+GTEST_DEFINE_bool_(
+    also_run_disabled_tests,
+    internal::BoolFromGTestEnv("also_run_disabled_tests", false),
+    "Run disabled tests too, in addition to the tests normally being run.");
+
+GTEST_DEFINE_bool_(
+    break_on_failure,
+    internal::BoolFromGTestEnv("break_on_failure", false),
+    "True iff a failed assertion should be a debugger break-point.");
+
+GTEST_DEFINE_bool_(
+    catch_exceptions,
+    internal::BoolFromGTestEnv("catch_exceptions", true),
+    "True iff " GTEST_NAME_
+    " should catch exceptions and treat them as test failures.");
+
+GTEST_DEFINE_string_(
+    color,
+    internal::StringFromGTestEnv("color", "auto"),
+    "Whether to use colors in the output.  Valid values: yes, no, "
+    "and auto.  'auto' means to use colors if the output is "
+    "being sent to a terminal and the TERM environment variable "
+    "is set to a terminal type that supports colors.");
+
+GTEST_DEFINE_string_(
+    filter,
+    internal::StringFromGTestEnv("filter", GetDefaultFilter()),
+    "A colon-separated list of glob (not regex) patterns "
+    "for filtering the tests to run, optionally followed by a "
+    "'-' and a : separated list of negative patterns (tests to "
+    "exclude).  A test is run if it matches one of the positive "
+    "patterns and does not match any of the negative patterns.");
+
+GTEST_DEFINE_bool_(list_tests, false,
+                   "List all tests without running them.");
+
+GTEST_DEFINE_string_(
+    output,
+    internal::StringFromGTestEnv("output", ""),
+    "A format (currently must be \"xml\"), optionally followed "
+    "by a colon and an output file name or directory. A directory "
+    "is indicated by a trailing pathname separator. "
+    "Examples: \"xml:filename.xml\", \"xml::directoryname/\". "
+    "If a directory is specified, output files will be created "
+    "within that directory, with file-names based on the test "
+    "executable's name and, if necessary, made unique by adding "
+    "digits.");
+
+GTEST_DEFINE_bool_(
+    print_time,
+    internal::BoolFromGTestEnv("print_time", true),
+    "True iff " GTEST_NAME_
+    " should display elapsed time in text output.");
+
+GTEST_DEFINE_int32_(
+    random_seed,
+    internal::Int32FromGTestEnv("random_seed", 0),
+    "Random number seed to use when shuffling test orders.  Must be in range "
+    "[1, 99999], or 0 to use a seed based on the current time.");
+
+GTEST_DEFINE_int32_(
+    repeat,
+    internal::Int32FromGTestEnv("repeat", 1),
+    "How many times to repeat each test.  Specify a negative number "
+    "for repeating forever.  Useful for shaking out flaky tests.");
+
+GTEST_DEFINE_bool_(
+    show_internal_stack_frames, false,
+    "True iff " GTEST_NAME_ " should include internal stack frames when "
+    "printing test failure stack traces.");
+
+GTEST_DEFINE_bool_(
+    shuffle,
+    internal::BoolFromGTestEnv("shuffle", false),
+    "True iff " GTEST_NAME_
+    " should randomize tests' order on every run.");
+
+GTEST_DEFINE_int32_(
+    stack_trace_depth,
+    internal::Int32FromGTestEnv("stack_trace_depth", kMaxStackTraceDepth),
+    "The maximum number of stack frames to print when an "
+    "assertion fails.  The valid range is 0 through 100, inclusive.");
+
+GTEST_DEFINE_string_(
+    stream_result_to,
+    internal::StringFromGTestEnv("stream_result_to", ""),
+    "This flag specifies the host name and the port number on which to stream "
+    "test results. Example: \"localhost:555\". The flag is effective only on "
+    "Linux.");
+
+GTEST_DEFINE_bool_(
+    throw_on_failure,
+    internal::BoolFromGTestEnv("throw_on_failure", false),
+    "When this flag is specified, a failed assertion will throw an exception "
+    "if exceptions are enabled or exit the program with a non-zero code "
+    "otherwise.");
+
+namespace internal {
+
+// Advances the Linear Congruential Generator (LCG) by one step and maps
+// the new state into [0, range).  GTEST_CHECK_ rejects a 'range' that is
+// 0 or greater than kMaxRange.
+UInt32 Random::Generate(UInt32 range) {
+  // Multiplier and increment are the ones used by glibc's rand(3).
+  const UInt32 advanced = 1103515245U*state_ + 12345U;
+  state_ = advanced % kMaxRange;
+
+  GTEST_CHECK_(range > 0)
+      << "Cannot generate a number in the range [0, 0).";
+  GTEST_CHECK_(range <= kMaxRange)
+      << "Generation of a number in [0, " << range << ") was requested, "
+      << "but this can only generate numbers in [0, " << kMaxRange << ").";
+
+  // Reducing by modulus biases the result slightly downward; that is
+  // accepted here for simplicity, since an LCG is not a high-quality
+  // generator anyway.
+  const UInt32 reduced = state_ % range;
+  return reduced;
+}
+
+// g_init_gtest_count holds the number of times InitGoogleTest() has been
+// called; users must call testing::InitGoogleTest() before
+// RUN_ALL_TESTS().  The counter is not protected by a mutex because it is
+// only accessed in the main thread.
+//
+// GTestIsInitialized() returns true iff that counter is non-zero, which
+// lets the framework catch the mistake of running tests without
+// initializing Google Test first.
+GTEST_API_ int g_init_gtest_count = 0;
+static bool GTestIsInitialized() {
+  const bool never_initialized = (g_init_gtest_count == 0);
+  return !never_initialized;
+}
+
+// Calls the given int-returning const method on every TestCase in
+// case_list and returns the total of the returned values (0 for an empty
+// list).
+static int SumOverTestCaseList(const std::vector<TestCase*>& case_list,
+                               int (TestCase::*method)() const) {
+  int total = 0;
+  for (std::vector<TestCase*>::const_iterator it = case_list.begin();
+       it != case_list.end(); ++it) {
+    const TestCase* const test_case = *it;
+    total += (test_case->*method)();
+  }
+  return total;
+}
+
+// Returns true iff the test case should run and passed.
+static bool TestCasePassed(const TestCase* test_case) {
+  if (!test_case->should_run()) {
+    return false;
+  }
+  return test_case->Passed();
+}
+
+// Returns true iff the test case should run and failed.
+static bool TestCaseFailed(const TestCase* test_case) {
+  if (!test_case->should_run()) {
+    return false;
+  }
+  return test_case->Failed();
+}
+
+// Predicate form of TestCase::should_run(), i.e. true iff test_case
+// contains at least one test that should run.
+static bool ShouldRunTestCase(const TestCase* test_case) {
+  const bool runs = test_case->should_run();
+  return runs;
+}
+
+// Constructs an AssertHelper by bundling the failure type, source
+// location and message into a heap-allocated AssertHelperData.
+AssertHelper::AssertHelper(TestPartResult::Type type,
+                           const char* file,
+                           int line,
+                           const char* message)
+    : data_(new AssertHelperData(type, file, line, message)) {}
+
+// Releases the AssertHelperData allocated by the constructor.
+AssertHelper::~AssertHelper() {
+  delete data_;
+}
+
+// Message assignment, for assertion streaming support.  Appends the user
+// message to the stored one and records the result, together with the
+// current stack trace, as a test part result.
+void AssertHelper::operator=(const Message& message) const {
+  UnitTest* const unit_test = UnitTest::GetInstance();
+  const std::string full_message =
+      AppendUserMessage(data_->message, message);
+  // Skips the stack frame for this function itself.
+  const std::string stack_trace =
+      UnitTest::GetInstance()->impl()->CurrentOsStackTraceExceptTop(1);
+  unit_test->AddTestPartResult(data_->type, data_->file, data_->line,
+                               full_message, stack_trace);  // NOLINT
+}
+
+// Mutex for linked pointers.
+GTEST_API_ GTEST_DEFINE_STATIC_MUTEX_(g_linked_ptr_mutex);
+
+// Application pathname gotten in InitGoogleTest.  Read by
+// GetCurrentExecutableName().
+std::string g_executable_path;
+
+// Returns the name of the running executable, taken from
+// g_executable_path with any directory part removed.  On Windows the
+// "exe" extension is removed as well.
+FilePath GetCurrentExecutableName() {
+  FilePath executable;
+
+#if GTEST_OS_WINDOWS
+  const FilePath without_extension =
+      FilePath(g_executable_path).RemoveExtension("exe");
+  executable.Set(without_extension);
+#else
+  const FilePath full_path(g_executable_path);
+  executable.Set(full_path);
+#endif  // GTEST_OS_WINDOWS
+
+  return executable.RemoveDirectoryName();
+}
+
+// Functions for processing the gtest_output flag.
+
+// Returns the format part of the output flag: everything before the first
+// ':' or the whole flag value when there is no ':'.  An empty result means
+// normal printed output.
+std::string UnitTestOptions::GetOutputFormat() {
+  const char* const flag_value = GTEST_FLAG(output).c_str();
+  if (flag_value == NULL) return std::string("");
+
+  const char* const separator = strchr(flag_value, ':');
+  if (separator == NULL) {
+    return std::string(flag_value);
+  }
+  return std::string(flag_value, separator - flag_value);
+}
+
+// Returns the path of the requested output file.  Without a ':' in the
+// output flag the default file name inside the original working directory
+// is used.  A relative name after the ':' is resolved against the original
+// working directory, and a directory name is turned into a unique file
+// name based on the executable name and the output format.
+std::string UnitTestOptions::GetAbsolutePathToOutputFile() {
+  const char* const flag_value = GTEST_FLAG(output).c_str();
+  if (flag_value == NULL) {
+    return "";
+  }
+
+  const char* const separator = strchr(flag_value, ':');
+  if (separator == NULL) {
+    const internal::FilePath default_path = internal::FilePath::ConcatPaths(
+        internal::FilePath(
+            UnitTest::GetInstance()->original_working_dir()),
+        internal::FilePath(kDefaultOutputFile));
+    return default_path.string();
+  }
+
+  const char* const requested = separator + 1;
+  internal::FilePath output_name(requested);
+  if (!output_name.IsAbsolutePath()) {
+    // TODO(wan@google.com): on Windows \some\path is not an absolute
+    // path (as its meaning depends on the current drive), yet the
+    // following logic for turning it into an absolute path is wrong.
+    // Fix it.
+    output_name = internal::FilePath::ConcatPaths(
+        internal::FilePath(UnitTest::GetInstance()->original_working_dir()),
+        internal::FilePath(requested));
+  }
+
+  if (!output_name.IsDirectory()) {
+    return output_name.string();
+  }
+
+  // A directory was given: derive a unique file name inside it.
+  internal::FilePath unique_name(internal::FilePath::GenerateUniqueFileName(
+      output_name, internal::GetCurrentExecutableName(),
+      GetOutputFormat().c_str()));
+  return unique_name.string();
+}
+
+// Returns true iff the wildcard pattern matches the whole string.  The
+// pattern ends at its first ':' or '\0'.  '?' matches any single
+// character and '*' matches any (possibly empty) run of characters.
+//
+// The matching is recursive and not particularly efficient, which is
+// acceptable for short inputs such as test names.
+bool UnitTestOptions::PatternMatchesString(const char *pattern,
+                                           const char *str) {
+  const char head = *pattern;
+
+  // End of pattern: the string must be exhausted too.
+  if (head == '\0' || head == ':') {
+    return *str == '\0';
+  }
+
+  // '?' consumes exactly one character.
+  if (head == '?') {
+    return *str != '\0' && PatternMatchesString(pattern + 1, str + 1);
+  }
+
+  // '*' either consumes one more character or matches the empty string.
+  if (head == '*') {
+    if (*str != '\0' && PatternMatchesString(pattern, str + 1)) {
+      return true;
+    }
+    return PatternMatchesString(pattern + 1, str);
+  }
+
+  // Any other character matches only itself.
+  return head == *str && PatternMatchesString(pattern + 1, str + 1);
+}
+
+// Returns true iff name matches at least one pattern of the
+// colon-separated pattern list in filter.
+bool UnitTestOptions::MatchesFilter(
+    const std::string& name, const char* filter) {
+  const char* pattern = filter;
+  while (!PatternMatchesString(pattern, name.c_str())) {
+    // Locates the separator that ends the current pattern.
+    const char* const separator = strchr(pattern, ':');
+
+    // No separator left means every pattern has been tried.
+    if (separator == NULL) {
+      return false;
+    }
+
+    // Continues with the pattern after the separator (the ':' character).
+    pattern = separator + 1;
+  }
+  return true;
+}
+
+// Returns true iff the filter flag selects the test whose full name is
+// "<test_case_name>.<test_name>".
+bool UnitTestOptions::FilterMatchesTest(const std::string &test_case_name,
+                                        const std::string &test_name) {
+  const std::string& full_name = test_case_name + "." + test_name.c_str();
+
+  // The flag value is split at its first '-': the part before it is the
+  // positive filter and the part after it the negative filter.
+  const char* const flag_value = GTEST_FLAG(filter).c_str();
+  const char* const dash = strchr(flag_value, '-');
+  std::string positive;
+  std::string negative;
+  if (dash != NULL) {
+    positive = std::string(flag_value, dash);  // Everything up to the dash
+    negative = std::string(dash + 1);          // Everything after the dash
+    if (positive.empty()) {
+      // Treat '-test1' as the same as '*-test1'
+      positive = kUniversalFilter;
+    }
+  } else {
+    // No dash: the whole flag value is the positive filter.
+    positive = GTEST_FLAG(filter).c_str();
+    negative = "";
+  }
+
+  // Each filter is a colon-separated pattern list that matches a test if
+  // any of its patterns does.  The test is selected when the positive
+  // filter matches and the negative filter does not.
+  if (!MatchesFilter(full_name, positive.c_str())) {
+    return false;
+  }
+  return !MatchesFilter(full_name, negative.c_str());
+}
+
+#if GTEST_HAS_SEH
+// Returns EXCEPTION_EXECUTE_HANDLER if Google Test should handle the
+// given SEH exception, or EXCEPTION_CONTINUE_SEARCH otherwise, which
+// makes the function usable as an __except condition.
+int UnitTestOptions::GTestShouldProcessSEH(DWORD exception_code) {
+  // SEH exception code for C++ exceptions (VC++ implements them via SEH,
+  // apparently).
+  // (see http://support.microsoft.com/kb/185294 for more information).
+  const DWORD kCxxExceptionCode = 0xe06d7363;
+
+  // Google Test handles the exception only if all of these hold:
+  //   1. the user wants exceptions to be caught,
+  //   2. this is not a breakpoint exception,
+  //   3. this is not a C++ exception.
+  const bool should_handle =
+      GTEST_FLAG(catch_exceptions) &&
+      exception_code != EXCEPTION_BREAKPOINT &&
+      exception_code != kCxxExceptionCode;
+
+  if (should_handle) {
+    return EXCEPTION_EXECUTE_HANDLER;
+  }
+  return EXCEPTION_CONTINUE_SEARCH;
+}
+#endif  // GTEST_HAS_SEH
+
+}  // namespace internal
+
+// Installs this object as the test part result reporter used by Google
+// Test, intercepting only results reported from the current thread.
+// 'result' is where the intercepted results are collected.
+ScopedFakeTestPartResultReporter::ScopedFakeTestPartResultReporter(
+    TestPartResultArray* result)
+    : intercept_mode_(INTERCEPT_ONLY_CURRENT_THREAD), result_(result) {
+  Init();
+}
+
+// Installs this object as the test part result reporter used by Google
+// Test, using the given intercept mode.  'result' is where the
+// intercepted results are collected.
+ScopedFakeTestPartResultReporter::ScopedFakeTestPartResultReporter(
+    InterceptMode intercept_mode, TestPartResultArray* result)
+    : intercept_mode_(intercept_mode), result_(result) {
+  Init();
+}
+
+// Remembers the reporter currently installed for the chosen scope (global
+// or current thread) and installs this object in its place.
+void ScopedFakeTestPartResultReporter::Init() {
+  internal::UnitTestImpl* const impl = internal::GetUnitTestImpl();
+  if (intercept_mode_ != INTERCEPT_ALL_THREADS) {
+    old_reporter_ = impl->GetTestPartResultReporterForCurrentThread();
+    impl->SetTestPartResultReporterForCurrentThread(this);
+    return;
+  }
+  old_reporter_ = impl->GetGlobalTestPartResultReporter();
+  impl->SetGlobalTestPartResultReporter(this);
+}
+
+// Reinstalls the reporter that Init() replaced, for the same scope
+// (global or current thread).
+ScopedFakeTestPartResultReporter::~ScopedFakeTestPartResultReporter() {
+  internal::UnitTestImpl* const impl = internal::GetUnitTestImpl();
+  if (intercept_mode_ != INTERCEPT_ALL_THREADS) {
+    impl->SetTestPartResultReporterForCurrentThread(old_reporter_);
+    return;
+  }
+  impl->SetGlobalTestPartResultReporter(old_reporter_);
+}
+
+// Appends the reported result to the result array supplied at
+// construction.  This method is from the
+// TestPartResultReporterInterface interface.
+void ScopedFakeTestPartResultReporter::ReportTestPartResult(
+    const TestPartResult& result) {
+  TestPartResultArray* const collected = result_;
+  collected->Append(result);
+}
+
+namespace internal {
+
+// Returns the type ID of ::testing::Test.  Always use this function
+// instead of calling GetTypeId< ::testing::Test>() directly: a suspected
+// linker bug on Mac OS X, seen when Google Test is used as a framework,
+// makes GetTypeId< ::testing::Test>() return different values depending
+// on whether it is called from the framework or from user test code.
+// This function always performs the call from within gtest.cc, i.e. from
+// inside the framework, so its result is consistent.
+TypeId GetTestTypeId() {
+  const TypeId test_type_id = GetTypeId<Test>();
+  return test_type_id;
+}
+
+// The value of GetTestTypeId() as seen from within the Google Test
+// library, captured once when this constant is initialized.  This is
+// solely for testing GetTestTypeId().
+extern const TypeId kTestTypeIdInGoogleTest = GetTestTypeId();
+
+// Predicate-formatter that succeeds iff 'results' holds exactly one test
+// part result, that result has the given type, and its message contains
+// the given substring.  The three expression-text parameters are unused.
+AssertionResult HasOneFailure(const char* /* results_expr */,
+                              const char* /* type_expr */,
+                              const char* /* substr_expr */,
+                              const TestPartResultArray& results,
+                              TestPartResult::Type type,
+                              const string& substr) {
+  const bool want_fatal = (type == TestPartResult::kFatalFailure);
+  const std::string expected(want_fatal ? "1 fatal failure"
+                                        : "1 non-fatal failure");
+
+  // Wrong number of results: list every one of them in the message.
+  if (results.size() != 1) {
+    Message msg;
+    msg << "Expected: " << expected << "\n"
+        << "  Actual: " << results.size() << " failures";
+    for (int index = 0; index < results.size(); ++index) {
+      msg << "\n" << results.GetTestPartResult(index);
+    }
+    return AssertionFailure() << msg;
+  }
+
+  const TestPartResult& only_result = results.GetTestPartResult(0);
+
+  // Exactly one result, but of the wrong type.
+  if (only_result.type() != type) {
+    return AssertionFailure() << "Expected: " << expected << "\n"
+                              << "  Actual:\n"
+                              << only_result;
+  }
+
+  // Right type, but the message lacks the expected substring.
+  const bool has_substr =
+      strstr(only_result.message(), substr.c_str()) != NULL;
+  if (!has_substr) {
+    return AssertionFailure() << "Expected: " << expected << " containing \""
+                              << substr << "\"\n"
+                              << "  Actual:\n"
+                              << only_result;
+  }
+
+  return AssertionSuccess();
+}
+
+// Stores what the destructor will verify: the array holding the test part
+// results, the expected failure type, and the substring the failure
+// message must contain.
+SingleFailureChecker::SingleFailureChecker(
+    const TestPartResultArray* results,
+    TestPartResult::Type type,
+    const string& substr)
+    : results_(results), type_(type), substr_(substr) {
+}
+
+// The destructor of SingleFailureChecker verifies, via HasOneFailure,
+// that the given TestPartResultArray contains exactly one failure that
+// has the given type and contains the given substring.  If that's not
+// the case, a non-fatal failure will be generated.
+SingleFailureChecker::~SingleFailureChecker() {
+  EXPECT_PRED_FORMAT3(HasOneFailure, *results_, type_, substr_);
+}
+
+// Remembers the UnitTestImpl that reported results are forwarded to.
+DefaultGlobalTestPartResultReporter::DefaultGlobalTestPartResultReporter(
+    UnitTestImpl* unit_test)
+    : unit_test_(unit_test) {
+}
+
+// Adds the result to the current test's result and then notifies the
+// event listeners through the repeater.
+void DefaultGlobalTestPartResultReporter::ReportTestPartResult(
+    const TestPartResult& result) {
+  UnitTestImpl* const impl = unit_test_;
+  impl->current_test_result()->AddTestPartResult(result);
+  impl->listeners()->repeater()->OnTestPartResult(result);
+}
+
+// Remembers the UnitTestImpl whose global reporter receives the results.
+DefaultPerThreadTestPartResultReporter::DefaultPerThreadTestPartResultReporter(
+    UnitTestImpl* unit_test)
+    : unit_test_(unit_test) {
+}
+
+// Forwards the result to the global test part result reporter.
+void DefaultPerThreadTestPartResultReporter::ReportTestPartResult(
+    const TestPartResult& result) {
+  TestPartResultReporterInterface* const global_reporter =
+      unit_test_->GetGlobalTestPartResultReporter();
+  global_reporter->ReportTestPartResult(result);
+}
+
+// Returns the global test part result reporter.  The read happens while
+// holding global_test_part_result_reporter_mutex_.
+TestPartResultReporterInterface*
+UnitTestImpl::GetGlobalTestPartResultReporter() {
+  internal::MutexLock lock(&global_test_part_result_reporter_mutex_);
+  TestPartResultReporterInterface* const reporter =
+      global_test_part_result_repoter_;
+  return reporter;
+}
+
+// Replaces the global test part result reporter.  The write happens while
+// holding global_test_part_result_reporter_mutex_.
+void UnitTestImpl::SetGlobalTestPartResultReporter(
+    TestPartResultReporterInterface* reporter) {
+  internal::MutexLock lock(&global_test_part_result_reporter_mutex_);
+  this->global_test_part_result_repoter_ = reporter;
+}
+
+// Returns the reporter stored in the per-thread reporter slot for the
+// calling thread.
+TestPartResultReporterInterface*
+UnitTestImpl::GetTestPartResultReporterForCurrentThread() {
+  TestPartResultReporterInterface* const reporter =
+      per_thread_test_part_result_reporter_.get();
+  return reporter;
+}
+
+// Stores the reporter in the per-thread reporter slot for the calling
+// thread.
+void UnitTestImpl::SetTestPartResultReporterForCurrentThread(
+    TestPartResultReporterInterface* reporter) {
+  this->per_thread_test_part_result_reporter_.set(reporter);
+}
+
+// Counts the test cases that should run and passed.
+int UnitTestImpl::successful_test_case_count() const {
+  const int passed = CountIf(test_cases_, TestCasePassed);
+  return passed;
+}
+
+// Counts the test cases that should run and failed.
+int UnitTestImpl::failed_test_case_count() const {
+  const int failed = CountIf(test_cases_, TestCaseFailed);
+  return failed;
+}
+
+// Returns the number of registered test cases, converted to int.
+int UnitTestImpl::total_test_case_count() const {
+  const int total = static_cast<int>(test_cases_.size());
+  return total;
+}
+
+// Counts the test cases that contain at least one test that should run.
+int UnitTestImpl::test_case_to_run_count() const {
+  const int runnable = CountIf(test_cases_, ShouldRunTestCase);
+  return runnable;
+}
+
+// Sums TestCase::successful_test_count() over all test cases.
+int UnitTestImpl::successful_test_count() const {
+  const int total =
+      SumOverTestCaseList(test_cases_, &TestCase::successful_test_count);
+  return total;
+}
+
+// Sums TestCase::failed_test_count() over all test cases.
+int UnitTestImpl::failed_test_count() const {
+  const int total =
+      SumOverTestCaseList(test_cases_, &TestCase::failed_test_count);
+  return total;
+}
+
+// Sums TestCase::reportable_disabled_test_count() over all test cases,
+// i.e. the disabled tests that will be reported in the XML report.
+int UnitTestImpl::reportable_disabled_test_count() const {
+  const int total = SumOverTestCaseList(
+      test_cases_, &TestCase::reportable_disabled_test_count);
+  return total;
+}
+
+// Sums TestCase::disabled_test_count() over all test cases.
+int UnitTestImpl::disabled_test_count() const {
+  const int total =
+      SumOverTestCaseList(test_cases_, &TestCase::disabled_test_count);
+  return total;
+}
+
+// Sums TestCase::reportable_test_count() over all test cases, i.e. the
+// tests to be printed in the XML report.
+int UnitTestImpl::reportable_test_count() const {
+  const int total =
+      SumOverTestCaseList(test_cases_, &TestCase::reportable_test_count);
+  return total;
+}
+
+// Sums TestCase::total_test_count() over all test cases.
+int UnitTestImpl::total_test_count() const {
+  const int total =
+      SumOverTestCaseList(test_cases_, &TestCase::total_test_count);
+  return total;
+}
+
+// Sums TestCase::test_to_run_count() over all test cases.
+int UnitTestImpl::test_to_run_count() const {
+  const int total =
+      SumOverTestCaseList(test_cases_, &TestCase::test_to_run_count);
+  return total;
+}
+
+// Returns the current OS stack trace as an std::string.
+//
+// The intended contract: at most gtest_stack_trace_depth frames are
+// included, and skip_count top frames are skipped without counting
+// against that limit.  For example, if Foo() calls Bar(), which in turn
+// calls CurrentOsStackTraceExceptTop(1), Foo() would be included in the
+// trace but Bar() and CurrentOsStackTraceExceptTop() would not.
+//
+// This implementation is a stub: it ignores skip_count and always
+// returns an empty string.
+std::string UnitTestImpl::CurrentOsStackTraceExceptTop(int skip_count) {
+  static_cast<void>(skip_count);
+  return std::string();
+}
+
+// Returns the current time in milliseconds.
+TimeInMillis GetTimeInMillis() {
+#if GTEST_OS_WINDOWS_MOBILE || defined(__BORLANDC__)
+  // Difference between 1970-01-01 and 1601-01-01 in milliseconds.
+  // http://analogous.blogspot.com/2005/04/epoch.html
+  const TimeInMillis kJavaEpochToWinFileTimeDelta =
+    static_cast<TimeInMillis>(116444736UL) * 100000UL;
+  const DWORD kTenthMicrosInMilliSecond = 10000;
+
+  SYSTEMTIME now_systime;
+  FILETIME now_filetime;
+  ULARGE_INTEGER now_int64;
+  // TODO(kenton@google.com): Shouldn't this just use
+  //   GetSystemTimeAsFileTime()?
+  GetSystemTime(&now_systime);
+  if (SystemTimeToFileTime(&now_systime, &now_filetime)) {
+    now_int64.LowPart = now_filetime.dwLowDateTime;
+    now_int64.H

<TRUNCATED>

[02/12] incubator-singa git commit: Transfer code from nusinga repo to singa apache repo. New communication framework is implemented to unify the frameworks of existing distributed deep learning systems. Communication is now implemented using ZeroMQ. API

Posted by wa...@apache.org.
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/b2dc51d2/src/test/dist_test/test_table_server.cc
----------------------------------------------------------------------
diff --git a/src/test/dist_test/test_table_server.cc b/src/test/dist_test/test_table_server.cc
new file mode 100644
index 0000000..5f3612c
--- /dev/null
+++ b/src/test/dist_test/test_table_server.cc
@@ -0,0 +1,357 @@
+//  Copyright © 2014 Anh Dinh. All Rights Reserved.
+
+#include "core/global-table.h"
+#include "core/common.h"
+#include "core/table.h"
+#include "core/table_server.h"
+#include "utils/global_context.h"
+#include "utils/common.h"
+#include <gflags/gflags.h>
+#include "proto/model.pb.h"
+#include "proto/common.pb.h"
+#include "worker.h"
+#include "coordinator.h"
+#include "utils/common.h"
+#include "utils/proto_helper.h"
+
+#include <cmath>
+#include <stdlib.h>
+#include <vector>
+#include <iostream>
+#include <fstream>
+
+
+/**
+ * Test for table server access. The table is of type <VKey,SGDValue>
+ * (see create_mem_table).
+ *
+ * Command-line flags of the test follow.  checkpoint_enabled is only
+ * declared here; its definition is expected to live in another
+ * translation unit.
+ */
+DEFINE_bool(restore_mode, false, "restore from checkpoint file");
+using namespace lapis;
+using std::vector;
+
+DEFINE_int32(checkpoint_frequency, 5000, "frequency for cp");
+DEFINE_int32(checkpoint_after, 1, "cp after this steps");
+DEFINE_string(par_mode, "hybrid",  "time training algorithm");
+DEFINE_bool(restore, false, "restore from checkpoint file");
+
+DEFINE_string(db_backend, "lmdb", "backend db");
+DEFINE_string(system_conf, "examples/imagenet12/system.conf", "configuration file for node roles");
+DEFINE_string(model_conf, "examples/imagenet12/model.conf", "DL model configuration file");
+DEFINE_string(checkpoint_dir,"/data1/wangwei/lapis/","check point dir");
+DEFINE_int32(threshold,1000000, "max # of parameters in a vector");
+DEFINE_int32(iterations,5,"numer of get/put iterations");
+DEFINE_int32(workers,2,"numer of workers doing get/put");
+// Declared once; the original repeated this declaration a few lines apart.
+DECLARE_bool(checkpoint_enabled);
+
+/**
+ * Get and update handler for the <VKey, SGDValue> test table.
+ */
+struct AnhUpdateHandler: BaseUpdateHandler<VKey, SGDValue> {
+	// Adds each element of b's first gradient, element-wise and in place,
+	// to a's data values.  Always returns true.
+	// NOTE(review): assumes a's data holds at least b.grad(0).value_size()
+	// elements and that b has at least one gradient; neither is checked
+	// here -- confirm against callers.
+	bool Update(SGDValue *a, const SGDValue &b) {
+
+		float * adptr = a->mutable_data()->mutable_value()->mutable_data();
+		const float*bdptr = b.grad(0).value().data();
+		for (int i = 0; i < b.grad(0).value_size(); i++)
+			adptr[i] += bdptr[i];
+
+		return true;
+	}
+
+	// Copies the stored value into *ret unchanged; the key is not used.
+	// Always returns true.
+	bool Get(const VKey k, const SGDValue &val, SGDValue *ret) {
+		*ret = val;
+		return true;
+	}
+
+	// NOTE(review): this returns false for every tuple, while the comment
+	// originally attached to the return said "always checkpoint".  One of
+	// the two is wrong -- confirm the intended value.
+	bool is_checkpointable(const VKey k, const SGDValue v) {
+		return false;
+	}
+};
+
+// Global state shared by the functions of this test.
+typedef map<int, GlobalTable*> Map;
+// Tables indexed by table ID; filled by create_mem_table().
+Map tables;
+shared_ptr<NetworkThread> network;
+shared_ptr<GlobalContext> context;
+// One entry per table server; filled by coordinator_assign_tables().
+std::vector<ServerState*> server_states;
+// Created by table_init().
+TableServer *table_server;
+
+// Number of tuples used by the test, and the number of float values held
+// by each of them (indexed by tuple key).
+#define SIZE 16
+int tuple_sizes[SIZE] = {27448736, 16777216, 4096000, 1327104, 884736, 884736, 614400,14112,4096,4096,1000,384,384,256,256,96};
+
+/**
+ * Creates the in-memory <VKey, SGDValue> table with the given ID and
+ * registers it in the global `tables` map.
+ * @param id table ID.
+ * @param num_shards number of shards passed to the table descriptor.
+ */
+void create_mem_table(int id, int num_shards){
+	TableDescriptor *descriptor = new TableDescriptor(id, num_shards);
+	descriptor->key_marshal = new Marshal<VKey>();
+	descriptor->value_marshal = new Marshal<SGDValue>();
+	descriptor->sharder = new VKeySharder;
+	descriptor->accum = new AnhUpdateHandler;
+	descriptor->partition_factory = new SparseTable<VKey, SGDValue>::Factory;
+
+	TypedGlobalTable<VKey, SGDValue> *typed_table =
+			new TypedGlobalTable<VKey, SGDValue>();
+	typed_table->Init(descriptor);
+	tables[id] = typed_table;
+}
+
+/**
+ * Coordinator assigns the shards of one table to the table servers.
+ *
+ * Waits for one registration message per table server, distributes the
+ * table's shards over the registered servers in round-robin order, records
+ * the owners in the local table, and broadcasts the assignment.
+ *
+ * Aborts (glog CHECK) if no table server is registered, because there is
+ * then nothing to assign shards to.
+ *
+ * @param id table ID.
+ */
+void coordinator_assign_tables(int id) {
+
+	// wait for the servers to be up.
+	for (int i = 0; i < context->num_procs(); i++) {
+		RegisterWorkerRequest req;
+		int src = 0;
+		//  adding memory server.
+		if (context->IsTableServer(i)) {
+			VLOG(3)<< "Waiting for message from table server " << i;
+			network->Read(MPI::ANY_SOURCE, MTYPE_REGISTER_WORKER, &req, &src);
+			server_states.push_back(new ServerState(i));
+		}
+	}
+
+	// The round-robin below indexes server_states and takes a modulus by its
+	// size; with no server that would be an out-of-range access and a
+	// division by zero, so fail loudly instead.
+	CHECK(!server_states.empty())
+			<< "no table server registered; cannot assign shards of table " << id;
+
+	VLOG(3) << " All servers registered and started up. Ready to go";
+	VLOG(3) << "num of shards" << tables[id]->num_shards() << " for table "	<< id;
+
+	// assign shards to servers in round-robin fashion.
+	int server_idx = 0;
+	for (int shard = 0; shard < tables[id]->num_shards(); ++shard) {
+		ServerState &server = *server_states[server_idx];
+		VLOG(3) << "Assigning table (" << id << "," << shard << ") to server "
+				<< server_states[server_idx]->server_id;
+		server.shard_id = shard;
+		server.local_shards.insert(new TaskId(id, shard));
+		server_idx = (server_idx + 1) % server_states.size();
+	}
+	ShardAssignmentRequest req;
+	for (size_t i = 0; i < server_states.size(); ++i) {
+		ServerState &server = *server_states[i];
+		for (auto * task : server.local_shards) {
+			ShardAssignment *s = req.add_assign();
+			s->set_new_worker(server.server_id);
+			s->set_table(task->table);
+			s->set_shard(task->shard);
+			//  update local tables
+			GlobalTable *t = tables.at(task->table);
+			t->get_partition_info(task->shard)->owner = server.server_id;
+			delete task;
+		}
+		// Every task was just deleted; drop the dangling pointers so nothing
+		// can dereference or delete them again later.
+		server.local_shards.clear();
+	}
+
+	network->SyncBroadcast(MTYPE_SHARD_ASSIGNMENT, MTYPE_SHARD_ASSIGNMENT_DONE,
+			req);
+	VLOG(3) << "done table assignment... ";
+}
+
+
+/**
+ * Creates the global table server and starts it on the tables registered
+ * in `tables`.
+ */
+void table_init(){
+	TableServer *server = new TableServer();
+	table_server = server;
+	table_server->StartTableServer(tables);
+	VLOG(3) << "table server started on process "<< NetworkThread::Get()->id();
+}
+
+
+/**
+ * Coordinator loads data to table 0.
+ *
+ * Puts SIZE tuples; tuple i has key i, and both its data and its single
+ * gradient hold tuple_sizes[i] values 0, 1, 2, ...
+ */
+void coordinator_load_data() {
+	auto table = static_cast<TypedGlobalTable<VKey, SGDValue>*>(tables[0]);
+	for (int idx = 0; idx < SIZE; ++idx) {
+		VKey key;
+		SGDValue tuple;
+		DAryProto *data = tuple.mutable_data();
+		DAryProto *grad = tuple.add_grad();
+		for (int j = 0; j < tuple_sizes[idx]; ++j) {
+			const float element = j * 1.0f;
+			data->add_value(element);
+			grad->add_value(element);
+		}
+		key.set_key(idx);
+		table->put(key, tuple);
+	}
+	VLOG(3) << "Done loading " << SIZE << " tuples ...";
+}
+
+/**
+ * Worker gets tuples from the server.
+ *
+ * Issues an asynchronous get for each of the SIZE keys of table 0, then
+ * collects SIZE replies, sleeping briefly between polls while no reply is
+ * available.
+ */
+void get() {
+	auto table = static_cast<TypedGlobalTable<VKey,SGDValue>*>(tables[0]);
+	SGDValue value;
+	for (int idx = 0; idx < SIZE; ++idx) {
+		VKey request_key;
+		request_key.set_key(idx);
+		table->async_get(request_key, &value);
+	}
+	VLOG(3) << "Done sending get requests ...";
+
+	for (int collected = 0; collected < SIZE; ++collected) {
+		VKey reply_key;
+		for (;;) {
+			if (table->async_get_collect(&reply_key, &value))
+				break;
+			Sleep(0.0001);
+		}
+	}
+}
+
+/**
+ * Worker updates tuples.
+ *
+ * Sends one update per key of table 0; the update for key i carries a
+ * single gradient holding tuple_sizes[i] values 0, 1, 2, ...
+ */
+void update() {
+	auto table = static_cast<TypedGlobalTable<VKey, SGDValue>*>(tables[0]);
+	for (int idx = 0; idx < SIZE; ++idx) {
+		VKey key;
+		key.set_key(idx);
+
+		SGDValue delta;
+		DAryProto *grad = delta.add_grad();
+		for (int j = 0; j < tuple_sizes[idx]; ++j) {
+			grad->add_value(j * 1.0f);
+		}
+
+		table->update(key, delta);
+	}
+	VLOG(3) << "Done updating " << SIZE << " tuples ...";
+}
+
+
+/**
+ * Worker workload: two rounds of update() followed by a single get().
+ */
+void worker_test_data() {
+	const int kUpdateRounds = 2;
+	for (int round = 0; round < kUpdateRounds; ++round)
+		update();
+	get();
+}
+
+/**
+ * Shutdown the process.
+ *
+ * Non-coordinators send MTYPE_WORKER_END to rank num_procs()-1, wait for
+ * MTYPE_SHUTDOWN from that rank, stop their table server (if they run one)
+ * and shut the network down. The coordinator reads num_procs()-1
+ * MTYPE_WORKER_END messages, sends MTYPE_SHUTDOWN to ranks
+ * 0..network->size()-2 and then shuts its own network down.
+ */
+void shutdown() {
+	if (!context->AmICoordinator()) {
+		//network->Flush();
+		network->Send(context->num_procs() - 1, MTYPE_WORKER_END,
+				EmptyMessage());
+		EmptyMessage stop;
+		network->Read(context->num_procs() - 1, MTYPE_SHUTDOWN, &stop);
+
+		if (context->AmITableServer()) {
+			RequestDispatcher::Get()->PrintStats();
+			table_server->ShutdownTableServer();
+		}
+
+		network->Shutdown();
+		return;
+	}
+
+	// Coordinator: wait until every other process has reported completion.
+	EmptyMessage done;
+	int reported = 0;
+	while (reported < context->num_procs() - 1) {
+		network->Read(MPI::ANY_SOURCE, MTYPE_WORKER_END, &done);
+		++reported;
+	}
+
+	// ... then release them all.
+	EmptyMessage shutdown_msg;
+	for (int dst = 0; dst < network->size() - 1; ++dst)
+		network->Send(dst, MTYPE_SHUTDOWN, shutdown_msg);
+	//network->Flush();
+	network->Shutdown();
+}
+
+/**
+ * Worker handle shard assignment from the coordinator.
+ *
+ * Blocks on one MTYPE_SHARD_ASSIGNMENT message from the coordinator and
+ * records the new owner of every shard it lists. When FLAGS_checkpoint_enabled
+ * is set, each local shard additionally gets its check-point LogFile opened:
+ *   - file missing: opened in "w" mode;
+ *   - file present: in FLAGS_restore_mode the shard is first restored from it
+ *     via Restore(), then the file is reopened in "a" mode.
+ * Finishes by sending MTYPE_SHARD_ASSIGNMENT_DONE back to the coordinator.
+ */
+void HandleShardAssignment() {
+
+	ShardAssignmentRequest shard_req;
+	auto mpi = NetworkThread::Get();
+	mpi->Read(GlobalContext::kCoordinator, MTYPE_SHARD_ASSIGNMENT, &shard_req);
+
+	//  request read from coordinator
+	for (int i = 0; i < shard_req.assign_size(); i++) {
+		const ShardAssignment &a = shard_req.assign(i);
+		GlobalTable *t = tables.at(a.table());
+		t->get_partition_info(a.shard())->owner = a.new_worker();
+
+		// check-point files are only created for local shards
+		if (!FLAGS_checkpoint_enabled || !t->is_local_shard(a.shard()))
+			continue;
+
+		string checkpoint_file = StringPrintf("%s/checkpoint_%d",
+				FLAGS_checkpoint_dir.c_str(), a.shard());
+
+		// probe whether a check-point file already exists
+		FILE *tmp_file = fopen(checkpoint_file.c_str(), "r");
+		const bool exists = (tmp_file != nullptr);
+		if (exists)
+			fclose(tmp_file);
+
+		auto cp = t->checkpoint_files();
+
+		if (!exists) { // not exist -> open to writing first time
+			(*cp)[a.shard()] = new LogFile(checkpoint_file, "w", a.shard());
+			continue;
+		}
+
+		if (FLAGS_restore_mode) { //open in read mode to restore, then close
+			// the logged table size is only needed for the timing line below
+			LogFile *file = new LogFile(checkpoint_file, "rw", 0);
+			int table_size = file->read_latest_table_size();
+			delete file;
+
+			double start = Now();
+			(*cp)[a.shard()] = new LogFile(checkpoint_file, "r",
+					a.shard());
+			t->Restore(a.shard());
+			// the read-mode handle is replaced by the append-mode one below
+			delete (*cp)[a.shard()];
+			double end = Now();
+			LOG(ERROR) << "restore time\t" << end - start << "\tfor\t"
+					<< table_size << "\tthreshold\t" << FLAGS_threshold;
+		}
+
+		// exists -> keep appending to the existing file
+		(*cp)[a.shard()] = new LogFile(checkpoint_file, "a", a.shard());
+	}
+
+	EmptyMessage empty;
+	mpi->Send(GlobalContext::kCoordinator, MTYPE_SHARD_ASSIGNMENT_DONE, empty);
+	VLOG(3) << "Done handling shard assignment ...";
+
+}
+
+
+/**
+ * Entry point of the distributed table test.
+ *
+ * Every process initialises MPI, logging and flags, creates table 0 and then
+ * takes one of three roles reported by the global context: coordinator,
+ * table server or plain worker. All roles meet at a network barrier and end
+ * in shutdown().
+ */
+int main(int argc, char **argv) {
+	FLAGS_logtostderr = 1;
+	// NOTE(review): `provided` is never compared against MPI_THREAD_MULTIPLE.
+	int provided;
+	MPI_Init_thread(&argc, &argv, MPI_THREAD_MULTIPLE, &provided);
+	google::InitGoogleLogging(argv[0]);
+	gflags::ParseCommandLineFlags(&argc, &argv, true);
+
+	context = GlobalContext::Get(FLAGS_system_conf);
+	network = NetworkThread::Get();
+
+	// The model configuration is parsed here but not used again in main().
+	ModelProto model;
+	ReadProtoFromTextFile(FLAGS_model_conf.c_str(), &model);
+
+	create_mem_table(0, context->num_table_servers());
+
+	if (context->AmICoordinator()) {
+		// coordinator: hand out shards, fill the table, then release the others
+		coordinator_assign_tables(0);
+		coordinator_load_data();
+		network->barrier();
+	} else {
+		if (context->AmITableServer()) {
+			// table server: start serving before accepting its shard assignment
+			table_init();
+			HandleShardAssignment();
+			network->barrier();
+		} else {
+			// worker: wait for the barrier, pause, then run the workload
+			HandleShardAssignment();
+			network->barrier();
+			Sleep(1);
+			VLOG(3) << "Worker cleared the barrier ...";
+			worker_test_data();
+		}
+	}
+
+	// NOTE(review): MPI_Finalize is not called here; presumably
+	// network->Shutdown() inside shutdown() takes care of it — confirm.
+	shutdown();
+	return 0;
+}
+
+

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/b2dc51d2/src/test/dist_test/test_tuple.cc
----------------------------------------------------------------------
diff --git a/src/test/dist_test/test_tuple.cc b/src/test/dist_test/test_tuple.cc
new file mode 100644
index 0000000..727f8e3
--- /dev/null
+++ b/src/test/dist_test/test_tuple.cc
@@ -0,0 +1,258 @@
+#include <cstdio>
+#include <iostream>
+#include <sstream>
+#include <string>
+#include <vector>
+
+#include "server.h"
+#include "proto/worker.pb.h"
+#include "utils/network_service.h"
+#include "core/common.h"
+#include "core/network_queue.h"
+#include "proto/model.pb.h"
+#include "proto/common.pb.h"
+#include "utils/global_context.h"
+
+/**
+ * @file test_tuple.cc
+ *
+ * Test performance of TableServer put/get/update operations.
+ */
+DECLARE_double(sleep_time);
+
+using namespace lapis;
+using namespace std;
+using std::vector;
+
+#define NKEYS 1000
+#define TUPLE_SIZE 50000000
+
+#ifndef FLAGS_v
+  DEFINE_int32(v, 3, "vlog controller");
+#endif
+
+
+#define SIZE 16
+#define THRESHOLD 500000
+int tuple_sizes[SIZE] = {37448736, 16777216, 4096000, 1327104, 884736, 884736, 614400,14112,4096,4096,1000,384,384,256,256,96};
+vector<int> valsizes;
+int collect_size;
+int num_tuples;
+
+/**
+ * Send a put request for tuple `tid` on table 0.
+ *
+ * @param tid     tuple id; the request goes to shard (and destination)
+ *                tid % num_servers().
+ * @param size    number of 0.0f values placed in the tuple's data array.
+ * @param version version recorded in the tuple key.
+ */
+void Put(int tid, int size, int version) {
+	RequestBase req;
+	req.set_table(0);
+	req.set_source(NetworkService::Get()->id());
+
+	const int shard = tid % GlobalContext::Get()->num_servers();
+	PutRequest *put = req.MutableExtension(PutRequest::name);
+	put->set_shard(shard);
+
+	TableData *payload = put->mutable_data();
+	TKey *k = payload->mutable_key();
+	TVal *v = payload->mutable_value();
+	k->set_id(tid);
+	k->set_version(version);
+
+	DAryProto *values = v->mutable_data();
+	for (int filled = 0; filled < size; ++filled)
+		values->add_value(0.0f);
+
+	// TODO check the msg type
+	NetworkService::Get()->Send(shard, MTYPE_REQUEST, req);
+}
+
+/**
+ * Send an update request for tuple `tid` on table 0.
+ *
+ * @param tid     tuple id; the request goes to shard (and destination)
+ *                tid % num_servers().
+ * @param size    number of 1.0f values placed in the tuple's gradient array.
+ * @param version version recorded in the tuple key.
+ */
+void Update(int tid, int size, int version) {
+	RequestBase req;
+	req.set_table(0);
+	req.set_source(NetworkService::Get()->id());
+
+	const int shard = tid % GlobalContext::Get()->num_servers();
+	UpdateRequest *upd = req.MutableExtension(UpdateRequest::name);
+	upd->set_shard(shard);
+
+	TableData *payload = upd->mutable_data();
+	TKey *k = payload->mutable_key();
+	TVal *v = payload->mutable_value();
+	k->set_id(tid);
+	k->set_version(version);
+
+	DAryProto *gradient = v->mutable_grad();
+	for (int filled = 0; filled < size; ++filled)
+		gradient->add_value(1.0f);
+
+	// TODO check the msg type
+	NetworkService::Get()->Send(shard, MTYPE_REQUEST, req);
+}
+
+/**
+ * Log the key id and all data values of one tuple at verbosity level 3.
+ *
+ * The loop is bounded by the number of values actually stored in the tuple.
+ * The previous TUPLE_SIZE bound (50,000,000) indexed past the end of the
+ * repeated field for every tuple holding fewer values than that.
+ *
+ * @param data tuple to print; must not be null.
+ */
+void print_result(TableData *data){
+	TKey *key = data->mutable_key();
+	TVal *val = data->mutable_value();
+	int k = key->id();
+	VLOG(3) << "key = " << k;
+	DAryProto *values = val->mutable_data();
+	const int count = values->value_size();
+	string s;
+	for (int i=0; i<count; i++)
+		s.append(to_string(values->value(i))).append(" ");
+	VLOG(3) << "val = " <<s;
+}
+
+/**
+ * Send a get request for tuple `tid` on table 0 without waiting for the reply.
+ *
+ * @param tid     tuple id; the request goes to shard (and destination)
+ *                tid % num_servers().
+ * @param version version recorded in the requested key.
+ */
+void AsyncGet(int tid, int version) {
+	RequestBase req;
+	req.set_table(0);
+	req.set_source(GlobalContext::Get()->rank()); //NetworkService::Get()->id());
+
+	const int shard = tid % GlobalContext::Get()->num_servers();
+	GetRequest *fetch = req.MutableExtension(GetRequest::name);
+	fetch->set_shard(shard);
+
+	TKey *wanted = fetch->mutable_key();
+	wanted->set_id(tid);
+	wanted->set_version(version);
+
+	NetworkService::Get()->Send(shard, MTYPE_REQUEST, req);
+}
+
+/**
+ * Receive and discard collect_size messages from the network service,
+ * sleeping FLAGS_sleep_time between polls that return nothing, and log the
+ * elapsed time.
+ */
+void Collect(){
+	const double start_collect = Now();
+	for (int remaining = collect_size; remaining != 0; --remaining) {
+		Message *resp = NetworkService::Get()->Receive();
+		while (!resp) {
+			Sleep(FLAGS_sleep_time);
+			resp = NetworkService::Get()->Receive();
+		}
+		delete resp;
+	}
+	const double end_collect = Now();
+	VLOG(3) << "Collected " << collect_size << " tuples in " << (end_collect-start_collect);
+}
+
+/**
+ * Workers wait for the barrier, then one of them send SHUTDOWN message
+ * to all table servers.
+ *
+ * @param id rank of the worker that sends the MTYPE_SHUTDOWN messages.
+ */
+void worker_send_shutdown(int id){
+	auto gc = lapis::GlobalContext::Get();
+	NetworkService *service = NetworkService::Get().get();
+	MPI_Barrier(gc->workergroup_comm());
+
+	// Every other worker is done once it has passed the barrier.
+	if (gc->rank() != id)
+		return;
+
+	for (int proc = 0; proc < gc->num_procs(); ++proc) {
+		if (!gc->IsTableServer(proc))
+			continue;
+		EmptyMessage msg;
+		service->Send(proc, MTYPE_SHUTDOWN, msg);
+	}
+}
+
+/**
+ * One worker with the specific ID puts, others wait.
+ *
+ * Every worker first splits the entries of tuple_sizes into chunks of at most
+ * THRESHOLD values (appended to valsizes) and counts how many of the resulting
+ * tuples map to it (collect_size). Only the worker whose rank equals `id`
+ * sends the Put requests; all workers then meet at the worker-group barrier.
+ *
+ * @param id rank of the worker that loads the data.
+ */
+void worker_load_data(int id){
+	auto gc = lapis::GlobalContext::Get();
+
+	for (int t = 0; t < SIZE; ++t) {
+		const int total = tuple_sizes[t];
+		if (total < THRESHOLD) {
+			valsizes.push_back(total);
+			continue;
+		}
+		// full THRESHOLD-sized chunks first, then whatever is left over
+		int left = total;
+		for (; left >= THRESHOLD; left -= THRESHOLD)
+			valsizes.push_back(THRESHOLD);
+		if (left)
+			valsizes.push_back(left);
+	}
+
+	num_tuples = (int)valsizes.size();
+	collect_size = 0;
+	for (int t = 0; t < num_tuples; ++t)
+		collect_size += (t%gc->group_size()==gc->worker_id()) ? 1 : 0;
+
+	const bool is_loader = (gc->rank()==id);
+	if (is_loader) {
+		size_t t = 0;
+		while (t < valsizes.size()) {
+			Put(t,valsizes[t],0);
+			++t;
+		}
+		VLOG(3) << "Done loading data, num_keys = "<<valsizes.size() << " process " << id;
+	}
+	VLOG(3) << "Collect size = " << collect_size;
+	MPI_Barrier(gc->workergroup_comm());
+}
+
+/**
+ * Send an update (version 0) for every tuple whose index maps to this worker,
+ * i.e. index % group_size() == worker_id().
+ */
+void worker_update_data() {
+	auto gc = lapis::GlobalContext::Get();
+	for (int t = 0; t < num_tuples; ++t) {
+		if (t%gc->group_size()!=gc->worker_id())
+			continue;
+		Update(t,valsizes[t],0);
+	}
+
+	VLOG(3) << "Done update ... for "<<collect_size << " tuples ";
+}
+
+/*
+ * Async get.
+ *
+ * Requests (version 0) every tuple whose index maps to this worker, i.e.
+ * index % group_size() == worker_id(), then waits in Collect().
+ */
+void worker_get_data(){
+	auto gc = lapis::GlobalContext::Get();
+	for (int t = 0; t < num_tuples; ++t) {
+		if (t%gc->group_size()!=gc->worker_id())
+			continue;
+		AsyncGet(t,0);
+	}
+	Collect();
+	VLOG(3) << "Done collect ...";
+}
+
+/**
+ * Initialise the NetworkService singleton with this process's rank, the
+ * Network singleton and a new SimpleQueue, then start the service.
+ */
+void start_network_service_for_worker() {
+	NetworkService *service = NetworkService::Get().get();
+	service->Init(GlobalContext::Get()->rank(), Network::Get().get(), new SimpleQueue());
+	service->StartNetworkService();
+}
+
+/**
+ * Entry point of the tuple put/get/update performance test.
+ *
+ * Builds a hard-coded cluster description, then runs either a TableServer or
+ * the worker workload (load once, then ten update/get rounds) depending on
+ * the role reported by the global context.
+ */
+int main(int argc, char **argv) {
+	google::InitGoogleLogging(argv[0]);
+	gflags::ParseCommandLineFlags(&argc, &argv, true);
+
+	// NOTE(review): `provided` is never compared against MPI_THREAD_MULTIPLE.
+	int provided;
+
+
+	MPI_Init_thread(&argc, &argv, MPI_THREAD_MULTIPLE, &provided);
+
+
+	FLAGS_logtostderr = 1;
+
+
+	// Init GlobalContext
+	// Cluster layout is hard-coded: server range 0..8, worker range 8..24,
+	// group size 8, and a machine-specific data folder.
+	Cluster cluster;
+	cluster.set_server_start(0);
+	cluster.set_server_end(8);
+	cluster.set_worker_start(8);
+	cluster.set_worker_end(24);
+	cluster.set_group_size(8);
+	cluster.set_data_folder("/data1/wangwei/lapis");
+
+	auto gc = lapis::GlobalContext::Get(cluster);
+
+	// worker or table server
+	if (gc->AmITableServer()) {
+		lapis::TableServer server;
+		SGDProto sgd;
+		sgd.set_learning_rate(0.01);
+		sgd.set_momentum(0.9);
+		sgd.set_weight_decay(0.1);
+		sgd.set_gamma(0.5);
+		sgd.set_learning_rate_change_steps(1);
+		// presumably blocks until a worker sends MTYPE_SHUTDOWN — confirm
+		server.Start(sgd);
+	} else {
+		start_network_service_for_worker();
+		// the first worker rank loads the data and later sends the shutdown
+		worker_load_data(cluster.worker_start());
+		for (int i=0; i<10; i++){
+			worker_update_data();
+			worker_get_data();
+		}
+		worker_send_shutdown(cluster.worker_start());
+		NetworkService::Get()->Shutdown();
+	}
+	gc->Finalize();
+	MPI_Finalize();
+	// gc is still dereferenced here, after Finalize() and MPI_Finalize()
+	VLOG(3) << "End, process "<< gc->rank();
+	return 0;
+}
+

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/b2dc51d2/src/test/model/test_blob.cc
----------------------------------------------------------------------
diff --git a/src/test/model/test_blob.cc b/src/test/model/test_blob.cc
new file mode 100644
index 0000000..75f1921
--- /dev/null
+++ b/src/test/model/test_blob.cc
@@ -0,0 +1,58 @@
+// Copyright © 2014 Wei Wang. All Rights Reserved.
+// 2014-07-18 19:44
+#include <gtest/gtest.h>
+#include "proto/model.pb.h"
+#include "model/lapis.h"
+
+namespace lapis {
+/**
+ * Fixture holding two heap-allocated Blobs (blob1, blob2) and two Blobs held
+ * by value (blob3, blob4), all default-constructed.
+ */
+class BlobTest : public ::testing::Test {
+ public:
+  ~BlobTest() {
+    delete blob1;
+    delete blob2;
+  }
+ protected:
+  // Default member initializers keep the original construction order:
+  // blob1, blob2, then blob3 and blob4.
+  Blob *blob1 = new Blob();
+  Blob *blob2 = new Blob();
+  Blob blob3, blob4;
+};
+
+// A default-constructed Blob reports zero length, width and height (checked on
+// blob1 and blob3) and has a null data pointer (checked on blob2 and blob4).
+TEST_F(BlobTest, Constructor) {
+  EXPECT_EQ(blob1->length(), 0);
+  EXPECT_EQ(blob1->width(), 0);
+  EXPECT_EQ(blob1->height(), 0);
+  EXPECT_EQ(blob3.length(), 0);
+  EXPECT_EQ(blob3.width(), 0);
+  EXPECT_EQ(blob3.height(), 0);
+  EXPECT_TRUE(blob2->dptr == nullptr);
+  EXPECT_TRUE(blob4.dptr == nullptr);
+}
+
+// Resize must allocate storage and update the shape getters. Judging from the
+// assertions below, the four Resize arguments are read back as
+// (num, <second>, height, width) and length() is the product of all four;
+// the second argument is presumably the channel count — confirm against Blob.
+TEST_F(BlobTest, TestResize) {
+  blob1->Resize(10,1,1,1);
+  EXPECT_EQ(blob1->length(), 10);
+  EXPECT_EQ(blob1->num(), 10);
+  EXPECT_EQ(blob1->height(), 1);
+  EXPECT_EQ(blob1->width(), 1);
+  EXPECT_TRUE(blob1->dptr != nullptr);
+  blob2->Resize(4,1,1,3);
+  EXPECT_EQ(blob2->length(), 12);
+  EXPECT_EQ(blob2->num(), 4);
+  EXPECT_EQ(blob2->height(), 1);
+  EXPECT_EQ(blob2->width(), 3);
+  EXPECT_TRUE(blob2->dptr != nullptr);
+  blob3.Resize(5,1,4,3);
+  EXPECT_EQ(blob3.length(), 60);
+  EXPECT_EQ(blob3.num(), 5);
+  EXPECT_EQ(blob3.height(), 4);
+  EXPECT_EQ(blob3.width(), 3);
+  EXPECT_TRUE(blob3.dptr != nullptr);
+  // 6*5*4*3 = 360: the only case where the second argument is not 1
+  blob4.Resize(6,5,4,3);
+  EXPECT_EQ(blob4.length(), 360);
+  EXPECT_EQ(blob4.num(), 6);
+  EXPECT_EQ(blob4.height(), 4);
+  EXPECT_EQ(blob4.width(), 3);
+  EXPECT_TRUE(blob4.dptr != nullptr);
+}
+
+}  // namespace lapis

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/b2dc51d2/src/test/model/test_data_layer.cc
----------------------------------------------------------------------
diff --git a/src/test/model/test_data_layer.cc b/src/test/model/test_data_layer.cc
new file mode 100644
index 0000000..49519a5
--- /dev/null
+++ b/src/test/model/test_data_layer.cc
@@ -0,0 +1,178 @@
+// Copyright © 2014 Wei Wang. All Rights Reserved.
+// 2014-08-01 16:09
+
+#include <gtest/gtest.h>
+#include <glog/logging.h>
+#include <map>
+#include <vector>
+
+#include "model/data_layer.h"
+#include "model/trainer.h"
+#include "model/sgd_trainer.h"
+#include "model/conv_edge.h"
+#include "model/relu_layer.h"
+#include "proto/model.pb.h"
+
+#include "utils/proto_helper.h"
+
+namespace lapis {
+// Base fixture: parses the text-format test model configuration into
+// model_proto so derived fixtures can pick layers, edges and trainer settings
+// out of it. The path is relative to the working directory of the test run.
+class ModelTest : public ::testing::Test {
+ public:
+  ModelTest () {
+    ReadProtoFromTextFile("src/test/data/model.conf", &model_proto);
+  }
+ protected:
+  ModelProto model_proto;
+};
+/**********************************************************************
+ * DataLayer Test
+ **********************************************************************/
+// Fixture that initialises the label and image DataLayers from layers 0 and 1
+// of the test model, creates and loads the training data sources, and sets
+// both layers up against those sources.
+class DataLayerTest : public ModelTest {
+ public:
+   DataLayerTest() {
+     label_layer.Init(model_proto.net().layer(0));
+     img_layer.Init(model_proto.net().layer(1));
+     Trainer::InitDataSource(model_proto.trainer().train_data(), &sources);
+     // NOTE(review): EXPECT_EQ does not stop the constructor, so sources[0]
+     // and sources[1] are still indexed below if fewer than two were created.
+     EXPECT_EQ(2, sources.size());
+     sources[0]->LoadData(nullptr);
+     sources[1]->LoadData(nullptr);
+     DLOG(INFO)<<"after init datasources";
+     // first argument is presumably the batch size: the tests built on this
+     // fixture expect feature blobs with num() == 2
+     label_layer.Setup(2, TrainerProto::kBackPropagation, sources);
+     DLOG(INFO)<<"after setup label layer";
+     img_layer.Setup(2, TrainerProto::kBackPropagation, sources);
+     DLOG(INFO)<<"after setup img layer";
+   }
+   // the fixture deletes the DataSource objects that InitDataSource stored
+   ~DataLayerTest() {
+     for(auto& source: sources)
+       delete source;
+   }
+ protected:
+  DataLayer img_layer, label_layer;
+  std::vector<DataSource*> sources;
+};
+
+// After the fixture's Init/Setup both data layers report that they have input,
+// the label blob is 2x1x1x1, the image blob is 2x3x227x227, and a forward pass
+// on the image layer can be invoked.
+TEST_F(DataLayerTest, InitSetupForward) {
+  EXPECT_TRUE(label_layer.HasInput());
+  EXPECT_TRUE(img_layer.HasInput());
+  EXPECT_STREQ("DataLayer", DataLayer::kType.c_str());
+
+  EXPECT_EQ(2, label_layer.feature(nullptr).num());
+  EXPECT_EQ(1, label_layer.feature(nullptr).channels());
+  EXPECT_EQ(1, label_layer.feature(nullptr).height());
+  EXPECT_EQ(1, label_layer.feature(nullptr).width());
+
+  EXPECT_EQ(2, img_layer.feature(nullptr).num());
+  EXPECT_EQ(3, img_layer.feature(nullptr).channels());
+  EXPECT_EQ(227, img_layer.feature(nullptr).height());
+  EXPECT_EQ(227, img_layer.feature(nullptr).width());
+
+  // only checks that Forward() runs; its output is not inspected
+  img_layer.Forward();
+}
+// TODO(wangwei) test this after outgoing edges are tested
+
+/**********************************************************************
+ * ConvEdge Test
+ **********************************************************************/
+// Fixture that wires a ConvEdge (edge 0 of the test model) between the image
+// DataLayer of the base fixture and a ReLULayer initialised from layer 2.
+class ConvEdgeTest : public DataLayerTest {
+ public:
+  ConvEdgeTest() {
+    relu.Init(model_proto.net().layer(2));
+    DLOG(INFO)<<"init both layers";
+    // the edge looks its endpoint layers up by name in this map
+    layer_map["input_img"]=&img_layer;
+    layer_map["hidden1_relu"]=&relu;
+
+    edge_proto=model_proto.net().edge(0);
+    convedge.Init(edge_proto, layer_map);
+    convedge.Setup(true);
+  }
+ protected:
+  std::map<std::string, Layer*> layer_map;
+  ConvEdge convedge;
+  EdgeProto edge_proto;
+  ReLULayer relu;
+};
+
+// Checks the shape ConvEdge::SetupTopBlob gives the destination blob, then
+// runs one forward pass from the image layer into it.
+TEST_F(ConvEdgeTest, InitSetupForward) {
+  Layer* dest=layer_map.at("hidden1_relu");
+  Blob &b=dest->feature(&convedge);
+  // the destination blob is still empty before the edge sets it up
+  EXPECT_EQ(0,b.num());
+  convedge.SetupTopBlob(&b);
+  // convolution output size for the 227-pixel-high (and wide) input image
+  int conv_height = (227 + 2 * edge_proto.pad() - edge_proto.kernel_size())
+    / edge_proto.stride() + 1;
+  int conv_width=conv_height;
+  // gtest assertions rather than glog CHECK_EQ: a mismatch now fails only this
+  // test (and skips the forward pass) instead of aborting the whole binary.
+  ASSERT_EQ(2, b.num());
+  ASSERT_EQ(edge_proto.num_output(), b.channels());
+  ASSERT_EQ(conv_height, b.height());
+  ASSERT_EQ(conv_width, b.width());
+  DLOG(INFO)<<"after shape check";
+
+  // .at() so that a missing key throws instead of inserting a null Layer*
+  Layer* src=layer_map.at("input_img");
+  convedge.Forward(src->feature(&convedge), &b, true);
+}
+
+/**********************************************************************
+ * ReLULayer Test
+ **********************************************************************/
+// Fixture that additionally sets up the ReLU layer of ConvEdgeTest.
+class ReLULayerTest : public ConvEdgeTest {
+ public:
+  ReLULayerTest() {
+    relu.Setup(2, TrainerProto::kBackPropagation, sources);
+    // NOTE(review): relu was initialised from layer(2) in ConvEdgeTest, while
+    // relu_proto is taken from layer(3) — confirm which index is intended.
+    relu_proto=model_proto.net().layer(3);
+  }
+ protected:
+  LayerProto relu_proto;
+};
+
+// After Setup the feature and gradient blobs on the conv edge both have
+// num() == 2; Forward() is only checked to run, its output is not inspected.
+TEST_F(ReLULayerTest, ForwardWithoutDropout) {
+  EXPECT_EQ(2, relu.feature(&convedge).num());
+  EXPECT_EQ(2, relu.gradient(&convedge).num());
+
+  relu.Forward();
+}
+/**********************************************************************
+ * PoolingEdge Test
+class PoolingEdgeTest : public ReLULayerTest {
+ public:
+  PoolingEdgeTest() {
+    linearlayer.Init(model.net().layer(3));
+    pooledge.Init(model.net().edge(1));
+  }
+
+ protected:
+  PoolingEdge pooledge;
+  LinearLayer linearlayer;
+}
+ **********************************************************************/
+/**********************************************************************
+ * LinearLayer Test
+ **********************************************************************/
+
+/**********************************************************************
+ * LRNEdge Test
+ **********************************************************************/
+
+/**********************************************************************
+ * InnerProductEdge Test
+ **********************************************************************/
+
+/**********************************************************************
+ * SoftmaxLayerLossEdge Test
+ **********************************************************************/
+
+
+
+
+/**********************************************************************
+ * SGDTrainer Test
+ **********************************************************************/
+// Fixture providing a default-constructed SGDTrainer next to the parsed
+// model_proto of ModelTest.
+class SGDTrainerTest : public ModelTest {
+ protected:
+  SGDTrainer sgd;
+};
+
+// Initialising the trainer from the model's trainer section must leave the
+// static Trainer::phase at Phase::kInit.
+TEST_F(SGDTrainerTest, Init) {
+  sgd.Init(model_proto.trainer());
+  EXPECT_TRUE(Trainer::phase==Phase::kInit);
+}
+
+}  // namespace lapis

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/b2dc51d2/src/test/model/test_label_source.cc
----------------------------------------------------------------------
diff --git a/src/test/model/test_label_source.cc b/src/test/model/test_label_source.cc
new file mode 100644
index 0000000..9b25c2a
--- /dev/null
+++ b/src/test/model/test_label_source.cc
@@ -0,0 +1,59 @@
+// Copyright © 2014 Wei Wang. All Rights Reserved.
+// 2014-07-21 19:40
+
+#include <gtest/gtest.h>
+#include <glog/logging.h>
+#include "proto/model.pb.h"
+#include "disk/label_source.h"
+
+namespace lapis {
+/**
+ * Fixture with a LabelSource initialised for the 12-record test label file.
+ */
+class LabelSourceTest : public ::testing::Test {
+ public:
+  LabelSourceTest() {
+    DataSourceProto config;
+    config.set_name("label source");
+    config.set_size(12);
+    config.set_path("src/test/data/label_source.dat");
+    ls.Init(config);
+  }
+
+ protected:
+  LabelSource ls;
+};
+
+// LoadData(nullptr) must return the 12 record names of the label file in
+// order; a sample of positions (0, 1, 5, 10, 11) is spot-checked.
+TEST_F(LabelSourceTest, LoadData) {
+  auto ptr2names = ls.LoadData(nullptr);
+  EXPECT_EQ(12, ptr2names->size());
+  EXPECT_STREQ("img0.JPEG", ptr2names->at(0).c_str());
+  EXPECT_STREQ("img1.JPEG", ptr2names->at(1).c_str());
+  EXPECT_STREQ("img5.JPEG", ptr2names->at(5).c_str());
+  EXPECT_STREQ("img10.JPEG", ptr2names->at(10).c_str());
+  EXPECT_STREQ("img11.JPEG", ptr2names->at(11).c_str());
+}
+
+// Three consecutive GetData calls into a 5-value blob. The expected values are
+// consistent with the source handing out its 12 labels five at a time and
+// wrapping around to the first label during the third call; the labels
+// themselves come from label_source.dat, which is not shown here.
+TEST_F(LabelSourceTest, GetData) {
+  ls.LoadData(nullptr);
+  Blob b;
+  b.Resize(1, 1, 1, 5);
+  ls.GetData(&b);
+  // val aliases the blob's buffer, so it reflects every later GetData call
+  const float *val = b.dptr;
+  EXPECT_EQ(0.0f, val[0]);
+  EXPECT_EQ(1.0f, val[1]);
+  EXPECT_EQ(4.0f, val[2]);
+  EXPECT_EQ(9.0f, val[3]);
+  EXPECT_EQ(16.0f, val[4]);
+  ls.GetData(&b);
+  EXPECT_EQ(4.0f, val[0]);
+  EXPECT_EQ(5.0f, val[1]);
+  EXPECT_EQ(6.0f, val[2]);
+  EXPECT_EQ(7.0f, val[3]);
+  EXPECT_EQ(8.0f, val[4]);
+  ls.GetData(&b);
+  // last two labels, then the first three again
+  EXPECT_EQ(1.0f, val[0]);
+  EXPECT_EQ(2.0f, val[1]);
+  EXPECT_EQ(0.0f, val[2]);
+  EXPECT_EQ(1.0f, val[3]);
+  EXPECT_EQ(4.0f, val[4]);
+}
+
+}  // namespace lapis

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/b2dc51d2/src/test/model/test_param.cc
----------------------------------------------------------------------
diff --git a/src/test/model/test_param.cc b/src/test/model/test_param.cc
new file mode 100644
index 0000000..520fbe2
--- /dev/null
+++ b/src/test/model/test_param.cc
@@ -0,0 +1,138 @@
+#include <gtest/gtest.h>
+#include <glog/logging.h>
+#include "proto/model.pb.h"
+
+#include "utils/param.h"
+
+using namespace singa;
+
+/**
+ * Fixture with two Params and their configurations: a 3x4 "weight" (wp)
+ * and a 4-element "bias" (bp). Tests choose the init method before calling
+ * Init.
+ */
+class ParamTest : public ::testing::Test {
+ public:
+  ParamTest() {
+    // bias: one dimension of 4
+    bp.add_shape(4);
+    bp.set_name("bias");
+    // weight: 3 x 4
+    wp.add_shape(3);
+    wp.add_shape(4);
+    wp.set_name("weight");
+  }
+ protected:
+  Param w, b;
+  ParamProto wp, bp;
+};
+
+// kConstant initialisation must fill every element with the configured value:
+// all four bias elements with 0.5, and a sample of the twelve weight elements
+// (first, last and the row boundary 3/4) with 1.5.
+TEST_F(ParamTest, ConstantInit) {
+  bp.set_init_method(ParamProto::kConstant);
+  bp.set_value(0.5);
+  b.Init(bp);
+  const float *val = b.content().dptr;
+  EXPECT_EQ(0.5f, val[0]);
+  EXPECT_EQ(0.5f, val[1]);
+  EXPECT_EQ(0.5f, val[2]);
+  EXPECT_EQ(0.5f, val[3]);
+  wp.set_init_method(ParamProto::kConstant);
+  wp.set_value(1.5);
+  w.Init(wp);
+  val = w.content().dptr;
+  EXPECT_EQ(1.5f, val[0]);
+  EXPECT_EQ(1.5f, val[3]);
+  EXPECT_EQ(1.5f, val[4]);
+  EXPECT_EQ(1.5f, val[11]);
+}
+
+// kUniform initialisation with value 1.0 must keep every sampled element
+// inside [-1, 1]: all four bias elements and a sample of the twelve weight
+// elements (first, last and the row boundary 3/4).
+TEST_F(ParamTest, UniformInit) {
+  bp.set_init_method(ParamProto::kUniform);
+  bp.set_value(1.0f);
+  b.Init(bp);
+  const float *val = b.content().dptr;
+  EXPECT_TRUE(val[0] >= -1 && val[0] <= 1);
+  // was `val[1] >= -1 && val[2] <= 1`: the upper bound of val[1] went unchecked
+  EXPECT_TRUE(val[1] >= -1 && val[1] <= 1);
+  EXPECT_TRUE(val[2] >= -1 && val[2] <= 1);
+  EXPECT_TRUE(val[3] >= -1 && val[3] <= 1);
+  wp.set_init_method(ParamProto::kUniform);
+  wp.set_value(1.0f);
+  w.Init(wp);
+  val = w.content().dptr;
+  EXPECT_TRUE(val[0] >= -1 && val[0] <= 1);
+  EXPECT_TRUE(val[3] >= -1 && val[3] <= 1);
+  EXPECT_TRUE(val[4] >= -1 && val[4] <= 1);
+  EXPECT_TRUE(val[11] >= -1 && val[11] <= 1);
+}
+
+// kUniformSqrtFanIn initialisation with value 2.0: only the loose bound
+// [-2, 2] is checked on a sample of the weight elements; any fan-in scaling
+// applied by Param is not verified here.
+TEST_F(ParamTest, UniformSqrtFanInInit) {
+  wp.set_init_method(ParamProto::kUniformSqrtFanIn);
+  wp.set_value(2.0f);
+  w.Init(wp);
+  const float *val = w.content().dptr;
+  EXPECT_TRUE(val[0] >= -2 && val[0] <= 2);
+  EXPECT_TRUE(val[3] >= -2 && val[3] <= 2);
+  EXPECT_TRUE(val[4] >= -2 && val[4] <= 2);
+  EXPECT_TRUE(val[11] >= -2 && val[11] <= 2);
+}
+
+
+// kUniformSqrtFanInOut initialisation: samples are expected inside
+// [low, high] scaled by value / sqrt(shape(0) + shape(1)).
+TEST_F(ParamTest, UniformSqrtFanInOutInit) {
+  wp.set_init_method(ParamProto::kUniformSqrtFanInOut);
+  wp.set_value(1.0f);
+  float low=1.0f, high=5.0f;
+  wp.set_low(low);
+  wp.set_high(high);
+  w.Init(wp);
+  const float *val = w.content().dptr;
+  /*
+  LOG(INFO) << val[0] << " " << val[1] << " " << val[2] << " " << val[3];
+  LOG(INFO) << val[4] << " " << val[5] << " " << val[6] << " " << val[7];
+  LOG(INFO) << val[8] << " " << val[9] << " " << val[10] << " " << val[11];
+  */
+  // scale the configured bounds the same way the initialiser is expected to
+  float factor = wp.value() / sqrt(wp.shape(0) + wp.shape(1));
+  low=low*factor;
+  high=high*factor;
+  LOG(INFO)<<low<<" "<<high;
+  EXPECT_TRUE(val[0] >= low && val[0] <= high);
+  EXPECT_TRUE(val[3] >= low && val[3] <= high);
+  EXPECT_TRUE(val[4] >= low && val[4] <= high);
+  EXPECT_TRUE(val[11] >= low && val[11] <= high);
+}
+
+// kGaussain initialisation of a 1 x 5000 parameter: the sample mean and the
+// sample variance must each lie within 0.1 of the configured distribution.
+TEST_F(ParamTest, GaussianInit) {
+  // mean/stddev were declared `int` and initialised from float literals; they
+  // are floats now, and the local no longer reuses the name `std`.
+  const int len=5000;
+  const float mean=0.0f, stddev=1.0f;
+  ParamProto p;
+  p.set_name("bias");
+  p.add_shape(1);
+  p.add_shape(len);
+  p.set_init_method(ParamProto::kGaussain);
+  p.set_value(1.0f);
+  p.set_mean(mean);
+  p.set_std(stddev);
+  w.Init(p);
+
+  const float *val = w.content().dptr;
+  float dmean=0.0f;
+  for(int i=0;i<len;i++)
+    dmean+=val[i];
+  dmean/=len;
+  // sample variance (mean squared deviation), compared against stddev^2
+  float dvar=0.0f;
+  for(int i=0;i<len;i++)
+    dvar+=(dmean-val[i])*(dmean-val[i]);
+  dvar/=len;
+  EXPECT_TRUE(std::abs(mean-dmean)<0.1);
+  EXPECT_TRUE(std::abs(stddev*stddev-dvar)<0.1);
+  /*
+  LOG(INFO) << val[0] << " " << val[1] << " " << val[2] << " " << val[3];
+  LOG(INFO) << val[4] << " " << val[5] << " " << val[6] << " " << val[7];
+  LOG(INFO) << val[8] << " " << val[9] << " " << val[10] << " " << val[11];
+  */
+}
+
+// Smoke test only: kGaussainSqrtFanIn initialisation of the 3x4 weight must
+// run; the generated values are not asserted (the checks are commented out).
+TEST_F(ParamTest, GaussianSqrtFanInInit) {
+  wp.set_init_method(ParamProto::kGaussainSqrtFanIn);
+  wp.set_value(1.0f);
+  wp.set_mean(0);
+  wp.set_std(1.0f);
+  w.Init(wp);
+  //const float *val = w.content().dptr;
+  /*
+  LOG(INFO) << val[0] << " " << val[1] << " " << val[2] << " " << val[3];
+  LOG(INFO) << val[4] << " " << val[5] << " " << val[6] << " " << val[7];
+  LOG(INFO) << val[8] << " " << val[9] << " " << val[10] << " " << val[11];
+  */
+}

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/b2dc51d2/src/test/model/test_proto.cc
----------------------------------------------------------------------
diff --git a/src/test/model/test_proto.cc b/src/test/model/test_proto.cc
new file mode 100644
index 0000000..f6d81fd
--- /dev/null
+++ b/src/test/model/test_proto.cc
@@ -0,0 +1,67 @@
+// Copyright © 2014 Wei Wang. All Rights Reserved.
+// 2014-07-15 21:54
+#include <glog/logging.h>
+#include <gtest/gtest.h>
+#include "proto/model.pb.h"
+#include "utils/proto_helper.h"
+namespace lapis {
+
+// use const Message& m=..., otherwise may lead to segment fault
+// Parses the text-format test model and spot-checks values from each section:
+// model name, layer/edge counts, one layer, one edge with its two params, the
+// SGD trainer settings and the first training data source. The expected values
+// mirror src/test/data/model.conf, which is not shown here.
+TEST(ProtoTest, ReadFromFile) {
+  ModelProto model;
+  LOG(INFO)<<"start....";
+  lapis::ReadProtoFromTextFile("src/test/data/model.conf", &model);
+  LOG(INFO)<<"after reading file...";
+  EXPECT_STREQ("caffe_config", model.name().c_str());
+
+  // layer and edge size
+  const NetProto& net = model.net();
+  EXPECT_EQ(15, net.layer().size());
+  EXPECT_EQ(14, net.edge().size());
+  LOG(INFO)<<"after size check...";
+
+  // layer config
+  LayerProto layer1 = net.layer().Get(1);
+  EXPECT_STREQ("input_img", layer1.name().c_str());
+  EXPECT_STREQ("DataLayer", layer1.type().c_str());
+  LOG(INFO)<<"after datalayer check...";
+  // edge config
+  EdgeProto edge0 = net.edge().Get(0);
+  EXPECT_STREQ("input_img-hidden1_relu", edge0.name().c_str());
+  EXPECT_STREQ("ConvEdge", edge0.type().c_str());
+  EXPECT_EQ(2, edge0.param().size());
+  LOG(INFO)<<"after first edge check...";
+  // param config
+  // first param of the edge: Gaussian-initialised
+  ParamProto param1 = edge0.param().Get(0);
+  EXPECT_TRUE(ParamProto::kGaussain == param1.init_method());
+  EXPECT_EQ(0.0f, param1.mean());
+  EXPECT_EQ(0.01f, param1.std());
+  EXPECT_EQ(1.0f, param1.learning_rate_multiplier());
+  LOG(INFO)<<"after param of first edge check...";
+
+  // second param of the same edge: constant-initialised
+  ParamProto param2 = edge0.param().Get(1);
+  EXPECT_TRUE(ParamProto::kConstant == param2.init_method());
+  EXPECT_EQ(0.0f, param2.value());
+  EXPECT_EQ(0.0f, param2.weight_decay_multiplier());
+  LOG(INFO)<<"after param of second edge check...";
+
+  // trainer config
+  const TrainerProto& trainer = model.trainer();
+  const SGDProto& sgd=trainer.sgd();
+  EXPECT_EQ(227, sgd.train_batchsize());
+  EXPECT_EQ(0.01f, sgd.base_learning_rate());
+  EXPECT_TRUE(SGDProto::kStep== sgd.learning_rate_change());
+  LOG(INFO)<<"after sgd check...";
+
+  // data source config
+  EXPECT_EQ(2,trainer.train_data().size());
+  LOG(INFO)<<"after size check...";
+  const DataSourceProto& data=trainer.train_data(0);
+  LOG(INFO)<<"after get data...";
+  EXPECT_STREQ("RGBDirSource", data.type().c_str());
+  LOG(INFO)<<"after type check...";
+  EXPECT_EQ(50000, data.size());
+  EXPECT_EQ(3, data.channels());
+  LOG(INFO)<<"after data source check...";
+}
+} // namespace lapis

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/b2dc51d2/src/test/model/test_rgb_dir_source.cc
----------------------------------------------------------------------
diff --git a/src/test/model/test_rgb_dir_source.cc b/src/test/model/test_rgb_dir_source.cc
new file mode 100644
index 0000000..36ac21a
--- /dev/null
+++ b/src/test/model/test_rgb_dir_source.cc
@@ -0,0 +1,63 @@
+// Copyright © 2014 Wei Wang. All Rights Reserved.
+// 2014-07-21 21:52
+
+#include <gtest/gtest.h>
+#include <glog/logging.h>
+#include <algorithm>
+
+#include "proto/model.pb.h"
+#include "disk/rgb_dir_source.h"
+#include "disk/label_source.h"
+
+namespace lapis {
+/**
+ * Fixture with an RGBDirSource initialised for the three-image test
+ * directory: 256x256 images, offset 2, and the ImageNet mean file.
+ */
+class RGBDirSourceTest : public ::testing::Test {
+ public:
+  RGBDirSourceTest() {
+    DataSourceProto config;
+    config.set_name("rgb dir source");
+    config.set_path("src/test/data/rgb_dir");
+    config.set_mean_file("src/test/data/imagenet_mean.binaryproto");
+    config.set_size(3);
+    config.set_offset(2);
+    config.set_width(256);
+    config.set_height(256);
+    rgbs.Init(config);
+  }
+
+ protected:
+  RGBDirSource rgbs;
+};
+
+// Without input keys the source must list the three image files itself. The
+// names are sorted before comparison, so the test does not depend on the
+// order in which LoadData returns them.
+TEST_F(RGBDirSourceTest, LoadDataNoInputKeys) {
+  auto &ptr2names = rgbs.LoadData(nullptr);
+  EXPECT_EQ(3, ptr2names->size());
+  sort(ptr2names->begin(), ptr2names->end());
+  EXPECT_STREQ("img0.JPEG", ptr2names->at(0).c_str());
+  EXPECT_STREQ("img1.JPEG", ptr2names->at(1).c_str());
+  EXPECT_STREQ("img2.JPEG", ptr2names->at(2).c_str());
+}
+
+// When given the record names produced by a LabelSource, the RGB source must
+// return the same three names in the same order.
+TEST_F(RGBDirSourceTest, LoadDataWithInputKeys) {
+  LabelSource ls;
+  DataSourceProto ds;
+  ds.set_path("src/test/data/label_source.dat");
+  ds.set_name("label source");
+  ds.set_size(3);
+  ls.Init(ds);
+  // names from the label source drive the load of the image source
+  auto ptr2names1 = ls.LoadData(nullptr);
+  auto ptr2names2 = rgbs.LoadData(ptr2names1);
+  EXPECT_EQ(3, ptr2names2->size());
+  for (int i = 0; i < 3; i++)
+    EXPECT_STREQ(ptr2names1->at(i).c_str(), ptr2names2->at(i).c_str());
+}
+
+// Smoke test: three consecutive GetData calls after loading must run; the
+// blob contents are not inspected.
+// NOTE(review): the Resize arguments here read like (height, width, channels,
+// num), whereas the Blob tests read them back as (num, ..., height, width) —
+// confirm which order Blob::Resize expects.
+TEST_F(RGBDirSourceTest, GetData) {
+  Blob b;
+  b.Resize(256,256,3,2);
+  rgbs.LoadData(nullptr);
+  rgbs.GetData(&b);
+  rgbs.GetData(&b);
+  rgbs.GetData(&b);
+}
+}  // namespace lapis
+

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/b2dc51d2/src/test/test_cluster.cc
----------------------------------------------------------------------
diff --git a/src/test/test_cluster.cc b/src/test/test_cluster.cc
new file mode 100644
index 0000000..d86463a
--- /dev/null
+++ b/src/test/test_cluster.cc
@@ -0,0 +1,95 @@
+#include <fstream>
+#include "gtest/gtest.h"
+#include "proto/cluster.pb.h"
+#include "utils/cluster.h"
+
+using namespace singa;
+
+string folder="src/test/data/";
+/*
+ClusterProto GenClusterProto(){
+  ClusterProto proto;
+  int nworker=6, nserver=4;
+  proto.set_nworkers(nworker);
+  proto.set_nservers(nserver);
+  proto.set_nworkers_per_group(3);
+  proto.set_nservers_per_group(2);
+  proto.set_nthreads_per_worker(1);
+  proto.set_nthreads_per_server(2);
+
+  proto.set_hostfile(folder+"/hostfile");
+
+  std::ofstream fout(folder+"/hostfile", std::ofstream::out);
+  for(int i=0;i<nworker+nserver;i++){
+    char tmp[20];
+    sprintf(tmp, "awan-0-%02d-0", i);
+    fout<<tmp<<std::endl;
+  }
+  fout.flush();
+  fout.close();
+  return proto;
+}
+
+TEST(ClusterTest, NoServer){
+  ClusterProto proto=GenClusterProto();
+  proto.set_nservers(0);
+  auto cluster=Cluster::Get(proto, 0);
+  ASSERT_EQ(proto.nworkers(),cluster->nworkers());
+  ASSERT_EQ(0, cluster->nservers());
+  ASSERT_EQ(proto.nworkers_per_group(),cluster->nworkers_per_group());
+  ASSERT_EQ(proto.nservers_per_group(),cluster->nservers_per_group());
+  ASSERT_FALSE(cluster->AmIServer());
+  ASSERT_TRUE(cluster->AmIWorker());
+  ASSERT_EQ(0,cluster->group_procs_id());
+  ASSERT_EQ(0,cluster->group_id());
+  ASSERT_EQ(2, cluster->nworker_groups());
+  ASSERT_EQ(0, cluster->nserver_groups());
+  ASSERT_STREQ("awan-0-00-0", cluster->host_addr().c_str());
+
+  cluster=Cluster::Get(proto, 5);
+  ASSERT_EQ(2,cluster->group_procs_id());
+  ASSERT_EQ(1,cluster->group_id());
+  ASSERT_EQ(2, cluster->nworker_groups());
+  ASSERT_EQ(0, cluster->nserver_groups());
+  ASSERT_STREQ("awan-0-05-0", cluster->host_addr().c_str());
+}
+
+TEST(ClusterTest, SingleServerGroup){
+  ClusterProto proto=GenClusterProto();
+  proto.set_nservers(2);
+  auto cluster=Cluster::Get(proto, 3);
+  ASSERT_FALSE(cluster->AmIServer());
+  ASSERT_TRUE(cluster->AmIWorker());
+  ASSERT_EQ(0,cluster->group_procs_id());
+  ASSERT_EQ(1,cluster->group_id());
+  ASSERT_EQ(2, cluster->nworker_groups());
+  ASSERT_EQ(1, cluster->nserver_groups());
+  ASSERT_STREQ("awan-0-03-0", cluster->host_addr().c_str());
+
+  cluster=Cluster::Get(proto, 7);
+  ASSERT_EQ(1,cluster->group_procs_id());
+  ASSERT_EQ(0,cluster->group_id());
+  ASSERT_EQ(2, cluster->nworker_groups());
+  ASSERT_EQ(1, cluster->nserver_groups());
+  ASSERT_STREQ("awan-0-07-0", cluster->host_addr().c_str());
+}
+
+TEST(ClusterTest, MultiServerGroups){
+  ClusterProto proto=GenClusterProto();
+  auto cluster=Cluster::Get(proto, 7);
+  ASSERT_EQ(1,cluster->group_procs_id());
+  ASSERT_EQ(0,cluster->group_id());
+  ASSERT_EQ(2, cluster->nworker_groups());
+  ASSERT_EQ(2, cluster->nserver_groups());
+  ASSERT_STREQ("awan-0-07-0", cluster->host_addr().c_str());
+
+  cluster=Cluster::Get(proto, 8);
+  ASSERT_TRUE(cluster->AmIServer());
+  ASSERT_FALSE(cluster->AmIWorker());
+  ASSERT_EQ(0,cluster->group_procs_id());
+  ASSERT_EQ(1,cluster->group_id());
+  ASSERT_EQ(2, cluster->nworker_groups());
+  ASSERT_EQ(2, cluster->nserver_groups());
+  ASSERT_STREQ("awan-0-08-0", cluster->host_addr().c_str());
+}
+*/

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/b2dc51d2/src/test/test_communication.cc
----------------------------------------------------------------------
diff --git a/src/test/test_communication.cc b/src/test/test_communication.cc
new file mode 100644
index 0000000..c9c035f
--- /dev/null
+++ b/src/test/test_communication.cc
@@ -0,0 +1,158 @@
+#include <thread>
+#include <vector>
+#include "gtest/gtest.h"
+#include "communication/msg.h"
+#include "communication/socket.h"
+using std::vector;
+using namespace singa;
+
+const char* ping="PING",*pong="PONG";
+/**
+ * Register a dealer with the stub router.
+ *
+ * Connects the dealer to the in-process router endpoint and sends a single
+ * PING message (type 0) whose source is (gid, id, flag) and whose
+ * destination is (0, 0, 2).
+ */
+void Connect(Dealer* dealer, int gid, int id, int flag){
+  dealer->Connect("inproc://router");
+  Msg hello;
+  hello.set_src(gid, id, flag);
+  hello.set_dst(0,0,2);
+  hello.set_type(0);
+  hello.add_frame(ping, 4);
+  dealer->Send(&hello);
+}
+
+/**
+ * Body of a dealer thread: announce itself to the stub router as
+ * (group 0, id, flag 0), wait for one reply and check that the reply is the
+ * PONG sent from flag 2 and addressed back to this dealer.
+ */
+void DealerPingPong(int id){
+  Dealer* dealer=new Dealer();
+  Connect(dealer, 0, id, 0);
+  Msg* reply=dealer->Receive();
+  ASSERT_EQ(2, reply->src_flag());
+  ASSERT_EQ(0, reply->dst_group_id());
+  ASSERT_EQ(id, reply->dst_id());
+  // NOTE(review): STREQ needs a NUL-terminated payload, but the frames are
+  // added with an explicit length of 4 -- confirm frame_data() terminates it.
+  ASSERT_STREQ(pong, (char*)reply->frame_data());
+  delete reply;
+  delete dealer;
+}
+
+/**
+ * Body of a worker thread.
+ *
+ * Registers with the router as (group 0, sid, flag 0) and then performs two
+ * request/response rounds with the server `did`: each request has type 3 and
+ * its target set to the round index; each response must originate from
+ * (group 0, did, flag 1).
+ */
+void WorkerDealer(int sid, int did){
+  Dealer* dealer=new Dealer();
+  Connect(dealer, 0, sid, 0);
+  for(int round=0;round<2;round++){
+    // request; the message object is scoped so it is destroyed before the
+    // response is awaited
+    {
+      Msg request;
+      request.set_src(0, sid, 0);
+      request.set_dst(0, did, 1);
+      request.set_type(3);
+      request.set_target(round);
+      dealer->Send(&request);
+    }
+    // response
+    Msg *response=dealer->Receive();
+    ASSERT_EQ(0, response->src_group_id());
+    ASSERT_EQ(did, response->src_id());
+    ASSERT_EQ(1, response->src_flag());
+    delete response;
+  }
+  delete dealer;
+}
+
+/**
+ * Body of a server thread.
+ *
+ * Registers with the router as (group 0, id, flag 1), then answers exactly
+ * `n` incoming messages; every answer is addressed to the sender of the
+ * message it replies to.
+ */
+void ServerDealer(int id, int n){
+  Dealer* dealer=new Dealer();
+  Connect(dealer, 0, id, 1);
+  int handled=0;
+  while(handled<n){
+    Msg *request=dealer->Receive();
+    Msg answer;
+    answer.set_dst(request->src_group_id(), request->src_id(),
+        request->src_flag());
+    answer.set_src(0, id, 1);
+    dealer->Send(&answer);
+    delete request;
+    handled++;
+  }
+  delete dealer;
+}
+
+// Two dealer threads each send a PING to the router; the router answers every
+// PING with a PONG addressed to its sender.
+TEST(CommunicationTest, DealerRouterPingPong){
+  const int ndealers=2;
+  vector<std::thread> threads;
+  for(int i=0;i<ndealers;i++)
+    threads.emplace_back(DealerPingPong, i);
+  Router* router=new Router();
+  router->Bind("");
+  for(int served=0;served<ndealers;served++){
+    Msg* request=router->Receive();
+    ASSERT_EQ(0, request->src_group_id());
+    ASSERT_EQ(2, request->dst_flag());
+    ASSERT_STREQ(ping, (char*)request->frame_data());
+
+    Msg answer;
+    answer.set_src(0,0,2);
+    answer.set_dst(request->src_group_id(), request->src_id(),
+        request->src_flag());
+    answer.add_frame(pong, 4);
+    router->Send(&answer);
+    delete request;
+  }
+
+  delete router;
+  for(auto& t:threads)
+    t.join();
+}
+
+// Several workers talk to one server through the router.
+// The server thread is started only after all workers have sent their PING.
+// The router then forwards messages until 4 per worker have been routed
+// (each worker sends 2 requests and receives 2 responses).
+TEST(CommunicationTest, nWorkers1Server){
+  const int nworker=2;
+  vector<std::thread> threads;
+  for(int w=0;w<nworker;w++)
+    threads.emplace_back(WorkerDealer, w, 0);
+  Router* router=new Router();
+  router->Bind("");
+  int pings=0;
+  for(int remaining=4*nworker;remaining>0;){
+    Msg* msg=router->Receive();
+    if(2!=msg->dst_flag()){
+      // worker<->server traffic: forward it
+      remaining--;
+      router->Send(msg);
+    }else{
+      // registration PING addressed to the stub
+      ASSERT_STREQ(ping, (char*)msg->frame_data());
+      pings++;
+      if(pings==nworker)
+        threads.emplace_back(ServerDealer, 0, 2*nworker);
+    }
+    delete msg;
+  }
+  delete router;
+  for(auto& t:threads)
+    t.join();
+}
+
+// Two worker/server pairs (worker 0 <-> server 0, worker 1 <-> server 1)
+// exchange messages through the router. PINGs addressed to the stub are only
+// checked; all other messages are forwarded until 8 have been routed.
+TEST(CommunicationTest, 2Workers2Server){
+  vector<std::thread> threads;
+  threads.emplace_back(WorkerDealer, 0, 0);
+  threads.emplace_back(WorkerDealer, 1, 1);
+  threads.emplace_back(ServerDealer, 0, 2);
+  threads.emplace_back(ServerDealer, 1, 2);
+  Router* router=new Router();
+  router->Bind("");
+  for(int remaining=8;remaining>0;){
+    Msg* msg=router->Receive();
+    if(2!=msg->dst_flag()){
+      remaining--;
+      router->Send(msg);
+    }else{
+      ASSERT_STREQ(ping, (char*)msg->frame_data());
+    }
+    delete msg;
+  }
+  delete router;
+  for(auto& t:threads)
+    t.join();
+}

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/b2dc51d2/src/test/test_shard.cc
----------------------------------------------------------------------
diff --git a/src/test/test_shard.cc b/src/test/test_shard.cc
new file mode 100644
index 0000000..c96d876
--- /dev/null
+++ b/src/test/test_shard.cc
@@ -0,0 +1,56 @@
+#include <gtest/gtest.h>
+#include <sys/stat.h>
+
+#include "utils/data_shard.h"
+
+std::string key[]={"firstkey","secondkey","3key", "key4", "key5"};
+std::string tuple[]={"firsttuple","2th-tuple","thridtuple", "tuple4", "tuple5"};
+
+using namespace singa;
+
+// Creates the shard folder and writes the first three (key, tuple) pairs.
+// The later DataShardTest cases read the same folder, so they rely on this
+// test having run first.
+TEST(DataShardTest, CreateDataShard){
+  const std::string path="src/test/data/shard_test";
+  mkdir(path.c_str(), 0755);
+  DataShard shard(path, DataShard::kCreate, 50);
+  for(int i=0;i<3;i++)
+    shard.Insert(key[i], tuple[i]);
+  shard.Flush();
+}
+
+// Re-opens the shard in append mode and adds the remaining two pairs.
+TEST(DataShardTest, AppendDataShard){
+  const std::string path="src/test/data/shard_test";
+  DataShard shard(path, DataShard::kAppend, 50);
+  for(int i=3;i<5;i++)
+    shard.Insert(key[i], tuple[i]);
+  shard.Flush();
+}
+// The shard must hold the 3 created plus the 2 appended records.
+TEST(DataShardTest, CountDataShard){
+  const std::string path="src/test/data/shard_test";
+  DataShard shard(path, DataShard::kRead, 50);
+  ASSERT_EQ(5, shard.Count());
+}
+
+// Reads the shard sequentially: records 0 and 1 are checked, records 2 and 3
+// are only required to exist, record 4 is checked, a sixth read must fail,
+// and after SeekToFirst() the first record is returned again.
+TEST(DataShardTest, ReadDataShard){
+  const std::string path="src/test/data/shard_test";
+  DataShard shard(path, DataShard::kRead, 50);
+  std::string gotKey, gotTuple;
+
+  // records 0 and 1
+  for(int i=0;i<2;i++){
+    ASSERT_TRUE(shard.Next(&gotKey, &gotTuple));
+    ASSERT_STREQ(key[i].c_str(), gotKey.c_str());
+    ASSERT_STREQ(tuple[i].c_str(), gotTuple.c_str());
+  }
+  // records 2 and 3 are skipped over, record 4 is the last one read
+  ASSERT_TRUE(shard.Next(&gotKey, &gotTuple));
+  ASSERT_TRUE(shard.Next(&gotKey, &gotTuple));
+  ASSERT_TRUE(shard.Next(&gotKey, &gotTuple));
+  ASSERT_STREQ(key[4].c_str(), gotKey.c_str());
+  ASSERT_STREQ(tuple[4].c_str(), gotTuple.c_str());
+
+  // end of shard, then rewind
+  ASSERT_FALSE(shard.Next(&gotKey, &gotTuple));
+  shard.SeekToFirst();
+  ASSERT_TRUE(shard.Next(&gotKey, &gotTuple));
+  ASSERT_STREQ(key[0].c_str(), gotKey.c_str());
+  ASSERT_STREQ(tuple[0].c_str(), gotTuple.c_str());
+}

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/b2dc51d2/src/trainer/pm_server.cc
----------------------------------------------------------------------
diff --git a/src/trainer/pm_server.cc b/src/trainer/pm_server.cc
new file mode 100644
index 0000000..28fa28d
--- /dev/null
+++ b/src/trainer/pm_server.cc
@@ -0,0 +1,99 @@
+#include <gflags/gflags.h>
+#include <glog/logging.h>
+#include "trainer/pm_server.h"
+#include "utils/singleton.h"
+#include "utils/factory.h"
+#include <vector>
+
+using std::vector;
+
+namespace singa{
+/**
+ * Attach this parameter manager to a server identity and a parameter shard,
+ * and create and initialise the updater.
+ *
+ * @param group_id  id of the server group
+ * @param server_id id of the server
+ * @param shard     parameter shard served by this manager
+ * @param proto     configuration passed to Updater::Init
+ */
+void PMServer::Setup(int group_id, int server_id, shared_ptr<ParamShard> shard,
+      const UpdaterProto& proto){
+  shard_=shard;
+  group_id_=group_id;
+  server_id_=server_id;
+  // the concrete updater is whatever was registered under "Updater"
+  updater_=shared_ptr<Updater>(
+      Singleton<Factory<Updater>>::Instance()->Create("Updater"));
+  updater_->Init(proto);
+}
+
+// Nothing to release explicitly.
+PMServer::~PMServer(){}
+
+// Always returns false in this implementation.
+bool PMServer::SyncNow(){ return false; }
+/**
+ * Handle a put request for the parameter identified by the message target.
+ *
+ * A new Param is created and stored in the shard when the id is unknown.
+ * A repeated put is logged as an error and applied to the existing Param.
+ *
+ * @return whatever Param::HandlePutMsg returns for the message
+ */
+Msg* PMServer::HandlePut(Msg **msg){
+  const int id=(*msg)->target();
+  shared_ptr<Param> param=nullptr;
+  if(shard_->find(id)==shard_->end()){
+    // first put: create and register the parameter
+    param=shared_ptr<Param>(
+        Singleton<Factory<Param>>::Instance()->Create("Param"));
+    param->set_id(id);
+    (*shard_)[id]=param;
+  }else{
+    LOG(ERROR)<<"Param ("<<id<<") is put more than once";
+    param=shard_->at(id);
+  }
+  return param->HandlePutMsg(msg);
+}
+
+/**
+ * Handle a get request for the parameter identified by the message target.
+ *
+ * @return the result of Param::HandleGetMsg when the parameter is in the
+ *         shard; otherwise the original message, which the caller sends off
+ *         again so that it is re-queued.
+ */
+Msg* PMServer::HandleGet(Msg **msg){
+  const int id=(*msg)->target();
+  if(shard_->find(id)==shard_->end()){
+    // parameter not available yet: hand the request back for re-queueing
+    return *msg;
+  }
+  shared_ptr<Param> param=shard_->at(id);
+  return param->HandleGetMsg(msg);
+}
+
+/**
+ * Handle an update request for the parameter identified by the message
+ * target.
+ *
+ * For a known parameter: parse the update, run the updater, bump the
+ * parameter version and build a response carrying the request's address
+ * with source and destination swapped.
+ * For an unknown parameter: log an error and return the original message so
+ * that it can be re-queued.
+ */
+Msg* PMServer::HandleUpdate(Msg **msg) {
+  const int id=(*msg)->target();
+  if(shard_->find(id)==shard_->end()){
+    LOG(ERROR)<<"Param ("<<id<<") is not maintained by server ("<<group_id_
+      <<", "<<server_id_<<")";
+    // hand the request back to be re-queued
+    return *msg;
+  }
+  // response of the format: <identity><type: kData><paramId><param content>
+  shared_ptr<Param> param=shard_->at(id);
+  // remember the addressing before the parameter parses the request
+  Msg* addr=static_cast<Msg*>((*msg)->CopyAddr());
+  param->ParseUpdateMsg(msg);
+  updater_->Update(param->version(), param);
+  param->set_version(param->version()+1);
+  auto reply=param->GenUpdateResponseMsg();
+  addr->SwapAddr();
+  reply->SetAddr(addr);
+  delete addr;
+  return reply;
+}
+
+/**
+ * Handle a synchronisation request for the parameter identified by the
+ * message target.
+ *
+ * @return the result of Param::HandleSyncMsg when the parameter is in the
+ *         shard; otherwise the original message, to be re-queued.
+ */
+Msg* PMServer::HandleSyncRequest(Msg **msg){
+  const int id=(*msg)->target();
+  if(shard_->find(id)==shard_->end()){
+    // unknown parameter: hand the request back to be re-queued
+    return *msg;
+  }
+  // response of the format: <identity><type: kData><paramId><param content>
+  shared_ptr<Param> param=shard_->at(id);
+  return param->HandleSyncMsg(msg);
+}
+
+/**
+ * Apply a synchronisation response to the parameter identified by the
+ * message target. The parameter must already be in the shard (CHECKed).
+ *
+ * @return the result of Param::ParseSyncResponseMsg
+ */
+int PMServer::HandleSyncResponse(Msg **msg){
+  const int id=(*msg)->target();
+  const bool known=shard_->find(id)!=shard_->end();
+  CHECK(known);
+  return shard_->at(id)->ParseSyncResponseMsg(msg);
+}
+
+} // namespace singa
+
+

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/b2dc51d2/src/trainer/pm_worker.cc
----------------------------------------------------------------------
diff --git a/src/trainer/pm_worker.cc b/src/trainer/pm_worker.cc
new file mode 100644
index 0000000..7269578
--- /dev/null
+++ b/src/trainer/pm_worker.cc
@@ -0,0 +1,344 @@
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include "gflags/gflags.h"
+#include <glog/logging.h>
+#include "proto/model.pb.h"
+#include "trainer/pm_worker.h"
+#include "mshadow/tensor.h"
+#include "utils/cluster.h"
+
+
+namespace singa{
+
+/**
+ * Record the identity of the owning worker (group id and worker id) and the
+ * parameter shard this manager operates on.
+ */
+void PMWorker::Setup(int group_id, int worker_id,
+    shared_ptr<ParamShard> shard){
+  shard_=shard;
+  group_id_=group_id;
+  worker_id_=worker_id;
+}
+/**
+ * Map a parameter id to a server id within a server group:
+ * param_id modulo the number of servers per group.
+ */
+int PMWorker::Sharding(int param_id){
+  const auto servers_per_group=Cluster::Get()->nservers_per_group();
+  return param_id%servers_per_group;
+}
+/*
+int PMWorker::Sharding(int param_id){
+  static map<int, int> id2procs;
+  if(id2procs.find(param_id)==id2procs.end()){
+  auto cluster=Cluster::Get();
+  int server_group=group_id_%cluster->nserver_groups();
+  int nprocs_per_server_group=
+    cluster->nservers_per_group()/cluster->nservers_per_procs();
+  int procsid=server_group*nprocs_per_server_group+
+    param_id%nprocs_per_server_group;
+  procsid= cluster->server_worker_separate()?
+    cluster->nworker_procs()+procsid:procsid;
+  id2procs[param_id]=procsid;
+  }
+  return id2procs[param_id];
+}
+*/
+
+// Pass-through overload: returns the message it was given, unchanged.
+Msg* PMWorker::Put(Msg** msg){
+  Msg* same=*msg;
+  return same;
+}
+
+/**
+ * Build a put request for `param` at the given step.
+ *
+ * The parameter version is always set to `step`. A message is generated only
+ * when the parameter is not shared (owner() < 0) or is itself the owner
+ * (owner() == id()); it is addressed to the server selected by Sharding() in
+ * the server group serving this worker group.
+ *
+ * @return the put message, or nullptr when this parameter must not be put
+ */
+Msg* PMWorker::Put(shared_ptr<Param> param, int step){
+  param->set_version(step);
+  // only owner can put shared parameter
+  if(param->owner()>=0&&param->owner()!=param->id())
+    return nullptr;
+
+  Msg* request= param->GenPutMsg(&step);
+  request->set_src(group_id_, worker_id_, kWorkerParam);
+  request->set_dst(
+      group_id_/Cluster::Get()->nworker_groups_per_server_group(),
+      Sharding(param->id()), kServer);
+  request->set_type(kPut);
+  request->set_target(param->id());
+  return request;
+}
+
+// Pass-through overload: returns the message it was given, unchanged.
+Msg* PMWorker::Get(Msg** msg){
+  Msg* same=*msg;
+  return same;
+}
+
+/**
+ * Build a get request for `param` at the given step.
+ *
+ * The parameter version is always set to `step`.
+ *  - Non-shared parameter (owner() < 0): the request goes to the server
+ *    selected by Sharding().
+ *  - Shared parameter: the shard entry's get counter is incremented and a
+ *    request is produced only when nGet / nLocal equals `step`; it is then
+ *    addressed to the stub of the owning process.
+ *
+ * @return the get message, or nullptr when nothing has to be sent
+ */
+Msg* PMWorker::Get(shared_ptr<Param> param, int step){
+  param->set_version(step);
+  const int id=param->id();
+  Msg* request=nullptr;
+  if(param->owner()<0){
+    request=param->GenGetMsg(&step);
+    request->set_dst(
+        group_id_/Cluster::Get()->nworker_groups_per_server_group(),
+        Sharding(id), kServer);
+  }else{
+    shared_ptr<ParamCounter> entry=shard_->at(id);
+    entry->nGet++;
+    if(entry->nGet/entry->nLocal!=step)
+      return nullptr;
+    request=entry->param->GenGetMsg(&step);
+    request->set_dst(entry->owner_procs,kStub);
+  }
+  request->set_src(group_id_, worker_id_, kWorkerParam);
+  request->set_type(kGet);
+  request->set_target(id);
+  return request;
+}
+
+// Pass-through overload: returns the message it was given, unchanged.
+Msg* PMWorker::Update(Msg** msg){
+  Msg* same=*msg;
+  return same;
+}
+/**
+ * Build an update request for `param` at the given step.
+ *
+ * The parameter version is always set to `step`.
+ *  - Non-shared parameter (owner() < 0): the update message is generated
+ *    from `param` and addressed to the server selected by Sharding().
+ *  - Shared parameter: the local gradient is added into the gradient of the
+ *    shard entry's parameter; a message is generated from that entry only
+ *    when nGet / nLocal equals `step`, addressed to the stub of the owning
+ *    process.
+ *
+ * @return the update message, or nullptr when nothing has to be sent
+ */
+Msg* PMWorker::Update(shared_ptr<Param> param, int step){
+  param->set_version(step);
+  bool send=false;
+  int id=param->id();
+  shared_ptr<ParamCounter> entry;
+  if(param->owner()>=0){
+    entry=shard_->at(param->id());
+    // NOTE(review): this increments the same nGet counter that Get() uses --
+    // confirm a dedicated update counter is not intended.
+    entry->nGet++;
+    send=entry->nGet/entry->nLocal==step;
+    // aggregate the local gradient into the shared entry
+    auto shape=mshadow::Shape1(param->size());
+    mshadow::Tensor<mshadow::cpu,1> grad(param->mutable_cpu_grad(), shape);
+    mshadow::Tensor<mshadow::cpu,1> agg(entry->param->mutable_cpu_grad(), shape);
+    agg+=grad;
+  }
+  if(param->owner()<0||send){
+    Msg* msg=nullptr;
+    if(param->owner()<0){
+      msg=param->GenUpdateMsg(&step);
+      msg->set_dst(group_id_/Cluster::Get()->nworker_groups_per_server_group(),
+          Sharding(id), kServer);
+    } else {
+      // Fix: keep the generated message. The result used to be discarded,
+      // leaving msg null and crashing on the set_dst() call below.
+      msg=entry->param->GenUpdateMsg(&step);
+      msg->set_dst(entry->owner_procs,kStub);
+      // NOTE(review): this clears the parameter *data* of `param`; presumably
+      // the aggregated gradient was meant to be reset -- confirm.
+      memset(param->mutable_cpu_data(), 0, sizeof(float)*param->size());
+    }
+    msg->set_type(kUpdate);
+    msg->set_target(id);
+    msg->set_src(group_id_, worker_id_, kWorkerParam);
+    return msg;
+  }else
+    return nullptr;
+}
+
+/**
+ * Consume a response message from a server.
+ *
+ * The response is parsed into the shard entry's parameter selected by the
+ * message target: kRGet responses via ParseGetResponseMsg, kRUpdate
+ * responses via ParseUpdateResponseMsg. The message is then deleted and the
+ * caller's pointer is reset to nullptr.
+ *
+ * @return always nullptr
+ */
+Msg* PMWorker::Collect(Msg** msg){
+  const int id=(*msg)->target();
+  const int type=(*msg)->type();
+  auto target=shard_->at(id)->param;
+  if(type==kRGet)
+    target->ParseGetResponseMsg(msg);
+  else if(type==kRUpdate)
+    target->ParseUpdateResponseMsg(msg);
+
+  if(target->owner()>=0){
+    // forwarding to workers on other procs (not implemented)
+  }
+  delete (*msg);
+  *msg=nullptr;
+  return nullptr;
+}
+
+/*
+//id is the global worker id
+SingaClient::SingaClient(int global_id, Topology &topology, vector<string> &hosts) {
+	//Read the config files and store endpoints
+	id_ = global_id;
+
+	int n_workers = hosts.size() - topology.nservers();
+	int n_worker_groups = topology.nworker_groups();
+	int group_size = n_workers/n_worker_groups;
+	int server_group_size = topology.nservers()/topology.server_group_size();
+	FLAGS_client_threads = topology.worker_threads();
+
+	local_id_ = (id_-topology.nservers())%group_size;//local worker id.
+	group_id_ = (id_-topology.nservers())/group_size;
+
+	VLOG(3) << "Parsing client config for "<<hosts[id_];
+
+	//connect to all server in the server group group_id_
+	int start_server_idx = group_id_*server_group_size;
+	int end_server_idx = start_server_idx+server_group_size;
+
+	for (int i = start_server_idx; i < end_server_idx; i++) {
+		char *neighbor_endpoint = (char*) malloc(256);
+		sprintf(neighbor_endpoint, "tcp://%s:%d", hosts[i].c_str(), topology.port());
+		neighbors_.push_back(neighbor_endpoint);
+		VLOG(3) << "Worker neighbor (server): "<<neighbor_endpoint;
+	}
+
+	sprintf(backend_endpoint_, "inproc://singanus%d",id_);
+
+	//Create shared paramshard
+	param_shard_ = new ParamShard(id_,0);
+}
+
+void SingaClient::StartClient(){
+	//Create and connect sockets to the server
+	vector<void *> server_sockets;
+	zctx_t *context = zctx_new();
+	int nservers = neighbors_.size();
+	int rc;
+	for (int i=0; i<nservers; i++){
+		void *socket = zsocket_new(context, ZMQ_DEALER);
+		rc = zsocket_connect(socket, neighbors_[i]);
+		VLOG(3) << "Connected to neighbor " <<neighbors_[i];
+		assert(rc==0);
+		server_sockets.push_back(socket);
+	}
+
+	//Create and bind backend socket
+	void *backend = zsocket_new(context, ZMQ_ROUTER);
+	rc = zsocket_bind(backend, backend_endpoint_);
+	assert(rc==0);
+
+	//Start client threads
+	for (int i=0; i<FLAGS_client_threads; i++){
+		void * socket = zthread_fork(context, ClientThread, this);
+		zmsg_t *control_msg = zmsg_new();
+		if (i==0 && local_id_==0)
+			zmsg_pushstr(control_msg,POPULATE);
+		else
+			zmsg_pushstr(control_msg, WAIT);
+		zmsg_send(&control_msg, socket);
+	}
+
+	//Star the message loop
+	bool is_running = true;
+	int nsockets= nservers+1;
+	while (is_running) {
+		zmq_pollitem_t items[nsockets];
+		for (int i = 0; i < nsockets-1; i++)
+			items[i] = {server_sockets[i], 0, ZMQ_POLLIN, 0};
+		items[nsockets-1] = {backend, 0, ZMQ_POLLIN, 0};
+
+		int rc = zmq_poll(items,nsockets,-1);
+		if (rc<0) break;
+
+		for (int i=0; i<nsockets-1; i++){
+			if (items[i].revents & ZMQ_POLLIN){
+				zmsg_t *msg = zmsg_recv(server_sockets[i]);
+				if (!msg){
+					is_running = false;
+					break;
+				}
+				//forward to backend
+				zmsg_send(&msg, backend);
+			}
+		}
+		if (items[nsockets-1].revents & ZMQ_POLLIN){
+			//compute serverId from paramId and forward to the socket
+			zmsg_t *msg = zmsg_recv(backend);
+			if (!msg) is_running=false;
+			zframe_t *identity = zmsg_pop(msg);
+			zframe_t *type = zmsg_pop(msg);
+			int paramId;
+			sscanf(zmsg_popstr(msg), "%d", &paramId);
+			zmsg_pushstrf(msg,"%d",paramId);
+			zmsg_prepend(msg,&type);
+			zmsg_prepend(msg,&identity);
+			zmsg_send(&msg, server_sockets[param_to_server_id(paramId)]);
+		}
+	}
+
+	zsocket_destroy(context, backend);
+	for (int i=0; i<nsockets-1; i++)
+		zsocket_destroy(context, server_sockets[i]);
+	zctx_destroy(&context);
+}
+
+vector<Param*> gen_random_params() {
+	int size[] = { 1960000, 2500, 5000000, 2000, 3000000, 1500, 1500000, 1000, 500000, 500, 5000, 10 };
+	vector<Param*> params;
+	for (int i = 0; i < 12; i++) {
+		ParamProto proto;
+		proto.set_id(i);
+		proto.set_init_method(ParamProto::kGaussain);
+		Param* p = new Param();
+		p->Setup(proto, vector<int> { size[i] }, 0);
+		p->Init();
+		params.push_back(p);
+	}
+	return params;
+}
+
+//simple mapping
+int SingaClient::param_to_server_id(int paramId){
+	return paramId % neighbors_.size();
+}
+
+void ClientThread(void *args, zctx_t *ctx, void *pipe){
+	SingaClient *client = static_cast<SingaClient*>(args);
+
+	//Create back-end socket and connect to the main thread
+	void *backend = zsocket_new(ctx, ZMQ_DEALER);
+	int rc = zsocket_connect(backend, client->backend_endpoint());
+	assert(rc==0);
+	//Create PMClient object
+	PMClient *pmclient = new PMClient(client->id(), client->param_shard(), backend);
+
+	//FOR TESTING ONLY. REMOVE THIS!
+	//wait for control from main thread
+	vector<Param*> params = gen_random_params();
+	zmsg_t *control_msg = zmsg_recv(pipe);
+	zframe_t *msg = zmsg_pop(control_msg);
+	if (zframe_streq(msg,WAIT))
+		zclock_sleep(2000); //2s
+	else{
+		for (int i=0; i<params.size(); i++){
+			pmclient->Put(i, params[i]);
+		}
+		VLOG(3)<<"Done PUT requests for populating servers.";
+		zclock_sleep(2000);
+	}
+	zframe_destroy(&msg);
+	//END TESTING
+	LOG(ERROR) << "Done putting";
+
+	//first, get the params
+
+	test_get(pmclient);
+	test_collect(pmclient);
+
+
+	int iterations = 1;
+	while (iterations<=200){
+		VLOG(3) << "Iteration "<<iterations;
+		test_update(pmclient, params);
+		test_collect(pmclient);
+		iterations++;
+	}
+
+	zsocket_destroy(ctx, backend);
+}
+
+void test_get(PMClient *client){
+	for (int i=0; i<12; i++){
+		Param pm;
+		int status = client->Get(i, &pm);
+		assert(status==NON_LOCAL);
+	}
+}
+
+void test_collect(PMClient *client){
+	for (int i=0; i<12; i++){
+		Param pm;
+		int64_t start_time = zclock_time();
+		while (!client->Collect(&pm))
+			zclock_sleep(1);
+		int64_t end_time = zclock_time();
+		VLOG(3) << "Collected: " <<(end_time-start_time);
+	}
+}
+
+void test_update(PMClient *client, vector<Param*> params){
+	for (int i=0; i<params.size(); i++)
+		client->Update(i, params[i]);
+}
+*/
+
+
+} //namespace singa

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/b2dc51d2/src/trainer/server.cc
----------------------------------------------------------------------
diff --git a/src/trainer/server.cc b/src/trainer/server.cc
new file mode 100644
index 0000000..bf0ad03
--- /dev/null
+++ b/src/trainer/server.cc
@@ -0,0 +1,68 @@
+#include <list>
+#include <tuple>
+#include <queue>
+#include "trainer/server.h"
+#include "utils/param.h"
+#include "utils/singleton.h"
+#include "utils/factory.h"
+#include "utils/cluster.h"
+
+
+namespace singa {
+// Stores the identity (server group id, server id) of this server.
+Server::Server(int group_id, int server_id)
+  : group_id_(group_id), server_id_(server_id) {
+}
+
+/**
+ * Prepare the server for running: create the parameter manager registered
+ * under "PMServer", set it up with this server's identity, the shard and the
+ * updater configuration, and keep the dealer used for communication.
+ */
+void Server::Setup(const UpdaterProto& proto,
+    shared_ptr<PMServer::ParamShard> shard,
+    shared_ptr<Dealer> dealer){
+  dealer_=dealer;
+  pmserver_=shared_ptr<PMServer>(
+      Singleton<Factory<PMServer>>::Instance()->Create("PMServer"));
+  pmserver_->Setup(group_id_, server_id_, shard, proto);
+}
+
+/**
+ * Main loop of a server thread.
+ *
+ * Sends a kConnect message to the stub, then receives messages until
+ * Receive() returns nullptr. Each message is dispatched on its type to the
+ * parameter manager; a non-null response is sent back through the dealer.
+ * Message types without a case are silently ignored.
+ */
+void Server::Run(){
+  // announce this server to the stub
+  Msg* hello=new Msg();
+  hello->set_src(group_id_, server_id_, kServer);
+  hello->set_dst(0,0,kStub);
+  hello->set_type(kConnect);
+  // NOTE(review): ownership of messages passed to Send() is not visible
+  // here -- confirm whether Send() releases them.
+  dealer_->Send(hello);
+  int timeout=Cluster::Get()->server_timeout();
+  Poller poller;
+  poller.Add(dealer_.get());
+  // receive loop: process requests until the connection yields no message
+  for(;;){
+    Msg* request=dealer_->Receive();
+    if (request==nullptr)
+      break;
+    Msg* reply=nullptr;
+    switch (request->type()){
+      case kPut:
+        reply = pmserver_->HandlePut(&request);
+        break;
+      case kGet:
+        reply = pmserver_->HandleGet(&request);
+        break;
+      case kUpdate:
+        reply = pmserver_->HandleUpdate(&request);
+        break;
+      case kSyncRequest:
+        VLOG(3)<<"Handle SYNC-REQUEST";
+        reply = pmserver_->HandleSyncRequest(&request);
+        break;
+      case kSyncResponse:
+        VLOG(3) << "Handle SYNC response";
+        pmserver_->HandleSyncResponse(&request);
+        break;
+    }
+
+    if (reply!=nullptr)
+      dealer_->Send(reply);
+  }
+}
+
+
+
+} /* singa */

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/b2dc51d2/src/trainer/trainer.cc
----------------------------------------------------------------------
diff --git a/src/trainer/trainer.cc b/src/trainer/trainer.cc
new file mode 100644
index 0000000..3621b7e
--- /dev/null
+++ b/src/trainer/trainer.cc
@@ -0,0 +1,206 @@
+#include <thread>
+#include <vector>
+#include <map>
+#include <glog/logging.h>
+#include "trainer/trainer.h"
+using std::vector;
+using std::map;
+
+namespace singa {
+/**
+ * Compute the id of the process hosting a server or worker.
+ *
+ * @param group_id id of the server/worker group
+ * @param id       id of the server/worker inside its group
+ * @param flag     entity kind: kServer, kWorkerLayer or kWorkerParam
+ * @return the process id; -1 (after logging an error) for any other flag
+ */
+int ProcsIDOf(int group_id, int id, int flag){
+  // Fix: start from a defined value. The variable used to be uninitialized,
+  // so an unrecognised flag returned an indeterminate value.
+  int procsid=-1;
+  auto cluster=Cluster::Get();
+  if(flag==kServer){
+    procsid=group_id*cluster->nservers_per_group()/
+      cluster->nservers_per_procs()+id/cluster->nservers_per_procs();
+    // server processes are numbered after the worker processes when the two
+    // kinds run in separate processes
+    if(cluster->server_worker_separate())
+      procsid+=cluster->nworker_procs();
+  }else if(flag==kWorkerLayer || flag==kWorkerParam){
+    procsid=group_id*cluster->nworkers_per_group()
+      /cluster->nworkers_per_procs();
+    // a group spanning several processes: add the offset inside the group
+    if(cluster->nworkers_per_group()>cluster->nworkers_per_procs())
+      procsid+=id/cluster->nworkers_per_procs();
+  }else{
+    LOG(ERROR)<<"Unkown flag ("<<flag<<")";
+  }
+  return procsid;
+}
+
+/**
+ * Register the built-in implementations with their factories: all neural-net
+ * layers, "Param", "Updater" (SGDUpdater), "PMWorker" and "PMServer".
+ *
+ * @param proto model configuration (not used by this implementation)
+ */
+void Trainer::RegisterDefaultClasses(const singa::ModelProto& proto){
+  // register all layers appearing in the neural net
+  singa::NeuralNet::RegisterLayers();
+  Singleton<Factory<singa::Param>>::Instance()->Register(
+      "Param", CreateInstance(singa::Param, singa::Param));
+  Singleton<Factory<singa::Updater>>::Instance() ->Register(
+      "Updater", CreateInstance(singa::SGDUpdater, singa::Updater));
+  Singleton<Factory<singa::PMWorker>>::Instance() ->Register(
+      "PMWorker", CreateInstance(singa::PMWorker, singa::PMWorker));
+  // "PMServer" is registered once; the original repeated this call verbatim
+  Singleton<Factory<singa::PMServer>>::Instance() ->Register(
+      "PMServer", CreateInstance(singa::PMServer, singa::PMServer));
+}
+
+/**
+ * Entry point of a SINGA process.
+ *
+ * Registers the default classes, initialises the cluster singleton for this
+ * process, creates the servers and workers hosted by this process (each with
+ * its own dealer socket(s) connected to the in-process router), starts one
+ * thread per server/worker, runs the router loop (Run()) on the calling
+ * thread and finally joins all threads.
+ *
+ * @param mproto   model configuration
+ * @param cproto   cluster configuration
+ * @param procs_id id of this process in the cluster
+ */
+void Trainer::Start(const ModelProto& mproto, const ClusterProto& cproto,
+    int procs_id){
+  RegisterDefaultClasses(mproto);
+
+  auto cluster=Cluster::Get(cproto, procs_id);
+  // create servers
+  vector<shared_ptr<Server>> servers;
+  int nSocket=1; // the first socket is the router
+  if(cluster->has_server()){
+    int pid=cluster->procs_id();
+    if(cluster->server_worker_separate())
+      pid-=cluster->nworker_procs();
+    int gid=pid*cluster->nservers_per_procs()/cluster->nservers_per_group();
+    int start=pid*cluster->nservers_per_procs()%cluster->nservers_per_group();
+    // NOTE(review): the range length is nservers_per_group(); when a group
+    // spans several processes nservers_per_procs() looks intended -- confirm.
+    int end=start+cluster->nservers_per_group();
+    // the ParamShard for servers consists of a dictionary of Param objects
+    auto shard=make_shared<PMServer::ParamShard>();
+    for(int sid=start;sid<end;sid++){
+      auto server=make_shared<Server>(gid, sid);
+      auto dealer=make_shared<Dealer>(nSocket++);
+      dealer->Connect(kInprocRouterEndpoint);
+      server->Setup(mproto.updater(), shard, dealer);
+      servers.push_back(server);
+    }
+  }
+
+  // create workers
+  vector<shared_ptr<Worker>> workers;
+  if(cluster->has_worker()){
+    auto net=NeuralNet::SetupNeuralNet(mproto.neuralnet(), kTrain);
+    int pid=cluster->procs_id();
+    int gstart, gend, wstart, wend;
+    if(cluster->nworkers_per_group()>=cluster->nworkers_per_procs()){
+      // all workers in this procs are from the same group
+      gstart=pid*cluster->nworkers_per_procs()/cluster->nworkers_per_group();
+      gend=gstart+1;
+      wstart=pid*cluster->nworkers_per_procs()%cluster->nworkers_per_group();
+      // NOTE(review): as for servers, nworkers_per_procs() looks intended
+      // for the range length when a group spans several processes -- confirm.
+      wend=wstart+cluster->nworkers_per_group();
+    }else{
+      // there are multiple groups in this procs
+      CHECK_EQ(cluster->nworkers_per_procs()%cluster->nworkers_per_group(),0);
+      int groups_per_procs=
+        cluster->nworkers_per_procs()/cluster->nworkers_per_group();
+      gstart=pid*groups_per_procs;
+      gend=(pid+1)*groups_per_procs;
+      wstart=0;
+      wend=cluster->nworkers_per_group();
+    }
+    for(int gid=gstart;gid<gend;gid++){
+      shared_ptr<NeuralNet> train_net, test_net, validation_net;
+      if(gid==gstart)
+        train_net=net;
+      else{
+        train_net=NeuralNet::SetupNeuralNet(mproto.neuralnet(), kTrain);
+        // the train net for other groups may share parameter values from the
+        // first group
+        if(mproto.hogwild())
+          train_net->ShareParams(net, kValueOnly);
+      }
+      if(gid==0){
+        // validation and test are performed only by the first group
+        if(mproto.test_steps()){
+          test_net=NeuralNet::SetupNeuralNet(mproto.neuralnet(), kTest);
+          if(test_net!=nullptr)
+            test_net->ShareParams(train_net, kValueOnly);
+        }
+        if(mproto.validation_steps()){
+          validation_net=NeuralNet::SetupNeuralNet(mproto.neuralnet(), kValidation);
+          if(validation_net!=nullptr)
+            validation_net->ShareParams(train_net, kValueOnly);
+        }
+      }
+      // create ParamShard for the workers
+      auto shard=make_shared<PMWorker::ParamShard>();
+      for(auto layer: train_net->layers()){
+        int procsid=ProcsIDOf(gid, layer->locationid(),kWorkerParam);
+        // 1 when the layer is hosted by this process, 0 otherwise
+        int local=procsid==cluster->procs_id();
+        for(auto param: layer->GetParams()){
+          // owning process for non-shared/owner params, -1 for the others
+          int owner=param->owner()<0||param->owner()==param->id()?procsid:-1;
+          if(shard->find(param->id())==shard->end())
+            (*shard)[param->id()]=make_shared<ParamCounter>(param, local, owner);
+          else
+            shard->at(param->id())->AddParam(param, local, owner);
+        }
+      }
+      for(int wid=wstart;wid<wend;wid++){
+        shared_ptr<Worker> worker=nullptr;
+        if(mproto.alg()==ModelProto_GradCalcAlg_kBackPropagation)
+          worker=make_shared<BPWorker>(gid, wid);
+        else{
+          // TODO add CDWorker
+          // Fix: fail with a clear message instead of dereferencing the null
+          // worker below.
+          LOG(FATAL)<<"Unsupported gradient calculation algorithm ("
+            <<mproto.alg()<<"); only back-propagation is implemented";
+        }
+        auto layer_dealer=make_shared<Dealer>(nSocket++);
+        auto param_dealer=make_shared<Dealer>(nSocket++);
+        layer_dealer->Connect(kInprocRouterEndpoint);
+        param_dealer->Connect(kInprocRouterEndpoint);
+        worker->Setup(mproto, train_net, shard, layer_dealer, param_dealer);
+        worker->set_test_net(test_net);
+        worker->set_validation_net(validation_net);
+        workers.push_back(worker);
+      }
+    }
+  }
+
+#ifdef USE_MPI
+  for(int i=0;i<nSocket;i++){
+    MPIQueues.push_back(make_shared<SafeQueue>());
+  }
+#endif
+  vector<std::thread> threads;
+  for(auto server: servers)
+    threads.push_back(std::thread(&Server::Run,server));
+  for(auto worker: workers)
+    threads.push_back(std::thread(&Worker::Run,worker));
+  Run();
+  for(auto& thread: threads)
+    thread.join();
+}
+
+/**
+ * Router (stub) loop of the process.
+ *
+ * Binds the router to the in-process endpoint (and to the cluster endpoint
+ * when there is more than one process) and then routes messages forever:
+ *  - messages addressed to the stub are consumed here (kConnect silently,
+ *    any other type with an error log);
+ *  - all other messages are forwarded, through the router when the
+ *    destination lives in this process, or through a per-process dealer
+ *    otherwise.
+ * The process exits when Receive() returns nullptr.
+ */
+void Trainer::Run(){
+  auto cluster=Cluster::Get();
+  auto router=make_shared<Router>();
+  router->Bind(kInprocRouterEndpoint);
+  if(cluster->nprocs()>1)
+    router->Bind(cluster->endpoint());
+
+  // dealers towards other processes, created lazily on first use
+  map<int, shared_ptr<Dealer>> interprocs_dealers;
+  Poller poller;
+  poller.Add(router.get());
+  int timeout=cluster->stub_timeout();
+  while(true){
+    Msg* msg=router->Receive();
+    if(msg==nullptr){
+      LOG(ERROR)<<"Connection broken!";
+      exit(0);
+    }
+    int dst_flag=msg->dst_flag();
+    int type=msg->type();
+    int group_id, id, procs_id;
+    switch (dst_flag){ // TODO process other requests, e.g. RESTful
+      case kStub:
+        if(type!=kConnect){
+          // TODO processing requests for worker group spanning multiple procs.
+          LOG(ERROR)<<"Unkown message type ("<<type<<") to stub";
+        }
+        // Fix: stub messages are consumed here whatever their type. Only
+        // kConnect messages used to be deleted; the others leaked.
+        delete msg;
+        break;
+      default:
+        group_id=msg->dst_group_id();
+        id=msg->dst_id();
+        procs_id=ProcsIDOf(group_id, id, dst_flag);
+        if(procs_id!=cluster->procs_id()){
+          // NOTE(review): the new dealer is never connected to an endpoint
+          // in this function -- confirm the Dealer(int) constructor does it.
+          if (interprocs_dealers.find(procs_id)==interprocs_dealers.end())
+            interprocs_dealers[procs_id]=make_shared<Dealer>(procs_id);
+          interprocs_dealers[procs_id]->Send(msg);
+        } else
+          router->Send(msg);
+        break;
+    }
+  }
+}
+} /* singa */

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/b2dc51d2/src/trainer/worker.cc
----------------------------------------------------------------------
diff --git a/src/trainer/worker.cc b/src/trainer/worker.cc
new file mode 100644
index 0000000..047ec2d
--- /dev/null
+++ b/src/trainer/worker.cc
@@ -0,0 +1,299 @@
+#include <glog/logging.h>
+#include <thread>
+#include <memory>
+#include <iostream>
+#include "utils/singleton.h"
+#include "utils/factory.h"
+#include "trainer/worker.h"
+#include "proto/model.pb.h"
+using std::thread;
+namespace singa {
+// Stores the given group id and worker id; nothing else is initialised here.
+Worker::Worker(int group_id, int worker_id)
+    : group_id_(group_id),
+      worker_id_(worker_id) {}
+
+/**
+ * Prepare the worker for training.
+ *
+ * Stores the model configuration, the training net and the two dealers,
+ * registers each non-null dealer with its poller, creates the PMWorker through
+ * the factory and sets it up with this worker's ids and the given shard.
+ *
+ * Parameters are then handled only when this worker is in group 0, and only
+ * for layers whose location id equals this worker's id: a param that has no
+ * owner (owner()<0) or owns itself is initialised and Put; every param of
+ * such a layer is then requested with Get.
+ */
+void Worker::Setup(const ModelProto& model,
+    shared_ptr<NeuralNet> train_net,
+    shared_ptr<PMWorker::ParamShard> shard,
+    shared_ptr<Dealer> layer_dealer,
+    shared_ptr<Dealer> param_dealer){
+  train_net_=train_net;
+  modelproto_=model;
+  layer_dealer_=layer_dealer;
+  param_dealer_=param_dealer;
+  // Only sockets that were actually supplied are polled.
+  if(layer_dealer_)
+    layer_poller_.Add(layer_dealer_.get());
+  if(param_dealer_)
+    param_poller_.Add(param_dealer_.get());
+
+  pmworker_=shared_ptr<PMWorker>(
+      Singleton<Factory<PMWorker>>::Instance()->Create("PMWorker"));
+  pmworker_->Setup(group_id_, worker_id_, shard);
+  step_=modelproto_.step();
+
+  for(auto layer: train_net->layers()){
+    if(group_id_!=0||layer->locationid()!=worker_id_)
+      continue;
+    for(auto param: layer->GetParams()){
+      const bool is_owner=param->owner()<0||param->owner()==param->id();
+      if(is_owner){
+        param->Init();
+        Put(param, step_);
+      }
+      Get(param, step_);
+    }
+  }
+}
+
+/**
+ * Training loop: starting from the step stored in the model configuration,
+ * runs one batch per step until StopNow() says to stop. A WorkerException
+ * ends the loop and is logged.
+ */
+void Worker::Run(){
+  step_=modelproto_.step();
+  Performance perf(train_net_);
+  try{
+    for(; !StopNow(step_); step_++)
+      RunOneBatch(step_, &perf);
+  }catch(WorkerException& e){
+    LOG(ERROR)<<e.what();
+  }
+}
+// Asks the PMWorker to build a Put message for param at step and, when one is
+// produced, sends it through the param dealer. Always returns 1.
+int Worker::Put(shared_ptr<Param> param, int step){
+  auto request=pmworker_->Put(param, step);
+  if(request==nullptr)
+    return 1;
+  param_dealer_->Send(request);
+  return 1;
+}
+// Requests param for the given step, but only when its current version is
+// older than step; the request is sent only if the PMWorker returns a
+// message. Always returns 1.
+int Worker::Get(shared_ptr<Param> param, int step){
+  if(param->version()>=step)
+    return 1;
+  auto request=pmworker_->Get(param, step);
+  if(request!=nullptr)
+    param_dealer_->Send(request);
+  return 1;
+}
+// Asks the PMWorker to build an Update message for param at step and, when one
+// is produced, sends it through the param dealer. Always returns 1.
+int Worker::Update(shared_ptr<Param> param, int step){
+  auto request=pmworker_->Update(param, step);
+  if(request==nullptr)
+    return 1;
+  param_dealer_->Send(request);
+  return 1;
+}
+// Receives messages from the param dealer and hands each one to the PMWorker
+// until param's version reaches step.
+// Returns 1 once the version is reached, 0 if Receive() yields nullptr first.
+int Worker::Collect(shared_ptr<Param> param, int step){
+  for(;;){
+    if(param->version()>=step)
+      return 1;
+    Msg* reply=param_dealer_->Receive();
+    if(reply==nullptr)
+      return 0;
+    pmworker_->Collect(&reply);
+  }
+}
+
+/**
+ * Run everything that belongs to one training step.
+ *
+ * In order: an optional validation pass (when ValidateNow(step)), an optional
+ * test pass (when TestNow(step)), then one training batch. When perf is
+ * non-null it is updated after the batch and, when DisplayNow(step), logged
+ * and reset.
+ */
+void Worker::RunOneBatch(int step, Performance* perf){
+  const bool has_perf=(perf!=nullptr);
+  // Disabled: Test will call Pull which updates the sync time, so the sync
+  // time would be stored here and restored after the evaluation passes.
+  //float tSyncData=tSyncData_, tSyncParam=tSyncParam_;
+  if(ValidateNow(step)){
+    LOG(ERROR)<<"Validation at step "<<step;
+    Test(validation_net_, modelproto_.validation_steps(), has_perf);
+  }
+  if(TestNow(step)){
+    LOG(ERROR)<<"Test at step "<<step;
+    Test(test_net_, modelproto_.test_steps(), has_perf);
+  }
+  //tSyncData_=tSyncData; tSyncParam_=tSyncParam;
+
+  TrainOneBatch(step);
+
+  if(has_perf){
+    perf->Update();
+    if(DisplayNow(step)){
+      LOG(ERROR)<<"Training at step "<<step;
+      LOG(ERROR)<<"\t"<<perf->ToString();
+      perf->Reset();
+      //LOG(ERROR)<<"\t"<<TimerInfo();
+    }
+  }
+
+  // Disabled: checkpointing.
+  /*
+  if(CheckpointNow(step)){
+    pm_->Checkpoint(cluster_->workspace()+"/snapshot-"+std::to_string(step));
+  }
+  */
+}
+
+// Currently a no-op: the whole body is commented out, so `net` is unused.
+// The disabled code below is kept as a reference; it reads one
+// (type, name, frame) message from pull_ and copies the frame either into the
+// data of a BridgeDstLayer (kDataFrame) or into the grad of a BridgeSrcLayer
+// (kGradFrame), then marks that layer ready.
+// NOTE(review): the disabled code releases `name` with `delete`; confirm how
+// the receiving API allocates that string before re-enabling it.
+void Worker::ReceiveBlobs(shared_ptr<NeuralNet> net){
+  /*
+  int type;
+  char *name;
+  int64_t tick=zclock_mono();
+  zframe_t* frame=zframe_new_empty();
+
+  zsock_recv(pull_, "isf", &type, &name, &frame);
+  if(type==kDataFrame){
+    auto* dst=static_cast<BridgeDstLayer*>(
+        net->name2layer(string(name)).get());
+    memcpy(dst->mutable_data()->mutable_cpu_data(), zframe_data(frame),
+        zframe_size(frame));
+    dst->set_ready(true);
+  }else if(type==kGradFrame){
+    auto* src=static_cast<BridgeSrcLayer*>(net->name2layer(string(name)).get());
+    memcpy(src->mutable_grad()->mutable_cpu_data(), zframe_data(frame),
+        zframe_size(frame));
+    src->set_ready(true);
+  }
+  zframe_destroy(&frame);
+  delete name;
+  tSyncData_+=zclock_mono()-tick;
+  */
+}
+
+// Currently a no-op: the body is empty.
+void Worker::SendBlob(){
+
+}
+
+/**
+ * Evaluate net for nsteps batches.
+ *
+ * Each batch is run through TestOneBatch with phase kTest. When disperf is
+ * true the metrics are accumulated after every batch and logged once at the
+ * end; otherwise nothing is accumulated or logged.
+ */
+void Worker::Test(shared_ptr<NeuralNet> net, int nsteps, bool disperf){
+  Performance perf(net);
+  int step=0;
+  while(step<nsteps){
+    TestOneBatch(net, step, kTest);
+    if(disperf)
+      perf.Update();
+    ++step;
+  }
+  if(!disperf)
+    return;
+  LOG(ERROR)<<"\t"<<perf.ToString();
+}
+
+/****************************BPWorker**********************************/
+
+/**
+ * Forward pass over the layers of net, in the order layers() returns them.
+ *
+ * Only layers whose location id equals this worker's id are processed. In
+ * training mode every param of the layer is collected for `step` first; a
+ * failed Collect raises WorkerException. Exchanging blobs for bridge layers
+ * is not implemented yet (the two bridge branches are empty).
+ */
+void BPWorker::Forward(shared_ptr<NeuralNet> net, int step,  bool training){
+  for(auto& layer: net->layers()){
+    if(layer->locationid()!=worker_id_)
+      continue;
+    if(layer->is_bridgedstlayer()){
+      // TODO receive fea blobs
+      //auto* dst=static_cast<BridgeDstLayer*>(layer.get());
+    }
+    if(training){
+      for(shared_ptr<Param> param: layer->GetParams()){
+        if(Collect(param, step)==0)
+          throw WorkerException();
+      }
+    }
+    layer->ComputeFeature(training);
+    if(layer->is_bridgesrclayer()){
+      // TODO send fea blobs
+    }
+    const bool show_debug=training&&DisplayDebugInfo(step)
+        &&layer->mutable_data()!=nullptr;
+    if(show_debug){
+      LOG(INFO)<<StringPrintf("Forward layer  %10s data norm1 %13.9f",
+          layer->name().c_str(), layer->data().asum_data());
+    }
+  }
+}
+
+/**
+ * Backward pass: visits the layers of net in reverse order.
+ *
+ * For each layer whose location id equals this worker's id the gradient is
+ * computed, debug norms are logged when DisplayDebugInfo(step) is set and the
+ * layer has a grad blob, and an Update is issued for every param of the
+ * layer. Exchanging blobs for bridge layers is not implemented yet.
+ */
+void BPWorker::Backward(shared_ptr<NeuralNet> net, int step){
+  auto& layers=net->layers();
+  for (auto rit = layers.rbegin(); rit != layers.rend(); ++rit){
+    shared_ptr<Layer> layer=*rit;
+    if(layer->locationid()!=worker_id_)
+      continue;
+    if(layer->is_bridgesrclayer()){
+      // TODO receive grad blobs
+      //auto* src=static_cast<BridgeSrcLayer*>(layer.get());
+    }
+    layer->ComputeGradient();
+    const bool show_debug=DisplayDebugInfo(step)
+        &&layer->mutable_grad()!=nullptr;
+    if(show_debug){
+      LOG(INFO)<<StringPrintf("Backward layer %10s grad norm1 %13.9f\t",
+          layer->name().c_str(), layer->grad().asum_data());
+      // The format string below is continued with a backslash, so the leading
+      // spaces of its second line are part of the logged text.
+      for(shared_ptr<Param> p: layer->GetParams())
+        LOG(INFO)<<StringPrintf("param id %2d, name %10s,\
+              value norm1 %13.9f, grad norm1 %13.9f",
+            p->id(), p->name().c_str(),
+            p->data().asum_data(), p->grad().asum_data());
+    }
+    for(shared_ptr<Param> param: layer->GetParams())
+      Update(param, step);
+    if(layer->is_bridgedstlayer()){
+      // TODO send grad blobs
+    }
+  }
+}
+
+// One training iteration on train_net_: a forward pass in training mode
+// followed by a backward pass, both for the given step.
+void BPWorker::TrainOneBatch(int step){
+  Forward(train_net_, step, true);
+  Backward(train_net_, step);
+}
+
+// One evaluation iteration: a forward pass on net with training disabled.
+// The phase argument is not used by this implementation.
+void BPWorker::TestOneBatch(shared_ptr<NeuralNet> net,int step, Phase phase){
+  Forward(net, step, false);
+}
+
+/*********************Implementation for Performance class*******************/
+// Creates one zero-filled accumulator per loss layer of net, sized by that
+// layer's metric().count(), and records the layer's name next to it.
+Performance::Performance(shared_ptr<NeuralNet> net):net_(net), counter_(0){
+  for(auto& layer: net->losslayers()){
+    name_.push_back(layer->name());
+    metric_.push_back(vector<float>(layer->metric().count(), 0.f));
+  }
+}
+
+// Adds the current metric values of every loss layer to the matching
+// accumulator and counts one more update.
+void Performance::Update(){
+  size_t idx=0;
+  for(const auto& layer: net_->losslayers()){
+    const float* src=layer->metric().cpu_data();
+    vector<float>& acc=metric_.at(idx);
+    for(int j=0;j<layer->metric().count();j++)
+      acc[j]+=src[j];
+    ++idx;
+  }
+  counter_++;
+}
+
+// Zeroes every accumulator (their sizes are kept) and the update counter.
+void Performance::Reset(){
+  for(auto& values: metric_)
+    values.assign(values.size(), 0.f);
+  counter_=0;
+}
+
+/**
+ * Render the accumulated metrics as text, one line per loss layer:
+ * "Output from <name> layer <j> : <value>\t..." where each value is the
+ * accumulated metric divided by the number of updates.
+ *
+ * NOTE(review): counter_ is 0 until Update() has been called (and again after
+ * Reset()); the division is not guarded for that case.
+ */
+string Performance::ToString(){
+  string disp="";
+  for(size_t i=0;i<metric_.size();i++){
+    disp+="Output from "+name_[i]+" layer ";
+    // Bind by reference: the values are only read, so no copy is needed.
+    const vector<float>& m=metric_.at(i);
+    for(size_t j=0;j<m.size();j++)
+        disp+=std::to_string(j)+" : "+std::to_string(m[j]/counter_)+"\t";
+    disp+="\n";
+  }
+  return disp;
+}
+/*
+void Executor::Setup(int local_threadid, const ModelProto& model){
+  tForward_=tBackward_=tSyncData_=tSyncParam_=0;
+  modelproto_=model;
+  local_threadid_=local_threadid;
+  if(model.prefetch()){
+    for(auto& layer: train_net_->datalayers()){
+      if(cluster_->group_threadid(local_threadid_)==layer->locationid())
+        localDataLayers_.push_back(layer);
+    }
+    if(localDataLayers_.size())
+      prefetch_thread_=std::thread(Executor::PrefetchData,
+          std::ref(localDataLayers_), true,1);
+  }
+  int gthreadid=cluster_->group_threadid(local_threadid);
+}
+
+void Executor::PrefetchData(const vector<DataLayer*>& datalayers, bool training,
+    int steps){
+  if(datalayers.size()==0)
+    return;
+  for(int i=0;i<steps;i++){
+    for(auto& layer: datalayers){
+      layer->Prefetching(training);
+      for(auto& dstlayer: layer->dstlayers()){
+        CHECK(dstlayer->is_parserlayer());
+        auto parserlayer=static_cast<ParserLayer*>(dstlayer.get());
+        parserlayer->Prefetching(training);
+      }
+    }
+  }
+}
+*/
+
+}  // namespace singa

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/b2dc51d2/src/utils/blob.cc
----------------------------------------------------------------------
diff --git a/src/utils/blob.cc b/src/utils/blob.cc
new file mode 100644
index 0000000..92fc989
--- /dev/null
+++ b/src/utils/blob.cc
@@ -0,0 +1,330 @@
+/**
+ * The code is adapted from that of Caffe whose license is attached.
+ *
+ * COPYRIGHT
+ * All contributions by the University of California:
+ * Copyright (c) 2014, The Regents of the University of California (Regents)
+ * All rights reserved.
+ * All other contributions:
+ * Copyright (c) 2014, the respective contributors
+ * All rights reserved.
+ * Caffe uses a shared copyright model: each contributor holds copyright over
+ * their contributions to Caffe. The project versioning records all such
+ * contribution and copyright details. If a contributor wants to further mark
+ * their specific copyright on a particular contribution, they should indicate
+ * their copyright solely in the commit message of the change when it is
+ * committed.
+ * LICENSE
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ * CONTRIBUTION AGREEMENT
+ * By contributing to the BVLC/caffe repository through pull-request, comment,
+ * or otherwise, the contributor releases their content to the
+ * license and copyright terms herein.
+ */
+#include <utility>
+#include <math.h>
+#include <cblas.h>
+#include "utils/blob.h"
+/*********************SyncedMemory implementation************************/
+
+// Logs a fatal error; used on code paths that need the GPU when the file is
+// built with CPU_ONLY defined.
+#define NO_GPU LOG(FATAL) << "CPU-only Mode: cannot make GPU call."
+// Instantiate a class with float and double specifications.
+#define INSTANTIATE_CLASS(classname) \
+  template class classname<float>; \
+  template class classname<double>
+// Disable the copy and assignment operator for a class.
+// Expands to a `private:` section that declares (without defining) the copy
+// constructor and copy assignment operator.
+#define DISABLE_COPY_AND_ASSIGN(classname) \
+private:\
+  classname(const classname&);\
+  classname& operator=(const classname&)
+
+#ifndef CPU_ONLY
+// CUDA: various checks for different function calls.
+// Each macro evaluates `condition` once and CHECK-fails with the library's
+// error string when the result is not the success value.
+#define CUDA_CHECK(condition) \
+  /* Code block avoids redefinition of cudaError_t error */ \
+  do { \
+    cudaError_t error = condition; \
+    CHECK_EQ(error, cudaSuccess) << " " << cudaGetErrorString(error); \
+  } while (0)
+
+// NOTE(review): the two macros below call helpers in namespace `caffe`;
+// confirm those helpers are available in this code base before using them.
+#define CUBLAS_CHECK(condition) \
+  do { \
+    cublasStatus_t status = condition; \
+    CHECK_EQ(status, CUBLAS_STATUS_SUCCESS) << " " \
+      << caffe::cublasGetErrorString(status); \
+  } while (0)
+
+#define CURAND_CHECK(condition) \
+  do { \
+    curandStatus_t status = condition; \
+    CHECK_EQ(status, CURAND_STATUS_SUCCESS) << " " \
+      << caffe::curandGetErrorString(status); \
+  } while (0)
+
+#endif // CPU_ONLY
+
+
+// Releases the host buffer only when this object allocated it, and (in GPU
+// builds) the device buffer whenever one exists.
+SyncedMemory::~SyncedMemory() {
+  const bool release_host = cpu_ptr_ && own_cpu_data_;
+  if (release_host)
+    FreeHost(cpu_ptr_);
+
+#ifndef CPU_ONLY
+  if (gpu_ptr_)
+    CUDA_CHECK(cudaFree(gpu_ptr_));
+#endif  // CPU_ONLY
+}
+
+/**
+ * Make sure the host buffer holds the current data.
+ *
+ * UNINITIALIZED: allocate a zero-filled host buffer owned by this object and
+ * move the head to the CPU. HEAD_AT_GPU: copy device -> host (allocating the
+ * host buffer on demand) and mark both sides SYNCED; in CPU_ONLY builds this
+ * case is a fatal error. HEAD_AT_CPU / SYNCED: nothing to do.
+ */
+inline void SyncedMemory::to_cpu() {
+  if (head_ == UNINITIALIZED) {
+    MallocHost(&cpu_ptr_, size_);
+    memset(cpu_ptr_,0, size_);
+    head_ = HEAD_AT_CPU;
+    own_cpu_data_ = true;
+    return;
+  }
+  if (head_ != HEAD_AT_GPU)
+    return;
+#ifndef CPU_ONLY
+  if (cpu_ptr_ == NULL) {
+    MallocHost(&cpu_ptr_, size_);
+    own_cpu_data_ = true;
+  }
+  CUDA_CHECK(cudaMemcpy(cpu_ptr_, gpu_ptr_, size_, cudaMemcpyDefault));
+  head_ = SYNCED;
+#else
+  NO_GPU;
+#endif
+}
+
+/**
+ * Make sure the device buffer holds the current data.
+ *
+ * UNINITIALIZED: allocate size_ bytes on the device, zero them and move the
+ * head to the GPU. HEAD_AT_CPU: copy host -> device (allocating the device
+ * buffer on demand) and mark both sides SYNCED. HEAD_AT_GPU / SYNCED: nothing
+ * to do. In CPU_ONLY builds any call is a fatal error.
+ */
+inline void SyncedMemory::to_gpu() {
+#ifndef CPU_ONLY
+  switch (head_) {
+  case UNINITIALIZED:
+    CUDA_CHECK(cudaMalloc(&gpu_ptr_, size_));
+    // Clear the whole allocation: the byte count is size_ (the original code
+    // passed an undeclared identifier here).
+    CUDA_CHECK(cudaMemset(gpu_ptr_, 0, size_));  // NOLINT(caffe/alt_fn)
+    head_ = HEAD_AT_GPU;
+    break;
+  case HEAD_AT_CPU:
+    if (gpu_ptr_ == NULL) {
+      CUDA_CHECK(cudaMalloc(&gpu_ptr_, size_));
+    }
+    CUDA_CHECK(cudaMemcpy( gpu_ptr_,cpu_ptr_, size_, cudaMemcpyDefault));
+    head_ = SYNCED;
+    break;
+  case HEAD_AT_GPU:
+  case SYNCED:
+    break;
+  }
+#else
+  NO_GPU;
+#endif
+}
+
+// Read-only view of the host buffer, brought up to date first via to_cpu().
+const void* SyncedMemory::cpu_data() {
+  to_cpu();
+  const void* host = cpu_ptr_;
+  return host;
+}
+
+// Makes `data` (must be non-null) the host buffer without taking ownership:
+// a previously owned host buffer is freed, the head moves to the CPU and the
+// destructor will not free `data`.
+void SyncedMemory::set_cpu_data(void* data) {
+  CHECK(data);
+  if (own_cpu_data_)
+    FreeHost(cpu_ptr_);
+  own_cpu_data_ = false;
+  head_ = HEAD_AT_CPU;
+  cpu_ptr_ = data;
+}
+
+// Read-only view of the device buffer, brought up to date first via to_gpu().
+// In CPU_ONLY builds this logs a fatal error and returns nullptr.
+const void* SyncedMemory::gpu_data() {
+#ifdef CPU_ONLY
+  NO_GPU;
+  return nullptr;
+#else
+  to_gpu();
+  const void* device = gpu_ptr_;
+  return device;
+#endif
+}
+
+// Writable host buffer. After syncing, the head is moved to the CPU so that a
+// later to_gpu() copies the host contents to the device again.
+void* SyncedMemory::mutable_cpu_data() {
+  to_cpu();
+  void* host = cpu_ptr_;
+  head_ = HEAD_AT_CPU;
+  return host;
+}
+
+// Writable device buffer. After syncing, the head is moved to the GPU so that
+// a later to_cpu() copies the device contents to the host again.
+// In CPU_ONLY builds this logs a fatal error and returns nullptr.
+void* SyncedMemory::mutable_gpu_data() {
+#ifdef CPU_ONLY
+  NO_GPU;
+  return nullptr;
+#else
+  to_gpu();
+  void* device = gpu_ptr_;
+  head_ = HEAD_AT_GPU;
+  return device;
+#endif
+}
+
+/*********************Blob implementation************************/
+
+// Constructs a blob of the given shape by delegating to Reshape().
+template <typename Dtype>
+Blob<Dtype>::Blob(const vector<int>& shape)
+  // capacity_ must be initialized before calling Reshape
+  // (Reshape compares the new element count against capacity_).
+  : capacity_(0) {
+  Reshape(shape);
+}
+
+/**
+ * Set the blob's shape and element count (the product of all dimensions).
+ *
+ * Every dimension must be non-zero. Storage is reallocated only when the new
+ * count exceeds the current capacity; otherwise the existing buffer is kept.
+ * An empty shape yields a count of 1.
+ */
+template <typename Dtype>
+void Blob<Dtype>::Reshape(const vector<int>& shape) {
+  shape_=shape;
+  count_=1;
+  size_t i=0;
+  while(i<shape.size()){
+    CHECK(shape[i]);
+    count_*=shape[i];
+    ++i;
+  }
+  if (!(count_ > capacity_))
+    return;
+  capacity_ = count_;
+  data_.reset(new SyncedMemory(capacity_ * sizeof(Dtype)));
+}
+
+// Reshapes this blob to the shape of `other`; no data is copied.
+template <typename Dtype>
+void Blob<Dtype>::ReshapeLike(const Blob<Dtype>& other) {
+  Reshape(other.shape());
+}
+
+// Typed read-only pointer to the host data; the blob must have storage.
+template <typename Dtype>
+const Dtype* Blob<Dtype>::cpu_data() const {
+  CHECK(data_);
+  const void* raw = data_->cpu_data();
+  return static_cast<const Dtype*>(raw);
+}
+
+// Points the underlying storage at the caller-provided host buffer `data`
+// (must be non-null); see SyncedMemory::set_cpu_data for ownership.
+template <typename Dtype>
+void Blob<Dtype>::set_cpu_data(Dtype* data) {
+  CHECK(data);
+  data_->set_cpu_data(data);
+}
+
+// Typed read-only pointer to the device data; the blob must have storage.
+template <typename Dtype>
+const Dtype* Blob<Dtype>::gpu_data() const {
+  CHECK(data_);
+  const void* raw = data_->gpu_data();
+  return static_cast<const Dtype*>(raw);
+}
+
+// Typed writable pointer to the host data; the blob must have storage.
+template <typename Dtype>
+Dtype* Blob<Dtype>::mutable_cpu_data() {
+  CHECK(data_);
+  void* raw = data_->mutable_cpu_data();
+  return static_cast<Dtype*>(raw);
+}
+
+// Typed writable pointer to the device data; the blob must have storage.
+template <typename Dtype>
+Dtype* Blob<Dtype>::mutable_gpu_data() {
+  CHECK(data_);
+  void* raw = data_->mutable_gpu_data();
+  return static_cast<Dtype*>(raw);
+}
+
+// Makes this blob use the same storage object as `other` (no copy).
+// Both blobs must have the same element count.
+template <typename Dtype>
+void Blob<Dtype>::ShareData(const Blob& other) {
+  CHECK_EQ(count_, other.count());
+  data_ = other.data();
+}
+
+// Mean absolute value of the host data: the BLAS sum of absolute values
+// divided by the element count; 0 for an empty blob.
+template <> float Blob<float>::asum_data() const {
+  const auto n=count();
+  if(n==0)
+    return 0.f;
+  return cblas_sasum(n, cpu_data(), 1)/n;
+}
+// Mean of the host data: elements are summed in index order and the total is
+// divided by the element count; 0 for an empty blob.
+template <> float Blob<float>::sum_data() const {
+  if(count()==0)
+    return 0.f;
+  const float *values=cpu_data();
+  float total=0.f;
+  int i=0;
+  while(i<count()){
+    total+=values[i];
+    ++i;
+  }
+  return total/count();
+}
+// Not supported for unsigned int blobs: invokes NOT_IMPLEMENTED, then
+// returns 0.
+template <> unsigned int Blob<unsigned int>::asum_data() const {
+  NOT_IMPLEMENTED;
+  return 0;
+}
+
+// Not supported for int blobs: invokes NOT_IMPLEMENTED, then returns 0.
+template <> int Blob<int>::asum_data() const {
+  NOT_IMPLEMENTED;
+  return 0;
+}
+
+/**
+ * Exchange the storage (and capacity) of this blob with `other`.
+ *
+ * Both blobs must have the same element count and the same shape. The shapes
+ * are compared as whole vectors: the previous three-iterator std::equal only
+ * walked this blob's dimensions and read past the end of other.shape_ when
+ * `other` had fewer dimensions.
+ */
+template <typename Dtype>
+void Blob<Dtype>::Swap(Blob& other){
+  CHECK_EQ(other.count(), count());
+  CHECK(shape_ == other.shape_);
+  std::swap(data_, other.data_);
+  std::swap(capacity_, other.capacity_);
+}
+
+/**
+ * Copy the contents of `source` into this blob.
+ *
+ * When the shapes differ, this blob is reshaped to source's shape if
+ * `reshape` is true; otherwise the mismatch is a fatal error.
+ *
+ * The shapes are compared as whole vectors. The previous three-iterator
+ * std::equal treated an empty or shorter shape_ that matched the leading
+ * dimensions of the source as "equal" (so the reshape was skipped), and read
+ * past the end of source.shape_ when this blob had more dimensions.
+ *
+ * In GPU builds the device data is copied first, then the host data.
+ */
+template <typename Dtype>
+void Blob<Dtype>::CopyFrom(const Blob& source, bool reshape) {
+  if (shape_ != source.shape_) {
+    if (reshape) {
+      Reshape(source.shape_);
+    } else {
+      LOG(FATAL) << "Trying to copy blobs of different sizes.";
+    }
+  }
+#ifndef CPU_ONLY
+  CUDA_CHECK(cudaMemcpy(static_cast<Dtype*>(data_->mutable_gpu_data()),
+            source.gpu_data(), sizeof(Dtype) * count_, cudaMemcpyDefault));
+#endif
+  memcpy(static_cast<Dtype*>(data_->mutable_cpu_data()),source.cpu_data(),
+        sizeof(Dtype)*count_);
+}
+
+/*
+template <typename Dtype>
+void Blob<Dtype>::FromProto(const BlobProto& proto) {
+  Reshape();
+  // copy data
+  Dtype* data_vec = mutable_cpu_data();
+  for (int i = 0; i < count_; ++i) {
+    data_vec[i] = proto.data(i);
+  }
+}
+*/
+
+/**
+ * Serialise the blob into `proto`.
+ *
+ * The first four dimensions, as far as they exist, are written to num,
+ * channels, height and width; the data field is cleared and refilled with
+ * the host data. shape_[0] is read unconditionally, so the shape must have
+ * at least one dimension.
+ */
+template <typename Dtype>
+void Blob<Dtype>::ToProto(singa::BlobProto* proto) const {
+  const size_t rank=shape_.size();
+  proto->set_num(shape_[0]);
+  if(rank>1)
+    proto->set_channels(shape_[1]);
+  if(rank>2)
+    proto->set_height(shape_[2]);
+  if(rank>3)
+    proto->set_width(shape_[3]);
+
+  proto->clear_data();
+  const Dtype* values = cpu_data();
+  int i = 0;
+  while (i < count_) {
+    proto->add_data(values[i]);
+    ++i;
+  }
+}
+
+// Explicit instantiations: float and double through INSTANTIATE_CLASS, plus
+// int and unsigned int.
+INSTANTIATE_CLASS(Blob);
+template class Blob<int>;
+template class Blob<unsigned int>;


[08/12] incubator-singa git commit: Transfer code from nusinga repo to singa apache repo. New commuinication framework is implemented to unify the frameworks of existing distributed deep learning systems. Communication is now implmented using ZeroMQ. API

Posted by wa...@apache.org.
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/b2dc51d2/include/mshadow/tensor_expr_engine-inl.hpp
----------------------------------------------------------------------
diff --git a/include/mshadow/tensor_expr_engine-inl.hpp b/include/mshadow/tensor_expr_engine-inl.hpp
new file mode 100644
index 0000000..9c5f2c7
--- /dev/null
+++ b/include/mshadow/tensor_expr_engine-inl.hpp
@@ -0,0 +1,416 @@
+#ifndef MSHADOW_TENSOR_EXPR_ENGINE_INL_HPP
+#define MSHADOW_TENSOR_EXPR_ENGINE_INL_HPP
+/*!
+ * \file tensor_expr_engine-inl.hpp
+ * \brief definitions of how expressions should be evaluated
+ * \author Tianqi Chen, Bing Xu
+ */
+#include "tensor_expr.h"
+#include "tensor.h"
+
+namespace mshadow{
+    namespace expr{
+        /*! 
+         * \brief a general class that allows extension that makes tensors of some shape
+         * \tparam SubType type of subclass
+         * \tparam SrcExp source expression of the MakeTensorExp, the source of operation
+         * \tparam dim dimension of the expression
+         */
+        template<typename SubType, typename SrcExp, int dim>
+        struct MakeTensorExp: public Exp< MakeTensorExp<SubType,SrcExp,dim>, type::kMapper >{
+            /*! \brief the shape of this expression */
+            Shape<dim> shape_;
+            /*!
+             * \brief true self of subtype
+             *  Casts this object to SubType with static_cast; this presumes the
+             *  object really is a SubType (confirm at the definitions of subclasses).
+             */
+            inline const SubType& real_self( void ) const{
+                return *static_cast<const SubType*>(this);
+            }
+        };
+    };
+    
+    namespace expr{
+        /*! \brief This part of code gives plan that can be used to carry out execution */
+        // Primary template: only declares Eval; the specializations below define it
+        // for each kind of expression.
+        template<typename ExpType>
+        class Plan{
+        public:
+            /*!
+             * \brief evaluate the expression at index [y][x]
+             *        to be implemented by SubType
+             */
+            MSHADOW_XINLINE real_t Eval( index_t y, index_t x ) const;
+        };
+
+        // Plan for a tensor: keeps the data pointer and the stride taken from the
+        // tensor's shape, and reads element y * stride + x.
+        template <typename Device, int dim>
+        class Plan< Tensor<Device,dim> >{
+        public:
+            Plan( const Tensor<Device,dim> &t )
+                :dptr_(t.dptr),stride_(t.shape.stride_){}
+            MSHADOW_XINLINE real_t Eval( index_t y, index_t x ) const{
+                return dptr_[ y * stride_ + x ];
+            }
+        private:
+            const real_t  *dptr_;
+            index_t stride_;
+        };
+        // special evaluation case for 1d tensor
+        // (y is ignored; the element is addressed by x alone)
+        template <typename Device>
+        class Plan< Tensor<Device,1> >{
+        public:
+            Plan( const Tensor<Device,1> &t ):dptr_(t.dptr){}
+            MSHADOW_XINLINE real_t Eval( index_t y, index_t x ) const{
+                return dptr_[ x ];
+            }
+        private:
+            const real_t  *dptr_;
+        };
+        
+        // Plan for a scalar: every index evaluates to the stored scalar.
+        template<>
+        class Plan<ScalarExp>{
+        public:
+            Plan( real_t scalar ):scalar_(scalar){}
+            /*! \brief evaluate at [y][x] */
+            MSHADOW_XINLINE real_t Eval( index_t y, index_t x ) const{
+                    return scalar_;
+            }
+        private:
+            real_t scalar_;
+        };
+
+        // Plan for a binary map: applies OP::Map to the values of both operand
+        // plans at the same index.
+        template<typename OP, typename TA, typename TB,int etype>
+        class Plan< BinaryMapExp<OP,TA,TB,etype> >{
+        public:
+            Plan( const Plan<TA> &lhs, const Plan<TB> &rhs )
+                :lhs_(lhs), rhs_(rhs){}
+            MSHADOW_XINLINE real_t Eval( index_t y, index_t x ) const{
+                return OP::Map( lhs_.Eval( y, x ), rhs_.Eval( y, x ) );
+            }
+        private:
+            Plan<TA> lhs_;
+            Plan<TB> rhs_;
+        };
+
+        // Plan for a unary map: applies OP::Map to the value of the source plan.
+        template<typename OP, typename TA, int etype>
+        class Plan< UnaryMapExp<OP,TA,etype> >{
+        public:
+            Plan( const Plan<TA> &src ):src_(src){}
+            MSHADOW_XINLINE real_t Eval( index_t y, index_t x ) const{
+                return OP::Map( src_.Eval( y, x ) );
+            }
+        private:
+            Plan<TA> src_;
+        };
+
+        
+        // Plan for a MakeTensorExp: forwards Eval to the plan of its SubType.
+        template<typename SubType, typename SrcExp, int dim>
+        struct Plan< MakeTensorExp<SubType,SrcExp,dim> >{
+        public:
+            Plan( const Plan<SubType> &src ):src_(src){}
+            MSHADOW_XINLINE real_t Eval( index_t y, index_t x ) const{
+                return src_.Eval( y, x );
+            }
+        private:
+            Plan<SubType> src_;  
+        };
+
+        // allow UnaryMap see the plan
+        // (forward declaration of the BinaryMapExp overload, which is defined last;
+        //  the UnaryMapExp overload below calls MakePlan on its source expression)
+        template<typename OP, typename TA, typename TB, int etype>
+        inline Plan< BinaryMapExp<OP,TA,TB,etype> > MakePlan( const BinaryMapExp<OP,TA,TB,etype> &e );
+
+        // translate from exp to execution plan
+        // scalar: the plan just carries the scalar value
+        inline Plan<ScalarExp> MakePlan( const ScalarExp &e ){
+            return Plan<ScalarExp>( e.scalar_ );
+        }
+
+        // container expression: plan is built from the concrete object, e.self()
+        template<typename T>
+        inline Plan<T> MakePlan( const ContainerExp<T> &e ){
+            return Plan<T>( e.self() );
+        }
+
+        // MakeTensorExp: plan is built from the real subtype object
+        template<typename T, typename SrcExp, int dim>
+        inline Plan< T > MakePlan( const MakeTensorExp<T,SrcExp,dim> &e ){
+            return Plan< T >( e.real_self() );
+        }
+
+        // unary map: build the plan of the source expression, then wrap it
+        template<typename OP, typename TA, int etype>
+        inline Plan< UnaryMapExp<OP,TA,etype> > MakePlan( const UnaryMapExp<OP,TA,etype> &e ){
+            return Plan< UnaryMapExp<OP,TA,etype> >( MakePlan(e.src_) );
+        }
+
+        // binary map: build the plans of both operands, then wrap them
+        template<typename OP, typename TA, typename TB, int etype>
+        inline Plan< BinaryMapExp<OP,TA,TB,etype> > MakePlan( const BinaryMapExp<OP,TA,TB,etype> &e ){
+            return Plan< BinaryMapExp<OP,TA,TB,etype> >( MakePlan(e.lhs_), MakePlan(e.rhs_) );
+        }
+    }; // namespace expr
+
+    namespace expr{
+        /*!
+         * \brief static type inference template, 
+         *        used to get the dimension of each expression, 
+         *        if ExpInfo<E>::kDim == -1, this means here are mismatch in expression
+         *        if ( ExpInfo<E>::kDevMask & cpu::kDevMask ) != 0, this means this expression can be assigned to cpu
+         * \tparam E expression
+         */
+        // Fallback for expression types without a specialization: marked as a
+        // mismatch (kDim == -1) with no device bit set.
+        template<typename E>
+        struct ExpInfo{
+            const static int kDim = -1;
+            const static int kDevMask = 0;
+        };
+        // Scalar: dimension 0, device mask 0xffff.
+        template<>
+        struct ExpInfo<ScalarExp>{
+            const static int kDim = 0;
+            const static int kDevMask = 0xffff;
+        };
+        // Tensor: its own dimension and its device's mask.
+        template<typename Device, int dim>
+        struct ExpInfo< Tensor<Device,dim> >{
+            const static int kDim = dim;
+            const static int kDevMask = Device::kDevMask;            
+        };
+        // MakeTensorExp: dimension `dim` if the source expression is valid,
+        // otherwise -1; the device mask is inherited from the source.
+        template<typename T, typename SrcExp, int dim>
+        struct ExpInfo< MakeTensorExp<T,SrcExp,dim> >{
+            const static int kDimSrc = ExpInfo<SrcExp>::kDim;
+            const static int kDim = kDimSrc >= 0 ? dim : -1;
+            const static int kDevMask = ExpInfo<SrcExp>::kDevMask;
+        };
+        // Unary map: same dimension and device mask as its operand.
+        template<typename OP, typename TA, int etype>
+        struct ExpInfo< UnaryMapExp<OP,TA,etype> >{
+            const static int kDim = ExpInfo<TA>::kDim;
+            const static int kDevMask = ExpInfo<TA>::kDevMask;
+        };
+        // Binary map: both operands must be valid. If the left dimension is 0 the
+        // result takes the right dimension; otherwise the right dimension must be 0
+        // or equal to the left one, else the result is -1. The device mask is the
+        // bitwise AND of both operand masks.
+        template<typename OP, typename TA, typename TB, int etype>
+        struct ExpInfo< BinaryMapExp<OP,TA,TB,etype> >{
+            const static int kDimLhs = ExpInfo<TA>::kDim;
+            const static int kDimRhs = ExpInfo<TB>::kDim;
+            const static int kDim = (kDimLhs>=0 && kDimRhs >= 0) ? \
+                ( kDimLhs==0 ? kDimRhs : ( (kDimRhs==0||kDimLhs==kDimRhs) ? kDimLhs : -1 ) ):-1;
+            const static int kDevMask = ExpInfo<TA>::kDevMask & ExpInfo<TB>::kDevMask;
+        };
+
+        /*!
+         * \brief template to do type check
+         *  Compile-time flags telling whether expression E can be used with a
+         *  target of the given Device and dimension.
+         * \tparam Device target device
+         * \tparam dim target dimension
+         * \tparam E expression type
+         */
+        template<typename Device, int dim, typename E>
+        struct TypeCheck{
+            /*! \brief dimension of expression*/
+            const static int kExpDim = ExpInfo<E>::kDim;
+            /*! \brief whether the expression device type matches */
+            const static bool kDevPass = (ExpInfo<E>::kDevMask & Device::kDevMask) != 0;
+            /*! \brief whether the expression can be mapped to expression of dim */
+            const static bool kMapPass = (kExpDim == 0 || kExpDim == dim) && kDevPass;
+            /*! \brief whether the expression can be reduced to expression of dim */
+            const static bool kRedPass = (kExpDim > dim) && kDevPass;
+        };
+
+        // Compile-time assertion helper. Only the <true> specialization has the
+        // Error_* member functions, so referring to one of them through
+        // TypeCheckPass<false> does not compile and the function name shows up in
+        // the compiler's error message.
+        template<bool kPass>
+        struct TypeCheckPass;
+        template<>
+        struct TypeCheckPass<false>{};
+        template<>
+        struct TypeCheckPass<true>{
+            inline static void Error_All_Tensor_in_Exp_Must_Have_Same_Type( void ){}
+            inline static void Error_TypeCheck_Not_Pass_For_Reduce_Exp( void ){}
+            inline static void Error_Expression_Does_Not_Meet_Dimension_Req( void ){}
+        };
+    }; // namespace expr
+    
+    namespace expr{
+        // check shape consistency
+        // Check() returns the shape an expression evaluates to. The primary template
+        // only declares Check; the specializations below define it.
+        template<int dim,typename E>
+        struct ShapeCheck{
+            inline static Shape<dim> Check( const E &t );
+        };
+        
+        // Scalar: returns a shape whose element 0 is 0, which the binary case below
+        // treats as "no shape constraint".
+        template<int dim>
+        struct ShapeCheck<dim,ScalarExp>{
+            inline static Shape<dim> Check( const ScalarExp &exp ){
+                // use lowest dimension to mark scalar exp
+                Shape<dim> shape; shape[0] = 0; 
+                return shape;
+            }
+        };
+        // Tensor: its own shape.
+        template<int dim,typename Device>
+        struct ShapeCheck<dim,Tensor<Device,dim> >{
+            inline static Shape<dim> Check( const Tensor<Device,dim> &t ){
+                return t.shape;
+            }
+        };
+        // MakeTensorExp: the shape stored in the expression.
+        template<int dim,typename SrcExp,typename T>
+        struct ShapeCheck<dim,MakeTensorExp<T,SrcExp,dim> >{
+            inline static Shape<dim> Check( const MakeTensorExp<T,SrcExp,dim> &t ){
+                return t.shape_;
+            }
+        };
+        // Unary map: the shape of its operand.
+        template<int dim, typename OP, typename TA, int etype>
+        struct ShapeCheck< dim,UnaryMapExp<OP,TA,etype> >{
+            inline static Shape<dim> Check( const UnaryMapExp<OP,TA,etype> &t ){
+                Shape<dim> s = ShapeCheck<dim,TA>::Check( t.src_ );
+                return s;
+            }
+        };
+        // Binary map: if one side carries the scalar marker (element 0 == 0) the
+        // other side's shape is returned; otherwise both shapes must be equal.
+        template<int dim, typename OP, typename TA, typename TB, int etype>
+        struct ShapeCheck< dim, BinaryMapExp<OP,TA,TB,etype> >{
+            inline static Shape<dim> Check( const BinaryMapExp<OP,TA,TB,etype> &t ){
+                Shape<dim> shape1 = ShapeCheck<dim,TA>::Check( t.lhs_ );
+                Shape<dim> shape2 = ShapeCheck<dim,TB>::Check( t.rhs_ );
+                if( shape1[0] == 0 ) return shape2;
+                if( shape2[0] == 0 ) return shape1;
+                utils::Assert( shape1 == shape2, "BinaryMapExp: Shapes of two tensors in BinaryMapExp expression is not the same");
+                return shape1;
+            }
+        };
+    }; // namespace expr
+
+    // the matrix OP depends on BLAS
+    namespace expr{
+        // Primary template for dot-product evaluation; only Eval is declared here
+        // and no definition is visible in this part of the file. The template is
+        // parameterised by the saver SV, the device, the dimensions of dst/lhs/rhs
+        // and whether lhs/rhs are transposed.
+        template<typename SV,typename Device, int ddim, int ldim, int rdim, bool ltrans, bool rtrans>
+        struct DotEngine{
+            inline static void Eval( Tensor<Device,ddim> &dst, const Tensor<Device,ldim> &lhs, const Tensor<Device,rdim> &rhs, real_t scale );
+        };
+
+        // handles the dot
+        // Device-specific BLAS wrapper; specialized per device below.
+        template<typename Device>
+        struct BLASEngine;
+
+        #if (MSHADOW_USE_CBLAS||MSHADOW_USE_MKL)
+        // CPU specialization: thin float/double overloads that forward to the
+        // matching cblas_* routine, always with CblasColMajor ordering.
+        template<>
+        struct BLASEngine<cpu>{
+            // maps a "transpose?" flag to the CBLAS enum value
+            inline static CBLAS_TRANSPOSE GetT( bool t ){
+                return t ? CblasTrans : CblasNoTrans;
+            }
+            // matrix-matrix product (cblas_sgemm / cblas_dgemm)
+            inline static void gemm( bool transa, bool transb, int m, int n, int k, float alpha, \
+                                     const float *A, int lda, const float *B, int ldb, float beta, float *C, int ldc ){
+                cblas_sgemm(CblasColMajor, GetT(transa), GetT(transb), m,n,k,alpha,A,lda,B,ldb,beta,C,ldc);
+            }
+            inline static void gemm( bool transa, bool transb, int m, int n, int k, double alpha, \
+                                     const double *A, int lda, const double *B, int ldb, double beta, double *C, int ldc ){
+                cblas_dgemm(CblasColMajor, GetT(transa), GetT(transb), m,n,k,alpha,A,lda,B,ldb,beta,C,ldc);
+            }
+            // matrix-vector product (cblas_sgemv / cblas_dgemv)
+            inline static void gemv( bool trans, int m, int n, float alpha, const float *A, int lda, \
+                                     const float *X, int incX, float beta, float *Y, int incY ){
+                cblas_sgemv(CblasColMajor, GetT(trans), m,n,alpha,A,lda,X,incX,beta,Y,incY);
+            }
+            inline static void gemv( bool trans, int m, int n, double alpha, const double *A, int lda, \
+                                     const double *X, int incX, double beta, double *Y, int incY ){
+                cblas_dgemv(CblasColMajor, GetT(trans), m,n,alpha,A,lda,X,incX,beta,Y,incY);
+            }
+            // rank-1 update (cblas_sger / cblas_dger)
+            inline static void ger( int m, int n, float alpha, const float *X, int incX, const float *Y, int incY, float *A, int lda ){
+                cblas_sger(CblasColMajor,m,n,alpha,X,incX,Y,incY,A,lda);
+            }
+            inline static void ger( int m, int n, double alpha, const double *X, int incX, const double *Y, int incY, double *A, int lda ){
+                cblas_dger(CblasColMajor,m,n,alpha,X,incX,Y,incY,A,lda);
+            }
+        };
+        #endif // MSHADOW_USE_CBLAS || MSHADOW_USE_MKL
+
+        #if MSHADOW_USE_CUDA
+        // GPU BLAS backend: every CuBLAS call goes through here.
+        // Uses the legacy CuBLAS API, which is not threadsafe.
+        template<>
+        struct BLASEngine<gpu>{
+            /*! \brief translate a transpose flag into the character code passed to the cublas calls */
+            inline static char GetT( bool t ){
+                if( t ) return 'T';
+                return 'N';
+            }
+            /*! \brief single precision matrix-matrix product, forwarded to cublasSgemm */
+            inline static void gemm( bool transa, bool transb, int m, int n, int k, float alpha,
+                                     const float *A, int lda, const float *B, int ldb,
+                                     float beta, float *C, int ldc ){
+                const char ta = GetT( transa );
+                const char tb = GetT( transb );
+                cublasSgemm( ta, tb, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc );
+            }
+            /*! \brief double precision matrix-matrix product, forwarded to cublasDgemm */
+            inline static void gemm( bool transa, bool transb, int m, int n, int k, double alpha,
+                                     const double *A, int lda, const double *B, int ldb,
+                                     double beta, double *C, int ldc ){
+                const char ta = GetT( transa );
+                const char tb = GetT( transb );
+                cublasDgemm( ta, tb, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc );
+            }
+            /*! \brief single precision matrix-vector product, forwarded to cublasSgemv */
+            inline static void gemv( bool trans, int m, int n, float alpha, const float *A, int lda,
+                                     const float *X, int incX, float beta, float *Y, int incY ){
+                const char ta = GetT( trans );
+                cublasSgemv( ta, m, n, alpha, A, lda, X, incX, beta, Y, incY );
+            }
+            /*! \brief double precision matrix-vector product, forwarded to cublasDgemv */
+            inline static void gemv( bool trans, int m, int n, double alpha, const double *A, int lda,
+                                     const double *X, int incX, double beta, double *Y, int incY ){
+                const char ta = GetT( trans );
+                cublasDgemv( ta, m, n, alpha, A, lda, X, incX, beta, Y, incY );
+            }
+            /*! \brief single precision rank-1 update, forwarded to cublasSger */
+            inline static void ger( int m, int n, float alpha, const float *X, int incX,
+                                    const float *Y, int incY, float *A, int lda ){
+                cublasSger( m, n, alpha, X, incX, Y, incY, A, lda );
+            }
+            /*! \brief double precision rank-1 update, forwarded to cublasDger */
+            inline static void ger( int m, int n, double alpha, const double *X, int incX,
+                                    const double *Y, int incY, double *A, int lda ){
+                cublasDger( m, n, alpha, X, incX, Y, incY, A, lda );
+            }
+        };
+        #endif
+
+        // helper: effective 2D shape of an operand of dot.
+        // Without transpose the shape is returned as is; with transpose a new
+        // shape is built by Shape2 from the two extents passed in swapped order.
+        inline static Shape<2> GetShape( const Shape<2> &shape, bool transpose ){
+            if( !transpose ) return shape;
+            return Shape2( shape[0], shape[1] );
+        }
+        // dst = dot( lhs[.T], rhs[.T] ) for 2D operands, evaluated by one gemm call
+        template<typename SV, typename xpu, bool transpose_left, bool transpose_right>
+        struct DotEngine<SV,xpu,2,2,2,transpose_left,transpose_right>{
+            inline static void Eval( Tensor<xpu,2> &dst, const Tensor<xpu,2> &lhs, const Tensor<xpu,2> &rhs, real_t scale ) {
+                const Shape<2> sleft  = GetShape( lhs.shape, transpose_left );
+                const Shape<2> sright = GetShape( rhs.shape, transpose_right );
+                const bool shape_ok = dst.shape[1] == sleft[1] && dst.shape[0] == sright[0]
+                                      && sleft[0] == sright[1];
+                utils::Assert( shape_ok, "dot-gemm: matrix shape mismatch" );
+                // the operands are handed to BLAS in swapped order (rhs first, then lhs)
+                // so that the column major BLAS convention can be used directly
+                const int m = transpose_right ? rhs.shape[1] : rhs.shape[0];
+                const int n = transpose_left  ? lhs.shape[0] : lhs.shape[1];
+                const int k = transpose_right ? rhs.shape[0] : rhs.shape[1];
+                BLASEngine<xpu>::gemm( transpose_right, transpose_left, m, n, k,
+                                       scale * SV::kAlphaBLAS,
+                                       rhs.dptr, rhs.shape.stride_,
+                                       lhs.dptr, lhs.shape.stride_,
+                                       SV::kBetaBLAS,
+                                       dst.dptr, dst.shape.stride_ );
+            }
+        };
+        // dst = dot( lhs, rhs[.T] ) with a 1D lhs and a 2D rhs, evaluated by one gemv call
+        template<typename SV, typename xpu, bool transpose_right>
+        struct DotEngine<SV,xpu,1,1,2,false,transpose_right>{
+            inline static void Eval( Tensor<xpu,1> &dst, const Tensor<xpu,1> &lhs, const Tensor<xpu,2> &rhs, real_t scale ) {
+                const Shape<2> sright = GetShape( rhs.shape, transpose_right );
+                const bool shape_ok = dst.shape[0] == sright[0] && lhs.shape[0] == sright[1];
+                utils::Assert( shape_ok, "dot-gemv: matrix shape mismatch");
+                // both vectors are passed with an increment of 1
+                BLASEngine<xpu>::gemv( transpose_right,
+                                       rhs.shape[0], rhs.shape[1],
+                                       scale * SV::kAlphaBLAS,
+                                       rhs.dptr, rhs.shape.stride_,
+                                       lhs.dptr, 1,
+                                       SV::kBetaBLAS,
+                                       dst.dptr, 1 );
+            }
+        };
+        // dst = dot( lhs.T, rhs ) for two 1D operands (outer product of two vectors)
+        template<typename SV, typename xpu>
+        struct DotEngine<SV,xpu,2,1,1,true,false>{
+            /*!
+             * \brief evaluate the outer product into dst
+             * \param dst target matrix, must satisfy dst.shape[1] == lhs.shape[0] and dst.shape[0] == rhs.shape[0]
+             * \param lhs left vector
+             * \param rhs right vector
+             * \param scale extra scaling factor, multiplied with SV::kAlphaBLAS
+             */
+            inline static void Eval( Tensor<xpu,2> &dst, const Tensor<xpu,1> &lhs, const Tensor<xpu,1> &rhs, real_t scale ) {
+                utils::Assert( dst.shape[1] == lhs.shape[0] && dst.shape[0] == rhs.shape[0], "dot-ger: matrix shape mismatch" );
+                // BLAS ger is the rank-1 update A := alpha * x * y' + A: it has no beta and
+                // always adds to the existing content of A. It is therefore only valid when
+                // the saver accumulates with beta == 1. For any other beta (in particular
+                // beta == 0, overwrite) the old content of dst must be scaled by beta,
+                // which is done by falling back to gemm.
+                const bool beta_is_one = SV::kBetaBLAS > 1.0f - 1e-6f && SV::kBetaBLAS < 1.0f + 1e-6f;
+                if( beta_is_one ){
+                    BLASEngine<xpu>::ger
+                        ( rhs.shape[0], lhs.shape[0], scale * SV::kAlphaBLAS,
+                          rhs.dptr, 1, lhs.dptr, 1, dst.dptr, dst.shape.stride_ );
+                }else{
+                    DotEngine<SV,xpu,2,2,2,true,false>::Eval( dst, lhs.FlatTo2D(), rhs.FlatTo2D(), scale );
+                }
+            }
+        };
+
+    }; // namespace expr
+
+    namespace expr{
+        /*!
+         * \brief engine that evaluates complex expressions into a tensor;
+         *  Eval is only declared here, the evaluation logic comes from specializations
+         * \tparam SV saver type
+         * \tparam Device device of the destination tensor
+         * \tparam dim dimension of the destination tensor
+         * \tparam E type of the complex expression
+         */
+        template<typename SV, typename Device, int dim, typename E>
+        struct ExpComplexEngine{
+            inline static void Eval( Tensor<Device,dim>& dst, const E &exp );
+        };
+        /*!
+         * \brief expression engine for a Tensor destination: picks the evaluation
+         *  routine from the expression type tag of the right hand side
+         */
+        template<typename SV, typename Device, int dim>
+        struct ExpEngine<SV, Tensor<Device,dim> >{
+            /*! \brief mapper expressions are evaluated through MapExp */
+            template<typename E>
+            inline static void Eval( Tensor<Device,dim>& dst, const Exp<E,type::kMapper> &exp ){
+                MapExp<SV,dim,E>( dst, exp );
+            }
+            /*! \brief container expressions are evaluated through MapExp as well */
+            template<typename E>
+            inline static void Eval( Tensor<Device,dim>& dst, const Exp<E,type::kContainer> &exp ){
+                MapExp<SV,dim,E>( dst, exp );
+            }
+            /*! \brief complex expressions are handed to the matching ExpComplexEngine */
+            template<typename E>
+            inline static void Eval( Tensor<Device,dim>& dst, const Exp<E,type::kComplex> &exp ){
+                ExpComplexEngine<SV,Device,dim,E>::Eval( dst, exp.self() );
+            }
+        };
+        /*! \brief complex engine for DotExp of two tensors: unpacks the expression and forwards to the matching DotEngine */
+        template<typename SV, typename Device, int dim, int ldim,int rdim,bool ltrans,bool rtrans>
+        struct ExpComplexEngine< SV, Device, dim, DotExp< Tensor<Device,ldim>, Tensor<Device,rdim>, ltrans, rtrans > >{
+            inline static void Eval( Tensor<Device,dim> &dst, const DotExp< Tensor<Device,ldim>, Tensor<Device,rdim>, ltrans, rtrans > &exp ){
+                // operands and scale are taken from the expression object
+                DotEngine<SV,Device,dim,ldim,rdim,ltrans,rtrans>::Eval( dst, exp.lhs_, exp.rhs_, exp.scale_ );
+            }
+        };
+    }; // namespace expr
+};
+#endif

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/b2dc51d2/include/mshadow/tensor_expr_ext.h
----------------------------------------------------------------------
diff --git a/include/mshadow/tensor_expr_ext.h b/include/mshadow/tensor_expr_ext.h
new file mode 100644
index 0000000..8399b1b
--- /dev/null
+++ b/include/mshadow/tensor_expr_ext.h
@@ -0,0 +1,978 @@
+#ifndef MSHADOW_TENSOR_EXPR_EXT_H
+#define MSHADOW_TENSOR_EXPR_EXT_H
+/*!
+ * \file tensor_expr_ext.h
+ * \brief some extension of expressions, used to support something beyond elementwise op
+ * \author Tianqi Chen, Bing Xu
+ */
+#include "tensor_expr_engine-inl.hpp"
+namespace mshadow{
+    // Declaration of expressions goes here
+    namespace expr{
+        /*!
+         * \brief broadcast Tensor1D into a higher dimension Tensor
+         * input: Tensor<Device,1>: ishape[0]
+         * output: Tensor<Device,dimdst> : oshape[dimcast] = ishape[0]
+         * The relation between src and shape is not checked by this struct itself.
+         * \tparam Device which device it lies
+         * \tparam dimdst  target tensor dimension
+         * \tparam dimcast the dimension where the 1D tensor fills in by index
+         */
+        template<typename Device, int dimdst, int dimcast>
+        struct Broadcast1DExp: public MakeTensorExp< Broadcast1DExp<Device,dimdst,dimcast>,Tensor<Device,1>,dimdst>{
+            /*! \brief source operand, the Tensor object is stored by value here, not by reference */
+            const Tensor<Device,1> src_;
+            /*! \brief constructor, shape becomes the shape of this expression */
+            Broadcast1DExp( const Tensor<Device,1> &src, Shape<dimdst> shape ):src_(src){
+                this->shape_ = shape;
+            }
+        };
+
+        /*!
+         * \brief unpack local (overlapping) patches of an image into columns of a matrix,
+         *  can be used to implement convolution; this version supports unpacking a batch of images.
+         *  After getting the unpacked mat, output = dot( weight, mat ) gives the convolved result.
+         *  The resulting expression is 2D with
+         *    shape[0] = o_height * o_width * number of images
+         *    shape[1] = psize * psize * number of input channels
+         * \tparam SrcExp source expression
+         * \tparam srcdim dimension of the source expression
+         */
+        template<typename SrcExp, int srcdim>
+        struct UnpackPatchToColXExp: public MakeTensorExp< UnpackPatchToColXExp<SrcExp,srcdim>, SrcExp, 2>{
+            /*! \brief source operand, kept by reference */
+            const SrcExp& img_;
+            /*! \brief patch size */
+            index_t psize_;
+            /*! \brief patch stride */
+            index_t pstride_;
+            /*! \brief number of input channel */
+            index_t i_channel_;
+            /*! \brief height of img */
+            index_t i_height_;
+            /*! \brief width of img */
+            index_t i_width_;
+            /*! \brief constructor, derives the 2D output shape from the image shape, psize and pstride */
+            UnpackPatchToColXExp( const SrcExp &img, index_t psize, index_t pstride )
+                :img_(img), psize_(psize), pstride_(pstride){
+                Shape<srcdim> ishape = ShapeCheck<srcdim,SrcExp>::Check( img_ );
+                utils::Assert( ishape[0] >= psize && ishape[1] >= psize, "UnpackPatchToCol:image shape smaller than patch size");
+                // lowest three extents: width, height, channel
+                i_width_   = ishape[0];
+                i_height_  = ishape[1];
+                i_channel_ = ishape[2];
+                // all extents from index 3 upwards are folded into the image count
+                const index_t nimage = ishape.ProdShape( 3, srcdim );
+                const index_t out_h = ( i_height_ - psize ) / pstride + 1;
+                const index_t out_w = ( i_width_  - psize ) / pstride + 1;
+                this->shape_[0] = out_h * out_w * nimage;
+                this->shape_[1] = psize * psize * ishape[2];
+            }
+        };
+
+        /*!
+         * \brief reverse operation of UnpackPatchToCol, used to backprop gradient back;
+         *    this version supports multiple images
+         * \tparam Device which device it lies
+         * \tparam dstdim destination dimension
+         */
+        template<typename Device, int dstdim>
+        struct PackColToPatchXExp: public MakeTensorExp< PackColToPatchXExp<Device,dstdim>, Tensor<Device,2>, dstdim>{
+            /*! \brief source operand, kept by reference */
+            const Tensor<Device,2>& mat_;
+            /*! \brief patch size */
+            index_t psize_;
+            /*! \brief patch stride */
+            index_t pstride_;
+            /*! \brief constructor, checks that mat has the shape an unpack of imshape would produce */
+            PackColToPatchXExp( const Tensor<Device,2> &mat, Shape<dstdim> imshape, index_t psize, index_t pstride )
+                :mat_(mat), psize_(psize), pstride_(pstride){
+                const index_t out_h = ( imshape[1] - psize ) / pstride + 1;
+                const index_t out_w = ( imshape[0] - psize ) / pstride + 1;
+                // expected extents of the unpacked matrix
+                const index_t expect0 = out_h * out_w * imshape.ProdShape(3,dstdim);
+                const index_t expect1 = psize * psize * imshape[2];
+                this->shape_ = imshape;
+                utils::Assert( mat.shape[0] == expect0, "PackColToPatchExp: mat.shape[0] mismatch" );
+                utils::Assert( mat.shape[1] == expect1, "PackColToPatchExp: mat.shape[1] mismatch" );
+            }
+        };
+
+        /*!
+         * \brief reshape the content to another shape
+         * input: Tensor<Device,dimsrc>: ishape
+         * output: Tensor<Device,dimdst> with ishape.Size() == oshape.Size()
+         * \tparam SrcExp source expression
+         * \tparam dimdst target dimension
+         * \tparam dimsrc source dimension
+         */
+        template<typename SrcExp, int dimdst, int dimsrc>
+        struct ReshapeExp: public MakeTensorExp< ReshapeExp<SrcExp,dimdst,dimsrc>, SrcExp, dimdst>{
+            /*! \brief source expression, kept by reference */
+            const SrcExp& src_;
+            /*! \brief extent of dimension 0 of the input shape */
+            index_t ishape0_;
+            /*! \brief constructor, the element count of shape must equal that of the source */
+            ReshapeExp( const SrcExp &src, Shape<dimdst> shape ):src_(src){
+                Shape<dimsrc> src_shape = ShapeCheck<dimsrc,SrcExp>::Check( src_ );
+                utils::Assert( src_shape.Size() == shape.Size(), "reshape size must match" );
+                this->shape_ = shape;
+                ishape0_ = src_shape[0];
+            }
+        };
+
+        /*!
+         * \brief swap two axis of a tensor
+         * input: Tensor<Device,dimsrc>: ishape
+         * output: Tensor<Device,dimsrc> oshape[a1],oshape[a2] = ishape[a2],ishape[a1]
+         *
+         * \tparam SrcExp type of source expression
+         * \tparam dimsrc source dimension
+         * \tparam a1 smaller dimension to be swapped
+         * \tparam a2 larger dimension to be swapped
+         */
+        template<typename SrcExp,int dimsrc, int a1, int a2>
+        struct SwapAxisExp: public MakeTensorExp< SwapAxisExp<SrcExp,dimsrc,a1,a2>, SrcExp, dimsrc>{
+            /*! \brief source expression, kept by reference */
+            const SrcExp& src_;
+            /*! \brief constructor, the shape is the source shape with extents a1 and a2 exchanged */
+            SwapAxisExp( const SrcExp &src ):src_(src){
+                this->shape_ = ShapeCheck<dimsrc,SrcExp>::Check(src);
+                const index_t extent_a1 = this->shape_[a1];
+                this->shape_[a1] = this->shape_[a2];
+                this->shape_[a2] = extent_a1;
+            }
+        };
+
+        /*!
+         * \brief reduction to 1 dimension tensor
+         * input: Tensor<Device,k>: ishape
+         * output: Tensor<Device,1> shape[0] = ishape[dimkeep];
+         *
+         * \tparam EType type of expression to be reduced
+         * \tparam Reducer which reducer to use
+         * \tparam dimkeep which dimension to be kept,
+         */
+        template<typename EType, typename Reducer,int dimkeep>
+        struct ReduceTo1DExp: public Exp< ReduceTo1DExp<EType,Reducer, dimkeep>, type::kComplex >{
+            /*! \brief source operand, kept by reference */
+            const EType& src_;
+            /*! \brief scale factor carried along with the reduction */
+            real_t scale_;
+            /*! \brief construct a reduction expression from src and scale */
+            ReduceTo1DExp( const EType& src, real_t scale ):src_(src),scale_(scale){}
+        };
+
+        /*!
+         * \brief pooling expression, do reduction over local patches of a image
+         * \tparam Reducer reduction method during pooling
+         * \tparam SrcExp source expression to be pooled from
+         * \tparam srcdim dimension of src
+         */
+        template<typename Reducer, typename SrcExp, int srcdim>
+        struct PoolingExp: public MakeTensorExp< PoolingExp<Reducer, SrcExp,srcdim>, SrcExp, srcdim> {
+            /*! \brief source operand, kept by reference */
+            const SrcExp& src_;
+            /*! \brief kernel size */
+            index_t ksize_;
+            /*! \brief kernel stride */
+            index_t kstride_;
+            /*! \brief source height shape[1] */
+            index_t src_height_;
+            /*! \brief source width shape[0] */
+            index_t src_width_;
+            /*! \brief constructor, pooled height and width are computed from ksize and kstride */
+            PoolingExp( const SrcExp &src, index_t ksize, index_t kstride )
+                : src_(src), ksize_(ksize), kstride_(kstride) {
+                Shape< srcdim > in_shape = ShapeCheck< srcdim,SrcExp>::Check( src_ );
+                utils::Assert( in_shape[0] >= ksize && in_shape[1] >= ksize, "pool: kernel must be smaller than image" );
+                src_width_  = in_shape[0];
+                src_height_ = in_shape[1];
+                // all other extents are taken over from the source unchanged
+                this->shape_ = in_shape;
+                this->shape_[0] = (src_width_  - ksize) / kstride + 1;
+                this->shape_[1] = (src_height_ - ksize) / kstride + 1;
+            }
+            /*! \brief constructor, pooled height and width are given explicitly by pshape */
+            PoolingExp( const SrcExp &src, Shape<2> pshape, index_t ksize, index_t kstride )
+                : src_(src), ksize_(ksize), kstride_(kstride) {
+                Shape< srcdim > in_shape = ShapeCheck< srcdim,SrcExp>::Check( src_ );
+                utils::Assert( in_shape[0] >= ksize && in_shape[1] >= ksize, "pool: kernel must be smaller than image" );
+                src_width_  = in_shape[0];
+                src_height_ = in_shape[1];
+                // all other extents are taken over from the source unchanged
+                this->shape_ = in_shape;
+                this->shape_[0] = pshape[0];
+                this->shape_[1] = pshape[1];
+            }
+        };
+
+        /*!
+         * \brief unpooling expr, reverse operation of pooling, used to pass gradient back
+         * \tparam Reducer specifies reduction operation during pooling
+         * \tparam Device which device it lies
+         */
+        template<typename Reducer, typename Device>
+        struct UnPoolingExp: public MakeTensorExp< UnPoolingExp<Reducer, Device>, Tensor<Device,4>, 4> {
+            /*! \brief source input, corresponds to src in pooling */
+            const Tensor<Device, 4>& data_src_;
+            /*! \brief result of pooled data, corresponds to result of pooling */
+            const Tensor<Device, 4>& data_pooled_;
+            /*! \brief gradient data of pooled part, to be propagated down */
+            const Tensor<Device, 4>& grad_pooled_;
+            /*! \brief kernel size */
+            index_t ksize_;
+            /*! \brief kernel stride */
+            index_t kstride_;
+            /*! \brief constructor, the resulting expression has the shape of data_src */
+            UnPoolingExp( const Tensor<Device,4> &data_src,  const Tensor<Device,4> &data_pooled,
+                          const Tensor<Device,4> &grad_pooled, index_t ksize, index_t kstride )
+                : data_src_(data_src), data_pooled_(data_pooled), grad_pooled_(grad_pooled),
+                  ksize_(ksize), kstride_(kstride) {
+                // gradient and pooled data must have exactly the same shape
+                utils::Assert( grad_pooled.shape == data_pooled.shape, "UnPoolingExp: pooled shape mismatch" );
+                // extents 2 and 3 must agree between the pooled gradient and the source
+                utils::Assert( grad_pooled.shape[2] == data_src.shape[2], "UnPoolingExp: pool and src shape mismatch" );
+                utils::Assert( grad_pooled.shape[3] == data_src.shape[3], "UnPoolingExp: pool and src shape mismatch" );
+                this->shape_ = data_src.shape;
+            }
+        };
+
+        /*!
+         * \brief padding expression, pad a image with zeros
+         * \tparam SrcExp source expression to be padded
+         * \tparam srcdim dimension of src
+         */
+        template<typename SrcExp, int srcdim>
+        struct PaddingExp : public MakeTensorExp<PaddingExp<SrcExp, srcdim>, SrcExp, srcdim> {
+            /*! \brief source operand, kept by reference */
+            const SrcExp& src_;
+            /*! \brief pad size */
+            index_t pad_;
+            /*! \brief source tensor height */
+            index_t src_height_;
+            /*! \brief source tensor width */
+            index_t src_width_;
+            /*! \brief constructor, height and width of the result grow by 2 * pad each */
+            PaddingExp( const SrcExp &src, index_t pad )
+                : src_(src), pad_(pad) {
+                const Shape<srcdim> in_shape = ShapeCheck<srcdim,SrcExp>::Check( src_ );
+                src_width_  = in_shape[0];
+                src_height_ = in_shape[1];
+                this->shape_ = in_shape;
+                this->shape_[0] = in_shape[0] + pad * 2; // width
+                this->shape_[1] = in_shape[1] + pad * 2; // height
+            }
+        };
+
+        /*!
+         * \brief crop expression, cut off the boundary region, reverse operation of padding
+         * \tparam SrcExp source expression to be cropped
+         * \tparam srcdim dimension of src
+         */
+        template<typename SrcExp, int srcdim>
+        struct CroppingExp : public MakeTensorExp< CroppingExp<SrcExp, srcdim>, SrcExp, srcdim> {
+            /*! \brief source operand, kept by reference */
+            const SrcExp& src_;
+            /*! \brief offset in height at which the cropped region starts */
+            index_t pad_height_;
+            /*! \brief offset in width at which the cropped region starts */
+            index_t pad_width_;
+            /*! \brief src height */
+            index_t src_height_;
+            /*!
+             * \brief constructor, crops a region of size cshape from the center of src
+             * \param src source expression
+             * \param cshape cropped size, cshape[1]: height, cshape[0]: width
+             */
+            CroppingExp(const SrcExp &src, Shape<2> cshape ): src_(src) {
+                this->shape_ = ShapeCheck<srcdim,SrcExp>::Check( src_ );
+                utils::Assert(this->shape_[1] >= cshape[1], "CroppingExp: height requirement not met");
+                utils::Assert(this->shape_[0] >= cshape[0], "CroppingExp: width requirement not met");
+                // split the removed margin evenly between both sides
+                pad_height_ = (this->shape_[1] - cshape[1]) / 2;
+                pad_width_ = (this->shape_[0] - cshape[0]) / 2;
+                src_height_ = this->shape_[1];
+                this->shape_[1] = cshape[1]; // height
+                this->shape_[0] = cshape[0]; // width
+            }
+            /*!
+             * \brief constructor, crops a region of size cshape starting at an explicit offset
+             * \param src source expression
+             * \param cshape cropped size, cshape[1]: height, cshape[0]: width
+             * \param start_height offset in height where the cropped region starts
+             * \param start_width offset in width where the cropped region starts
+             */
+            CroppingExp(const SrcExp &src, Shape<2> cshape, index_t start_height, index_t start_width  )
+                : src_(src), pad_height_(start_height), pad_width_(start_width) {
+                this->shape_ = ShapeCheck<srcdim,SrcExp>::Check( src_ );
+                utils::Assert(this->shape_[1] >= cshape[1], "CroppingExp: height requirement not met");
+                utils::Assert(this->shape_[0] >= cshape[0], "CroppingExp: width requirement not met");
+                // the cropped window must lie completely inside the source; the subtraction
+                // cannot wrap around because of the two checks above
+                utils::Assert(start_height <= this->shape_[1] - cshape[1], "CroppingExp: start_height out of range");
+                utils::Assert(start_width <= this->shape_[0] - cshape[0], "CroppingExp: start_width out of range");
+                src_height_ = this->shape_[1];
+                this->shape_[1] = cshape[1]; // height
+                this->shape_[0] = cshape[0]; // width
+            }
+
+        }; // struct CroppingExp
+
+
+        /*!
+         * \brief mirror expression, mirror a image in width
+         * \tparam SrcExp source expression to be mirrored
+         * \tparam srcdim dimension of src
+         */
+        template<typename SrcExp, int srcdim>
+        struct MirroringExp : public MakeTensorExp<MirroringExp<SrcExp, srcdim>, SrcExp, srcdim> {
+            /*! \brief source operand, kept by reference */
+            const SrcExp& src_;
+            /*! \brief constructor, the result has the same shape as src */
+            MirroringExp( const SrcExp &src ): src_(src) {
+                this->shape_ = ShapeCheck<srcdim,SrcExp>::Check( src_ );
+            }
+        };
+
+        /*!
+         * \brief channel pooling expression, do reduction over (local nearby) channels,
+         *  used to implement local response normalization
+         * \tparam Reducer reduction method during pooling
+         * \tparam SrcExp source expression to be pooled from
+         * \tparam srcdim dimension of src
+         */
+        template<typename Reducer, typename SrcExp, int srcdim>
+        struct ChannelPoolingExp: public MakeTensorExp< ChannelPoolingExp<Reducer, SrcExp,srcdim>, SrcExp, srcdim> {
+            /*! \brief source operand, kept by reference */
+            const SrcExp& src_;
+            /*! \brief neighbor size */
+            index_t nsize_;
+            /*! \brief constructor, nsize must be odd and must not exceed the channel extent shape[2] */
+            ChannelPoolingExp( const SrcExp &src, index_t nsize ): src_(src), nsize_(nsize){
+                const bool is_odd = ( nsize % 2 == 1 );
+                utils::Assert( is_odd, "ChannelPoolingExp: local size must be odd, to make it symmetric" );
+                // the result keeps the shape of the source
+                this->shape_ = ShapeCheck<srcdim,SrcExp>::Check( src_ );
+                utils::Assert( this->shape_[2] >= nsize, "ChannelPoolingExp: local size need to be smaller than number of channels" );
+            }
+        };
+    }; // namespace expr
+
+
+    // Declaration of all functions go here
+    namespace expr{
+        /*! \brief scale a reduction expression from the right: the factor is folded into scale_ */
+        template<typename E, typename R,int d>
+        inline ReduceTo1DExp<E,R,d> operator*( const ReduceTo1DExp<E,R,d> &e, real_t scale ){
+            const real_t combined = e.scale_ * scale;
+            return ReduceTo1DExp<E,R,d>( e.src_, combined );
+        }
+        /*! \brief scale a reduction expression from the left: the factor is folded into scale_ */
+        template<typename E, typename R,int d>
+        inline ReduceTo1DExp<E,R,d> operator*( real_t scale, const ReduceTo1DExp<E,R,d> &e ){
+            const real_t combined = e.scale_ * scale;
+            return ReduceTo1DExp<E,R,d>( e.src_, combined );
+        }
+
+        /*!
+         * \brief an expression that replicates a 1 dimension tensor along dimension dimcast
+         * \param src Tensor<Device,1>: shape[0], must equal shape[dimcast]
+         * \param shape shape of output
+         * \return an expression with type Tensor<Device,dimdst>
+         * \tparam dimcast target dimension where the 1D tensor will be broadcasted
+         * \tparam Device which device it lies
+         * \tparam dimdst dimension of destination tensor
+         */
+        template<int dimcast,typename Device,int dimdst>
+        inline Broadcast1DExp<Device,dimdst,dimcast> broadcast( const Tensor<Device,1> &src, Shape<dimdst> shape ){
+            typedef Broadcast1DExp<Device,dimdst,dimcast> ResultExp;
+            // compile time check: dimcast must be a valid dimension of the destination
+            TypeCheckPass< dimcast<dimdst >::Error_Expression_Does_Not_Meet_Dimension_Req();
+            const bool extent_ok = ( src.shape[0] == shape[dimcast] );
+            utils::Assert( extent_ok, "broadcast, shape mismatch" );
+            return ResultExp( src, shape );
+        }
+
+        /*!
+         * \brief unpack local (overlapping) patches of image to columns of mat, can be used to implement convolution.
+         *  After getting the unpacked mat, output = dot( weight, mat ) gives the convolved result, with the relations:
+         *
+         *  weight; shape[1]: out_channel, shape[0]: ichannel*psize*psize
+         *  output; shape[1]: out_channel, shape[0]: out_height*out_width * num_of_images
+         *  out_height = ( in_height - psize ) / pstride + 1
+         *  out_width  = ( in_width - psize ) / pstride + 1
+         *
+         * \return mat target matrix; shape[1]: in_channel*psize*psize  shape[0]: out_height*out_width * num_of_images
+         * \param img source image; shape[2]:  in_channels, shape[1]: in_height, shape[0]: in_width, can be 3D or 4D tensor(multiple images)
+         * \param psize height and width of each patch
+         * \param pstride stride of each patch
+         * \tparam SrcExp source expression
+         * \tparam etype type of expression
+         */
+        template<typename SrcExp, int etype>
+        inline UnpackPatchToColXExp<SrcExp, ExpInfo<SrcExp>::kDim > unpack_patch2col( const Exp<SrcExp,etype> &img, index_t psize, index_t pstride ){
+            typedef UnpackPatchToColXExp<SrcExp, ExpInfo<SrcExp>::kDim > ResultExp;
+            // compile time check: the source must have at least 3 dimensions
+            TypeCheckPass< ExpInfo<SrcExp>::kDim >= 3 >::Error_Expression_Does_Not_Meet_Dimension_Req();
+            return ResultExp( img.self(), psize, pstride );
+        }
+
+        /*!
+         * \brief reverse operation of unpack_patch2col, can be used to implement deconvolution
+         * \return packed img expression
+         * \param mat source matrix
+         * \param imshape shape of target img
+         * \param psize height and width of each patch
+         * \param pstride stride of each patch
+         * \tparam Device the Device where input data lies
+         * \tparam dstdim dimension of the target img
+         */
+        template<typename Device, int dstdim>
+        inline PackColToPatchXExp<Device,dstdim> pack_col2patch( const Tensor<Device,2> &mat, Shape<dstdim> imshape, index_t psize, index_t pstride ){
+            typedef PackColToPatchXExp<Device,dstdim> ResultExp;
+            // width and height of the target must both hold at least one patch
+            const bool fits = imshape[0] >= psize && imshape[1] >= psize;
+            utils::Assert( fits, "PackColToPatch:image shape smaller than patch size");
+            return ResultExp( mat, imshape, psize, pstride );
+        }
+        /*!
+         * \brief an expression that reshapes a tensor to another shape
+         * \param src source expression, Tensor<Device,dimsrc>
+         * \param oshape target shape
+         * \return an expression with type Tensor<Device,dimdst>
+         * \tparam SrcExp source expression
+         * \tparam etype source expression type
+         * \tparam dimdst target dimension
+         */
+        template<typename SrcExp, int etype, int dimdst>
+        inline ReshapeExp< SrcExp,dimdst, ExpInfo<SrcExp>::kDim > reshape( const Exp<SrcExp,etype> &src, Shape<dimdst> oshape ){
+            typedef ReshapeExp< SrcExp,dimdst, ExpInfo<SrcExp>::kDim > ResultExp;
+            return ResultExp( src.self(), oshape );
+        }
+
+        /*!
+         * \brief an expression that swaps two axis of a tensor
+         * \param src source expression, Tensor<Device,dimsrc>
+         * \return an expression with the same dimension as src, with axis a1 and a2 exchanged
+         * \tparam a1 smaller dimension to be swapped
+         * \tparam a2 larger dimension to be swapped
+         * \tparam SrcExp source expression
+         * \tparam etype source expression type
+         */
+        template<int a1, int a2, typename SrcExp, int etype>
+        inline SwapAxisExp< SrcExp, ExpInfo<SrcExp>::kDim, a1,a2> swapaxis( const Exp<SrcExp,etype> &src ){
+            typedef ExpInfo<SrcExp> Info;
+            typedef SwapAxisExp< SrcExp,Info::kDim,a1,a2> ResultExp;
+            // compile time check: both axis must exist in the source and a1 must be below a2
+            TypeCheckPass< Info::kDim>=a1+1 && Info::kDim >= a2+1 && a1+1 <= a2 >::Error_Expression_Does_Not_Meet_Dimension_Req();
+            return ResultExp( src.self() );
+        }
+
+        /*!
+         * \brief a sum over all dimensions, except dimkeep
+         * \param exp input expression
+         * \return an expression with type Tensor<Device,1>, created with an initial scale of 1
+         * \tparam dimkeep the dimension that will be kept
+         * \tparam SrcExp expression
+         * \tparam etype type of expression
+         */
+        template<int dimkeep,  typename SrcExp, int etype>
+        inline ReduceTo1DExp<SrcExp, red::sum, dimkeep > sumall_except_dim( const Exp<SrcExp,etype> &exp ){
+            typedef ReduceTo1DExp<SrcExp,red::sum,dimkeep> ResultExp;
+            return ResultExp( exp.self(), 1.0f );
+        }
+
+        /*!
+         * \brief pooling subregion results together
+         * \param src source image, shape[3]: batch, shape[2]: channel shape[1]: height shape[0]:width
+         * \param ksize kernel size
+         * \param kstride stride for each kernel
+         * \return expression of pooled result
+         * \tparam Reducer reducer type
+         * \tparam SrcExp source expression
+         * \tparam etype type of expression
+         */
+        template<typename Reducer, typename SrcExp, int etype>
+        inline PoolingExp<Reducer,SrcExp, ExpInfo<SrcExp>::kDim > pool( const Exp<SrcExp,etype> &src, index_t ksize, index_t kstride ) {
+            typedef PoolingExp<Reducer,SrcExp, ExpInfo<SrcExp>::kDim > ResultExp;
+            // compile time check: the source must have at least 2 dimensions
+            TypeCheckPass< ExpInfo<SrcExp>::kDim >= 2 >::Error_Expression_Does_Not_Meet_Dimension_Req();
+            return ResultExp( src.self(), ksize, kstride );
+        }
+        /*!
+         * \brief same as pool, except the output shape is specified by pshape
+         * \param src source image
+         * \param pshape output shape, pshape[1]: height, pshape[0]: width
+         * \param ksize kernel size
+         * \param kstride stride for each kernel
+         * \return expression of pooled result
+         * \tparam Reducer reducer type
+         * \tparam SrcExp source expression
+         * \tparam etype type of expression
+         */
+        template<typename Reducer, typename SrcExp, int etype>
+        inline PoolingExp<Reducer,SrcExp, ExpInfo<SrcExp>::kDim > pool( const Exp<SrcExp,etype> &src, Shape<2> pshape, index_t ksize, index_t kstride ) {
+            typedef PoolingExp<Reducer,SrcExp, ExpInfo<SrcExp>::kDim > ResultExp;
+            // compile time check: the source must have at least 2 dimensions
+            TypeCheckPass< ExpInfo<SrcExp>::kDim >= 2 >::Error_Expression_Does_Not_Meet_Dimension_Req();
+            return ResultExp( src.self(), pshape, ksize, kstride );
+        }
+        /*!
+         * \brief unpooling gradient for 4D, backprop gradient value back, reverse operation of pooling
+         * \param data_src  source input, corresponds to src in pooling
+         * \param data_pooled result of pooled data, corresponds to result of pooling
+         * \param grad_pooled gradient data of pooled part, to be propagated down
+         * \param ksize kernel size
+         * \param kstride stride for each kernel
+         * \return expression corresponding to unpooled 4D Tensor, storing backproped gradient
+         * \tparam Reducer reducer type
+         * \tparam Device device where data lies
+         */
+         template<typename Reducer, typename Device>
+         inline UnPoolingExp<Reducer, Device> unpool( const Tensor<Device,4>&data_src, const Tensor<Device,4> &data_pooled,
+                                                      const Tensor<Device,4> &grad_pooled, index_t ksize, index_t kstride ) {
+             typedef UnPoolingExp<Reducer, Device> ResultExp;
+             return ResultExp( data_src, data_pooled, grad_pooled, ksize, kstride );
+         }
+
+        /*!
+         * \brief padding expression: pads an image with zeros on the boundaries, padding affects shape[0] and shape[1]
+         * \param src original image batches
+         * \param pad padding size
+         * \return expression corresponding to padded result
+         * \tparam SrcExp source expression
+         * \tparam etype type of expression
+         */
+         template<typename SrcExp, int etype>
+         inline PaddingExp<SrcExp, ExpInfo<SrcExp>::kDim> pad(const Exp<SrcExp, etype> &src, index_t pad) {
+             // the result expression has the same number of dimensions as the source
+             typedef PaddingExp<SrcExp, ExpInfo<SrcExp>::kDim> ResultExp;
+             // compile time check: the source must have at least 2 dimensions
+             TypeCheckPass< ExpInfo<SrcExp>::kDim >= 2 >::Error_Expression_Does_Not_Meet_Dimension_Req();
+             return ResultExp( src.self(), pad );
+         }
+
+        /*!
+         * \brief reverse operation of padding: cuts off the boundaries, crops the output from the center of the input
+         * \param src original image batches
+         * \param oshape output shape to be cropped
+         * \return expression corresponding to cropped result
+         * \tparam SrcExp source expression
+         * \tparam etype type of expression
+         */
+         template<typename SrcExp, int etype>
+         inline CroppingExp<SrcExp, ExpInfo<SrcExp>::kDim> crop( const Exp<SrcExp, etype> &src, Shape<2> oshape ) {
+             // the result expression has the same number of dimensions as the source
+             typedef CroppingExp<SrcExp, ExpInfo<SrcExp>::kDim> ResultExp;
+             // compile time check: the source must have at least 2 dimensions
+             TypeCheckPass< ExpInfo<SrcExp>::kDim >= 2 >::Error_Expression_Does_Not_Meet_Dimension_Req();
+             return ResultExp( src.self(), oshape );
+         }
+        /*!
+         * \brief cropping with an explicit starting position; otherwise same as crop
+         * \param src original image batches
+         * \param oshape output shape to be cropped
+         * \param start_height start height position to do cropping
+         * \param start_width  start width position to do cropping
+         * \return expression corresponding to cropped result
+         * \tparam SrcExp source expression
+         * \tparam etype type of expression
+         */
+         template<typename SrcExp, int etype>
+         inline CroppingExp<SrcExp, ExpInfo<SrcExp>::kDim> crop( const Exp<SrcExp, etype> &src, Shape<2> oshape, index_t start_height, index_t start_width ) {
+             // the result expression has the same number of dimensions as the source
+             typedef CroppingExp<SrcExp, ExpInfo<SrcExp>::kDim> ResultExp;
+             // compile time check: the source must have at least 2 dimensions
+             TypeCheckPass< ExpInfo<SrcExp>::kDim >= 2 >::Error_Expression_Does_Not_Meet_Dimension_Req();
+             return ResultExp( src.self(), oshape, start_height, start_width );
+         }
+
+        /*!
+         * \brief mirroring expression: mirrors images along the width
+         * \param src original image batches
+         * \return expression corresponding to mirrored result
+         * \tparam SrcExp source expression
+         * \tparam etype type of expression
+         */
+         template<typename SrcExp, int etype>
+         inline MirroringExp<SrcExp, ExpInfo<SrcExp>::kDim> mirror(const Exp<SrcExp, etype> &src) {
+             // the result expression has the same number of dimensions as the source
+             typedef MirroringExp<SrcExp, ExpInfo<SrcExp>::kDim> ResultExp;
+             // compile time check: the source must have at least 2 dimensions
+             TypeCheckPass< ExpInfo<SrcExp>::kDim >= 2 >::Error_Expression_Does_Not_Meet_Dimension_Req();
+             return ResultExp( src.self() );
+         }
+
+        /*!
+         * \brief channel pooling: reduction over (local nearby) channels, used to implement local response normalization
+         * \param src source data
+         * \param nsize neighbor size
+         * \return expression of pooled result
+         * \tparam Reducer reducer type
+         * \tparam SrcExp source expression
+         * \tparam etype type of expression
+         */
+        template<typename Reducer, typename SrcExp, int etype>
+        inline ChannelPoolingExp<Reducer,SrcExp, ExpInfo<SrcExp>::kDim > chpool( const Exp<SrcExp,etype> &src, index_t nsize ) {
+            // the result expression has the same number of dimensions as the source
+            typedef ChannelPoolingExp<Reducer, SrcExp, ExpInfo<SrcExp>::kDim> ResultExp;
+            // compile time check: the source must have at least 3 dimensions
+            TypeCheckPass< ExpInfo<SrcExp>::kDim >= 3 >::Error_Expression_Does_Not_Meet_Dimension_Req();
+            return ResultExp( src.self(), nsize );
+        }
+        // short cut functions
+        /*!
+         * \brief an expression that replicates a 1 dimension tensor for nrow times
+         * \param src Tensor<Device,1>: shape[0]
+         * \param nrow number of rows to replicate
+         * \return an expression with type Tensor<Device,2> shape[0], shape[1] = nrow
+         * \tparam Device which device it lies
+         */
+        template<typename Device>
+        inline Broadcast1DExp<Device,2,0> repmat( const Tensor<Device,1> &src, index_t nrow ){
+            // shape handed to broadcast: built from nrow and the length of src
+            const Shape<2> target = Shape2( nrow, src.shape[0] );
+            return broadcast<0>( src, target );
+        }
+        /*!
+         * \brief an expression that sums over the rows of a matrix
+         * \param exp input expression that must be a matrix Tensor<?,2>
+         * \return an expression with type Tensor<Device,1>
+         * \tparam SrcExp expression
+         * \tparam etype type of expression
+         */
+        template<typename SrcExp, int etype>
+        inline ReduceTo1DExp<SrcExp, red::sum, 0 > sum_rows( const Exp<SrcExp,etype> &exp ){
+            // shortcut: sum over everything except dimension 0
+            typedef ReduceTo1DExp<SrcExp, red::sum, 0> ResultExp;
+            const ResultExp reduced = sumall_except_dim<0>( exp );
+            return reduced;
+        }
+
+    }; // namespace expr
+}; // namespace mshadow
+
+// ==================================================
+//  implementations afterwards,
+//  no need to read if only use the functions
+// --------------------------------------------------
+namespace mshadow{
+    namespace expr{
+        /*! \brief evaluation of ReduceTo1DExp into a 1D tensor when the kept dimension is not 0 */
+        template<typename SV, typename Device, typename EType, typename Reducer, int dimkeep>
+        struct ExpComplexEngine< SV, Device, 1, ReduceTo1DExp<EType,Reducer,dimkeep> >
+        {
+            inline static void Eval( Tensor<Device,1> &dst, const ReduceTo1DExp<EType,Reducer,dimkeep> &exp )
+            {
+                // dimkeep == 0 is handled by a separate specialization
+                TypeCheckPass< dimkeep!=0 >::Error_Expression_Does_Not_Meet_Dimension_Req();
+                MapReduceKeepHighDim< SV, Reducer, dimkeep >( dst, exp.src_, exp.scale_ );
+            }
+        };
+
+        /*! \brief evaluation of ReduceTo1DExp into a 1D tensor when dimension 0 is kept */
+        template<typename SV, typename Device, typename EType, typename Reducer>
+        struct ExpComplexEngine< SV, Device, 1, ReduceTo1DExp<EType,Reducer,0> >
+        {
+            inline static void Eval( Tensor<Device,1> &dst, const ReduceTo1DExp<EType,Reducer,0> &exp )
+            {
+                MapReduceKeepLowest< SV, Reducer >( dst, exp.src_, exp.scale_ );
+            }
+        };
+    }; // namespace expr
+
+    namespace expr{
+        /*! \brief execution plan of Broadcast1DExp when the broadcast dimension is not 0 */
+        template<typename Device, int dimdst, int dimcast>
+        struct Plan< Broadcast1DExp<Device,dimdst,dimcast> >{
+        public:
+            Plan( const Broadcast1DExp<Device,dimdst,dimcast> &e )
+                : dptr_( e.src_.dptr ),
+                  ystride_( e.shape_.ProdShape( 1, dimcast ) ),
+                  length_( e.shape_[dimcast] ){
+                // dimcast == 0 is handled by a separate specialization
+                TypeCheckPass< dimcast!=0 >::Error_Expression_Does_Not_Meet_Dimension_Req();
+            }
+            /*! \brief the value depends only on the row index y; x is ignored */
+            MSHADOW_XINLINE real_t Eval( index_t y, index_t x ) const{
+                const index_t pos = ( y / ystride_ ) % length_;
+                return dptr_[ pos ];
+            }
+        private:
+            /*! \brief data of the 1D source tensor */
+            const real_t  *dptr_;
+            /*! \brief product of the shape entries in [1,dimcast) */
+            const index_t  ystride_;
+            /*! \brief extent of the broadcast dimension */
+            const index_t  length_;
+        };
+
+        /*! \brief execution plan of Broadcast1DExp when the broadcast dimension is 0 */
+        template<typename Device, int dimdst>
+        struct Plan< Broadcast1DExp<Device,dimdst,0> >{
+        public:
+            Plan( const Broadcast1DExp<Device,dimdst,0> &e )
+                : dptr_( e.src_.dptr ){
+            }
+            /*! \brief the value depends only on the column index x; y is ignored */
+            MSHADOW_XINLINE real_t Eval( index_t y, index_t x ) const{
+                return dptr_[ x ];
+            }
+        private:
+            /*! \brief data of the 1D source tensor */
+            const real_t *dptr_;
+        };
+    }; // namespace expr
+
+    namespace expr{
+        /*!
+         * \brief execution plan of UnpackPatchToColXExp
+         *  row index i encodes (channel, y offset in patch, x offset in patch),
+         *  column index j encodes (image index, patch row, patch column)
+         */
+        template<typename SrcExp, int srcdim>
+        struct Plan< UnpackPatchToColXExp<SrcExp,srcdim> >{
+        public:
+            Plan( const UnpackPatchToColXExp<SrcExp,srcdim> &e )
+                : src_( MakePlan( e.img_ ) ),
+                  psize_( e.psize_ ), pstride_( e.pstride_ ),
+                  i_channel_( e.i_channel_ ), i_height_( e.i_height_ ), i_width_( e.i_width_ ),
+                  o_height_( ( i_height_  - psize_ ) / pstride_ + 1 ),
+                  o_width_ ( ( i_width_   - psize_ ) / pstride_ + 1 ){
+            }
+            MSHADOW_XINLINE real_t Eval( index_t i, index_t j ) const{
+                // row index: i = ( c * psize_ + y_offset ) * psize_ + x_offset
+                const index_t x_offset = i % psize_;
+                const index_t y_offset = ( i / psize_ ) % psize_;
+                const index_t c        = ( i / psize_ ) / psize_;
+                // column index: j = ( n * o_height_ + patch_y ) * o_width_ + patch_x
+                const index_t patch_x = j % o_width_;
+                const index_t patch_y = ( j / o_width_ ) % o_height_;
+                const index_t n       = ( j / o_width_ ) / o_height_;
+                // position inside the source image
+                const index_t x = patch_x * pstride_ + x_offset;
+                const index_t y = patch_y * pstride_ + y_offset;
+                // positions outside of the source image read as zero
+                if( !( x < i_width_ && y < i_height_ ) ) return 0.0f;
+                return src_.Eval( ( n * i_channel_  + c ) * i_height_ + y, x );
+            }
+        private:
+            Plan<SrcExp> src_;
+            const index_t psize_, pstride_, i_channel_, i_height_, i_width_, o_height_, o_width_;
+        };
+
+        /*!
+         * \brief execution plan of PackColToPatchXExp
+         *  each output pixel is the sum of all entries of the column matrix whose patch covers that pixel
+         */
+        template<typename Device, int dstdim>
+        struct Plan< PackColToPatchXExp<Device, dstdim> >{
+        public:
+            Plan( const PackColToPatchXExp<Device, dstdim> &e )
+                : mat_( e.mat_ ), psize_( e.psize_ ), pstride_( e.pstride_ ),
+                  i_channel_( e.shape_[2] ), i_height_( e.shape_[1] ),
+                  o_width_ ( ( e.shape_[0]  - psize_ ) / pstride_ + 1 ),
+                  o_height_( ( e.shape_[1]  - psize_ ) / pstride_ + 1 ){
+                // note: i/o convention are same as unpack
+            }
+            MSHADOW_XINLINE real_t Eval( index_t i, index_t j ) const{
+                using namespace std;
+                // i = ( n * i_channel_ + c ) * i_height_ + y,  j = x
+                const index_t x = j;
+                const index_t y = i % i_height_;
+                const index_t c = ( i / i_height_ ) % i_channel_;
+                const index_t n = ( i / i_height_ ) / i_channel_;
+                // half open ranges [py_min,py_max) x [px_min,px_max) of patches that contain pixel (y,x)
+                const index_t py_min = y < psize_ ? 0 : (y-psize_+pstride_)/pstride_;
+                const index_t py_max = min( (y+pstride_)/pstride_, o_height_);
+                const index_t px_min = x < psize_ ? 0 : (x-psize_+pstride_)/pstride_;
+                const index_t px_max = min( (x+pstride_)/pstride_, o_width_ );
+                real_t sum = 0.0f;
+                for( index_t py = py_min; py < py_max; ++py ){
+                    for( index_t px = px_min; px < px_max; ++px ){
+                        // row: (channel, offset of the pixel inside patch), column: (image, patch position)
+                        const index_t row = (c * psize_ + y - py*pstride_) * psize_ + x - px*pstride_;
+                        const index_t col = (n * o_height_ + py) * o_width_+px;
+                        sum += mat_[ row ][ col ];
+                    }
+                }
+                return sum;
+            }
+        private:
+            Tensor<Device,2> mat_;
+            const index_t psize_, pstride_, i_channel_, i_height_, o_width_, o_height_;
+        };
+    };
+
+    namespace expr{
+        /*! \brief execution plan of ReshapeExp: maps a position of the output to the source through the flat index */
+        template<typename SrcExp, int dimdst, int dimsrc>
+        struct Plan< ReshapeExp<SrcExp,dimdst,dimsrc> >{
+        public:
+            Plan( const ReshapeExp<SrcExp,dimdst,dimsrc> &e )
+                : src_( MakePlan( e.src_ ) ),
+                  oshape0_( e.shape_[0] ),
+                  ishape0_( e.ishape0_ ){
+            }
+            MSHADOW_XINLINE real_t Eval( index_t y, index_t x ) const{
+                // flat position in the output, then split again by the lowest dimension of the source
+                const index_t flat = y * oshape0_ + x;
+                const index_t src_y = flat / ishape0_;
+                const index_t src_x = flat % ishape0_;
+                return src_.Eval( src_y, src_x );
+            }
+        private:
+            Plan<SrcExp> src_;
+            /*! \brief lowest dimension of the output shape and of the input shape */
+            const index_t oshape0_, ishape0_;
+        };
+        /*! \brief special execution plan of ReshapeExp for 1 dimensional source data */
+        template<typename SrcExp,int dimdst>
+        struct Plan< ReshapeExp<SrcExp,dimdst,1> >{
+        public:
+            Plan( const ReshapeExp<SrcExp,dimdst,1> &e )
+                : src_( MakePlan( e.src_ ) ),
+                  oshape0_( e.shape_[0] ){
+            }
+            MSHADOW_XINLINE real_t Eval( index_t y, index_t x ) const{
+                // the source is always read at row 0, the flat output position is the column
+                const index_t flat = y * oshape0_ + x;
+                return src_.Eval( 0, flat );
+            }
+        private:
+            Plan<SrcExp> src_;
+            /*! \brief lowest dimension of the output shape */
+            const index_t oshape0_;
+        };
+    };
+    
+    namespace expr{
+        /*! \brief execution plan of SwapAxisExp for the case where none of the swapped axes is specialized as 0 */
+        template<typename SrcExp,int dimsrc, int a1, int a2>
+        struct Plan< SwapAxisExp<SrcExp,dimsrc,a1,a2> >{
+        public:
+            Plan( const SwapAxisExp<SrcExp,dimsrc,a1,a2> &e )
+                : src_( MakePlan( e.src_ ) ),
+                  shape1_( e.shape_.ProdShape( 1, a1 ) ),
+                  shape2_( e.shape_[a1] ),
+                  shape3_( e.shape_.ProdShape( a1+1, a2 ) ),
+                  shape4_( e.shape_[a2] ){
+            }
+            MSHADOW_XINLINE real_t Eval( index_t i, index_t j ) const{
+                // peel the row index apart, lowest part first
+                index_t rest = i;
+                const index_t y = rest % shape1_;
+                rest /= shape1_;
+                const index_t z = rest % shape2_;
+                rest /= shape2_;
+                const index_t c = rest % shape3_;
+                rest /= shape3_;
+                const index_t n = rest % shape4_;
+                rest /= shape4_;
+                // rebuild the row index of the source with z and n swapped
+                return src_.Eval( ((( rest*shape2_ + z) * shape3_+c) * shape4_ + n ) * shape1_ + y, j );
+            }
+        private:
+            Plan<SrcExp> src_;
+            const index_t shape1_, shape2_, shape3_, shape4_;
+        };
+
+        /*! \brief execution plan of SwapAxisExp when the first swapped axis is 0 */
+        template<typename SrcExp,int dimsrc, int a2>
+        struct Plan< SwapAxisExp<SrcExp,dimsrc,0,a2> >{
+        public:
+            Plan( const SwapAxisExp<SrcExp,dimsrc,0,a2> &e )
+                : src_( MakePlan( e.src_ ) ),
+                  shape0_( e.shape_[0] ),
+                  shape1_( e.shape_.ProdShape( 1, a2 ) ),
+                  shape2_( e.shape_[a2] ){
+            }
+            MSHADOW_XINLINE real_t Eval( index_t i, index_t x ) const{
+                // split the row index into (n, z, y), lowest part last
+                const index_t upper = i / shape1_;
+                const index_t y = i % shape1_;
+                const index_t z = upper % shape2_;
+                const index_t n = upper / shape2_;
+                // swap x and z when addressing the source
+                return src_.Eval( ( n*shape0_ + x ) * shape1_ + y , z );
+            }
+        private:
+            Plan<SrcExp> src_;
+            const index_t shape0_, shape1_, shape2_;
+        };
+    };
+
+    namespace expr{
+        /*! \brief execution plan of PoolingExp: reduces one window of the source per output position */
+        template<typename Reducer, typename SrcExp, int srcdim>
+        struct Plan< PoolingExp< Reducer, SrcExp, srcdim> > {
+        public:
+            Plan( const PoolingExp<Reducer, SrcExp, srcdim> &e )
+                : src_( MakePlan( e.src_ ) ),
+                  ksize_( e.ksize_ ), kstride_( e.kstride_ ),
+                  src_height_( e.src_height_ ), src_width_( e.src_width_ ),
+                  new_height_( e.shape_[1] ) {
+            }
+            MSHADOW_XINLINE real_t Eval(index_t i, index_t j) const {
+                using namespace std;
+                // i = c * new_height_ + py,  j = px
+                const index_t c  = i / new_height_;
+                const index_t py = i % new_height_;
+                const index_t px = j;
+                // window [y_begin,y_stop) x [x_begin,x_stop), clipped at the border of the source
+                const index_t y_begin = py * kstride_;
+                const index_t y_stop  = min( y_begin + ksize_, src_height_ );
+                const index_t x_begin = px * kstride_;
+                const index_t x_stop  = min( x_begin + ksize_, src_width_ );
+
+                real_t acc = Reducer::kInitV;
+                for (index_t y = y_begin; y < y_stop; ++y) {
+                    for (index_t x = x_begin; x < x_stop; ++x) {
+                        Reducer::Reduce( acc, src_.Eval( c*src_height_+y, x ) );
+                    }
+                }
+                return acc;
+            }
+        private:
+            Plan<SrcExp> src_;
+            const index_t ksize_, kstride_;
+            const index_t src_height_, src_width_;
+            const index_t new_height_;
+        };
+
+        /*!
+         * \brief execution plan of UnPoolingExp
+         *  for one source position, accumulates the gradient of every pooled position whose window covers it
+         */
+        template<typename Reducer, typename Device>
+        struct Plan<UnPoolingExp<Reducer, Device> > {
+        public:
+            Plan(const UnPoolingExp<Reducer, Device> &e)
+                : data_src_( e.data_src_ ),
+                  data_pooled_( e.data_pooled_ ),
+                  grad_pooled_( e.grad_pooled_ ),
+                  ksize_( e.ksize_ ), kstride_( e.kstride_ ) {
+            }
+            MSHADOW_XINLINE real_t Eval(index_t i, index_t j) const {
+                using namespace std;
+                // i = c * height + y,  j = x; height is shape[1] of the source
+                const index_t src_height = data_src_.shape[1];
+                const index_t c = i / src_height;
+                const index_t y = i % src_height;
+                const index_t x = j;
+                const real_t vsrc = data_src_[0][c][y][x];
+
+                // half open ranges of pooled positions whose window contains (y,x)
+                const index_t py_min = y < ksize_ ? 0 : (y-ksize_+kstride_)/kstride_;
+                const index_t py_max = min( (y+kstride_)/kstride_, data_pooled_.shape[1]);
+                const index_t px_min = x < ksize_ ? 0 : (x-ksize_+kstride_)/kstride_;
+                const index_t px_max = min( (x+kstride_)/kstride_, data_pooled_.shape[0]);
+
+                real_t grad = 0;
+                for( index_t py = py_min; py < py_max; ++py ){
+                    for( index_t px = px_min; px < px_max; ++px ){
+                        grad += Reducer::PartialGrad(vsrc, data_pooled_[0][c][py][px]) * grad_pooled_[0][c][py][px];
+                    }
+                }
+                return grad;
+            }
+        private:
+            Tensor<Device, 4> data_src_, data_pooled_, grad_pooled_;
+            const index_t ksize_;
+            const index_t kstride_;
+        };
+    }; // namespace expr
+
+    namespace expr{
+        /*! \brief execution plan of PaddingExp: zero inside the border, source value everywhere else */
+        template<typename SrcExp, int srcdim>
+        struct Plan< PaddingExp<SrcExp, srcdim> > {
+        public:
+            Plan(const PaddingExp<SrcExp, srcdim> &e)
+                : src_( MakePlan( e.src_ ) ),
+                  pad_( e.pad_ ),
+                  new_height_( e.shape_[1] ),
+                  src_height_( e.src_height_ ),
+                  src_width_( e.src_width_ ) {
+            }
+            MSHADOW_XINLINE real_t Eval(index_t i, index_t j) const {
+                // i = c * new_height_ + y,  j = x
+                const index_t c = i / new_height_;
+                const index_t y = i % new_height_;
+                const index_t x = j;
+                // leading border; must be tested first because index_t cannot hold y - pad_ when y < pad_
+                if (y < pad_ || x < pad_) return 0.0f;
+                const index_t h = y - pad_;
+                const index_t w = x - pad_;
+                // trailing border
+                if (h >= src_height_ || w >= src_width_) return 0.0f;
+                return src_.Eval(c * src_height_ + h, w);
+            }
+        private:
+            Plan<SrcExp> src_;
+            const index_t pad_;
+            const index_t new_height_;
+            const index_t src_height_;
+            const index_t src_width_;
+        };
+
+        /*! \brief execution plan of CroppingExp: reads the source shifted by the crop offsets */
+        template<typename SrcExp, int srcdim>
+        struct Plan<CroppingExp<SrcExp, srcdim> > {
+        public:
+            Plan(const CroppingExp<SrcExp, srcdim> &e)
+                : src_( MakePlan( e.src_ ) ),
+                  pad_height_( e.pad_height_ ), pad_width_( e.pad_width_ ),
+                  new_height_( e.shape_[1] ),
+                  src_height_( e.src_height_ ) {
+            }
+            MSHADOW_XINLINE real_t Eval(index_t i, index_t j) const {
+                // i = c * new_height_ + y,  j = x
+                const index_t c = i / new_height_;
+                const index_t src_y = ( i % new_height_ ) + pad_height_;
+                const index_t src_x = j + pad_width_;
+                return src_.Eval(c * src_height_ + src_y, src_x);
+            }
+        private:
+            Plan<SrcExp> src_;
+            const index_t pad_height_, pad_width_;
+            const index_t new_height_;
+            const index_t src_height_;
+        };
+
+        /*! \brief execution plan of MirroringExp: reads every row of the source in reverse column order */
+        template<typename SrcExp, int srcdim>
+        struct Plan< MirroringExp<SrcExp, srcdim> > {
+        public:
+            Plan(const MirroringExp<SrcExp, srcdim> &e)
+                : src_( MakePlan( e.src_ ) ),
+                  width_( e.shape_[0] ) {
+            }
+            MSHADOW_XINLINE real_t Eval(index_t i, index_t j) const {
+                const index_t mirrored = width_ - j - 1;
+                return src_.Eval( i, mirrored );
+            }
+        private:
+            Plan<SrcExp> src_;
+            /*! \brief lowest dimension of the expression shape */
+            const index_t width_;
+        };
+    }; // namespace expr
+
+    namespace expr{
+        /*! \brief execution plan of ChannelPoolingExp: reduces over a window of neighboring channels */
+        template<typename Reducer, typename SrcExp, int srcdim>
+        struct Plan< ChannelPoolingExp< Reducer, SrcExp, srcdim> > {
+        public:
+            Plan( const ChannelPoolingExp<Reducer, SrcExp, srcdim> &e )
+                : src_( MakePlan( e.src_ ) ),
+                  channel_( e.shape_[2] ), height_( e.shape_[1] ), width_( e.shape_[0] ),
+                  hnsize_( e.nsize_/2 ){
+            }
+            MSHADOW_XINLINE real_t Eval(index_t i, index_t j) const {
+                using namespace std;
+                // i = ( n * channel_ + c ) * height_ + y,  j = x
+                const index_t upper = i / height_;
+                const index_t y = i % height_;
+                const index_t c = upper % channel_;
+                const index_t n = upper / channel_;
+                const index_t x = j;
+                // channel window [c_begin,c_stop): hnsize_ channels on each side of c, clipped to valid channels
+                const index_t c_begin = c < hnsize_ ? 0  : c - hnsize_;
+                const index_t c_stop  = min( c + hnsize_ + 1, channel_ );
+                real_t acc = Reducer::kInitV;
+                for( index_t cc = c_begin; cc < c_stop; ++ cc ){
+                    Reducer::Reduce( acc, src_.Eval( (n*channel_+cc)*height_ + y, x ) );
+                }
+                return acc;
+            }
+        private:
+            Plan<SrcExp> src_;
+            const index_t channel_, height_, width_, hnsize_;
+        };
+    };
+}; // namespace mshadow
+
+#if MSHADOW_USE_SSE
+// implementations of SSE support, if possible
+#include "tensor_sse-inl.hpp"
+namespace mshadow{
+    namespace expr{
+        /*! \brief marks Broadcast1DExp on cpu with broadcast dimension 0 as passing the SSE check */
+        template<int dimdst>
+        struct SSECheck< Broadcast1DExp<cpu,dimdst,0> >
+        {
+            static const bool kPass = true;
+        };
+        /*! \brief alignment check of Broadcast1DExp: tests the data pointer of the 1D source tensor */
+        template<int dimdst>
+        struct SSEAlignCheck<2, Broadcast1DExp<cpu,dimdst,0> >
+        {
+            inline static bool Check( const Broadcast1DExp<cpu,dimdst,0> &exp )
+            {
+                const bool aligned = sse2::CheckAlign( exp.src_.dptr );
+                return aligned;
+            }
+        };
+        /*! \brief SSE execution plan of Broadcast1DExp on cpu with broadcast dimension 0 */
+        template<int dimdst>
+        class SSEPlan< Broadcast1DExp<cpu,dimdst,0> >{
+        public:
+            SSEPlan( const Broadcast1DExp<cpu,dimdst,0> &t )
+                : dptr_( t.src_.dptr ){
+            }
+            /*! \brief vector evaluation: builds an FVec from the address of element x; y is ignored */
+            MSHADOW_CINLINE sse2::FVec<real_t> EvalSSE( index_t y, index_t x ) const{
+                const real_t *src = &dptr_[ x ];
+                return sse2::FVec<real_t>( src );
+            }
+            /*! \brief scalar evaluation: returns element x; y is ignored */
+            MSHADOW_CINLINE real_t Eval( index_t y, index_t x ) const{
+                return dptr_[ x ];
+            }
+        private:
+            /*! \brief data of the 1D source tensor */
+            const real_t  *dptr_;
+        };
+    };
+};
+#endif
+
+#endif
+

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/b2dc51d2/include/mshadow/tensor_gpu-inl.hpp
----------------------------------------------------------------------
diff --git a/include/mshadow/tensor_gpu-inl.hpp b/include/mshadow/tensor_gpu-inl.hpp
new file mode 100644
index 0000000..a2c1fc4
--- /dev/null
+++ b/include/mshadow/tensor_gpu-inl.hpp
@@ -0,0 +1,148 @@
+#ifndef MSHADOW_TENSOR_GPU_INL_HPP
+#define MSHADOW_TENSOR_GPU_INL_HPP
+/*!
+ * \file tensor_gpu-inl.hpp
+ * \brief implementation of GPU host code
+ * \author Bing Xu, Tianqi Chen
+ */
+#include "tensor.h"
+
+#if !(MSHADOW_USE_CUDA)
+namespace mshadow {
+    // do nothing if no GPU operation is involved
+    /*! \brief version used when MSHADOW_USE_CUDA is off: does nothing, dev_id is ignored */
+    inline void InitTensorEngine( int dev_id ){
+    }
+    /*! \brief version used when MSHADOW_USE_CUDA is off: does nothing */
+    inline void ShutdownTensorEngine( void ){
+    }
+};
+#else
+namespace mshadow {
+    #if (MSHADOW_USE_NVML)
+    /*!
+     * \brief placeholder for automatic device selection, currently always picks device 0
+     * \param device_count number of devices, not used at the moment
+     * \return id of the selected device
+     */
+    inline int AutoSelectDevice(int device_count) {
+        // TODO nvml device id and cuda device id are not consistent
+        return 0;
+    }
+    #endif
+    /*!
+     * \brief select a CUDA device, print its name and initialize cuBLAS
+     *  every failing step stops in utils::Assert
+     * \param dev_id id of the device to use; a negative value selects device 0,
+     *        or the result of AutoSelectDevice when MSHADOW_USE_NVML is on
+     */
+    inline void InitTensorEngine(int dev_id){
+        int device_count = 0;
+        cudaGetDeviceCount(&device_count);
+        utils::Assert(device_count > 0, "Cannot find CUDA device. Please check CUDA-Configuration");
+        int device_id = 0;
+        if (dev_id < 0) {
+            #if (MSHADOW_USE_NVML)
+            device_id = AutoSelectDevice(device_count);
+            #endif
+        } else {
+            device_id = dev_id;
+        }
+        utils::Assert( device_id < device_count, "Incorrect Device ID" );
+        utils::Assert( cudaSetDevice(device_id) == cudaSuccess, "cannot set device" );
+        // only read prop after the query is known to have succeeded, otherwise prop.name is uninitialized
+        cudaDeviceProp prop;
+        cudaError_t err = cudaGetDeviceProperties(&prop, device_id);
+        utils::Assert( err == cudaSuccess, cudaGetErrorString(err) );
+        printf("Use CUDA Device %d: %s\n", device_id, prop.name);
+        // report a cuBLAS start-up failure here instead of in the first BLAS call
+        utils::Assert( cublasInit() == CUBLAS_STATUS_SUCCESS, "cannot initialize cuBLAS" );
+    }
+    /*! \brief shut down cuBLAS, counterpart of the cublasInit call in InitTensorEngine */
+    inline void ShutdownTensorEngine( void ){
+        cublasShutdown();
+    }
+
+    /*!
+     * \brief allocate device memory for a gpu tensor and set its stride
+     * \param obj tensor whose shape is already set; dptr and shape.stride_ are written
+     * \param pad whether rows may be padded; padding is only used when shape[0] is large enough
+     */
+    template<int dim>
+    inline void AllocSpace(Tensor<gpu,dim> &obj, bool pad){
+        size_t pitch;
+        // common choice for cuda mem align unit is 32
+        const bool use_pitch = pad && obj.shape[0] >= MSHADOW_MIN_PAD_RATIO * 32;
+        if( !use_pitch ){
+            // compact layout: the whole tensor is requested as a single row
+            obj.shape.stride_ = obj.shape[0];
+            cudaError_t err = cudaMallocPitch( (void**)&obj.dptr, &pitch, \
+                                               obj.shape.Size() * sizeof(real_t), 1 );
+            utils::Assert( err == cudaSuccess, cudaGetErrorString(err) );
+            return;
+        }
+        // padded layout: one row per row of the flattened 2D view, stride is the pitch in elements
+        cudaError_t err = cudaMallocPitch( (void**)&obj.dptr, &pitch, \
+                                           obj.shape[0] * sizeof(real_t), obj.FlatTo2D().shape[1] );
+        utils::Assert( err == cudaSuccess, cudaGetErrorString(err) );
+        obj.shape.stride_ = static_cast<index_t>( pitch / sizeof(real_t) );
+    }
+
+    /*! \brief release the device memory of a gpu tensor and reset its data pointer to NULL */
+    template<int dim>
+    inline void FreeSpace(Tensor<gpu,dim> &obj){
+        cudaFree( obj.dptr );
+        obj.dptr = NULL;
+    }
+
+    /*!
+     * \brief copy between two tensors of the same shape through cudaMemcpy2D on their flattened 2D views
+     * \param _dst destination tensor
+     * \param _src source tensor
+     * \param kind direction of the copy
+     */
+    template<typename A,typename B, int dim>
+    inline void Copy(Tensor<A,dim> _dst, Tensor<B,dim> _src, cudaMemcpyKind kind){
+        utils::Assert( _dst.shape == _src.shape, "Copy:shape mismatch" );
+        Tensor<A,2> dst = _dst.FlatTo2D();
+        Tensor<B,2> src = _src.FlatTo2D();
+        // pitches and row width in bytes; the strides of the two sides may differ
+        const size_t dst_pitch = dst.shape.stride_ * sizeof(real_t);
+        const size_t src_pitch = src.shape.stride_ * sizeof(real_t);
+        const size_t row_bytes = dst.shape[0] * sizeof(real_t);
+        cudaError_t err = cudaMemcpy2D( dst.dptr, dst_pitch,
+                                        src.dptr, src_pitch,
+                                        row_bytes, dst.shape[1], kind );
+        utils::Assert( err == cudaSuccess, cudaGetErrorString(err) );
+    }
+    /*! \brief copy a gpu tensor into a cpu tensor of the same shape */
+    template<int dim>
+    inline void Copy(Tensor<cpu,dim> dst, const Tensor<gpu,dim> &src){
+        const cudaMemcpyKind kind = cudaMemcpyDeviceToHost;
+        Copy( dst, src, kind );
+    }
+    /*! \brief copy a gpu tensor into another gpu tensor of the same shape */
+    template<int dim>
+    inline void Copy(Tensor<gpu,dim> dst, const Tensor<gpu,dim> &src){
+        const cudaMemcpyKind kind = cudaMemcpyDeviceToDevice;
+        Copy( dst, src, kind );
+    }
+    /*! \brief copy a cpu tensor into a gpu tensor of the same shape */
+    template<int dim>
+    inline void Copy(Tensor<gpu,dim> dst, const Tensor<cpu,dim> &src){
+        const cudaMemcpyKind kind = cudaMemcpyHostToDevice;
+        Copy( dst, src, kind );
+    }
+};
+
+#ifdef __CUDACC__
+// the following part is included only if compiler is nvcc
+#include "cuda/tensor_gpu-inl.cuh"
+
+namespace mshadow{
+    /*! \brief run a plan on the flattened 2D view of a gpu tensor, results are stored with Saver */
+    template<typename Saver, typename E, int dim>
+    inline void MapPlan(Tensor<gpu,dim> _dst, const expr::Plan<E> &plan){
+        Tensor<gpu,2> dst = _dst.FlatTo2D();
+        cuda::MapPlan<Saver>( dst, plan );
+    }
+
+    /*!
+     * \brief evaluate an expression into a gpu tensor
+     *  type compatibility is checked at compile time, shape compatibility at run time
+     * \param dst destination tensor
+     * \param exp expression to be evaluated
+     */
+    template<typename Saver, int dim, typename E, int etype>
+    inline void MapExp(Tensor<gpu,dim> dst, const expr::Exp<E,etype> &exp ){
+        using namespace expr;
+        TypeCheckPass< TypeCheck<gpu,dim,E>::kMapPass >::Error_All_Tensor_in_Exp_Must_Have_Same_Type();
+        // a shape whose lowest dimension is 0 is accepted without comparing against the target
+        Shape<dim> eshape = ShapeCheck<dim,E>::Check( exp.self() );
+        const bool shape_ok = eshape[0] == 0 || eshape == dst.shape;
+        utils::Assert( shape_ok, "Assignment: Shape of Tensors in expression is not consistent with target" );
+        MapPlan<Saver>( dst, MakePlan( exp.self() ) );
+    }
+
+    /*!
+     * \brief reduce an expression to a 1D gpu tensor, keeping the lowest dimension
+     * \param dst destination tensor, its length must equal the lowest dimension of the expression
+     * \param exp expression to be reduced
+     * \param scale scale passed on to the reduction kernel
+     */
+    template<typename Saver, typename Reducer, typename E, int etype>
+    inline void MapReduceKeepLowest( Tensor<gpu,1> dst, const expr::Exp<E,etype> &exp, real_t scale ){
+        using namespace expr;
+        TypeCheckPass< TypeCheck<gpu,1,E>::kRedPass >::Error_TypeCheck_Not_Pass_For_Reduce_Exp();
+        // shape of the expression, flattened to 2D
+        Shape<2> eshape = ShapeCheck< ExpInfo<E>::kDim, E >::Check( exp.self() ).FlatTo2D();
+
+        const bool length_ok = eshape[0] == dst.shape[0];
+        utils::Assert( length_ok, "reduction dimension do not match" );
+        const bool non_empty = eshape[1] != 0;
+        utils::Assert( non_empty, "can not reduce over empty tensor" );
+        cuda::MapReduceKeepLowest<Saver,Reducer>( dst, MakePlan( exp.self() ), scale, eshape );
+    }
+
+    /*!
+     * \brief reduce an expression to a 1D gpu tensor, keeping dimension dimkeep
+     * \param dst destination tensor, its length must equal dimension dimkeep of the expression
+     * \param exp expression to be reduced
+     * \param scale scale passed on to the reduction kernel
+     */
+    template<typename Saver, typename Reducer, int dimkeep, typename E, int etype>
+    inline void MapReduceKeepHighDim( Tensor<gpu,1> dst, const expr::Exp<E,etype> &exp, real_t scale ){
+        using namespace expr;
+        TypeCheckPass< TypeCheck<gpu,dimkeep,E>::kRedPass >::Error_TypeCheck_Not_Pass_For_Reduce_Exp();
+        typedef Shape< ExpInfo<E>::kDim > EShape;
+        EShape eshape = ShapeCheck< ExpInfo<E>::kDim, E >::Check( exp.self() );
+        utils::Assert( eshape[dimkeep] == dst.shape[0], "reduction dimension do not match" );
+        // use the equivalent 4D form: (dimensions above dimkeep, dimkeep, dimensions in [1,dimkeep), dimension 0)
+        const index_t above = eshape.ProdShape( dimkeep+1, EShape::kMaxShape );
+        const index_t kept  = eshape[dimkeep];
+        const index_t below = eshape.ProdShape( 1, dimkeep );
+        Shape<4> pshape = Shape4( above, kept, below, eshape[0] );
+        // call the equivalent reduction that keeps dimension 2
+        cuda::MapReduceKeepDim2<Saver,Reducer>( dst, MakePlan( exp.self() ), scale, pshape );
+    }
+
+    /*! \brief gpu version of Softmax, forwards dst and src to cuda::Softmax */
+    inline void Softmax( Tensor<gpu,2> dst, const Tensor<gpu,2>& src ){
+        cuda::Softmax( dst, src );
+    }
+}; // namespace mshadow
+
+#endif // __CUDACC__
+
+#endif // MSHADOW_USE_CUDA
+#endif // TENSOR_GPU_INL_HPP

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/b2dc51d2/include/mshadow/tensor_io.h
----------------------------------------------------------------------
diff --git a/include/mshadow/tensor_io.h b/include/mshadow/tensor_io.h
new file mode 100644
index 0000000..2ce28b3
--- /dev/null
+++ b/include/mshadow/tensor_io.h
@@ -0,0 +1,137 @@
+#ifndef MSHADOW_TENSOR_IO_H
+#define MSHADOW_TENSOR_IO_H
+/*!
+ * \file tensor_io.h
+ * \brief definitions of I/O functions for mshadow tensor
+ * \author Tianqi Chen
+ */
+#include <cstdio>
+#include "tensor.h"
+
+namespace mshadow{
+    namespace utils{
+        /*!
+         * \brief interface of stream I/O, used to serialize data.
+         *   SaveBinary/LoadBinary are not restricted to this interface:
+         *   they are templated on the stream type and accept any class
+         *   that implements Read and Write
+         */
+        class IStream{
+        public:
+            /*! 
+             * \brief read data from stream
+             * \param ptr pointer to memory buffer that receives the data
+             * \param size size of block, in bytes
+             * \return usually the size of the data read;
+             *   LoadBinary in this file treats a return value of 0 as failure
+             */
+            virtual size_t Read( void *ptr, size_t size ) = 0;        
+            /*! 
+             * \brief write data to stream
+             * \param ptr pointer to memory buffer holding the data
+             * \param size size of block, in bytes
+             */
+            virtual void Write( const void *ptr, size_t size ) = 0;
+            /*! \brief virtual destructor, so implementations can be destroyed through an IStream pointer */
+            virtual ~IStream( void ){}
+        };
+    };
+    
+    /*!
+     * \brief CPU/GPU: save a tensor in binary format; for the GPU version, a temporary Tensor<cpu,dim> storage will be allocated
+     *       the shape entries are written first, followed by the tensor data
+     * \param fo output binary stream
+     * \param src source tensor to be saved
+     * \tparam dim dimension of tensor
+     * \tparam TStream type of stream, needs to support Read, Write; one example is utils::IStream.
+     */
+    template<int dim,typename TStream>
+    inline void SaveBinary( TStream &fo, const Tensor<cpu,dim> &src );
+    /*! \brief GPU version, refer to the comment of the cpu version \sa SaveBinary */
+    template<int dim,typename TStream>
+    inline void SaveBinary( TStream &fo, const Tensor<gpu,dim> &src );
+
+    /*!
+     * \brief CPU/GPU: load a tensor from binary format; for the GPU version, a temporary Tensor<cpu,dim> storage will be allocated
+     *       if pre_alloc is true, space in dst is already allocated and dst must have the same shape as the tensor loaded
+     *       if pre_alloc is false, dst does not have space allocated yet and LoadBinary will allocate space for dst
+     * \param fi input binary stream
+     * \param dst destination tensor
+     * \param pre_alloc whether space is pre-allocated; if false, space allocation will happen
+     * \tparam dim dimension of tensor
+     * \tparam TStream type of stream, needs to support Read, Write; one example is utils::IStream.
+     */
+    template<int dim,typename TStream>
+    inline void LoadBinary( TStream &fi, Tensor<cpu,dim> &dst, bool pre_alloc );
+    /*! \brief GPU version, refer to the comment of the cpu version \sa LoadBinary */
+    template<int dim,typename TStream>
+    inline void LoadBinary( TStream &fi, Tensor<gpu,dim> &dst, bool pre_alloc );
+    
+    namespace utils{
+        /*!
+         * \brief implementation of file i/o stream on top of a C stdio FILE handle.
+         *   The stream neither opens the file nor closes it on destruction;
+         *   call Close explicitly when done.
+         */
+        class FileStream: public IStream{
+        public:
+            /*!
+             * \brief constructor
+             * \param fp file handle that Read/Write/Close will operate on
+             */
+            FileStream( FILE *fp ):fp_(fp){}
+            /*!
+             * \brief read one block of exactly size bytes from the file
+             * \param ptr pointer to memory buffer that receives the data
+             * \param size size of block, in bytes
+             * \return size when the whole block was read, 0 otherwise
+             *   (so a size of 0 always yields 0)
+             */
+            virtual size_t Read( void *ptr, size_t size ){
+                // ask for the block as a single item so that a short read counts as a failure,
+                // then report bytes rather than the fread item count, as IStream::Read describes
+                return fread( ptr, size, 1, fp_ ) == 1 ? size : 0;
+            }
+            /*!
+             * \brief write one block of size bytes to the file
+             * \param ptr pointer to memory buffer holding the data
+             * \param size size of block, in bytes
+             * NOTE(review): the result of fwrite is not checked, write errors go unreported.
+             */
+            virtual void Write( const void *ptr, size_t size ){
+                fwrite( ptr, size, 1, fp_ );
+            }
+            /*! \brief close file; further calls to Close are no-ops */
+            inline void Close( void ){
+                // drop the handle after closing so that a second Close cannot fclose it twice
+                if( fp_ != NULL ){
+                    fclose( fp_ );
+                    fp_ = NULL;
+                }
+            }
+        private:
+            /*! \brief underlying file handle, NULL once Close has been called */
+            FILE *fp_;
+        };
+    };
+};
+
+namespace mshadow{
+    // implementations
+    /*!
+     * \brief CPU implementation of SaveBinary: writes the dim shape entries,
+     *   then the data of the tensor flattened to 2-D, one row per Write call
+     * \param fo output binary stream
+     * \param src_ source tensor
+     */
+    template<int dim, typename TStream>
+    inline void SaveBinary( TStream &fo, const Tensor<cpu,dim> &src_ ){
+        // header: the raw shape array
+        fo.Write( src_.shape.shape_, sizeof(index_t) * dim );
+        // payload: shape[1] rows of shape[0] elements each
+        Tensor<cpu,2> mat = src_.FlatTo2D();
+        const size_t row_bytes = sizeof(real_t) * mat.shape[0];
+        const index_t nrow = mat.shape[1];
+        for( index_t r = 0; r < nrow; ++r ){
+            fo.Write( mat[r].dptr, row_bytes );
+        }
+    }
+    /*!
+     * \brief GPU implementation of SaveBinary: copies the tensor into a temporary
+     *   CPU tensor of the same shape and saves that with the CPU version
+     * \param fo output binary stream
+     * \param src source GPU tensor
+     */
+    template<int dim, typename TStream>
+    inline void SaveBinary( TStream &fo, const Tensor<gpu,dim> &src ){
+        // temporary CPU tensor with the shape of src
+        Tensor<cpu,dim> host_copy( src.shape );
+        AllocSpace( host_copy );
+        Copy( host_copy, src );
+        // delegate the actual writing to the cpu overload
+        SaveBinary( fo, host_copy );
+        // release the temporary
+        FreeSpace( host_copy );
+    }
+
+    /*!
+     * \brief CPU implementation of LoadBinary: reads the stored shape, then the data row by row.
+     *   A Read that returns 0 is treated as a failure.
+     * \param fi input binary stream
+     * \param dst_ destination tensor; allocated here when pre_alloc is false
+     * \param pre_alloc when true, dst_ must already have the stored shape
+     */
+    template<int dim, typename TStream>
+    inline void LoadBinary( TStream &fi, Tensor<cpu,dim> &dst_, bool pre_alloc ){
+        // header: the stored shape comes first
+        Shape<dim> stored;
+        utils::Assert( fi.Read( stored.shape_, sizeof(index_t) * dim ) != 0, "mshadow::LoadBinary" );
+        if( !pre_alloc ){
+            // take over the stored shape and allocate space for it
+            dst_.shape = stored;
+            AllocSpace( dst_ );
+        }else{
+            // storage was provided by the caller: the shapes have to agree
+            utils::Assert( stored == dst_.shape );
+        }
+        Tensor<cpu,2> mat = dst_.FlatTo2D();
+        // rows of length zero carry no data, so there is nothing left to read
+        if( mat.shape[0] == 0 ) return;
+        const size_t row_bytes = sizeof(real_t) * mat.shape[0];
+        for( index_t r = 0; r < mat.shape[1]; ++r ){
+            utils::Assert( fi.Read( mat[r].dptr, row_bytes ) != 0, "mshadow::LoadBinary" );
+        }
+    }
+    /*!
+     * \brief GPU implementation of LoadBinary: loads into a temporary CPU tensor
+     *   with the CPU version, then copies the result to dst
+     * \param fi input binary stream
+     * \param dst destination GPU tensor; allocated here when pre_alloc is false
+     * \param pre_alloc when true, dst must already have the shape of the loaded tensor
+     */
+    template<int dim, typename TStream>
+    inline void LoadBinary( TStream &fi, Tensor<gpu,dim> &dst, bool pre_alloc ){
+        // stage the data on the CPU; the cpu overload allocates host_buf
+        Tensor<cpu,dim> host_buf;
+        LoadBinary( fi, host_buf, false );
+        if( !pre_alloc ){
+            // give dst the loaded shape and allocate space for it
+            dst.shape = host_buf.shape;
+            AllocSpace( dst );
+        }else{
+            // storage was provided by the caller: the shapes have to agree
+            utils::Assert( host_buf.shape == dst.shape );
+        }
+        Copy( dst, host_buf );
+        // release the staging tensor
+        FreeSpace( host_buf );
+    }
+};
+#endif // TENSOR_IO_H