Posted to commits@singa.apache.org by wa...@apache.org on 2016/06/03 07:48:16 UTC

[11/60] incubator-singa git commit: SINGA-163 - Reorganize the project folder layout

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/dd1e4afa/include/singa/utils/cluster_rt.h
----------------------------------------------------------------------
diff --git a/include/singa/utils/cluster_rt.h b/include/singa/utils/cluster_rt.h
deleted file mode 100644
index 4ab48bd..0000000
--- a/include/singa/utils/cluster_rt.h
+++ /dev/null
@@ -1,105 +0,0 @@
-/************************************************************
-*
-* Licensed to the Apache Software Foundation (ASF) under one
-* or more contributor license agreements.  See the NOTICE file
-* distributed with this work for additional information
-* regarding copyright ownership.  The ASF licenses this file
-* to you under the Apache License, Version 2.0 (the
-* "License"); you may not use this file except in compliance
-* with the License.  You may obtain a copy of the License at
-* 
-*   http://www.apache.org/licenses/LICENSE-2.0
-* 
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an
-* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-* KIND, either express or implied.  See the License for the
-* specific language governing permissions and limitations
-* under the License.
-*
-*************************************************************/
-
-#ifndef SINGA_UTILS_CLUSTER_RT_H_
-#define SINGA_UTILS_CLUSTER_RT_H_
-
-#include <map>
-#include <mutex>
-#include <string>
-#include <vector>
-
-namespace singa {
-
-typedef void (*rt_callback)(void *context);
-
-struct RTCallback {
-  rt_callback fn;
-  void* ctx;
-};
-
-/**
- * ClusterRuntime is a runtime service that manages dynamic configuration
- * and status of the whole cluster. It mainly provides the following services:
- *    1)  Provide running status of each server/worker
- *    2)  Translate process id to (hostname:port)
- */
-class ClusterRuntime {
- public:
-  // ClusterRuntime has different implementations, selected at compile time
-  static ClusterRuntime* Create(const std::string& host, int job_id);
-
-  virtual ~ClusterRuntime() {}
-  /**
-   * Initialize the runtime instance
-   */
-  virtual bool Init() = 0;
-  /**
-   * register the process, and get a unique process id
-   *
-   * \return the process id, -1 if failed
-   */
-  virtual int RegistProc(const std::string& host_addr, int pid) = 0;
-  /**
-   * translate the process id to host address
-   *
-   * \return the host and port, "" if no such proc id 
-   */
-  virtual std::string GetProcHost(int proc_id) = 0;
-  /**
-   * Server: watch all workers in a server group,
-   * will be notified when all workers have left
-   */
-  virtual bool WatchSGroup(int gid, int sid, rt_callback fn, void* ctx) = 0;
-  /**
-   * Worker: join a server group (i.e. start to read/update these servers)
-   */
-  virtual bool JoinSGroup(int gid, int wid, int s_group) = 0;
-  /**
- * Worker: leave a server group (i.e. finish all its work)
-   */
-  virtual bool LeaveSGroup(int gid, int wid, int s_group) = 0;
-};
-
-/*
- * A ClusterRuntime implementation for single-process environment
- */
-class SPClusterRT : public ClusterRuntime {
- public:
-  ~SPClusterRT();
-
-  bool Init() override;
-  int RegistProc(const std::string& host_addr, int pid) override;
-  std::string GetProcHost(int proc_id) override;
-  bool WatchSGroup(int gid, int sid, rt_callback fn, void* ctx) override;
-  bool JoinSGroup(int gid, int wid, int s_group) override;
-  bool LeaveSGroup(int gid, int wid, int s_group) override;
-
- private:
-  std::vector<std::string> proc_list_;
-  std::map<int, std::vector<RTCallback*>> grp_callbacks_;
-  std::map<int, int> grp_count_;
-  std::mutex lock_;
-};
-
-}  // namespace singa
-
-#endif  // SINGA_UTILS_CLUSTER_RT_H_
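
For reference, a minimal sketch (not part of this commit) of how the removed ClusterRuntime interface above is typically driven; the host string, ids and callback below are hypothetical placeholders.

#include <iostream>
#include <string>

#include "singa/utils/cluster_rt.h"

// Matches the rt_callback signature; invoked once all workers have left.
void OnAllWorkersLeft(void* ctx) {
  std::cout << "server group " << *static_cast<int*>(ctx) << " done\n";
}

int main() {
  // Create() returns the implementation selected at compile time,
  // e.g. SPClusterRT for a single-process run.
  singa::ClusterRuntime* rt =
      singa::ClusterRuntime::Create("localhost:2181", /*job_id=*/0);
  if (!rt->Init()) return 1;

  int proc_id = rt->RegistProc("127.0.0.1:5000", /*pid=*/0);  // -1 on failure
  std::cout << "proc " << proc_id << " -> " << rt->GetProcHost(proc_id) << "\n";

  int gid = 0;
  rt->WatchSGroup(gid, /*sid=*/0, OnAllWorkersLeft, &gid);  // server side
  rt->JoinSGroup(gid, /*wid=*/0, /*s_group=*/0);            // worker side
  rt->LeaveSGroup(gid, /*wid=*/0, /*s_group=*/0);           // may fire callback
  delete rt;
  return 0;
}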

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/dd1e4afa/include/singa/utils/common.h
----------------------------------------------------------------------
diff --git a/include/singa/utils/common.h b/include/singa/utils/common.h
deleted file mode 100644
index 0bcec58..0000000
--- a/include/singa/utils/common.h
+++ /dev/null
@@ -1,165 +0,0 @@
-/************************************************************
-*
-* Licensed to the Apache Software Foundation (ASF) under one
-* or more contributor license agreements.  See the NOTICE file
-* distributed with this work for additional information
-* regarding copyright ownership.  The ASF licenses this file
-* to you under the Apache License, Version 2.0 (the
-* "License"); you may not use this file except in compliance
-* with the License.  You may obtain a copy of the License at
-*
-*   http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an
-* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-* KIND, either express or implied.  See the License for the
-* specific language governing permissions and limitations
-* under the License.
-*
-*************************************************************/
-
-#ifndef SINGA_UTILS_COMMON_H_
-#define SINGA_UTILS_COMMON_H_
-
-#include <google/protobuf/message.h>
-#include <unordered_map>
-#include <sstream>
-#include <string>
-#include <vector>
-#include <utility>
-#include "singa/proto/common.pb.h"
-
-namespace singa {
-
-using std::vector;
-using std::string;
-std::string IntVecToString(const std::vector<int>& vec);
-std::string VStringPrintf(std::string fmt, va_list l);
-std::string StringPrintf(std::string fmt, ...);
-
-/**
- * Locate the position of the arg in arglist.
- *
- * @param argc total num of arguments
- * @param arglist all arguments
- * @param arg the argument to search for
- * @return the position of arg in the arglist; -1 if not found.
- */
-int ArgPos(int argc, char** arglist, const char* arg);
-void CreateFolder(const std::string name);
-/**
- * Slice a set of large Params into small pieces such that they can be roughly
- * equally partitioned into a fixed number of boxes.
- *
- * @param num total number of boxes to store the small pieces
- * @param sizes size of all Params
- * @return all slices for each Param
- */
-const std::vector<std::vector<int>> Slice(int num,
-    const std::vector<int>& sizes);
-/**
- * Partition slices into boxes.
- *
- * @param num number of boxes
- * @param slices slice sizes
- * @return box id for each slice
- */
-const std::vector<int> PartitionSlices(int num, const std::vector<int>& slices);
-/*
-inline void Sleep(int millisec=1){
-  std::this_thread::sleep_for(std::chrono::milliseconds(millisec));
-}
-*/
-int gcd(int a, int b);
-int LeastCommonMultiple(int a, int b);
-/*
-inline float rand_real() {
-  return  static_cast<float>(rand_r())/(RAND_MAX+1.0f);
-}
-*/
-std::string GetHostIP();
-void SetupLog(const std::string& workspace, const std::string& model);
-
-/**
- * Performance metrics.
- */
-class Metric {
- public:
-  Metric() {}
-  explicit Metric(const std::string& str);
-  /**
-   * Add one metric.
-   *
- * If the metric exists, aggregate the value; otherwise create a new entry for it.
-   *
-   * @param name metric name, e.g., 'loss'
-   * @param value metric value
-   */
-  void Add(const std::string& name, float value);
-  void Add(const std::string& name, float value, int count);
-  /**
-   * reset all metric counter and value to 0
-   */
-  void Reset();
-  /**
-   * Generate a one-line string for logging
-   */
-  std::string ToLogString() const;
-  /**
-   * Serialize the object into a string
-   */
-  std::string ToString() const;
-  /**
-   * Parse the metric from a string
-   */
-  void ParseFrom(const std::string& msg);
-
- private:
-  std::unordered_map<std::string, std::pair<int, float>> entry_;
-};
-
-using google::protobuf::Message;
-void Im2col(const float* data_im, const int channels,
-    const int height, const int width, const int kernel_h, const int kernel_w,
-    const int pad_h, const int pad_w, const int stride_h, const int stride_w,
-    float* data_col);
-void Col2im(const float* data_col, const int channels,
-    const int height, const int width, const int patch_h, const int patch_w,
-    const int pad_h, const int pad_w, const int stride_h, const int stride_w,
-    float* data_im);
-void ForwardMaxPooling(const float* bottom, const int num, const int channels,
-    const int height, const int width, const int kernel_h, const int kernel_w,
-    const int pad_h, const int pad_w, const int stride_h, const int stride_w,
-    float* top, float* mask);
-void BackwardMaxPooling(const float* top, const float* mask, const int num,
-    const int channels, const int height, const int width,
-    const int kernel_h, const int kernel_w, const int pad_h, const int pad_w,
-    const int stride_h, const int stride_w,
-    float* bottom);
-void ForwardAvgPooling(const float* bottom, const int num, const int channels,
-    const int height, const int width, const int kernel_h, const int kernel_w,
-    const int pad_h, const int pad_w, const int stride_h, const int stride_w,
-    float* top);
-void BackwardAvgPooling(const float* top, const int num, const int channels,
-    const int height, const int width, const int kernel_h, const int kernel_w,
-    const int pad_h, const int pad_w, const int stride_h, const int stride_w,
-    float* bottom);
-
-void ReadProtoFromTextFile(const char* filename, Message* proto);
-void WriteProtoToTextFile(const Message& proto, const char* filename);
-void ReadProtoFromBinaryFile(const char* filename, Message* proto);
-void WriteProtoToBinaryFile(const Message& proto, const char* filename);
-
-/**
- * Write a string (e.g., graph representation of a net) into a text file.
- */
-void WriteStringToTextFile(const string& filename, const string& context);
-
-/**
- * Parse metric pairs (key = value[, key = value]) from string
- */
-const vector<std::pair<string, float>> GetMetricFromString(const string& disp);
-}  // namespace singa
-
-#endif  // SINGA_UTILS_COMMON_H_
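
As a usage note (not part of this commit), a minimal sketch of the Metric helper declared above; the metric names and values are made up for illustration.

#include <iostream>
#include "singa/utils/common.h"

int main() {
  singa::Metric perf;
  perf.Add("loss", 2.30f);       // new entry: count = 1
  perf.Add("loss", 2.10f);       // existing entry: value aggregated, count = 2
  perf.Add("accuracy", 0.42f);
  std::cout << perf.ToLogString() << "\n";   // one line for logging

  // Serialize, ship as a string, and reconstruct on the other side.
  singa::Metric copy(perf.ToString());
  std::cout << copy.ToLogString() << "\n";
  perf.Reset();                  // counters and values back to 0
  return 0;
}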

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/dd1e4afa/include/singa/utils/context.h
----------------------------------------------------------------------
diff --git a/include/singa/utils/context.h b/include/singa/utils/context.h
index 3490d29..55e783d 100644
--- a/include/singa/utils/context.h
+++ b/include/singa/utils/context.h
@@ -30,7 +30,17 @@
 #include <vector>
 
 #ifdef USE_GPU
-#include "singa/utils/cuda_utils.h"
+#include <cublas_v2.h>
+#include <cuda.h>
+#include <cuda_runtime.h>
+#include <curand.h>
+// CUDA: various checks for different function calls.
+#define CUDA_CHECK(condition) \
+  /* Code block avoids redefinition of cudaError_t error */ \
+  do { \
+    cudaError_t error = condition; \
+    CHECK_EQ(error, cudaSuccess) << " " << cudaGetErrorString(error); \
+  } while (0)
 
 #ifdef USE_CUDNN
 #include <cudnn.h>
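
A minimal sketch (not part of this diff) of how the inlined CUDA_CHECK macro is used; it assumes glog's CHECK_EQ is available, as the macro requires, and the buffer size is arbitrary.

#ifdef USE_GPU
#include <cstddef>
#include <glog/logging.h>          // provides CHECK_EQ used inside CUDA_CHECK
#include "singa/utils/context.h"   // now defines CUDA_CHECK directly

void AllocateOnDevice(float** ptr, size_t n) {
  // Aborts with the CUDA error string if any call does not return cudaSuccess.
  CUDA_CHECK(cudaMalloc(reinterpret_cast<void**>(ptr), n * sizeof(float)));
  CUDA_CHECK(cudaMemset(*ptr, 0, n * sizeof(float)));
}
#endif  // USE_GPU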

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/dd1e4afa/include/singa/utils/cuda_utils.h
----------------------------------------------------------------------
diff --git a/include/singa/utils/cuda_utils.h b/include/singa/utils/cuda_utils.h
deleted file mode 100644
index 1270e92..0000000
--- a/include/singa/utils/cuda_utils.h
+++ /dev/null
@@ -1,48 +0,0 @@
-/************************************************************
-*
-* Licensed to the Apache Software Foundation (ASF) under one
-* or more contributor license agreements.  See the NOTICE file
-* distributed with this work for additional information
-* regarding copyright ownership.  The ASF licenses this file
-* to you under the Apache License, Version 2.0 (the
-* "License"); you may not use this file except in compliance
-* with the License.  You may obtain a copy of the License at
-*
-*   http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an
-* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-* KIND, either express or implied.  See the License for the
-* specific language governing permissions and limitations
-* under the License.
-*
-*************************************************************/
-
-/**
- * The code is adapted from that of Caffe which is under BSD 2 Clause License.
- *
- * COPYRIGHT
- * All contributions by the University of California:
- * Copyright (c) 2014, The Regents of the University of California (Regents)
- * All rights reserved.
- * All other contributions:
- * Copyright (c) 2014, the respective contributors
- * All rights reserved.
- */
-#ifndef SINGA_UTILS_CUDA_UTILS_H_
-#define SINGA_UTILS_CUDA_UTILS_H_
-#include <cublas_v2.h>
-#include <cuda.h>
-#include <cuda_runtime.h>
-#include <curand.h>
-
-// CUDA: various checks for different function calls.
-#define CUDA_CHECK(condition) \
-  /* Code block avoids redefinition of cudaError_t error */ \
-  do { \
-    cudaError_t error = condition; \
-    CHECK_EQ(error, cudaSuccess) << " " << cudaGetErrorString(error); \
-  } while (0)
-
-#endif  // SINGA_UTILS_CUDA_UTILS_H_

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/dd1e4afa/include/singa/utils/graph.h
----------------------------------------------------------------------
diff --git a/include/singa/utils/graph.h b/include/singa/utils/graph.h
deleted file mode 100644
index 2462808..0000000
--- a/include/singa/utils/graph.h
+++ /dev/null
@@ -1,196 +0,0 @@
-/************************************************************
-*
-* Licensed to the Apache Software Foundation (ASF) under one
-* or more contributor license agreements.  See the NOTICE file
-* distributed with this work for additional information
-* regarding copyright ownership.  The ASF licenses this file
-* to you under the Apache License, Version 2.0 (the
-* "License"); you may not use this file except in compliance
-* with the License.  You may obtain a copy of the License at
-*
-*   http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an
-* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-* KIND, either express or implied.  See the License for the
-* specific language governing permissions and limitations
-* under the License.
-*
-*************************************************************/
-
-#ifndef SINGA_UTILS_GRAPH_H_
-#define SINGA_UTILS_GRAPH_H_
-
-#include <stack>
-#include <string>
-#include <map>
-#include <vector>
-namespace singa {
-using std::string;
-using std::map;
-
-/**
- * Node class representing a layer in a neural net.
- *
- * TODO remove layer-dependent fields, like origin and partition_id, to make
- * it an independent and simple class.
- */
-class Node {
- public:
-  /**
-   * Node constructor.
-   *
-   * @param name identifier of the node, e.g., layer name.
-   */
-  explicit Node(string name);
-  /**
-   * Construct a node with specified attributes.
-   * @param name node identifier
-   * @param attrs node attributes for printing, including "shape", "color", etc.
-   * They depend on the visualization engine; for graphviz, the attribute
-   * list is at http://www.graphviz.org/content/attrs.
-   */
-  Node(string name, const std::map<string, string>& attrs);
-  /**
-   * @deprecated {to make the Graph class an independent class.}
-   *
-   * Node constructor used for model partitioning.
-   *
-   * This node is a partition of some node.
-   * @param name node name
-   * @param origin  name of the original node
-   * @param id partition id of this node
-   * @param proto conf of the corresponding layer
-   */
-  Node(const string& name, const std::string& origin, int id, void* proto);
-  ~Node() {}  // the proto field is deleted outside by other functions
-
-
-  void AddDstNode(Node* dst);
-  void AddSrcNode(Node* src);
-  void RemoveDstNode(Node* dst);
-  void RemoveSrcNode(Node* src);
-
-  string name = "";
-  //! name of the origin node/layer from which this node is derived
-  string origin = "";
-  //! partition id
-  int partition_id = 0;
-  //! proto of the corresponding layer
-  void* proto = nullptr;
-  std::vector<Node*> srcnodes;
-  std::vector<Node*> dstnodes;
-  //! node attributes, including shape, color, etc.
-  std::map<string, string> attrs;
-};
-
-/**
- * A neural net is constructed by first creating a graph with each node
- * representing one layer. After a topological sort of the graph nodes, layers
- * are created and connected.
- */
-class Graph {
- public:
-  Graph() {}
-  ~Graph();
-  const Graph Reverse() const;
-  /**
-   * @return all nodes of the graph
-   */
-  inline const std::vector<Node*>& nodes() const {
-    return nodes_;
-  }
-  /**
-   * @param name node name
-   * @return return the node of given name
-   */
-  inline Node* node(const string& name) const {
-    return name2node_.at(name);
-  }
-  /**
-   * Add an existing node into this graph.
-   */
-  void AddNode(Node* node);
-  /**
-   * Create a node with the given name and add it into the graph.
-   * @return the newly created node.
-   */
-  Node* AddNode(const string& name);
-  /**
-   * Create a node with the given name and attributes.
-   */
-  Node* AddNode(const string& name, const std::map<string, string>& attrs);
-  /**
-   * @deprecated {remove layer related info from node attrs}
-   * Add a node with given name and other info.
-   */
-  Node* AddNode(const std::string& name, const std::string& origin, int id,
-                void* proto);
-  /**
-   * Add an edge connecting the two given nodes.
-   */
-  void AddEdge(Node* srcnode, Node* dstnode);
-  /**
-   * Add an edge connecting the two nodes with the given name.
-   */
-  void AddEdge(const string& src, const std::string& dst);
-  /**
-   * Add an edge connecting the two given nodes; the edge attributes are also
-   * given.
-   */
-  void AddEdge(Node* srcnode, Node* dstnode,
-      const std::map<string, string>& attrs);
-  /**
-   * Add an edge connecting the two nodes with the given names; the edge
-   * attributes, used for printing, are also given. See
-   * http://www.graphviz.org/content/attrs
-   */
-  void AddEdge(const string& src, const std::string& dst,
-      const std::map<string, string>& attrs);
-
-  /**
-   * Remove the edge connecting the two given nodes.
-   */
-  void RemoveEdge(Node* src, Node* dst);
-  /**
-   * Remove the edge connecting two nodes with the given names.
-   */
-  void RemoveEdge(const string &src, const std::string& dst);
-  /**
-   * Dump the graph into a JSON string which can be used to draw a picture
-   * with graphviz.
-   *
-   * It calls ToJson(const std::map<std::string, std::string>& label) with
-   * an empty label mapping.
-   */
-  string ToJson() const;
-  /**
-   * \copybrief ToJson()
-   *
-   * @param label information to be displayed as label for each node
-   */
-  string ToJson(const map<std::string, std::string>& label) const;
-  /**
-   * Do a topological sort of all nodes in the graph.
-   */
-  void Sort();
-
- private:
-  /**
-   *
-   * @return the name of the edge connecting src to dst
-   */
-  const string GetEdgeName(const string& src, const string& dst) const {
-    return src + "-->" + dst;
-  }
-
- private:
-  std::vector<Node*> nodes_;
-  std::map<string, Node*> name2node_;
-  std::map<string, std::map<string, string>> edge_attrs_;
-};
-
-}  // namespace singa
-
-#endif  // SINGA_UTILS_GRAPH_H_
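
A minimal sketch (not from this commit) that builds a three-layer chain with the Graph/Node API above, sorts it, and dumps it for graphviz; the layer names and attributes are hypothetical.

#include <iostream>
#include "singa/utils/graph.h"

int main() {
  singa::Graph g;
  g.AddNode("data", {{"shape", "box"}});
  g.AddNode("hidden", {{"color", "blue"}});
  g.AddNode("loss");
  g.AddEdge("data", "hidden");
  g.AddEdge("hidden", "loss", {{"label", "activation"}});

  g.Sort();  // topological order: data, hidden, loss
  for (singa::Node* n : g.nodes())
    std::cout << n->name << "\n";
  std::cout << g.ToJson() << "\n";  // feed to a graphviz front end
  return 0;
}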

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/dd1e4afa/include/singa/utils/image_transform.h
----------------------------------------------------------------------
diff --git a/include/singa/utils/image_transform.h b/include/singa/utils/image_transform.h
deleted file mode 100644
index 2867ad2..0000000
--- a/include/singa/utils/image_transform.h
+++ /dev/null
@@ -1,35 +0,0 @@
-/************************************************************
-*
-* Licensed to the Apache Software Foundation (ASF) under one
-* or more contributor license agreements.  See the NOTICE file
-* distributed with this work for additional information
-* regarding copyright ownership.  The ASF licenses this file
-* to you under the Apache License, Version 2.0 (the
-* "License"); you may not use this file except in compliance
-* with the License.  You may obtain a copy of the License at
-*
-*   http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an
-* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-* KIND, either express or implied.  See the License for the
-* specific language governing permissions and limitations
-* under the License.
-*
-*************************************************************/
-
-#ifndef SINGA_UTILS_IMAGE_TRANSFORM_H_
-#define SINGA_UTILS_IMAGE_TRANSFORM_H_
-
-#include <glog/logging.h>
-// TODO(wangwei) provide image transformation API; the implementation can be
-// done by opencv, manual transform, or mshadow.
-namespace singa {
-
-void ImageTransform(const float* in, const float* mean, bool mirror, int h_crop,
-    int w_crop, int h_offset, int w_offset, int channel, int height, int width,
-    float scale, float* out);
-}  // namespace singa
-
-#endif  // SINGA_UTILS_IMAGE_TRANSFORM_H_
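
A minimal sketch (not from this commit) of calling the declared ImageTransform; the 3x32x32 input, crop size and offsets are made up, and the mean image is all zeros so no mean subtraction takes place.

#include <vector>
#include "singa/utils/image_transform.h"

void CropExample() {
  const int channel = 3, height = 32, width = 32;
  const int h_crop = 28, w_crop = 28, h_offset = 2, w_offset = 2;
  std::vector<float> in(channel * height * width, 0.5f);
  std::vector<float> mean(in.size(), 0.0f);
  std::vector<float> out(channel * h_crop * w_crop);
  // No mirroring, identity scale; out holds the cropped image.
  singa::ImageTransform(in.data(), mean.data(), false, h_crop, w_crop,
                        h_offset, w_offset, channel, height, width,
                        1.0f, out.data());
}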

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/dd1e4afa/include/singa/utils/job_manager.h
----------------------------------------------------------------------
diff --git a/include/singa/utils/job_manager.h b/include/singa/utils/job_manager.h
deleted file mode 100644
index 7f1b4f1..0000000
--- a/include/singa/utils/job_manager.h
+++ /dev/null
@@ -1,79 +0,0 @@
-/************************************************************
-*
-* Licensed to the Apache Software Foundation (ASF) under one
-* or more contributor license agreements.  See the NOTICE file
-* distributed with this work for additional information
-* regarding copyright ownership.  The ASF licenses this file
-* to you under the Apache License, Version 2.0 (the
-* "License"); you may not use this file except in compliance
-* with the License.  You may obtain a copy of the License at
-* 
-*   http://www.apache.org/licenses/LICENSE-2.0
-* 
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an
-* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-* KIND, either express or implied.  See the License for the
-* specific language governing permissions and limitations
-* under the License.
-*
-*************************************************************/
-
-#ifndef SINGA_UTILS_JOB_MANAGER_H_
-#define SINGA_UTILS_JOB_MANAGER_H_
-
-#include <string>
-#include <vector>
-
-#ifdef USE_ZOOKEEPER
-#include "singa/utils/zk_service.h"
-#endif
-
-namespace singa {
-
-struct JobInfo {
-  int id;
-  int procs;
-  std::string name;
-};
-
-class JobManager {
- public:
-  // host is a comma-separated list of host:port pairs, one per zk server.
-  // e.g. "127.0.0.1:3000,127.0.0.1:3001,127.0.0.1:3002"
-  explicit JobManager(const std::string& host);
-
-  // NOTICE: Init must be called once, before using any other functions
-  bool Init();
-  // generate a unique job id
-  bool GenerateJobID(int* id);
-  // generate a list of hosts for a job conf
-  bool GenerateHostList(const char* host_file, const char* job_file,
-                        std::vector<std::string>* list);
-  // list all jobs recorded in zk
-  bool ListJobs(std::vector<JobInfo>* jobs);
-  // list running processes for a job
-  bool ListJobProcs(int job, std::vector<std::string>* procs);
-  // remove a job path in zk
-  bool Remove(int job);
-  // remove all job paths in zk
-  bool RemoveAllJobs();
-  // remove all singa related paths in zk
-  bool CleanUp();
-
- private:
-  const int kJobsNotRemoved = 10;
-
-  bool CleanPath(const std::string& path, bool remove);
-  std::string ExtractClusterConf(const char* job_file);
-
-  std::string host_ = "";
-#ifdef USE_ZOOKEEPER
-  int timeout_ = 30000;
-  ZKService zk_;
-#endif
-};
-
-}  // namespace singa
-
-#endif  // SINGA_UTILS_JOB_MANAGER_H_
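
A minimal sketch (not from this commit) of the removed JobManager API above; the ZooKeeper host string is hypothetical, and the build must define USE_ZOOKEEPER for the zk-backed paths to do anything useful.

#include <iostream>
#include <vector>
#include "singa/utils/job_manager.h"

int main() {
  singa::JobManager mngr("127.0.0.1:3000,127.0.0.1:3001,127.0.0.1:3002");
  if (!mngr.Init()) return 1;      // must be called before anything else

  int job_id = -1;
  if (mngr.GenerateJobID(&job_id))
    std::cout << "new job id: " << job_id << "\n";

  std::vector<singa::JobInfo> jobs;
  if (mngr.ListJobs(&jobs))
    for (const auto& j : jobs)
      std::cout << j.id << " " << j.name << " procs=" << j.procs << "\n";
  return 0;
}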

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/dd1e4afa/include/singa/utils/math_addr.h
----------------------------------------------------------------------
diff --git a/include/singa/utils/math_addr.h b/include/singa/utils/math_addr.h
deleted file mode 100644
index cf1d227..0000000
--- a/include/singa/utils/math_addr.h
+++ /dev/null
@@ -1,279 +0,0 @@
-/************************************************************
-*
-* Licensed to the Apache Software Foundation (ASF) under one
-* or more contributor license agreements.  See the NOTICE file
-* distributed with this work for additional information
-* regarding copyright ownership.  The ASF licenses this file
-* to you under the Apache License, Version 2.0 (the
-* "License"); you may not use this file except in compliance
-* with the License.  You may obtain a copy of the License at
-*
-*   http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an
-* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-* KIND, either express or implied.  See the License for the
-* specific language governing permissions and limitations
-* under the License.
-*
-*************************************************************/
-
-#ifndef SINGA_UTILS_MATH_ADDR_H_
-#define SINGA_UTILS_MATH_ADDR_H_
-
-extern "C" {
-#include <cblas.h>
-}
-#ifdef USE_GPU
-#include <cuda_runtime.h>
-#include <cublas_v2.h>
-#endif
-
-#include "singa/utils/singa_op.h"
-
-namespace singa {
-template<typename Dtype>
-Dtype cpu_asum(int n, const Dtype* A, int inc) {
-  return cblas_sasum(n, A, inc);
-}
-
-template<typename Dtype>
-void cpu_gemm(const Dtype * A, const Dtype * B,
-    const int m, const int n, const int k, const Dtype alpha, const Dtype beta,
-    const bool TranA, const bool TranB, Dtype * C) {
-  int lda, ldb;
-  CBLAS_TRANSPOSE tA, tB;
-  lda = TranA ? m : k;
-  ldb = TranB ? k : n;
-  tA = TranA ? CblasTrans : CblasNoTrans;
-  tB = TranB ? CblasTrans : CblasNoTrans;
-  cblas_sgemm(CblasRowMajor, tA, tB, m, n, k, alpha, A, lda,
-      B, ldb, beta, C, n);
-}
-
-// Be very careful:
-// m is the length of B and n is the length of C; A is an n*m matrix.
-template<typename Dtype>
-void cpu_gemv(const Dtype * A, const Dtype * B, const int m, const int n,
-    const Dtype alpha, const Dtype beta, const bool TranA, Dtype * C) {
-  CBLAS_TRANSPOSE tA;
-  tA = TranA ? CblasTrans : CblasNoTrans;
-  cblas_sgemv(CblasRowMajor, tA, m, n, alpha, A, n, B, 1, beta, C, 1);
-}
-
-template<typename Dtype>
-void cpu_axpy(const int n, const Dtype alpha, const Dtype * A, Dtype * B) {
-  cblas_saxpy(n, alpha, A, 1, B, 1);
-}
-
-template<typename Dtype>
-void cpu_scale(const int n, const Dtype alpha, Dtype * A) {
-  cblas_sscal(n, alpha, A, 1);
-}
-
-template<typename Dtype>
-void cpu_copy(const int n, const Dtype* A, Dtype *B) {
-  cblas_scopy(n, A, 1, B, 1);
-}
-
-template<typename Dtype>
-Dtype cpu_dot(const int n, const Dtype * A, const Dtype * B) {
-  Dtype sum = 0;
-  for (int i = 0 ; i < n ; i++)
-    sum += A[i] * B[i];
-  return sum;
-}
-
-// element-wise
-template<typename Op, typename Dtype>
-void cpu_e_f(const int n, const Dtype * A, Dtype * B) {
-  for (int i = 0 ; i < n ; i++) {
-    Op::Map(A[i], &B[i]);
-  }
-}
-
-template<typename Op, typename Dtype>
-void cpu_e_f(const int n, const Dtype * A, const Dtype * B, Dtype * C) {
-  for (int i = 0 ; i < n ; i++) {
-    Op::Map(A[i], B[i], &C[i]);
-  }
-}
-template<typename Op, typename Dtype>
-void cpu_e_f(const int n, const Dtype alpha, const Dtype * A, Dtype * B) {
-  for (int i = 0 ; i < n ; i++) {
-    Op::Map(alpha, A[i], &B[i]);
-  }
-}
-
-template<typename Op, typename Dtype>
-void cpu_e_f(const int n, const Dtype alpha, const Dtype * A, const Dtype * B,
-    Dtype * C) {
-  for (int i = 0 ; i < n ; i++) {
-    Op::Map(alpha, A[i], B[i], &C[i]);
-  }
-}
-// element-wise generalized operation defined in Op
-
-
-// matrix/vector expand/reduce
-
-template<typename Op, typename Dtype>
-void cpu_reduce_f(const Dtype * A, const int m, const int n, Dtype * B) {
-  for (int i = 0 ; i < m ; i++) {
-    Op::Map(A+i*n, n, B[i]);
-  }
-}
-// reduce each row of A to an element of B e.g. the sum operation in softmax
-template<typename Op, typename Dtype>
-void cpu_expand_f(const Dtype * A, const int m, const int n, Dtype * B) {
-  for (int i = 0 ; i < m ; i++) {
-    Op::Map(A[i], n, B+i*n);
-  }
-}
-
-
-template<typename Dtype>
-void cpu_softmax(int nb_rows, int nb_cols, const Dtype* A, Dtype* B) {
-  for (int i = 0; i < nb_rows; i++) {
-    const Dtype* in = A + i * nb_cols;
-    Dtype* out = B + i * nb_cols;
-    Dtype mmax = in[0];
-    for (int x = 1; x < nb_cols; ++x)
-      if (mmax < in[x]) mmax = in[x];
-    Dtype sum = 0.0f;
-    for (int x = 0; x < nb_cols; ++x) {
-      out[x] = std::exp(in[x] - mmax);  // write to B; A stays const
-      sum += out[x];
-    }
-    for (int x = 0; x < nb_cols; ++x)
-      out[x] /= sum;
-  }
-}
-
-
-
-template<typename Dtype, typename URNG>
-void cpu_sample_uniform(URNG& g, int n, Dtype low, Dtype high, Dtype* A) {
-  std::uniform_real_distribution<Dtype> distribution(low, high);
-  for (int i = 0; i < n; i++)
-    A[i] = distribution(g);
-}
-
-template<typename Dtype, typename URNG>
-void cpu_sample_gaussian(URNG& g, int n, Dtype mean, Dtype std, Dtype* A) {
-  std::normal_distribution<Dtype> distribution(mean, std);
-  for (int i = 0; i < n; i++)
-    A[i] = distribution(g);
-}
-
-#ifdef USE_GPU
-template<typename Dtype>
-Dtype gpu_asum(cublasHandle_t handle, int n, const Dtype* A, int inc) {
-  Dtype result = 0.0;
-  cublasSasum(handle, n, A, inc, &result);
-  return result;
-}
-
-template<typename Dtype>
-void gpu_gemm(cublasHandle_t handle, const Dtype * A, const Dtype * B,
-    const int m, const int n, const int k, const Dtype alpha, const Dtype beta,
-    const bool TranA, const bool TranB, Dtype * C) {
-  int lda = TranA ? m : k;
-  int ldb = TranB ? k : n;
-  int ldc = n;
-  cublasOperation_t tA = (TranA == false) ? CUBLAS_OP_N : CUBLAS_OP_T;
-  cublasOperation_t tB = (TranB == false) ? CUBLAS_OP_N : CUBLAS_OP_T;
-  cublasSgemm(handle, tB, tA, n, m, k, &alpha, B, ldb,
-      A, lda, &beta, C, ldc);
-}
-
-template<typename Dtype>
-void gpu_gemv(cublasHandle_t handle, const Dtype * A, const Dtype * B,
-    const int m, const int n, const Dtype alpha, const Dtype beta,
-    const bool TranA, Dtype * C) {
-  int lda = n;
-  cublasOperation_t tA = (TranA == true) ? CUBLAS_OP_N : CUBLAS_OP_T;
-  cublasSgemv(handle, tA, n, m, &alpha , A, lda, B, 1, &beta, C, 1);
-}
-
-template<typename Dtype>
-void gpu_axpy(cublasHandle_t handle, const int n, const Dtype alpha,
-    const Dtype * A, Dtype * B) {
-  cublasSaxpy(handle, n, &alpha, A, 1, B, 1);
-}
-
-template<typename Dtype>
-void gpu_scale(cublasHandle_t handle, const int n, const Dtype alpha,
-    Dtype * A) {
-  cublasSscal(handle, n, &alpha, A, 1);
-}
-
-template<typename Dtype>
-Dtype gpu_dot(cublasHandle_t handle, const int n, const Dtype * A,
-    const Dtype * B) {
-  Dtype result = 0.0;
-  cublasSdot(handle, n, A, 1, B, 1, &result);
-  return result;
-}
-
-// element-wise
-template<typename Op, typename Dtype>
-void gpu_e_f(const int n, const Dtype alpha, Dtype * A) {
-  Op::CudaMap(alpha, A, n);
-}
-
-template<typename Op, typename Dtype>
-void gpu_e_f(const int n, const Dtype * A, Dtype * B) {
-  Op::CudaMap(A, B, n);
-}
-
-template<typename Op, typename Dtype>
-void gpu_e_f(const int n, const Dtype * A, const Dtype * B, Dtype * C) {
-  Op::CudaMap(A, B, C, n);
-}
-
-template<typename Op, typename Dtype>
-void gpu_e_f(const int n, const Dtype alpha, const Dtype * A, Dtype * B) {
-  Op::CudaMap(alpha, A, B, n);
-}
-
-template<typename Op, typename Dtype>
-void gpu_e_f(const int n, const Dtype alpha, const Dtype beta,
-  const Dtype * A, const Dtype * B, Dtype * C) {
-  Op::CudaMap(alpha, beta, A, B, C, n);
-}
-// element-wise generalized operation defined in Op
-
-// matrix/vector expand/reduce
-
-template<typename Op, typename Dtype>
-void gpu_reduce_f(const Dtype * A, const int m, const int n, Dtype * B) {
-  for (int i = 0 ; i < m ; i++) {
-    Op::CudaMap(A+i*n, n, B[i]);
-  }
-}
-// reduce each row of A to an element of B e.g. the sum operation in softmax
-template<typename Op, typename Dtype>
-void gpu_expand_f(const Dtype * A, const int m, const int n, Dtype * B) {
-  for (int i = 0 ; i < m ; i++) {
-    Op::CudaMap(A[i], n, B+i*n);
-  }
-}
-
-
-template<typename Dtype, typename URNG>
-void gpu_sample_uniform(URNG g, int n, Dtype low, Dtype high, Dtype* A) {
-  curandGenerateUniform(g, A, n);
-}
-
-template<typename Dtype, typename URNG>
-void gpu_sample_gaussian(URNG g, int n, Dtype mean, Dtype std, Dtype* A) {
-  curandGenerateNormal(g, A, n, mean, std);
-}
-
-// expand each element in A into a row of B
-#endif  // USE_GPU
-
-}  // namespace singa
-#endif  // SINGA_UTILS_MATH_ADDR_H_
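
A minimal sketch (not from this commit) exercising the row-major CPU wrappers above; the 2x3 and 3x2 matrices are made up. For cpu_gemm, m and n are the dimensions of C and k is the shared dimension.

#include <cstdio>
#include "singa/utils/math_addr.h"

int main() {
  const float A[6] = {1, 2, 3, 4, 5, 6};   // 2x3, row major
  const float B[6] = {1, 0, 0, 1, 1, 1};   // 3x2, row major
  float C[4] = {0, 0, 0, 0};               // 2x2 result
  // C = 1.0 * A * B + 0.0 * C, no transposes.
  singa::cpu_gemm(A, B, 2, 2, 3, 1.0f, 0.0f, false, false, C);
  std::printf("%.0f %.0f\n%.0f %.0f\n", C[0], C[1], C[2], C[3]);

  std::printf("dot(A, B) = %.0f\n", singa::cpu_dot(6, A, B));
  return 0;
}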

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/dd1e4afa/include/singa/utils/math_blob.h
----------------------------------------------------------------------
diff --git a/include/singa/utils/math_blob.h b/include/singa/utils/math_blob.h
deleted file mode 100644
index abe7722..0000000
--- a/include/singa/utils/math_blob.h
+++ /dev/null
@@ -1,762 +0,0 @@
-/************************************************************
-*
-* Licensed to the Apache Software Foundation (ASF) under one
-* or more contributor license agreements.  See the NOTICE file
-* distributed with this work for additional information
-* regarding copyright ownership.  The ASF licenses this file
-* to you under the Apache License, Version 2.0 (the
-* "License"); you may not use this file except in compliance
-* with the License.  You may obtain a copy of the License at
-*
-*   http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an
-* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-* KIND, either express or implied.  See the License for the
-* specific language governing permissions and limitations
-* under the License.
-*
-*************************************************************/
-
-#ifndef SINGA_UTILS_MATH_BLOB_H_
-#define SINGA_UTILS_MATH_BLOB_H_
-
-#include <vector>
-#include <algorithm>
-#include <thread>
-#include "singa/utils/blob.h"
-#include "singa/utils/singa_op.h"
-#include "singa/utils/math_addr.h"
-#include "singa/utils/singleton.h"
-#include "singa/utils/context.h"
-
-namespace singa {
-
-#define NO_GPU LOG(FATAL) << "Not compiled with GPU";
-/**
- * \file math_blob.h is not tested thoroughly.
- * Only GEMM(), MMDot(), MVSumRow() and MVAddRow() are used now.
- */
-/************* BLAS level 1 *****************/
-/**
- * Scale each element of B with alpha, and put the result back into B,
- * i.e., Bi = alpha*Bi.
- * Uses the BLAS scal routine internally.
- */
-template<typename Dtype>
-void Scale(Dtype alpha, Blob<Dtype> * B) {
-  auto context = Singleton<Context>::Instance();
-  int device = context->device_id(std::this_thread::get_id());
-  if (device < 0) {
-    cpu_scale(B->count(), alpha, B->mutable_cpu_data());
-  } else {
-#ifdef USE_GPU
-    gpu_scale(context->cublas_handle(device), B->count(), alpha,
-        B->mutable_gpu_data());
-#else
-    NO_GPU;
-#endif
-  }
-}
-
-/**
- * Element-wise operation: Bi = alpha*Ai+Bi. A and B should have the same size
- */
-template<typename Dtype>
-void AXPY(Dtype alpha, const Blob<Dtype> & A, Blob<Dtype> * B) {
-  CHECK_EQ(A.count(), B->count());
-  auto context = Singleton<Context>::Instance();
-  int device = context->device_id(std::this_thread::get_id());
-  if (device < 0) {
-    cpu_axpy(A.count(), alpha, A.cpu_data(), B->mutable_cpu_data());
-  } else {
-#ifdef USE_GPU
-    gpu_axpy(context->cublas_handle(device), A.count(), alpha, A.gpu_data(),
-        B->mutable_gpu_data());
-#else
-    NO_GPU;
-#endif
-  }
-}
-
-/************* BLAS level 2 *****************/
-/**
- * Matrix vector multiplication, C = alpha A(.T) * B + beta C.
- * Loose shape checking:
- * - dim of A >=2
- * - row of A is shape(0) (no transpose)
- * - column of A(.T) == B.count()
- * - rows of A(.T) == C.count()
- *
- * @param[in] alpha
- * @param[in] beta
- * @param[in] A, matrix
- * @param[in] B, vector
- * @param[in, out] C, vector
- */
-template<typename Dtype>
-void GEMV(Dtype alpha, Dtype beta, const Blob<Dtype>& A,
-    const Blob<Dtype>& B, Blob<Dtype>* C) {
-  CHECK_EQ(A.shape().size(), 2);
-  int a1, a2, m, n;
-  a1 = A.transpose() ? A.count() / A.shape(0) : A.shape(0);
-  a2 = A.transpose() ? A.shape(0) : A.count() / A.shape(0);
-  m = B.count();
-  n = C->count();
-  CHECK_EQ(a2, m) << "# columns of A(.T) must = length of B";
-  CHECK_EQ(a1, n) << "# rows of A(.T) must = length of C";
-
-  bool TranA = A.transpose();
-  auto context = Singleton<Context>::Instance();
-  int device = context->device_id(std::this_thread::get_id());
-  if (device < 0) {
-    cpu_gemv(A.cpu_data(), B.cpu_data(), m, n, alpha, beta, TranA,
-        C->mutable_cpu_data());
-  } else {
-#ifdef USE_GPU
-    gpu_gemv(context->cublas_handle(device), A.gpu_data(), B.gpu_data(), m, n,
-        alpha, beta, TranA, C->mutable_gpu_data());
-#else
-    NO_GPU;
-#endif  // USE_GPU
-  }
-}
-/**
- * Matrix vector multiplication, C = A(.T) * B, transpose is considered.
- * Loose shape checking:
- * - dim of A >=2
- * - A.count() % B.count() == 0
- * - B.count() == C.count()
- *
- * @param[in] A input matrix
- * @param[in] B input vector
- * @param[out] C output vector
- */
-template <typename Dtype>
-void MVDot(const Blob<Dtype>& A, const Blob<Dtype>& B,
-    Blob<Dtype>* C) {
-  GEMV(Dtype(1), Dtype(0), A, B, C);
-}
-
-/************* BLAS level 3 *****************/
-/**
- * Matrix multiplication, C = alpha A*B + beta C, A, B and C are matrix.
- *
- * Transpose is considered for A and B.
- * Loose shape checking:
- * - the first dimension is row (no transpose) or col (with transpose) size
- * - shapes match for matrix multiplication
- *
- * @param[in] alpha
- * @param[in] beta
- * @param[in] A, matrix
- * @param[in] B, matrix
- * @param[in, out] C, matrix
- */
-template <typename Dtype>
-void GEMM(Dtype alpha, Dtype beta, const Blob<Dtype>& A, const Blob<Dtype>& B,
-    Blob<Dtype> * C) {
-  CHECK_GE(A.shape().size(), 2);
-  CHECK_GE(B.shape().size(), 2);
-  CHECK_GE(C->shape().size(), 2);
-  int a1, a2, b1, b2, m, n;
-  CHECK(!C->transpose());
-  a1 = A.transpose() ? A.count() / A.shape(0) : A.shape(0);
-  a2 = A.count() / a1;
-  b1 = B.transpose() ? B.count() / B.shape(0) : B.shape(0);
-  b2 = B.count() / b1;
-  m = C->shape(0);
-  n = C->count() / m;
-  CHECK_EQ(a2, b1);
-  CHECK_EQ(a1, m);
-  CHECK_EQ(b2, n);
-
-  int k = a2;
-  bool TranA = A.transpose();
-  bool TranB = B.transpose();
-  auto context = Singleton<Context>::Instance();
-  int device = context->device_id(std::this_thread::get_id());
-  if (device < 0) {
-    cpu_gemm(A.cpu_data(), B.cpu_data(), m, n, k, alpha, beta, TranA, TranB,
-        C->mutable_cpu_data());
-  } else {
-#ifdef USE_GPU
-    gpu_gemm(context->cublas_handle(device), A.gpu_data(), B.gpu_data(),
-        m, n, k, alpha, beta, TranA, TranB, C->mutable_gpu_data());
-#else
-    NO_GPU;
-#endif  // USE_GPU
-  }
-}
-/**
- * Matrix multiplication, C = A(.T) * B(.T), transpose is considered.
- * Strict shape checking:
- * - all are matrix
- * - shapes match for matrix multiplication
- *
- * @param[in] A input matrix
- * @param[in] B input matrix
- * @param[out] C output matrix
- */
-template <typename Dtype>
-void MMDot(const Blob<Dtype>& A, const Blob<Dtype>& B,
-    Blob<Dtype>* C) {
-  GEMM(Dtype(1), Dtype(0), A, B, C);
-}
-
-
-/*********************** Inner and Outer product****************************/
-/**
- * Inner product for two vectors.
- * Loose shape checking, A.count() == B.count().
- *
- * @param[in] A, input vector (shape checking using A.count()).
- * @param[in] B, input vector (shape checking using B.count()).
- * @return inner product value.
- */
-template <typename Dtype>
-Dtype VVDot(const Blob<Dtype> & A, const Blob<Dtype> & B) {
-  Dtype res = 0;
-  CHECK_EQ(A.count(), B.count());
-  int n = A.count();
-  auto context = Singleton<Context>::Instance();
-  int device = context->device_id(std::this_thread::get_id());
-  if (device < 0) {
-    res = cpu_dot(n, A.cpu_data(), B.cpu_data());
-  } else {
-#ifdef USE_GPU
-    res = gpu_dot(context->cublas_handle(device), n, A.gpu_data(),
-        B.gpu_data());
-#else
-    NO_GPU;
-#endif  // USE_GPU
-  }
-  return res;
-}
-
-/**
- * Outer product of two vectors, C = A * B^T; a transposed C is not supported.
- * Loose shape checking, A.count() * B.count() == C.count()
- *
- * @param[in] A, input vector
- * @param[in] B, input vector
- * @param[out] C, output matrix
- */
-template <typename Dtype>
-void OuterProduct(const Blob<Dtype>& A, const Blob<Dtype>& B, Blob<Dtype> * C) {
-  CHECK(!C->transpose());  // do not support C.T now.
-
-  int m = A.count();
-  int n = B.count();
-  CHECK_EQ(C->count(), m * n);
-  auto context = Singleton<Context>::Instance();
-  int device = context->device_id(std::this_thread::get_id());
-  if (device < 0) {
-    cpu_gemm(A.cpu_data(), B.cpu_data(), m, n, 1, Dtype(1), Dtype(0), false,
-        false, C->mutable_cpu_data());
-  } else {
-#ifdef USE_GPU
-    gpu_gemm(context->cublas_handle(device), A.gpu_data(), B.gpu_data(),
-        m, n, 1, Dtype(1), Dtype(0), false, false, C->mutable_gpu_data());
-#else
-    NO_GPU;
-#endif  // USE_GPU
-  }
-}
-/*********************** Element-wise functions ***********************/
-/**
- * Apply the function from Op for each element in A and put the result into B,
- * i.e., Bi = Op(Ai).
- * Loose shape checking, A.count() == B.count().
- */
-template<typename Op, typename Dtype>
-void Map(const Blob<Dtype> & A, Blob<Dtype> * B) {
-  CHECK_EQ(A.count(), B->count()) << "Blobs must have the same size";
-  auto context = Singleton<Context>::Instance();
-  int device = context->device_id(std::this_thread::get_id());
-  if (device < 0) {
-    cpu_e_f<Op>(A.count(), A.cpu_data(), B->mutable_cpu_data());
-  } else {
-#ifdef USE_GPU
-    gpu_e_f<Op>(A.count(), A.gpu_data(), B->mutable_gpu_data());
-#else
-    NO_GPU;
-#endif  // USE_GPU
-  }
-}
-
-/**
- * Apply the function from Op for each element in A and B, and put the result
- * into C, i.e., Ci = Op(Ai, Bi).
- * Loose shape checking, A, B and C are of the same size.
- */
-template<typename Op, typename Dtype>
-void Map(const Blob<Dtype> & A, const Blob<Dtype> & B, Blob<Dtype> * C) {
-  CHECK_EQ(A.count(), B.count()) << "Blobs must have the same size";
-  CHECK_EQ(A.count(), C->count()) << "Blobs must have the same size";
-  // cpu_e_f<Op>(A.count(), A.cpu_data(), B.cpu_data(), C->mutable_cpu_data());
-  auto context = Singleton<Context>::Instance();
-  int device = context->device_id(std::this_thread::get_id());
-  if (device < 0) {
-    cpu_e_f<Op>(A.count(), A.cpu_data(), B.cpu_data(), C->mutable_cpu_data());
-  } else {
-#ifdef USE_GPU
-    gpu_e_f<Op>(A.count(), A.gpu_data(), B.gpu_data(), C->mutable_gpu_data());
-#else
-    NO_GPU;
-#endif  // USE_GPU
-  }
-}
-
-/**
- * Bi = Op(alpha, Ai)
- * Loose shape checking, A.count() == B.count().
- */
-template<typename Op, typename Dtype>
-void Map(Dtype alpha, const Blob<Dtype>& A, Blob<Dtype>* B) {
-  CHECK_EQ(A.count(), B->count()) << "Blobs must have the same size";
-  auto context = Singleton<Context>::Instance();
-  int device = context->device_id(std::this_thread::get_id());
-  if (device < 0) {
-    cpu_e_f<Op>(A.count(), alpha, A.cpu_data(), B->mutable_cpu_data());
-  } else {
-#ifdef USE_GPU
-    gpu_e_f<Op>(A.count(), alpha, A.gpu_data(), B->mutable_gpu_data());
-#else
-    NO_GPU;
-#endif  // USE_GPU
-  }
-}
-/**
- * Ci = Op(alpha, Ai, Bi)
- * Loose shape checking, A, B and C are of the same size.
- */
-template<typename Op, typename Dtype>
-void Map(Dtype alpha, const Blob<Dtype>& A, const Blob<Dtype>& B,
-    Blob<Dtype>* C) {
-  CHECK_EQ(A.count(), B.count()) << "Blobs must have the same size";
-  auto context = Singleton<Context>::Instance();
-  int device = context->device_id(std::this_thread::get_id());
-  if (device < 0) {
-    cpu_e_f<Op>(A.count(), alpha, A.cpu_data(), B.cpu_data(),
-        C->mutable_cpu_data());
-  } else {
-    // TODO(wangwei) implement gpu version.
-    NO_GPU;
-  }
-}
-
-/**
- * Currently use std::copy which has shown better performance than memcpy.
- * http://stackoverflow.com/questions/4707012/c-memcpy-vs-stdcopy
- * TODO(wangwei) test blas copy vs std::copy.
- *
- * Loose shape checking, A.count() == B.count().
- */
-template<typename Dtype>
-void Copy(const Blob<Dtype>& A, Blob<Dtype>* B) {
-  CHECK_EQ(A.count(), B->count()) << "Blobs must have the same size";
-  auto context = Singleton<Context>::Instance();
-  int device = context->device_id(std::this_thread::get_id());
-  if (device < 0) {
-    std::copy(A.cpu_data(), A.cpu_data() + A.count(), B->mutable_cpu_data());
-  } else {
-#ifdef USE_GPU
-  CUDA_CHECK(cudaMemcpy(static_cast<Dtype*>(B->mutable_gpu_data()),
-             A.gpu_data(), sizeof(Dtype) * A.count(), cudaMemcpyDefault));
-#else
-  NO_GPU;
-#endif
-  }
-}
-
-
-/**
- * B = alpha + A
- * Implemented using Map with singa::op::Add.
- */
-template<typename Dtype>
-void Add(Dtype alpha,  const Blob<Dtype> & A, Blob<Dtype> * B) {
-  Map<singa::op::Add<Dtype>, Dtype>(alpha, A, B);
-}
-
-/**
- * C = A + B
- * Implemented using Copy and AXPY.
- */
-template<typename Dtype>
-void Add(const Blob<Dtype> & A, const Blob<Dtype> & B,
-    Blob<Dtype> * C) {
-  Copy(A, C);
-  AXPY(Dtype(1), B, C);
-}
-
-/**
- * B = alpha - A
- * Implemented using Map with singa::op::Sub.
- */
-template<typename Dtype>
-void Sub(Dtype alpha, const Blob<Dtype> & A, Blob<Dtype>* B) {
-  Map<singa::op::Sub<Dtype>, Dtype>(alpha, A, B);
-}
-
-/**
- * C = A - B
- * Implemented using Copy and AXPY.
- */
-template<typename Dtype>
-void Sub(const Blob<Dtype> & A, const Blob<Dtype> & B,
-    Blob<Dtype> * C) {
-  Copy(A, C);
-  AXPY(Dtype(-1), B, C);
-}
-
-/**
- * C = A * B, implemented using
- * Map(const Blob<Dtype>&, const Blob<Dtype>&, Blob<Dtype>*).
- */
-template<typename Dtype>
-void Mult(const Blob<Dtype> & A, const Blob<Dtype> & B,
-    Blob<Dtype> * C) {
-  Map<singa::op::Mult<Dtype>, Dtype>(A, B, C);
-  // TODO(wangwei) use MKL's vector func
-}
-
-/**
- * C = A / B, implemented using
- * Map(const Blob<Dtype>&, const Blob<Dtype>&, Blob<Dtype>*).
- */
-template<typename Dtype>
-void Div(const Blob<Dtype> & A, const Blob<Dtype> & B,
-    Blob<Dtype> * C) {
-  Map<singa::op::Div<Dtype>, Dtype>(A, B, C);
-  // TODO(wangwei) use MKL's vector func
-}
-/**
- * B = sqrt(A)
- */
-template<typename Dtype>
-void Sqrt(const Blob<Dtype> & A, Blob<Dtype>* B) {
-  Map<singa::op::Sqrt<Dtype>, Dtype>(A, B);
-}
-/**
- * B = square(A)
- */
-template<typename Dtype>
-void Square(const Blob<Dtype> & A, Blob<Dtype>* B) {
-  Map<singa::op::Square<Dtype>, Dtype>(A, B);
-}
-/**
- * B = exp(A)
- */
-template<typename Dtype>
-void Exp(const Blob<Dtype> & A, Blob<Dtype>* B) {
-  Map<singa::op::Exp<Dtype>, Dtype>(A, B);
-}
-/**
- * B = log(A)
- */
-template<typename Dtype>
-void Log(const Blob<Dtype>& A, Blob<Dtype>* B) {
-  Map<singa::op::Log<Dtype>, Dtype>(A, B);
-}
-/**
- * B = tanh(A)
- */
-template<typename Dtype>
-void Tanh(const Blob<Dtype>& A, Blob<Dtype>* B) {
-  Map<singa::op::Tanh<Dtype>, Dtype>(A, B);
-}
-/*************************1D<-->2D op/transform***************************/
-/**
- * Add A to each column of B, i.e., Bij = alpha*Ai + beta*Bij
- * Loose shape checking, B.count() % A.count() == 0.
- * # columns of B = B.count() / A.count().
- */
-template<typename Dtype>
-void MVAddCol(Dtype alpha, Dtype beta, const Blob<Dtype> & A, Blob<Dtype> * B) {
-  if (B->transpose()) {
-    B->set_transpose(false);
-    MVAddRow(alpha, beta, A, B);
-    B->set_transpose(true);
-  } else {
-    CHECK_EQ(B->count() % A.count(), 0) << "#col of B not match length of A";
-    int m = A.count(), n = B->count() / m;
-    Blob<Dtype> one(n);
-    one.SetValue(1);
-    auto context = Singleton<Context>::Instance();
-    int device = context->device_id(std::this_thread::get_id());
-    if (device < 0) {
-      cpu_gemm(A.cpu_data(), one.cpu_data(), m, n, 1, alpha, beta, false, false,
-          B->mutable_cpu_data());
-    } else {
-#ifdef USE_GPU
-      gpu_gemm(context->cublas_handle(device), A.gpu_data(), one.gpu_data(), m,
-          n, 1, alpha, beta, false, false, B->mutable_gpu_data());
-#else
-      NO_GPU;
-#endif  // USE_GPU
-    }
-  }
-}
-/**
- * Add A to each column of B, i.e., Bij = Ai + Bij
- * Loose shape checking, B.count() % A.count() == 0.
- * # columns of B = B.count() / A.count().
- */
-template<typename Dtype>
-void MVAddCol(const Blob<Dtype> & A, Blob<Dtype>* B) {
-  MVAddCol(Dtype(1), Dtype(1), A, B);
-}
-
-/**
- * Add A to each row of B, i.e., Bij = alpha*Aj + beta*Bij
- * Loose shape checking, B.count() % A.count() == 0.
- * # rows of B = B.count() / A.count().
- */
-template<typename Dtype>
-void MVAddRow(Dtype alpha, Dtype beta, const Blob<Dtype> & A, Blob<Dtype> * B) {
-  if (B->transpose()) {
-    B->set_transpose(false);
-    MVAddCol(alpha, beta, A, B);
-    B->set_transpose(true);
-  } else {
-    CHECK_EQ(B->count() % A.count(), 0) << "#col of B not match length of A";
-    int n = A.count(), m = B->count() / n;
-    auto context = Singleton<Context>::Instance();
-    int device = context->device_id(std::this_thread::get_id());
-    if (device < 0) {
-      Blob<Dtype> one(m);
-      one.SetValue(1);
-      cpu_gemm(one.cpu_data(), A.cpu_data(), m, n, 1, alpha, beta,
-          false, false, B->mutable_cpu_data());
-    } else {
-#ifdef USE_GPU
-      singa_gpu_add_vec_row(A.gpu_data(), B->gpu_data(), B->mutable_gpu_data(),
-          m, n, n);
-#else
-      NO_GPU;
-#endif  // USE_GPU
-    }
-  }
-}
-/**
- * Add A to each row of B, i.e., Bij = Aj + Bij
- * Loose shape checking, B.count() % A.count() == 0.
- * # rows of B = B.count() / A.count().
- */
-template<typename Dtype>
-void MVAddRow(const Blob<Dtype> & A, Blob<Dtype>* B) {
-  MVAddRow(Dtype(1), Dtype(1), A, B);
-}
-
-/**
- * Copy A to each column of B, i.e., Bij = Ai
- * Loose shape checking, B.count() % A.count() == 0,
- * # columns of B = B.count() / A.count().
- */
-template<typename Dtype>
-void RepmatCol(const Blob<Dtype> & A, Blob<Dtype> * B) {
-  MVAddCol(Dtype(1), Dtype(0), A, B);
-}
-
-/**
- * Copy A to each row of B, i.e., Bij = Aj
- * Loose shape checking, B.count() % A.count() == 0,
- * # rows of B = B.count() / A.count().
- */
-template<typename Dtype>
-void RepmatRow(const Blob<Dtype> & A, Blob<Dtype> * B) {
-  MVAddRow(Dtype(1), Dtype(0), A, B);
-}
-
-/**
- * Sum all columns of matrix A to a column vector B,
- * i.e., Bi = \sum_j {alpha*Aij}+beta*Bi
- * Loose shape checking, A.count() % B.count() == 0.
- * # columns of A = A.count() / B.count().
- */
-template<typename Dtype>
-void MVSumCol(Dtype alpha, Dtype beta, const Blob<Dtype> & A, Blob<Dtype> * B) {
-  CHECK_EQ(A.count() % B->count(), 0) << "length of B must = # of cols of A";
-  int m = B->count(), n = A.count() / m;
-  auto context = Singleton<Context>::Instance();
-  int device = context->device_id(std::this_thread::get_id());
-  if (device < 0) {
-    Blob<Dtype> one(n);
-    one.SetValue(1);
-    cpu_gemm(A.cpu_data(), one.cpu_data(), m, 1, n, alpha, beta,
-        A.transpose(), false, B->mutable_cpu_data());
-  } else {
-#ifdef USE_GPU
-    singa_gpu_sum_col(A.gpu_data(), B->mutable_gpu_data(), m, n, n);
-#else
-    NO_GPU;
-#endif  // USE_GPU
-  }
-}
-
-/**
- * Sum all rows of matrix A to a row vector B,
- * i.e., Bj = \sum_i {alpha*Aij}+beta*Bj
- * Loose shape checking, A.count() % B.count() == 0.
- * # rows of A = A.count() / B.count().
- */
-template<typename Dtype>
-void MVSumRow(Dtype alpha, Dtype beta, const Blob<Dtype> & A, Blob<Dtype> * B) {
-  CHECK_EQ(A.count() % B->count(), 0) << "length of B must = # of cols of A";
-  int n = B->count(), m = A.count() / n;
-  auto context = Singleton<Context>::Instance();
-  int device = context->device_id(std::this_thread::get_id());
-  if (device < 0) {
-    Blob<Dtype> one(m);
-    one.SetValue(1);
-    cpu_gemm(one.cpu_data(), A.cpu_data(), 1, n, m, alpha, beta, false,
-             A.transpose(), B->mutable_cpu_data());
-  } else {
-#ifdef USE_GPU
-    singa_gpu_sum_row(A.gpu_data(), B->mutable_gpu_data(), m, n, n);
-#else
-    NO_GPU;
-#endif  // USE_GPU
-  }
-}
-
-/**
- * Reduce each row of A to an element of B.
- * Loose shape checking, A.count() % B.count() == 0.
- * # columns of A = A.count() / B.count().
- */
-template<typename Op, typename Dtype>
-void Reduce2D(const Blob<Dtype> & A, Blob<Dtype> * B) {
-  CHECK_EQ(A.count() % B->count(), 0) << "Row size not match B length";
-  int m = B->count(), n = A.count() / m;
-  auto context = Singleton<Context>::Instance();
-  int device = context->device_id(std::this_thread::get_id());
-  if (device < 0) {
-    cpu_reduce_f<Op>(A.cpu_data(), m, n, B->mutable_cpu_data());
-  } else {
-#ifdef USE_GPU
-    gpu_reduce_f<Op>(A.gpu_data(), m, n, B->mutable_gpu_data());
-#else
-    NO_GPU;
-#endif  // USE_GPU
-  }
-}
-/**
- * Duplicate each element of A into a row of B.
- * Loose shape checking, B.count() % A.count() == 0.
- * # columns of B = B.count() / A.count().
- */
-template<typename Op, typename Dtype>
-void Expand2D(const Blob<Dtype> & A, Blob<Dtype> * B) {
-  CHECK_EQ(B->count() % A.count(), 0) << "Row size of B not match length of A";
-  int m = A.count(), n = B->count() / m;
-  auto context = Singleton<Context>::Instance();
-  int device = context->device_id(std::this_thread::get_id());
-  if (device < 0) {
-    cpu_expand_f<Op>(A.cpu_data(), m, n, B->mutable_cpu_data());
-  } else {
-#ifdef USE_GPU
-    gpu_expand_f<Op>(A.gpu_data(), m, n, B->mutable_gpu_data());
-#else
-    NO_GPU;
-#endif  // USE_GPU
-  }
-}
-
-/**
- * Return the average of the absolute values of the elements of A.
- */
-template<typename Dtype>
-Dtype Asum(const Blob<Dtype>& A) {
-  if (A.count() == 0) return Dtype(0);
-  auto context = Singleton<Context>::Instance();
-  int device = context->device_id(std::this_thread::get_id());
-  Dtype ret = Dtype(0);
-  if (device < 0) {
-    ret = cpu_asum(A.count(), A.cpu_data(), 1) / A.count();
-  } else {
-#ifdef USE_GPU
-    ret = gpu_asum(context->cublas_handle(device), A.count(), A.gpu_data(), 1)
-      / A.count();
-#else
-    NO_GPU;
-#endif
-  }
-  return ret;
-}
-
-
-/*************Random Sample***************/
-template<typename Dtype>
-void SampleUniform(Dtype low, Dtype high, Blob<Dtype>* A) {
-  auto context = Singleton<Context>::Instance();
-  const auto& thread = std::this_thread::get_id();
-  int device = context->device_id(thread);
-  if (device < 0) {
-    cpu_sample_uniform(*context->rand_generator(thread), A->count(), low, high,
-        A->mutable_cpu_data());
-  } else {
-#ifdef USE_GPU
-    gpu_sample_uniform(context->curand_generator(thread), A->count(), low, high,
-        A->mutable_gpu_data());
-#else
-    NO_GPU;
-#endif
-  }
-}
-
-template<typename Dtype>
-void SampleGaussian(Dtype mean, Dtype std, Blob<Dtype>* A) {
-  auto context = Singleton<Context>::Instance();
-  const auto& thread = std::this_thread::get_id();
-  int device = context->device_id(thread);
-  if (device < 0) {
-    cpu_sample_gaussian(*context->rand_generator(thread), A->count(), mean, std,
-        A->mutable_cpu_data());
-  } else {
-#ifdef USE_GPU
-    gpu_sample_gaussian(context->curand_generator(thread), A->count(),
-        mean, std, A->mutable_gpu_data());
-#else
-    NO_GPU;
-#endif
-  }
-}
-
-/************** Other functions ****************/
-template<typename Dtype>
-void Softmax(int nb_rows, const Blob<Dtype>& A, Blob<Dtype>* B) {
-  CHECK_GT(nb_rows, 0);
-  CHECK_EQ(A.count() % nb_rows, 0);
-  CHECK_EQ(A.count(), B->count());
-  auto context = Singleton<Context>::Instance();
-  int device = context->device_id(std::this_thread::get_id());
-  if (device < 0) {
-    cpu_softmax(nb_rows, A.count() / nb_rows, A.cpu_data(),
-      B->mutable_cpu_data());
-  } else {
-    // TODO(wangwei) implement the GPU version.
-    NO_GPU;
-  }
-}
-
-template<typename Dtype>
-void Zero(Blob<Dtype>* B) {
-  auto context = Singleton<Context>::Instance();
-  int device = context->device_id(std::this_thread::get_id());
-  if (device < 0) {
-    B->SetValue(0);
-  } else {
-#ifdef USE_GPU
-    cudaMemset(B->mutable_gpu_data(), 0, B->count() * sizeof(float));
-#else
-    NO_GPU;
-#endif  // USE_GPU
-  }
-}
-}  // end of namespace singa
-
-#endif  // SINGA_UTILS_MATH_BLOB_H_
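
A minimal sketch (not from this commit) of the Blob-level wrappers above. It assumes Blob<float>(n) constructs a 1-D blob, as the helpers above do internally, and that the calling thread is registered with the Context singleton as a CPU thread (device id < 0) so the CPU branches are taken.

#include <cstdio>
#include "singa/utils/math_blob.h"

void BlobMathExample() {
  singa::Blob<float> a(4), b(4);
  a.SetValue(2.0f);                   // a = [2, 2, 2, 2]
  b.SetValue(3.0f);                   // b = [3, 3, 3, 3]

  singa::AXPY(0.5f, a, &b);           // b = 0.5 * a + b -> all 4
  singa::Scale(2.0f, &b);             // b = 2 * b       -> all 8
  float ip = singa::VVDot(a, b);      // 4 * (2 * 8) = 64
  std::printf("dot = %.1f, asum(a) = %.1f\n", ip, singa::Asum(a));
}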

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/dd1e4afa/include/singa/utils/math_kernel.h
----------------------------------------------------------------------
diff --git a/include/singa/utils/math_kernel.h b/include/singa/utils/math_kernel.h
deleted file mode 100644
index 0239d3d..0000000
--- a/include/singa/utils/math_kernel.h
+++ /dev/null
@@ -1,88 +0,0 @@
-/************************************************************
-*
-* Licensed to the Apache Software Foundation (ASF) under one
-* or more contributor license agreements.  See the NOTICE file
-* distributed with this work for additional information
-* regarding copyright ownership.  The ASF licenses this file
-* to you under the Apache License, Version 2.0 (the
-* "License"); you may not use this file except in compliance
-* with the License.  You may obtain a copy of the License at
-*
-*   http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an
-* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-* KIND, either express or implied.  See the License for the
-* specific language governing permissions and limitations
-* under the License.
-*
-*************************************************************/
-#ifndef SINGA_UTILS_MATH_KERNEL_H_
-#define SINGA_UTILS_MATH_KERNEL_H_
-
-namespace singa {
-
-extern "C" {
-  void singa_gpu_softmaxloss_forward(int n, int dim, const float *prob,
-      const int *label, float *loss);
-
-  void singa_gpu_softmaxloss_backward(int n, int dim, float scale,
-      const int *label, float *grad);
-
-  void singa_gpu_sum_vec(float *data, float *sum, int n);
-
-  void singa_gpu_sum_col(const float *src_mat_data, float *dst_vec_data,
-    int rows, int cols, int stride);
-
-  void singa_gpu_sum_row(const float *src_mat_data, float *dst_vec_data,
-    int rows, int cols, int stride);
-
-  void singa_gpu_add_vec_row(const float *src_vec_data,
-    const float *src_mat_data, float *des_mat_data,
-    int rows, int cols, int stride);
-
-  void singa_gpu_exp(const float *src_data, float *des_data, int n);
-
-  void singa_gpu_log(const float *src_data, float *des_data, int n);
-
-  void singa_gpu_sigmoid(const float *src_data, float *des_data, int n);
-
-  void singa_gpu_sigmoid_grad(const float *src_data, float *des_data, int n);
-
-  void singa_gpu_relu(const float *src_data, float *des_data, int n);
-
-  void singa_gpu_relu_grad(const float *src_data, float *des_data, int n);
-
-  void singa_gpu_tanh(const float *src_data, float *des_data, int n);
-
-  void singa_gpu_tanh_grad(const float *src_data, float *des_data, int n);
-
-  void singa_gpu_softplus(const float *src_data, float *des_data, int n);
-
-  void singa_gpu_softplus_grad(const float *src_data, float *des_data, int n);
-
-  void singa_gpu_square(const float *src_data, float *des_data, int n);
-
-  void singa_gpu_square_grad(const float *src_data, float *des_data, int n);
-
-  void singa_gpu_sqrt(const float *src_data, float *des_data, int n);
-
-  void singa_gpu_pow(const float *src_data_a, const float *src_data_b,
-    float *des_data, int n);
-
-  void singa_gpu_mult(const float *src_data_a, const float *src_data_b,
-    float *des_data, int n);
-
-  void singa_gpu_div(const float *src_data_a, const float *src_data_b,
-    float *des_data, int n);
-
-  void singa_gpu_set_value(float *data, float value, int n);
-
-  void singa_gpu_threshold(const float *src_data, float *des_data,
-      float alpha, int n);
-};
-
-}  // namespace singa
-
-#endif  // SINGA_UTILS_MATH_KERNEL_H_
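
The kernels declared above are thin GPU wrappers that mostly share the
(src, dst, n) element-wise shape. A standalone CPU reference for two of them,
meant as an illustration of the expected semantics rather than the CUDA
implementation:

#include <algorithm>
#include <cmath>
#include <cstdio>

// CPU reference for singa_gpu_relu: dst[i] = max(src[i], 0).
void cpu_relu(const float* src, float* dst, int n) {
  for (int i = 0; i < n; ++i) dst[i] = std::max(src[i], 0.0f);
}

// CPU reference for singa_gpu_sigmoid: dst[i] = 1 / (1 + exp(-src[i])).
void cpu_sigmoid(const float* src, float* dst, int n) {
  for (int i = 0; i < n; ++i) dst[i] = 1.0f / (1.0f + std::exp(-src[i]));
}

int main() {
  float x[4] = {-1.0f, 0.0f, 0.5f, 2.0f};
  float y[4];
  cpu_relu(x, y, 4);
  std::printf("relu(-1) = %.1f, relu(2) = %.1f\n", y[0], y[3]);  // 0.0, 2.0
  cpu_sigmoid(x, y, 4);
  std::printf("sigmoid(0) = %.1f\n", y[1]);                      // 0.5
  return 0;
}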

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/dd1e4afa/include/singa/utils/param.h
----------------------------------------------------------------------
diff --git a/include/singa/utils/param.h b/include/singa/utils/param.h
deleted file mode 100644
index 319f2b4..0000000
--- a/include/singa/utils/param.h
+++ /dev/null
@@ -1,407 +0,0 @@
-/************************************************************
-*
-* Licensed to the Apache Software Foundation (ASF) under one
-* or more contributor license agreements.  See the NOTICE file
-* distributed with this work for additional information
-* regarding copyright ownership.  The ASF licenses this file
-* to you under the Apache License, Version 2.0 (the
-* "License"); you may not use this file except in compliance
-* with the License.  You may obtain a copy of the License at
-*
-*   http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an
-* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-* KIND, either express or implied.  See the License for the
-* specific language governing permissions and limitations
-* under the License.
-*
-*************************************************************/
-
-#ifndef SINGA_UTILS_PARAM_H_
-#define SINGA_UTILS_PARAM_H_
-
-#include <memory>
-#include <string>
-#include <vector>
-
-#include "singa/comm/msg.h"
-#include "singa/proto/job.pb.h"
-#include "singa/utils/blob.h"
-
-namespace singa {
-using std::vector;
-/**
- * Base parameter generator which initializes parameter values.
- */
-class ParamGenerator {
- public:
-  static ParamGenerator* Create(const ParamGenProto& proto);
-
-  virtual ~ParamGenerator() {}
-
-  virtual void Init(const ParamGenProto& proto) { proto_ = proto; }
-  virtual void Fill(Blob<float>* data);
-
- protected:
-  ParamGenProto proto_;
-};
-
-class GaussianGen : public ParamGenerator {
- public:
-  void  Fill(Blob<float>* data) override;
-};
-
-class GaussianSqrtFanInGen : public GaussianGen {
- public:
-  void  Fill(Blob<float>* data) override;
-};
-
-class UniformGen : public ParamGenerator {
- public:
-  void  Fill(Blob<float>* data) override;
-};
-
-class UniformSqrtFanInGen : public UniformGen {
- public:
-  void Fill(Blob<float>* data) override;
-};
-
-class UniformSqrtFanInOutGen : public UniformGen {
- public:
-  void Fill(Blob<float>* data) override;
-};
-
-/**
- * Base parameter class.
- *
- * The Param object is a set of parameters, e.g., the (sub) weight matrix or
- * (sub) bias vector.
- *
- * It has a gradient Blob and a data Blob for gradients and parameter values.
- * Since some layers (or neuralnet) share parameter values, the data Blob is a
- * shared pointer which can be assigned to many Param objects' data field.
- *
- * It provides access methods like data(), grad(). It also provides functions
- * for generating messages and parsing messages to transferring the Param
- * objects among worker-worker, worker-server and server-server.
- *
- * Param objects are of different sizes, which makes it hard to achieve
- * load-balance among servers. Hence, we slice large Param objects into small
- * pieces. At the server side, one slice is a Param object.
- */
-class Param {
- public:
-  /**
-   * Create an instance of (sub) Param class based on the type from the
-   * configuration.
-   *
-   * @param[in] conf configuration
-   * @return a pointer to an instance
-   */
-  static Param* Create(const ParamProto& conf);
-
-  /**
-   * Try to slice the Param objects (from a neural net) into a given number of
-   * servers (groups) evenly. This is to achieve load-balance among servers.
-   *
-   * It does not change the Param objects, but just computes the length of each
-   * slice.
-   *
-   * @param num number of servers (groups) for maintaining the Param objects.
-   * @param params all Param objects from a neural net.
-   * @return the length of each slice.
-   */
-  static const vector<int> ComputeSlices(int num, const vector<Param*>& params);
-  /**
-   * It computes the length of each slice and slices the Param objects by adding
-   * the slicing information into every Param object.
-   *
-   * @copydetails ComputeSlices()
-   */
-  static void SliceParams(int num, const vector<Param*>& params);
-
-  Param() {}
-  virtual ~Param() {}
-  void Init(const ParamProto& proto) { proto_ = proto; }
-  /**
-   * Setup param object
-   *
-   * @param conf param configuration, include learning rate multiplier etc.
-   * @param shape one value per dimension
-   */
-  virtual void Setup(const std::vector<int>& shape);
-  /**
-   * Fill the values according to init method, e.g., gaussian distribution.
-   *
-   * @param version initial version
-   */
-  virtual void InitValues();
-  virtual void InitValues(int version);
-  /**
-   * Share the data blob from other Param objects.
-   *
-   * @param other the Param object whose owner owns the data blob
-   * @param cpu_only if true, share only cpu memory (used for training with
-   * multi-gpu cards); else, share both cpu and gpu memory.
-   */
-  void ShareDataFrom(Param* other, bool cpu_only);
-  /**
-   * Share both data and grad from other param
-   */
-  void ShareFrom(Param* other);
-  /**
-   * Init param values from checkpoint blob.
-   */
-  void FromProto(const BlobProto& blob);
-  void FromProto(const std::string str);
-  /**
-   * Dump param values to blob.
-   */
-  void ToProto(BlobProto* blob);
-  /**
-   * Add a slice
-   *
-   * @param slice_id
-   * @param size num of floats for this slice
-   */
-  void AddSlice(int slice_id, int size);
-  /**
-   * Scale the learning rate when updating parameters in the Param object
-   */
-  inline float lr_scale() const { return proto_.lr_scale(); }
-  /**
-   * Scale the weight decay when updating parameters in the Param object
-   */
-  inline float wd_scale() const { return proto_.wd_scale(); }
-  /**
-   * Parameter name used for Param re-use in other model or sharing between
-   * layers
-   */
-  inline const std::string& name() const { return proto_.name(); }
-  inline void set_name(const std::string& name) { proto_.set_name(name); }
-  /**
-   * If it shares data from others, then owner is the id of that Param,
-   * otherwise it is its own id.
-   */
-  inline int owner() const { return proto_.owner(); }
-  /**
-   * IDs start from 0 and are ordered for all Params from the same neuralnet
-   */
-  inline int id() const { return proto_.id(); }
-  /**
-   * Set ID
-   */
-  inline void set_id(int id) {
-    proto_.set_id(id);
-    proto_.set_owner(id);
-  }
-  inline int version() const { return version_; }
-  inline void set_version(int v) { version_ = v; }
-  /**
-   * @return the version of the Param when the last Update request was issued.
-   */
-  inline int last_version() const { return last_version_; }
-  inline void set_last_version(int v) { last_version_ = v; }
-
-  /**
-   * @return the sharing Param name which is configured by users in conf file.
-   */
-  inline const std::string& share_from() const { return proto_.share_from(); }
-   /**
-    * @return num of parameters in this Param obj.
-    */
-  inline const std::vector<int>& shape() const { return data_.shape(); }
-  inline int size() const { return data_.count(); }
-  inline const Blob<float>& data() const { return data_; }
-  inline Blob<float>* mutable_data() { return &data_; }
-  inline const Blob<float> &grad() const { return grad_; }
-  inline Blob<float> *mutable_grad() { return &grad_; }
-  inline float* mutable_cpu_data() { return data_.mutable_cpu_data(); }
-  inline float* mutable_cpu_grad() { return grad_.mutable_cpu_data(); }
-  inline float* mutable_cpu_history() { return history_.mutable_cpu_data(); }
-  inline float* mutable_cpu_update() { return update_.mutable_cpu_data(); }
-  /**
-   * @return slice start ID
-   */
-  inline int slice_start() const { return slice_start_; }
-  inline int num_slices() const { return num_slices_; }
-
-  /**
-   * Below are message/request related functions.
-   * The basic communication workflows are as follow:
-   *------------------------------------------------------------------------
-   *         |Put         |Get           |Update           |Sync
-   *------------------------------------------------------------------------
-   * Generate|(stub)      |(stub)        |(stub)           |(server)
-   * Message |GenPutMsg   |GenGetMsg     |GenUpdateMsg     |GenSyncMsg
-   *------------------------------------------------------------------------
-   * Handle  |(server)    |(server)      |(server)         |(server)
-   * Message |HandlePutMsg|HandleGetMsg  |ParseUpdateMsg   |HandleSyncMsg
-   *         |            |              |GenUpdateResMsg  |
-   *------------------------------------------------------------------------
-   * Handle  |            |(stub)        |(stub)           |(server)
-   * Response|            |ParseGetResMsg|ParseUpdateResMsg|ParseSyncResMsg
-   *------------------------------------------------------------------------
-   */
-
-  /**
-   * Generate the message for a put request, i.e., put parameters to a server
-   *
-   * This function is called at worker/stub side.
-   * @param copy decides whether to copy the parameter values from the server.
-   * @param slice_idx index of the slice from which the message is generated.
-   * @return generated message without setting src, dst, target fields.
-   */
-  virtual Msg* GenPutMsg(bool copy, int slice_idx);
-  /**
-   * Generate the message for a get request, i.e., get parameters from a server
-   * \copydetails GenPutMsg(bool, int);
-   */
-  virtual Msg* GenGetMsg(bool copy, int slice_idx);
-  /**
-   * Generate the message for an update request, i.e., pass info to server for
-   * parameter update.
-   * \copydetails GenPutMsg(bool, int);
-   */
-  virtual Msg* GenUpdateMsg(bool copy, int slice_idx);
-  /**
-   * Generate the message for a synchronization request between server groups.
-   *
-   * This function is called at server side where the Param is actually a slice
-   * of an original Param object.
-   * */
-  virtual Msg* GenSyncMsg(int offset, int size);
-  /**
-   * Server handling function for put request.
-   *
-   * @param msg request
-   * @param reserve if true reserve the msg space for the calling function;
-   * otherwise the msg should be freed inside the function.
-   * @return response message
-   */
-  virtual Msg* HandlePutMsg(Msg** msg, bool reserve);
-  /**
-   * Server handling function for get request.
-   *
-   * \copydetails HandleGetMsg(Msg**, bool reserve)
-   */
-  virtual Msg* HandleGetMsg(Msg** msg, bool reserve);
-  /**
-   * Server parsing function for update requests.
-   * \copydetails GenUpdateResponseMsgs(const std::vector<Msg*>& msgs);
-   */
-  virtual void ParseUpdateMsgs(const std::vector<Msg*>& msgs);
-  /**
-   * Generate the messages to response the update requests.
-   *
-   * This function is called at the server side, where the Param is actually a
-   * slice of an original Param object.
-   *
-   * @param msgs for synchronous training, there may be multiple procs whose
-   * workers share the same Param (slice) objects; their update requests are
-   * buffered and handled together. For asynchronous training, there is only
-   * one request in msgs.
-   * @return response messages
-   */
-  virtual const std::vector<Msg*>
-    GenUpdateResponseMsgs(std::vector<Msg*>* msgs, bool reserve);
-  /**
-   * Server handling function for synchronization message
-   *
-   * \copydetails HandleGetMsg(Msg**, bool reserve)
-   */
-  virtual Msg* HandleSyncMsg(Msg** msg, bool reserve);
-  /**
-   * Worker/Stub parsing function for get response.
-   *
-   * @param msg
-   * @param slice_idx index for the slice
-   */
-  virtual int ParseGetResponseMsg(Msg* msg, int slice_idx);
-  /**
-   * Worker/Server parsing function for update response
-   *
-   * \copydetails ParseGetResponseMsg(Msg*, int);
-   */
-  virtual int ParseUpdateResponseMsg(Msg* msg, int slice_idx);
-  /**
-   * Server parsing function for synchronization response.
-   *
-   * \copydetails ParseGetResponseMsg(Msg*, int);
-   */
-  virtual int ParseSyncResponseMsg(Msg* msg, int slice_idx);
-
- protected:
-  /**
-   * Implement the common code of ParseGetResponseMsg and ParseUpdateResponseMsg
-   * \copydetails ParseSyncResponseMsg(Msg* msg, int slice_idx);
-   */
-  void ParseResponseMsg(Msg* msg, int slice_idx);
-
- protected:
-  //!< param version updated by the Update/Sync/Get response
-  //!< only the owner param is initialized.
-  int version_ = -1;
-  //!< param version before last Update/Sync/Get request, set from version_
-  int last_version_ = -1;
-  //!< the global ID of the first slice
-  int slice_start_ = 0;
-  //!< total num of slices for this Param obj
-  int num_slices_ = 0;
-  // offset and size of each slice
-  std::vector<int> slice_offset_;
-  std::vector<int> slice_size_;
-  // for debug. Put request has no feedback, we do not track its pending status
-  std::vector<bool> pending_get_;
-  std::vector<bool> pending_update_;
-  int num_pending_requests_ = 0;
-  // data, gradient, history gradient of this parameter
-  Blob<float> data_, grad_, history_, update_;
-  ParamProto proto_;
-};
-
-/**
- * ParamEntry is used for aggregating gradients of Params shared by workers from
- * the same group.
- *
- * For each worker group, every unique Param object has a ParamEntry object.
- * Param objects sharing the same values are associated with the same
- * ParamEntry.
- */
-class ParamEntry {
- public:
-  ParamEntry() {}
-  ParamEntry(int total, Param* p);
-  /**
-   * Associate the counter to a Param object.
-   *
-   * @param p the Param object to associate
-   * @param local true if the Param is used by workers in this process,
-   * false otherwise
-   */
-  void AddParam(bool local, Param* p);
-  int next_version = -1;  // next_version & num_update are directly used by stub
-  int num_update = 0;
-  int num_local = 0;  //!< # local workers using the shared parameter
-  int num_total = 0;  //!< # total workers using the shared parameter
-  //!< Shares are deleted by neuralnet's destructor
-  std::vector<Param*> shares;
-};
-
-inline int ParamTrgt(int param_id, int slice_id) {
-  return (param_id << 16) | slice_id;
-}
-
-inline int ParamID(int param_trgt) {
-  return param_trgt >> 16;
-}
-
-inline int SliceID(int param_trgt) {
-  static const int mask = (1 << 16) - 1;
-  return param_trgt & mask;
-}
-
-}  // namespace singa
-
-#endif  // SINGA_UTILS_PARAM_H_
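
The three inline helpers at the end of this header encode a message target as
(param_id << 16) | slice_id, which implicitly assumes both ids fit in 16 bits.
A small standalone round-trip check of that encoding:

#include <cassert>
#include <cstdio>

inline int ParamTrgt(int param_id, int slice_id) {
  return (param_id << 16) | slice_id;
}
inline int ParamID(int param_trgt) { return param_trgt >> 16; }
inline int SliceID(int param_trgt) { return param_trgt & ((1 << 16) - 1); }

int main() {
  int trgt = ParamTrgt(3, 5);   // 3 * 65536 + 5 = 196613
  assert(ParamID(trgt) == 3);
  assert(SliceID(trgt) == 5);
  std::printf("trgt = %d\n", trgt);
  return 0;
}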

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/dd1e4afa/include/singa/utils/singa_op.h
----------------------------------------------------------------------
diff --git a/include/singa/utils/singa_op.h b/include/singa/utils/singa_op.h
deleted file mode 100644
index 7499eb1..0000000
--- a/include/singa/utils/singa_op.h
+++ /dev/null
@@ -1,299 +0,0 @@
-/************************************************************
-*
-* Licensed to the Apache Software Foundation (ASF) under one
-* or more contributor license agreements.  See the NOTICE file
-* distributed with this work for additional information
-* regarding copyright ownership.  The ASF licenses this file
-* to you under the Apache License, Version 2.0 (the
-* "License"); you may not use this file except in compliance
-* with the License.  You may obtain a copy of the License at
-*
-*   http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an
-* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-* KIND, either express or implied.  See the License for the
-* specific language governing permissions and limitations
-* under the License.
-*
-*************************************************************/
-
-#ifndef SINGA_UTILS_SINGA_OP_H_
-#define SINGA_UTILS_SINGA_OP_H_
-
-#include <cmath>
-#include <algorithm>
-
-#ifdef USE_GPU
-#include <cuda_runtime.h>
-#include <cublas_v2.h>
-#include "singa/utils/math_kernel.h"
-#endif  // USE_GPU
-
-namespace singa {
-
-namespace op {
-
-/**
- * b = e^a
- */
-template<typename Dtype>
-struct Exp {
-  inline static void Map(const Dtype & a, Dtype * b) {
-    *b = exp(a);
-  }
-#ifdef USE_GPU
-  inline static void CudaMap(const Dtype * a, Dtype * b, int n) {
-    singa::singa_gpu_exp(a, b, n);
-  }
-#endif  // USE_GPU
-};
-/**
- * b = log(a), base is e
- */
-template<typename Dtype>
-struct Log {
-  inline static void Map(const Dtype & a, Dtype *b) {
-    *b = log(a);
-  }
-#ifdef USE_GPU
-  inline static void CudaMap(const Dtype * a, Dtype * b, int n) {
-    singa::singa_gpu_log(a, b, n);
-  }
-#endif  // USE_GPU
-};
-
-template<typename Dtype>
-struct Sigmoid {
-  inline static void Map(const Dtype & a, Dtype * b) {
-    *b = 1.0f / (1.0f + expf(-a));
-  }
-#ifdef USE_GPU
-  inline static void CudaMap(const Dtype * a, Dtype * b, int n) {
-    singa::singa_gpu_sigmoid(a, b, n);
-  }
-#endif  // USE_GPU
-};
-template<typename Dtype>
-struct SigmoidGrad {
-  inline static void Map(const Dtype & a, Dtype * b) {
-    *b = a * (1.0f - a);
-  }
-#ifdef USE_GPU
-  inline static void CudaMap(const Dtype * a, Dtype * b, int n) {
-    singa::singa_gpu_sigmoid_grad(a, b, n);
-  }
-#endif  // USE_GPU
-};
-
-template<typename Dtype>
-struct Relu {
-  inline static void Map(const Dtype & a, Dtype * b) {
-    *b = std::max(a, 0.0f);
-  }
-#ifdef USE_GPU
-  inline static void CudaMap(const Dtype * a, Dtype * b, int n) {
-    singa::singa_gpu_relu(a, b, n);
-  }
-#endif  // USE_GPU
-};
-
-template<typename Dtype>
-struct ReluGrad {
-  inline static void Map(const Dtype & a, Dtype * b) {
-    *b = a > 0 ? 1 : 0;
-  }
-#ifdef USE_GPU
-  inline static void CudaMap(const Dtype * a, Dtype * b, int n) {
-    singa::singa_gpu_relu_grad(a, b, n);
-  }
-#endif  // USE_GPU
-};
-
-template<typename Dtype>
-struct Tanh {
-  inline static void Map(const Dtype & a, Dtype * b) {
-    *b = tanhf(a);
-  }
-#ifdef USE_GPU
-  inline static void CudaMap(const Dtype * a, Dtype * b, int n) {
-    singa::singa_gpu_tanh(a, b, n);
-  }
-#endif  // USE_GPU
-};
-
-template<typename Dtype>
-struct TanhGrad {
-  inline static void Map(const Dtype & a, Dtype * b) {
-    *b = 1 - a * a;
-  }
-#ifdef USE_GPU
-  inline static void CudaMap(const Dtype * a, Dtype * b, int n) {
-    singa::singa_gpu_tanh_grad(a, b, n);
-  }
-#endif  // USE_GPU
-};
-
-template<typename Dtype>
-struct Softplus {
-  inline static void Map(const Dtype & a, Dtype * b) {
-    *b = logf(1 + expf(a));
-  }
-#ifdef USE_GPU
-  inline static void CudaMap(const Dtype * a, Dtype * b, int n) {
-    singa::singa_gpu_softplus(a, b, n);
-  }
-#endif  // USE_GPU
-};
-
-template<typename Dtype>
-struct SoftplusGrad {
-  inline static void Map(const Dtype & a, Dtype * b) {
-    *b = 1.0f / (1.0f + expf(-a));
-  }
-#ifdef USE_GPU
-  inline static void CudaMap(const Dtype * a, Dtype * b, int n) {
-    singa::singa_gpu_softplus_grad(a, b, n);
-  }
-#endif  // USE_GPU
-};
-
-template<typename Dtype>
-struct Square {
-  inline static void Map(const Dtype & a, Dtype * b) {
-    *b = a * a;
-  }
-#ifdef USE_GPU
-  inline static void CudaMap(const Dtype * a, Dtype * b, int n) {
-    singa::singa_gpu_square(a, b, n);
-  }
-#endif  // USE_GPU
-};
-
-template<typename Dtype>
-struct SquareGrad {
-  inline static void Map(const Dtype & a, Dtype * b) {
-    *b = 2 * sqrt(a);
-  }
-#ifdef USE_GPU
-  inline static void CudaMap(const Dtype * a, Dtype * b, int n) {
-    singa::singa_gpu_square_grad(a, b, n);
-  }
-#endif  // USE_GPU
-};
-
-template<typename Dtype>
-struct Sqrt {
-  inline static void Map(const Dtype & a, Dtype * b) {
-    *b = sqrt(a);
-  }
-#ifdef USE_GPU
-  inline static void CudaMap(const Dtype * a, Dtype * b, int n) {
-    singa::singa_gpu_sqrt(a, b, n);
-  }
-#endif  // USE_GPU
-};
-
-/*********************************************************************/
-/**
- * c = pow(a, b), i.e., c = a^b
- */
-template<typename Dtype>
-struct Pow {
-  inline static void Map(const Dtype & a, const Dtype &b, Dtype * c) {
-    *c = pow(a, b);
-  }
-#ifdef USE_GPU
-  inline static void CudaMap(const Dtype * a,
-      const Dtype * b, Dtype * c, int n) {
-    singa::singa_gpu_pow(a, b, c, n);
-  }
-#endif  // USE_GPU
-};
-
-template<typename Dtype>
-struct Add {
-  inline static void Map(const Dtype & a, const Dtype & b, Dtype * c) {
-    *c =  a + b;
-  }
-#ifdef USE_GPU
-  inline static void CudaMap(const Dtype * a,
-      const Dtype * b, Dtype * c, int n) {
-//    singa::singa_gpu_add(a, b, c, n); // TODO(haibo)
-  }
-#endif  // USE_GPU
-};
-
-template<typename Dtype>
-struct Sub {
-  inline static void Map(const Dtype & a, const Dtype & b, Dtype * c) {
-    *c =  a - b;
-  }
-#ifdef USE_GPU
-  inline static void CudaMap(const Dtype * a,
-      const Dtype * b, Dtype * c, int n) {
-//    singa::singa_gpu_sub(a, b, c, n);  // TODO(haibo)
-  }
-#endif  // USE_GPU
-};
-
-
-template<typename Dtype>
-struct Mult {
-  inline static void Map(const Dtype & a, const Dtype & b, Dtype * c) {
-    *c =  a * b;
-  }
-#ifdef USE_GPU
-  inline static void CudaMap(const Dtype * a,
-      const Dtype * b, Dtype * c, int n) {
-    singa::singa_gpu_mult(a, b, c, n);
-  }
-#endif  // USE_GPU
-};
-
-template<typename Dtype>
-struct Div {
-  inline static void Map(const Dtype & a, const Dtype & b, Dtype * c) {
-    *c =  a / b;
-  }
-#ifdef USE_GPU
-  inline static void CudaMap(const Dtype * a,
-      const Dtype * b, Dtype * c, int n) {
-    singa::singa_gpu_div(a, b, c, n);
-  }
-#endif  // USE_GPU
-};
-
-
-/*********************************************************************/
-template<typename Dtype>
-struct Set {
-  inline static void Map(Dtype alpha, Dtype * a) {
-    *a = alpha;
-  }
-#ifdef USE_GPU
-  inline static void CudaMap(Dtype alpha, Dtype * a, int n) {
-    singa::singa_gpu_set_value(a, alpha, n);
-  }
-#endif  // USE_GPU
-};
-
-template<typename Dtype>
-struct Threshold {
-  inline static void Map(Dtype alpha, const Dtype & a, Dtype * b) {
-    *b =  a < alpha ? 1.0f : 0.0f;
-  }
-#ifdef USE_GPU
-  inline static void CudaMap(Dtype alpha,  const Dtype * a,
-      Dtype * b, int n) {
-    singa::singa_gpu_threshold(a, b, alpha, n);
-  }
-#endif  // USE_GPU
-};
-
-};  // namespace op
-
-};  // namespace singa
-
-#endif  // SINGA_UTILS_SINGA_OP_H_
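
Each Op struct above exposes a scalar Map() for the CPU path and, under
USE_GPU, a CudaMap() over n elements; the blob math helpers instantiate them
inside generic element-wise loops. A standalone sketch of that pattern using
op::Sigmoid (the helper name apply_cpu is invented for this example; SINGA's
own loop lives in its blob math headers):

#include <cmath>
#include <cstdio>

namespace op {
template <typename Dtype>
struct Sigmoid {
  inline static void Map(const Dtype& a, Dtype* b) {
    *b = 1.0f / (1.0f + std::exp(-a));
  }
};
}  // namespace op

// Generic element-wise loop: apply Op::Map to each of the n elements.
template <typename Op, typename Dtype>
void apply_cpu(const Dtype* src, Dtype* dst, int n) {
  for (int i = 0; i < n; ++i) Op::Map(src[i], &dst[i]);
}

int main() {
  float x[3] = {-2.0f, 0.0f, 2.0f};
  float y[3];
  apply_cpu<op::Sigmoid<float>, float>(x, y, 3);
  std::printf("%f %f %f\n", y[0], y[1], y[2]);  // ~0.119 0.500 ~0.881
  return 0;
}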

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/dd1e4afa/include/singa/utils/updater.h
----------------------------------------------------------------------
diff --git a/include/singa/utils/updater.h b/include/singa/utils/updater.h
deleted file mode 100644
index 33ad8a7..0000000
--- a/include/singa/utils/updater.h
+++ /dev/null
@@ -1,173 +0,0 @@
-/************************************************************
-*
-* Licensed to the Apache Software Foundation (ASF) under one
-* or more contributor license agreements.  See the NOTICE file
-* distributed with this work for additional information
-* regarding copyright ownership.  The ASF licenses this file
-* to you under the Apache License, Version 2.0 (the
-* "License"); you may not use this file except in compliance
-* with the License.  You may obtain a copy of the License at
-*
-*   http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an
-* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-* KIND, either express or implied.  See the License for the
-* specific language governing permissions and limitations
-* under the License.
-*
-*************************************************************/
-
-#ifndef SINGA_UTILS_UPDATER_H_
-#define SINGA_UTILS_UPDATER_H_
-
-#include <string>
-#include "singa/proto/job.pb.h"
-#include "singa/utils/param.h"
-#include "singa/neuralnet/layer.h"
-
-namespace singa {
-using std::string;
-/**
- * Base learning rate generator.
- *
- * Generate the learning rate for a given training step/iteration.
- * There are many different ways to change the learning rate through time/step.
- * Users can inherit this class to implement their own schedule.
- */
-class LRGenerator {
- public:
-  static LRGenerator* Create(const LRGenProto& proto);
-
-  virtual ~LRGenerator() {}
-
-  virtual void Init(const LRGenProto& proto) { proto_ = proto; }
-  /**
-   * @param step training step/iteration.
-   * @return base learning rate regardless of step
-   */
-  virtual float Get(int step) { return proto_.base_lr(); }
-
- protected:
-  LRGenProto proto_;
-};
-
-class FixedStepLRGen : public LRGenerator {
- public:
-  float Get(int step) override;
- private:
-  int last_idx_ = 0;
-};
-
-class StepLRGen : public LRGenerator {
- public:
-  float Get(int step) override;
-};
-
-class LinearLRGen : public LRGenerator {
- public:
-  float Get(int step) override;
-};
-
-class ExpLRGen : public LRGenerator {
- public:
-  float Get(int step) override;
-};
-
-class InvLRGen : public LRGenerator {
- public:
-  float Get(int step) override;
-};
-
-class InvTLRGen : public LRGenerator {
- public:
-  float Get(int step) override;
-};
-
-/**
- * Updater for Param.
- */
-class Updater {
- public:
-
-  /* added for python binding */
-  static Updater* CreateUpdater(const string str);
-  /* ------------------------ */
-
-  static Updater* Create(const UpdaterProto& proto);
-
-  virtual ~Updater() {}
-
-  virtual void Init(const UpdaterProto &proto);
-  virtual void Update(int step, Param* param, float grad_scale) = 0;
-  void Clip(const float low, const float high, Param* param);
- protected:
-  UpdaterProto proto_;
-  LRGenerator* lr_gen_;
-  float weight_decay_;
-  float momentum_;
-  float clip_low_, clip_high_;
-};
-
-class SGDUpdater : public Updater {
- public:
-  void Update(int step, Param* param, float grad_scale) override;
-};
-
-class AdaGradUpdater : public Updater {
- public:
-  void Update(int step, Param* param, float grad_scale) override;
-};
-
-
-class NesterovUpdater : public Updater {
- public:
-  void Update(int step, Param* param, float grad_scale) override;
-};
-
-class RMSPropUpdater : public Updater {
- public:
-  void Init(const UpdaterProto &proto) override;
-  void Update(int step, Param* param, float grad_scale) override;
-
- protected:
-  float rho_;
-  float delta_;
-};
-
-class AdaDeltaUpdater : public Updater {
- public:
-  void Init(const UpdaterProto &proto) override;
-  void Update(int step, Param* param, float grad_scale) override;
-
- protected:
-  float rho_;
-  float delta_;
-};
-
-class AdamUpdater : public Updater {
- public:
-  void Init(const UpdaterProto &proto) override;
-  void Update(int step, Param* param, float grad_scale) override;
-
- protected:
-  float beta1_;
-  float beta2_;
-  float delta_;
-};
-
-class AdamMaxUpdater : public Updater {
- public:
-  void Init(const UpdaterProto &proto) override;
-  void Update(int step, Param* param, float grad_scale) override;
-
- protected:
-  float beta1_;
-  float beta2_;
-  float delta_;
-};
-
-}  // namespace singa
-
-#endif  // SINGA_UTILS_UPDATER_H_
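
SGDUpdater and the other subclasses only declare Update() here; the actual
rules live in the matching .cc files. As an illustration of the conventional
momentum-plus-weight-decay step such an updater typically applies (an
assumption about the math, not a copy of SINGA's implementation):

#include <cstdio>
#include <vector>

// One vanilla SGD step: history = momentum * history + lr * (grad + wd * w),
// then w -= history.
void sgd_update(float lr, float momentum, float weight_decay,
                std::vector<float>* data, const std::vector<float>& grad,
                std::vector<float>* history) {
  for (size_t i = 0; i < data->size(); ++i) {
    float g = grad[i] + weight_decay * (*data)[i];
    (*history)[i] = momentum * (*history)[i] + lr * g;
    (*data)[i] -= (*history)[i];
  }
}

int main() {
  std::vector<float> w = {1.0f}, g = {0.5f}, h = {0.0f};
  sgd_update(0.1f, 0.9f, 0.0f, &w, g, &h);
  std::printf("w = %f\n", w[0]);  // 0.95
  return 0;
}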