You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@singa.apache.org by wa...@apache.org on 2016/06/03 07:48:36 UTC
[31/60] incubator-singa git commit: SINGA-167 - Add Tensor Math function APIs

SINGA-167 - Add Tensor Math function APIs

Add basic linalg functions for Tensor

Add blas functions for Tensor.

Unify gemm and gemv in Tensor::Mult

This commit also contains code for the Param class, which would be removed in the next commit.


Project: http://git-wip-us.apache.org/repos/asf/incubator-singa/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-singa/commit/02851fac
Tree: http://git-wip-us.apache.org/repos/asf/incubator-singa/tree/02851fac
Diff: http://git-wip-us.apache.org/repos/asf/incubator-singa/diff/02851fac

Branch: refs/heads/dev
Commit: 02851fac11ae6455b60d1cd5be4c2b6f142696cf
Parents: e36bc92
Author: Wei Wang <wa...@comp.nus.edu.sg>
Authored: Fri May 13 21:00:48 2016 +0800
Committer: wangwei <wa...@gmail.com>
Committed: Tue May 17 00:40:23 2016 +0800

----------------------------------------------------------------------
 CMakeLists.txt                       |   2 +-
 include/singa/core/math.h            | 273 ---------------------
 include/singa/core/tensor.h          | 285 +++++++++++-----------
 include/singa/model/layer.h          |  23 +-
 include/singa/model/param.h          |  97 ++++++++
 src/core/device/device.cc            |   1 +
 src/core/math/cpp_math.cc            |  54 -----
 src/core/math/cuda_math.cc           |  48 ----
 src/core/math/opencl_math.cc         |  24 --
 src/core/tensor/tensor.cc            | 379 ++++++++++++++++++++++++++----
 src/core/tensor/tensor_math.h        | 302 ++++++++++++++++++++++++
 src/core/tensor/tensor_math_cpp.h    |  57 +++++
 src/core/tensor/tensor_math_cuda.h   |  53 +++++
 src/core/tensor/tensor_math_opencl.h |  28 +++
 src/model/layer/layer.cc             |   8 +
 src/proto/layer.proto                |  22 +-
 test/singa/test_cpp_math.cc          |   4 +-
 test/singa/test_tensor.cc            |  35 +--
 test/singa/test_tensor_math.cc       |  84 +++++++
 19 files changed, 1135 insertions(+), 644 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/02851fac/CMakeLists.txt
----------------------------------------------------------------------
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 21b3804..67a82e5 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1,6 +1,6 @@
 CMAKE_MINIMUM_REQUIRED(VERSION 2.6)
 PROJECT(singa)
-SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11")
+SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -g -std=c++11")
 
 # Flags
 IF(UNIX OR APPLE)

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/02851fac/include/singa/core/math.h
----------------------------------------------------------------------
diff --git a/include/singa/core/math.h b/include/singa/core/math.h
deleted file mode 100644
index 511d9ee..0000000
--- a/include/singa/core/math.h
+++ /dev/null
@@ -1,273 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-#ifndef SINGA_CORE_MATH_H_
-#define SINGA_CORE_MATH_H_
-#include <type_traits>
-#include "singa/core/common.h"
-#include "singa/utils/logging.h"
-
-namespace singa {
-
-/// \file math.h Math functions for linear algebra, neural net and random
-/// operations.
-/// All functions have a template argument, DType for DataType, Lib for the
-/// backend library, e.g., lib::Cublas, lib::Cudnn, etc.
-
-/// Some operations would have many config/hyper-parameters, e.g., Conv, and
-/// these config vary among diff implementations, e.g., cuda/cudnn/opencl.
-/// To separate the modules, we pass a OpConf pointer to the Tensor Op function.
-/// The specific fields are implemented by inheriting OpConf, and casting the
-/// pointer between the base and the sub-class.
-class OpConf {
- public:
-  template <typename T>
-  T* CastTo() {
-    static_assert(std::is_base_of<OpConf, T>::value,
-                  "The cast type must be a sub-class of OpConf");
-    return static_cast<T*>(this);
-  }
-};
-
-// ================Linear algebra functions====================================
-template <typename DType, typename Lib>
-void Sum(int count, const Blob* input, DType* ret, Context* ctx) {
-  LOG(FATAL) << "Not Implemented";
-}
-
-template <typename DType, typename Lib>
-void Abs(int count, const Blob* input, Blob* ret, Context* ctx) {
-  LOG(FATAL) << "Not Implemented";
-}
-
-template <typename DType, typename Lib>
-void Sign(int count, const Blob* input, Blob* ret, Context* ctx) {
-  LOG(FATAL) << "Not Implemented";
-}
-
-/// Base is e, Neper number
-template <typename DType, typename Lib>
-void Exp(int count, const Blob* input, Blob* ret, Context* ctx) {
-  LOG(FATAL) << "Not Implemented";
-}
-
-/// Natual logarithm, the base is e, Neper number.
-template <typename DType, typename Lib>
-void Log(int count, const Blob* input, Blob* ret, Context* ctx) {
-  LOG(FATAL) << "Not Implemented";
-}
-
-template <typename DType, typename Lib>
-void Sqrt(int count, const Blob* input, Blob* ret, Context* ctx) {
-  LOG(FATAL) << "Not Implemented";
-}
-
-template <typename DType, typename Lib>
-void Tanh(int count, const Blob* input, Blob* ret, Context* ctx) {
-  LOG(FATAL) << "Not Implemented";
-}
-
-template <typename DType, typename Lib>
-void Sigmoid(int count, const Blob* input, Blob* ret, Context* ctx) {
-  LOG(FATAL) << "Not Implemented";
-}
-
-/// Do v^x for every v from the input tensor
-template <typename DType, typename Lib>
-void Pow(int count, DType x, const Blob* input, Blob* ret, Context* ctx) {
-  LOG(FATAL) << "Not Implemented";
-}
-
-/// Do v^x for every v from the lhs and every x from rhs
-template <typename DType, typename Lib>
-void Pow(int count, const Blob* lhs, const Blob* rhs, Blob* ret, Context* ctx) {
-  LOG(FATAL) << "Not Implemented";
-}
-
-/// Clamp every element into [low, high]
-template <typename DType, typename Lib>
-void Clamp(int count, DType low, DType high, const Blob* input, Blob* ret,
-           Context* ctx) {
-  LOG(FATAL) << "Not Implemented";
-}
-
-/// ret = x + input
-template <typename DType, typename Lib>
-void Add(int count, DType x, const Blob* input, Blob* ret, Context* ctx) {
-  LOG(FATAL) << "Not Implemented";
-}
-
-/// ret = x * input
-/// div could be enabled by calling Mult with 1/x
-template <typename DType, typename Lib>
-void Mult(int count, DType x, const Blob* input, Blob* ret, Context* ctx) {
-  LOG(FATAL) << "Not Implemented";
-}
-
-/// ret = lhs + rhs
-template <typename DType, typename Lib>
-void Add(int count, const Blob* lhs, const Blob* rhs, Blob* ret, Context* ctx) {
-  LOG(FATAL) << "Not Implemented";
-}
-
-/// ret = lhs - rhs
-template <typename DType, typename Lib>
-void Sub(int count, const Blob* lhs, const Blob* rhs, Blob* ret, Context* ctx) {
-  LOG(FATAL) << "Not Implemented";
-}
-
-/// ret = lhs * rhs
-template <typename DType, typename Lib>
-void Mult(int count, const Blob* lhs, const Blob* rhs, Blob* ret,
-          Context* ctx) {
-  LOG(FATAL) << "Not Implemented";
-}
-
-/// ret = lhs / rhs
-template <typename DType, typename Lib>
-void Div(int count, const Blob* lhs, const Blob* rhs, Blob* ret, Context* ctx) {
-  LOG(FATAL) << "Not Implemented";
-}
-
-/// outer-product.
-/// lhs and rhs are vectors of len m and n. ret is matrix of shape m * n
-template <typename DType, typename Lib>
-void Outer(int m, int n, const Blob* lhs, const Blob* rhs, Blob* ret,
-           Context* ctx) {
-  LOG(FATAL) << "Not Implemented";
-}
-
-// TODO(wangwei) unify SumRow and SumCol.
-/// Sum the rows of the input matrix into a vector
-template <typename DType, typename Lib>
-void SumRow(int nrow, int ncol, const Blob* input, Blob* ret, Context* ctx) {
-  LOG(FATAL) << "Not Implemented";
-}
-/// Sum the rows of the input matrix into a vector
-template <typename DType, typename Lib>
-void SumCol(int nrow, int ncol, const Blob* input, Blob* ret, Context* ctx) {
-  LOG(FATAL) << "Not Implemented";
-}
-
-// TODO(wangwei) unify AddRow and AddCol.
-/// Add the vector v to every row of A as the row of ret
-template <typename DType, typename Lib>
-void AddRow(int nrow, int ncol, const Blob* A, const Blob* v, Blob* ret,
-            Context* ctx) {
-  LOG(FATAL) << "Not Implemented";
-}
-
-/// Add the vector v to every column of A as the column of ret
-template <typename DType, typename Lib>
-void AddCol(int nrow, int ncol, const Blob* A, const Blob* v, Blob* ret,
-            Context* ctx) {
-  LOG(FATAL) << "Not Implemented";
-}
-
-// ===== BLAS functions, ref to http://docs.nvidia.com/cuda/cublas
-// ===== Level 1
-/// return the index of the element with the max value.
-template <typename DType, typename Lib>
-void Amax(int count, const Blob* input, int* ret, Context* ctx) {
-  LOG(FATAL) << "Not Implemented";
-}
-
-/// return the index of the element with the min value.
-template <typename DType, typename Lib>
-void Amin(int count, const Blob* input, int* ret, Context* ctx) {
-  LOG(FATAL) << "Not Implemented";
-}
-/// ret = sum |x| for all x in input
-template <typename DType, typename Lib>
-void Asum(int count, const Blob* input, DType* ret, Context* ctx) {
-  LOG(FATAL) << "Not Implemented";
-}
-
-/// ret = alpha * input + ret
-template <typename DType, typename Lib>
-void Axpy(int count, DType alpha, const Blob* input, Blob* ret, Context* ctx) {
-  LOG(FATAL) << "Not Implemented";
-}
-
-/// ret *= x
-template <typename DType, typename Lib>
-void Scale(int count, DType x, Blob* ret, Context* ctx) {
-  LOG(FATAL) << "Not Implemented";
-}
-
-template <typename DType, typename Lib>
-void Dot(int count, const Blob* lhs, const Blob* rhs, DType* ret,
-         Context* ctx) {
-  LOG(FATAL) << "Not Implemented";
-}
-
-// ===== Level 2
-/// ret = alpha * op(A) * v + beta * ret.
-/// op(A) = A if trans = false; A^T otherwise; rows(A) = m, cols(A) = n.
-template <typename DType, typename Lib>
-void GEMV(bool trans, int m, int n, DType alpha, const Blob* A, const Blob* v,
-          DType beta, Blob* ret, Context* ctx) {
-  LOG(FATAL) << "Not Implemented";
-}
-
-// ===== Level 3
-/// ret = alpha * op(A) * op(B) + beta * ret.
-/// op(A) = A if trans = false; A^T otherwise; rows(A) = m, cols(A) = n.
-template <typename DType, typename Lib>
-void GEMV(bool transA, bool transB, int m, int n, int k, DType alpha,
-          const Blob* A, const Blob* B, DType beta, Blob* ret, Context* ctx) {
-  LOG(FATAL) << "Not Implemented";
-}
-
-// ================Random functions===========================================
-// The random generator should be extracted from ctx.
-template <typename DType, typename Lib>
-void Uniform(int count, DType low, DType high, Blob* ret, Context* ctx) {
-  LOG(FATAL) << "Not Implemented";
-}
-
-template <typename DType, typename Lib>
-void Gaussian(int count, DType mean, DType std, Blob* ret, Context* ctx) {
-  LOG(FATAL) << "Not Implemented";
-}
-
-/// each element of ret would be 1 with prob p and 0 with 1-p. 0<= p <= 1
-template <typename DType, typename Lib>
-void Bernoulli(int count, DType p, Blob* ret, Context* ctx) {
-  LOG(FATAL) << "Not Implemented";
-}
-
-/// ret[i] would be 1 with prob p[i] and 0 with 1-p[i]. 0<= p[i] <= 1
-template <typename DType, typename Lib>
-void Bernoulli(int count, const Blob* p, Blob* ret, Context* ctx) {
-  LOG(FATAL) << "Not Implemented";
-}
-
-// ================Neural net functions=======================================
-/// Do 2D conv.
-/// c is input image channel, w is input width, h is input height
-/// nb_kernel is output channel, kw, and kh are kenerl width and height
-/*
-template <typename DType, typename Lib>
-void Conv2D(int c, int w, int h, int nb_kernel, int kw, int kh,
-           const Blob* input, const Blob* kernel, Blob* ret, Context* ctx) {
-  LOG(FATAL) << "Not Implemented";
-}
-*/
-}  // namespace singa
-
-#endif  // SINGA_CORE_MATH_H_

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/02851fac/include/singa/core/tensor.h
----------------------------------------------------------------------
diff --git a/include/singa/core/tensor.h b/include/singa/core/tensor.h
index 725f657..4278078 100644
--- a/include/singa/core/tensor.h
+++ b/include/singa/core/tensor.h
@@ -20,23 +20,29 @@
 #define SINGA_CORE_TENSOR_H_
 
 #include <vector>
+#include <tuple>
 
 #include "singa/core/common.h"
 #include "singa/core/device.h"
-#include "singa/core/math.h"
 #include "singa/proto/core.pb.h"
 #include "singa/utils/logging.h"
 
 using std::vector;
+using std::tuple;
 namespace singa {
 
 typedef vector<int> Shape;
 inline int Product(Shape shape) {
   if (shape.size() == 0)
     return 0;
+  return Product(shape.begin(), shape.end());
+}
+
+inline int Product(vector<int>::iterator begin, vector<int>::iterator end) {
+  CHECK(begin != end);
   int v = 1;
-  for (auto s : shape)
-    v *= s;
+  for (auto it = being; it < end; it++)
+    v* = *it;
   return v;
 }
 
@@ -60,19 +66,20 @@ inline int SizeOf(DataType t) {
 class Tensor {
  public:
   ~Tensor();
-  Tensor() = default;
-  explicit Tensor(const Shape& shape, DataType dtype = kFloat32);
+  Tensor();
+  Tensor(Shape&& shape, DataType dtype = kFloat32);
+  Tensor(const Shape& shape, DataType dtype = kFloat32);
+  Tensor(Shape&& shape, Device* dev, DataType dtype = kFloat32);
   Tensor(const Shape& shape, Device* dev, DataType dtype = kFloat32);
 
   /// Copy Tensor to share the internal data.  No deep copy.
   Tensor(const Tensor& from);
-
   /// Copy Tensor to share the internal data.  No deep copy.
   Tensor(Tensor&& from);
 
   /// For functions in xx_math.cc to access the blob.
   /// Users should not operate against Blob directly.
-  /// It will malloc memory for the tensor if not allocated before.
+  /// blob_ is allocated in constructors.
   Blob* blob() const {
     return blob_;
   }
@@ -82,9 +89,9 @@ class Tensor {
   }
 
   /// Return immutable Tensor values with given type.
-  template <typename T>
-  const T* data() {
-    return static_cast<const T*> (blob()->data());
+  template <typename DType>
+  const DType* data() const {
+    return static_cast<const DType*> (blob()->data());
   }
 
   /// data type, including kFloat16, kFloat32, kInt
@@ -96,20 +103,28 @@ class Tensor {
     return shape_;
   }
 
+  int nDim() const {
+    return shape_.size();
+  }
+
   bool transpose() const {
     return transpose_;
   }
 
+  /// Return number of total elements
   int Size() const {
     return blob_->size() / SizeOf(data_type_);
   }
 
+  /// Return memory size (i.e., Bytes)
   int MemSize() const {
     return blob_->size();
   }
 
+  /// Reset the tensor shape, it may reallocate blob, if MemSize() changes.
   void ReShape(const Shape& shape);
 
+  /// Reset the data type, it would reallocate blob if type changes.
   void AsType(DataType type);
 
   /// Reset the device.
@@ -119,8 +134,9 @@ class Tensor {
   /// Equivalent to ToDevice(host_dev).
   void ToHost();
 
-  /// For init the tensor values, copy 'size' bytes data.
-  void CopyDataFromHostPtr(const void* src, size_t size);
+  /// For init the tensor values, copy 'num' elements.
+  template<typename DType>
+  void CopyDataFromHostPtr(const DType* src, int num);
 
   /// Copy data from another Tensor which may be on a diff device.
   /// Meta data would not be copied!
@@ -141,49 +157,39 @@ class Tensor {
   /// Copy the meta info with data blob shared.
   void operator=(Tensor&& t);
 
+
   void operator+=(const Tensor& t);
-  /*
-  void operator+=(Tensor&& t);
+  // void operator+=(Tensor&& t);
   void operator-=(const Tensor& t);
-  void operator-=(Tensor&& t);
+  // void operator-=(Tensor&& t);
   void operator*=(const Tensor& t);
-  void operator*=(Tensor&& t);
+  // void operator*=(Tensor&& t);
   void operator/=(const Tensor& t);
-  void operator/=(Tensor&& t);
+  // void operator/=(Tensor&& t);
 
   // Scalar operations.
 
   /// T is a scalar type
-  template <typename T>
-  void operator+=(const T x);
+  template<typename DType>
+  void operator+=(DType x);
 
   /// T is a scalar type
-  template <typename T>
-  void operator-=(const T x);
+  template <typename DType>
+  void operator-=(const DType x);
 
   /// T is a scalar type
-  template <typename T>
-  void operator*=(const T x);
+  template <typename DType>
+  void operator*=(const DType x);
 
   /// T is a scalar type
-  template <typename T>
-  void operator/=(const T x);
-
-  void Log(int base = 2);
-  void Tanh();
-  void Sigmoid();
-  void ReLU();
-
-  // random functions.
-  void Uniform(float low, float high);
-  template <typename T>
-  void Gaussian(float mean, float std);
+  template <typename DType>
+  void operator/=(const DType x);
 
   /// save Tensor into a proto msg
   // void ToProto(TensorProto* t);
   /// load Tensor from proto msg
   // void FromProto(const TensorProto& t);
-  */
+
  protected:
   bool transpose_ = false;
   DataType data_type_ = kFloat32;
@@ -194,142 +200,131 @@ class Tensor {
   Shape shape_;
 };
 
-/// For tensors with sparse content, e.g., missing columns or rows.
+// For tensors with sparse content, e.g., missing columns or rows.
 // class SparseTensor : public Tensor {};
 
-// ==================Simple Linear Algebra Operations=========================
-/*
-Tensor Tanh(const Tensor& t);
-Tensor Log(const Tensor& t);
-Tensor Sigmoid(const Tensor& t);
-Tensor ReLU(const Tensor& t);
-Tensor Softmax(const Tensor& t);
-*/
+/// Copy 'num' elements of src to dst.
+/// The first 'src_offset' ('dst_offset') elements will be skipped.
 void CopyData(Tensor* dst,
               const Tensor& src,
-              int msize,
+              int num,
               int src_offset = 0,
               int dst_offset = 0);
 
-// element-wise ops
+/// Copy 'nBytes' bytes of src data to dst.
+/// The first 'src_offset' ('dst_offset') bytes will be skipped.
+void CopyRawData(Tensor* dst,
+              const Tensor& src,
+              int nBytes,
+              int src_offset = 0,
+              int dst_offset = 0);
+
+// ==================Simple Linear Algebra Operations=========================
+Tensor Abs(const Tensor& t);
+Tensor Exp(const Tensor& t);
+Tensor Log(const Tensor& t);
+Tensor ReLU(const Tensor& t);
+Tensor Sigmoid(const Tensor& t);
+Tensor Sign(const Tensor& t);
+Tensor Sqrt(const Tensor& t);
+Tensor Tanh(const Tensor& t);
+
+/// Regarding the internal data as 2d, with shape_[0]*...*shape_[axis] rows,
+/// and shape_[axis+1]*...*shape_[nDim()] columns.
+/// and do softmax along each row.
+Tensor Softmax(const Tensor& t, int axis = -1);
+void Softmax(const Tensor& t, Tensor* ret, int axis = -1);
+
+/// Element-wise opeartion, ret[i]=t[i]^x
+template<typename DType>
+Tensor Pow(const Tensor& t, DType x);
+/// Element-wise opeartion, ret[i]=t[i]^x
+template<typename DType>
+void Pow(const Tensor& t, DType x, Tensor* ret);
+/// Element-wise opeartion, ret[i]=baes[i]^exp[i]
+Tensor Pow(const Tensor& base, Tensor exp);
+/// Element-wise opeartion, ret[i]=baes[i]^exp[i]
+void Pow(const Tensor& base, const Tensor& exp, Tensor* ret);
 
 Tensor operator+(const Tensor& lhs, const Tensor& rhs);
 void Add(const Tensor& lhs, const Tensor& rhs, Tensor* ret);
-/*
 Tensor operator-(const Tensor& lhs, const Tensor& rhs);
 void Sub(const Tensor& lhs, const Tensor& rhs, Tensor* ret);
 Tensor operator*(const Tensor& lhs, const Tensor& rhs);
-void operator*(const Tensor& lhs, const Tensor& rhs, Tensor* ret);
+void EltwiseMult(const Tensor& lhs, const Tensor& rhs, Tensor* ret);
 Tensor operator/(const Tensor& lhs, const Tensor& rhs);
-void operator/(const Tensor& lhs, const Tensor& rhs, Tensor* ret);
+void Div(const Tensor& lhs, const Tensor& rhs, Tensor* ret);
 
-template <typename T>
-Tensor operator+(const T x, const Tensor& t);
-template <typename T>
-void operator+(const T x, const Tensor& t, Tensor* ret);
+template <typename DType>
+Tensor operator+(const Tensor& t, DType x);
+template <typename DType>
+void Add(const Tensor& t, DType x, Tensor* ret);
 
-template <typename T>
-Tensor operator-(const T x, const Tensor& t);
-template <typename T>
-void operator-(const T x, const Tensor& t, Tensor* ret);
+template <typename DType>
+Tensor operator-(const Tensor& t, DType x);
+template <typename DType>
+void Sub(const Tensor& t, DType x, Tensor* ret);
 
-template <typename T>
-Tensor operator*(const T x, const Tensor& t);
-template <typename T>
-void operator*(const T x, const Tensor& t, Tensor* ret);
+template <typename DType>
+Tensor operator*(const Tensor& t, DType x);
+template <typename DType>
+void EltwiseMult(const Tensor& t, DType x, Tensor* ret);
 
-template <typename T>
-Tensor operator/(const T x, const Tensor& t);
-template <typename T>
-void operator/(const T x, const Tensor& t, Tensor* ret);
+template <typename DType>
+Tensor operator/(const Tensor& t, DType x);
+template <typename DType>
+void Div(const Tensor& t, DType x, Tensor* ret);
 
 //================Blas operations============================================
+// ===== Level 1
+// TODO(wangwei) make amax/amin/asum a member function of tensor
+// void Amax(Tensor, Context* ctx); Get the index of the max value in a vector
+// void Asum(Tensor Context* ctx);
+
+// template <typename DType>
+// void Axpy(DType x, const Blob& t, Blob* ret, Context* ctx);
+
+/// Do matrix vector multipication or matrix matrix multiplication depdending
+/// on the Tensor shape.  ret = lhs * rhs
+template <typename DType>
 Tensor Mult(const Tensor& lhs, const Tensor& rhs);
+/// Do matrix vector multipication or matrix matrix multiplication depdending
+/// on the Tensor shape.  ret = lhs * rhs
+template <typename DType>
 void Mult(const Tensor& lhs, const Tensor& rhs, Tensor* ret);
 
-tempalte<typename T> T Dot(const Tensor& lhs, const Tensor& rhs);
-
-//================Neural Net operations======================================
+/// Do matrix vector multipication or matrix matrix multiplication depdending
+/// on the Tensor shape.  ret = alpha lhs * rhs + beta * ret
+template <typename DType>
+Tensor Mult(DType alpha, const Tensor& lhs, DType beta, const Tensor& rhs);
+/// Do matrix vector multipication or matrix matrix multiplication depdending
+/// on the Tensor shape. ret = alpha lhs * rhs + beta * ret
+template <typename DType>
+void Mult(DType alpha, const Tensor& lhs, DType beta, const Tensor& rhs,
+    Tensor* C);
 
-/// Convolution Op. 'Conf' is ConvConf;
-void Conv(const OpConf* conf,
-          const Tensor& input,
-          const Tensor& W,
-          const Tensor &b,
-          Tensor* ret);
+// tempalte<typename DType> T Dot(const Tensor& lhs, const Tensor& rhs);
 
 //================Random operations==========================================
-Tensor Uniform(float low, float high, const Shape& shape, Device* dev);
-
-Tensor Gaussian(float mean, float std, const Shape& shape, Device* dev);
-*/
-//============================================================================
-/// typedef DType accroding to type value.
-/// DType would be used in the code block __VA_ARGS__.
-#define TYPE_SWITCH(type, DType, ...)                               \
-  do {                                                              \
-    switch (type) {                                                 \
-      case kFloat32: {                                              \
-        typedef float DType;                                        \
-        { __VA_ARGS__ }                                             \
-        break;                                                      \
-      }                                                             \
-      case kInt: {                                                  \
-        typedef int DType;                                          \
-        { __VA_ARGS__ }                                             \
-        break;                                                      \
-      }                                                             \
-      case kChar: {                                                 \
-        typedef char DType;                                         \
-        { __VA_ARGS__ }                                             \
-        break;                                                      \
-      }                                                             \
-      default:                                                      \
-        LOG(FATAL) << "Unknow data type = " << DataType_Name(type); \
-    }                                                               \
-  } while (0)
-
-/// typedef DType and Lib according to values of type and lib respectively.
-/// type is from DataType, and lib is from LibType.
-/// DType and Lib would be used in __VA_ARGS__.
-#define TYPE_LIB_SWITCH(dtype, DType, ltype, Lib, ...)                 \
-  do {                                                               \
-    const int _SwitchShift = 3;                                      \
-    int _SwitchHash = ((dtype) << _SwitchShift) + (ltype);                 \
-    switch (_SwitchHash) {                                           \
-      case ((kFloat32 << _SwitchShift) + kCuda): {                   \
-        typedef float DType;                                          \
-        typedef lib::Cuda Lib;                                            \
-        { __VA_ARGS__ }                                              \
-        break;                                                       \
-      }                                                              \
-      case ((kFloat32 << _SwitchShift) + kCudnn): {                  \
-        typedef float DType;                                          \
-        typedef lib::Cudnn Lib;                                           \
-        { __VA_ARGS__ }                                              \
-        break;                                                       \
-      }                                                              \
-      case ((kFloat32 << _SwitchShift) + kCpp): {                    \
-        typedef float DType;                                          \
-        typedef lib::Cpp Lib;                                             \
-        { __VA_ARGS__ }                                              \
-        break;                                                       \
-      }                                                              \
-      case ((kFloat32 << _SwitchShift) + kOpencl): {                \
-        typedef float DType;                                          \
-        typedef lib::Opencl Lib;                                          \
-        { __VA_ARGS__ }                                              \
-        break;                                                       \
-      }                                                              \
-      default:                                                       \
-        LOG(FATAL) << "Unknown combination of data type "            \
-                   << DataType_Name(dtype) << " and library "        \
-                   << LibType_Name(ltype);                             \
-    }                                                                \
-  } while (0)
-
-
+/// For each element x set x = 0 if random() < p; otherwise x = 1.
+Tensor Bernoulli(float p, Blob* t);
+/// Fill in Tensor 't' following uniform distribution.
+Tensor Uniform(float low, DType high, Blob* t);
+/// Fill in Tensor 't' following Gaussian distribution.
+Tensor Gaussian(float mean, DType std, Blob* t);
 
+//================Neural Net operations======================================
+// following API of cudnn, e.g., conv, pool, lrn, batchnorm, softmax
+void ConvFwd(const ConvConf& conf, const Tensor& x, const Tensor& w, Tensor* y);
+void ConvBwdBias(const ConvConf& conf, const Tensor& dy, Tensor* db);
+void ConvBwdFilter(const ConvConf& conf, const Tensor& dy, const Tensor& x,
+                   Tensor* dw);
+void ConvBwdData(const ConvConf& conf, const Tensor& dy, const Tensor& w,
+                 Tensor* db);
+void PoolFwd(const PoolConf& conf, const Tensor& x, Tensor* y,
+             Tensor* mask = nullptr);
+void PoolBwd(const PoolConf& conf, const Tensor& y, const Tensor& dy,
+             const Tensor& x, Tensor* dx);
 }  // namespace singa
 
 #endif  // SINGA_CORE_TENSOR_H_

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/02851fac/include/singa/model/layer.h
----------------------------------------------------------------------
diff --git a/include/singa/model/layer.h b/include/singa/model/layer.h
index 37f3fa8..7b9b6d4 100644
--- a/include/singa/model/layer.h
+++ b/include/singa/model/layer.h
@@ -45,7 +45,9 @@ class Layer {
   }
 
   /// Set meta data fields configured in 'conf' (a proto message).
-  virtual void Setup(const LayerConf& conf) {}
+  virtual void Setup(const LayerConf& conf) {
+    name_ = conf.name();
+  }
 
   /// Do feature transformation for given 'input' Tensor.
   /// It is the forward pass for feed-forward nets and rnn nets.
@@ -67,6 +69,7 @@ class Layer {
                                                const vector<Tensor>& input) {
     return vector<Tensor>{};
   }
+  // return <dx>  <dw (ParamGrad)>
 
   /// Move the layer (including its parameters and other Tensor) onto the given
   /// device
@@ -82,28 +85,26 @@ class Layer {
   }
 
   /// Serialize the layer info, including params)_, into a LayerConf message.
-  virtual std::string ToProto(LayerConf* param) const = 0;
+  virtual std::string ToProto(LayerConf* conf) const {
+    conf->set_name(name_);
+  }
 
   /// Serialize the layer info, including params_, into a string representing
   /// a LayerParameter message.
-  /*
-  std::string ToProtoStr() const {
-    std:: string str;
-    SerializeToString(&str);
-  }
-  */
+  std::string ToProtoStr() const;
 
   /// Return all Param instances of this layer.
-  const vector<void*> params() const { return params_; }
+  /// Each layer could cache the Param objects.
+  /// To save memory of , it can also create it when this function
+  /// is called
+  const vector<Param*> GetParam();
 
   /// Each layer instance would optionally have a name.
   /// Used for debugging and logging.
   const std::string name() const { return name_; }
 
-
  protected:
   std::string name_;
-  std::vector<void*> params_;
 };
 
 }  // namespace singa

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/02851fac/include/singa/model/param.h
----------------------------------------------------------------------
diff --git a/include/singa/model/param.h b/include/singa/model/param.h
new file mode 100644
index 0000000..b859b1c
--- /dev/null
+++ b/include/singa/model/param.h
@@ -0,0 +1,97 @@
+/************************************************************
+*
+* Licensed to the Apache Software Foundation (ASF) under one
+* or more contributor license agreements.  See the NOTICE file
+* distributed with this work for additional information
+* regarding copyright ownership.  The ASF licenses this file
+* to you under the Apache License, Version 2.0 (the
+* "License"); you may not use this file except in compliance
+* with the License.  You may obtain a copy of the License at
+*
+*   http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing,
+* software distributed under the License is distributed on an
+* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+* KIND, either express or implied.  See the License for the
+* specific language governing permissions and limitations
+* under the License.
+*
+*************************************************************/
+
+#ifndef SINGA_MODEL_PARAM_H_
+#define SINGA_MODEL_PARAM_H_
+#include "singa/core/tensor.h"
+#include <vector>
+#include <string>
+using std::vector;
+using std::string;
+namespace singa {
+/// Base Param class for storing set of parameters, e.g., a weight matrix or a
+/// bias vector.
+/// It includes multiple Tensors for parameter values, gradients, etc.
+class Param {
+ public:
+  ~Param();
+  Param(const ParamSpec& conf);
+  Param(Param&& p);
+  Param(const Param& p);
+  void operator=(Param&& p);
+  void operator=(const Param& p);
+
+  Tensor& value() {
+    return value_;
+  }
+
+  Tensor& grad() {
+    return grad_;
+  }
+
+  void set_value(const Tensor& t) {
+    value_ = t;
+  }
+
+  void set_value(Tensor&& t) {
+    value_ = std::move(t);
+  }
+
+  void set_grad(const Tensor& t) {
+    isGradValid_ = true;
+    grad_ = t;
+  }
+
+  void set_grad(Tensor&& t) {
+    grad_ = std::move(t);
+  }
+
+  // void Compress();
+  // string ToString();
+
+ protected:
+  string name_;
+  Tensor value_;
+  float lr_mult_ = 1.0f, decay_mult_ = 1.0f;
+};
+
+class ParamGrad {
+// return grad tensor or data to recover the grad tensor, e.g., if W = U * V
+// then, ParamGrad could just store U and V. provide func for serialize and
+// deserialize.
+};
+
+// updater just copy the ParamGrad to a device and submit ops to that device, e.g.,
+// add grad; check update_condition; apply sgd; copy back.
+// consider rpc (no RDMA).
+
+inline Param* CreateParam(string type) {  // inline: defined in a header
+  Param* p = nullptr;
+  if (type == "default")
+    p = new Param();
+  else
+    LOG(FATAL) << "Currently param type " << type << " is not implemented."
+               << "Pls use the 'default' type";
+  return p;
+}
+}  // namespace singa
+
+#endif  // SINGA_MODEL_PARAM_H_

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/02851fac/src/core/device/device.cc
----------------------------------------------------------------------
diff --git a/src/core/device/device.cc b/src/core/device/device.cc
index 5bdab6f..4976a32 100644
--- a/src/core/device/device.cc
+++ b/src/core/device/device.cc
@@ -49,6 +49,7 @@ void Device::FreeBlob(Blob* blob) {
 
 void Device::CopyData(Blob* dst, const Blob& src, int len, int dst_offset,
                       int src_offset) {
+
   memcpy(reinterpret_cast<Byte*>(dst->mutable_data()) + dst_offset,
          (const Byte*)src.data() + src_offset, len);
 }

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/02851fac/src/core/math/cpp_math.cc
----------------------------------------------------------------------
diff --git a/src/core/math/cpp_math.cc b/src/core/math/cpp_math.cc
deleted file mode 100644
index 638d693..0000000
--- a/src/core/math/cpp_math.cc
+++ /dev/null
@@ -1,54 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "singa/core/math.h"
-#include "singa/core/common.h"
-
-#ifdef USE_CBLAS
-#include <cblas.h>
-#endif
-
-namespace singa {
-template<>
-void Add<float, lib::Cpp>(int count,
-                     const Blob* lhs,
-                     const Blob* rhs,
-                     Blob* ret,
-                     Context* ctx) {
-  // CHECK_EQ(ctx->stream, nullptr);
-  float *dptr = static_cast<float*>(ret->mutable_data());
-  const float *lptr = static_cast<const float*>(lhs->data());
-  const float *rptr = static_cast<const float*>(rhs->data());
-  for (int i = 0; i < count; i++) {
-    dptr[i] = lptr[i] + rptr[i];
-  }
-}
-
-#ifdef USE_CBLAS
-template<>
-void Dot<float, lib::Cpp>(int count,
-                     const Blob* lhs,
-                     const Blob* rhs,
-                     float* ret,
-                     Context* ctx) {
-  float dptr = ret->mutable_data(), lptr = lhs->data(), rptr = rhs->data();
-  *ret = cblas_sdot(count, lptr, 1, rptr, 1);
-}
-
-#endif
-}

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/02851fac/src/core/math/cuda_math.cc
----------------------------------------------------------------------
diff --git a/src/core/math/cuda_math.cc b/src/core/math/cuda_math.cc
deleted file mode 100644
index 1cff1c2..0000000
--- a/src/core/math/cuda_math.cc
+++ /dev/null
@@ -1,48 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "singa/core/math.h"
-#include "singa/core/common.h"
-
-
-namespace singa {
-
-#ifdef USE_CUDA
-template<>
-void Add<float, lib::Cuda>(int count, const Blob* lhs, const Blob* rhs,
-                        Blob* ret, Context* ctx) {
-  cublasSetStream(ctx->handle, ctx->stream);
-  cublasScopy(ctx->handle, count, lhs->data(), 1, ret->mutable_data(), 1);
-  cublasSaxpy(ctx->handle, 1.0f, rhs->data(), 1, ret->mutable_data(), 1);
-}
-
-#ifdef USE_CUDNN
-template<>
-void Conv<float, lib::Cudnn>(const OpConf *conf,
-          const Blob* input,
-          const Blob* W,
-          const Blob* b,
-          Blob* ret,
-          Context* ctx) {
-  // auto conv_conf = conf->CastTo<ConvConf>();
-  // conv op
-}
-
-#endif
-#endif
-}

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/02851fac/src/core/math/opencl_math.cc
----------------------------------------------------------------------
diff --git a/src/core/math/opencl_math.cc b/src/core/math/opencl_math.cc
deleted file mode 100644
index 7012610..0000000
--- a/src/core/math/opencl_math.cc
+++ /dev/null
@@ -1,24 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "singa/core/math.h"
-
-namespace singa {
-
-
-}

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/02851fac/src/core/tensor/tensor.cc
----------------------------------------------------------------------
diff --git a/src/core/tensor/tensor.cc b/src/core/tensor/tensor.cc
index 8fdc2ed..51b785e 100644
--- a/src/core/tensor/tensor.cc
+++ b/src/core/tensor/tensor.cc
@@ -15,28 +15,42 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-
 #include "singa/core/tensor.h"
-#include "singa/core/math.h"
+#include "./tensor_math.h"
+#include "./tensor_math_cpp.h"
+#include "./tensor_math_cuda.h"
+#include "./tensor_math_opencl.h"
 
 namespace singa {
+
 Tensor::~Tensor() {
   if (blob_ != nullptr && blob_->DecRefCount() == 0)
     device_->FreeBlob(blob_);
   blob_ = nullptr;
 }
 
+Tensor::Tensor() {
+  device_ = &hostDeviceSingleton;
+}
+
 Tensor::Tensor(const Shape& shape, DataType dtype)
     : data_type_(dtype), device_(&hostDeviceSingleton), shape_(shape) {
   device_ = &hostDeviceSingleton;
   blob_ = device_->NewBlob(Product(shape_) * SizeOf(data_type_));
 }
-
+Tensor::Tensor(Shape&& shape, DataType dtype)
+    : data_type_(dtype), device_(&hostDeviceSingleton),
+      shape_(std::move(shape)) {  // steal the rvalue shape, do not copy it
+  blob_ = device_->NewBlob(Product(shape_) * SizeOf(data_type_));
+}
 Tensor::Tensor(const Shape& shape, Device* device, DataType dtype)
     : data_type_(dtype), device_(device), shape_(shape) {
   blob_ = device_->NewBlob(Product(shape_) * SizeOf(data_type_));
 }
-
+Tensor::Tensor(Shape&& shape, Device* device, DataType dtype)
+    : data_type_(dtype), device_(device), shape_(std::move(shape)) {
+  blob_ = device_->NewBlob(Product(shape_) * SizeOf(data_type_));
+}
 Tensor::Tensor(const Tensor& t)
     : transpose_(t.transpose_),
       data_type_(t.data_type_),
@@ -50,7 +64,7 @@ Tensor::Tensor(Tensor&& t)
     : transpose_(t.transpose_),
       data_type_(t.data_type_),
       device_(t.device_),
-      shape_(t.shape_) {
+      shape_(std::move(t.shape_)) {
   blob_ = t.blob_;
   t.blob_ = nullptr;
 }
@@ -90,18 +104,26 @@ void Tensor::ToHost() {
   ToDevice(device_->host());
 }
 
-void Tensor::CopyDataFromHostPtr(const void* src, size_t size) {
+template<typename DType>
+void Tensor::CopyDataFromHostPtr(const DType* src, int num) {
+  CHECK_EQ(sizeof(DType), SizeOf(data_type_)) << "data_type is "
+                                              << DataType_Name(data_type_)
+                                              << " user given type is of size "
+                                              << sizeof(DType);
   if (src != nullptr)
-    device_->CopyDataFromHostPtr(blob(), src, size);
+    device_->CopyDataFromHostPtr(blob(), src, sizeof(DType) * num);
   else
     LOG(WARNING) << "Copy data from null host ptr";
 }
+template void Tensor::CopyDataFromHostPtr(const float* src, int num);
 
 void Tensor::CopyData(const Tensor& src) {
   CHECK_EQ(Size(), src.Size());
+  CHECK(blob_ != nullptr);
   // Do copy only if the src's blob is already initialized.
-  if (src.blob_ != nullptr)
-    singa::CopyData(this, src, Size() * SizeOf(data_type_), 0, 0);
+  if (src.blob_ != nullptr) {
+    singa::CopyData(this, src, Size(), 0, 0);
+  }
 }
 
 Tensor Tensor::Clone() {
@@ -112,8 +134,10 @@ Tensor Tensor::Clone() {
 }
 
 Tensor Tensor::T() const {
+  CHECK_EQ(shape_.size(), 2);
   Tensor t(*this);
   t.transpose_ = ~transpose_;
+  std::swap(t.shape_[0], t.shape_[1]);  // swap dims of the copy, not *this
   return t;
 }
 
@@ -132,80 +156,315 @@ void Tensor::operator=(Tensor&& t) {
   if (blob_ != nullptr && blob_->DecRefCount() == 0)
     device_->FreeBlob(blob_);
   transpose_ = t.transpose_;
-  shape_ = t.shape_;
+  shape_ = std::move(t.shape_);
   device_ = t.device_;
   blob_ = t.blob_;
   t.blob_ = nullptr;
 }
 
-void Tensor::operator+=(const Tensor& t) {
-  Add(*this, t, this);
-}
-// ====================Tensor Operations=======================================
+#define GenUnaryTensorArgMemberFunction(op, fn) \
+  void Tensor::op(const Tensor& t) { fn(*this, t, this); }
+
+GenUnaryTensorArgMemberFunction(operator+=, Add);
+GenUnaryTensorArgMemberFunction(operator-=, Sub);
+GenUnaryTensorArgMemberFunction(operator*=, EltwiseMult);
+GenUnaryTensorArgMemberFunction(operator/=, Div);
+
+#define GenUnaryScalarArgMemberFunction(op, fn) \
+  template <typename DType>                     \
+  void Tensor::op(DType x) {                    \
+    fn(*this, x, this);                         \
+  }                                             \
+  template void Tensor::op<float>(float x)
+
+GenUnaryScalarArgMemberFunction(operator-=, Sub);
+GenUnaryScalarArgMemberFunction(operator+=, Add);
+GenUnaryScalarArgMemberFunction(operator*=, EltwiseMult);
+GenUnaryScalarArgMemberFunction(operator/=, Div);
 
+// ====================Tensor Operations=======================================
 void CopyData(Tensor* dst,
               const Tensor& src,
-              int len,
+              int num,
               int dst_offset,
               int src_offset) {
-  CHECK_GE(src.MemSize(), src_offset + len);
-  CHECK_GE(dst->MemSize(), dst_offset + len);
+  CHECK_GE(src.Size(), src_offset + num);
+  CHECK_GE(dst->Size(), dst_offset + num);
+  int width = SizeOf(src.data_type());
+  CHECK_EQ(width, SizeOf(dst->data_type()));
+  CopyRawData(dst, src, num * width, dst_offset * width, src_offset * width);
+}
+
+void CopyRawData(Tensor* dst,
+              const Tensor& src,
+              int nBytes,
+              int dst_offset,
+              int src_offset) {
+  CHECK_GE(src.MemSize(), src_offset + nBytes);
+  CHECK_GE(dst->MemSize(), dst_offset + nBytes);
   Device* src_dev = src.device(), *dst_dev = dst->device();
   Blob* src_blob = src.blob(), *dst_blob = dst->blob();
   if (dst_dev->device_lib() != src_dev->device_lib()) {
     // let the none cpp device conduct copy op
     if (dst_dev->device_lib() == kCpp) {
-      src_dev->CopyData(dst_blob, *src_blob, len, dst_offset, src_offset);
+      src_dev->CopyData(dst_blob, *src_blob, nBytes, dst_offset, src_offset);
     } else if (src_dev->device_lib() == kCpp) {
-      dst_dev->CopyData(dst_blob, *src_blob, len, dst_offset, src_offset);
+      dst_dev->CopyData(dst_blob, *src_blob, nBytes, dst_offset, src_offset);
     } else {
       LOG(FATAL) << "Not support mem copy betwee Cuda and OpenCL device";
     }
   } else {
-    src_dev->CopyData(dst_blob, *src_blob, len, dst_offset, src_offset);
+    src_dev->CopyData(dst_blob, *src_blob, nBytes, dst_offset, src_offset);
   }
 }
+//============================================================================
+/// typedef DType according to type value.
+/// DType would be used in the code block __VA_ARGS__.
+#define TYPE_SWITCH(type, DType, ...)                               \
+  do {                                                              \
+    switch (type) {                                                 \
+      case kFloat32: {                                              \
+        typedef float DType;                                        \
+        { __VA_ARGS__ }                                             \
+        break;                                                      \
+      }                                                             \
+      case kInt: {                                                  \
+        typedef int DType;                                          \
+        { __VA_ARGS__ }                                             \
+        break;                                                      \
+      }                                                             \
+      case kChar: {                                                 \
+        typedef char DType;                                         \
+        { __VA_ARGS__ }                                             \
+        break;                                                      \
+      }                                                             \
+      default:                                                      \
+        LOG(FATAL) << "Unknow data type = " << DataType_Name(type); \
+    }                                                               \
+  } while (0)
+
+/// typedef DType and Lib according to values of type and lib respectively.
+/// type is from DataType, and lib is from LibType.
+/// DType and Lib would be used in __VA_ARGS__.
+#define TYPE_LIB_SWITCH(dtype, DType, ltype, Lib, ...)        \
+  do {                                                        \
+    const int _SwitchShift = 3;                               \
+    int _SwitchHash = ((dtype) << _SwitchShift) + (ltype);    \
+    switch (_SwitchHash) {                                    \
+      case ((kFloat32 << _SwitchShift) + kCuda): {            \
+        typedef float DType;                                  \
+        typedef lib::Cuda Lib;                                \
+        { __VA_ARGS__ }                                       \
+        break;                                                \
+      }                                                       \
+      case ((kFloat32 << _SwitchShift) + kCudnn): {           \
+        typedef float DType;                                  \
+        typedef lib::Cudnn Lib;                               \
+        { __VA_ARGS__ }                                       \
+        break;                                                \
+      }                                                       \
+      case ((kFloat32 << _SwitchShift) + kCpp): {             \
+        typedef float DType;                                  \
+        typedef lib::Cpp Lib;                                 \
+        { __VA_ARGS__ }                                       \
+        break;                                                \
+      }                                                       \
+      case ((kFloat32 << _SwitchShift) + kOpencl): {          \
+        typedef float DType;                                  \
+        typedef lib::Opencl Lib;                              \
+        { __VA_ARGS__ }                                       \
+        break;                                                \
+      }                                                       \
+      default:                                                \
+        LOG(FATAL) << "Unknown combination of data type "     \
+                   << DataType_Name(dtype) << " and library " \
+                   << LibType_Name(ltype);                    \
+    }                                                         \
+  } while (0)
+
+
+#define EltwiseUnaryTensorFn(fn, t, ret)                                   \
+  do {                                                                     \
+    TYPE_LIB_SWITCH(t.data_type(), DType, t.device()->device_lib(), Lib, { \
+      ret->device()->Submit(                                               \
+          [t, ret](Context* ctx) {                                         \
+            fn<DType, Lib>(t.Size(), t.blob(), ret->blob(), ctx);          \
+          },                                                               \
+          {t.blob()}, {ret->blob()});                                      \
+    });                                                                    \
+  } while (0)
+
+#define GenUnaryTensorFunction(fn)                    \
+  Tensor fn(const Tensor& t) {                        \
+    Tensor ret(t.shape(), t.device(), t.data_type()); \
+    auto* retptr = &ret;                              \
+    EltwiseUnaryTensorFn(fn, t, retptr);              \
+    return ret;                                       \
+  }
+
+GenUnaryTensorFunction(Abs);
+GenUnaryTensorFunction(Exp);
+GenUnaryTensorFunction(Log);
+GenUnaryTensorFunction(ReLU);
+GenUnaryTensorFunction(Sigmoid);
+GenUnaryTensorFunction(Sign);
+GenUnaryTensorFunction(Sqrt);
+GenUnaryTensorFunction(Tanh);
 
-Tensor operator+(const Tensor& lhs, const Tensor& rhs) {
-  Tensor ret(lhs.shape(), lhs.device());
-  Add(lhs, rhs, &ret);
+Tensor Softmax(const Tensor& t, int axis) {
+  Tensor ret(t.shape(), t.device(), t.data_type());
+  Softmax(t, &ret, axis);
   return ret;
 }
 
-void Add(const Tensor& lhs, const Tensor& rhs, Tensor* ret) {
-  TYPE_LIB_SWITCH(lhs.data_type(), DType, lhs.device()->device_lib(), Lib, {
+void Softmax(const Tensor& t, Tensor* ret, int axis) {
+  int nrow = 1, ncol = t.Size(), size = ncol;
+  CHECK_GE(axis, -1);
+  CHECK_GT(t.shape().size(), 0);
+  if (axis > -1) {
+    nrow = Product(t.shape().begin(), t.shape().begin() + axis + 1);
+    CHECK_EQ(size % nrow, 0) << "Size = " << size << " nrow = " << nrow;
+    ncol = size / nrow;
+  }
+  TYPE_LIB_SWITCH(t.data_type(), DType, t.device()->device_lib(), Lib, {
     ret->device()->Submit(
-        [lhs, rhs, ret](Context* ctx) {
-          Add<DType, Lib>(lhs.Size(), lhs.blob(), rhs.blob(), ret->blob(), ctx);
+        [nrow, ncol, t, ret](Context* ctx) {
+          Softmax<DType, Lib>(nrow, ncol, t.blob(), ret->blob(), ctx);
         },
-        {lhs.blob(), rhs.blob()}, {ret->blob()});
-  });
+        {t.blob()}, {ret->blob()});
+    });
 }
-/*
-Tensor operator-(const Tensor& lhs, const Tensor& rhs) {
-  Tensor ret(lhs.shape(), lhs.device());
-  Sub(lhs, rhs, &ret);
+
+#define EltwiseBinaryTensorFn(fn, lhs, rhs, ret)                               \
+  do {                                                                         \
+    TYPE_LIB_SWITCH(lhs.data_type(), DType, lhs.device()->device_lib(), Lib, { \
+      CHECK_EQ(sizeof(DType), SizeOf(rhs.data_type()));                        \
+      ret->device()->Submit(                                                   \
+          [lhs, rhs, ret](Context* ctx) {                                      \
+            fn<DType, Lib>(lhs.Size(), lhs.blob(), rhs.blob(), ret->blob(),    \
+                           ctx);                                               \
+          },                                                                   \
+          {lhs.blob(), rhs.blob()}, {ret->blob()});                            \
+    });                                                                        \
+  } while (0)
+
+#define GenBinaryTensorFunction(op, fn)                        \
+  Tensor op(const Tensor& lhs, const Tensor& rhs) {            \
+    Tensor ret(lhs.shape(), lhs.device(), lhs.data_type());    \
+    fn(lhs, rhs, &ret);                                        \
+    return ret;                                                \
+  }                                                            \
+  void fn(const Tensor& lhs, const Tensor& rhs, Tensor* ret) { \
+    EltwiseBinaryTensorFn(fn, lhs, rhs, ret);                  \
+  }
+
+GenBinaryTensorFunction(operator+, Add);
+GenBinaryTensorFunction(operator-, Sub);
+GenBinaryTensorFunction(operator*, EltwiseMult);
+GenBinaryTensorFunction(operator/, Div);
+GenBinaryTensorFunction(Pow, Pow);
+
+#define EltwiseTensorScalarFn(fn, t, x, ret)                                \
+  do {                                                                      \
+    TYPE_LIB_SWITCH(t.data_type(), DType, t.device()->device_lib(), Lib, {  \
+      static_assert(std::is_same<decltype(x), DType>::value,                \
+                    "The Scalar type must match the Tensor data type");     \
+      ret->device()->Submit(                                                \
+          [t, x, ret](Context* ctx) {                                       \
+            fn<DType, Lib>(t.Size(), t.blob(), x, ret->blob(), ctx);        \
+          },                                                                \
+          {t.blob()}, {ret->blob()});                                       \
+    });                                                                     \
+  } while (0)
+
+#define GenTensorScalarFunction(op, fn)                \
+  template <typename SType>                            \
+  Tensor op(const Tensor& t, SType x) {                \
+    Tensor ret(t.shape(), t.device(), t.data_type());  \
+    fn(t, x, &ret);                                    \
+    return ret;                                        \
+  }                                                    \
+  template <typename SType>                            \
+  void fn(const Tensor& t, SType x, Tensor* ret) {     \
+    EltwiseTensorScalarFn(fn, t, x, ret);              \
+  }                                                    \
+  template Tensor op<float>(const Tensor& t, float x); \
+  template void fn<float>(const Tensor& t, const float x, Tensor* ret)
+
+GenTensorScalarFunction(operator+, Add);
+GenTensorScalarFunction(operator-, Sub);
+GenTensorScalarFunction(operator*, EltwiseMult);
+GenTensorScalarFunction(operator/, Div);
+GenTensorScalarFunction(Pow, Pow);
+
+// ================Blas operations============================================
+template <typename DType>
+Tensor Mult(const Tensor& lhs, const Tensor& rhs) {
+  Tensor ret(lhs.shape(), lhs.device(), lhs.data_type());
+  Mult<DType>(lhs, rhs, &ret);
+  return ret;
+}
+template Tensor Mult<float>(const Tensor& lhs, const Tensor& rhs);
+
+template <typename DType>
+void Mult(const Tensor& lhs, const Tensor& rhs, Tensor* ret) {
+  Mult(DType(1), lhs, DType(1), rhs, ret);
+}
+template void Mult<float>(const Tensor& lhs, const Tensor& rhs, Tensor* ret);
+
+template <typename DType>
+Tensor Mult(DType alpha, const Tensor& A, DType beta, const Tensor& B) {
+  Tensor ret(A.shape(), A.device(), A.data_type());
+  Mult<DType>(alpha, A, beta, B, &ret);
   return ret;
 }
+template Tensor Mult<float>(float alpha, const Tensor& lhs, float beta,
+    const Tensor& rhs);
 
-void Sub(const Tensor& lhs, const Tensor& rhs, Tensor *ret) {
-  TYPE_LIB_SWITCH(lhs.data_type(), DType, lhs.device()->device_lib(), Lib, {
-      ret->device()->Submit(
-        [lhs, rhs, ret](Context* ctx) {
-          Sub<DType, Lib>(
-            lhs.Size(),
-            lhs.blob(),
-            rhs.blob(),
-            ret->blob(),
-            ctx);}
-        , {lhs.blob(), rhs.blob()}, {ret->blob()});
+template <typename SType>
+void Mult(SType alpha, const Tensor& A, SType beta, const Tensor& B, Tensor* C)
+{
+  CHECK_EQ(A.shape().size(), 2);
+  bool transA = A.transpose();
+  int m = transA ? A.shape()[1] : A.shape()[0], n = 0;
+  if (B.shape().size() == 1) {
+    n = C->Size();
+    TYPE_LIB_SWITCH(A.data_type(), DType, A.device()->device_lib(), Lib, {
+      static_assert(std::is_same<SType, DType>::value,
+        "The scalar type must be the same as the tensor data type");
+      C->device()->Submit(
+        [transA, m, n, alpha, A, beta, B, C](Context* ctx) {
+        GEMV<DType, Lib>(transA, m, n, alpha, A.blob(),
+          B.blob(), beta, C->blob(), ctx);
+        },
+        {A.blob(), B.blob()}, {C->blob()});
       });
+  } else {
+    CHECK(!C->transpose());
+    bool transB = B.transpose();
+    int k = transB ? B.shape()[1] : B.shape()[0];
+    n = C->shape()[1];
+    CHECK_EQ(C->shape()[0], m);
+    CHECK_EQ(A.Size(), m * k);
+    CHECK_EQ(B.Size(), n * k);
+    TYPE_LIB_SWITCH(A.data_type(), DType, A.device()->device_lib(), Lib, {
+        static_assert(std::is_same<SType, DType>::value,
+          "The scalar type must be the same as the tensor data type");
+        C->device()->Submit(
+          [transA, transB, m, n, k, alpha, A, beta, B, C](Context* ctx) {
+          GEMM<DType, Lib>(transA, transB, m, n, k, alpha, A.blob(),
+            B.blob(), beta, C->blob(), ctx);
+          },
+          {A.blob(), B.blob()}, {C->blob()});
+        });
+  }
 }
+template void Mult<float>(float alpha, const Tensor& lhs, float beta,
+    const Tensor& rhs, Tensor* ret);
 
-// ================Blas operations============================================
 
 // ================Neural Net operations======================================
-
+/*
 void Conv(const OpConf* conf, const Tensor& input, const Tensor& W,
           const Tensor& b, Tensor* ret) {
   TYPE_LIB_SWITCH(input.data_type(), DType, input.device()->nn_lib(), Lib, {
@@ -218,5 +477,33 @@ void Conv(const OpConf* conf, const Tensor& input, const Tensor& W,
   });
 }
 */
+void Bernoulli(float threshold, Tensor* t) {
+  TYPE_LIB_SWITCH(t->data_type(), DType, t->device()->nn_lib(), Lib, {
+    t->device()->Submit(
+        [threshold, t](Context* ctx) {
+          Bernoulli<DType, Lib>(t->Size(), threshold, t->blob(), ctx);
+        },
+        {}, {t->blob()});
+  });
+}
+
+void Uniform(float low, float high, Tensor* t) {
+  TYPE_LIB_SWITCH(t->data_type(), DType, t->device()->nn_lib(), Lib, {
+    t->device()->Submit(
+        [low, high, t](Context* ctx) {
+          Uniform<DType, Lib>(t->Size(), low, high, t->blob(), ctx);
+        },
+        {}, {t->blob()});
+  });
+}
 
+void Gaussian(float mean, float std, Tensor* t) {
+  TYPE_LIB_SWITCH(t->data_type(), DType, t->device()->nn_lib(), Lib, {
+    t->device()->Submit(
+        [mean, std, t](Context* ctx) {
+          Gaussian<DType, Lib>(t->Size(), mean, std, t->blob(), ctx);
+        },
+        {}, {t->blob()});
+  });
+}
 }  // namespace singa

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/02851fac/src/core/tensor/tensor_math.h
----------------------------------------------------------------------
diff --git a/src/core/tensor/tensor_math.h b/src/core/tensor/tensor_math.h
new file mode 100644
index 0000000..a4f68e3
--- /dev/null
+++ b/src/core/tensor/tensor_math.h
@@ -0,0 +1,302 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef SINGA_CORE_MATH_H_
+#define SINGA_CORE_MATH_H_
+#include <type_traits>
+#include "singa/core/common.h"
+#include "singa/utils/logging.h"
+
+namespace singa {
+
+/// \file tensor_math.h Math functions for linear algebra, neural net and random
+/// operations.
+/// All functions have a template argument, DType for DataType, Lib for the
+/// backend library, e.g., lib::Cublas, lib::Cudnn, etc.
+
+/// Some operations have many config/hyper-parameters, e.g., Conv, and these
+/// configs vary among different implementations, e.g., cuda/cudnn/opencl.
+/// To separate the modules, we pass an OpConf pointer to the Tensor Op
+/// function. The specific fields are implemented by inheriting OpConf, and
+/// casting the pointer between the base and the sub-class.
+class OpConf {
+ public:
+  template <typename T>
+  T* CastTo() {
+    static_assert(std::is_base_of<OpConf, T>::value,
+                  "The cast type must be a sub-class of OpConf");
+    return static_cast<T*>(this);
+  }
+};
+
+// ================Linear algebra functions====================================
+/// ret[i] = |input[i]|
+template <typename DType, typename Lib>
+void Abs(int count, const Blob* input, Blob* ret, Context* ctx) {
+  LOG(FATAL) << "Not Implemented";
+}
+
+/// sum all elements of input into ret
+template <typename DType, typename Lib>
+void Sum(int count, const Blob* input, DType* ret, Context* ctx) {
+  LOG(FATAL) << "Not Implemented";
+}
+
+/// ret[i] = sign(input[i])
+template <typename DType, typename Lib>
+void Sign(int count, const Blob* input, Blob* ret, Context* ctx) {
+  LOG(FATAL) << "Not Implemented";
+}
+
+/// Base is e, Neper number. ret[i]=exp(input[i])
+template <typename DType, typename Lib>
+void Exp(int count, const Blob* input, Blob* ret, Context* ctx) {
+  LOG(FATAL) << "Not Implemented";
+}
+
+/// Natural logarithm, the base is e, Neper number. ret[i]=log(input[i]).
+template <typename DType, typename Lib>
+void Log(int count, const Blob* input, Blob* ret, Context* ctx) {
+  LOG(FATAL) << "Not Implemented";
+}
+
+/// Element-wise operation, ret[i]=sqrt(input[i])
+template <typename DType, typename Lib>
+void Sqrt(int count, const Blob* input, Blob* ret, Context* ctx) {
+  LOG(FATAL) << "Not Implemented";
+}
+
+/// Element-wise operation, ret[i]=tanh(input[i])
+template <typename DType, typename Lib>
+void Tanh(int count, const Blob* input, Blob* ret, Context* ctx) {
+  LOG(FATAL) << "Not Implemented";
+}
+/// Element-wise operation, ret[i]=max(0, input[i])
+template <typename DType, typename Lib>
+void ReLU(int count, const Blob* input, Blob* ret, Context* ctx) {
+  LOG(FATAL) << "Not Implemented";
+}
+/// Element-wise operation, ret[i]=sigmoid(input[i])
+template <typename DType, typename Lib>
+void Sigmoid(int count, const Blob* input, Blob* ret, Context* ctx) {
+  LOG(FATAL) << "Not Implemented";
+}
+
+/// Element-wise operation, do v^x for every v from the input tensor
+template <typename DType, typename Lib>
+void Pow(int count, const Blob* input, DType x, Blob* ret, Context* ctx) {
+  LOG(FATAL) << "Not Implemented";
+}
+
+/// Element-wise operation, do v^x for every v from the lhs and every x from rhs
+template <typename DType, typename Lib>
+void Pow(int count, const Blob* lhs, const Blob* rhs, Blob* ret, Context* ctx) {
+  LOG(FATAL) << "Not Implemented";
+}
+
+/// Element-wise operation, clamp every element into [low, high]
+/// if x>high, then x=high; if x<low, then x=low.
+template <typename DType, typename Lib>
+void Clamp(int count, DType low, DType high, const Blob* input, Blob* ret,
+           Context* ctx) {
+  LOG(FATAL) << "Not Implemented";
+}
+
+/// ret = input + x
+template <typename DType, typename Lib>
+void Add(int count, const Blob* input, DType x, Blob* ret, Context* ctx) {
+  LOG(FATAL) << "Not Implemented";
+}
+/// ret =  input - x
+template <typename DType, typename Lib>
+void Sub(int count, const Blob* input, DType x, Blob* ret, Context* ctx) {
+  Add<DType, Lib>(count, input, -x, ret, ctx);
+}
+/// ret = input * x
+template <typename DType, typename Lib>
+void EltwiseMult(int count, const Blob* input, DType x, Blob* ret, Context* ctx)
+{
+  LOG(FATAL) << "Not Implemented";
+}
+/// ret = input / x
+template <typename DType, typename Lib>
+void Div(int count, const Blob* input, DType x, Blob* ret, Context* ctx) {
+  EltwiseMult<DType, Lib>(count, input, DType(1) / x, ret, ctx);
+}
+
+/// ret = lhs + rhs
+template <typename DType, typename Lib>
+void Add(int count, const Blob* lhs, const Blob* rhs, Blob* ret, Context* ctx) {
+  LOG(FATAL) << "Not Implemented";
+}
+
+/// ret = lhs - rhs
+template <typename DType, typename Lib>
+void Sub(int count, const Blob* lhs, const Blob* rhs, Blob* ret, Context* ctx) {
+  LOG(FATAL) << "Not Implemented";
+}
+
+/// ret = lhs * rhs
+template <typename DType, typename Lib>
+void EltwiseMult(int count, const Blob* lhs, const Blob* rhs, Blob* ret,
+          Context* ctx) {
+  LOG(FATAL) << "Not Implemented";
+}
+
+/// ret = lhs / rhs
+template <typename DType, typename Lib>
+void Div(int count, const Blob* lhs, const Blob* rhs, Blob* ret, Context* ctx) {
+  LOG(FATAL) << "Not Implemented";
+}
+
+/// outer-product.
+/// lhs and rhs are vectors of len m and n. ret is matrix of shape m * n
+template <typename DType, typename Lib>
+void Outer(int m, int n, const Blob* lhs, const Blob* rhs, Blob* ret,
+           Context* ctx) {
+  LOG(FATAL) << "Not Implemented";
+}
+
+// TODO(wangwei) unify SumRow and SumCol.
+/// Sum the rows of the input matrix into a vector
+template <typename DType, typename Lib>
+void SumRow(int nrow, int ncol, const Blob* input, Blob* ret, Context* ctx) {
+  LOG(FATAL) << "Not Implemented";
+}
+/// Sum the columns of the input matrix into a vector
+template <typename DType, typename Lib>
+void SumCol(int nrow, int ncol, const Blob* input, Blob* ret, Context* ctx) {
+  LOG(FATAL) << "Not Implemented";
+}
+
+// TODO(wangwei) unify AddRow and AddCol.
+/// Add the vector v to every row of A as the row of ret
+template <typename DType, typename Lib>
+void AddRow(int nrow, int ncol, const Blob* A, const Blob* v, Blob* ret,
+            Context* ctx) {
+  LOG(FATAL) << "Not Implemented";
+}
+
+/// Add the vector v to every column of A as the column of ret
+template <typename DType, typename Lib>
+void AddCol(int nrow, int ncol, const Blob* A, const Blob* v, Blob* ret,
+            Context* ctx) {
+  LOG(FATAL) << "Not Implemented";
+}
+
+// ===== BLAS functions, ref to http://docs.nvidia.com/cuda/cublas
+// ===== Level 1
+/// return the index of the element with the max value.
+template <typename DType, typename Lib>
+void Amax(int count, const Blob* input, int* ret, Context* ctx) {
+  LOG(FATAL) << "Not Implemented";
+}
+
+/// return the index of the element with the min value.
+template <typename DType, typename Lib>
+void Amin(int count, const Blob* input, int* ret, Context* ctx) {
+  LOG(FATAL) << "Not Implemented";
+}
+/// ret = sum |x| for all x in input
+template <typename DType, typename Lib>
+void Asum(int count, const Blob* input, DType* ret, Context* ctx) {
+  LOG(FATAL) << "Not Implemented";
+}
+
+/// ret = alpha * input + ret
+template <typename DType, typename Lib>
+void Axpy(int count, DType alpha, const Blob* input, Blob* ret, Context* ctx) {
+  LOG(FATAL) << "Not Implemented";
+}
+
+/// ret *= x
+template <typename DType, typename Lib>
+void Scale(int count, DType x, Blob* ret, Context* ctx) {
+  LOG(FATAL) << "Not Implemented";
+}
+
+template <typename DType, typename Lib>
+void Dot(int count, const Blob* lhs, const Blob* rhs, DType* ret,
+         Context* ctx) {
+  LOG(FATAL) << "Not Implemented";
+}
+
+// ===== Level 2
+/// ret = alpha * op(A) * v + beta * ret.
+/// op(A) = A if trans = false; A^T otherwise; rows(op(A)) = m, cols(op(A)) = n.
+template <typename DType, typename Lib>
+void GEMV(bool trans, int m, int n, DType alpha, const Blob* A, const Blob* v,
+          DType beta, Blob* ret, Context* ctx) {
+  LOG(FATAL) << "Not Implemented";
+}
+
+// ===== Level 3
+/// ret = alpha * op(A) * op(B) + beta * ret.
+/// op(A) = A if trans = false; A^T otherwise; rows(ret) = m, cols(ret) = n.
+template <typename DType, typename Lib>
+void GEMM(bool transA, bool transB, int m, int n, int k, DType alpha,
+          const Blob* A, const Blob* B, DType beta, Blob* ret, Context* ctx) {
+  LOG(FATAL) << "Not Implemented";
+}
+
+// ================Random functions===========================================
+/// Each element of ret would be 1 with prob p and 0 with 1-p. 0<= p <= 1
+// Get the random generator from 'ctx'
+// If DType is not float, then convert the threshold to DType
+template <typename DType, typename Lib>
+void Bernoulli(int count, float threshold, Blob* ret, Context* ctx) {
+  LOG(FATAL) << "Not Implemented";
+}
+// The random generator should be extracted from ctx.
+// If DType is not float, then convert the low and high to DType
+template <typename DType, typename Lib>
+void Uniform(int count, float low, float high, Blob* ret, Context* ctx) {
+  LOG(FATAL) << "Not Implemented";
+}
+// The random generator should be extracted from ctx.
+// If DType is not float, then convert the mean and std to DType
+template <typename DType, typename Lib>
+void Gaussian(int count, float mean, float std, Blob* ret, Context* ctx) {
+  LOG(FATAL) << "Not Implemented";
+}
+
+// ================Neural net functions=======================================
+template <typename DType, typename Lib>
+void ConvFwd(ConvConf* conf, const Blob* x, const Blob* w, Blob* y,
+             Context* ctx) {
+  LOG(FATAL) << "Not Implemented";
+}
+
+template <typename DType, typename Lib>
+void ConvBwdBias(const ConvConf* conf, const Blob* dy, Blob* db, Context* ctx) {
+  LOG(FATAL) << "Not Implemented";
+}
+
+template <typename DType, typename Lib>
+void PoolFwd(const PoolConf* conf, const Blob* x, Blob* y, Context* ctx) {
+  LOG(FATAL) << "Not Implemented";
+}
+
+template <typename DType, typename Lib>
+void PoolBwd(const PoolConf* conf, const Blob* y, const Blob* dy, const Blob* x,
+             Blob* dx, Context* ctx) {
+  LOG(FATAL) << "Not Implemented";
+}
+
+}  // namespace singa
+
+#endif  // SINGA_CORE_MATH_H_

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/02851fac/src/core/tensor/tensor_math_cpp.h
----------------------------------------------------------------------
diff --git a/src/core/tensor/tensor_math_cpp.h b/src/core/tensor/tensor_math_cpp.h
new file mode 100644
index 0000000..a953085
--- /dev/null
+++ b/src/core/tensor/tensor_math_cpp.h
@@ -0,0 +1,57 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef SINGA_CORE_TENSOR_TENSOR_MATH_CPP_H_
+#define SINGA_CORE_TENSOR_TENSOR_MATH_CPP_H_
+#include "./tensor_math.h"
+#include "singa/core/common.h"
+
+#ifdef USE_CBLAS
+#include <cblas.h>
+#endif
+
+namespace singa {
+template<>
+void Add<float, lib::Cpp>(int count,
+                     const Blob* lhs,
+                     const Blob* rhs,
+                     Blob* ret,
+                     Context* ctx) {
+  // CHECK_EQ(ctx->stream, nullptr);
+  float *dptr = static_cast<float*>(ret->mutable_data());
+  const float *lptr = static_cast<const float*>(lhs->data());
+  const float *rptr = static_cast<const float*>(rhs->data());
+  for (int i = 0; i < count; i++) {
+    dptr[i] = lptr[i] + rptr[i];
+  }
+}
+
+#ifdef USE_CBLAS
+/// ret = sum_i lhs[i] * rhs[i] over `count` floats, computed by cblas_sdot.
+/// lhs and rhs are read only; the scalar result is stored through ret.
+template<>
+void Dot<float, lib::Cpp>(int count,
+                     const Blob* lhs,
+                     const Blob* rhs,
+                     float* ret,
+                     Context* ctx) {
+  // ret is already a plain float*, so only the two blob payloads need to be
+  // cast to typed pointers (same casts as Add<float, lib::Cpp> in this file).
+  const float *lptr = static_cast<const float*>(lhs->data());
+  const float *rptr = static_cast<const float*>(rhs->data());
+  *ret = cblas_sdot(count, lptr, 1, rptr, 1);
+}
+
+#endif
+}
+
+#endif  // SINGA_CORE_TENSOR_TENSOR_MATH_CPP_H_

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/02851fac/src/core/tensor/tensor_math_cuda.h
----------------------------------------------------------------------
diff --git a/src/core/tensor/tensor_math_cuda.h b/src/core/tensor/tensor_math_cuda.h
new file mode 100644
index 0000000..e1c72d8
--- /dev/null
+++ b/src/core/tensor/tensor_math_cuda.h
@@ -0,0 +1,53 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef  SINGA_CORE_TENSOR_TENSOR_MATH_CUDA_H_
+#define  SINGA_CORE_TENSOR_TENSOR_MATH_CUDA_H_
+#include "./tensor_math.h"
+#include "singa/core/common.h"
+
+
+namespace singa {
+
+#ifdef USE_CUDA
+/// ret = lhs + rhs for `count` floats: copy lhs into ret, then ret += 1 * rhs.
+/// NOTE(review): if ret aliases rhs, the copy overwrites rhs before it is
+/// added - confirm callers never pass ret == rhs on this backend.
+template<>
+void Add<float, lib::Cuda>(int count, const Blob* lhs, const Blob* rhs,
+                        Blob* ret, Context* ctx) {
+  // The handle-based cuBLAS API takes the element count and reads alpha
+  // through a pointer.
+  const float alpha = 1.0f;
+  const float* lptr = static_cast<const float*>(lhs->data());
+  const float* rptr = static_cast<const float*>(rhs->data());
+  float* dptr = static_cast<float*>(ret->mutable_data());
+  cublasSetStream(ctx->handle, ctx->stream);
+  cublasScopy(ctx->handle, count, lptr, 1, dptr, 1);
+  cublasSaxpy(ctx->handle, count, &alpha, rptr, 1, dptr, 1);
+}
+
+#ifdef USE_CUDNN
+template<>
+void Conv<float, lib::Cudnn>(const OpConf *conf,
+          const Blob* input,
+          const Blob* W,
+          const Blob* b,
+          Blob* ret,
+          Context* ctx) {
+  // auto conv_conf = conf->CastTo<ConvConf>();
+  // conv op
+}
+
+#endif
+#endif
+}
+
+
+#endif  // SINGA_CORE_TENSOR_TENSOR_MATH_CUDA_H_

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/02851fac/src/core/tensor/tensor_math_opencl.h
----------------------------------------------------------------------
diff --git a/src/core/tensor/tensor_math_opencl.h b/src/core/tensor/tensor_math_opencl.h
new file mode 100644
index 0000000..c4b1347
--- /dev/null
+++ b/src/core/tensor/tensor_math_opencl.h
@@ -0,0 +1,28 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef  SINGA_CORE_TENSOR_TENSOR_MATH_OPENCL_H_
+// Define the guard macro so a second inclusion of this header is skipped.
+#define  SINGA_CORE_TENSOR_TENSOR_MATH_OPENCL_H_
+#include "./tensor_math.h"
+
+namespace singa {
+
+
+}
+
+
+#endif  // SINGA_CORE_TENSOR_TENSOR_MATH_OPENCL_H_

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/02851fac/src/model/layer/layer.cc
----------------------------------------------------------------------
diff --git a/src/model/layer/layer.cc b/src/model/layer/layer.cc
index 1f0e34d..0e83cde 100644
--- a/src/model/layer/layer.cc
+++ b/src/model/layer/layer.cc
@@ -18,5 +18,13 @@
 #include "singa/model/layer.h"
 
 namespace singa {
+const vector<Tensor> ComputeFeature(int flag, const vector<Tensor>& input) {
+  const vector<Blob*> input_blobs;
 
+}
+
+void ComputeFeature(int flag, const vector<Tensor>& input) {
+  const vector<Blob*> input_blobs;
+
+}
 }  // namespace singa

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/02851fac/src/proto/layer.proto
----------------------------------------------------------------------
diff --git a/src/proto/layer.proto b/src/proto/layer.proto
index bb87af9..0fbbb5d 100644
--- a/src/proto/layer.proto
+++ b/src/proto/layer.proto
@@ -97,6 +97,10 @@ message ParamSpec {
 
   // The multiplier on the global weight decay for this parameter.
   optional float decay_mult = 4 [default = 1.0];
+
+  // SINGA field for creating different Param types, e.g. SparseParam or
+  // CompressableParam. Currently there is only a default Param implementation.
+  optional string type = 20 [default = "default"];
 }
 
 // NOTE
@@ -154,27 +158,27 @@ message LayerConf {
   optional ConcatConf concat_conf = 104;
   optional ContrastiveLossConf contrastive_loss_conf = 105;
   optional ConvolutionConf convolution_conf = 106;
-  optional DataConf data_conf = 107;
+  // optional DataConf data_conf = 107;
   optional DropoutConf dropout_conf = 108;
-  optional DummyDataConf dummy_data_conf = 109;
+  // optional DummyDataConf dummy_data_conf = 109;
   optional EltwiseConf eltwise_conf = 110;
   optional EmbedConf embed_conf = 137;
   optional ExpConf exp_conf = 111;
   optional FlattenConf flatten_conf = 135;
-  optional HDF5DataConf hdf5_data_conf = 112;
-  optional HDF5OutputConf hdf5_output_conf = 113;
+  // optional HDF5DataConf hdf5_data_conf = 112;
+  // optional HDF5OutputConf hdf5_output_conf = 113;
   optional HingeLossConf hinge_loss_conf = 114;
-  optional ImageDataConf image_data_conf = 115;
+  // optional ImageDataConf image_data_conf = 115;
   optional InfogainLossConf infogain_loss_conf = 116;
   optional InnerProductConf inner_product_conf = 117;
   optional LogConf log_conf = 134;
   optional LRNConf lrn_conf = 118;
-  optional MemoryDataConf memory_data_conf = 119;
+  // optional MemoryDataConf memory_data_conf = 119;
   optional MVNConf mvn_conf = 120;
   optional PoolingConf pooling_conf = 121;
   optional PowerConf power_conf = 122;
   optional PReLUConf prelu_conf = 131;
-  optional PythonConf python_conf = 130;
+  // optional PythonConf python_conf = 130;
   optional ReductionConf reduction_conf = 136;
   optional ReLUConf relu_conf = 123;
   optional ReshapeConf reshape_conf = 133;
@@ -185,7 +189,7 @@ message LayerConf {
   optional TanHConf tanh_conf = 127;
   optional ThresholdConf threshold_conf = 128;
   optional TileConf tile_conf = 138;
-  optional WindowDataConf window_data_conf = 129;
+  //optional WindowDataConf window_data_conf = 129;
 }
 
 // Message that stores hyper-parameters used to apply transformation
@@ -835,7 +839,7 @@ message PReLUConf {
   // Surpassing Human-Level Performance on ImageNet Classification, 2015.
 
   // Initial value of a_i. Default is a_i=0.25 for all i.
-  optional FillerParameter filler = 1;
+  optional FillerConf filler = 1;
   // Whether or not slope paramters are shared across channels.
   optional bool channel_shared = 2 [default = false];
 }

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/02851fac/test/singa/test_cpp_math.cc
----------------------------------------------------------------------
diff --git a/test/singa/test_cpp_math.cc b/test/singa/test_cpp_math.cc
index 268785d..78c713f 100644
--- a/test/singa/test_cpp_math.cc
+++ b/test/singa/test_cpp_math.cc
@@ -20,8 +20,6 @@
 *************************************************************/
 
 #include "gtest/gtest.h"
-#include "singa/core/math.h"
+#include "../src/core/tensor/tensor_math_cpp.h"
 
-TEST(CppMath, Add) {
 
-}

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/02851fac/test/singa/test_tensor.cc
----------------------------------------------------------------------
diff --git a/test/singa/test_tensor.cc b/test/singa/test_tensor.cc
index 04068ae..86200a8 100644
--- a/test/singa/test_tensor.cc
+++ b/test/singa/test_tensor.cc
@@ -15,7 +15,7 @@ TEST(TensorTest, TestConstructor) {
 
   EXPECT_NE(float_t.device(), nullptr);
 
-  singa::Tensor float16_t(singa::Shape{2,3}, singa::kFloat16);
+  singa::Tensor float16_t(Shape{2,3}, singa::kFloat16);
   EXPECT_EQ(singa::kFloat16, float16_t.data_type());
   EXPECT_EQ(6, float16_t.Size());
   EXPECT_EQ(12, float16_t.blob()->size());
@@ -68,7 +68,7 @@ TEST(TensorClass, ToDevice) {
 TEST(TensorClass, CopyDataFromHostPtr) {
   float data[] = {1.0f, 2.0f, 3.0f};
   Tensor t(Shape{3});
-  t.CopyDataFromHostPtr(data, sizeof(float) * 3);
+  t.CopyDataFromHostPtr(data, 3);
   const float* dptr = static_cast<const float*>(t.blob()->data());
   EXPECT_FLOAT_EQ(1.0f, dptr[0]);
   EXPECT_FLOAT_EQ(2.0f, dptr[1]);
@@ -78,7 +78,7 @@ TEST(TensorClass, CopyDataFromHostPtr) {
 TEST(TensorClass, CopyData) {
   float data[] = {1.0f, 2.0f, 3.0f};
   Tensor t(Shape{3});
-  t.CopyDataFromHostPtr(data, sizeof(float) * 3);
+  t.CopyDataFromHostPtr(data, 3);
 
   Tensor o(Shape{3});
   o.CopyData(t);
@@ -91,7 +91,7 @@ TEST(TensorClass, CopyData) {
 TEST(TensorClass, Clone) {
   float data[] = {1.0f, 2.0f, 3.0f};
   Tensor t(Shape{3});
-  t.CopyDataFromHostPtr(data, sizeof(float) * 3);
+  t.CopyDataFromHostPtr(data, 3);
 
   Tensor o = t.Clone();
   const float* dptr = static_cast<const float*>(o.blob()->data());
@@ -110,30 +110,5 @@ TEST(TensorClass, T) {
   EXPECT_TRUE((t.shape() ==  o.shape()));
 }
 
-TEST(TensorClass, Add) {
-  const float data[] = {1.0f, 2.0f, 3.0f, 1.1f, 2.1f, 3.1f};
-  Tensor t(Shape{3});
-  t.CopyDataFromHostPtr(data, sizeof(float) * 3);
 
-  Tensor o = t.Clone();
-  o += t;
-  const float* dptr = o.data<float>();
-  EXPECT_FLOAT_EQ(2.0f, dptr[0]);
-  EXPECT_FLOAT_EQ(4.0f, dptr[1]);
-  EXPECT_FLOAT_EQ(6.0f, dptr[2]);
-
-  Tensor p(Shape{3});
-  o += p;
-  const float* dptr1 = o.data<float>();
-  EXPECT_FLOAT_EQ(2.0f, dptr1[0]);
-  EXPECT_FLOAT_EQ(4.0f, dptr1[1]);
-  EXPECT_FLOAT_EQ(6.0f, dptr1[2]);
-
-  Tensor q(Shape{3});
-  q.CopyDataFromHostPtr(data + 3, sizeof(float) * 3);
-  t += q;
-  const float* dptr2 = t.data<float>();
-  EXPECT_FLOAT_EQ(2.1f, dptr2[0]);
-  EXPECT_FLOAT_EQ(4.1f, dptr2[1]);
-  EXPECT_FLOAT_EQ(6.1f, dptr2[2]);
-}
+

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/02851fac/test/singa/test_tensor_math.cc
----------------------------------------------------------------------
diff --git a/test/singa/test_tensor_math.cc b/test/singa/test_tensor_math.cc
new file mode 100644
index 0000000..51e7cfb
--- /dev/null
+++ b/test/singa/test_tensor_math.cc
@@ -0,0 +1,84 @@
+#include "gtest/gtest.h"
+#include "singa/core/tensor.h"
+using singa::Tensor;
+using singa::Shape;
+using singa::Device;
+
+class TestTensorMath : public ::testing::Test {
+ protected:
+  virtual void SetUp() {
+    const float dat1[] = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f};
+    const float dat2[] = {1.1f, 2.1f, 3.1f, 4.1f, 5.1f, 6.1f};
+    a.ReShape(singa::Shape{6});
+    b.ReShape(singa::Shape{6});
+    c.ReShape(singa::Shape{6, 1});
+    d.ReShape(singa::Shape{3, 2});
+
+    a.CopyDataFromHostPtr<float>(dat1, 6);
+    b.CopyDataFromHostPtr<float>(dat2, 6);
+  }
+  Tensor a, b, c, d;
+};
+
+TEST_F(TestTensorMath, MemberAddTensor) {
+  Tensor aa = a.Clone();
+  aa += a;
+  const float* dptr = aa.data<float>();
+  EXPECT_FLOAT_EQ(2.0f, dptr[0]);
+  EXPECT_FLOAT_EQ(4.0f, dptr[1]);
+  EXPECT_FLOAT_EQ(6.0f, dptr[2]);
+
+  // check p is initialized to 0
+  Tensor p(Shape{6});
+  p += aa;
+  const float* dptr1 = p.data<float>();
+  EXPECT_FLOAT_EQ(2.0f, dptr1[0]);
+  EXPECT_FLOAT_EQ(4.0f, dptr1[1]);
+  EXPECT_FLOAT_EQ(6.0f, dptr1[2]);
+
+  a += b;
+  const float* dptr2 = a.data<float>();
+  EXPECT_FLOAT_EQ(2.1f, dptr2[0]);
+  EXPECT_FLOAT_EQ(4.1f, dptr2[1]);
+  EXPECT_FLOAT_EQ(6.1f, dptr2[2]);
+  EXPECT_FLOAT_EQ(12.1f, dptr2[5]);
+}
+/*
+TEST(TensorClass, SubTensor) {
+  Tensor a(Shape{2,3}), b(Shape{6});
+  float x[]={1.f, 2.f, 3.f, 4.f, 5.f, 6.f};
+  float y[]={1.1f, 2.1f, 3.1f, 4.1f, 5.1f, 6.1f};
+  a.CopyDataFromHostPtr(x, 6);
+  b.CopyDataFromHostPtr(y, 6);
+  b -= a;
+  const float* dptr = b.data<float>();
+  EXPECT_FLOAT_EQ(0.1f, dptr[0]);
+  EXPECT_FLOAT_EQ(0.1f, dptr[1]);
+  EXPECT_FLOAT_EQ(0.1f, dptr[2]);
+  EXPECT_FLOAT_EQ(0.1f, dptr[5]);
+}
+*/
+
+TEST_F(TestTensorMath, AddTensors) {
+  Tensor ret(a.shape(), a.device(), a.data_type());
+  Add(a, b, &ret);
+  const float* dptr = ret.data<float>();
+  EXPECT_FLOAT_EQ(2.1f, dptr[0]);
+  EXPECT_FLOAT_EQ(4.1f, dptr[1]);
+  EXPECT_FLOAT_EQ(6.1f, dptr[2]);
+  EXPECT_FLOAT_EQ(12.1f, dptr[5]);
+
+  const Tensor d = a + b;
+  const float* dptr2 = d.data<float>();
+  EXPECT_FLOAT_EQ(2.1f, dptr2[0]);
+  EXPECT_FLOAT_EQ(4.1f, dptr2[1]);
+  EXPECT_FLOAT_EQ(6.1f, dptr2[2]);
+  EXPECT_FLOAT_EQ(12.1f, dptr2[5]);
+
+  Add(a, b, &a);
+  const float* dptr1 = a.data<float>();
+  EXPECT_FLOAT_EQ(2.1f, dptr1[0]);
+  EXPECT_FLOAT_EQ(4.1f, dptr1[1]);
+  EXPECT_FLOAT_EQ(6.1f, dptr1[2]);
+  EXPECT_FLOAT_EQ(12.1f, dptr1[5]);
+}