You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@singa.apache.org by zh...@apache.org on 2016/06/13 13:20:29 UTC

[36/50] [abbrv] incubator-singa git commit: SINGA-182 Clean math function APIs and implementations

SINGA-182 Clean math function APIs and implementations

Clean tensor.h/.cc and tensor_math.h, tensor_math_cpp.h:
re-order the functions by (type, name), where type is a) element-wise
function b) matrix function c) random function d) blas function

Implement GEMV using cblas and cublas.


Project: http://git-wip-us.apache.org/repos/asf/incubator-singa/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-singa/commit/564c88ad
Tree: http://git-wip-us.apache.org/repos/asf/incubator-singa/tree/564c88ad
Diff: http://git-wip-us.apache.org/repos/asf/incubator-singa/diff/564c88ad

Branch: refs/heads/master
Commit: 564c88ad95e976e6067198c832f4fcd9a8878cd7
Parents: 07c49da
Author: wangwei <wa...@gmail.com>
Authored: Fri Jun 10 23:12:09 2016 +0800
Committer: Wei Wang <wa...@comp.nus.edu.sg>
Committed: Sun Jun 12 12:15:11 2016 +0800

----------------------------------------------------------------------
 include/singa/core/tensor.h        | 396 +++++++++---------
 src/core/tensor/tensor.cc          | 688 ++++++++++++++++----------------
 src/core/tensor/tensor_math.h      | 336 ++++++++--------
 src/core/tensor/tensor_math_cpp.h  | 640 +++++++++++++++--------------
 src/core/tensor/tensor_math_cuda.h | 158 ++++----
 test/singa/test_tensor_math.cc     |  15 +-
 6 files changed, 1131 insertions(+), 1102 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/564c88ad/include/singa/core/tensor.h
----------------------------------------------------------------------
diff --git a/include/singa/core/tensor.h b/include/singa/core/tensor.h
index bb8d7f8..82bbe81 100644
--- a/include/singa/core/tensor.h
+++ b/include/singa/core/tensor.h
@@ -32,17 +32,6 @@ using std::tuple;
 namespace singa {
 
 typedef vector<size_t> Shape;
-typedef Shape::iterator ShapeIter;
-inline size_t Product(const Shape &shape, int start = 0, size_t len = 0) {
-  if (len == 0)
-    len = shape.size();
-  CHECK_LE(len, shape.size());
-  size_t v = 1;
-  for (unsigned int i = start; i < len; i++)
-    v *= shape[i];
-  return v;
-}
-
 /// hardcode the width of types defined in DataType
 const size_t kDataWidth[] = {sizeof(float), sizeof(float) / 2, sizeof(int),
                              sizeof(char), sizeof(double)};
@@ -65,10 +54,10 @@ class Tensor {
  public:
   ~Tensor();
   Tensor();
-  explicit Tensor(Shape &&shape, DataType dtype = kFloat32);
-  explicit Tensor(const Shape &shape, DataType dtype = kFloat32);
-  Tensor(Shape &&shape, Device *dev, DataType dtype = kFloat32);
-  Tensor(const Shape &shape, Device *dev, DataType dtype = kFloat32);
+  explicit Tensor(Shape &&shape, const DataType dtype = kFloat32);
+  explicit Tensor(const Shape &shape, const DataType dtype = kFloat32);
+  Tensor(Shape &&shape, Device *dev, const DataType dtype = kFloat32);
+  Tensor(const Shape &shape, Device *dev, const DataType dtype = kFloat32);
 
   /// Copy Tensor to share the internal data.  No deep copy.
   Tensor(const Tensor &from);
@@ -82,10 +71,10 @@ class Tensor {
 
   Device *device() const { return device_; }
 
-  /// Return immutable Tensor values with given type.
-  template <typename DType>
-  DType data() const {
-    return static_cast<DType>(blob()->data());
+  /// return immutable Tensor values with given type.
+  template <typename SType>
+  SType data() const {
+    return static_cast<SType>(blob()->data());
   }
 
   /// data type, including kFloat16, kFloat32, kInt
@@ -93,7 +82,7 @@ class Tensor {
 
   const Shape &shape() const { return shape_; }
 
-  const size_t shape(size_t idx) const {
+  const size_t shape(const size_t idx) const {
     CHECK_LT(idx, shape_.size());
     return shape_.at(idx);
   }
@@ -102,13 +91,13 @@ class Tensor {
 
   bool transpose() const { return transpose_; }
 
-  /// Return number of total elements
+  /// return number of total elements
   size_t Size() const {
     CHECK_EQ(blob_->size() % SizeOf(data_type_), 0u);
     return blob_->size() / SizeOf(data_type_);
   }
 
-  /// Return memory size (i.e., Bytes)
+  /// return memory size (i.e., Bytes)
   size_t MemSize() const { return blob_->size(); }
 
   /// Reset the tensor shape, it may reallocate blob, if MemSize() changes.
@@ -121,7 +110,7 @@ class Tensor {
   void ResetLike(const Tensor &t);
 
   /// Reset the data type, it would reallocate blob if type changes.
-  void AsType(DataType type);
+  void AsType(const DataType type);
 
   /// Reset the device.
   /// If the target device is a diff device, then do deep data copy.
@@ -135,14 +124,14 @@ class Tensor {
   void SetValue(const SType x);
 
   /// For init the tensor values, copy 'num' elements.
-  template <typename DType>
-  void CopyDataFromHostPtr(const DType *src, size_t num);
+  template <typename SType>
+  void CopyDataFromHostPtr(const SType *src, const size_t num);
 
   /// Copy data from another Tensor which may be on a diff device.
   /// Meta data would not be copied!
   void CopyData(const Tensor &other);
 
-  /// Return an exactly the same Tensor with data been deep copied.
+  /// return an exactly the same Tensor with data been deep copied.
   Tensor Clone() const;
 
   // Tensor operations
@@ -152,42 +141,37 @@ class Tensor {
   Tensor T() const;
 
   /// Copy the meta info with data blob shared.
-  Tensor &operator=(const Tensor &t);
+  Tensor &operator=(const Tensor &in);
 
   /// Copy the meta info with data blob shared.
-  Tensor &operator=(Tensor &&t);
+  Tensor &operator=(Tensor &&in);
 
-  Tensor &operator+=(const Tensor &t);
-  // void operator+=(Tensor&& t);
-  Tensor &operator-=(const Tensor &t);
-  // void operator-=(Tensor&& t);
-  Tensor &operator*=(const Tensor &t);
-  // void operator*=(Tensor&& t);
-  Tensor &operator/=(const Tensor &t);
-  // void operator/=(Tensor&& t);
+  Tensor &operator+=(const Tensor &in);
+  // void operator+=(Tensor&& in);
+  Tensor &operator-=(const Tensor &in);
+  // void operator-=(Tensor&& in);
+  Tensor &operator*=(const Tensor &in);
+  // void operator*=(Tensor&& in);
+  Tensor &operator/=(const Tensor &in);
+  // void operator/=(Tensor&& in);
 
   // Scalar operations.
 
-  /// T is a scalar type
-  template <typename DType>
-  Tensor &operator+=(DType x);
-
-  /// T is a scalar type
-  template <typename DType>
-  Tensor &operator-=(const DType x);
+  /// SType is a scalar type
+  template <typename SType>
+  Tensor &operator+=(const SType x);
 
-  /// T is a scalar type
-  template <typename DType>
-  Tensor &operator*=(const DType x);
+  /// SType is a scalar type
+  template <typename SType>
+  Tensor &operator-=(const SType x);
 
-  /// T is a scalar type
-  template <typename DType>
-  Tensor &operator/=(const DType x);
+  /// SType is a scalar type
+  template <typename SType>
+  Tensor &operator*=(const SType x);
 
-  /// save Tensor into a proto msg
-  // void ToProto(TensorProto* t);
-  /// load Tensor from proto msg
-  // void FromProto(const TensorProto& t);
+  /// SType is a scalar type
+  template <typename SType>
+  Tensor &operator/=(const SType x);
 
  protected:
   bool transpose_ = false;
@@ -196,14 +180,29 @@ class Tensor {
   /// Note: blob_ is allocated in lazy manner to avoid frequent malloc/free.
   /// If you want to get an allocated Blob, use blob() instead of blob_.
   Blob *blob_ = nullptr;
-  Shape shape_;
+  Shape shape_ = {};
 };
 
+typedef Shape::iterator ShapeIter;
+inline size_t Product(const Shape &shape, int start = 0, size_t len = 0) {
+  if (len == 0) len = shape.size();
+  CHECK_LE(len, shape.size());
+  size_t v = 1;
+  for (unsigned int i = start; i < len; i++) v *= shape[i];
+  return v;
+}
+
 inline void CheckDataTypeAndLang(const Tensor &in1, const Tensor &in2) {
   CHECK_EQ(in1.data_type(), in2.data_type());
   CHECK_EQ(in1.device()->lang(), in2.device()->lang());
 }
 
+template <typename FromType, typename ToType>
+ToType TypeCast(const FromType &x) {
+  // TODO(wangwei) cast fp16; prevent some casts, e.g., float to char
+  return static_cast<ToType>(x);
+}
+
 Tensor Reshape(const Tensor &in, const Shape &s);
 Tensor Reshape(const Tensor &in, Shape &&s);
 
@@ -212,192 +211,171 @@ Tensor Reshape(const Tensor &in, Shape &&s);
 
 /// Copy 'num' elements of src to dst.
 /// The first 'src_offset' ('dst_offset') elements will be skipped.
-void CopyDataToFrom(Tensor *dst, const Tensor &src, size_t num,
-                    size_t src_offset = 0, size_t dst_offset = 0);
-
-// ==================Simple Linear Algebra Operations=========================
-Tensor Abs(const Tensor &t);
-Tensor Exp(const Tensor &t);
-Tensor Log(const Tensor &t);
-Tensor ReLU(const Tensor &t);
-Tensor Sigmoid(const Tensor &t);
-Tensor Sign(const Tensor &t);
-Tensor Sqrt(const Tensor &t);
-Tensor Square(const Tensor &t);
-Tensor Tanh(const Tensor &t);
+void CopyDataToFrom(Tensor *dst, const Tensor &src, const size_t num,
+                    const size_t src_offset = 0, const size_t dst_offset = 0);
+
+// =============Element-wise operations====================================
+Tensor Abs(const Tensor &in);
+Tensor Exp(const Tensor &in);
+Tensor Log(const Tensor &in);
+Tensor ReLU(const Tensor &in);
+Tensor Sigmoid(const Tensor &in);
+Tensor Sign(const Tensor &in);
+Tensor Sqrt(const Tensor &in);
+Tensor Square(const Tensor &in);
+Tensor Tanh(const Tensor &in);
+
+/// Element-wise opeartion, out[i]=in[i]^x
+template <typename SType>
+Tensor Pow(const Tensor &in, const SType x);
+/// Element-wise opeartion, out[i]=in[i]^x
+template <typename SType>
+void Pow(const Tensor &in, const SType x, Tensor *out);
+/// Element-wise opeartion, out[i]=baes[i]^exp[i]
+Tensor Pow(const Tensor &base, const Tensor &exp);
+/// Element-wise opeartion, out[i]=baes[i]^exp[i]
+void Pow(const Tensor &base, const Tensor &exp, Tensor *out);
 
+/// Element-wise operation, out[i]= (in[i] < x) ? 1.f : 0.f
 template <typename SType>
-SType Sum(const Tensor &t);
-/// Sum elements in the Tensor, currently only support vector and matrix.
-/// if 'axis' is 0, sum all rows into a single row
-/// if 'axis' is 1, sum all columns into a single column
-/// TODO(wangwei) support arbitrary Tensor like numpy.sum
-Tensor Sum(const Tensor &t, int axis);
+Tensor operator<(const Tensor &in, const SType x);
+template <typename SType>
+void LT(const Tensor &in, const SType x, Tensor *out);
 
-/// Average elements in the Tensor, currently only support vector and matrix.
-/// if 'axis' is 0, average all rows into a single row
-/// if 'axis' is 1, average all columns into a single column
-/// TODO(wangwei) support arbitrary Tensor like numpy.average
-Tensor Average(const Tensor &t, int axis);
-/// Regarding the internal data as 2d, with shape_[0]*...*shape_[axis-1] rows,
-/// and shape_[axis]*...*shape_[nDim()] columns.
-/// and do softmax along each row.
-Tensor SoftMax(const Tensor &t, int axis = 0);
-void SoftMax(const Tensor &t, int axis, Tensor *ret);
+/// Element-wise operation, out[i]= (in[i] <= x) ? 1.f : 0.f
+template <typename SType>
+Tensor operator<=(const Tensor &in, const SType x);
+template <typename SType>
+void LE(const Tensor &in, const SType x, Tensor *out);
 
-/// Regarding the internal data as 2d, with shape_[0]*...*shape_[axis] rows,
-/// and shape_[axis+1]*...*shape_[nDim()] columns.
-/// and do softmax along each row.
-// Tensor Softmax(const Tensor& t, int axis = -1);
-// void Softmax(const Tensor& t, Tensor* ret, int axis = -1);
-
-/// Element-wise operation, ret[i]= (t[i] < x) ? 1.f : 0.f
-template <typename DType>
-Tensor operator<(const Tensor &t, const DType x);
-template <typename DType>
-void LT(const Tensor &t, DType x, Tensor *ret);
-
-/// Element-wise operation, ret[i]= (t[i] <= x) ? 1.f : 0.f
-template <typename DType>
-Tensor operator<=(const Tensor &t, const DType x);
-template <typename DType>
-void LE(const Tensor &t, DType x, Tensor *ret);
-
-/// Element-wise operation, ret[i]= (t[i] > x) ? 1.f : 0.f
-template <typename DType>
-Tensor operator>(const Tensor &t, const DType x);
-template <typename DType>
-void GT(const Tensor &t, DType x, Tensor *ret);
-
-/// Element-wise operation, ret[i]= (t[i] >= x) ? 1.f : 0.f
-template <typename DType>
-Tensor operator>=(const Tensor &t, const DType x);
-template <typename DType>
-void GE(const Tensor &t, DType x, Tensor *ret);
-
-/// Element-wise opeartion, ret[i]=t[i]^x
-template <typename DType>
-Tensor Pow(const Tensor &t, DType x);
-/// Element-wise opeartion, ret[i]=t[i]^x
-template <typename DType>
-void Pow(const Tensor &t, DType x, Tensor *ret);
-/// Element-wise opeartion, ret[i]=baes[i]^exp[i]
-Tensor Pow(const Tensor &base, Tensor exp);
-/// Element-wise opeartion, ret[i]=baes[i]^exp[i]
-void Pow(const Tensor &base, const Tensor &exp, Tensor *ret);
+/// Element-wise operation, out[i]= (in[i] > x) ? 1.f : 0.f
+template <typename SType>
+Tensor operator>(const Tensor &in, const SType x);
+template <typename SType>
+void GT(const Tensor &in, const SType x, Tensor *out);
+
+/// Element-wise operation, out[i]= (in[i] >= x) ? 1.f : 0.f
+template <typename SType>
+Tensor operator>=(const Tensor &in, const SType x);
+template <typename SType>
+void GE(const Tensor &in, const SType x, Tensor *out);
 
 Tensor operator+(const Tensor &lhs, const Tensor &rhs);
-void Add(const Tensor &lhs, const Tensor &rhs, Tensor *ret);
+void Add(const Tensor &lhs, const Tensor &rhs, Tensor *out);
 Tensor operator-(const Tensor &lhs, const Tensor &rhs);
-void Sub(const Tensor &lhs, const Tensor &rhs, Tensor *ret);
+void Sub(const Tensor &lhs, const Tensor &rhs, Tensor *out);
 Tensor operator*(const Tensor &lhs, const Tensor &rhs);
-void EltwiseMult(const Tensor &lhs, const Tensor &rhs, Tensor *ret);
+void EltwiseMult(const Tensor &lhs, const Tensor &rhs, Tensor *out);
 Tensor operator/(const Tensor &lhs, const Tensor &rhs);
-void Div(const Tensor &lhs, const Tensor &rhs, Tensor *ret);
+void Div(const Tensor &lhs, const Tensor &rhs, Tensor *out);
 
-template <typename DType>
-Tensor operator+(const Tensor &t, DType x);
-template <typename DType>
-void Add(const Tensor &t, DType x, Tensor *ret);
-
-template <typename DType>
-Tensor operator-(const Tensor &t, DType x);
-template <typename DType>
-void Sub(const Tensor &t, DType x, Tensor *ret);
-
-template <typename DType>
-Tensor operator*(const Tensor &t, DType x);
-template <typename DType>
-void EltwiseMult(const Tensor &t, DType x, Tensor *ret);
-
-template <typename DType>
-Tensor operator/(const Tensor &t, DType x);
-template <typename DType>
-void Div(const Tensor &t, DType x, Tensor *ret);
+template <typename SType>
+Tensor operator+(const Tensor &in, const SType x);
+template <typename SType>
+void Add(const Tensor &in, const SType x, Tensor *out);
 
-// ================Blas operations============================================
-// We fix the scalar argument type to be float.
+template <typename SType>
+Tensor operator-(const Tensor &in, const SType x);
+template <typename SType>
+void Sub(const Tensor &in, const SType x, Tensor *out);
 
-// ===== Level 1
-// TODO(wangwei) make amax/amin/asum a member function of tensor
-// void Amax(Tensor, Context* ctx); Get the index of the max value in a vector
-// void Asum(Tensor Context* ctx);
+template <typename SType>
+Tensor operator*(const Tensor &in, const SType x);
+template <typename SType>
+void EltwiseMult(const Tensor &in, const SType x, Tensor *out);
 
-// template <typename DType>
-// void Axpy(DType x, const Blob& t, Blob* ret, Context* ctx);
+/// For each element e of Tensor 'in', compute e / x
+template <typename SType>
+Tensor operator/(const Tensor &in, const SType x);
+/// For each element e of Tensor 'in', compute e / x into out
+template <typename SType>
+void Div(const Tensor &in, const SType x, Tensor *out);
 
-/// Do matrix vector multipication or matrix matrix multiplication depdending
-/// on the Tensor shape.  result = A * B
-Tensor Mult(const Tensor &A, const Tensor &B);
-/// Do matrix vector multipication or matrix matrix multiplication depdending
-/// on the Tensor shape.  C = A * B
-void Mult(const Tensor &A, const Tensor &B, Tensor *C);
+/// For each element e of Tensor 'in', compute x/e
+template <typename SType>
+Tensor Div(const SType x, const Tensor &in);
+/// For each element e of Tensor 'in', compute x/e into 'out'
+template <typename SType>
+void Div(const SType x, const Tensor &in, Tensor *out);
 
-/// Do matrix vector multipication or matrix matrix multiplication depdending
-/// on the Tensor shape. ret = alpha lhs * rhs + beta * ret
-void Mult(const float alpha, const Tensor &lhs, const Tensor &rhs,
-          const float beta, Tensor *C);
+template <typename SType>
+SType Sum(const Tensor &in);
 
-// ================Random operations==========================================
-/// For each element x set x = 1 if random() < p; otherwise x = 1.
-void Bernoulli(float p, Tensor *t);
-/// Fill in Tensor 't' following uniform distribution.
-void Uniform(float low, float high, Tensor *t);
-/// Fill in Tensor 't' following Gaussian distribution.
-void Gaussian(float mean, float std, Tensor *t);
+// ============Matrix (row/column) operations==================================
+/// Average elements in the Tensor, currently only support vector and matrix.
+/// if 'axis' is 0, average all rows into a single row
+/// if 'axis' is 1, average all columns into a single column
+/// TODO(wangwei) support arbitrary Tensor like numpy.average
+Tensor Average(const Tensor &in, const int axis);
+/// Sum elements in the Tensor, currently only support vector and matrix.
+/// if 'axis' is 0, sum all rows into a single row
+/// if 'axis' is 1, sum all columns into a single column
+/// TODO(wangwei) support arbitrary Tensor like numpy.sum
+Tensor Sum(const Tensor &in, const int axis);
+/// Regarding the internal data as 2d, with shape_[0]*...*shape_[axis-1] rows,
+/// and shape_[axis]*...*shape_[nDim()] columns.
+/// and do softmax along each row.
+Tensor SoftMax(const Tensor &in, const int axis = 0);
+void SoftMax(const Tensor &in, const int axis, Tensor *out);
 
-// follow the consistency guide
-// https://issues.apache.org/jira/browse/SINGA-182
-// ============Matrix vector operations=======================================
 /// Add column 'v' with each column of matrix M
 void AddColumn(const Tensor &v, Tensor *M);
-void AddColumn(const float alpha, const float beta, const Tensor &v,
+/// For each column 'c' of matrix out, do c=alpha*v + beta*c
+template <typename SType>
+void AddColumn(const SType alpha, const SType beta, const Tensor &v,
                Tensor *out);
-/// Sub column 'v' by each column of matrix M
-void SubColumn(const Tensor &v, Tensor *M);
-/// Multiply column 'v' and each column of matrix M; write results into 'out'
-void MultColumn(const Tensor &v, Tensor *M);
-/// Divide column 'v' by each column of matrix M; write results into 'out'
-void DivColumn(const Tensor &v, Tensor *M);
-
 /// Add row 'v' with each row of matrix M; write results into 'out'
 void AddRow(const Tensor &v, Tensor *out);
-void AddRow(const float alpha, const float beta, const Tensor &v, Tensor *M);
-/// Sub row 'v' by each row of matrix M; write results into 'out'
-void SubRow(const Tensor &v, Tensor *M);
-/// Multiply row 'v' with each row of matrix M; write results into 'out'
-void MultRow(const Tensor &v, Tensor *M);
+/// For each row 'r' of matrix out, do r=alpha*v + beta*r
+template <typename SType>
+void AddRow(const SType alpha, const SType beta, const Tensor &v, Tensor *M);
+/// Divide column 'v' by each column of matrix M; write results into 'out'
+void DivColumn(const Tensor &v, Tensor *M);
 /// Divide row 'v' by each row of matrix M; write results into 'out'
 void DivRow(const Tensor &v, Tensor *M);
-
-/// Sum all rows of matrix M into a single row as 'out'
-void SumRows(const Tensor &M, Tensor *out);
+/// Multiply column 'v' and each column of matrix M; write results into 'out'
+void MultColumn(const Tensor &v, Tensor *M);
+/// Multiply row 'v' with each row of matrix M; write results into 'out'
+void MultRow(const Tensor &v, Tensor *M);
+/// Sub column 'v' by each column of matrix M
+void SubColumn(const Tensor &v, Tensor *M);
+/// Sub row 'v' by each row of matrix M; write results into 'out'
+void SubRow(const Tensor &v, Tensor *M);
 /// Sum all columns of matrix M into a single column as 'out'
 void SumColumns(const Tensor &M, Tensor *out);
+/// Sum all rows of matrix M into a single row as 'out'
+void SumRows(const Tensor &M, Tensor *out);
 
-/// For each element x of Tensor 'in', compute alpha/x
+// ================Random operations==========================================
+/// For each element x set x = 1 if random() < p; otherwise x = 1.
 template <typename SType>
-Tensor Div(const SType alpha, const Tensor &in);
+void Bernoulli(const SType p, Tensor *out);
+/// Fill in Tensor 't' following Gaussian distribution.
+template <typename SType>
+void Gaussian(const SType mean, const SType std, Tensor *out);
+/// Fill in Tensor 't' following uniform distribution.
+template <typename SType>
+void Uniform(const SType low, const SType high, Tensor *out);
 
-/// For each element x of Tensor 'in', compute alpha/x into 'out'
+// ================Blas operations============================================
+// TODO(wangwei) make amax/amin/asum a member function of tensor
+
+/// out = alpha*in + out
 template <typename SType>
-void Div(const SType alpha, const Tensor &in, Tensor *out);
-
-/*
-/// Multiply each column of the lhs matrix with the rhs column
-Tensor MultColumn(const Tensor &lhs, const Tensor &rhs);
-void MultColumn(const Tensor &lhs, const Tensor &rhs, Tensor *ret);
-/// Multiply each row of the lhs matrix with the rhs row
-Tensor MultRow(const Tensor &lhs, const Tensor &rhs);
-void MultRow(const Tensor &lhs, const Tensor &rhs, Tensor *ret);
-/// Div each row of the lhs matrix with the rhs column
-Tensor DivColumn(const Tensor &lhs, const Tensor &rhs);
-void DivColumn(const Tensor &lhs, const Tensor &rhs, Tensor *ret);
-/// Divide each row of the lhs matrix by the rhs row
-Tensor DivRow(const Tensor &lhs, const Tensor &rhs);
-void DivRow(const Tensor &lhs, const Tensor &rhs, Tensor *ret);
-*/
+void Axpy(SType alpha, const Tensor &in, Tensor *out);
+
+/// Do matrix vector multipication or matrix matrix multiplication depdending
+/// on the Tensor shape.  result = A * B
+Tensor Mult(const Tensor &A, const Tensor &B);
+/// Do matrix vector multipication or matrix matrix multiplication depdending
+/// on the Tensor shape.  C = A * B
+void Mult(const Tensor &A, const Tensor &B, Tensor *C);
 
+/// Do matrix vector multipication or matrix matrix multiplication depdending
+/// on the Tensor shape. out = alpha lhs * rhs + beta * out
+template <typename SType>
+void Mult(const SType alpha, const Tensor &A, const Tensor &B, const SType beta,
+          Tensor *C);
 }  // namespace singa
 
 #endif  // SINGA_CORE_TENSOR_H_

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/564c88ad/src/core/tensor/tensor.cc
----------------------------------------------------------------------
diff --git a/src/core/tensor/tensor.cc b/src/core/tensor/tensor.cc
index 5ae375c..f4e9da2 100644
--- a/src/core/tensor/tensor.cc
+++ b/src/core/tensor/tensor.cc
@@ -26,61 +26,61 @@ namespace singa {
 
 Tensor::~Tensor() {
   // LOG(ERROR) << "~";
-  if (blob_ != nullptr && blob_->DecRefCount() == 0)
-    device_->FreeBlob(blob_);
+  if (blob_ != nullptr && blob_->DecRefCount() == 0) device_->FreeBlob(blob_);
   blob_ = nullptr;
 }
 
 Tensor::Tensor() { device_ = &defaultDevice; }
 
-Tensor::Tensor(const Shape &shape, DataType dtype)
+Tensor::Tensor(const Shape &shape, const DataType dtype)
     : data_type_(dtype), device_(&defaultDevice), shape_(shape) {
   device_ = &defaultDevice;
   blob_ = device_->NewBlob(Product(shape_) * SizeOf(data_type_));
 }
-Tensor::Tensor(Shape &&shape, DataType dtype)
+Tensor::Tensor(Shape &&shape, const DataType dtype)
     : data_type_(dtype), device_(&defaultDevice), shape_(shape) {
   device_ = &defaultDevice;
   blob_ = device_->NewBlob(Product(shape_) * SizeOf(data_type_));
 }
-Tensor::Tensor(const Shape &shape, Device *device, DataType dtype)
+Tensor::Tensor(const Shape &shape, Device *device, const DataType dtype)
     : data_type_(dtype), device_(device), shape_(shape) {
   blob_ = device_->NewBlob(Product(shape_) * SizeOf(data_type_));
 }
-Tensor::Tensor(Shape &&shape, Device *device, DataType dtype)
+Tensor::Tensor(Shape &&shape, Device *device, const DataType dtype)
     : data_type_(dtype), device_(device), shape_(shape) {
   blob_ = device_->NewBlob(Product(shape_) * SizeOf(data_type_));
 }
-Tensor::Tensor(const Tensor &t)
-    : transpose_(t.transpose_), data_type_(t.data_type_), device_(t.device_),
-      blob_(t.blob()), shape_(t.shape_) {
+Tensor::Tensor(const Tensor &in)
+    : transpose_(in.transpose_),
+      data_type_(in.data_type_),
+      device_(in.device_),
+      blob_(in.blob()),
+      shape_(in.shape_) {
   blob_->IncRefCount();
-  // LOG(ERROR) << "const&";
 }
 
-Tensor::Tensor(Tensor &&t)
-    : transpose_(t.transpose_), data_type_(t.data_type_), device_(t.device_),
-      shape_(std::move(t.shape_)) {
-  blob_ = t.blob_;
-  t.blob_ = nullptr;
-  // LOG(ERROR) << "&&";
+Tensor::Tensor(Tensor &&in)
+    : transpose_(in.transpose_),
+      data_type_(in.data_type_),
+      device_(in.device_),
+      shape_(std::move(in.shape_)) {
+  blob_ = in.blob_;
+  in.blob_ = nullptr;
 }
 
-void Tensor::ResetLike(const Tensor &t) {
-  if (blob_ == nullptr || device_ != t.device_ || MemSize() != t.MemSize()) {
-    if (blob_ != nullptr && blob_->DecRefCount() == 0)
-      device_->FreeBlob(blob_);
-    shape_ = t.shape_;
-    device_ = t.device_;
-    data_type_ = t.data_type_;
-    blob_ = device_->NewBlob(t.MemSize());
+void Tensor::ResetLike(const Tensor &in) {
+  if (blob_ == nullptr || device_ != in.device_ || MemSize() != in.MemSize()) {
+    if (blob_ != nullptr && blob_->DecRefCount() == 0) device_->FreeBlob(blob_);
+    shape_ = in.shape_;
+    device_ = in.device_;
+    data_type_ = in.data_type_;
+    blob_ = device_->NewBlob(in.MemSize());
   }
 }
 
 void Tensor::Reshape(const Shape &shape) {
   if (Product(shape_) != Product(shape)) {
-    if (blob_ != nullptr && blob_->DecRefCount() == 0)
-      device_->FreeBlob(blob_);
+    if (blob_ != nullptr && blob_->DecRefCount() == 0) device_->FreeBlob(blob_);
     blob_ = device_->NewBlob(Product(shape) * SizeOf(data_type_));
   }
   shape_ = shape;
@@ -88,17 +88,15 @@ void Tensor::Reshape(const Shape &shape) {
 
 void Tensor::Reshape(Shape &&shape) {
   if (Product(shape_) != Product(shape)) {
-    if (blob_ != nullptr && blob_->DecRefCount() == 0)
-      device_->FreeBlob(blob_);
+    if (blob_ != nullptr && blob_->DecRefCount() == 0) device_->FreeBlob(blob_);
     blob_ = device_->NewBlob(Product(shape) * SizeOf(data_type_));
   }
   shape_ = std::move(shape);
 }
 
-void Tensor::AsType(DataType type) {
+void Tensor::AsType(const DataType type) {
   if (data_type_ != type) {
-    if (blob_ != nullptr && blob_->DecRefCount() == 0)
-      device_->FreeBlob(blob_);
+    if (blob_ != nullptr && blob_->DecRefCount() == 0) device_->FreeBlob(blob_);
     blob_ = device_->NewBlob(Product(shape_) * SizeOf(type));
     data_type_ = type;
   }
@@ -109,8 +107,7 @@ void Tensor::ToDevice(Device *dst) {
   if (device_ != dst) {
     Tensor tmp(shape_, dst, data_type_);
     tmp.CopyData(*this);
-    if (blob_ != nullptr && blob_->DecRefCount() == 0)
-      device_->FreeBlob(blob_);
+    if (blob_ != nullptr && blob_->DecRefCount() == 0) device_->FreeBlob(blob_);
     blob_ = tmp.blob_;
     tmp.blob_ = nullptr;
     device_ = dst;
@@ -120,7 +117,7 @@ void Tensor::ToDevice(Device *dst) {
 void Tensor::ToHost() { ToDevice(device_->host()); }
 
 template <typename DType>
-void Tensor::CopyDataFromHostPtr(const DType *src, size_t num) {
+void Tensor::CopyDataFromHostPtr(const DType *src, const size_t num) {
   CHECK_EQ(sizeof(DType), SizeOf(data_type_))
       << "data_type is " << DataType_Name(data_type_)
       << " user given type is of size " << sizeof(DType);
@@ -130,8 +127,8 @@ void Tensor::CopyDataFromHostPtr(const DType *src, size_t num) {
     LOG(WARNING) << "Copy data from null host ptr";
   }
 }
-template void Tensor::CopyDataFromHostPtr(const float *src, size_t num);
-template void Tensor::CopyDataFromHostPtr(const int *src, size_t num);
+template void Tensor::CopyDataFromHostPtr(const float *src, const size_t num);
+template void Tensor::CopyDataFromHostPtr(const int *src, const size_t num);
 
 void Tensor::CopyData(const Tensor &src) {
   CHECK_EQ(Size(), src.Size());
@@ -162,29 +159,27 @@ Tensor Tensor::T() const {
   return t;
 }
 
-Tensor &Tensor::operator=(const Tensor &t) {
+Tensor &Tensor::operator=(const Tensor &in) {
   // LOG(ERROR) << "= const &";
-  if (blob_ != nullptr && blob_->DecRefCount() == 0)
-    device_->FreeBlob(blob_);
-  transpose_ = t.transpose_;
-  data_type_ = t.data_type_;
-  shape_ = t.shape_;
-  device_ = t.device_;
-  blob_ = t.blob();
+  if (blob_ != nullptr && blob_->DecRefCount() == 0) device_->FreeBlob(blob_);
+  transpose_ = in.transpose_;
+  data_type_ = in.data_type_;
+  shape_ = in.shape_;
+  device_ = in.device_;
+  blob_ = in.blob();
   blob_->IncRefCount();
   return *this;
 }
 
-Tensor &Tensor::operator=(Tensor &&t) {
+Tensor &Tensor::operator=(Tensor &&in) {
   // LOG(ERROR) << "= &&";
-  if (blob_ != nullptr && blob_->DecRefCount() == 0)
-    device_->FreeBlob(blob_);
-  transpose_ = t.transpose_;
-  data_type_ = t.data_type_;
-  shape_ = std::move(t.shape_);
-  device_ = t.device_;
-  blob_ = t.blob_;
-  t.blob_ = nullptr;
+  if (blob_ != nullptr && blob_->DecRefCount() == 0) device_->FreeBlob(blob_);
+  transpose_ = in.transpose_;
+  data_type_ = in.data_type_;
+  shape_ = std::move(in.shape_);
+  device_ = in.device_;
+  blob_ = in.blob_;
+  in.blob_ = nullptr;
   return *this;
 }
 
@@ -200,10 +195,10 @@ Tensor Reshape(const Tensor &in, Shape &&s) {
   return out;
 }
 
-#define GenUnaryTensorArgMemberFn(op, fn)                                \
-  Tensor &Tensor::op(const Tensor &t) {                                        \
-    fn(*this, t, this);                                                        \
-    return *this;                                                              \
+#define GenUnaryTensorArgMemberFn(op, fn) \
+  Tensor &Tensor::op(const Tensor &in) {  \
+    fn(*this, in, this);                  \
+    return *this;                         \
   }
 
 GenUnaryTensorArgMemberFn(operator+=, Add);
@@ -211,12 +206,13 @@ GenUnaryTensorArgMemberFn(operator-=, Sub);
 GenUnaryTensorArgMemberFn(operator*=, EltwiseMult);
 GenUnaryTensorArgMemberFn(operator/=, Div);
 
-#define GenUnaryScalarArgMemberFn(op, fn)                                \
-  template <typename DType> Tensor &Tensor::op(DType x) {                      \
-    fn(*this, x, this);                                                        \
-    return *this;                                                              \
-  }                                                                            \
-  template Tensor &Tensor::op<float>(float x)
+#define GenUnaryScalarArgMemberFn(op, fn) \
+  template <typename DType>               \
+  Tensor &Tensor::op(const DType x) {     \
+    fn(*this, x, this);                   \
+    return *this;                         \
+  }                                       \
+  template Tensor &Tensor::op<float>(const float x)
 
 GenUnaryScalarArgMemberFn(operator-=, Sub);
 GenUnaryScalarArgMemberFn(operator+=, Add);
@@ -224,103 +220,105 @@ GenUnaryScalarArgMemberFn(operator*=, EltwiseMult);
 GenUnaryScalarArgMemberFn(operator/=, Div);
 
 // ====================Tensor Operations=======================================
-void CopyDataToFrom(Tensor *dst, const Tensor &src, size_t num,
-                    size_t dst_offset, size_t src_offset) {
+void CopyDataToFrom(Tensor *dst, const Tensor &src, const size_t num,
+                    const size_t dst_offset, const size_t src_offset) {
   auto width = SizeOf(src.data_type());
   CHECK_EQ(width, SizeOf(dst->data_type()));
   size_t nBytes = num * width;
-  dst_offset *= width;
-  src_offset *= width;
-  CHECK_GE(src.MemSize(), src_offset + nBytes);
-  CHECK_GE(dst->MemSize(), dst_offset + nBytes);
+  auto d_offset = dst_offset * width;
+  auto s_offset = src_offset * width;
+  CHECK_GE(src.MemSize(), s_offset + nBytes);
+  CHECK_GE(dst->MemSize(), d_offset + nBytes);
 
   Device *src_dev = src.device(), *dst_dev = dst->device();
   Blob *from = src.blob(), *to = dst->blob();
   if (dst_dev->lang() != src_dev->lang()) {
     // let the none cpp device conduct copy op
     if (dst_dev->lang() == kCpp) {
-      src_dev->CopyDataToFrom(to, from, nBytes, kDeviceToHost, dst_offset,
-                              src_offset);
+      src_dev->CopyDataToFrom(to, from, nBytes, kDeviceToHost, d_offset,
+                              s_offset);
     } else if (src_dev->lang() == kCpp) {
-      dst_dev->CopyDataToFrom(to, from, nBytes, kHostToDevice, dst_offset,
-                              src_offset);
+      dst_dev->CopyDataToFrom(to, from, nBytes, kHostToDevice, d_offset,
+                              s_offset);
     } else {
       LOG(FATAL) << "Not support mem copy betwee Cuda and OpenCL device";
     }
   } else {
     auto direct = src_dev->lang() == kCpp ? kHostToHost : kDeviceToDevice;
-    src_dev->CopyDataToFrom(to, from, nBytes, direct, dst_offset, src_offset);
+    src_dev->CopyDataToFrom(to, from, nBytes, direct, d_offset, s_offset);
   }
 }
 //============================================================================
 /// typedef DType accroding to type value.
 /// DType would be used in the code block __VA_ARGS__.
-#define TYPE_SWITCH(type, DType, ...)                                          \
-  do {                                                                         \
-    switch (type) {                                                            \
-    case kFloat32: {                                                           \
-      typedef float DType;                                                     \
-      { __VA_ARGS__ }                                                          \
-      break;                                                                   \
-    }                                                                          \
-    case kInt: {                                                               \
-      typedef int DType;                                                       \
-      { __VA_ARGS__ }                                                          \
-      break;                                                                   \
-    }                                                                          \
-    case kChar: {                                                              \
-      typedef char DType;                                                      \
-      { __VA_ARGS__ }                                                          \
-      break;                                                                   \
-    }                                                                          \
-    default:                                                                   \
-      LOG(FATAL) << "Unknow data type = " << DataType_Name(type);              \
-    }                                                                          \
+#define TYPE_SWITCH(type, DType, ...)                               \
+  do {                                                              \
+    switch (type) {                                                 \
+      case kFloat32: {                                              \
+        typedef float DType;                                        \
+        { __VA_ARGS__ }                                             \
+        break;                                                      \
+      }                                                             \
+      case kInt: {                                                  \
+        typedef int DType;                                          \
+        { __VA_ARGS__ }                                             \
+        break;                                                      \
+      }                                                             \
+      case kChar: {                                                 \
+        typedef char DType;                                         \
+        { __VA_ARGS__ }                                             \
+        break;                                                      \
+      }                                                             \
+      default:                                                      \
+        LOG(FATAL) << "Unknow data type = " << DataType_Name(type); \
+    }                                                               \
   } while (0)
 
 /// typedef DType and Lang according to data type and device programming
 /// language respectively.
 /// type is from DataType, and lang is from LangType.
 /// DType and Lang would be used in __VA_ARGS__.
-#define TYPE_LANG_SWITCH(dtype, DType, ltype, Lang, ...)                       \
-  do {                                                                         \
-    const int _SwitchShift = 3;                                                \
-    int _SwitchHash = ((dtype) << _SwitchShift) + (ltype);                     \
-    switch (_SwitchHash) {                                                     \
-    case ((kFloat32 << _SwitchShift) + kCuda): {                               \
-      typedef float DType;                                                     \
-      typedef lang::Cuda Lang;                                                 \
-      { __VA_ARGS__ }                                                          \
-      break;                                                                   \
-    }                                                                          \
-    case ((kFloat32 << _SwitchShift) + kCpp): {                                \
-      typedef float DType;                                                     \
-      typedef lang::Cpp Lang;                                                  \
-      { __VA_ARGS__ }                                                          \
-      break;                                                                   \
-    }                                                                          \
-    case ((kFloat32 << _SwitchShift) + kOpencl): {                             \
-      typedef float DType;                                                     \
-      typedef lang::Opencl Lang;                                               \
-      { __VA_ARGS__ }                                                          \
-      break;                                                                   \
-    }                                                                          \
-    default:                                                                   \
-      LOG(FATAL) << "Unknown combination of data type "                        \
-                 << DataType_Name(dtype) << " and language "                   \
-                 << LangType_Name(ltype);                                      \
-    }                                                                          \
+#define TYPE_LANG_SWITCH(dtype, DType, ltype, Lang, ...)       \
+  do {                                                         \
+    const int _SwitchShift = 3;                                \
+    int _SwitchHash = ((dtype) << _SwitchShift) + (ltype);     \
+    switch (_SwitchHash) {                                     \
+      case ((kFloat32 << _SwitchShift) + kCuda): {             \
+        typedef float DType;                                   \
+        typedef lang::Cuda Lang;                               \
+        { __VA_ARGS__ }                                        \
+        break;                                                 \
+      }                                                        \
+      case ((kFloat32 << _SwitchShift) + kCpp): {              \
+        typedef float DType;                                   \
+        typedef lang::Cpp Lang;                                \
+        { __VA_ARGS__ }                                        \
+        break;                                                 \
+      }                                                        \
+      case ((kFloat32 << _SwitchShift) + kOpencl): {           \
+        typedef float DType;                                   \
+        typedef lang::Opencl Lang;                             \
+        { __VA_ARGS__ }                                        \
+        break;                                                 \
+      }                                                        \
+      default:                                                 \
+        LOG(FATAL) << "Unknown combination of data type "      \
+                   << DataType_Name(dtype) << " and language " \
+                   << LangType_Name(ltype);                    \
+    }                                                          \
   } while (0)
 
-template <typename SType> void Tensor::SetValue(const SType x) {
+// =============Element-wise operations====================================
+template <typename SType>
+void Tensor::SetValue(const SType x) {
   CHECK_EQ(sizeof(SType), SizeOf(data_type_));
   auto size = Size();
   auto ptr = blob_;
   TYPE_LANG_SWITCH(data_type_, DType, device_->lang(), Lang, {
     // cast x to DType
-    device_->Exec(
-        [size, x, ptr](Context *ctx) { Set<DType, Lang>(size, x, ptr, ctx); },
-        {}, {ptr});
+    device_->Exec([size, x, ptr](Context *ctx) {
+      Set<DType, Lang>(size, x, ptr, ctx);
+    }, {}, {ptr});
   });
 }
 template void Tensor::SetValue<float>(const float x);
@@ -328,21 +326,19 @@ template void Tensor::SetValue<float>(const float x);
 #define EltwiseUnaryTensorFn(fn, t, ret)                               \
   do {                                                                 \
     TYPE_LANG_SWITCH(t.data_type(), DType, t.device()->lang(), Lang, { \
-      ret->device()->Exec(                                             \
-          [t, ret](Context* ctx) {                                     \
-            fn<DType, Lang>(t.Size(), t.blob(), ret->blob(), ctx);     \
-          },                                                           \
-          {t.blob()}, {ret->blob()});                                  \
+      ret->device()->Exec([t, ret](Context * ctx) {                    \
+        fn<DType, Lang>(t.Size(), t.blob(), ret->blob(), ctx);         \
+      }, {t.blob()}, {ret->blob()});                                   \
     });                                                                \
   } while (0)
 
-#define GenUnaryTensorFn(fn)                          \
-  Tensor fn(const Tensor &t) {                        \
-    Tensor ret(t.shape(), t.device(), t.data_type()); \
-    auto *retptr = &ret;                              \
-    EltwiseUnaryTensorFn(fn, t, retptr);              \
-    return ret;                                       \
-  }                                                   \
+#define GenUnaryTensorFn(fn)                             \
+  Tensor fn(const Tensor &in) {                          \
+    Tensor ret(in.shape(), in.device(), in.data_type()); \
+    auto *retptr = &ret;                                 \
+    EltwiseUnaryTensorFn(fn, in, retptr);                \
+    return ret;                                          \
+  }                                                      \
   void fn(const Tensor &in, Tensor *out) { EltwiseUnaryTensorFn(fn, in, out); }
 
 GenUnaryTensorFn(Abs);
@@ -355,33 +351,89 @@ GenUnaryTensorFn(Sqrt);
 GenUnaryTensorFn(Square);
 GenUnaryTensorFn(Tanh);
 
-// TODO(wangwei) conside async exec
-template <> float Sum<float>(const Tensor &t) {
-  float s = 0.0f;
-  TYPE_LANG_SWITCH(t.data_type(), DType, t.device()->lang(), Lang, {
-    t.device()->Exec(
-        [t, &s](Context *ctx) {
-          Sum<DType, Lang>(t.Size(), t.blob(), &s, ctx);
-        },
-        {t.blob()}, {});
-  });
-  return s;
-}
+#define EltwiseBinaryTensorFn(fn, lhs, rhs, ret)                               \
+  do {                                                                         \
+    TYPE_LANG_SWITCH(lhs.data_type(), DType, lhs.device()->lang(), Lang, {     \
+      CHECK_EQ(sizeof(DType), SizeOf(rhs.data_type()));                        \
+      ret->device()->Exec([lhs, rhs, ret](Context * ctx) {                     \
+        fn<DType, Lang>(lhs.Size(), lhs.blob(), rhs.blob(), ret->blob(), ctx); \
+      }, {lhs.blob(), rhs.blob()}, {ret->blob()});                             \
+    });                                                                        \
+  } while (0)
 
-Tensor Sum(const Tensor &M, int axis) {
-  if (axis == 0) {
-    Tensor out(Shape{M.shape(1)}, M.device(), M.data_type());
-    SumRows(M, &out);
-    return out;
-  } else {
-    CHECK_EQ(axis, 1) << "Not support Sum over axis = " << axis;
-    Tensor out(Shape{M.shape(0)}, M.device(), M.data_type());
-    SumColumns(M, &out);
-    return out;
+#define GenBinaryTensorFn(op, fn)                              \
+  Tensor op(const Tensor &lhs, const Tensor &rhs) {            \
+    Tensor ret(lhs.shape(), lhs.device(), lhs.data_type());    \
+    fn(lhs, rhs, &ret);                                        \
+    return ret;                                                \
+  }                                                            \
+  void fn(const Tensor &lhs, const Tensor &rhs, Tensor *ret) { \
+    EltwiseBinaryTensorFn(fn, lhs, rhs, ret);                  \
   }
+
+GenBinaryTensorFn(operator+, Add);
+GenBinaryTensorFn(operator-, Sub);
+GenBinaryTensorFn(operator*, EltwiseMult);
+GenBinaryTensorFn(operator/, Div);
+GenBinaryTensorFn(Pow, Pow);
+
+#define EltwiseTensorScalarFn(fn, t, x, ret)                            \
+  do {                                                                  \
+    TYPE_LANG_SWITCH(t.data_type(), DType, t.device()->lang(), Lang, {  \
+      static_assert(std::is_same<SType, DType>::value,                  \
+                    "The Scalar type must match the Tensor data type"); \
+      ret->device()->Exec([t, x, ret](Context * ctx) {                  \
+        fn<DType, Lang>(t.Size(), t.blob(), x, ret->blob(), ctx);       \
+      }, {t.blob()}, {ret->blob()});                                    \
+    });                                                                 \
+  } while (0)
+
+#define GenTensorScalarFn(op, fn)                             \
+  template <typename SType>                                   \
+  Tensor op(const Tensor &in, const SType x) {                \
+    Tensor ret(in.shape(), in.device(), in.data_type());      \
+    fn(in, x, &ret);                                          \
+    return ret;                                               \
+  }                                                           \
+  template <typename SType>                                   \
+  void fn(const Tensor &in, const SType x, Tensor *ret) {     \
+    EltwiseTensorScalarFn(fn, in, x, ret);                    \
+  }                                                           \
+  template Tensor op<float>(const Tensor &in, const float x); \
+  template void fn<float>(const Tensor &in, const float x, Tensor *ret)
+
+GenTensorScalarFn(operator+, Add);
+GenTensorScalarFn(operator-, Sub);
+GenTensorScalarFn(operator*, EltwiseMult);
+GenTensorScalarFn(operator/, Div);
+GenTensorScalarFn(Pow, Pow);
+GenTensorScalarFn(operator<, LT);
+GenTensorScalarFn(operator<=, LE);
+GenTensorScalarFn(operator>, GT);
+GenTensorScalarFn(operator>=, GE);
+template <typename SType>
+Tensor Div(const SType alpha, const Tensor &in) {
+  Tensor out(in.shape(), in.device(), in.data_type());
+  Div(alpha, in, &out);
+  return out;
 }
+template Tensor Div<float>(const float, const Tensor &);
 
-Tensor Average(const Tensor &t, int axis) {
+template <typename SType>
+void Div(const SType alpha, const Tensor &in, Tensor *out) {
+  CheckDataTypeAndLang(in, *out);
+  CHECK(in.shape() == out->shape());
+  TYPE_LANG_SWITCH(in.data_type(), DType, in.device()->lang(), Lang, {
+    // TODO(wangwei) type cast SType to DType;
+    in.device()->Exec([alpha, in, out](Context *ctx) {
+      Div<DType, Lang>(in.Size(), alpha, in.blob(), out->blob(), ctx);
+    }, {in.blob()}, {out->blob()});
+  });
+}
+template void Div<float>(const float, const Tensor &, Tensor *);
+
+// =============Matrix operations============================================
+Tensor Average(const Tensor &M, int axis) {
   // operator/ only has implementation for float scalar type, hence it is
   // necessary to cast the denominator to a float.
   // TODO(wangwei) implement function for cast scalar type involved in Tensor
@@ -396,10 +448,34 @@ Tensor Average(const Tensor &t, int axis) {
   //    ....
   // }
   if (axis == 0) {
-    return Sum(t, 0) / (1.0f * t.shape().at(0));
+    return Sum(M, 0) / (1.0f * M.shape(0));
   } else {
     CHECK_EQ(axis, 1);
-    return Sum(t, 1) / (1.0f * t.shape().at(1));
+    return Sum(M, 1) / (1.0f * M.shape(1));
+  }
+}
+// TODO(wangwei) conside async exec
+template <>
+float Sum<float>(const Tensor &in) {
+  float s = 0.0f;
+  TYPE_LANG_SWITCH(in.data_type(), DType, in.device()->lang(), Lang, {
+    in.device()->Exec([in, &s](Context *ctx) {
+      Sum<DType, Lang>(in.Size(), in.blob(), &s, ctx);
+    }, {in.blob()}, {});
+  });
+  return s;
+}
+
+Tensor Sum(const Tensor &M, int axis) {
+  if (axis == 0) {
+    Tensor out(Shape{M.shape(1)}, M.device(), M.data_type());
+    SumRows(M, &out);
+    return out;
+  } else {
+    CHECK_EQ(axis, 1) << "Not support Sum over axis = " << axis;
+    Tensor out(Shape{M.shape(0)}, M.device(), M.data_type());
+    SumColumns(M, &out);
+    return out;
   }
 }
 
@@ -424,141 +500,10 @@ void SoftMax(const Tensor &in, int axis, Tensor *out) {
   DivColumn(sum, out);
 }
 
-#define EltwiseBinaryTensorFn(fn, lhs, rhs, ret)                               \
-  do {                                                                         \
-    TYPE_LANG_SWITCH(lhs.data_type(), DType, lhs.device()->lang(), Lang, {     \
-      CHECK_EQ(sizeof(DType), SizeOf(rhs.data_type()));                        \
-      ret->device()->Exec(                                                     \
-          [lhs, rhs, ret](Context *ctx) {                                      \
-            fn<DType, Lang>(lhs.Size(), lhs.blob(), rhs.blob(), ret->blob(),   \
-                            ctx);                                              \
-          },                                                                   \
-          {lhs.blob(), rhs.blob()}, {ret->blob()});                            \
-    });                                                                        \
-  } while (0)
-
-#define GenBinaryTensorFn(op, fn)                                        \
-  Tensor op(const Tensor &lhs, const Tensor &rhs) {                            \
-    Tensor ret(lhs.shape(), lhs.device(), lhs.data_type());                    \
-    fn(lhs, rhs, &ret);                                                        \
-    return ret;                                                                \
-  }                                                                            \
-  void fn(const Tensor &lhs, const Tensor &rhs, Tensor *ret) {                 \
-    EltwiseBinaryTensorFn(fn, lhs, rhs, ret);                                  \
-  }
-
-GenBinaryTensorFn(operator+, Add);
-GenBinaryTensorFn(operator-, Sub);
-GenBinaryTensorFn(operator*, EltwiseMult);
-GenBinaryTensorFn(operator/, Div);
-GenBinaryTensorFn(Pow, Pow);
-
-#define EltwiseTensorScalarFn(fn, t, x, ret)                                   \
-  do {                                                                         \
-    TYPE_LANG_SWITCH(t.data_type(), DType, t.device()->lang(), Lang, {         \
-      static_assert(std::is_same<SType, DType>::value,                         \
-                    "The Scalar type must match the Tensor data type");        \
-      ret->device()->Exec(                                                     \
-          [t, x, ret](Context *ctx) {                                          \
-            fn<DType, Lang>(t.Size(), t.blob(), x, ret->blob(), ctx);          \
-          },                                                                   \
-          {t.blob()}, {ret->blob()});                                          \
-    });                                                                        \
-  } while (0)
-
-#define GenTensorScalarFn(op, fn)                                        \
-  template <typename SType> Tensor op(const Tensor &t, SType x) {              \
-    Tensor ret(t.shape(), t.device(), t.data_type());                          \
-    fn(t, x, &ret);                                                            \
-    return ret;                                                                \
-  }                                                                            \
-  template <typename SType> void fn(const Tensor &t, SType x, Tensor *ret) {   \
-    EltwiseTensorScalarFn(fn, t, x, ret);                                      \
-  }                                                                            \
-  template Tensor op<float>(const Tensor &t, float x);                         \
-  template void fn<float>(const Tensor &t, const float x, Tensor *ret)
-
-GenTensorScalarFn(operator+, Add);
-GenTensorScalarFn(operator-, Sub);
-GenTensorScalarFn(operator*, EltwiseMult);
-GenTensorScalarFn(operator/, Div);
-GenTensorScalarFn(Pow, Pow);
-GenTensorScalarFn(operator<, LT);
-GenTensorScalarFn(operator<=, LE);
-GenTensorScalarFn(operator>, GT);
-GenTensorScalarFn(operator>=, GE);
-
-// ================Blas operations============================================
-Tensor Mult(const Tensor &lhs, const Tensor &rhs) {
-  Tensor ret(Shape{lhs.shape(0), rhs.shape(1)}, lhs.device(), lhs.data_type());
-  Mult(lhs, rhs, &ret);
-  return ret;
-}
-
-void Mult(const Tensor &lhs, const Tensor &rhs, Tensor *ret) {
-  Mult(1.0f, lhs, rhs, 0.0f, ret);
-}
-
-void Mult(const float alpha, const Tensor &A, const Tensor &B, const float beta,
-          Tensor *C) {
-  CHECK_EQ(A.shape().size(), 2u);
-  if (B.nDim() == 1u) {
-    TYPE_LANG_SWITCH(A.data_type(), DType, A.device()->lang(), Lang, {
-      C->device()->Exec(
-          [alpha, A, beta, B, C](Context *ctx) {
-            GEMV<DType, Lang>(A.transpose(), A.shape(0), A.shape(1), alpha,
-                              A.blob(), B.blob(), beta, C->blob(), ctx);
-          },
-          {A.blob(), B.blob()}, {C->blob()});
-    });
-  } else {
-    CHECK(!C->transpose());
-    TYPE_LANG_SWITCH(A.data_type(), DType, A.device()->lang(), Lang, {
-      C->device()->Exec(
-          [alpha, A, beta, B, C](Context *ctx) {
-            GEMM<DType, Lang>(A.transpose(), B.transpose(), A.shape(0),
-                              B.shape(1), A.shape(1), alpha, A.blob(), B.blob(),
-                              beta, C->blob(), ctx);
-          },
-          {A.blob(), B.blob()}, {C->blob()});
-    });
-  }
-}
-
-void Bernoulli(float p, Tensor *t) {
-  TYPE_LANG_SWITCH(t->data_type(), DType, t->device()->lang(), Lang, {
-    t->device()->Exec(
-        [p, t](Context *ctx) {
-          Bernoulli<DType, Lang>(t->Size(), p, t->blob(), ctx);
-        },
-        {}, {t->blob()}, true);
-  });
-}
-
-void Uniform(float low, float high, Tensor *t) {
-  TYPE_LANG_SWITCH(t->data_type(), DType, t->device()->lang(), Lang, {
-    t->device()->Exec(
-        [low, high, t](Context *ctx) {
-          Uniform<DType, Lang>(t->Size(), low, high, t->blob(), ctx);
-        },
-        {}, {t->blob()}, true);
-  });
-}
-
-void Gaussian(float mean, float std, Tensor *t) {
-  TYPE_LANG_SWITCH(t->data_type(), DType, t->device()->lang(), Lang, {
-    t->device()->Exec(
-        [mean, std, t](Context *ctx) {
-          Gaussian<DType, Lang>(t->Size(), mean, std, t->blob(), ctx);
-        },
-        {}, {t->blob()}, true);
-  });
-}
-
-// ======follow the consistency guide
 void AddColumn(const Tensor &v, Tensor *M) { AddColumn(1, 1, v, M); }
 /// Add column 'v' onto each column of matrix M;
-void AddColumn(const float alpha, const float beta, const Tensor &v,
+template <typename SType>
+void AddColumn(const SType alpha, const SType beta, const Tensor &v,
                Tensor *M) {
   if (M->transpose()) {
     Tensor X = M->T();
@@ -570,15 +515,19 @@ void AddColumn(const float alpha, const float beta, const Tensor &v,
     CHECK_EQ(nb_row, v.Size());
 
     Tensor one(Shape{1, nb_col}, M->device(), M->data_type());
-    one.SetValue(1.0f); // TODO(wangwei) cast type
+    one.SetValue(1.0f);  // TODO(wangwei) cast type
     Tensor vmat = Reshape(v, Shape{nb_row, 1});
     Mult(alpha, vmat, one, beta, M);
   }
 }
+template <>
+void AddColumn(const float alpha, const float beta, const Tensor &v, Tensor *M);
+
 void AddRow(const Tensor &v, Tensor *M) { AddRow(1, 1, v, M); }
 
 /// Sub column 'v' by each column of matrix M; write results into 'out'
-void AddRow(const float alpha, const float beta, const Tensor &v, Tensor *M) {
+template <typename SType>
+void AddRow(const SType alpha, const SType beta, const Tensor &v, Tensor *M) {
   if (M->transpose()) {
     Tensor X = M->T();
     AddColumn(v, &X);
@@ -594,29 +543,8 @@ void AddRow(const float alpha, const float beta, const Tensor &v, Tensor *M) {
     Mult(alpha, one, vmat, beta, M);
   }
 }
-
-template <typename SType> Tensor Div(const SType alpha, const Tensor &in) {
-  Tensor out(in.shape(), in.device(), in.data_type());
-  Div(alpha, in, &out);
-  return out;
-}
-
-template Tensor Div<float>(const float, const Tensor &);
-
-template <typename SType>
-void Div(const SType alpha, const Tensor &in, Tensor *out) {
-  CheckDataTypeAndLang(in, *out);
-  CHECK(in.shape() == out->shape());
-  TYPE_LANG_SWITCH(in.data_type(), DType, in.device()->lang(), Lang, {
-    // TODO(wangwei) type cast SType to DType;
-    in.device()->Exec(
-        [alpha, in, out](Context *ctx) {
-          Div<DType, Lang>(in.Size(), alpha, in.blob(), out->blob(), ctx);
-        },
-        {in.blob()}, {out->blob()});
-  });
-}
-template void Div<float>(const float, const Tensor &, Tensor *);
+template <>
+void AddRow(const float alpha, const float beta, const Tensor &v, Tensor *M);
 
 /// Divide column 'v' by each column of matrix M; write results into 'out'
 void DivColumn(const Tensor &v, Tensor *M) {
@@ -640,12 +568,10 @@ void MultColumn(const Tensor &v, Tensor *M) {
   CHECK_EQ(v.Size(), M->shape(0));
   CheckDataTypeAndLang(*M, v);
   TYPE_LANG_SWITCH(v.data_type(), DType, v.device()->lang(), Lang, {
-    v.device()->Exec(
-        [M, v](Context *ctx) {
-          DGMM<DType, Lang>(false, M->shape(0), M->shape(1), M->blob(),
-                            v.blob(), M->blob(), ctx);
-        },
-        {M->blob(), v.blob()}, {M->blob()});
+    v.device()->Exec([M, v](Context *ctx) {
+      DGMM<DType, Lang>(false, M->shape(0), M->shape(1), M->blob(), v.blob(),
+                        M->blob(), ctx);
+    }, {M->blob(), v.blob()}, {M->blob()});
   });
 }
 
@@ -657,12 +583,10 @@ void MultRow(const Tensor &v, Tensor *M) {
   CHECK_EQ(v.Size(), M->shape(1));
   CheckDataTypeAndLang(*M, v);
   TYPE_LANG_SWITCH(v.data_type(), DType, v.device()->lang(), Lang, {
-    v.device()->Exec(
-        [M, v](Context *ctx) {
-          DGMM<DType, Lang>(true, M->shape(0), M->shape(1), M->blob(), v.blob(),
-                            M->blob(), ctx);
-        },
-        {M->blob(), v.blob()}, {M->blob()});
+    v.device()->Exec([M, v](Context *ctx) {
+      DGMM<DType, Lang>(true, M->shape(0), M->shape(1), M->blob(), v.blob(),
+                        M->blob(), ctx);
+    }, {M->blob(), v.blob()}, {M->blob()});
   });
 }
 
@@ -680,8 +604,8 @@ void SumColumns(const Tensor &M, Tensor *v) {
     size_t nb_row = M.shape().at(0), nb_col = M.shape().at(1);
     CHECK_EQ(nb_row, v->Size());
 
-    Tensor one(Shape{nb_col, 1}, M.device(), M.data_type());
-    one.SetValue(1.0f); // TODO(wangwei) cast type
+    Tensor one(Shape{nb_col}, M.device(), M.data_type());
+    one.SetValue(1.0f);  // TODO(wangwei) cast type
     Mult(M, one, v);
   }
 }
@@ -695,10 +619,98 @@ void SumRows(const Tensor &M, Tensor *v) {
     size_t nb_row = M.shape(0), nb_col = M.shape(1);
     CHECK_EQ(nb_col, v->Size());
 
-    Tensor one(Shape{nb_row, 1}, M.device(), M.data_type());
-    one.SetValue(1.0f); // TODO(wangwei) cast type
+    Tensor one(Shape{nb_row}, M.device(), M.data_type());
+    one.SetValue(1.0f);  // TODO(wangwei) cast type
     Tensor X = M.T();
     Mult(X, one, v);
   }
 }
+// ====================Random operations=====================================
+template <typename SType>
+void Bernoulli(const SType p, Tensor *out) {
+  TYPE_LANG_SWITCH(out->data_type(), DType, out->device()->lang(), Lang, {
+    auto prob = TypeCast<SType, DType>(p);
+    out->device()->Exec([prob, out](Context *ctx) {
+      Bernoulli<DType, Lang>(out->Size(), prob, out->blob(), ctx);
+    }, {}, {out->blob()}, true);
+  });
+}
+template void Bernoulli<float>(const float p, Tensor *out);
+
+template <typename SType>
+void Uniform(const SType low, const SType high, Tensor *out) {
+  TYPE_LANG_SWITCH(out->data_type(), DType, out->device()->lang(), Lang, {
+    auto l = TypeCast<SType, DType>(low);
+    auto h = TypeCast<SType, DType>(high);
+    out->device()->Exec([l, h, out](Context *ctx) {
+      Uniform<DType, Lang>(out->Size(), l, h, out->blob(), ctx);
+    }, {}, {out->blob()}, true);
+  });
+}
+template void Uniform<float>(const float low, const float high, Tensor *out);
+
+template <typename SType>
+void Gaussian(const SType mean, const SType std, Tensor *out) {
+  TYPE_LANG_SWITCH(out->data_type(), DType, out->device()->lang(), Lang, {
+    auto m = TypeCast<SType, DType>(mean);
+    auto s = TypeCast<SType, DType>(std);
+    out->device()->Exec([m, s, out](Context *ctx) {
+      Gaussian<DType, Lang>(out->Size(), m, s, out->blob(), ctx);
+    }, {}, {out->blob()}, true);
+  });
+}
+template void Gaussian<float>(const float mean, const float std, Tensor *out);
+
+// ================Blas operations============================================
+template <typename SType>
+void Axpy(const SType alpha, const Tensor &in, Tensor *out) {
+  TYPE_LANG_SWITCH(in.data_type(), DType, in.device()->lang(), Lang, {
+    auto a = TypeCast<SType, DType>(alpha);
+    out->device()->Exec([a, in, out](Context *ctx) {
+      Axpy<DType, Lang>(in.Size(), a, in.blob(), out->blob(), ctx);
+    }, {in.blob(), out->blob()}, {out->blob()});
+  });
+}
+template <>
+void Axpy(const float alpha, const Tensor &in, Tensor *out);
+
+Tensor Mult(const Tensor &A, const Tensor &B) {
+  Shape s;
+  s.push_back(A.shape(0));
+  if (B.nDim() == 2) s.push_back(B.shape(1));
+  Tensor out(s, A.device(), A.data_type());
+  Mult(A, B, &out);
+  return out;
+}
+
+void Mult(const Tensor &A, const Tensor &B, Tensor *out) {
+  Mult(1.0f, A, B, 0.0f, out);
+}
+
+template <typename SType>
+void Mult(const SType alpha, const Tensor &A, const Tensor &B, const SType beta,
+          Tensor *C) {
+  CHECK_EQ(A.shape().size(), 2u);
+  if (B.nDim() == 1u) {
+    TYPE_LANG_SWITCH(A.data_type(), DType, A.device()->lang(), Lang, {
+      auto a = TypeCast<SType, DType>(alpha);
+      auto b = TypeCast<SType, DType>(beta);
+      C->device()->Exec([a, A, b, B, C](Context *ctx) {
+        GEMV<DType, Lang>(A.transpose(), A.shape(0), A.shape(1), a, A.blob(),
+                          B.blob(), b, C->blob(), ctx);
+      }, {A.blob(), B.blob()}, {C->blob()});
+    });
+  } else {
+    CHECK(!C->transpose());
+    TYPE_LANG_SWITCH(A.data_type(), DType, A.device()->lang(), Lang, {
+      auto a = TypeCast<SType, DType>(alpha);
+      auto b = TypeCast<SType, DType>(beta);
+      C->device()->Exec([a, A, b, B, C](Context *ctx) {
+        GEMM<DType, Lang>(A.transpose(), B.transpose(), A.shape(0), B.shape(1),
+                          A.shape(1), a, A.blob(), B.blob(), b, C->blob(), ctx);
+      }, {A.blob(), B.blob()}, {C->blob()});
+    });
+  }
+}
+
 }  // namespace singa

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/564c88ad/src/core/tensor/tensor_math.h
----------------------------------------------------------------------
diff --git a/src/core/tensor/tensor_math.h b/src/core/tensor/tensor_math.h
index 1bf6fc7..b5d0ba9 100644
--- a/src/core/tensor/tensor_math.h
+++ b/src/core/tensor/tensor_math.h
@@ -29,12 +29,14 @@ namespace singa {
 /// device programming language, e.g., Langice::kCpp, Langice::kCuda
 ///
 /// TODO(wangwei) Clean the functions to make the function APIs consistent:
-/// 1. All function names should be like XxxYyy or XY, i.e., capitablize the first
+/// 1. All function names should be like XxxYyy or XY, i.e., capitablize the
+/// first
 ///    letter.
 /// 2. Order functions based on function name in alphabetical order.
-/// 3. Function arguments order is [const basic type] [const Blob] [mutable Blob].
+/// 3. Function arguments order is [const basic type] [const Blob] [mutable
+/// Blob].
 /// 4. Function argument names, use 'num' for total number of elements in
-///    elementwise operations; use 'in1' 'in2' for input blobs; use 'out' for
+///    elementwise operations; use 'in1' 'in2' for in blobs; use 'out' for
 ///    output blob or value. With exceptions for some functions, e.g.,
 ///      Scale(const float alpha, const Blob* in, Blob* out);
 ///    For such cases, use x, v, alpha, etc for scalar types.
@@ -46,262 +48,283 @@ namespace singa {
 /// 7. Use size_t for the number of elements, rows or columns.
 /// 8. Use the same name for the Tensor and Blob level math functions.
 
-
-// ================Linear algebra functions====================================
-/// ret[i] = |input[i]|
+// =============Element-wise operations====================================
+/// out[i] = |in[i]|
 template <typename DType, typename Lang>
 void Abs(const size_t num, const Blob *in, Blob *out, Context *ctx) {
   LOG(FATAL) << "Abs Not Implemented";
 }
 
+/// out = in + x
 template <typename DType, typename Lang>
-void Set(const size_t num, const DType x, Blob *out, Context *ctx) {
-  LOG(FATAL) << "Set Not Implemented";
+void Add(const size_t num, const Blob *in, const DType x, Blob *out,
+         Context *ctx) {
+  LOG(FATAL) << "Add Not Implemented";
 }
 
-/// sum all elements of input into ret
+/// out = in1 + in2
 template <typename DType, typename Lang>
-void Sum(const size_t num, const Blob *in, DType *out, Context *ctx) {
-  LOG(FATAL) << "Sum Not Implemented";
+void Add(const size_t num, const Blob *in1, const Blob *in2, Blob *out,
+         Context *ctx) {
+  LOG(FATAL) << "Add-Pair Not Implemented";
 }
-
-/// ret[i] = sign(input[i])
+/// Element-wise operation, clamp every element into [low, high]
+/// if x>high, then x=high; if x<low, then x=low.
 template <typename DType, typename Lang>
-void Sign(const size_t num, const Blob *in, Blob *out, Context *ctx) {
-  LOG(FATAL) << "Sign Not Implemented";
+void Clamp(const size_t num, const DType low, const DType high, const Blob *in,
+           Blob *out, Context *ctx) {
+  LOG(FATAL) << "Clamp Not Implemented";
 }
 
-/// Base is e, Neper number. ret[i]=exp(input[i])
+/// out = x / in
 template <typename DType, typename Lang>
-void Exp(const size_t num, const Blob *in, Blob *out, Context *ctx) {
-  LOG(FATAL) << "Exp Not Implemented";
+void Div(const size_t num, const DType x, const Blob *in, Blob *out,
+         Context *ctx) {
+  LOG(FATAL) << "Div Not Implemented";
 }
 
-/// Natual logarithm, the base is e, Neper number ret[i]=log(input[i]).
-template <typename DType, typename Lang>
-void Log(const size_t num, const Blob *in, Blob *out, Context *ctx) {
-  LOG(FATAL) << "Log Not Implemented";
-}
-/// Element-wise operation, ret[i]=sqrt([input[i])
 template <typename DType, typename Lang>
-void Sqrt(const size_t num, const Blob *in, Blob *out, Context *ctx) {
-  LOG(FATAL) << "Sqrt Not Implemented";
+void Div(const size_t num, const Blob *in, const DType x, Blob *out,
+         Context *ctx) {
+  CHECK_NE(x, 0.f);
+  EltwiseMult<DType, Lang>(num, in, DType(1) / x, out, ctx);
 }
 
-/// Element-wise operation, ret[i]=square([input[i])
+/// out = in1 / in2
 template <typename DType, typename Lang>
-void Square(const size_t num, const Blob *in, Blob *out, Context *ctx) {
-  LOG(FATAL) << "Square Not Implemented";
+void Div(const size_t num, const Blob *in1, const Blob *in2, Blob *out,
+         Context *ctx) {
+  LOG(FATAL) << "Div-Pair Not Implemented";
 }
 
-/// Element-wise operation, ret[i]=tanh([input[i])
+/// out = in * x
 template <typename DType, typename Lang>
-void Tanh(const size_t num, const Blob *in, Blob *out, Context *ctx) {
-  LOG(FATAL) << "Tanh Not Implemented";
+void EltwiseMult(const size_t num, const Blob *in, const DType x, Blob *out,
+                 Context *ctx) {
+  LOG(FATAL) << "EltwiseMult Not Implemented";
 }
-/// Element-wise operation, ret[i]=max(0, input[i])
+
+/// out = in2 * in2
 template <typename DType, typename Lang>
-void ReLU(const size_t num, const Blob *in, Blob *out, Context *ctx) {
-  LOG(FATAL) << "ReLU Not Implemented";
+void EltwiseMult(const size_t num, const Blob *in1, const Blob *in2, Blob *out,
+                 Context *ctx) {
+  LOG(FATAL) << "EltwiseMult-Pair Not Implemented";
 }
-/// Element-wise operation, ret[i]=sigmoid([input[i])
+
+/// Base is e, Neper number. out[i]=exp(in[i])
 template <typename DType, typename Lang>
-void Sigmoid(const size_t num, const Blob *in, Blob *out, Context *ctx) {
-  LOG(FATAL) << "Sigmoid Not Implemented";
+void Exp(const size_t num, const Blob *in, Blob *out, Context *ctx) {
+  LOG(FATAL) << "Exp Not Implemented";
 }
 
-// Do softmax for each row invidually
+/// out[i]=(in[i]<=x)?1.f:0.f
 template <typename DType, typename Lang>
-void Softmax(const size_t nrow, const size_t ncol, const Blob *in, 
-	     Blob *out, Context *ctx) {
-  LOG(FATAL) << "Softmax Not Implemented";
+void LE(const size_t num, const Blob *in, const DType x, Blob *out,
+        Context *ctx) {
+  LOG(FATAL) << "LE Not Implemented";
 }
-
-// TODO(wangwei) unify SumRow and SumCol.
-/// Sum the rows of the input matrix into a vector
+/// Natual logarithm, the base is e, Neper number out[i]=log(in[i]).
 template <typename DType, typename Lang>
-void SumRows(const size_t nrow, const size_t ncol, const Blob *in, 
-	     Blob *out, Context *ctx) {
-  LOG(FATAL) << "SumRows Not Implemented";
+void Log(const size_t num, const Blob *in, Blob *out, Context *ctx) {
+  LOG(FATAL) << "Log Not Implemented";
 }
-
-/// Sum the columns of the input matrix into a vector
+/// out[i]=(in[i]<x)?1.f:0.f
 template <typename DType, typename Lang>
-void SumColumns(const size_t nrow, const size_t ncol, const Blob *in, 
-	        Blob *out, Context *ctx) {
-  LOG(FATAL) << "SumColumns Not Implemented";
+void LT(const size_t num, const Blob *in, const DType x, Blob *out,
+        Context *ctx) {
+  LOG(FATAL) << "LT Not Implemented";
 }
-
-// TODO(wangwei) unify AddRow and AddCol.
-/// Add the vector v to every row of A as the row of out 
+/// out[i]=(in[i]>=x)?1.f:0.f
 template <typename DType, typename Lang>
-void AddRow(const size_t nrow, const size_t ncol, const Blob *A, const Blob *v, 
-            Blob *out, Context *ctx) {
-  LOG(FATAL) << "AddRow Not Implemented";
+void GE(const size_t num, const Blob *in, const DType x, Blob *out,
+        Context *ctx) {
+  LOG(FATAL) << "GE Not Implemented";
 }
-
-/// Add the vector v to every column of A as the column of out
+/// out[i]=(in[i]>x)?1.f:0.f
 template <typename DType, typename Lang>
-void AddCol(const size_t nrow, const size_t ncol, const Blob *A, const Blob *v, 
-            Blob *out, Context *ctx) {
-  LOG(FATAL) << "AddCol Not Implemented";
+void GT(const size_t num, const Blob *in, const DType x, Blob *out,
+        Context *ctx) {
+  LOG(FATAL) << "GT Not Implemented";
 }
-
-/// Element-wise operation, do v^x for every v from the input tensor
+/// Element-wise operation, do v^x for every v from the in tensor
 template <typename DType, typename Lang>
-void Pow(const size_t num, const Blob *in, const DType x, Blob *out, Context *ctx) {
+void Pow(const size_t num, const Blob *in, const DType x, Blob *out,
+         Context *ctx) {
   LOG(FATAL) << "Pow Not Implemented";
 }
 
 /// Element-wise operation, do v^x for every v from the lhs and every x from rhs
 template <typename DType, typename Lang>
-void Pow(const size_t num, const Blob *in1, const Blob *in2, 
-	 Blob *out, Context *ctx) {
+void Pow(const size_t num, const Blob *in1, const Blob *in2, Blob *out,
+         Context *ctx) {
   LOG(FATAL) << "Pow-Pair Not Implemented";
 }
 
-/// Element-wise operation, clamp every element into [low, high]
-/// if x>high, then x=high; if x<low, then x=low.
+/// Element-wise operation, out[i]=max(0, in[i])
 template <typename DType, typename Lang>
-void Clamp(const size_t num, const DType low, const DType high, const Blob *in, 	   Blob *out, Context *ctx) {
-  LOG(FATAL) << "Clamp Not Implemented";
+void ReLU(const size_t num, const Blob *in, Blob *out, Context *ctx) {
+  LOG(FATAL) << "ReLU Not Implemented";
 }
 
-/// ret = input + x
 template <typename DType, typename Lang>
-void Add(const size_t num, const Blob *in, const DType x, 
-	 Blob *out, Context *ctx) {
-  LOG(FATAL) << "Add Not Implemented";
+void Set(const size_t num, const DType x, Blob *out, Context *ctx) {
+  LOG(FATAL) << "Set Not Implemented";
 }
-
-/// ret = lhs + rhs
+/// Element-wise operation, out[i]=sigmoid([in[i])
 template <typename DType, typename Lang>
-void Add(const size_t num, const Blob *in1, const Blob *in2, 
-	 Blob *out, Context *ctx) {
-  LOG(FATAL) << "Add-Pair Not Implemented";
+void Sigmoid(const size_t num, const Blob *in, Blob *out, Context *ctx) {
+  LOG(FATAL) << "Sigmoid Not Implemented";
 }
 
-/// ret =  input - x
+/// out[i] = sign(in[i])
 template <typename DType, typename Lang>
-void Sub(const size_t num, const Blob *in, const DType x, Blob *out, Context *ctx) {
-  Add<DType, Lang>(num, in, -x, out, ctx);
+void Sign(const size_t num, const Blob *in, Blob *out, Context *ctx) {
+  LOG(FATAL) << "Sign Not Implemented";
 }
-
-/// ret = lhs - rhs
+/// Element-wise operation, out[i]=sqrt([in[i])
 template <typename DType, typename Lang>
-void Sub(const size_t num, const Blob *in1, const Blob *in2, 
-	 Blob *out, Context *ctx) {
-  LOG(FATAL) << "Sub-Pair Not Implemented";
+void Sqrt(const size_t num, const Blob *in, Blob *out, Context *ctx) {
+  LOG(FATAL) << "Sqrt Not Implemented";
 }
 
-/// ret = input * x
+/// Element-wise operation, out[i]=square([in[i])
 template <typename DType, typename Lang>
-void EltwiseMult(const size_t num, const Blob *in, const DType x, Blob *out,
-                 Context *ctx) {
-  LOG(FATAL) << "EltwiseMult Not Implemented";
+void Square(const size_t num, const Blob *in, Blob *out, Context *ctx) {
+  LOG(FATAL) << "Square Not Implemented";
 }
 
-/// ret = lhs * rhs
+/// out =  in - x
 template <typename DType, typename Lang>
-void EltwiseMult(const size_t num, const Blob *in1, const Blob *in2, 
-		 Blob *out, Context *ctx) {
-  LOG(FATAL) << "EltwiseMult-Pair Not Implemented";
+void Sub(const size_t num, const Blob *in, const DType x, Blob *out,
+         Context *ctx) {
+  Add<DType, Lang>(num, in, -x, out, ctx);
 }
 
-/// ret = input / x
+/// out = in1 - in2
 template <typename DType, typename Lang>
-void Div(const size_t num, const DType x, const Blob *in, 
-	 Blob *out, Context *ctx) { 
-  LOG(FATAL) << "Div Not Implemented";
+void Sub(const size_t num, const Blob *in1, const Blob *in2, Blob *out,
+         Context *ctx) {
+  LOG(FATAL) << "Sub-Pair Not Implemented";
 }
-
+/// sum all elements of in into out
 template <typename DType, typename Lang>
-void Div(const size_t num, const Blob *in, const DType x, Blob *out, Context *ctx) {
-  CHECK_NE(x,0.f);
-  EltwiseMult<DType, Lang>(num, in, DType(1) / x, out, ctx);
+void Sum(const size_t num, const Blob *in, DType *out, Context *ctx) {
+  LOG(FATAL) << "Sum Not Implemented";
 }
 
-/// ret = lhs / rhs
+/// Element-wise operation, out[i]=tanh([in[i])
 template <typename DType, typename Lang>
-void Div(const size_t num, const Blob *in1, const Blob *in2, 
-	 Blob *out, Context *ctx) {
-  LOG(FATAL) << "Div-Pair Not Implemented";
+void Tanh(const size_t num, const Blob *in, Blob *out, Context *ctx) {
+  LOG(FATAL) << "Tanh Not Implemented";
 }
 
+// =========== Matrix operations ===========================================
+/// Add the vector v to every column of A as the column of out
+template <typename DType, typename Lang>
+void AddCol(const size_t nrow, const size_t ncol, const Blob *A, const Blob *v,
+            Blob *out, Context *ctx) {
+  LOG(FATAL) << "AddCol Not Implemented";
+}
+// TODO(wangwei) unify AddRow and AddCol.
+/// Add the vector v to every row of A as the row of out
+template <typename DType, typename Lang>
+void AddRow(const size_t nrow, const size_t ncol, const Blob *A, const Blob *v,
+            Blob *out, Context *ctx) {
+  LOG(FATAL) << "AddRow Not Implemented";
+}
 /// outer-product.
-/// lhs and rhs are vectors of len m and n. ret is matrix of shape m * n
+/// in1 and in2 are vectors of len m and n. out is matrix of shape m * n
 template <typename DType, typename Lang>
-void Outer(const size_t m, const size_t n, const Blob *in1, const Blob *in2, 
-	   Blob *out, Context *ctx) {
+void Outer(const size_t m, const size_t n, const Blob *in1, const Blob *in2,
+           Blob *out, Context *ctx) {
   LOG(FATAL) << "Outer Not Implemented";
 }
-
-/// ret[i]=(input[i]<x)?1.f:0.f
+// Do softmax for each row invidually
 template <typename DType, typename Lang>
-void LT(const size_t num, const Blob *in, const DType x, Blob *out, Context *ctx) {
-  LOG(FATAL) << "LT Not Implemented";
+void Softmax(const size_t nrow, const size_t ncol, const Blob *in, Blob *out,
+             Context *ctx) {
+  LOG(FATAL) << "Softmax Not Implemented";
 }
-/// ret[i]=(input[i]<=x)?1.f:0.f
+/// Sum the columns of the in matrix into a vector
 template <typename DType, typename Lang>
-void LE(const size_t num, const Blob *in, const DType x, Blob *out, Context *ctx) {
-  LOG(FATAL) << "LE Not Implemented";
+void SumColumns(const size_t nrow, const size_t ncol, const Blob *in, Blob *out,
+                Context *ctx) {
+  LOG(FATAL) << "SumColumns Not Implemented";
 }
-/// ret[i]=(input[i]>x)?1.f:0.f
+// TODO(wangwei) unify SumRow and SumCol.
+/// Sum the rows of the in matrix into a vector
 template <typename DType, typename Lang>
-void GT(const size_t num, const Blob *in, const DType x, Blob *out, Context *ctx) {
-  LOG(FATAL) << "GT Not Implemented";
+void SumRows(const size_t nrow, const size_t ncol, const Blob *in, Blob *out,
+             Context *ctx) {
+  LOG(FATAL) << "SumRows Not Implemented";
+}
+
+// ================Random functions===========================================
+/// Each element of out would be 1 with prob p and 0 with 1-p. 0<= p <= 1
+// Get the random generator from 'ctx'
+// If DType is not float, then convert the threshold to DType
+template <typename DType, typename Lang>
+void Bernoulli(const size_t num, const float p, Blob *out, Context *ctx) {
+  LOG(FATAL) << "Bernoulli Not Implemented";
 }
-/// ret[i]=(input[i]>=x)?1.f:0.f
+// The random generator should be extracted from ctx.
+// If DType is not float, then convert the mean and std to DType
 template <typename DType, typename Lang>
-void GE(const size_t num, const Blob *in, const DType x, Blob *out, Context *ctx) {
-  LOG(FATAL) << "GE Not Implemented";
+void Gaussian(const size_t num, const float mean, const float std, Blob *out,
+              Context *ctx) {
+  LOG(FATAL) << "Gaussian Not Implemented";
+}
+// The random generator should be extracted from ctx.
+// If DType is not float, then convert the low and high to DType
+template <typename DType, typename Lang>
+void Uniform(const size_t num, const float low, const float high, Blob *out,
+             Context *ctx) {
+  LOG(FATAL) << "Uniform Not Implemented";
 }
 
 // ===== BLAS functions, ref to http://docs.nvidia.com/cuda/cublas
-// ===== Level 1
-/// return the index of the element with the max value.
+/// outurn the index of the element with the max value.
 template <typename DType, typename Lang>
 void Amax(const size_t num, const Blob *in, size_t *out, Context *ctx) {
   LOG(FATAL) << "Amax Not Implemented";
 }
 
-/// return the index of the element with the min value.
+/// outurn the index of the element with the min value.
 template <typename DType, typename Lang>
 void Amin(const size_t num, const Blob *in, size_t *out, Context *ctx) {
   LOG(FATAL) << "Amin Not Implemented";
 }
-/// ret = sum |x| for all x in input
+/// out = sum |x| for all x in in
 template <typename DType, typename Lang>
 void Asum(const size_t num, const Blob *in, DType *out, Context *ctx) {
   LOG(FATAL) << "Asum Not Implemented";
 }
 
-/// ret = alpha * input + ret
+/// out = alpha * in + out
 template <typename DType, typename Lang>
-void Axpy(const size_t num, const DType alpha, const Blob *in, 
-	  Blob *out, Context *ctx) {
+void Axpy(const size_t num, const DType alpha, const Blob *in, Blob *out,
+          Context *ctx) {
   LOG(FATAL) << "Axpy Not Implemented";
 }
 
-/// ret *= x
+/// out *= x
 template <typename DType, typename Lang>
 void Scale(const size_t num, const DType x, Blob *out, Context *ctx) {
   LOG(FATAL) << "Scale Not Implemented";
 }
 
 template <typename DType, typename Lang>
-void Dot(const size_t num, const Blob *in1, const Blob *in2, 
-	 DType *out, Context *ctx) {
+void Dot(const size_t num, const Blob *in1, const Blob *in2, DType *out,
+         Context *ctx) {
   LOG(FATAL) << "Dot Not Implemented";
 }
 
-// ===== Level 2
-/// ret = alpha * op(A) * v + beta * ret.
-/// op(A) = A if trans = false; A^T otherwise; rows(op(A)) = m, cols(op(A)) = n.
+/// out = alpha * A * v + beta * out.
+/// transA indicates if the internal data layout is transposed of A
 template <typename DType, typename Lang>
-void GEMV(bool trans, const size_t m, const size_t n, const DType alpha, 
-	  const Blob *A, const Blob *v,
-          const DType beta, Blob *out, Context *ctx) {
+void GEMV(bool trans, const size_t m, const size_t n, const DType alpha,
+          const Blob *A, const Blob *v, const DType beta, Blob *out,
+          Context *ctx) {
   LOG(FATAL) << "GEMV Not Implemented";
 }
 
@@ -323,34 +346,5 @@ void GEMM(const bool transA, const bool transB, const size_t nrowA,
   LOG(FATAL) << "GEMM Not Implemented";
 }
 
-
-// ===== Level 3
-
-// ================Random functions===========================================
-/// Each element of ret would be 1 with prob p and 0 with 1-p. 0<= p <= 1
-// Get the random generator from 'ctx'
-// If DType is not float, then convert the threshold to DType
-template <typename DType, typename Lang>
-void Bernoulli(const size_t num, const float p, Blob *out, Context *ctx) {
-  LOG(FATAL) << "Bernoulli Not Implemented";
-}
-// The random generator should be extracted from ctx.
-// If DType is not float, then convert the low and high to DType
-template <typename DType, typename Lang>
-void Uniform(const size_t num, const float low, const float high, 
-	     Blob *out, Context *ctx) {
-  LOG(FATAL) << "Uniform Not Implemented";
-}
-// The random generator should be extracted from ctx.
-// If DType is not float, then convert the mean and std to DType
-template <typename DType, typename Lang>
-void Gaussian(const size_t num, const float mean, const float std, 
-	      Blob *out, Context *ctx) {
-  LOG(FATAL) << "Gaussian Not Implemented";
-}
-
-
-
-
 }  // namespace singa
 #endif  // SINGA_CORE_MATH_H_