Posted to commits@singa.apache.org by zh...@apache.org on 2016/06/12 07:27:57 UTC

[5/5] incubator-singa git commit: SINGA-168 Implement Cpp Math functions APIs

SINGA-168 Implement Cpp Math functions APIs

Update error log for tensor_math.h to include the function name, e.g.
"Foo is not implemented".

Add Tensor Math Cpp Implementation and Test Cases
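
For orientation before the diff, a minimal, self-contained sketch of the
dispatch pattern these files use (an illustration only; Foo and Cpu are
placeholder names, not identifiers from this commit): a generic template
that fails loudly with the function name in the message, plus a per-backend
full specialization that supplies the real loop.

#include <cstddef>
#include <cstdio>
#include <cstdlib>

struct Cpu {};  // placeholder for a backend tag such as singa::lang::Cpp

// Generic fallback: names the offending function, mirroring
// LOG(FATAL) << "Foo Not Implemented" in tensor_math.h.
template <typename DType, typename Lang>
void Foo(std::size_t num, const DType *in, DType *out) {
  std::fprintf(stderr, "Foo Not Implemented\n");
  std::abort();
}

// Backend-specific specialization; callers dispatch as Foo<float, Cpu>(...).
template <>
void Foo<float, Cpu>(std::size_t num, const float *in, float *out) {
  for (std::size_t i = 0; i < num; i++) out[i] = in[i] * in[i];
}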


Project: http://git-wip-us.apache.org/repos/asf/incubator-singa/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-singa/commit/07c49da5
Tree: http://git-wip-us.apache.org/repos/asf/incubator-singa/tree/07c49da5
Diff: http://git-wip-us.apache.org/repos/asf/incubator-singa/diff/07c49da5

Branch: refs/heads/dev
Commit: 07c49da5b1ee6582780f5faef6c6bf3418a7a0b6
Parents: 01aaf49
Author: liyuchenmike@gmail.com <li...@gmail.com>
Authored: Fri Jun 3 20:46:16 2016 +0800
Committer: Wei Wang <wa...@comp.nus.edu.sg>
Committed: Sun Jun 12 12:15:11 2016 +0800

----------------------------------------------------------------------
 src/core/tensor/tensor_math.h     | 293 +++++++++----------
 src/core/tensor/tensor_math_cpp.h | 508 ++++++++++++++++++++++++---------
 test/singa/test_tensor_math.cc    | 264 ++++++++++++++++-
 3 files changed, 774 insertions(+), 291 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/07c49da5/src/core/tensor/tensor_math.h
----------------------------------------------------------------------
diff --git a/src/core/tensor/tensor_math.h b/src/core/tensor/tensor_math.h
index ff865e0..1bf6fc7 100644
--- a/src/core/tensor/tensor_math.h
+++ b/src/core/tensor/tensor_math.h
@@ -50,277 +50,259 @@ namespace singa {
 // ================Linear algebra functions====================================
 /// ret[i] = |input[i]|
 template <typename DType, typename Lang>
-void Abs(int count, const Blob *input, Blob *ret, Context *ctx) {
-  LOG(FATAL) << "Not Implemented";
+void Abs(const size_t num, const Blob *in, Blob *out, Context *ctx) {
+  LOG(FATAL) << "Abs Not Implemented";
 }
 
 template <typename DType, typename Lang>
-void Set(int count, DType x, Blob *ret, Context *ctx) {
-  LOG(FATAL) << "Not Implemented";
+void Set(const size_t num, const DType x, Blob *out, Context *ctx) {
+  LOG(FATAL) << "Set Not Implemented";
 }
+
 /// sum all elements of input into ret
 template <typename DType, typename Lang>
-void Sum(int count, const Blob *input, DType *ret, Context *ctx) {
-  LOG(FATAL) << "Not Implemented";
+void Sum(const size_t num, const Blob *in, DType *out, Context *ctx) {
+  LOG(FATAL) << "Sum Not Implemented";
 }
 
 /// ret[i] = sign(input[i])
 template <typename DType, typename Lang>
-void Sign(int count, const Blob *input, Blob *ret, Context *ctx) {
-  LOG(FATAL) << "Not Implemented";
+void Sign(const size_t num, const Blob *in, Blob *out, Context *ctx) {
+  LOG(FATAL) << "Sign Not Implemented";
 }
 
 /// Base is e, Euler's number: ret[i] = exp(input[i])
 template <typename DType, typename Lang>
-void Exp(int count, const Blob *input, Blob *ret, Context *ctx) {
-  LOG(FATAL) << "Not Implemented";
+void Exp(const size_t num, const Blob *in, Blob *out, Context *ctx) {
+  LOG(FATAL) << "Exp Not Implemented";
 }
 
 /// Natural logarithm, base e (Euler's number): ret[i] = log(input[i]).
 template <typename DType, typename Lang>
-void Log(int count, const Blob *input, Blob *ret, Context *ctx) {
-  LOG(FATAL) << "Not Implemented";
+void Log(const size_t num, const Blob *in, Blob *out, Context *ctx) {
+  LOG(FATAL) << "Log Not Implemented";
 }
-
 /// Element-wise operation, ret[i] = sqrt(input[i])
 template <typename DType, typename Lang>
-void Sqrt(int count, const Blob *input, Blob *ret, Context *ctx) {
-  LOG(FATAL) << "Not Implemented";
+void Sqrt(const size_t num, const Blob *in, Blob *out, Context *ctx) {
+  LOG(FATAL) << "Sqrt Not Implemented";
 }
 
 /// Element-wise operation, ret[i] = square(input[i])
 template <typename DType, typename Lang>
-void Square(int count, const Blob *input, Blob *ret, Context *ctx) {
-  LOG(FATAL) << "Not Implemented";
+void Square(const size_t num, const Blob *in, Blob *out, Context *ctx) {
+  LOG(FATAL) << "Square Not Implemented";
 }
 
 /// Element-wise operation, ret[i] = tanh(input[i])
 template <typename DType, typename Lang>
-void Tanh(int count, const Blob *input, Blob *ret, Context *ctx) {
-  LOG(FATAL) << "Not Implemented";
+void Tanh(const size_t num, const Blob *in, Blob *out, Context *ctx) {
+  LOG(FATAL) << "Tanh Not Implemented";
 }
 /// Element-wise operation, ret[i]=max(0, input[i])
 template <typename DType, typename Lang>
-void ReLU(int count, const Blob *input, Blob *ret, Context *ctx) {
-  LOG(FATAL) << "Not Implemented";
+void ReLU(const size_t num, const Blob *in, Blob *out, Context *ctx) {
+  LOG(FATAL) << "ReLU Not Implemented";
 }
 /// Element-wise operation, ret[i] = sigmoid(input[i])
 template <typename DType, typename Lang>
-void Sigmoid(int count, const Blob *input, Blob *ret, Context *ctx) {
-  LOG(FATAL) << "Not Implemented";
+void Sigmoid(const size_t num, const Blob *in, Blob *out, Context *ctx) {
+  LOG(FATAL) << "Sigmoid Not Implemented";
 }
 
-/// Do softmax for each row invidually
+// Do softmax for each row individually
 template <typename DType, typename Lang>
-void Softmax(int nrow, int ncol, const Blob *input, Blob *ret, Context *ctx) {
-  LOG(FATAL) << "Not Implemented";
+void Softmax(const size_t nrow, const size_t ncol, const Blob *in, Blob *out,
+             Context *ctx) {
+  LOG(FATAL) << "Softmax Not Implemented";
 }
 
 // TODO(wangwei) unify SumRow and SumCol.
 /// Sum the rows of the input matrix into a vector
 template <typename DType, typename Lang>
-void SumRows(int nrow, int ncol, const Blob *input, Blob *ret, Context *ctx) {
-  LOG(FATAL) << "Not Implemented";
+void SumRows(const size_t nrow, const size_t ncol, const Blob *in, Blob *out,
+             Context *ctx) {
+  LOG(FATAL) << "SumRows Not Implemented";
 }
 
 /// Sum the columns of the input matrix into a vector
 template <typename DType, typename Lang>
-void SumColumns(int nrow, int ncol, const Blob *input, Blob *ret,
-                Context *ctx) {
-  LOG(FATAL) << "Not Implemented";
+void SumColumns(const size_t nrow, const size_t ncol, const Blob *in,
+                Blob *out, Context *ctx) {
+  LOG(FATAL) << "SumColumns Not Implemented";
 }
 
 // TODO(wangwei) unify AddRow and AddCol.
-/// Add the vector v to every row of A as the row of ret
+/// Add the vector v to every row of A as the row of out 
 template <typename DType, typename Lang>
-void AddRow(int nrow, int ncol, const Blob *A, const Blob *v, Blob *ret,
-            Context *ctx) {
-  LOG(FATAL) << "Not Implemented";
+void AddRow(const size_t nrow, const size_t ncol, const Blob *A, const Blob *v, 
+            Blob *out, Context *ctx) {
+  LOG(FATAL) << "AddRow Not Implemented";
 }
 
-/// Add the vector v to every column of A as the column of ret
+/// Add the vector v to every column of A as the column of out
 template <typename DType, typename Lang>
-void AddCol(int nrow, int ncol, const Blob *A, const Blob *v, Blob *ret,
-            Context *ctx) {
-  LOG(FATAL) << "Not Implemented";
+void AddCol(const size_t nrow, const size_t ncol, const Blob *A, const Blob *v, 
+            Blob *out, Context *ctx) {
+  LOG(FATAL) << "AddCol Not Implemented";
 }
 
 /// Element-wise operation, do v^x for every v from the input tensor
 template <typename DType, typename Lang>
-void Pow(int count, const Blob *input, DType x, Blob *ret, Context *ctx) {
-  LOG(FATAL) << "Not Implemented";
+void Pow(const size_t num, const Blob *in, const DType x, Blob *out, Context *ctx) {
+  LOG(FATAL) << "Pow Not Implemented";
 }
 
 /// Element-wise operation, do v^x for every v from the lhs and every x from rhs
 template <typename DType, typename Lang>
-void Pow(int count, const Blob *lhs, const Blob *rhs, Blob *ret, Context *ctx) {
-  LOG(FATAL) << "Not Implemented";
+void Pow(const size_t num, const Blob *in1, const Blob *in2, Blob *out,
+         Context *ctx) {
+  LOG(FATAL) << "Pow-Pair Not Implemented";
 }
 
 /// Element-wise operation, clamp every element into [low, high]
 /// if x>high, then x=high; if x<low, then x=low.
 template <typename DType, typename Lang>
-void Clamp(int count, DType low, DType high, const Blob *input, Blob *ret,
-           Context *ctx) {
-  LOG(FATAL) << "Not Implemented";
+void Clamp(const size_t num, const DType low, const DType high, const Blob *in,
+           Blob *out, Context *ctx) {
+  LOG(FATAL) << "Clamp Not Implemented";
 }
 
 /// ret = input + x
 template <typename DType, typename Lang>
-void Add(int count, const Blob *input, DType x, Blob *ret, Context *ctx) {
-  LOG(FATAL) << "Not Implemented";
+void Add(const size_t num, const Blob *in, const DType x, Blob *out,
+         Context *ctx) {
+  LOG(FATAL) << "Add Not Implemented";
 }
+
+/// ret = lhs + rhs
+template <typename DType, typename Lang>
+void Add(const size_t num, const Blob *in1, const Blob *in2, Blob *out,
+         Context *ctx) {
+  LOG(FATAL) << "Add-Pair Not Implemented";
+}
+
 /// ret =  input - x
 template <typename DType, typename Lang>
-void Sub(int count, const Blob *input, DType x, Blob *ret, Context *ctx) {
-  Add<DType, Lang>(count, input, -x, ret, ctx);
+void Sub(const size_t num, const Blob *in, const DType x, Blob *out, Context *ctx) {
+  Add<DType, Lang>(num, in, -x, out, ctx);
 }
-/// ret = input * x
+
+/// ret = lhs - rhs
 template <typename DType, typename Lang>
-void EltwiseMult(int count, const Blob *input, DType x, Blob *ret,
-                 Context *ctx) {
-  LOG(FATAL) << "Not Implemented";
+void Sub(const size_t num, const Blob *in1, const Blob *in2, Blob *out,
+         Context *ctx) {
+  LOG(FATAL) << "Sub-Pair Not Implemented";
 }
-/// ret = input / x
+
+/// ret = input * x
 template <typename DType, typename Lang>
-void Div(int count, const Blob *input, DType x, Blob *ret, Context *ctx) {
-  EltwiseMult<DType, Lang>(count, input, DType(1) / x, ret, ctx);
+void EltwiseMult(const size_t num, const Blob *in, const DType x, Blob *out,
+                 Context *ctx) {
+  LOG(FATAL) << "EltwiseMult Not Implemented";
 }
 
-/// ret = lhs + rhs
+/// ret = lhs * rhs
 template <typename DType, typename Lang>
-void Add(int count, const Blob *lhs, const Blob *rhs, Blob *ret, Context *ctx) {
-  LOG(FATAL) << "Not Implemented";
+void EltwiseMult(const size_t num, const Blob *in1, const Blob *in2,
+                 Blob *out, Context *ctx) {
+  LOG(FATAL) << "EltwiseMult-Pair Not Implemented";
 }
 
-/// ret = lhs - rhs
+/// ret = x / input
 template <typename DType, typename Lang>
-void Sub(int count, const Blob *lhs, const Blob *rhs, Blob *ret, Context *ctx) {
-  LOG(FATAL) << "Not Implemented";
+void Div(const size_t num, const DType x, const Blob *in, Blob *out,
+         Context *ctx) {
+  LOG(FATAL) << "Div Not Implemented";
 }
 
-/// ret = lhs * rhs
+/// ret = input / x
 template <typename DType, typename Lang>
-void EltwiseMult(int count, const Blob *lhs, const Blob *rhs, Blob *ret,
-                 Context *ctx) {
-  LOG(FATAL) << "Not Implemented";
+void Div(const size_t num, const Blob *in, const DType x, Blob *out,
+         Context *ctx) {
+  CHECK_NE(x, 0.f);
+  EltwiseMult<DType, Lang>(num, in, DType(1) / x, out, ctx);
 }
 
 /// ret = lhs / rhs
 template <typename DType, typename Lang>
-void Div(int count, const Blob *lhs, const Blob *rhs, Blob *ret, Context *ctx) {
-  LOG(FATAL) << "Not Implemented";
+void Div(const size_t num, const Blob *in1, const Blob *in2, Blob *out,
+         Context *ctx) {
+  LOG(FATAL) << "Div-Pair Not Implemented";
 }
 
 /// outer-product.
 /// lhs and rhs are vectors of len m and n. ret is matrix of shape m * n
 template <typename DType, typename Lang>
-void Outer(int m, int n, const Blob *lhs, const Blob *rhs, Blob *ret,
-           Context *ctx) {
-  LOG(FATAL) << "Not Implemented";
+void Outer(const size_t m, const size_t n, const Blob *in1, const Blob *in2,
+           Blob *out, Context *ctx) {
+  LOG(FATAL) << "Outer Not Implemented";
 }
 
 /// ret[i]=(input[i]<x)?1.f:0.f
 template <typename DType, typename Lang>
-void LT(int count, const Blob *input, float x, Blob *ret, Context *ctx) {
-  LOG(FATAL) << "Not Implemented";
+void LT(const size_t num, const Blob *in, const DType x, Blob *out, Context *ctx) {
+  LOG(FATAL) << "LT Not Implemented";
 }
 /// ret[i]=(input[i]<=x)?1.f:0.f
 template <typename DType, typename Lang>
-void LE(int count, const Blob *input, float x, Blob *ret, Context *ctx) {
-  LOG(FATAL) << "Not Implemented";
+void LE(const size_t num, const Blob *in, const DType x, Blob *out, Context *ctx) {
+  LOG(FATAL) << "LE Not Implemented";
 }
 /// ret[i]=(input[i]>x)?1.f:0.f
 template <typename DType, typename Lang>
-void GT(int count, const Blob *input, float x, Blob *ret, Context *ctx) {
-  LOG(FATAL) << "Not Implemented";
+void GT(const size_t num, const Blob *in, const DType x, Blob *out, Context *ctx) {
+  LOG(FATAL) << "GT Not Implemented";
 }
-/// ret[i]=(input[i]>x)?1.f:0.f
+/// ret[i]=(input[i]>=x)?1.f:0.f
 template <typename DType, typename Lang>
-void GE(int count, const Blob *input, float x, Blob *ret, Context *ctx) {
-  LOG(FATAL) << "Not Implemented";
+void GE(const size_t num, const Blob *in, const DType x, Blob *out, Context *ctx) {
+  LOG(FATAL) << "GE Not Implemented";
 }
 
 // ===== BLAS functions, ref to http://docs.nvidia.com/cuda/cublas
 // ===== Level 1
 /// return the index of the element with the max value.
 template <typename DType, typename Lang>
-void Amax(int count, const Blob *input, int *ret, Context *ctx) {
-  LOG(FATAL) << "Not Implemented";
+void Amax(const size_t num, const Blob *in, size_t *out, Context *ctx) {
+  LOG(FATAL) << "Amax Not Implemented";
 }
 
 /// return the index of the element with the min value.
 template <typename DType, typename Lang>
-void Amin(int count, const Blob *input, int *ret, Context *ctx) {
-  LOG(FATAL) << "Not Implemented";
+void Amin(const size_t num, const Blob *in, size_t *out, Context *ctx) {
+  LOG(FATAL) << "Amin Not Implemented";
 }
 /// ret = sum |x| for all x in input
 template <typename DType, typename Lang>
-void Asum(int count, const Blob *input, DType *ret, Context *ctx) {
-  LOG(FATAL) << "Not Implemented";
+void Asum(const size_t num, const Blob *in, DType *out, Context *ctx) {
+  LOG(FATAL) << "Asum Not Implemented";
 }
 
 /// ret = alpha * input + ret
 template <typename DType, typename Lang>
-void Axpy(int count, DType alpha, const Blob *input, Blob *ret, Context *ctx) {
-  LOG(FATAL) << "Not Implemented";
+void Axpy(const size_t num, const DType alpha, const Blob *in, Blob *out,
+          Context *ctx) {
+  LOG(FATAL) << "Axpy Not Implemented";
 }
 
 /// ret *= x
 template <typename DType, typename Lang>
-void Scale(int count, DType x, Blob *ret, Context *ctx) {
-  LOG(FATAL) << "Not Implemented";
+void Scale(const size_t num, const DType x, Blob *out, Context *ctx) {
+  LOG(FATAL) << "Scale Not Implemented";
 }
 
 template <typename DType, typename Lang>
-void Dot(const size_t num, const Blob *in1, const Blob *in2, DType *out,
-         Context *ctx) {
-  LOG(FATAL) << "Not Implemented";
+void Dot(const size_t num, const Blob *in1, const Blob *in2, DType *out,
+         Context *ctx) {
+  LOG(FATAL) << "Dot Not Implemented";
 }
 
 // ===== Level 2
 /// ret = alpha * op(A) * v + beta * ret.
 /// op(A) = A if trans = false; A^T otherwise; rows(op(A)) = m, cols(op(A)) = n.
 template <typename DType, typename Lang>
-void GEMV(bool trans, int m, int n, DType alpha, const Blob *A, const Blob *v,
-          DType beta, Blob *ret, Context *ctx) {
-  LOG(FATAL) << "Not Implemented";
-}
-
-// ===== Level 3
-
-// ================Random functions===========================================
-/// Each element of ret would be 1 with prob p and 0 with 1-p. 0<= p <= 1
-// Get the random generator from 'ctx'
-// If DType is not float, then convert the threshold to DType
-template <typename DType, typename Lang>
-void Bernoulli(int count, float p, Blob *ret, Context *ctx) {
-  LOG(FATAL) << "Not Implemented";
-}
-// The random generator should be extracted from ctx.
-// If DType is not float, then convert the low and high to DType
-template <typename DType, typename Lang>
-void Uniform(int count, float low, float high, Blob *ret, Context *ctx) {
-  LOG(FATAL) << "Not Implemented";
-}
-// The random generator should be extracted from ctx.
-// If DType is not float, then convert the mean and std to DType
-template <typename DType, typename Lang>
-void Gaussian(int count, float mean, float std, Blob *ret, Context *ctx) {
-  LOG(FATAL) << "Not Implemented";
-}
-
-// ========follow the consistency guide of math API
-
-template <typename DType, typename Lang>
-void Set(const size_t num, const DType x, Blob *out, Context *ctx) {
-  LOG(FATAL) << "Not Implemented";
-}
-/// Divide alpha by each element of 'in'.
-template <typename DType, typename Lang>
-void Div(const size_t num, const DType alpha, const Blob *in, Blob *out,
-         Context *ctx) {
-  LOG(FATAL) << "Not Implemented";
+void GEMV(bool trans, const size_t m, const size_t n, const DType alpha,
+          const Blob *A, const Blob *v, const DType beta, Blob *out,
+          Context *ctx) {
+  LOG(FATAL) << "GEMV Not Implemented";
 }
 
 /// multiply a matrix with a diagonal matrix constructed using values from 'v'.
@@ -328,7 +310,7 @@ void Div(const size_t num, const DType alpha, const Blob *in, Blob *out,
 template <typename DType, typename Lang>
 void DGMM(const bool side_right, const size_t nrow, const size_t ncol,
           const Blob *M, const Blob *v, Blob *out, Context *ctx) {
-  LOG(FATAL) << "Not Implemented";
+  LOG(FATAL) << "DGMM Not Implemented";
 }
 
 /// C = alpha * A * B + beta * C.
@@ -338,32 +320,37 @@ void GEMM(const bool transA, const bool transB, const size_t nrowA,
           const size_t ncolB, const size_t ncolA, const DType alpha,
           const Blob *A, const Blob *B, const DType beta, Blob *C,
           Context *ctx) {
-  LOG(FATAL) << "Not Implemented";
+  LOG(FATAL) << "GEMM Not Implemented";
 }
-/// ret[i]=(input[i]<x)?1.f:0.f
-template <typename DType, typename Lang>
-void LT(const size_t num, const Blob *in, const DType x, Blob *out,
-        Context *ctx) {
-  LOG(FATAL) << "Not Implemented";
-}
-/// ret[i]=(input[i]<=x)?1.f:0.f
+
+
+// ===== Level 3
+
+// ================Random functions===========================================
+/// Each element of ret would be 1 with prob p and 0 with prob 1-p, 0 <= p <= 1.
+// Get the random generator from 'ctx'
+// If DType is not float, then convert the threshold to DType
 template <typename DType, typename Lang>
-void LE(const size_t num, const Blob *in, const DType x, Blob *out,
-        Context *ctx) {
-  LOG(FATAL) << "Not Implemented";
+void Bernoulli(const size_t num, const float p, Blob *out, Context *ctx) {
+  LOG(FATAL) << "Bernoulli Not Implemented";
 }
-/// ret[i]=(input[i]>x)?1.f:0.f
+// The random generator should be extracted from ctx.
+// If DType is not float, then convert the low and high to DType
 template <typename DType, typename Lang>
-void GT(const size_t num, const Blob *in, const DType x, Blob *out,
-        Context *ctx) {
-  LOG(FATAL) << "Not Implemented";
+void Uniform(const size_t num, const float low, const float high, Blob *out,
+             Context *ctx) {
+  LOG(FATAL) << "Uniform Not Implemented";
 }
-/// ret[i]=(input[i]>=x)?1.f:0.f
+// The random generator should be extracted from ctx.
+// If DType is not float, then convert the mean and std to DType
 template <typename DType, typename Lang>
-void GE(const size_t num, const Blob *in, const DType x, Blob *out,
-        Context *ctx) {
-  LOG(FATAL) << "Not Implemented";
+void Gaussian(const size_t num, const float mean, const float std, Blob *out,
+              Context *ctx) {
+  LOG(FATAL) << "Gaussian Not Implemented";
 }
 
+
 }  // namespace singa
 #endif  // SINGA_CORE_MATH_H_
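
A side note on the DGMM declaration in the diff above: it follows the cuBLAS
dgmm convention (see the cublas link in the BLAS section). Below is a rough
sketch of the intended semantics on raw row-major arrays; this is an
illustration under an assumed storage order, not the commit's Blob-based
implementation:

#include <cstddef>

// side_right == true:  out = M * diag(v)  (column c of M scaled by v[c])
// side_right == false: out = diag(v) * M  (row r of M scaled by v[r])
void dgmm_sketch(bool side_right, std::size_t nrow, std::size_t ncol,
                 const float *M, const float *v, float *out) {
  for (std::size_t r = 0; r < nrow; r++)
    for (std::size_t c = 0; c < ncol; c++)
      out[r * ncol + c] = M[r * ncol + c] * (side_right ? v[c] : v[r]);
}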

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/07c49da5/src/core/tensor/tensor_math_cpp.h
----------------------------------------------------------------------
diff --git a/src/core/tensor/tensor_math_cpp.h b/src/core/tensor/tensor_math_cpp.h
index 693f09c..ec7a892 100644
--- a/src/core/tensor/tensor_math_cpp.h
+++ b/src/core/tensor/tensor_math_cpp.h
@@ -27,195 +27,317 @@
 
 /// TODO(wangwei) Clean the implementations following the comments in
 /// tensor_math.h.
-/// For Blob argument xxx, name its pointer as xxxPtr.
 namespace singa {
+
+template<>
+void Abs<float, lang::Cpp>(const size_t num, const Blob *in, Blob *out, Context *ctx) {
+  float *outPtr = static_cast<float *>(out->mutable_data());
+  const float *inPtr = static_cast<const float *>(in->data());
+  for (size_t i = 0; i < num; i++) {
+    outPtr[i] = fabs(inPtr[i]);
+  }
+}
+
 template <>
-void Square<float, lang::Cpp>(int count, const Blob *input, Blob *ret,
-                              Context *ctx) {
-  float *dptr = static_cast<float *>(ret->mutable_data());
-  const float *in = static_cast<const float *>(input->data());
-  for (int i = 0; i < count; i++) {
-    dptr[i] = in[i] * in[i];
+void Set<float, lang::Cpp>(const size_t num, const float x, Blob *out, Context *ctx) {
+  float *outPtr = static_cast<float *>(out->mutable_data());
+  for (size_t i = 0; i < num; i++) outPtr[i] = x;
+}
+
+// sum all elements of input into out
+// TODO(wangwei) optimize using omp
+template <>
+void Sum<float, lang::Cpp>(const size_t num, const Blob *in, float *out, Context *ctx) {
+  float s = 0.f;
+  const float *inPtr = static_cast<const float *>(in->data());
+  for (size_t i = 0; i < num; i++) {
+    s += inPtr[i];
   }
+  *out = s;
 }
 
 template <>
-void Add<float, lang::Cpp>(int count, const Blob *lhs, const Blob *rhs,
-                           Blob *ret, Context *ctx) {
-  // CHECK_EQ(ctx->stream, nullptr);
-  float *dptr = static_cast<float *>(ret->mutable_data());
-  const float *lptr = static_cast<const float *>(lhs->data());
-  const float *rptr = static_cast<const float *>(rhs->data());
-  for (int i = 0; i < count; i++) {
-    dptr[i] = lptr[i] + rptr[i];
+void Sign<float, lang::Cpp>(const size_t num, const Blob *in, Blob *out, Context *ctx) {
+  float *outPtr = static_cast<float *>(out->mutable_data());
+  const float *inPtr = static_cast<const float*>(in->data());
+  for (size_t i = 0; i < num; i++) {
+    outPtr[i] = inPtr[i] > 0 ? 1.0f : 0.0f; 
   }
 }
 
 template <>
-void Add<float, lang::Cpp>(int count, const Blob *input, float x, Blob *ret,
-                           Context *ctx) {
-  float *dptr = static_cast<float *>(ret->mutable_data());
-  const float *lptr = static_cast<const float *>(input->data());
-  for (int i = 0; i < count; i++) {
-    dptr[i] = lptr[i] + x;
+void Exp<float, lang::Cpp>(const size_t num, const Blob *in, Blob *out, Context *ctx) {
+  float *outPtr = static_cast<float *>(out->mutable_data());
+  const float *inPtr = static_cast<const float *>(in->data());
+  for (size_t i = 0; i < num; i++) {
+    outPtr[i] = exp(inPtr[i]);
   }
 }
 
 template <>
-void Sub<float, lang::Cpp>(int count, const Blob *lhs, const Blob *rhs,
-                           Blob *ret, Context *ctx) {
-  // CHECK_EQ(ctx->stream, nullptr);
-  float *dptr = static_cast<float *>(ret->mutable_data());
-  const float *lptr = static_cast<const float *>(lhs->data());
-  const float *rptr = static_cast<const float *>(rhs->data());
-  for (int i = 0; i < count; i++) {
-    dptr[i] = lptr[i] - rptr[i];
+void Log<float, lang::Cpp>(const size_t num, const Blob *in, Blob *out, Context *ctx) {
+  float *outPtr = static_cast<float *>(out->mutable_data());
+  const float *inPtr = static_cast<const float *>(in->data());
+  for (size_t i = 0; i < num; i++) {
+    CHECK_GT(inPtr[i], 0.f);
+    outPtr[i] = log(inPtr[i]);
   }
 }
 
-// sum all elements of input into ret
-// TODO(wangwei) optimize using omp
 template <>
-void Sum<float, lang::Cpp>(int count, const Blob *input, float *ret,
-                           Context *ctx) {
-  float s = 0.f;
-  const float *in = static_cast<const float *>(input->data());
-  for (int i = 0; i < count; i++) {
-    s += in[i];
+void Sqrt<float, lang::Cpp>(const size_t num, const Blob *in, Blob *out, Context *ctx) {
+  float *outPtr = static_cast<float *>(out->mutable_data());
+  const float *inPtr = static_cast<const float *>(in->data());
+  for (size_t i = 0; i < num; i++) {
+    CHECK_GE(inPtr[i], 0.f);
+    outPtr[i] = sqrt(inPtr[i]);
   }
-  *ret = s;
 }
 
 template <>
-void EltwiseMult<float, lang::Cpp>(int count, const Blob *input, float x,
-                                   Blob *ret, Context *ctx) {
-  float *dptr = static_cast<float *>(ret->mutable_data());
-  const float *lptr = static_cast<const float *>(input->data());
-  for (int i = 0; i < count; i++) {
-    dptr[i] = lptr[i] * x;
+void Square<float, lang::Cpp>(const size_t num, const Blob *in, Blob *out, Context *ctx) {
+  float *outPtr = static_cast<float *>(out->mutable_data());
+  const float *inPtr = static_cast<const float *>(in->data());
+  for (size_t i = 0; i < num; i++) {
+    outPtr[i] = inPtr[i] * inPtr[i];
   }
 }
 
 template <>
-void EltwiseMult<float, lang::Cpp>(int count, const Blob *lhs, const Blob *rhs,
-                                   Blob *ret, Context *ctx) {
-  float *dptr = static_cast<float *>(ret->mutable_data());
-  const float *lptr = static_cast<const float *>(lhs->data());
-  const float *rptr = static_cast<const float *>(rhs->data());
-  for (int i = 0; i < count; i++) {
-    dptr[i] = lptr[i] * rptr[i];
+void Tanh<float, lang::Cpp>(const size_t num, const Blob *in, Blob *out, Context *ctx) {
+  float *outPtr = static_cast<float *>(out->mutable_data());
+  const float *inPtr = static_cast<const float *>(in->data());
+  for (size_t i = 0; i < num; i++) {
+    outPtr[i] = tanh(inPtr[i]);
   }
 }
 
 template <>
-void Exp<float, lang::Cpp>(int count, const Blob *input, Blob *ret,
-                           Context *ctx) {
-  float *dptr = static_cast<float *>(ret->mutable_data());
-  const float *lptr = static_cast<const float *>(input->data());
-  for (int i = 0; i < count; i++) {
-    dptr[i] = exp(lptr[i]);
+void ReLU<float, lang::Cpp>(const size_t num, const Blob *in, Blob *out, Context *ctx) {
+  float *outPtr = static_cast<float *>(out->mutable_data());
+  const float *inPtr = static_cast<const float *>(in->data());
+  for (size_t i = 0; i < num; i++) {
+    outPtr[i] = (inPtr[i] >= 0.f) ? inPtr[i] : 0.f;
   }
 }
 
 template <>
-void Log<float, lang::Cpp>(int count, const Blob *input, Blob *ret,
-                           Context *ctx) {
-  float *dptr = static_cast<float *>(ret->mutable_data());
-  const float *lptr = static_cast<const float *>(input->data());
-  for (int i = 0; i < count; i++) {
-    CHECK_GT(lptr[i], 0.f);
-    dptr[i] = log(lptr[i]);
+void Sigmoid<float, lang::Cpp>(const size_t num, const Blob *in, Blob *out, Context *ctx) {
+  float *outPtr = static_cast<float *>(out->mutable_data());
+  const float *inPtr = static_cast<const float *>(in->data());
+  for (size_t i = 0; i < num; i++) {
+    outPtr[i] = 1.f / (1.f + exp(-inPtr[i]));
   }
 }
 
 template <>
-void Tanh<float, lang::Cpp>(int count, const Blob *input, Blob *ret,
-                            Context *ctx) {
-  float *dptr = static_cast<float *>(ret->mutable_data());
-  const float *lptr = static_cast<const float *>(input->data());
-  for (int i = 0; i < count; i++) {
-    dptr[i] = tanh(lptr[i]);
+void Softmax<float, lang::Cpp>(const size_t nrow, const size_t ncol,
+                               const Blob *in, Blob *out, Context *ctx) {
+  float *outPtr = static_cast<float *>(out->mutable_data());
+  const float *inPtr = static_cast<const float *>(in->data());
+  float *bPtr = new float[ncol];
+  for (size_t r = 0; r < nrow; r++) {
+    size_t offset = r * ncol;
+    float denom = 0.f;
+    for (size_t c = 0; c < ncol; c++) {
+      bPtr[c] = exp(inPtr[offset + c]);
+      denom += bPtr[c];
+    }
+    for (size_t c = 0; c < ncol; c++) {
+      size_t idx = offset + c;
+      outPtr[idx] = bPtr[c] / denom;
+    }
   }
+  delete[] bPtr;
 }
 
 template <>
-void ReLU<float, lang::Cpp>(int count, const Blob *input, Blob *ret,
-                            Context *ctx) {
-  float *dptr = static_cast<float *>(ret->mutable_data());
-  const float *lptr = static_cast<const float *>(input->data());
-  for (int i = 0; i < count; i++) {
-    dptr[i] = (lptr[i] >= 0.f) ? lptr[i] : 0.f;
+void SumRows<float, lang::Cpp>(const size_t nrow, const size_t ncol,
+                               const Blob *in, Blob *out, Context *ctx) {
+  float *outPtr = static_cast<float *>(out->mutable_data());
+  const float *inPtr = static_cast<const float *>(in->data());
+  for (size_t r = 0; r < nrow; r++) {
+    size_t offset = r * ncol;
+    outPtr[r] = 0.f;
+    for (size_t c = 0; c < ncol; c++) {
+      outPtr[r] += inPtr[offset + c];
+    }
+  }
+}
+
+template <>
+void SumColumns<float, lang::Cpp>(const size_t nrow, const size_t ncol,
+                                  const Blob *in, Blob *out, Context *ctx) {
+  float *outPtr = static_cast<float *>(out->mutable_data());
+  const float *inPtr = static_cast<const float *>(in->data());
+  for (size_t c = 0; c < ncol; c++) {
+    outPtr[c] = 0.f;
+  }
+  for (size_t r = 0; r < nrow; r++) {
+    size_t offset = r * ncol;
+    for (size_t c = 0; c < ncol; c++) {
+      outPtr[c] += inPtr[offset + c];
+    }
+  }
+}
+
+template <>
+void AddRow<float, lang::Cpp>(const size_t nrow, const size_t ncol,
+                              const Blob *A, const Blob *v, Blob *out,
+                              Context *ctx) {
+  float *outPtr = static_cast<float *>(out->mutable_data());
+  const float *APtr = static_cast<const float *>(A->data());
+  const float *vPtr = static_cast<const float *>(v->data());
+  for (size_t r = 0; r < nrow; r++) {
+    size_t offset = r * ncol;
+    for (size_t c = 0; c < ncol; c++) {
+      outPtr[offset + c] = APtr[offset + c] + vPtr[c];
+    }
+  }
+}
+
+template <>
+void AddCol<float, lang::Cpp>(const size_t nrow, const size_t ncol,
+                              const Blob *A, const Blob *v, Blob *out,
+                              Context *ctx) {
+  float *outPtr = static_cast<float *>(out->mutable_data());
+  const float *APtr = static_cast<const float *>(A->data());
+  const float *vPtr = static_cast<const float *>(v->data());
+  for (size_t r = 0; r < nrow; r++) {
+    size_t offset = r * ncol;
+    for (size_t c = 0; c < ncol; c++) {
+      outPtr[offset + c] = APtr[offset + c] + vPtr[r];
+    }
+  }
+}
+
+template <>
+void Pow<float, lang::Cpp>(const size_t num, const Blob *in, const float x,
+                           Blob *out, Context *ctx) {
+  float *outPtr = static_cast<float *>(out->mutable_data());
+  const float *inPtr = static_cast<const float *>(in->data());
+  for (size_t i = 0; i < num; i++) {
+    outPtr[i] = pow(inPtr[i], x);
   }
 }
 
 template <>
-void Sigmoid<float, lang::Cpp>(int count, const Blob *input, Blob *ret,
-                               Context *ctx) {
-  float *dptr = static_cast<float *>(ret->mutable_data());
-  const float *lptr = static_cast<const float *>(input->data());
-  for (int i = 0; i < count; i++) {
-    dptr[i] = 1.f / (1.f + exp(-lptr[i]));
+void Pow<float, lang::Cpp>(const size_t num, const Blob *in1, const Blob *in2,
+                           Blob *out, Context *ctx) {
+  float *outPtr= static_cast<float *>(out->mutable_data());
+  const float *in1Ptr= static_cast<const float *>(in1->data());
+  const float *in2Ptr = static_cast<const float *>(in2->data());
+  for (size_t i = 0; i < num; i++) {
+    outPtr[i] = pow(in1Ptr[i], in2Ptr[i]);
   }
 }
 
 template <>
-void Pow<float, lang::Cpp>(int count, const Blob *input, float x, Blob *ret,
-                           Context *ctx) {
-  float *dptr = static_cast<float *>(ret->mutable_data());
-  const float *lptr = static_cast<const float *>(input->data());
-  for (int i = 0; i < count; i++) {
-    dptr[i] = pow(lptr[i], x);
+void Clamp<float, lang::Cpp>(const size_t num, const float low,
+                             const float high, const Blob *in, Blob *out,
+                             Context *ctx) {
+  float *outPtr = static_cast<float *>(out->mutable_data());
+  const float *inPtr = static_cast<const float *>(in->data());
+  for (size_t i = 0; i < num; i++) {
+    if (inPtr[i] > high) {
+      outPtr[i] = high;
+    } else if (inPtr[i] < low) {
+      outPtr[i] = low;
+    } else {
+      outPtr[i] = inPtr[i];
+    }
+  }
+}
+
+template <>
+void Add<float, lang::Cpp>(const size_t num, const Blob *in, const float x,
+                           Blob *out, Context *ctx) {
+  float *outPtr = static_cast<float *>(out->mutable_data());
+  const float *inPtr = static_cast<const float *>(in->data());
+  for (size_t i = 0; i < num; i++) {
+    outPtr[i] = inPtr[i] + x;
   }
 }
 
 template <>
-void Pow<float, lang::Cpp>(int count, const Blob *lhs, const Blob *rhs,
-                           Blob *ret, Context *ctx) {
-  float *dptr = static_cast<float *>(ret->mutable_data());
-  const float *lptr = static_cast<const float *>(lhs->data());
-  const float *rptr = static_cast<const float *>(rhs->data());
-  for (int i = 0; i < count; i++) {
-    dptr[i] = pow(lptr[i], rptr[i]);
+void Add<float, lang::Cpp>(const size_t num, const Blob *in1, const Blob *in2,
+                           Blob *out, Context *ctx) {
+  // CHECK_EQ(ctx->stream, nullptr);
+  float *outPtr= static_cast<float *>(out->mutable_data());
+  const float *in1Ptr = static_cast<const float *>(in1->data());
+  const float *in2Ptr = static_cast<const float *>(in2->data());
+  for (size_t i = 0; i < num; i++) {
+    outPtr[i] = in1Ptr[i] + in2Ptr[i];
   }
 }
 
 template <>
-void Bernoulli<float, lang::Cpp>(int count, float p, Blob *ret, Context *ctx) {
-  std::bernoulli_distribution distribution(p);
-  float *ptr = static_cast<float *>(ret->mutable_data());
-  for (int i = 0; i < count; i++) {
-    ptr[i] = distribution(ctx->random_generator) ? 1.0f : 0.0f;
+void Sub<float, lang::Cpp>(const size_t num, const Blob *in1, const Blob *in2,
+                           Blob *out, Context *ctx) {
+  // CHECK_EQ(ctx->stream, nullptr);
+  float *outPtr= static_cast<float *>(out->mutable_data());
+  const float *in1Ptr = static_cast<const float *>(in1->data());
+  const float *in2Ptr = static_cast<const float *>(in2->data());
+  for (size_t i = 0; i < num; i++) {
+    outPtr[i] = in1Ptr[i] - in2Ptr[i];
   }
 }
 
 template <>
-void Uniform<float, lang::Cpp>(int count, float low, float high, Blob *ret,
-                               Context *ctx) {
-  std::uniform_real_distribution<float> distribution(low, high);
-  float *ptr = static_cast<float *>(ret->mutable_data());
-  for (int i = 0; i < count; i++) {
-    ptr[i] = static_cast<float>(distribution(ctx->random_generator));
+void EltwiseMult<float, lang::Cpp>(const size_t num, const Blob *in, const float x,
+                                   Blob *out, Context *ctx) {
+  float *outPtr= static_cast<float *>(out->mutable_data());
+  const float *inPtr = static_cast<const float *>(in->data());
+  for (size_t i = 0; i < num; i++) {
+    outPtr[i] = inPtr[i] * x;
   }
 }
 
 template <>
-void Gaussian<float, lang::Cpp>(int count, float mean, float std, Blob *ret,
-                                Context *ctx) {
-  std::normal_distribution<float> distribution(mean, std);
-  float *ptr = static_cast<float *>(ret->mutable_data());
-  for (int i = 0; i < count; i++) {
-    ptr[i] = static_cast<float>(distribution(ctx->random_generator));
+void EltwiseMult<float, lang::Cpp>(const size_t num, const Blob *in1, const Blob *in2,
+                                   Blob *out, Context *ctx) {
+  float *outPtr= static_cast<float *>(out->mutable_data());
+  const float *in1Ptr = static_cast<const float *>(in1->data());
+  const float *in2Ptr = static_cast<const float *>(in2->data());
+  for (size_t i = 0; i < num; i++) {
+    outPtr[i] = in1Ptr[i] * in2Ptr[i];
   }
 }
 
-// follow the consistency guide of math API
 template <>
-void Div<float, lang::Cpp>(const size_t num, const float alpha, const Blob *in,
-                           Blob *out, Context *ctx) {
-  float *outPtr = static_cast<float *>(out->mutable_data());
+void Div<float, lang::Cpp>(const size_t num, const Blob *in1, const Blob *in2,
+                           Blob *out, Context *ctx) {
+  float *outPtr = static_cast<float *>(out->mutable_data());
+  const float *in1Ptr = static_cast<const float *>(in1->data());
+  const float *in2Ptr = static_cast<const float *>(in2->data());
+  for (size_t i = 0; i < num; i++) {
+    CHECK_NE(in2Ptr[i], 0.f);
+    outPtr[i] = in1Ptr[i] / in2Ptr[i];
+  }
+}
+
+template <>
+void Div<float, lang::Cpp>(const size_t num, const float x, const Blob *in,
+                           Blob *out, Context *ctx) {
+  float *outPtr = static_cast<float *>(out->mutable_data());
   const float *inPtr = static_cast<const float *>(in->data());
-  for (size_t i = 0; i < num; i++) outPtr[i] = alpha / inPtr[i];
+  for (size_t i = 0; i < num; i++) {
+    CHECK_NE(inPtr[i], 0.f);
+    outPtr[i] = x / inPtr[i];
+  }
 }
+
+template <>
+void Outer<float, lang::Cpp>(const size_t m, const size_t n, const Blob *in1,
+                             const Blob *in2, Blob *out, Context *ctx) {
+  float *outPtr = static_cast<float *>(out->mutable_data());
+  const float *in1Ptr = static_cast<const float *>(in1->data());
+  const float *in2Ptr = static_cast<const float *>(in2->data());
+  for (size_t r = 0; r < m; r++) {
+    size_t offset = r * n;
+    for (size_t c = 0; c < n; c++) {
+      outPtr[offset + c] = in1Ptr[r] * in2Ptr[c];
+    }
+  }
+}
+
 template <>
 void LT<float, lang::Cpp>(const size_t num, const Blob *in, const float x,
                           Blob *out, Context *ctx) {
@@ -227,6 +349,125 @@ void LT<float, lang::Cpp>(const size_t num, const Blob *in, const float x,
 }
 
 template <>
+void LE<float, lang::Cpp>(const size_t num, const Blob *in, const float x,
+                          Blob *out, Context *ctx) {
+  float *outPtr = static_cast<float *>(out->mutable_data());
+  const float *inPtr = static_cast<const float *>(in->data());
+  for (size_t i = 0; i < num; i++) {
+    outPtr[i] = (inPtr[i] <= x) ? 1.f : 0.f;
+  }
+}
+
+template <>
+void GT<float, lang::Cpp>(const size_t num, const Blob *in, const float x,
+                          Blob *out, Context *ctx) {
+  float *outPtr = static_cast<float *>(out->mutable_data());
+  const float *inPtr = static_cast<const float *>(in->data());
+  for (size_t i = 0; i < num; i++) {
+    outPtr[i] = (inPtr[i] > x) ? 1.f : 0.f;
+  }
+}
+
+template <>
+void GE<float, lang::Cpp>(const size_t num, const Blob *in, const float x,
+                          Blob *out, Context *ctx) {
+  float *outPtr = static_cast<float *>(out->mutable_data());
+  const float *inPtr = static_cast<const float *>(in->data());
+  for (size_t i = 0; i < num; i++) {
+    outPtr[i] = (inPtr[i] >= x) ? 1.f : 0.f;
+  }
+}
+
+template <>
+void Amax<float, lang::Cpp>(const size_t num, const Blob *in, size_t *out,
+                            Context *ctx) {
+  size_t maxPos = 0;
+  float maxVal = 0;
+  const float *inPtr = static_cast<const float *>(in->data());
+  for (size_t i = 0; i < num; i++) {
+    if (i == 0) {
+      maxVal = inPtr[i];
+    } else if (inPtr[i] > maxVal) {
+      maxVal = inPtr[i];
+      maxPos = i;
+    }
+  }
+  *out = maxPos;
+}
+
+template <>
+void Amin<float, lang::Cpp>(const size_t num, const Blob *in, size_t *out,
+                            Context *ctx) {
+  size_t minPos = 0;
+  float minVal = 0;
+  const float *inPtr = static_cast<const float *>(in->data());
+  for (size_t i = 0; i < num; i++) {
+    if (i == 0) {
+      minVal = inPtr[i];
+    } else if (inPtr[i] < minVal) {
+      minVal = inPtr[i];
+      minPos = i;
+    }
+  }
+  *out = minPos;
+}
+
+template <>
+void Asum<float, lang::Cpp>(const size_t num, const Blob *in, float *out,
+                            Context *ctx) {
+  float sum = 0;
+  const float *inPtr = static_cast<const float *>(in->data());
+  for (size_t i = 0; i < num; i++) {
+    sum += fabs(inPtr[i]);
+  }
+  *out = sum;
+}
+
+template <>
+void Axpy<float, lang::Cpp>(const size_t num, const float alpha,
+                            const Blob *in, Blob *out, Context *ctx) {
+  float *outPtr = static_cast<float *>(out->mutable_data());
+  const float *inPtr = static_cast<const float *>(in->data());
+  for (size_t i = 0; i < num; i++) {
+    outPtr[i] += alpha * inPtr[i];
+  }
+}
+
+template <>
+void Scale<float, lang::Cpp>(const size_t num, const float x, Blob *out,
+                             Context *ctx) {
+  float *outPtr = static_cast<float *>(out->mutable_data());
+  for (size_t i = 0; i < num; i++) {
+    outPtr[i] *= x;
+  }
+}
+
+//template <>
+//void Dot<float, lang::Cpp>(const size_t num, const Blob *in1, const Blob *in2,
+//                           float *out, Context *ctx) {
+//  float sum = 0;
+//  const float *in1Ptr = static_cast<const float *>(in1->data());
+//  const float *in2Ptr = static_cast<const float *>(in2->data());
+//  for (size_t i = 0; i < num; i++) {
+//    sum += in1Ptr[i] * in2Ptr[i];
+//  }
+//  *out = sum;
+//}
+
+template <>
+void GEMV<float, lang::Cpp>(bool trans, const size_t m, const size_t n,
+                            const float alpha, const Blob *A, const Blob *v,
+                            const float beta, Blob *out, Context *ctx) {
+  float *outPtr = static_cast<float *>(out->mutable_data());
+  const float *APtr = static_cast<const float *>(A->data());
+  const float *vPtr = static_cast<const float *>(v->data());
+  for (size_t r = 0; r < m; r++) {
+    float sum = 0;
+    for (size_t c = 0; c < n; c++) {
+      size_t idx = trans ? c * m + r : r * n + c;
+      sum += APtr[idx] * vPtr[c];
+    }
+    outPtr[r] = alpha * sum + beta * outPtr[r];
+  }
+}
+
+template <>
 void DGMM<float, lang::Cpp>(const bool side_right, const size_t nrow,
                             const size_t ncol, const Blob *M, const Blob *v,
                             Blob *out, Context *ctx) {
@@ -251,41 +492,35 @@ void DGMM<float, lang::Cpp>(const bool side_right, const size_t nrow,
 }
 
 template <>
-void Set<float, lang::Cpp>(const size_t num, const float x, Blob *out,
-                           Context *ctx) {
-  float *outPtr = static_cast<float *>(out->mutable_data());
-  for (size_t i = 0; i < num; i++) outPtr[i] = x;
-}
-template <>
-void LE<float, lang::Cpp>(const size_t num, const Blob *in, const float x,
-                          Blob *out, Context *ctx) {
+void Bernoulli<float, lang::Cpp>(const size_t num, const float p, Blob *out, Context *ctx) {
+  std::bernoulli_distribution distribution(p);
   float *outPtr = static_cast<float *>(out->mutable_data());
-  const float *inPtr = static_cast<const float *>(in->data());
   for (size_t i = 0; i < num; i++) {
-    outPtr[i] = (inPtr[i] <= x) ? 1.f : 0.f;
+    outPtr[i] = distribution(ctx->random_generator) ? 1.0f : 0.0f;
   }
 }
 
 template <>
-void GT<float, lang::Cpp>(const size_t num, const Blob *in, const float x,
-                          Blob *out, Context *ctx) {
-  float *outPtr = static_cast<float *>(out->mutable_data());
-  const float *inPtr = static_cast<const float *>(in->data());
+void Uniform<float, lang::Cpp>(const size_t num, const float low, const float high, Blob *out,
+                               Context *ctx) {
+  std::uniform_real_distribution<float> distribution(low, high);
+  float *outPtr= static_cast<float *>(out->mutable_data());
   for (size_t i = 0; i < num; i++) {
-    outPtr[i] = (inPtr[i] > x) ? 1.f : 0.f;
+    outPtr[i] = static_cast<float>(distribution(ctx->random_generator));
   }
 }
 
 template <>
-void GE<float, lang::Cpp>(const size_t num, const Blob *in, const float x,
-                          Blob *out, Context *ctx) {
+void Gaussian<float, lang::Cpp>(const size_t num, const float mean, const float std, Blob *out,
+                                Context *ctx) {
+  std::normal_distribution<float> distribution(mean, std);
   float *outPtr = static_cast<float *>(out->mutable_data());
-  const float *inPtr = static_cast<const float *>(in->data());
   for (size_t i = 0; i < num; i++) {
-    outPtr[i] = (inPtr[i] >= x) ? 1.f : 0.f;
+    outPtr[i] = static_cast<float>(distribution(ctx->random_generator));
   }
 }
 
+
 #ifdef USE_CBLAS
 template <>
 void Dot<float, lang::Cpp>(const size_t num, const Blob *in1, const Blob *in2,
@@ -314,7 +549,6 @@ void GEMM<float, lang::Cpp>(const bool transA, const bool transB,
 }
 
 #endif  // USE_CBLAS
-
 }  // namespace singa
 
 #endif  // SINGA_CORE_TENSOR_TENSOR_MATH_CPP_H_
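
A note on the Softmax specialization above: it exponentiates the raw inputs,
which can overflow for large activations. A common refinement, sketched below
on raw row-major arrays (not part of this commit), subtracts each row's
maximum before exponentiating; since softmax(x) = softmax(x - c), the result
is mathematically unchanged.

#include <algorithm>
#include <cmath>
#include <cstddef>
#include <vector>

void softmax_rows_stable(std::size_t nrow, std::size_t ncol,
                         const float *in, float *out) {
  std::vector<float> buf(ncol);
  for (std::size_t r = 0; r < nrow; r++) {
    const float *row = in + r * ncol;
    float maxv = *std::max_element(row, row + ncol);  // row maximum
    float denom = 0.f;
    for (std::size_t c = 0; c < ncol; c++) {
      buf[c] = std::exp(row[c] - maxv);  // exp(x - max) cannot overflow
      denom += buf[c];
    }
    for (std::size_t c = 0; c < ncol; c++) out[r * ncol + c] = buf[c] / denom;
  }
}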

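The #ifdef USE_CBLAS block (largely elided by the hunk context above) routes
Dot and GEMM through CBLAS. For orientation, here is a hedged sketch of how a
row-major, no-transpose C = alpha * A * B + beta * C typically maps onto
cblas_sgemm; the commit's actual call may set up transposes and leading
dimensions differently:

#include <cblas.h>
#include <cstddef>

// A: nrowA x ncolA, B: ncolA x ncolB, C: nrowA x ncolB, all row-major.
void gemm_sketch(std::size_t nrowA, std::size_t ncolB, std::size_t ncolA,
                 float alpha, const float *A, const float *B, float beta,
                 float *C) {
  cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, nrowA, ncolB, ncolA,
              alpha, A, ncolA,  // lda = ncolA for row-major, no transpose
              B, ncolB,         // ldb = ncolB
              beta, C, ncolB);  // ldc = ncolB
}
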
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/07c49da5/test/singa/test_tensor_math.cc
----------------------------------------------------------------------
diff --git a/test/singa/test_tensor_math.cc b/test/singa/test_tensor_math.cc
index 170b96c..823445f 100644
--- a/test/singa/test_tensor_math.cc
+++ b/test/singa/test_tensor_math.cc
@@ -11,15 +11,277 @@ protected:
     b.Reshape(singa::Shape{6});
     c.Reshape(singa::Shape{6, 1});
     d.Reshape(singa::Shape{3, 2});
+    e.Reshape(singa::Shape{3, 2});
 
     a.CopyDataFromHostPtr<float>(dat1, 6);
     b.CopyDataFromHostPtr<float>(dat2, 6);
+    e.CopyDataFromHostPtr<float>(dat1, 6);
   }
-  Tensor a, b, c, d;
+  Tensor a, b, c, d, e;
   const float dat1[6] = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f};
   const float dat2[6] = {1.1f, 2.1f, 3.1f, 4.1f, 5.1f, 6.1f};
 };
 
+TEST_F(TestTensorMath, MemberAbs) {
+  Tensor aa = a.Clone();
+  Tensor bb = b.Clone();
+  Tensor cc = aa - bb;
+  const float* dptr = cc.data<const float*>();
+  EXPECT_NEAR(-0.1, dptr[0], 1e-5);
+  EXPECT_NEAR(-0.1, dptr[1], 1e-5);
+  EXPECT_NEAR(-0.1, dptr[2], 1e-5);
+
+  Tensor p = Abs(cc);
+  const float* dptr1 = p.data<const float*>();
+  EXPECT_NEAR(0.1, dptr1[0], 1e-5);
+  EXPECT_NEAR(0.1, dptr1[1], 1e-5);
+  EXPECT_NEAR(0.1, dptr1[2], 1e-5);
+}
+
+TEST_F(TestTensorMath, MemberExp) {
+  Tensor p = Exp(a);
+  const float* dptr1 = p.data<const float*>();
+  EXPECT_NEAR(exp(1.0f), dptr1[0], 1e-5);
+  EXPECT_NEAR(exp(2.0f), dptr1[1], 1e-5);
+  EXPECT_NEAR(exp(3.0f), dptr1[2], 1e-5);
+}
+
+TEST_F(TestTensorMath, MemberLog) {
+  Tensor p = Log(a);
+  const float* dptr1 = p.data<const float*>();
+  EXPECT_NEAR(log(1.0f), dptr1[0], 1e-5);
+  EXPECT_NEAR(log(2.0f), dptr1[1], 1e-5);
+  EXPECT_NEAR(log(3.0f), dptr1[2], 1e-5);
+}
+
+TEST_F(TestTensorMath, MemberReLU) {
+  Tensor aa = a.Clone();
+  Tensor cc = aa - 2.0f;
+  const float* dptr = cc.data<const float*>();
+  EXPECT_NEAR(-1.0f, dptr[0], 1e-5);
+  EXPECT_NEAR(0.0f, dptr[1], 1e-5);
+  EXPECT_NEAR(1.0f, dptr[2], 1e-5);
+
+  Tensor p = ReLU(cc);
+  const float* dptr1 = p.data<const float*>();
+  EXPECT_NEAR(0.0f, dptr1[0], 1e-5);
+  EXPECT_NEAR(0.0f, dptr1[1], 1e-5);
+  EXPECT_NEAR(1.0f, dptr1[2], 1e-5);
+}
+
+TEST_F(TestTensorMath, MemberSigmoid) {
+  Tensor p = Sigmoid(a);
+  const float* dptr1 = p.data<const float*>();
+  EXPECT_NEAR(1.0f / (1.0f + exp(-1.0f)), dptr1[0], 1e-5);
+  EXPECT_NEAR(1.0f / (1.0f + exp(-2.0f)), dptr1[1], 1e-5);
+  EXPECT_NEAR(1.0f / (1.0f + exp(-3.0f)), dptr1[2], 1e-5);
+}
+
+TEST_F(TestTensorMath, MemberSign) {
+  Tensor aa = a.Clone();
+  Tensor cc = aa - 2.0f;
+  const float* dptr = cc.data<const float*>();
+  EXPECT_NEAR(-1.0f, dptr[0], 1e-5);
+  EXPECT_NEAR(0.0f, dptr[1], 1e-5);
+  EXPECT_NEAR(1.0f, dptr[2], 1e-5);
+
+  Tensor p = Sign(cc);
+  const float* dptr1 = p.data<const float*>();
+  EXPECT_EQ(0.0f, dptr1[0]);
+  EXPECT_EQ(0.0f, dptr1[1]);
+  EXPECT_EQ(1.0f, dptr1[2]);
+}
+
+TEST_F(TestTensorMath, MemberSqrt) {
+  Tensor p = Sqrt(a);
+  const float* dptr1 = p.data<const float*>();
+  EXPECT_NEAR(sqrt(1.0), dptr1[0], 1e-5);
+  EXPECT_NEAR(sqrt(2.0), dptr1[1], 1e-5);
+  EXPECT_NEAR(sqrt(3.0), dptr1[2], 1e-5);
+}
+
+TEST_F(TestTensorMath, MemberSquare) {
+  Tensor p = Square(a);
+  const float* dptr1 = p.data<const float*>();
+  EXPECT_NEAR(1.0, dptr1[0], 1e-5);
+  EXPECT_NEAR(4.0, dptr1[1], 1e-5);
+  EXPECT_NEAR(9.0, dptr1[2], 1e-5);
+}
+
+TEST_F(TestTensorMath, MemberTanh) {
+  Tensor p = Tanh(a);
+  const float* dptr1 = p.data<const float*>();
+  EXPECT_NEAR(tanh(1.0), dptr1[0], 1e-5);
+  EXPECT_NEAR(tanh(2.0), dptr1[1], 1e-5);
+  EXPECT_NEAR(tanh(3.0), dptr1[2], 1e-5);
+}
+
+TEST_F(TestTensorMath, Sum) {
+  Tensor p1(Shape{1, 2});
+  p1 = Sum(e, 0);
+  const float *dptr1 = p1.data<const float *>();
+  EXPECT_FLOAT_EQ(9.0f, dptr1[0]);
+  EXPECT_FLOAT_EQ(12.0f, dptr1[1]);
+
+  Tensor p2(Shape{3, 1});
+  p2 = Sum(e, 1);
+  const float *dptr2 = p2.data<const float *>();
+  EXPECT_FLOAT_EQ(3.0f, dptr2[0]);
+  EXPECT_FLOAT_EQ(7.0f, dptr2[1]);
+  EXPECT_FLOAT_EQ(11.0f, dptr2[2]);
+}
+
+TEST_F(TestTensorMath, SoftMax) {
+  Tensor p1(Shape{3, 2});
+  p1 = SoftMax(e, 0);
+  const float *dptr1 = p1.data<const float *>();
+  float sum = 0;
+  for (int i = 0; i < 6; i++) sum += exp(i + 1);
+  EXPECT_NEAR(exp(1) / sum, dptr1[0], 1e-5);
+  EXPECT_NEAR(exp(3) / sum, dptr1[2], 1e-5);
+  EXPECT_NEAR(exp(5) / sum, dptr1[4], 1e-5);
+  EXPECT_NEAR(exp(2) / sum, dptr1[1], 1e-5);
+  EXPECT_NEAR(exp(4) / sum, dptr1[3], 1e-5);
+  EXPECT_NEAR(exp(6) / sum, dptr1[5], 1e-5);
+
+  Tensor p2(Shape{3, 2});
+  p2 = SoftMax(e, 1);
+  const float *dptr2 = p2.data<const float *>();
+  EXPECT_NEAR(exp(1) / (exp(1) + exp(2)), dptr2[0], 1e-5);
+  EXPECT_NEAR(exp(2) / (exp(1) + exp(2)), dptr2[1], 1e-5);
+}
+
+TEST_F(TestTensorMath, MemberLT) {
+  Tensor p1 = a < 2.0f;
+  const float *dptr1 = p1.data<const float *>();
+  EXPECT_FLOAT_EQ(1.0f, dptr1[0]);
+  EXPECT_FLOAT_EQ(0.0f, dptr1[1]);
+  EXPECT_FLOAT_EQ(0.0f, dptr1[2]);
+}
+
+TEST_F(TestTensorMath, MemberLE) {
+  Tensor p1 = a <= 2.0f;
+  const float *dptr1 = p1.data<const float *>();
+  EXPECT_FLOAT_EQ(1.0f, dptr1[0]);
+  EXPECT_FLOAT_EQ(1.0f, dptr1[1]);
+  EXPECT_FLOAT_EQ(0.0f, dptr1[2]);
+}
+
+TEST_F(TestTensorMath, MemberGT) {
+  Tensor p1 = a > 2.0f;
+  const float *dptr1 = p1.data<const float *>();
+  EXPECT_FLOAT_EQ(0.0f, dptr1[0]);
+  EXPECT_FLOAT_EQ(0.0f, dptr1[1]);
+  EXPECT_FLOAT_EQ(1.0f, dptr1[2]);
+}
+
+TEST_F(TestTensorMath, MemberGE) {
+  Tensor p1 = a >= 2.0f;
+  const float *dptr1 = p1.data<const float *>();
+  EXPECT_FLOAT_EQ(0.0f, dptr1[0]);
+  EXPECT_FLOAT_EQ(1.0f, dptr1[1]);
+  EXPECT_FLOAT_EQ(1.0f, dptr1[2]);
+}
+
+TEST_F(TestTensorMath, MemberPow) {
+  Tensor p1 = Pow(b, 3.0f);
+  const float *dptr1 = p1.data<const float *>();
+  EXPECT_FLOAT_EQ(pow(1.1f, 3.0f), dptr1[0]);
+  EXPECT_FLOAT_EQ(pow(2.1f, 3.0f), dptr1[1]);
+  EXPECT_FLOAT_EQ(pow(3.1f, 3.0f), dptr1[2]);
+
+  // TODO(Yuchen): check pow(tensor a, tensor b) and add a test case once the
+  // function is complete.
+  // Tensor p2 = Pow(a, b);
+  // const float *dptr2 = p2.data<const float *>();
+  // EXPECT_FLOAT_EQ(pow(1.0f, 1.1f), dptr2[0]);
+  // EXPECT_FLOAT_EQ(pow(2.0f, 2.1f), dptr2[1]);
+  // EXPECT_FLOAT_EQ(pow(3.0f, 3.1f), dptr2[2]);
+}
+
+TEST_F(TestTensorMath, MemberSub) {
+  Tensor p1 = a - b;
+  const float* dptr1 = p1.data<const float*>();
+  EXPECT_NEAR(-0.1, dptr1[0], 1e-5);
+  EXPECT_NEAR(-0.1, dptr1[1], 1e-5);
+  EXPECT_NEAR(-0.1, dptr1[2], 1e-5);
+}
+
+TEST_F(TestTensorMath, MemberEltwiseMult) {
+  Tensor p1 = a * b;
+  const float* dptr1 = p1.data<const float*>();
+  EXPECT_NEAR(1.0 * 1.1, dptr1[0], 1e-5);
+  EXPECT_NEAR(2.0 * 2.1, dptr1[1], 1e-5);
+  EXPECT_NEAR(3.0 * 3.1, dptr1[2], 1e-5);
+}
+
+TEST_F(TestTensorMath, MemberDiv) {
+  Tensor p1 = a / b;
+  const float* dptr1 = p1.data<const float*>();
+  EXPECT_NEAR(1.0 / 1.1, dptr1[0], 1e-5);
+  EXPECT_NEAR(2.0 / 2.1, dptr1[1], 1e-5);
+  EXPECT_NEAR(3.0 / 3.1, dptr1[2], 1e-5);
+
+  Tensor p2 = Div(10.0f, b);
+  const float* dptr2 = p2.data<const float*>();
+  EXPECT_NEAR(10.0 / 1.1, dptr2[0], 1e-5);
+  EXPECT_NEAR(10.0 / 2.1, dptr2[1], 1e-5);
+  EXPECT_NEAR(10.0 / 3.1, dptr2[2], 1e-5);
+
+  Tensor p3 = a / 8.0f;
+  const float* dptr3 = p3.data<const float*>();
+  EXPECT_NEAR(1.0 / 8.0, dptr3[0], 1e-5);
+  EXPECT_NEAR(2.0 / 8.0, dptr3[1], 1e-5);
+  EXPECT_NEAR(3.0 / 8.0, dptr3[2], 1e-5);
+}
+
+TEST_F(TestTensorMath, MemberBernoulli) {
+  Tensor p1(Shape{10000});
+  Bernoulli(0.3, &p1);
+  const float* dptr1 = p1.data<const float*>();
+  float sum = 0;
+  for (int i = 0; i < 10000; i++) sum += dptr1[i];
+  float mean = sum / 10000;
+  EXPECT_NEAR(mean, 0.3, 1e-2);
+
+  sum = 0;
+  for (int i = 0; i < 10000; i++) sum += (dptr1[i] - mean) * (dptr1[i] - mean);
+  float variance = sum / 9999;
+  EXPECT_NEAR(variance, 0.3 * 0.7, 1e-2);
+}
+
+TEST_F(TestTensorMath, MemberUniform) {
+  Tensor p1(Shape{10000});
+  Uniform(0.1f, 0.2f, &p1);
+  const float* dptr1 = p1.data<const float*>();
+  float sum = 0;
+  for (int i = 0; i < 10000; i++) sum += dptr1[i];
+  float mean = sum / 10000;
+  EXPECT_NEAR(mean, 0.15f, 1e-3);
+
+  sum = 0;
+  for (int i = 0; i < 10000; i++) sum += (dptr1[i] - mean) * (dptr1[i] - mean);
+  float variance = sum / 9999;
+  EXPECT_NEAR(variance, 0.01f / 12, 1e-3);
+}
+
+TEST_F(TestTensorMath, MemberGaussian) {
+  Tensor p1(Shape{50000});
+  Gaussian(0.0, 1.0, &p1);
+  const float* dptr1 = p1.data<const float*>();
+  float sum = 0;
+  for (int i = 0; i < 50000; i++) sum += dptr1[i];
+  float mean = sum / 50000;
+  EXPECT_NEAR(mean, 0.0, 1e-2);
+
+  sum = 0;
+  for (int i = 0; i < 50000; i++) sum += (dptr1[i] - mean) * (dptr1[i] - mean);
+  float variance = sum / 49999;
+  EXPECT_NEAR(variance, 1.0, 1e-2);
+}
+
 TEST_F(TestTensorMath, MemberAddTensor) {
   Tensor aa = a.Clone();
   aa += a;