Posted to commits@singa.apache.org by zh...@apache.org on 2016/06/13 13:20:38 UTC

[45/50] [abbrv] incubator-singa git commit: SINGA-184 Add Cross Entropy loss computation

SINGA-184 Add Cross Entropy loss computation

Merge with asf dev.
Fix the bug in the CUDA cross-entropy kernels caused by the target type
(the target tensor should use int instead of float).
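
For context, a minimal host-side sketch (illustration only, not the SINGA
kernel itself) of the per-sample loss this commit implements; it mirrors the
CPU/CUDA code in the diff below. The point of the type fix is that the target
blob is read as int and used directly as a column index into the probability
matrix:

#include <algorithm>
#include <cfloat>
#include <cmath>
#include <cstddef>

// Per-sample cross entropy with integer class labels; 'p' is a
// batchsize x dim row-major probability matrix. t[i] is used as an
// index, which is why the target tensor must be typed int.
void CrossEntropySketch(std::size_t batchsize, std::size_t dim,
                        const float* p, const int* t, float* loss) {
  for (std::size_t i = 0; i < batchsize; ++i) {
    const float prob_of_truth = p[i * dim + t[i]];
    loss[i] = -std::log(std::max(prob_of_truth, FLT_MIN));
  }
}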


Project: http://git-wip-us.apache.org/repos/asf/incubator-singa/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-singa/commit/21e4b2d7
Tree: http://git-wip-us.apache.org/repos/asf/incubator-singa/tree/21e4b2d7
Diff: http://git-wip-us.apache.org/repos/asf/incubator-singa/diff/21e4b2d7

Branch: refs/heads/master
Commit: 21e4b2d79a4bd3acfb8b487cf96c197da464ae70
Parents: ec17aca 26df5ac
Author: Wei Wang <wa...@comp.nus.edu.sg>
Authored: Mon Jun 13 13:04:23 2016 +0800
Committer: Wei Wang <wa...@comp.nus.edu.sg>
Committed: Mon Jun 13 13:04:23 2016 +0800

----------------------------------------------------------------------
 CMakeLists.txt                          |   4 +-
 cmake/Dependencies.cmake                |   7 +-
 cmake/ProtoBuf.cmake                    | 116 ----
 include/singa/core/common.h             |   2 +-
 include/singa/core/tensor.h             | 400 +++++++-------
 src/CMakeLists.txt                      |  27 +-
 src/core/tensor/math_kernel.cu          | 707 +++++++++++++------------
 src/core/tensor/math_kernel.h           |  93 ++--
 src/core/tensor/tensor.cc               | 761 ++++++++++++++-------------
 src/core/tensor/tensor_math.h           | 404 +++++++-------
 src/core/tensor/tensor_math_cpp.h       | 610 +++++++++++++++------
 src/core/tensor/tensor_math_cuda.h      | 423 +++++++++++----
 src/model/layer/activation.cc           |  10 +-
 src/model/layer/batchnorm.cc            |  70 +++
 src/model/layer/batchnorm.h             |  84 +++
 src/model/layer/cudnn_activation.cc     |  13 +-
 src/model/layer/cudnn_batchnorm.cc      | 214 ++++++++
 src/model/layer/cudnn_batchnorm.h       |  60 +++
 src/model/layer/cudnn_convolution.cc    | 183 ++++---
 src/model/layer/cudnn_lrn.cc            | 118 +++++
 src/model/layer/cudnn_lrn.h             |  56 ++
 src/model/layer/cudnn_pooling.cc        |   7 +-
 src/model/layer/cudnn_softmax.cc        |   4 +-
 src/model/layer/dense.cc                |  86 +++
 src/model/layer/dense.h                 |  70 +++
 src/model/layer/flatten.cc              |  55 ++
 src/model/layer/flatten.h               |  53 ++
 src/model/layer/lrn.cc                  |  59 +++
 src/model/layer/lrn.h                   |  70 +++
 src/model/layer/prelu.cc                | 139 +++++
 src/model/layer/prelu.h                 |  59 +++
 src/model/layer/softmax.cc              |  10 +-
 src/model/loss/softmax_cross_entropy.cc |   8 +-
 src/proto/model.proto                   |  39 +-
 test/CMakeLists.txt                     |   1 +
 test/singa/test_activation.cc           |   8 +-
 test/singa/test_cross_entropy.cc        |   4 +-
 test/singa/test_cudnn_activation.cc     |   6 +-
 test/singa/test_cudnn_batchnorm.cc      | 257 +++++++++
 test/singa/test_cudnn_convolution.cc    | 181 +++++++
 test/singa/test_cudnn_lrn.cc            | 205 ++++++++
 test/singa/test_cudnn_softmax.cc        |   6 +-
 test/singa/test_dense.cc                | 249 +++++++++
 test/singa/test_flatten.cc              | 144 +++++
 test/singa/test_mse.cc                  |   1 -
 test/singa/test_prelu.cc                | 137 +++++
 test/singa/test_sgd.cc                  |   2 +-
 test/singa/test_softmax.cc              |  12 +-
 test/singa/test_tensor_math.cc          | 303 ++++++++++-
 49 files changed, 4873 insertions(+), 1664 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/21e4b2d7/CMakeLists.txt
----------------------------------------------------------------------
diff --cc CMakeLists.txt
index fbe3adc,fbe3adc..a9d9b17
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@@ -10,7 -10,7 +10,9 @@@ LIST(APPEND CMAKE_MODULE_PATH ${PROJECT
  IF(UNIX OR APPLE)
    SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fPIC -Wall")
  ENDIF()
--
++IF(CMAKE_BUILD_TYPE STREQUAL "Debug")
++  SET(NVCC_FLAG "${NVCC_FLAG} -g -G ")
++ENDIF()
  #message(STATUS "${CMAKE_CXX_FLAGS}")
  SET(SINGA_INCLUDE_DIR "${CMAKE_SOURCE_DIR}/include;${PROJECT_BINARY_DIR}")
  #message(STATUS "include path: ${SINGA_INCLUDE_DIR}")

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/21e4b2d7/include/singa/core/tensor.h
----------------------------------------------------------------------
diff --cc include/singa/core/tensor.h
index 865e1e4,cd750c5..8cfa705
--- a/include/singa/core/tensor.h
+++ b/include/singa/core/tensor.h
@@@ -212,199 -213,171 +213,180 @@@ Tensor Reshape(const Tensor &in, Shape 
  
  /// Copy 'num' elements of src to dst.
  /// The first 'src_offset' ('dst_offset') elements will be skipped.
- void CopyDataToFrom(Tensor *dst, const Tensor &src, size_t num,
-                     size_t src_offset = 0, size_t dst_offset = 0);
- 
- // ==================Simple Linear Algebra Operations=========================
- Tensor Abs(const Tensor &t);
- Tensor Exp(const Tensor &t);
- Tensor Log(const Tensor &t);
- Tensor ReLU(const Tensor &t);
- Tensor Sigmoid(const Tensor &t);
- Tensor Sign(const Tensor &t);
- Tensor Sqrt(const Tensor &t);
- Tensor Square(const Tensor &t);
- Tensor Tanh(const Tensor &t);
+ void CopyDataToFrom(Tensor *dst, const Tensor &src, const size_t num,
+                     const size_t src_offset = 0, const size_t dst_offset = 0);
+ 
+ // =============Element-wise operations====================================
+ Tensor Abs(const Tensor &in);
+ Tensor Exp(const Tensor &in);
+ Tensor Log(const Tensor &in);
+ Tensor ReLU(const Tensor &in);
+ Tensor Sigmoid(const Tensor &in);
+ Tensor Sign(const Tensor &in);
+ Tensor Sqrt(const Tensor &in);
+ Tensor Square(const Tensor &in);
+ Tensor Tanh(const Tensor &in);
+ 
+ /// Element-wise operation, out[i]=in[i]^x
+ template <typename SType>
+ Tensor Pow(const Tensor &in, const SType x);
+ /// Element-wise operation, out[i]=in[i]^x
+ template <typename SType>
+ void Pow(const Tensor &in, const SType x, Tensor *out);
+ /// Element-wise operation, out[i]=base[i]^exp[i]
+ Tensor Pow(const Tensor &base, const Tensor &exp);
+ /// Element-wise operation, out[i]=base[i]^exp[i]
+ void Pow(const Tensor &base, const Tensor &exp, Tensor *out);
  
+ /// Element-wise operation, out[i]= (in[i] < x) ? 1.f : 0.f
  template <typename SType>
- SType Sum(const Tensor &t);
- /// Sum elements in the Tensor, currently only support vector and matrix.
- /// if 'axis' is 0, sum all rows into a single row
- /// if 'axis' is 1, sum all columns into a single column
- /// TODO(wangwei) support arbitrary Tensor like numpy.sum
- Tensor Sum(const Tensor &t, int axis);
+ Tensor operator<(const Tensor &in, const SType x);
+ template <typename SType>
+ void LT(const Tensor &in, const SType x, Tensor *out);
  
- /// Average elements in the Tensor, currently only support vector and matrix.
- /// if 'axis' is 0, average all rows into a single row
- /// if 'axis' is 1, average all columns into a single column
- /// TODO(wangwei) support arbitrary Tensor like numpy.average
- Tensor Average(const Tensor &t, int axis);
- /// Do softmax for each row. 'in' could be a 1-d or 2-d Tensor.
- Tensor SoftMax(const Tensor &in);
- /// Do softmax for each row. 'in' could be a 1-d or 2-d Tensor.
- void SoftMax(const Tensor &in, Tensor *out);
+ /// Element-wise operation, out[i]= (in[i] <= x) ? 1.f : 0.f
+ template <typename SType>
+ Tensor operator<=(const Tensor &in, const SType x);
+ template <typename SType>
+ void LE(const Tensor &in, const SType x, Tensor *out);
 -
+ /// Element-wise operation, out[i]= (in[i] > x) ? 1.f : 0.f
+ template <typename SType>
+ Tensor operator>(const Tensor &in, const SType x);
+ template <typename SType>
+ void GT(const Tensor &in, const SType x, Tensor *out);
  
- /// Regarding the internal data as 2d, with shape_[0]*...*shape_[axis] rows,
- /// and shape_[axis+1]*...*shape_[nDim()] columns.
- /// and do softmax along each row.
- // Tensor Softmax(const Tensor& t, int axis = -1);
- // void Softmax(const Tensor& t, Tensor* ret, int axis = -1);
- 
- /// Element-wise operation, ret[i]= (t[i] < x) ? 1.f : 0.f
- template <typename DType>
- Tensor operator<(const Tensor &t, const DType x);
- template <typename DType>
- void LT(const Tensor &t, DType x, Tensor *ret);
- 
- /// Element-wise operation, ret[i]= (t[i] <= x) ? 1.f : 0.f
- template <typename DType>
- Tensor operator<=(const Tensor &t, const DType x);
- template <typename DType>
- void LE(const Tensor &t, DType x, Tensor *ret);
- 
- /// Element-wise operation, ret[i]= (t[i] > x) ? 1.f : 0.f
- template <typename DType>
- Tensor operator>(const Tensor &t, const DType x);
- template <typename DType>
- void GT(const Tensor &t, DType x, Tensor *ret);
- 
- /// Element-wise operation, ret[i]= (t[i] >= x) ? 1.f : 0.f
- template <typename DType>
- Tensor operator>=(const Tensor &t, const DType x);
- template <typename DType>
- void GE(const Tensor &t, DType x, Tensor *ret);
- 
- /// Element-wise opeartion, ret[i]=t[i]^x
- template <typename DType>
- Tensor Pow(const Tensor &t, DType x);
- /// Element-wise opeartion, ret[i]=t[i]^x
- template <typename DType>
- void Pow(const Tensor &t, DType x, Tensor *ret);
- /// Element-wise opeartion, ret[i]=baes[i]^exp[i]
- Tensor Pow(const Tensor &base, Tensor exp);
- /// Element-wise opeartion, ret[i]=baes[i]^exp[i]
- void Pow(const Tensor &base, const Tensor &exp, Tensor *ret);
+ /// Element-wise operation, out[i]= (in[i] >= x) ? 1.f : 0.f
+ template <typename SType>
+ Tensor operator>=(const Tensor &in, const SType x);
+ template <typename SType>
+ void GE(const Tensor &in, const SType x, Tensor *out);
  
  Tensor operator+(const Tensor &lhs, const Tensor &rhs);
- void Add(const Tensor &lhs, const Tensor &rhs, Tensor *ret);
+ void Add(const Tensor &lhs, const Tensor &rhs, Tensor *out);
  Tensor operator-(const Tensor &lhs, const Tensor &rhs);
- void Sub(const Tensor &lhs, const Tensor &rhs, Tensor *ret);
+ void Sub(const Tensor &lhs, const Tensor &rhs, Tensor *out);
  Tensor operator*(const Tensor &lhs, const Tensor &rhs);
- void EltwiseMult(const Tensor &lhs, const Tensor &rhs, Tensor *ret);
+ void EltwiseMult(const Tensor &lhs, const Tensor &rhs, Tensor *out);
  Tensor operator/(const Tensor &lhs, const Tensor &rhs);
- void Div(const Tensor &lhs, const Tensor &rhs, Tensor *ret);
- 
- template <typename DType>
- Tensor operator+(const Tensor &t, DType x);
- template <typename DType>
- void Add(const Tensor &t, DType x, Tensor *ret);
- 
- template <typename DType>
- Tensor operator-(const Tensor &t, DType x);
- template <typename DType>
- void Sub(const Tensor &t, DType x, Tensor *ret);
+ void Div(const Tensor &lhs, const Tensor &rhs, Tensor *out);
  
- template <typename DType>
- Tensor operator*(const Tensor &t, DType x);
- template <typename DType>
- void EltwiseMult(const Tensor &t, DType x, Tensor *ret);
- 
- template <typename DType>
- Tensor operator/(const Tensor &t, DType x);
- template <typename DType>
- void Div(const Tensor &t, DType x, Tensor *ret);
- 
- // ================Blas operations============================================
- // We fix the scalar argument type to be float.
+ template <typename SType>
+ Tensor operator+(const Tensor &in, const SType x);
+ template <typename SType>
+ void Add(const Tensor &in, const SType x, Tensor *out);
  
- // ===== Level 1
- // TODO(wangwei) make amax/amin/asum a member function of tensor
- // void Amax(Tensor, Context* ctx); Get the index of the max value in a vector
- // void Asum(Tensor Context* ctx);
+ template <typename SType>
+ Tensor operator-(const Tensor &in, const SType x);
+ template <typename SType>
+ void Sub(const Tensor &in, const SType x, Tensor *out);
  
- // template <typename DType>
- // void Axpy(DType x, const Blob& t, Blob* ret, Context* ctx);
+ template <typename SType>
+ Tensor operator*(const Tensor &in, const SType x);
+ template <typename SType>
+ void EltwiseMult(const Tensor &in, const SType x, Tensor *out);
  
- /// Do matrix vector multipication or matrix matrix multiplication depdending
- /// on the Tensor shape.  result = A * B
- Tensor Mult(const Tensor &A, const Tensor &B);
- /// Do matrix vector multipication or matrix matrix multiplication depdending
- /// on the Tensor shape.  C = A * B
- void Mult(const Tensor &A, const Tensor &B, Tensor *C);
+ /// For each element e of Tensor 'in', compute e / x
+ template <typename SType>
+ Tensor operator/(const Tensor &in, const SType x);
+ /// For each element e of Tensor 'in', compute e / x into out
+ template <typename SType>
+ void Div(const Tensor &in, const SType x, Tensor *out);
  
- /// Do matrix vector multipication or matrix matrix multiplication depdending
- /// on the Tensor shape. ret = alpha lhs * rhs + beta * ret
- void Mult(const float alpha, const Tensor &lhs, const Tensor &rhs,
-           const float beta, Tensor *C);
+ /// For each element e of Tensor 'in', compute x/e
+ template <typename SType>
+ Tensor Div(const SType x, const Tensor &in);
+ /// For each element e of Tensor 'in', compute x/e into 'out'
+ template <typename SType>
+ void Div(const SType x, const Tensor &in, Tensor *out);
  
- // ================Random operations==========================================
- /// For each element x set x = 1 if random() < p; otherwise x = 1.
- void Bernoulli(float p, Tensor *t);
- /// Fill in Tensor 't' following uniform distribution.
- void Uniform(float low, float high, Tensor *t);
- /// Fill in Tensor 't' following Gaussian distribution.
- void Gaussian(float mean, float std, Tensor *t);
+ template <typename SType>
+ SType Sum(const Tensor &in);
 -
+ // ============Matrix (row/column) operations==================================
+ /// Average elements in the Tensor; currently only vectors and matrices are supported.
+ /// if 'axis' is 0, average all rows into a single row
+ /// if 'axis' is 1, average all columns into a single column
+ /// TODO(wangwei) support arbitrary Tensor like numpy.average
+ Tensor Average(const Tensor &in, const int axis);
 -/// Sum elements in the Tensor, currently only support vector and matrix.
 -/// if 'axis' is 0, sum all rows into a single row
 -/// if 'axis' is 1, sum all columns into a single column
 -/// TODO(wangwei) support arbitrary Tensor like numpy.sum
 -Tensor Sum(const Tensor &in, const int axis);
 -/// Regarding the internal data as 2d, with shape_[0]*...*shape_[axis-1] rows,
 -/// and shape_[axis]*...*shape_[nDim()] columns.
 -/// and do softmax along each row.
 -Tensor SoftMax(const Tensor &in, const int axis = 0);
 -void SoftMax(const Tensor &in, const int axis, Tensor *out);
  
- // follow the consistency guide
- // https://issues.apache.org/jira/browse/SINGA-182
- // ============Matrix vector operations=======================================
  /// Add column 'v' with each column of matrix M
  void AddColumn(const Tensor &v, Tensor *M);
- void AddColumn(const float alpha, const float beta, const Tensor &v,
+ /// For each column 'c' of matrix out, do c=alpha*v + beta*c
+ template <typename SType>
+ void AddColumn(const SType alpha, const SType beta, const Tensor &v,
                 Tensor *out);
- /// Sub column 'v' by each column of matrix M
- void SubColumn(const Tensor &v, Tensor *M);
- /// Multiply column 'v' and each column of matrix M; write results into 'out'
- void MultColumn(const Tensor &v, Tensor *M);
- /// Divide column 'v' by each column of matrix M; write results into 'out'
- void DivColumn(const Tensor &v, Tensor *M);
- 
  /// Add row 'v' with each row of matrix M; write results into 'out'
  void AddRow(const Tensor &v, Tensor *out);
- void AddRow(const float alpha, const float beta, const Tensor &v, Tensor *M);
- /// Sub row 'v' by each row of matrix M; write results into 'out'
- void SubRow(const Tensor &v, Tensor *M);
- /// Multiply row 'v' with each row of matrix M; write results into 'out'
- void MultRow(const Tensor &v, Tensor *M);
+ /// For each row 'r' of matrix out, do r=alpha*v + beta*r
+ template <typename SType>
+ void AddRow(const SType alpha, const SType beta, const Tensor &v, Tensor *M);
+ /// Divide each column of matrix M by 'v'; write results into M
+ void DivColumn(const Tensor &v, Tensor *M);
  /// Divide row 'v' by each row of matrix M; write results into 'out'
  void DivRow(const Tensor &v, Tensor *M);
- 
- /// Sum all rows of matrix M into a single row as 'out'
- void SumRows(const Tensor &M, Tensor *out);
+ /// Multiply 'v' with each column of matrix M; write results into M
+ void MultColumn(const Tensor &v, Tensor *M);
+ /// Multiply 'v' with each row of matrix M; write results into M
+ void MultRow(const Tensor &v, Tensor *M);
++/// Do softmax for each row. 'in' could be a 1-d or 2-d Tensor.
++Tensor SoftMax(const Tensor &in);
++/// Do softmax for each row. 'in' could be a 1-d or 2-d Tensor.
++void SoftMax(const Tensor &in, Tensor *out);
+ /// Subtract column 'v' from each column of matrix M
+ void SubColumn(const Tensor &v, Tensor *M);
+ /// Subtract row 'v' from each row of matrix M; write results into M
+ void SubRow(const Tensor &v, Tensor *M);
  /// Sum all columns of matrix M into a single column as 'out'
  void SumColumns(const Tensor &M, Tensor *out);
+ /// Sum all rows of matrix M into a single row as 'out'
+ void SumRows(const Tensor &M, Tensor *out);
+ 
++/// Sum elements in the Tensor; currently only vectors and matrices are supported.
++/// if 'axis' is 0, sum all rows into a single row
++/// if 'axis' is 1, sum all columns into a single column
++/// TODO(wangwei) support arbitrary Tensor like numpy.sum
++Tensor Sum(const Tensor &in, const int axis);
++
+ // ================Random operations==========================================
+ /// For each element x, set x = 1 if random() < p; otherwise x = 0.
+ template <typename SType>
+ void Bernoulli(const SType p, Tensor *out);
+ /// Fill in Tensor 'out' following a Gaussian distribution.
+ template <typename SType>
+ void Gaussian(const SType mean, const SType std, Tensor *out);
+ /// Fill in Tensor 'out' following a uniform distribution.
+ template <typename SType>
+ void Uniform(const SType low, const SType high, Tensor *out);
+ 
+ // ================Blas operations============================================
+ // TODO(wangwei) make amax/amin/asum a member function of tensor
  
- /// For each element x of Tensor 'in', compute alpha/x
+ /// out = alpha*in + out
  template <typename SType>
- Tensor Div(const SType alpha, const Tensor &in);
+ void Axpy(SType alpha, const Tensor &in, Tensor *out);
  
- /// For each element x of Tensor 'in', compute alpha/x into 'out'
+ /// Do matrix-vector or matrix-matrix multiplication depending
+ /// on the Tensor shape.  result = A * B
+ Tensor Mult(const Tensor &A, const Tensor &B);
+ /// Do matrix-vector or matrix-matrix multiplication depending
+ /// on the Tensor shape.  C = A * B
+ void Mult(const Tensor &A, const Tensor &B, Tensor *C);
 -
+ /// Do matrix-vector or matrix-matrix multiplication depending
+ /// on the Tensor shape. C = alpha * A * B + beta * C
  template <typename SType>
- void Div(const SType alpha, const Tensor &in, Tensor *out);
- 
- /*
- /// Multiply each column of the lhs matrix with the rhs column
- Tensor MultColumn(const Tensor &lhs, const Tensor &rhs);
- void MultColumn(const Tensor &lhs, const Tensor &rhs, Tensor *ret);
- /// Multiply each row of the lhs matrix with the rhs row
- Tensor MultRow(const Tensor &lhs, const Tensor &rhs);
- void MultRow(const Tensor &lhs, const Tensor &rhs, Tensor *ret);
- /// Div each row of the lhs matrix with the rhs column
- Tensor DivColumn(const Tensor &lhs, const Tensor &rhs);
- void DivColumn(const Tensor &lhs, const Tensor &rhs, Tensor *ret);
- /// Divide each row of the lhs matrix by the rhs row
- Tensor DivRow(const Tensor &lhs, const Tensor &rhs);
- void DivRow(const Tensor &lhs, const Tensor &rhs, Tensor *ret);
- */
+ void Mult(const SType alpha, const Tensor &A, const Tensor &B, const SType beta,
+           Tensor *C);
 +
++// *****************
++// Misc.
++// ****************
 +/// Compute the cross entropy loss given the prediction probability 'p' and
 +/// the target (ground truth) labels 't'. 'p' and 't' are either 1-d vector
 +/// or 2-d matrix. 'loss' is a 1-d vector; the per-sample losses are written into 'loss'.
- void ComputeCrossEntropy(const Tensor& t, Tensor* p);
++void ComputeCrossEntropy(const Tensor& p, const Tensor& t, Tensor* loss);
 +/// Compute the dx, given prediction probability 'p' (p=softmax(x)) and
 +/// the target (ground truth) labels 't'. 'p' and 't' are either 1-d vector
 +/// or 2-d matrix. The gradient has the same shape as 'p' and is computed in place into 'p'.
 +void SoftmaxCrossEntropyBwd(const Tensor& t, Tensor* p);
  }  // namespace singa
  
  #endif  // SINGA_CORE_TENSOR_H_
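
A hedged usage sketch of the reorganized API above (the Tensor constructor
and CopyDataToFrom follow signatures that appear in this diff; treat the rest
as an illustration, not canonical SINGA usage):

#include "singa/core/tensor.h"

using namespace singa;

// 'p': batchsize x dim softmax probabilities; 't': int labels, one per row.
void LossAndGrad(const Tensor& p, const Tensor& t) {
  Tensor loss(Shape{p.shape(0)}, p.device(), p.data_type());
  ComputeCrossEntropy(p, t, &loss);            // per-sample losses
  float avg = Sum<float>(loss) / loss.Size();  // scalar reduction
  // SoftmaxCrossEntropyBwd overwrites its 'p' argument with dx,
  // so run it on a copy if 'p' is still needed.
  Tensor grad(p.shape(), p.device(), p.data_type());
  CopyDataToFrom(&grad, p, p.Size());
  SoftmaxCrossEntropyBwd(t, &grad);            // grad = p - onehot(t)
  (void)avg;
}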

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/21e4b2d7/src/CMakeLists.txt
----------------------------------------------------------------------

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/21e4b2d7/src/core/tensor/math_kernel.cu
----------------------------------------------------------------------
diff --cc src/core/tensor/math_kernel.cu
index f12763e,b618f9b..21ebdd8
--- a/src/core/tensor/math_kernel.cu
+++ b/src/core/tensor/math_kernel.cu
@@@ -309,142 -258,170 +258,202 @@@ __global__ void KernelLE(const int num
    }
  }
  
- __global__ static void kernel_set_value(float *data, float value, int n) {
-   int index = blockIdx.x * blockDim.x + threadIdx.x;
-   int num_threads = blockDim.x * gridDim.x;
-   for (; index < n; index += num_threads) {
-     data[index] = value;
+ __global__ void KernelLT(const int num, const float *in, const float x,
+                          float *out) {
+   for (size_t idx = blockIdx.x * blockDim.x + threadIdx.x; idx < num;
+        idx += blockDim.x * gridDim.x) {
+     out[idx] = in[idx] < x ? 1.0f : 0.0f;
+   }
+ }
++__global__ void KernelComputeCrossEntropy(const size_t batchsize,
++                                          const size_t dim, const float *p,
++                                          const int *t, float *loss) {
++  size_t sample = blockIdx.x * blockDim.x + threadIdx.x;
++  size_t num_threads = blockDim.x * gridDim.x;
++  for (; sample < batchsize; sample += num_threads) {
++    float prob_of_truth = p[sample * dim + t[sample]];
++    loss[sample] = -std::log(max(prob_of_truth, FLT_MIN));
 +  }
 +}
  
- __global__ void kernel_threshold(const float *src_data, float *des_data,
-                                  float alpha, int n) {
-   int index = blockIdx.x * blockDim.x + threadIdx.x;
-   int num_threads = blockDim.x * gridDim.x;
-   for (; index < n; index += num_threads) {
-     des_data[index] = src_data[index] < alpha ? 1.0f : 0.0f;
++__global__ void KernelSoftmaxCrossEntropyBwd(const size_t batchsize,
++                                             const size_t dim, const float *p,
++                                             const int *t, float *grad) {
++  size_t sample = blockIdx.x * blockDim.x + threadIdx.x;
++  size_t num_threads = blockDim.x * gridDim.x;
++  for (; sample < batchsize; sample += num_threads) {
++    size_t pos = sample * dim + t[sample];
++    grad[pos] = p[pos] - 1.0f;  // TODO(wangwei) handle the case where p and grad are different blobs
 +  }
 +}
- void sum(int n, const float *in, float *out) {
-   int threads_per_block = n > CU1DBLOCK ? CU1DBLOCK : n;
-   //  here, we only need one block
-   int num_blocks = 1;
+ // ********************************
+ // Functions call kernels
+ // ********************************
+ 
+ void set(const size_t n, const float v, float *out, cudaStream_t s) {
+   KernelSet <<<ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (n, v, out);
+ }
+ 
+ void abs(const size_t n, const float *in, float *out, cudaStream_t s) {
+   KernelAbs <<<ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (n, in, out);
+ }
  
-   kernel_sum_vec << <num_blocks, threads_per_block>>> (in, out, n);
+ void sign(const size_t n, const float *in, float *out, cudaStream_t s) {
+   KernelSign <<<ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (n, in, out);
  }
  
- void sum_row(int rows, int cols, int stride, const float *in, float *out) {
-   int threads_per_block = rows > CU1DBLOCK ? CU1DBLOCK : rows;
-   int num_blocks = cols;
+ void exp(const size_t n, const float *in, float *out, cudaStream_t s) {
+   KernelExp <<<ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (n, in, out);
+ }
+ 
+ void log(const size_t n, const float *in, float *out, cudaStream_t s) {
+   KernelLog <<<ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (n, in, out);
+ }
  
-   kernel_sum_row << <num_blocks, threads_per_block>>>
-       (in, out, rows, cols, stride);
+ void sqrt(const size_t n, const float *in, float *out, cudaStream_t s) {
+   KernelSqrt <<<ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (n, in, out);
  }
  
- void sum_col(int rows, int cols, int stride, const float *in, float *out) {
-   int threads_per_block = cols > CU1DBLOCK ? CU1DBLOCK : cols;
-   int num_blocks = rows;
+ void square(const size_t n, const float *in, float *out, cudaStream_t s) {
+   KernelSquare <<<ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (n, in, out);
+ }
  
-   kernel_sum_col << <num_blocks, threads_per_block>>>
-       (in, out, rows, cols, stride);
+ void tanh(const size_t n, const float *in, float *out, cudaStream_t s) {
+   KernelTanh <<<ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (n, in, out);
  }
- void add_row(int rows, int cols, int stride, const float *in_row,
-              const float *in_mat, float *out) {
-   dim3 threads_per_block(CU2DBLOCK_X, CU2DBLOCK_Y);
-   dim3 num_blocks(
-       cols / threads_per_block.x + (cols % threads_per_block.x == 0 ? 0 : 1),
-       rows / threads_per_block.y + (rows % threads_per_block.y == 0 ? 0 : 1));
-   kernel_add_vec_row << <num_blocks, threads_per_block>>>
-       (in_row, in_mat, out, rows, cols, stride);
+ 
+ void relu(const size_t n, const float *in, float *out, cudaStream_t s) {
+   KernelRelu <<<ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (n, in, out);
  }
- void add(int n, const float *a, const float *b, float *out) {
-   kernel_add << <ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (a, b, out, n);
+ void sigmoid(const int n, const float *in, float *out, cudaStream_t s) {
+   KernelSigmoid <<<ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (n, in, out);
  }
- void sub(int n, const float *a, const float *b, float *out) {
-   kernel_sub << <ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (a, b, out, n);
+ void softplus(const size_t n, const float *in, float *out, cudaStream_t s) {
+   KernelSoftplus <<<ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (n, in, out);
  }
- void exp(int n, const float *in, float *out) {
-   kernel_exp << <ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (in, out, n);
+ void clamp(const size_t n, const float low, const float high, const float *in,
+            float *out, cudaStream_t s) {
+   KernelClamp <<<ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (n, low, high, in, out);
  }
  
- void log(int n, const float *in, float *out) {
-   kernel_log << <ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (in, out, n);
+ void pow(const size_t n, const float *in, const float x, float *out,
+          cudaStream_t s) {
+   KernelPow <<<ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (n, in, x, out);
  }
  
- void sigmoid(int n, const float *in, float *out) {
-   kernel_sigmoid << <ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (in, out, n);
+ void add(const size_t n, const float *in, const float x, float *out,
+          cudaStream_t s) {
+   KernelAdd <<<ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (n, in, x, out);
  }
  
- void sigmoid_grad(int n, const float *in, float *out) {
-   kernel_sigmoid_grad << <ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (in, out, n);
+ void mult(const size_t n, const float *in, const float x, float *out,
+           cudaStream_t s) {
+   KernelMult <<<ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (n, in, x, out);
  }
  
- void relu(int n, const float *in, float *out) {
-   kernel_relu << <ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (in, out, n);
+ void div(const size_t n, const float x, const float *in, float *out,
+           cudaStream_t s) {
+   KernelDiv <<<ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (n, x, in, out);
  }
  
- void relu_grad(int n, const float *in, float *out) {
-   kernel_relu_grad << <ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (in, out, n);
+ void threshold(const size_t n, const float x, const float *in, float *out,
+                cudaStream_t s) {
+   KernelThreshold <<<ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (n, x, in, out);
  }
  
- void tanh(int n, const float *in, float *out) {
-   kernel_tanh << <ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (in, out, n);
+ void gt(const size_t num, const float *in, const float x, float *out,
+         cudaStream_t s) {
+   KernelGT <<<ceil(num / CU1DBLOCKF), CU1DBLOCKF>>> (num, in, x, out);
+ }
+ void ge(const size_t num, const float *in, const float x, float *out,
+         cudaStream_t s) {
+   KernelGE <<<ceil(num / CU1DBLOCKF), CU1DBLOCKF>>> (num, in, x, out);
+ }
+ void lt(const size_t num, const float *in, const float x, float *out,
+         cudaStream_t s) {
+   KernelLT <<<ceil(num / CU1DBLOCKF), CU1DBLOCKF>>> (num, in, x, out);
+ }
+ void le(const size_t num, const float *in, const float x, float *out,
+         cudaStream_t s) {
+   KernelLE <<<ceil(num / CU1DBLOCKF), CU1DBLOCKF>>> (num, in, x, out);
  }
  
- void tanh_grad(int n, const float *in, float *out) {
-   kernel_tanh_grad << <ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (in, out, n);
+ void pow(const size_t n, const float *in1, const float *in2, float *out,
+          cudaStream_t s) {
+   KernelPow <<<ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (n, in1, in2, out);
  }
  
- void softplus(int n, const float *in, float *out) {
-   kernel_softplus << <ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (in, out, n);
+ void add(const size_t n, const float *in1, const float *in2, float *out,
+          cudaStream_t s) {
+   KernelAdd <<<ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (n, in1, in2, out);
  }
  
- void softplus_grad(int n, const float *in, float *out) {
-   kernel_softplus_grad << <ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (in, out, n);
+ void sub(const size_t n, const float *in1, const float *in2, float *out,
+          cudaStream_t s) {
+   KernelSub <<<ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (n, in1, in2, out);
  }
  
- void square(int n, const float *in, float *out) {
-   kernel_square << <ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (in, out, n);
+ void mult(const size_t n, const float *in1, const float *in2, float *out,
+           cudaStream_t s) {
+   KernelMult <<<ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (n, in1, in2, out);
  }
  
- void square_grad(int n, const float *in, float *out) {
-   kernel_square_grad << <ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (in, out, n);
+ void div(const size_t n, const float *in1, const float *in2, float *out,
+          cudaStream_t s) {
+   KernelDiv <<<ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (n, in1, in2, out);
  }
  
- void sqrt(int n, const float *in, float *out) {
-   kernel_sqrt << <ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (in, out, n);
+ void sum(const size_t n, const float *in, float *out, cudaStream_t s) {
+   int threads_per_block = n > CU1DBLOCK ? CU1DBLOCK : n;
+   //  here, we only need one block
+   int num_blocks = 1;
+   KernelSum <<<num_blocks, threads_per_block>>> (n, in, out);
  }
 +
- void pow(int n, const float *a, const float *b, float *out) {
-   kernel_pow << <ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (a, b, out, n);
++void ComputeCrossEntropy(size_t batchsize, const size_t dim, const float *p,
++                         const int *t, float *loss, cudaStream_t stream) {
++  KernelComputeCrossEntropy <<<ceil(batchsize / CU1DBLOCKF), CU1DBLOCKF>>>
++      (batchsize, dim, p, t, loss);
 +}
 +
- void mult(int n, const float *a, const float *b, float *out) {
-   kernel_mult << <ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (a, b, out, n);
++void SoftmaxCrossEntropyBwd(size_t batchsize, const size_t dim, const float *p,
++                            const int *t, float *grad, cudaStream_t stream) {
++  KernelSoftmaxCrossEntropyBwd <<<ceil(batchsize / CU1DBLOCKF), CU1DBLOCKF>>>
++      (batchsize, dim, p, t, grad);
++}
+ /*
+ void square_grad(int n, const float *in, float *out, cudaStream_t s) {
+   kernel_square_grad <<<ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (in, out, n);
  }
  
- void mult(int n, const float *a, const float x, float *out) {
-   kernel_mult << <ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (a, x, out, n);
+ void tanh_grad(int n, const float *in, float *out, cudaStream_t s) {
+   kernel_tanh_grad <<<ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (in, out, n);
  }
  
- void div(int n, const float *a, const float *b, float *out) {
-   kernel_div << <ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (a, b, out, n);
+ 
+ void relu_grad(int n, const float *in, float *out, cudaStream_t s) {
+   kernel_relu_grad <<<ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (in, out, n);
  }
  
- void set_value(int n, float v, float *out) {
-   kernel_set_value << <ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (out, v, n);
+ 
+ void sigmoid_grad(int n, const float *in, float *out, cudaStream_t s) {
+   kernel_sigmoid_grad <<<ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (in, out, n);
  }
  
- void threshold(int n, float alpha, const float *in, float *out) {
-   kernel_threshold << <ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (in, out, alpha, n);
+ void softplus_grad(int n, const float *in, float *out, cudaStream_t s) {
+   kernel_softplus_grad <<<ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (in, out, n);
  }
  
- // follow the consistency guide for math API
- __global__ void KernelDiv(const size_t num, const float alpha, const float *in,
-                           float *out) {
-   for (size_t idx = blockIdx.x * blockDim.x + threadIdx.x; idx < num;
-        idx += blockDim.x * gridDim.x) {
-     out[idx] = alpha / in[idx];
+ 
+ __global__ void kernel_sum_col(const float *src_mat_data, float *dst_vec_data,
+                                int rows, int cols, int stride) {
+   int index = blockIdx.x * blockDim.x + threadIdx.x;
+   int num_threads = blockDim.x * gridDim.x;
+   for (; index < rows; index += num_threads) {
+     dst_vec_data[index] = 0.0f;
+     for (int k = 0; k < cols; k++) {
+       dst_vec_data[index] += src_mat_data[index * stride + k];
+     }
    }
  }
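
All the wrappers above share one launch configuration: ceil(n / CU1DBLOCKF)
blocks of CU1DBLOCKF threads, with the kernels themselves guarding via
grid-stride loops. A small sketch of that arithmetic (the block size constant
is an assumption for illustration; SINGA defines CU1DBLOCK/CU1DBLOCKF
elsewhere):

#include <cmath>
#include <cstddef>

static const float CU1DBLOCKF = 256.0f;  // assumed value, for illustration

// Number of blocks so every element gets a thread; surplus threads are
// filtered by each kernel's 'idx < n' guard.
std::size_t NumBlocks(std::size_t n) {
  return static_cast<std::size_t>(std::ceil(n / CU1DBLOCKF));
}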
  

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/21e4b2d7/src/core/tensor/math_kernel.h
----------------------------------------------------------------------
diff --cc src/core/tensor/math_kernel.h
index 09953e4,d8a58a5..976b78f
--- a/src/core/tensor/math_kernel.h
+++ b/src/core/tensor/math_kernel.h
@@@ -31,72 -31,66 +31,73 @@@ namespace singa 
  
  // TODO(wangwei) make all function templates.
  namespace cuda {
- void sum(int n, const float *in, float *out);
  
- void sum_row(int rows, int cols, int stride, const float *in, float *out);
- 
- void sum_col(int rows, int cols, int stride, const float *in, float *out);
- 
- void add_row(int rows, int cols, int stride, const float *in_row,
-              const float *in_mat, float *out);
- 
- void add(int n, const float *a, const float *b, float *out);
- 
- void sub(int n, const float *a, const float *b, float *out);
- 
- void exp(int n, const float *in, float *out);
- 
- void log(int n, const float *in, float *out);
- 
- void sigmoid(int n, const float *in, float *out);
- 
- void sigmoid_grad(int n, const float *in, float *out);
- 
- void relu(int n, const float *in, float *out);
- 
- void relu_grad(int n, const float *in, float *out);
- 
- void tanh(int n, const float *in, float *out);
- 
- void tanh_grad(int n, const float *in, float *out);
+ // 0 input
+ void set(const size_t n, const float v, float *out, cudaStream_t s);
+ 
+ // 1 input
+ void abs(const size_t n, const float *in, float *out, cudaStream_t s);
+ void sign(const size_t n, const float *in, float *out, cudaStream_t s);
+ void exp(const size_t n, const float *in, float *out, cudaStream_t s);
+ void log(const size_t n, const float *in, float *out, cudaStream_t s);
+ void sqrt(const size_t n, const float *in, float *out, cudaStream_t s);
+ void square(const size_t n, const float *in, float *out, cudaStream_t s);
+ void tanh(const size_t n, const float *in, float *out, cudaStream_t s);
+ void relu(const size_t n, const float *in, float *out, cudaStream_t s);
+ void sigmoid(const int n, const float *in, float *out, cudaStream_t s);
+ void softplus(const size_t n, const float *in, float *out, cudaStream_t s);
+ void clamp(const size_t n, const float low, const float high, const float *in,
+            float *out, cudaStream_t s);
+ 
+ void pow(const size_t n, const float *in, const float x, float *out,
+          cudaStream_t s);
  
- void softplus(int n, const float *in, float *out);
+ void add(const size_t n, const float *in, const float x, float *out,
+          cudaStream_t s);
  
- void softplus_grad(int n, const float *in, float *out);
+ void mult(const size_t n, const float *in, const float x, float *out,
+           cudaStream_t s);
  
- void square(int n, const float *in, float *out);
+ void div(const size_t n, const float x, const float *in, float *out,
+          cudaStream_t s);
  
- void square_grad(int n, const float *in, float *out);
+ void threshold(const size_t n, const float x, const float *in, float *out,
+                cudaStream_t s);
  
- void sqrt(int n, const float *in, float *out);
+ void gt(const size_t num, const float *in, const float x, float *out,
+         cudaStream_t s);
+ void ge(const size_t num, const float *in, const float x, float *out,
+         cudaStream_t s);
+ void lt(const size_t num, const float *in, const float x, float *out,
+         cudaStream_t s);
+ void le(const size_t num, const float *in, const float x, float *out,
+         cudaStream_t s);
  
- void pow(int n, const float *a, const float *b, float *out);
+ // 2 inputs
+ void pow(const size_t n, const float *in1, const float *in2, float *out,
+          cudaStream_t s);
  
- void mult(int n, const float *a, const float *b, float *out);
+ void add(const size_t n, const float *in1, const float *in2, float *out,
+          cudaStream_t s);
  
- void mult(int n, const float *a, const float x, float *out);
+ void sub(const size_t n, const float *in1, const float *in2, float *out,
+          cudaStream_t s);
  
- void div(int n, const float *a, const float *b, float *out);
+ void mult(const size_t n, const float *in1, const float *in2, float *out,
+           cudaStream_t s);
  
- void set_value(int n, float v, float *out);
+ void div(const size_t n, const float *in1, const float *in2, float *out,
+          cudaStream_t s);
  
- void threshold(int n, float alpha, const float *in, float *out);
+ void sum(const size_t n, const float *in, float *out, cudaStream_t s);
  
- // follow the consistency guide for math API
 +void ComputeCrossEntropy(const size_t batchsize, const size_t dim,
 +                         const float *p, const int *t, float *loss,
 +                         cudaStream_t stream);
- void Div(const size_t num, const float x, const float *in, float *out,
-          cudaStream_t s);
- void GT(size_t num, const float *in, const float x, float *out, cudaStream_t s);
- void GE(size_t num, const float *in, const float x, float *out, cudaStream_t s);
- void LT(size_t num, const float *in, const float x, float *out, cudaStream_t s);
- void LE(size_t num, const float *in, const float x, float *out, cudaStream_t s);
- void Set(const size_t num, const float x, float *out, cudaStream_t s);
 +void SoftmaxCrossEntropyBwd(const size_t batchsize, const size_t dim,
 +                            const float *p, const int *t, float *grad,
 +                            cudaStream_t stream);
 +
  }  // cuda
  
  }  // namespace singa

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/21e4b2d7/src/core/tensor/tensor.cc
----------------------------------------------------------------------
diff --cc src/core/tensor/tensor.cc
index 1ac25c6,e62386a..4e0d94b
--- a/src/core/tensor/tensor.cc
+++ b/src/core/tensor/tensor.cc
@@@ -592,42 -557,8 +556,8 @@@ void AddRow(const SType alpha, const ST
      Mult(alpha, one, vmat, beta, M);
    }
  }
- void ComputeCrossEntropy(const Tensor& t, Tensor* p) {
-   CHECK_LE(p->nDim(), 2u);
-   CHECK_LE(t.nDim(), 2u);  // TODO(wangwei) consider multi-labels.
-   size_t batchsize = 1;
-   if (p->nDim() == 2u) batchsize = p->shape(0);
-   size_t dim = p->Size() / batchsize;
-   TYPE_LANG_SWITCH(p->data_type(), DType, p->device()->lang(), Lang, {
-     p->device()->Exec([batchsize, dim, t, p](Context *ctx) {
-       ComputeCrossEntropy<DType, Lang>(batchsize, dim, p->blob(), t.blob(),
-                                        p->blob(), ctx);
-     }, {p->blob(), t.blob()}, {p->blob()});
-   });
- }
- 
- template <typename SType> Tensor Div(const SType alpha, const Tensor &in) {
-   Tensor out(in.shape(), in.device(), in.data_type());
-   Div(alpha, in, &out);
-   return out;
- }
- 
- template Tensor Div<float>(const float, const Tensor &);
- 
- template <typename SType>
- void Div(const SType alpha, const Tensor &in, Tensor *out) {
-   CheckDataTypeAndLang(in, *out);
-   CHECK(in.shape() == out->shape());
-   TYPE_LANG_SWITCH(in.data_type(), DType, in.device()->lang(), Lang, {
-     // TODO(wangwei) type cast SType to DType;
-     in.device()->Exec(
-         [alpha, in, out](Context *ctx) {
-           Div<DType, Lang>(in.Size(), alpha, in.blob(), out->blob(), ctx);
-         },
-         {in.blob()}, {out->blob()});
-   });
- }
- template void Div<float>(const float, const Tensor &, Tensor *);
 -template <>
++template
+ void AddRow(const float alpha, const float beta, const Tensor &v, Tensor *M);
  
  /// Divide column 'v' by each column of matrix M; write results into 'out'
  void DivColumn(const Tensor &v, Tensor *M) {
@@@ -725,4 -639,92 +638,122 @@@ void SumRows(const Tensor &M, Tensor *v
      Mult(X, one, v);
    }
  }
+ // ====================Random operations=====================================
+ template <typename SType>
+ void Bernoulli(const SType p, Tensor *out) {
+   TYPE_LANG_SWITCH(out->data_type(), DType, out->device()->lang(), Lang, {
+     auto prob = TypeCast<SType, DType>(p);
+     out->device()->Exec([prob, out](Context *ctx) {
+       Bernoulli<DType, Lang>(out->Size(), prob, out->blob(), ctx);
+     }, {}, {out->blob()}, true);
+   });
+ }
+ template void Bernoulli<float>(const float p, Tensor *out);
+ 
+ template <typename SType>
+ void Uniform(const SType low, const SType high, Tensor *out) {
+   TYPE_LANG_SWITCH(out->data_type(), DType, out->device()->lang(), Lang, {
+     auto l = TypeCast<SType, DType>(low);
+     auto h = TypeCast<SType, DType>(high);
+     out->device()->Exec([l, h, out](Context *ctx) {
+       Uniform<DType, Lang>(out->Size(), l, h, out->blob(), ctx);
+     }, {}, {out->blob()}, true);
+   });
+ }
+ template void Uniform<float>(const float low, const float high, Tensor *out);
+ 
+ template <typename SType>
+ void Gaussian(const SType mean, const SType std, Tensor *out) {
+   TYPE_LANG_SWITCH(out->data_type(), DType, out->device()->lang(), Lang, {
+     auto m = TypeCast<SType, DType>(mean);
+     auto s = TypeCast<SType, DType>(std);
+     out->device()->Exec([m, s, out](Context *ctx) {
+       Gaussian<DType, Lang>(out->Size(), m, s, out->blob(), ctx);
+     }, {}, {out->blob()}, true);
+   });
+ }
+ template void Gaussian<float>(const float mean, const float std, Tensor *out);
+ 
+ // ================Blas operations============================================
+ template <typename SType>
+ void Axpy(const SType alpha, const Tensor &in, Tensor *out) {
+   TYPE_LANG_SWITCH(in.data_type(), DType, in.device()->lang(), Lang, {
+     auto a = TypeCast<SType, DType>(alpha);
+     out->device()->Exec([a, in, out](Context *ctx) {
+       Axpy<DType, Lang>(in.Size(), a, in.blob(), out->blob(), ctx);
+     }, {in.blob(), out->blob()}, {out->blob()});
+   });
+ }
 -template <>
 -void Axpy(const float alpha, const Tensor &in, Tensor *out);
++template void Axpy(const float alpha, const Tensor &in, Tensor *out);
+ 
+ Tensor Mult(const Tensor &A, const Tensor &B) {
+   Shape s;
+   s.push_back(A.shape(0));
+   if (B.nDim() == 2) s.push_back(B.shape(1));
+   Tensor out(s, A.device(), A.data_type());
+   Mult(A, B, &out);
+   return out;
+ }
+ 
+ void Mult(const Tensor &A, const Tensor &B, Tensor *out) {
+   Mult(1.0f, A, B, 0.0f, out);
+ }
+ 
+ template <typename SType>
+ void Mult(const SType alpha, const Tensor &A, const Tensor &B, const SType beta,
+           Tensor *C) {
+   CHECK_EQ(A.shape().size(), 2u);
+   if (B.nDim() == 1u) {
+     TYPE_LANG_SWITCH(A.data_type(), DType, A.device()->lang(), Lang, {
+       auto a = TypeCast<SType, DType>(alpha);
+       auto b = TypeCast<SType, DType>(beta);
+       C->device()->Exec([a, A, b, B, C](Context *ctx) {
+         GEMV<DType, Lang>(A.transpose(), A.shape(0), A.shape(1), a, A.blob(),
+                           B.blob(), b, C->blob(), ctx);
+       }, {A.blob(), B.blob()}, {C->blob()});
+     });
+   } else {
+     CHECK(!C->transpose());
+     TYPE_LANG_SWITCH(A.data_type(), DType, A.device()->lang(), Lang, {
+       auto a = TypeCast<SType, DType>(alpha);
+       auto b = TypeCast<SType, DType>(beta);
+       C->device()->Exec([a, A, b, B, C](Context *ctx) {
+         GEMM<DType, Lang>(A.transpose(), B.transpose(), A.shape(0), B.shape(1),
+                           A.shape(1), a, A.blob(), B.blob(), b, C->blob(), ctx);
+       }, {A.blob(), B.blob()}, {C->blob()});
+     });
+   }
+ }
+ 
++
++// ************************
++// Misc.
++// ***********************
++void ComputeCrossEntropy(const Tensor &p, const Tensor &t, Tensor *loss) {
++  CHECK_LE(p.nDim(), 2u);
++  CHECK_LE(t.nDim(), 2u);  // TODO(wangwei) consider multi-labels.
++  size_t batchsize = 1;
++  if (p.nDim() == 2u) batchsize = p.shape(0);
++  size_t dim = p.Size() / batchsize;
++  TYPE_LANG_SWITCH(p.data_type(), DType, p.device()->lang(), Lang, {
++    p.device()->Exec([batchsize, dim, t, p, loss](Context *ctx) {
++      ComputeCrossEntropy<DType, Lang>(batchsize, dim, p.blob(), t.blob(),
++                                       loss->blob(), ctx);
++    }, {p.blob(), t.blob()}, {loss->blob()});
++  });
++}
++void SoftmaxCrossEntropyBwd(const Tensor &t, Tensor *p) {
++  CHECK_LE(p->nDim(), 2u);
++  CHECK_LE(t.nDim(), 2u);  // TODO(wangwei) consider multi-labels.
++  size_t batchsize = 1;
++  if (p->nDim() == 2u)
++    batchsize = p->shape(0);
++  size_t dim = p->Size() / batchsize;
++  TYPE_LANG_SWITCH(p->data_type(), DType, p->device()->lang(), Lang, {
++    p->device()->Exec([batchsize, dim, t, p](Context *ctx) {
++      SoftmaxCrossEntropyBwd<DType, Lang>(batchsize, dim, p->blob(), t.blob(),
++                                          p->blob(), ctx);
++    }, {p->blob(), t.blob()}, {p->blob()});
++  });
++}
  }  // namespace singa
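
The functions above all follow one dispatch pattern: TYPE_LANG_SWITCH selects
the concrete DType/Lang pair, and the work is submitted to the device as a
closure plus the lists of blobs it reads and writes. A simplified model of
that contract (not SINGA's real classes; the real Exec can schedule
asynchronously and tracks dependencies):

#include <functional>
#include <vector>

struct Context {};
struct Blob {};

struct Device {
  // Runs the closure; in SINGA the read/write blob lists and the
  // random-generator flag drive scheduling, omitted here.
  void Exec(const std::function<void(Context*)>& fn,
            const std::vector<Blob*>& read_blobs,
            const std::vector<Blob*>& write_blobs,
            bool uses_rand_generator = false) {
    (void)read_blobs; (void)write_blobs; (void)uses_rand_generator;
    Context ctx;
    fn(&ctx);
  }
};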

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/21e4b2d7/src/core/tensor/tensor_math.h
----------------------------------------------------------------------
diff --cc src/core/tensor/tensor_math.h
index bcf4908,b86e1cb..12490d1
--- a/src/core/tensor/tensor_math.h
+++ b/src/core/tensor/tensor_math.h
@@@ -269,109 -293,74 +293,95 @@@ void Scale(const size_t num, const DTyp
  template <typename DType, typename Lang>
  void Dot(const size_t num, const Blob *in1, const Blob *in2, DType *out,
           Context *ctx) {
-   LOG(FATAL) << "Not Implemented";
+   LOG(FATAL) << "Dot Not Implemented";
  }
  
- // ===== Level 2
- /// ret = alpha * op(A) * v + beta * ret.
- /// op(A) = A if trans = false; A^T otherwise; rows(op(A)) = m, cols(op(A)) = n.
+ /// out = alpha * A * v + beta * out.
+ /// 'trans' indicates whether the internal data layout of A is transposed
  template <typename DType, typename Lang>
- void GEMV(bool trans, int m, int n, DType alpha, const Blob *A, const Blob *v,
-           DType beta, Blob *ret, Context *ctx) {
-   LOG(FATAL) << "Not Implemented";
+ void GEMV(bool trans, const size_t m, const size_t n, const DType alpha,
+           const Blob *A, const Blob *v, const DType beta, Blob *out,
+           Context *ctx) {
+   LOG(FATAL) << "GEMV Not Implemented";
  }
  
- // ===== Level 3
- 
- // ================Random functions===========================================
- /// Each element of ret would be 1 with prob p and 0 with 1-p. 0<= p <= 1
- // Get the random generator from 'ctx'
- // If DType is not float, then convert the threshold to DType
- template <typename DType, typename Lang>
- void Bernoulli(int count, float p, Blob *ret, Context *ctx) {
-   LOG(FATAL) << "Not Implemented";
- }
- // The random generator should be extracted from ctx.
- // If DType is not float, then convert the low and high to DType
+ /// Multiply a matrix with a diagonal matrix constructed using values from 'v'.
+ /// If 'side_right' is true, compute M*diag(v); otherwise diag(v)*M.
  template <typename DType, typename Lang>
- void Uniform(int count, float low, float high, Blob *ret, Context *ctx) {
-   LOG(FATAL) << "Not Implemented";
+ void DGMM(const bool side_right, const size_t nrow, const size_t ncol,
+           const Blob *M, const Blob *v, Blob *out, Context *ctx) {
+   LOG(FATAL) << "DGMM Not Implemented";
  }
- // The random generator should be extracted from ctx.
- // If DType is not float, then convert the mean and std to DType
+ 
+ /// C = alpha * A * B + beta * C.
+ /// transA/transB indicate whether the internal data layouts of A and B are transposed
  template <typename DType, typename Lang>
- void Gaussian(int count, float mean, float std, Blob *ret, Context *ctx) {
-   LOG(FATAL) << "Not Implemented";
+ void GEMM(const bool transA, const bool transB, const size_t nrowA,
+           const size_t ncolB, const size_t ncolA, const DType alpha,
+           const Blob *A, const Blob *B, const DType beta, Blob *C,
+           Context *ctx) {
+   LOG(FATAL) << "GEMM Not Implemented";
  }
  
- // ========follow the consistency guide of math API
- 
 +/// Compute the cross entropy loss: 'p' holds the prediction probabilities,
 +/// 't' the integer target labels; the per-sample loss is written into 'loss'.
 +template <typename DType, typename Lang>
 +void ComputeCrossEntropy(const size_t batchsize, const size_t dim,
 +                         const Blob *p, const Blob *t, Blob *loss,
 +                         Context *ctx) {
 +  LOG(FATAL) << "Not Implemented";
 +}
- template <typename DType, typename Lang>
- void Div(const size_t num, const DType alpha, const Blob *in, Blob *out,
-          Context *ctx) {
-   LOG(FATAL) << "Not Implemented";
- }
 +
- /// multiply a matrix with a diagnoal matrix constructed using values from 'v'.
- /// if matrix_lef_side is true, do M*v; else do v*M
 +template <typename DType, typename Lang>
- void DGMM(const bool side_right, const size_t nrow, const size_t ncol,
-           const Blob *M, const Blob *v, Blob *out, Context *ctx) {
++void SoftmaxCrossEntropyBwd(const size_t batchsize, const size_t dim,
++                            const Blob *p, const Blob *t, Blob *grad,
++                            Context *ctx) {
 +  LOG(FATAL) << "Not Implemented";
 +}
 +
- /// C = alpha * A * B + beta * C.
- /// transA indicates if the internal data layout is transposed of A
+ // **************************************
+ // Matrix functions
+ // **************************************
+ /*
+ /// Add the vector v to every column of A as the column of out
  template <typename DType, typename Lang>
- void GEMM(const bool transA, const bool transB, const size_t nrowA,
-           const size_t ncolB, const size_t ncolA, const DType alpha,
-           const Blob *A, const Blob *B, const DType beta, Blob *C,
-           Context *ctx) {
-   LOG(FATAL) << "Not Implemented";
+ void AddCol(const size_t nrow, const size_t ncol, const Blob *A, const Blob *v,
+             Blob *out, Context *ctx) {
+   LOG(FATAL) << "AddCol Not Implemented";
  }
- /// ret[i]=(input[i]<x)?1.f:0.f
+ // TODO(wangwei) unify AddRow and AddCol.
+ /// Add the vector v to every row of A as the row of out
  template <typename DType, typename Lang>
- void LT(const size_t num, const Blob *in, const DType x, Blob *out,
-         Context *ctx) {
-   LOG(FATAL) << "Not Implemented";
+ void AddRow(const size_t nrow, const size_t ncol, const Blob *A, const Blob *v,
+             Blob *out, Context *ctx) {
+   LOG(FATAL) << "AddRow Not Implemented";
  }
- /// ret[i]=(input[i]<=x)?1.f:0.f
- template <typename DType, typename Lang>
- void LE(const size_t num, const Blob *in, const DType x, Blob *out,
-         Context *ctx) {
-   LOG(FATAL) << "Not Implemented";
- }
- /// ret[i]=(input[i]>x)?1.f:0.f
+ /// outer-product.
+ /// in1 and in2 are vectors of len m and n. out is matrix of shape m * n
  template <typename DType, typename Lang>
- void GT(const size_t num, const Blob *in, const DType x, Blob *out,
-         Context *ctx) {
-   LOG(FATAL) << "Not Implemented";
+ void Outer(const size_t m, const size_t n, const Blob *in1, const Blob *in2,
+            Blob *out, Context *ctx) {
+   LOG(FATAL) << "Outer Not Implemented";
  }
- /// ret[i]=(input[i]>=x)?1.f:0.f
+ 
+ /// Sum the columns of the in matrix into a vector
  template <typename DType, typename Lang>
- void GE(const size_t num, const Blob *in, const DType x, Blob *out,
-         Context *ctx) {
-   LOG(FATAL) << "Not Implemented";
+ void SumColumns(const size_t nrow, const size_t ncol, const Blob *in, Blob *out,
+                 Context *ctx) {
+   LOG(FATAL) << "SumColumns Not Implemented";
  }
 +template <typename DType, typename Lang>
 +void Set(const size_t num, const DType x, Blob *out, Context *ctx) {
 +  LOG(FATAL) << "Not Implemented";
 +}
 +
+ // TODO(wangwei) unify SumRow and SumCol.
+ /// Sum the rows of the in matrix into a vector
  template <typename DType, typename Lang>
- void SoftmaxCrossEntropyBwd(const size_t batchsize, const size_t dim,
-                             const Blob *p, const Blob *t, Blob *grad,
-                             Context *ctx) {
-   LOG(FATAL) << "Not Implemented";
+ void SumRows(const size_t nrow, const size_t ncol, const Blob *in, Blob *out,
+              Context *ctx) {
+   LOG(FATAL) << "SumRows Not Implemented";
  }
- 
+ */
  }  // namespace singa
  #endif  // SINGA_CORE_MATH_H_
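
The header above is the generic fallback layer: every operation is a function
template that aborts with "Not Implemented", and each backend
(tensor_math_cpp.h, tensor_math_cuda.h) supplies full specializations. A
minimal sketch of the idiom (simplified signatures, not the Blob/Context
forms used above):

#include <cstddef>
#include <cstdio>
#include <cstdlib>

namespace lang { struct Cpp {}; struct Cuda {}; }

// Generic fallback: selected when no backend specialization exists.
template <typename DType, typename Lang>
void Set(std::size_t num, DType x, DType* out) {
  (void)num; (void)x; (void)out;
  std::fprintf(stderr, "Set Not Implemented\n");
  std::abort();
}

// CPU specialization, analogous to Set<float, lang::Cpp> in
// tensor_math_cpp.h (which operates on Blob/Context instead).
template <>
void Set<float, lang::Cpp>(std::size_t num, float x, float* out) {
  for (std::size_t i = 0; i < num; ++i) out[i] = x;
}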

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/21e4b2d7/src/core/tensor/tensor_math_cpp.h
----------------------------------------------------------------------
diff --cc src/core/tensor/tensor_math_cpp.h
index 907c656,0b280a3..c5d092b
--- a/src/core/tensor/tensor_math_cpp.h
+++ b/src/core/tensor/tensor_math_cpp.h
@@@ -323,32 -420,196 +422,229 @@@ void GEMM<float, lang::Cpp>(const bool 
    cblas_sgemm(CblasRowMajor, transa, transb, nrowA, ncolB, ncolA, alpha, APtr,
                lda, BPtr, ldb, beta, CPtr, ldc);
  }
- #endif  // USE_CBLAS
+ 
+ #else
  
  template <>
- void Set<float, lang::Cpp>(const size_t num, const float x, Blob *out,
-                            Context *ctx) {
+ void Amax<float, lang::Cpp>(const size_t num, const Blob *in, size_t *out,
+                             Context *ctx) {
+   size_t maxPos = 0;
+   float maxVal = 0;
+   const float *inPtr = static_cast<const float *>(in->data());
+   for (size_t i = 0; i < num; i++) {
+     if (i == 0) {
+       maxVal = inPtr[i];
+     } else if (inPtr[i] > maxVal) {
+       maxVal = inPtr[i];
+       maxPos = i;
+     }
+   }
+   *out = maxPos;
+ }
+ template <>
+ void Amin<float, lang::Cpp>(const size_t num, const Blob *in, size_t *out,
+                             Context *ctx) {
+   size_t minPos = 0;
+   float minVal = 0;
+   const float *inPtr = static_cast<const float *>(in->data());
+   for (size_t i = 0; i < num; i++) {
+     if (i == 0) {
+       minVal = inPtr[i];
+     } else if (inPtr[i] < minVal) {
+       minVal = inPtr[i];
+       minPos = i;
+     }
+   }
+   *out = minPos;
+ }
+ 
+ template <>
+ void Asum<float, lang::Cpp>(const size_t num, const Blob *in, float *out,
+                             Context *ctx) {
+   float sum = 0;
+   const float *inPtr = static_cast<const float *>(in->data());
+   for (size_t i = 0; i < num; i++) {
+     sum += fabs(inPtr[i]);
+   }
+   *out = sum;
+ }
+ 
+ template <>
+ void Axpy<float, lang::Cpp>(const size_t num, const float alpha, const Blob *in,
+                             Blob *out, Context *ctx) {
    float *outPtr = static_cast<float *>(out->mutable_data());
-   for (size_t i = 0; i < num; i++) outPtr[i] = x;
+   const float *inPtr = static_cast<const float *>(in->data());
+   for (size_t i = 0; i < num; i++) {
+     outPtr[i] += alpha * inPtr[i];
+   }
  }
+ 
+ template <>
+ void Scale<float, lang::Cpp>(const size_t num, const float x, Blob *out,
+                              Context *ctx) {
+   float *outPtr = static_cast<float *>(out->mutable_data());
+   for (size_t i = 0; i < num; i++) {
+     outPtr[i] *= x;
+   }
+ }
+ 
+ template <>
+ void Dot<float, lang::Cpp>(const size_t num, const Blob *in1, const Blob *in2,
+                            float *out, Context *ctx) {
+   float sum = 0;
+   const float *in1Ptr = static_cast<const float *>(in1->data());
+   const float *in2Ptr = static_cast<const float *>(in2->data());
+   for (size_t i = 0; i < num; i++) {
+     sum += in1Ptr[i] * in2Ptr[i];
+   }
+   *out = sum;
+ }
+ 
+ template <>
+ void GEMV<float, lang::Cpp>(bool trans, const size_t m, const size_t n,
+                             const float alpha, const Blob *A, const Blob *v,
+                             const float beta, Blob *out, Context *ctx) {
+   float *outPtr = static_cast<float *>(out->mutable_data());
+   const float *APtr = static_cast<const float *>(A->data());
+   const float *vPtr = static_cast<const float *>(v->data());
+   for (size_t r = 0; r < m; r++) {
+     float sum = 0;
+     for (size_t c = 0; c < n; c++) {
+       size_t idx = trans ? c * m + r : r * n + c;
+       sum += APtr[idx] * vPtr[c];
+     }
+     outPtr[r] = alpha * sum + beta * outPtr[r];
+   }
+ }
+ 
+ #endif  // USE_CBLAS
++template <>
++void ComputeCrossEntropy<float, lang::Cpp>(const size_t batchsize,
++                                           const size_t dim, const Blob *p,
++                                           const Blob *t, Blob *loss,
++                                           Context *ctx) {
++  const float *pPtr = static_cast<const float *>(p->data());
++  const int *tPtr = static_cast<const int *>(t->data());
++  float *lossPtr = static_cast<float *>(loss->mutable_data());
++  for (size_t i = 0; i < batchsize; i++) {
++    int truth_idx = tPtr[i];
++    CHECK_GE(truth_idx, 0);
++    float prob_of_truth = pPtr[i * dim + truth_idx];
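++    // clamp at FLT_MIN so log(0) cannot yield -inf in the loss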
++    lossPtr[i] = -std::log(std::max(prob_of_truth, FLT_MIN));
++  }
++}
++
 +template <>
 +void SoftmaxCrossEntropyBwd<float, lang::Cpp>(const size_t batchsize,
 +                                              const size_t dim, const Blob *p,
 +                                              const Blob *t,
 +                                              Blob *grad, Context *ctx) {
 +  CHECK_EQ(p, grad) << "Use the same pointer to optimize performance";
 +  // const float* pPtr = static_cast<const float*>(p->data());
-   const float *tPtr = static_cast<const float *>(t->data());
++  const int *tPtr = static_cast<const int *>(t->data());
 +  float *gradPtr = static_cast<float *>(grad->mutable_data());
 +
 +  for (size_t i = 0; i < batchsize; i++) {
 +    int truth_idx = tPtr[i];
 +    CHECK_GE(truth_idx, 0);
 +    gradPtr[i * dim + truth_idx] -= 1.0;
 +  }
 +}
 +
  
+ // =========Matrix operations ================================================
+ /*
+ template <>
+ void AddCol<float, lang::Cpp>(const size_t nrow, const size_t ncol,
+                               const Blob *A, const Blob *v, Blob *out,
+                               Context *ctx) {
+   float *outPtr = static_cast<float *>(out->mutable_data());
+   const float *APtr = static_cast<const float *>(A->data());
+   const float *vPtr = static_cast<const float *>(v->data());
+   for (size_t r = 0; r < nrow; r++) {
+     size_t offset = r * ncol;
+     for (size_t c = 0; c < ncol; c++) {
+       outPtr[offset + c] = APtr[offset + c] + vPtr[r];
+     }
+   }
+ }
+ 
+ template <>
+ void AddRow<float, lang::Cpp>(const size_t nrow, const size_t ncol,
+                               const Blob *A, const Blob *v, Blob *out,
+                               Context *ctx) {
+   float *outPtr = static_cast<float *>(out->mutable_data());
+   const float *APtr = static_cast<const float *>(A->data());
+   const float *vPtr = static_cast<const float *>(v->data());
+   for (size_t r = 0; r < nrow; r++) {
+     size_t offset = r * ncol;
+     for (size_t c = 0; c < ncol; c++) {
+       outPtr[offset + c] = APtr[offset + c] + vPtr[c];
+     }
+   }
+ }
+ template <>
+ void Outer<float, lang::Cpp>(const size_t m, const size_t n, const Blob *in1,
+                              const Blob *in2, Blob *out, Context *ctx) {
+   float *outPtr = static_cast<float *>(out->mutable_data());
+   const float *in1Ptr = static_cast<const float *>(in1->data());
+   const float *in2Ptr = static_cast<const float *>(in2->data());
+   for (size_t r = 0; r < m; r++) {
+     size_t offset = r * n;
+     for (size_t c = 0; c < n; c++) {
+       outPtr[offset + c] = in1Ptr[r] * in2Ptr[c];
+     }
+   }
+ }
+ template <>
+ void Softmax<float, lang::Cpp>(const size_t nrow, const size_t ncol,
+                                const Blob *in, Blob *out, Context *ctx) {
+   float *outPtr = static_cast<float *>(out->mutable_data());
+   const float *inPtr = static_cast<const float *>(in->data());
+   float *bPtr = new float[ncol];
+   for (size_t r = 0; r < nrow; r++) {
+     size_t offset = r * ncol;
+     float denom = 0.f;
+     for (size_t c = 0; c < ncol; c++) {
+       bPtr[c] = exp(inPtr[offset + c]);
+       denom += bPtr[c];
+     }
+     for (size_t c = 0; c < ncol; c++) {
+       size_t idx = offset + c;
+       outPtr[idx] = bPtr[c] / denom;
+     }
+   }
+   delete[] bPtr;
+ }
+ 
+ template <>
+ void SumColumns<float, lang::Cpp>(const size_t nrow, const size_t ncol,
+                                   const Blob *in, Blob *out, Context *ctx) {
+   float *outPtr = static_cast<float *>(out->mutable_data());
+   const float *inPtr = static_cast<const float *>(in->data());
+   for (size_t c = 0; c < ncol; c++) {
+     outPtr[c] = 0.f;
+   }
+   for (size_t r = 0; r < nrow; r++) {
+     size_t offset = r * ncol;
+     for (size_t c = 0; c < ncol; c++) {
+       outPtr[c] += inPtr[offset + c];
+     }
+   }
+ }
+ 
+ template <>
+ void SumRows<float, lang::Cpp>(const size_t nrow, const size_t ncol,
+                                const Blob *in, Blob *out, Context *ctx) {
+   float *outPtr = static_cast<float *>(out->mutable_data());
+   const float *inPtr = static_cast<const float *>(in->data());
+   for (size_t r = 0; r < nrow; r++) {
+     size_t offset = r * ncol;
+     outPtr[r] = 0.f;
+     for (size_t c = 0; c < ncol; c++) {
+       outPtr[r] += inPtr[offset + c];
+     }
+   }
+ }
+ */
  }  // namespace singa
  
  #endif  // SINGA_CORE_TENSOR_TENSOR_MATH_CPP_H_
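
For readers skimming the diff: a minimal standalone sketch (plain C++, toy
data, no SINGA types; all names here are illustrative) of what the two new
CPU kernels compute. The forward pass takes softmax probabilities and integer
labels and returns -log p[truth]; the backward pass turns the probability
matrix into the gradient in place by subtracting 1 at each row's truth index.

    #include <algorithm>
    #include <cfloat>
    #include <cmath>
    #include <cstddef>
    #include <cstdio>

    int main() {
      const size_t batchsize = 2, dim = 4;
      // softmax probabilities, row-major (batchsize x dim), and int labels
      float p[batchsize * dim] = {0.7f, 0.1f, 0.1f, 0.1f,
                                  0.2f, 0.2f, 0.5f, 0.1f};
      const int t[batchsize] = {0, 2};
      float loss[batchsize];
      for (size_t i = 0; i < batchsize; i++)  // forward: -log p[truth]
        loss[i] = -std::log(std::max(p[i * dim + t[i]], FLT_MIN));
      for (size_t i = 0; i < batchsize; i++)  // backward, in place: p - onehot(t)
        p[i * dim + t[i]] -= 1.0f;
      std::printf("loss[0]=%.4f grad[0]=%.4f\n", loss[0], p[0]);
      // prints loss[0]=0.3567 grad[0]=-0.3000
      return 0;
    }

The in-place backward matches the CHECK_EQ(p, grad) contract above: the
caller passes the probability blob itself as grad, so the kernel only has to
touch one entry per row.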

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/21e4b2d7/src/core/tensor/tensor_math_cuda.h
----------------------------------------------------------------------
diff --cc src/core/tensor/tensor_math_cuda.h
index c69620c,e2597d5..9a8839e
--- a/src/core/tensor/tensor_math_cuda.h
+++ b/src/core/tensor/tensor_math_cuda.h
@@@ -131,55 -398,6 +398,28 @@@ void GEMM<float, lang::Cuda>(const boo
                             BPtr, ldb, APtr, lda, &beta, CPtr, ldc));
  }
  
 +template <>
- void GE<float, lang::Cuda>(const size_t num, const Blob* in, const float x,
-                                    Blob* out, Context *ctx) {
-   float* outPtr = static_cast<float*>(out->mutable_data());
-   const float* inPtr = static_cast<const float*>(in->data());
-   cuda::GE(num, inPtr, x, outPtr, ctx->stream);
- }
- template <>
- void GT<float, lang::Cuda>(const size_t num, const Blob* in, const float x,
-                                    Blob* out,  Context *ctx) {
-   float* outPtr = static_cast<float*>(out->mutable_data());
-   const float* inPtr = static_cast<const float*>(in->data());
-   cuda::GT(num, inPtr, x, outPtr, ctx->stream);
- }
- template <>
- void LE<float, lang::Cuda>(const size_t num, const Blob* in, const float x,
-                                    Blob* out, Context *ctx) {
-   float* outPtr = static_cast<float*>(out->mutable_data());
-   const float* inPtr = static_cast<const float*>(in->data());
-   cuda::LE(num, inPtr, x, outPtr, ctx->stream);
- }
- template <>
- void LT<float, lang::Cuda>(const size_t num, const Blob* in, const float x,
-                                    Blob* out,  Context *ctx) {
-   float* outPtr = static_cast<float*>(out->mutable_data());
-   const float* inPtr = static_cast<const float*>(in->data());
-   cuda::LT(num, inPtr, x, outPtr, ctx->stream);
- }
- 
- template<>
- void Set<float, lang::Cuda>(const size_t num, const float x, Blob *out,
-                             Context *ctx) {
-   float *outPtr = static_cast<float *>(out->mutable_data());
-   cuda::Set(num, x, outPtr, ctx->stream);
++void ComputeCrossEntropy<float, lang::Cuda>(const size_t batchsize,
++                                            const size_t dim, const Blob *p,
++                                            const Blob *t, Blob *loss,
++                                            Context *ctx) {
++  const float *pPtr = static_cast<const float *>(p->data());
++  const int *tPtr = static_cast<const int *>(t->data());
++  float *lossPtr = static_cast<float *>(loss->mutable_data());
++  cuda::ComputeCrossEntropy(batchsize, dim, pPtr, tPtr, lossPtr, ctx->stream);
 +}
- 
 +template <>
 +void SoftmaxCrossEntropyBwd<float, lang::Cuda>(const size_t batchsize,
 +                                               const size_t dim, const Blob *p,
 +                                               const Blob *t, Blob *grad,
 +                                               Context *ctx) {
 +  CHECK_EQ(p, grad) << "Use the same pointer to optimize performance";
 +  const float *pPtr = static_cast<const float *>(p->data());
 +  const int *tPtr = static_cast<const int *>(t->data());
 +  float *gradPtr = static_cast<float *>(grad->mutable_data());
 +  cuda::SoftmaxCrossEntropyBwd(batchsize, dim, pPtr, tPtr, gradPtr,
 +                               ctx->stream);
 +}
- 
  }  // namespace singa
  
  #endif  // USE_CUDA
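
The static_cast<const int *> on the target blob above is the fix referred to
in the commit message ("should use int instead of float"). The cast only
reinterprets bytes, so a target tensor that actually stores floats would feed
garbage class indices to the kernel. A small illustration (plain C++, not
SINGA code):

    #include <cstdio>
    #include <cstring>

    int main() {
      float label = 2.0f;  // a label mistakenly stored as float
      int as_int;
      std::memcpy(&as_int, &label, sizeof(int));  // what a pointer cast "sees"
      std::printf("%d\n", as_int);  // prints 1073741824, not 2
      return 0;
    }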

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/21e4b2d7/src/model/layer/softmax.cc
----------------------------------------------------------------------
diff --cc src/model/layer/softmax.cc
index 8af1d76,b379fc1..f554f25
--- a/src/model/layer/softmax.cc
+++ b/src/model/layer/softmax.cc
@@@ -25,14 -25,16 +25,17 @@@ void Softmax::Setup(const LayerConf& co
  }
  
  const Tensor Softmax::Forward(int flag, const Tensor& input) {
+   Tensor output;
    if (input.nDim() == 1) {
-     buf_.push(SoftMax(input));
 -    Tensor tmp = Reshape(input, Shape{1, input.Size()});
 -      output = SoftMax(tmp, 0);
++    output = SoftMax(input);
    } else {
 -    output = SoftMax(input, axis_);
 +    size_t nrow = Product(input.shape(), 0, axis_);
 +    const Tensor& tmp = Reshape(input, Shape{nrow, input.Size() / nrow});
-     buf_.push(SoftMax(tmp));
++    output = SoftMax(tmp);
    }
-   return buf_.top();
+   if (flag & kTrain)
+     buf_.push(output);
+   return output;
  }
  
  const std::pair<Tensor, vector<Tensor>> Softmax::Backward(int flag,
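
The rewritten Forward collapses every axis before axis_ into rows, so an
n-dimensional input is always handled as a 2D matrix, and the output is now
buffered only when flag & kTrain is set. A quick sketch of the reshape rule
(plain C++ with a stand-in for singa::Product; names are illustrative):

    #include <cstdio>
    #include <vector>

    // stand-in for singa::Product(shape, begin, end)
    size_t Product(const std::vector<size_t>& shape, size_t begin, size_t end) {
      size_t p = 1;
      for (size_t i = begin; i < end; i++) p *= shape[i];
      return p;
    }

    int main() {
      const std::vector<size_t> shape = {2, 3, 4};  // input.shape()
      const size_t axis = 1, size = 2 * 3 * 4;      // axis_, input.Size()
      size_t nrow = Product(shape, 0, axis);
      std::printf("reshaped to {%zu, %zu}\n", nrow, size / nrow);  // {2, 12}
      return 0;
    }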

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/21e4b2d7/src/model/loss/softmax_cross_entropy.cc
----------------------------------------------------------------------
diff --cc src/model/loss/softmax_cross_entropy.cc
index 4ca323a,0000000..bed3348
mode 100644,000000..100644
--- a/src/model/loss/softmax_cross_entropy.cc
+++ b/src/model/loss/softmax_cross_entropy.cc
@@@ -1,53 -1,0 +1,53 @@@
 +/**
 + * Licensed to the Apache Software Foundation (ASF) under one
 + * or more contributor license agreements.  See the NOTICE file
 + * distributed with this work for additional information
 + * regarding copyright ownership.  The ASF licenses this file
 + * to you under the Apache License, Version 2.0 (the
 + * "License"); you may not use this file except in compliance
 + * with the License.  You may obtain a copy of the License at
 + *
 + *     http://www.apache.org/licenses/LICENSE-2.0
 + *
 + * Unless required by applicable law or agreed to in writing, software
 + * distributed under the License is distributed on an "AS IS" BASIS,
 + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 + * See the License for the specific language governing permissions and
 + * limitations under the License.
 + */
 +
 +#include <stack>
 +#include "singa/model/loss.h"
 +
 +namespace singa {
 +
- 
- Tensor SoftmaxCrossEntropy::Forward(const Tensor& prediction, const Tensor& target) {
++Tensor SoftmaxCrossEntropy::Forward(const Tensor& prediction,
++                                    const Tensor& target) {
 +  CHECK(buf_.empty()) << "Do not call Forward twice in a row;"
 +                      << " the calling pattern is [Forward|Evaluate] Backward";
 +  size_t batchsize = 1;
 +  if (prediction.nDim() > 1) batchsize = prediction.shape().at(0);
 +  size_t dim = prediction.Size() / batchsize;
 +  const Tensor& input = Reshape(prediction, Shape{batchsize, dim});
 +  Tensor prob = SoftMax(input);
 +
 +  // buffer intermediate data
 +  buf_.push(prob);
 +  buf_.push(target);
-   Tensor loss = prob.Clone();
++  Tensor loss(Shape{batchsize}, prob.device(), prob.data_type());
 +
-   ComputeCrossEntropy(target, &loss);
++  ComputeCrossEntropy(prob, target, &loss);
 +  return loss;
 +}
 +
 +Tensor SoftmaxCrossEntropy::Backward() {
 +  const Tensor target = buf_.top();
 +  buf_.pop();
 +  Tensor prob = buf_.top();
 +  buf_.pop();
 +  SoftmaxCrossEntropyBwd(target, &prob);
 +  return prob;
 +}
 +}  // namespace singa
 +
 +
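
For reference, the standard derivation behind this Forward/Backward pair
(textbook math, not new to this commit): with p = softmax(z) and integer
label t, the per-sample loss and its gradient with respect to the logits are

    L = -\log p_t, \qquad \frac{\partial L}{\partial z_j} = p_j - \mathbb{1}[j = t],

which is why Backward can simply pop the buffered prob tensor and let
SoftmaxCrossEntropyBwd subtract 1 at the truth index.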

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/21e4b2d7/test/singa/test_cross_entropy.cc
----------------------------------------------------------------------
diff --cc test/singa/test_cross_entropy.cc
index 6b8cb69,0000000..0eb36e5
mode 100644,000000..100644
--- a/test/singa/test_cross_entropy.cc
+++ b/test/singa/test_cross_entropy.cc
@@@ -1,114 -1,0 +1,116 @@@
 +/************************************************************
 +*
 +* Licensed to the Apache Software Foundation (ASF) under one
 +* or more contributor license agreements.  See the NOTICE file
 +* distributed with this work for additional information
 +* regarding copyright ownership.  The ASF licenses this file
 +* to you under the Apache License, Version 2.0 (the
 +* "License"); you may not use this file except in compliance
 +* with the License.  You may obtain a copy of the License at
 +*
 +*   http://www.apache.org/licenses/LICENSE-2.0
 +*
 +* Unless required by applicable law or agreed to in writing,
 +* software distributed under the License is distributed on an
 +* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 +* KIND, either express or implied.  See the License for the
 +* specific language governing permissions and limitations
 +* under the License.
 +*
 +*************************************************************/
 +
 +#include "gtest/gtest.h"
 +#include "singa/core/tensor.h"
 +#include "singa/core/device.h"
 +#include "singa/model/loss.h"
 +#include "singa_config.h"
 +
 +using singa::Tensor;
 +class TestSoftmaxCrossEntropy : public ::testing::Test {
 + protected:
 +  virtual void SetUp() {
 +    p.Reshape(singa::Shape{2, 4});
 +    t.Reshape(singa::Shape{2, 1});
 +  }
 +  const float pdat[8] = {0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1};
-   const float tdat[2] = {0.0, 2.0};
++  const int tdat[2] = {0, 2};
 +
 +  singa::Tensor p, t;
 +};
 +
 +TEST_F(TestSoftmaxCrossEntropy, CppForward) {
 +  p.CopyDataFromHostPtr(pdat, 8);
++  t.AsType(singa::kInt);
 +  t.CopyDataFromHostPtr(tdat, 2);
 +
 +  singa::SoftmaxCrossEntropy cross_entropy;
 +  const Tensor& loss = cross_entropy.Forward(p, t);
 +  auto ldat = loss.data<const float*>();
 +
 +  const float result_test = -log(0.25);
 +  EXPECT_FLOAT_EQ(ldat[0], result_test);
 +  EXPECT_FLOAT_EQ(ldat[1], result_test);
 +}
 +
 +TEST_F(TestSoftmaxCrossEntropy, CppBackward) {
 +  p.CopyDataFromHostPtr(pdat, 8);
++  t.AsType(singa::kInt);
 +  t.CopyDataFromHostPtr(tdat, 2);
 +
 +  singa::SoftmaxCrossEntropy cross_entropy;
 +  cross_entropy.Forward(p, t);
 +  const Tensor& grad = cross_entropy.Backward();
 +
 +  auto gdat = grad.data<const float*>();
 +  EXPECT_FLOAT_EQ(gdat[0], -0.75);
 +  EXPECT_FLOAT_EQ(gdat[1], 0.25);
 +  EXPECT_FLOAT_EQ(gdat[2], 0.25);
 +  EXPECT_FLOAT_EQ(gdat[3], 0.25);
 +  EXPECT_FLOAT_EQ(gdat[4], 0.25);
 +  EXPECT_FLOAT_EQ(gdat[5], 0.25);
 +  EXPECT_FLOAT_EQ(gdat[6], -0.75);
 +  EXPECT_FLOAT_EQ(gdat[7], 0.25);
 +}
 +
 +#ifdef USE_CUDA
 +
 +TEST_F(TestSoftmaxCrossEntropy, CudaForward) {
 +  singa::SoftmaxCrossEntropy cross_entropy;
 +  singa::CudaGPU dev;
 +  p.ToDevice(&dev);
 +  t.ToDevice(&dev);
 +  p.CopyDataFromHostPtr(pdat, 8);
++  t.AsType(singa::kInt);
 +  t.CopyDataFromHostPtr(tdat, 2);
 +
 +  Tensor loss = cross_entropy.Forward(p, t);
 +  loss.ToHost();
 +  auto ldat = loss.data<const float*>();
 +
 +  const float result_test = -log(0.25);
 +  EXPECT_FLOAT_EQ(ldat[0], result_test);
 +  EXPECT_FLOAT_EQ(ldat[1], result_test);
 +}
 +
 +TEST_F(TestSoftmaxCrossEntropy, CudaBackward) {
 +  singa::SoftmaxCrossEntropy cross_entropy;
 +  singa::CudaGPU dev;
 +  p.ToDevice(&dev);
 +  t.ToDevice(&dev);
 +  p.CopyDataFromHostPtr(pdat, 8);
++  t.AsType(singa::kInt);
 +  t.CopyDataFromHostPtr(tdat, 2);
 +
 +  cross_entropy.Forward(p, t);
 +  Tensor grad = cross_entropy.Backward();
 +
 +  grad.ToHost();
 +  auto gdat = grad.data<const float*>();
 +  EXPECT_FLOAT_EQ(gdat[0], -0.75);
 +  EXPECT_FLOAT_EQ(gdat[1], 0.25);
 +  EXPECT_FLOAT_EQ(gdat[2], 0.25);
 +  EXPECT_FLOAT_EQ(gdat[3], 0.25);
 +  EXPECT_FLOAT_EQ(gdat[4], 0.25);
 +  EXPECT_FLOAT_EQ(gdat[5], 0.25);
 +  EXPECT_FLOAT_EQ(gdat[6], -0.75);
 +  EXPECT_FLOAT_EQ(gdat[7], 0.25);
 +}
 +#endif  // USE_CUDA
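
The expected values in these tests follow directly from the uniform logits:
each row's softmax is p_j = 1/4, so

    \text{loss} = -\log(1/4) = \log 4 \approx 1.3863, \qquad g_j = p_j - \mathbb{1}[j = t],

i.e. 0.25 everywhere except 0.25 - 1 = -0.75 at index 0 of row 0 and index 2
of row 1.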

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/21e4b2d7/test/singa/test_mse.cc
----------------------------------------------------------------------

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/21e4b2d7/test/singa/test_softmax.cc
----------------------------------------------------------------------
diff --cc test/singa/test_softmax.cc
index 09dfcd9,c087605..fff8510
--- a/test/singa/test_softmax.cc
+++ b/test/singa/test_softmax.cc
@@@ -36,13 -36,14 +36,14 @@@ TEST(Softmax, Setup) 
    EXPECT_EQ(2, sft.Axis());
  }
  
+ #ifdef USE_CBLAS
  TEST(Softmax, Forward) {
    const float x[] = {1.0f, 2.0f, 0.0f, -2.0f, -3.0f, -1.0};
--  size_t n = sizeof(x) / sizeof(float);
    size_t row = 2;
    size_t col = 3;
++  size_t n = row * col;
    singa::Tensor in(singa::Shape{row, col});
  in.CopyDataFromHostPtr<float>(x, n);
  
    int axis = 1;
    Softmax sft;

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/21e4b2d7/test/singa/test_tensor_math.cc
----------------------------------------------------------------------
diff --cc test/singa/test_tensor_math.cc
index 8368c55,38a9291..1092d69
--- a/test/singa/test_tensor_math.cc
+++ b/test/singa/test_tensor_math.cc
@@@ -20,6 -22,263 +22,261 @@@ class TestTensorMath : public ::testing
    const float dat2[6] = {1.1f, 2.1f, 3.1f, 4.1f, 5.1f, 6.1f};
  };
  
+ TEST_F(TestTensorMath, MemberAbs) {
+   Tensor aa = a.Clone();
+   Tensor bb = b.Clone();
+   Tensor cc = aa - bb;
+   const float *dptr = cc.data<const float *>();
+   EXPECT_NEAR(-0.1, dptr[0], 1e-5);
+   EXPECT_NEAR(-0.1, dptr[1], 1e-5);
+   EXPECT_NEAR(-0.1, dptr[2], 1e-5);
+ 
+   Tensor p = Abs(cc);
+   const float *dptr1 = p.data<const float *>();
+   EXPECT_NEAR(0.1, dptr1[0], 1e-5);
+   EXPECT_NEAR(0.1, dptr1[1], 1e-5);
+   EXPECT_NEAR(0.1, dptr1[2], 1e-5);
+ }
+ 
+ TEST_F(TestTensorMath, MemberExp) {
+   Tensor p = Exp(a);
+   const float *dptr1 = p.data<const float *>();
+   EXPECT_NEAR(exp(1.0f), dptr1[0], 1e-5);
+   EXPECT_NEAR(exp(2.0f), dptr1[1], 1e-5);
+   EXPECT_NEAR(exp(3.0f), dptr1[2], 1e-5);
+ }
+ 
+ TEST_F(TestTensorMath, MemberLog) {
+   Tensor p = Log(a);
+   const float *dptr1 = p.data<const float *>();
+   EXPECT_NEAR(log(1.0f), dptr1[0], 1e-5);
+   EXPECT_NEAR(log(2.0f), dptr1[1], 1e-5);
+   EXPECT_NEAR(log(3.0f), dptr1[2], 1e-5);
+ }
+ 
+ TEST_F(TestTensorMath, MemberReLU) {
+   Tensor aa = a.Clone();
+   Tensor cc = aa - 2.0f;
+   const float *dptr = cc.data<const float *>();
+   EXPECT_NEAR(-1.0f, dptr[0], 1e-5);
+   EXPECT_NEAR(0.0f, dptr[1], 1e-5);
+   EXPECT_NEAR(1.0f, dptr[2], 1e-5);
+ 
+   Tensor p = ReLU(cc);
+   const float *dptr1 = p.data<const float *>();
+   EXPECT_NEAR(0.0f, dptr1[0], 1e-5);
+   EXPECT_NEAR(0.0f, dptr1[1], 1e-5);
+   EXPECT_NEAR(1.0f, dptr1[2], 1e-5);
+ }
+ 
+ TEST_F(TestTensorMath, MemberSigmoid) {
+   Tensor p = Sigmoid(a);
+   const float *dptr1 = p.data<const float *>();
+   EXPECT_NEAR(1.0f / (1.0f + exp(-1.0f)), dptr1[0], 1e-5);
+   EXPECT_NEAR(1.0f / (1.0f + exp(-2.0f)), dptr1[1], 1e-5);
+   EXPECT_NEAR(1.0f / (1.0f + exp(-3.0f)), dptr1[2], 1e-5);
+ }
+ 
+ TEST_F(TestTensorMath, MemberSign) {
+   Tensor aa = a.Clone();
+   Tensor cc = aa - 2.0f;
+   const float *dptr = cc.data<const float *>();
+   EXPECT_NEAR(-1.0f, dptr[0], 1e-5);
+   EXPECT_NEAR(0.0f, dptr[1], 1e-5);
+   EXPECT_NEAR(1.0f, dptr[2], 1e-5);
+ 
+   Tensor p = Sign(cc);
+   const float *dptr1 = p.data<const float *>();
+   EXPECT_EQ(0.0f, dptr1[0]);
+   EXPECT_EQ(0.0f, dptr1[1]);
+   EXPECT_EQ(1.0f, dptr1[2]);
+ }
+ 
+ TEST_F(TestTensorMath, MemberSqrt) {
+   Tensor p = Sqrt(a);
+   const float *dptr1 = p.data<const float *>();
+   EXPECT_NEAR(sqrt(1.0), dptr1[0], 1e-5);
+   EXPECT_NEAR(sqrt(2.0), dptr1[1], 1e-5);
+   EXPECT_NEAR(sqrt(3.0), dptr1[2], 1e-5);
+ }
+ 
+ TEST_F(TestTensorMath, MemberSquare) {
+   Tensor p = Square(a);
+   const float *dptr1 = p.data<const float *>();
+   EXPECT_NEAR(1.0, dptr1[0], 1e-5);
+   EXPECT_NEAR(4.0, dptr1[1], 1e-5);
+   EXPECT_NEAR(9.0, dptr1[2], 1e-5);
+ }
+ 
+ TEST_F(TestTensorMath, MemberTanh) {
+   Tensor p = Tanh(a);
+   const float *dptr1 = p.data<const float *>();
+   EXPECT_NEAR(tanh(1.0), dptr1[0], 1e-5);
+   EXPECT_NEAR(tanh(2.0), dptr1[1], 1e-5);
+   EXPECT_NEAR(tanh(3.0), dptr1[2], 1e-5);
+ }
+ 
+ TEST_F(TestTensorMath, Sum) {
+   Tensor p1 = Sum(e, 0);
+   const float *dptr1 = p1.data<const float *>();
+   EXPECT_FLOAT_EQ(9.0f, dptr1[0]);
+   EXPECT_FLOAT_EQ(12.0f, dptr1[1]);
+ 
+   Tensor p2(Shape{3, 1});
+   p2 = Sum(e, 1);
+   const float *dptr2 = p2.data<const float *>();
+   EXPECT_FLOAT_EQ(3.0f, dptr2[0]);
+   EXPECT_FLOAT_EQ(7.0f, dptr2[1]);
+   EXPECT_FLOAT_EQ(11.0f, dptr2[2]);
+ }
+ 
+ TEST_F(TestTensorMath, SoftMax) {
 -  Tensor p1(Shape{3, 2});
 -  p1 = SoftMax(e, 0);
++  Tensor p1 = SoftMax(Reshape(e, Shape{1, 6}));
+   const float *dptr1 = p1.data<const float *>();
+   float sum = 0;
+   for (int i = 0; i < 6; i++) sum += exp(i + 1);
+   EXPECT_NEAR(exp(1) / sum, dptr1[0], 1e-5);
+   EXPECT_NEAR(exp(3) / sum, dptr1[2], 1e-5);
+   EXPECT_NEAR(exp(5) / sum, dptr1[4], 1e-5);
+   EXPECT_NEAR(exp(2) / sum, dptr1[1], 1e-5);
+   EXPECT_NEAR(exp(4) / sum, dptr1[3], 1e-5);
+   EXPECT_NEAR(exp(6) / sum, dptr1[5], 1e-5);
+ 
 -  Tensor p2(Shape{3, 2});
 -  p2 = SoftMax(e, 1);
++  Tensor p2 = SoftMax(e);
+   const float *dptr2 = p2.data<const float *>();
+   EXPECT_NEAR(exp(1) / (exp(1) + exp(2)), dptr2[0], 1e-5);
+   EXPECT_NEAR(exp(2) / (exp(1) + exp(2)), dptr2[1], 1e-5);
+ }
+ 
+ TEST_F(TestTensorMath, MemberLT) {
+   Tensor p1 = a < 2.0f;
+   const float *dptr1 = p1.data<const float *>();
+   EXPECT_FLOAT_EQ(1.0f, dptr1[0]);
+   EXPECT_FLOAT_EQ(0.0f, dptr1[1]);
+   EXPECT_FLOAT_EQ(0.0f, dptr1[2]);
+ }
+ 
+ TEST_F(TestTensorMath, MemberLE) {
+   Tensor p1 = a <= 2.0f;
+   const float *dptr1 = p1.data<const float *>();
+   EXPECT_FLOAT_EQ(1.0f, dptr1[0]);
+   EXPECT_FLOAT_EQ(1.0f, dptr1[1]);
+   EXPECT_FLOAT_EQ(0.0f, dptr1[2]);
+ }
+ 
+ TEST_F(TestTensorMath, MemberGT) {
+   Tensor p1 = a > 2.0f;
+   const float *dptr1 = p1.data<const float *>();
+   EXPECT_FLOAT_EQ(0.0f, dptr1[0]);
+   EXPECT_FLOAT_EQ(0.0f, dptr1[1]);
+   EXPECT_FLOAT_EQ(1.0f, dptr1[2]);
+ }
+ 
+ TEST_F(TestTensorMath, MemberGE) {
+   Tensor p1 = a >= 2.0f;
+   const float *dptr1 = p1.data<const float *>();
+   EXPECT_FLOAT_EQ(0.0f, dptr1[0]);
+   EXPECT_FLOAT_EQ(1.0f, dptr1[1]);
+   EXPECT_FLOAT_EQ(1.0f, dptr1[2]);
+ }
+ 
+ TEST_F(TestTensorMath, MemberPow) {
+   Tensor p1 = Pow(b, 3.0f);
+   const float *dptr1 = p1.data<const float *>();
+   EXPECT_FLOAT_EQ(pow(1.1f, 3.0f), dptr1[0]);
+   EXPECT_FLOAT_EQ(pow(2.1f, 3.0f), dptr1[1]);
+   EXPECT_FLOAT_EQ(pow(3.1f, 3.0f), dptr1[2]);
+ 
+   // TODO(Yuchen): check pow(tensor a, tensor b) and add testcase after the
+   // function is complete
+   // Tensor p2 = Pow(a,b);
+   // const float *dptr2 = p2.data<const float *>();
+   // EXPECT_FLOAT_EQ(pow(1.0f,1.1f), dptr2[0]);
+   // EXPECT_FLOAT_EQ(pow(2.0f,2.1f), dptr2[1]);
+   // EXPECT_FLOAT_EQ(pow(3.0f,3.1f), dptr2[2]);
+ }
+ 
+ TEST_F(TestTensorMath, MemberSub) {
+   Tensor p1 = a - b;
+   const float *dptr1 = p1.data<const float *>();
+   EXPECT_NEAR(-0.1, dptr1[0], 1e-5);
+   EXPECT_NEAR(-0.1, dptr1[1], 1e-5);
+   EXPECT_NEAR(-0.1, dptr1[2], 1e-5);
+ }
+ 
+ TEST_F(TestTensorMath, MemberEltwiseMult) {
+   Tensor p1 = a * b;
+   const float *dptr1 = p1.data<const float *>();
+   EXPECT_NEAR(1.0 * 1.1, dptr1[0], 1e-5);
+   EXPECT_NEAR(2.0 * 2.1, dptr1[1], 1e-5);
+   EXPECT_NEAR(3.0 * 3.1, dptr1[2], 1e-5);
+ }
+ 
+ TEST_F(TestTensorMath, MemberDiv) {
+   Tensor p1 = a / b;
+   const float *dptr1 = p1.data<const float *>();
+   EXPECT_NEAR(1.0 / 1.1, dptr1[0], 1e-5);
+   EXPECT_NEAR(2.0 / 2.1, dptr1[1], 1e-5);
+   EXPECT_NEAR(3.0 / 3.1, dptr1[2], 1e-5);
+ 
+   Tensor p2 = Div(10.0f, b);
+   const float *dptr2 = p2.data<const float *>();
+   EXPECT_NEAR(10.0 / 1.1, dptr2[0], 1e-5);
+   EXPECT_NEAR(10.0 / 2.1, dptr2[1], 1e-5);
+   EXPECT_NEAR(10.0 / 3.1, dptr2[2], 1e-5);
+ 
+   Tensor p3 = a / 8.0f;
+   const float *dptr3 = p3.data<const float *>();
+   EXPECT_NEAR(1.0 / 8.0, dptr3[0], 1e-5);
+   EXPECT_NEAR(2.0 / 8.0, dptr3[1], 1e-5);
+   EXPECT_NEAR(3.0 / 8.0, dptr3[2], 1e-5);
+ }
+ 
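+ // Statistical sanity checks below: Var[Bernoulli(p)] = p(1-p) = 0.21,
+ // Var[Uniform(a, b)] = (b - a)^2 / 12 = 0.01 / 12, Var[N(0, 1)] = 1;
+ // the sample variance uses the unbiased (n - 1) denominator.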
+ TEST_F(TestTensorMath, MemberBernoulli) {
+   Tensor p1(Shape{10000});
+   Bernoulli(0.3f, &p1);
+   const float *dptr1 = p1.data<const float *>();
+   float sum = 0;
+   for (int i = 0; i < 10000; i++) sum += dptr1[i];
+   float mean = sum / 10000;
+   EXPECT_NEAR(mean, 0.3f, 1e-2);
+ 
+   sum = 0;
+   for (int i = 0; i < 10000; i++) sum += (dptr1[i] - mean) * (dptr1[i] - mean);
+   float variance = sum / 9999;
+   EXPECT_NEAR(variance, 0.3 * 0.7, 1e-2);
+ }
+ 
+ TEST_F(TestTensorMath, MemberUniform) {
+   Tensor p1(Shape{10000});
+   Uniform(0.1f, 0.2f, &p1);
+   const float *dptr1 = p1.data<const float *>();
+   float sum = 0;
+   for (int i = 0; i < 10000; i++) sum += dptr1[i];
+   float mean = sum / 10000;
+   EXPECT_NEAR(mean, 0.15f, 1e-3);
+ 
+   sum = 0;
+   for (int i = 0; i < 10000; i++) sum += (dptr1[i] - mean) * (dptr1[i] - mean);
+   float variance = sum / 9999;
+   EXPECT_NEAR(variance, 0.01f / 12, 1e-3);
+ }
+ 
+ TEST_F(TestTensorMath, MemberGaussian) {
+   Tensor p1(Shape{50000});
+   Gaussian(0.0f, 1.0f, &p1);
+   const float *dptr1 = p1.data<const float *>();
+   float sum = 0;
+   for (int i = 0; i < 50000; i++) sum += dptr1[i];
+   float mean = sum / 50000;
+   EXPECT_NEAR(mean, 0.0, 1e-2);
+ 
+   sum = 0;
+   for (int i = 0; i < 50000; i++) sum += (dptr1[i] - mean) * (dptr1[i] - mean);
+   float variance = sum / 49999;
+   EXPECT_NEAR(variance, 1.0, 1e-2);
+ }
+ 
  TEST_F(TestTensorMath, MemberAddTensor) {
    Tensor aa = a.Clone();
    aa += a;