Posted to commits@singa.apache.org by zh...@apache.org on 2016/06/12 07:27:54 UTC

[2/5] incubator-singa git commit: SINGA-182 Clean math function APIs and implementations

SINGA-182 Clean math function APIs and implementations

Merge branch 'cuda' from #jinyangturbo.
Clean the CUDA-related code (tensor_math_cuda.h, math_kernel.h and math_kernel.cu)
by unifying the function arguments (names and argument order).
The functions still need to be reordered.
Add Nrm2 for the L2 norm using cblas and cublas.
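
For illustration, the convention the renamed functions now follow (a sketch
distilled from the signatures in this diff, not a new API): the element count
comes first, const inputs next, the output last, with a cudaStream_t appended
to every host wrapper.

  // before (old style): argument order varied per function
  void add(int n, const float *a, const float *b, float *out);
  // after (unified): count, inputs, output, stream
  void add(const size_t n, const float *in1, const float *in2, float *out,
           cudaStream_t s);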


Project: http://git-wip-us.apache.org/repos/asf/incubator-singa/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-singa/commit/6d69047a
Tree: http://git-wip-us.apache.org/repos/asf/incubator-singa/tree/6d69047a
Diff: http://git-wip-us.apache.org/repos/asf/incubator-singa/diff/6d69047a

Branch: refs/heads/dev
Commit: 6d69047addc46e5c9f381b7e1d4cebd20ce9b2e3
Parents: 564c88a
Author: Wei Wang <wa...@comp.nus.edu.sg>
Authored: Sun Jun 12 12:08:48 2016 +0800
Committer: Wei Wang <wa...@comp.nus.edu.sg>
Committed: Sun Jun 12 12:15:11 2016 +0800

----------------------------------------------------------------------
 include/singa/core/tensor.h        |   2 +
 src/core/tensor/math_kernel.cu     | 656 +++++++++++++++++---------------
 src/core/tensor/math_kernel.h      |  93 ++---
 src/core/tensor/tensor.cc          |  14 +
 src/core/tensor/tensor_math.h      | 140 ++++---
 src/core/tensor/tensor_math_cpp.h  | 227 ++++++-----
 src/core/tensor/tensor_math_cuda.h | 384 +++++++++++++++----
 test/singa/test_tensor_math.cc     | 346 ++++++++---------
 8 files changed, 1092 insertions(+), 770 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/6d69047a/include/singa/core/tensor.h
----------------------------------------------------------------------
diff --git a/include/singa/core/tensor.h b/include/singa/core/tensor.h
index 82bbe81..cd750c5 100644
--- a/include/singa/core/tensor.h
+++ b/include/singa/core/tensor.h
@@ -173,6 +173,8 @@ class Tensor {
   template <typename SType>
   Tensor &operator/=(const SType x);
 
+  float L2() const;
+
  protected:
   bool transpose_ = false;
   DataType data_type_ = kFloat32;

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/6d69047a/src/core/tensor/math_kernel.cu
----------------------------------------------------------------------
diff --git a/src/core/tensor/math_kernel.cu b/src/core/tensor/math_kernel.cu
index aed6add..b618f9b 100644
--- a/src/core/tensor/math_kernel.cu
+++ b/src/core/tensor/math_kernel.cu
@@ -35,36 +35,16 @@
 namespace singa {
 // Cuda Kernel Functions
 namespace cuda {
-__global__ void kernel_softmax_loss(const float *prob, const int *label,
-                                    float *loss, int n, int dim) {
-  int index = blockIdx.x * blockDim.x + threadIdx.x;
-  int num_threads = blockDim.x * gridDim.x;
-  for (; index < n; index += num_threads) {
-    float prob_of_truth = prob[index * dim + label[index]];
-    loss[index] -= std::log(max(prob_of_truth, FLT_MIN));
-  }
-}
-
-__global__ void kernel_softmax_gradient(float *grad, const int *label, int n,
-                                        int dim, float scale) {
-  int index = blockIdx.x * blockDim.x + threadIdx.x;
-  int num_threads = blockDim.x * gridDim.x;
-  for (; index < n; index += num_threads) {
-    int pos = index * dim + label[index];
-    grad[pos] = (grad[pos] - 1.0f) * scale;
-  }
-}
-
-__global__ void kernel_sum_vec(const float *data, float *sum, int n) {
+__global__ void KernelSum(const size_t n, const float *in, float *out) {
   int THREADS = blockDim.x;
 
   __shared__ float aux[CU1DBLOCK];
   int steps = (n - 1) / THREADS + 1;
-  aux[threadIdx.x] = data[threadIdx.x];
+  aux[threadIdx.x] = in[threadIdx.x];
 
   for (int i = 1; i < steps; ++i) {
     if (threadIdx.x + i * THREADS < n) {
-      aux[threadIdx.x] += data[threadIdx.x + i * THREADS];
+      aux[threadIdx.x] += in[threadIdx.x + i * THREADS];
     }
   }
 
@@ -83,432 +63,484 @@ __global__ void kernel_sum_vec(const float *data, float *sum, int n) {
   }
 
   __syncthreads();
-  *sum = aux[0];
+  *out = aux[0];
 }
 
-__global__ void kernel_sum_col(const float *src_mat_data, float *dst_vec_data,
-                               int rows, int cols, int stride) {
-  int index = blockIdx.x * blockDim.x + threadIdx.x;
-  int num_threads = blockDim.x * gridDim.x;
-  for (; index < rows; index += num_threads) {
-    dst_vec_data[index] = 0.0f;
-    for (int k = 0; k < cols; k++) {
-      dst_vec_data[index] += src_mat_data[index * stride + k];
-    }
+__global__ void KernelAdd(const size_t n, const float *in1, const float *in2,
+                          float *out) {
+  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n;
+       i += blockDim.x * gridDim.x) {
+    out[i] = in1[i] + in2[i];
   }
 }
 
-__global__ void kernel_sum_row(const float *src_mat_data, float *dst_vec_data,
-                               int rows, int cols, int stride) {
-  int j = blockIdx.x;
-  int THREADS = blockDim.x;
-  if (j >= cols) {
-    return;
-  }
-
-  __shared__ float aux[CU1DBLOCK];
-  int steps = (rows - 1) / THREADS + 1;
-  aux[threadIdx.x] = src_mat_data[j + threadIdx.x * stride];
-  for (int i = 1; i < steps; ++i) {
-    if (threadIdx.x + i * THREADS < rows) {
-      aux[threadIdx.x] +=
-          src_mat_data[j + (threadIdx.x + i * THREADS) * stride];
-    }
+__global__ void KernelAdd(const size_t n, const float *in, const float x,
+                          float *out) {
+  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n;
+       i += blockDim.x * gridDim.x) {
+    out[i] = in[i] + x;
   }
+}
 
-  int total_threads = THREADS;
-  __syncthreads();
-  while (total_threads > 1) {
-    int half_point = ((1 + total_threads) >> 1);
-    if (threadIdx.x < half_point) {
-      if (threadIdx.x + half_point < total_threads) {
-        aux[threadIdx.x] += aux[threadIdx.x + half_point];
-      }
-    }
-    __syncthreads();
-    total_threads = ((total_threads + 1) >> 1);
+__global__ void KernelSub(const size_t n, const float *in1, const float *in2,
+                          float *out) {
+  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n;
+       i += blockDim.x * gridDim.x) {
+    out[i] = in1[i] - in2[i];
   }
-
-  __syncthreads();
-  dst_vec_data[j] = aux[0];
 }
 
-__global__ void kernel_add_vec_row(const float *src_vec_data,
-                                   const float *src_mat_data,
-                                   float *des_mat_data, int rows, int cols,
-                                   int stride) {
-  int i = blockIdx.x * blockDim.x + threadIdx.x;
-  int j = blockIdx.y * blockDim.y + threadIdx.y;
-  int num_threads_x = blockDim.x * gridDim.x;
-  int num_threads_y = blockDim.y * gridDim.y;
-  int index = 0;
-  for (; i < cols && j < rows; i += num_threads_x, j += num_threads_y) {
-    index = j * stride + i;
-    des_mat_data[index] = src_mat_data[index] + src_vec_data[i];
+__global__ void KernelExp(const size_t n, const float *in, float *out) {
+  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n;
+       i += blockDim.x * gridDim.x) {
+    out[i] = std::exp(in[i]);
   }
 }
-__global__ void kernel_add(const float *src1, const float *src2, float *out,
-                           int n) {
-  int index = blockIdx.x * blockDim.x + threadIdx.x;
-  int num_threads = blockDim.x * gridDim.x;
-  for (; index < n; index += num_threads) {
-    out[index] = src1[index] + src2[index];
+
+__global__ void KernelLog(const size_t n, const float *in, float *out) {
+  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n;
+       i += blockDim.x * gridDim.x) {
+    out[i] = std::log(in[i]);
   }
 }
 
-__global__ void kernel_sub(const float *src1, const float *src2, float *out,
-                           int n) {
-  int index = blockIdx.x * blockDim.x + threadIdx.x;
-  int num_threads = blockDim.x * gridDim.x;
-  for (; index < n; index += num_threads) {
-    out[index] = src1[index] - src2[index];
+__global__ void KernelSigmoid(const size_t n, const float *in, float *out) {
+  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n;
+       i += blockDim.x * gridDim.x) {
+    out[i] = 1.0f / (1.0f + expf(-in[i]));
   }
 }
-__global__ void kernel_exp(const float *src_data, float *des_data, int n) {
-  int index = blockIdx.x * blockDim.x + threadIdx.x;
-  int num_threads = blockDim.x * gridDim.x;
-  for (; index < n; index += num_threads) {
-    des_data[index] = std::exp(src_data[index]);
+__global__ void KernelSign(const size_t n, const float *in, float *out) {
+  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n;
+       i += blockDim.x * gridDim.x) {
+    if (in[i] > 0.0f)
+      out[i] = 1.0f;
+    else if (in[i] < 0.0f)
+      out[i] = -1.0f;
+    else
+      out[i] = 0.0f;
   }
 }
 
-__global__ void kernel_log(const float *src_data, float *des_data, int n) {
-  int index = blockIdx.x * blockDim.x + threadIdx.x;
-  int num_threads = blockDim.x * gridDim.x;
-  for (; index < n; index += num_threads) {
-    des_data[index] = std::log(src_data[index]);
+__global__ void KernelClamp(const size_t n, const float low, const float high,
+                            const float *in, float *out) {
+  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n;
+       i += blockDim.x * gridDim.x) {
+    if (in[i] > high)
+      out[i] = high;
+    else if (in[i] < low)
+      out[i] = low;
+    else
+      out[i] = in[i];
   }
 }
 
-__global__ void kernel_sigmoid(const float *src_data, float *des_data, int n) {
-  int index = blockIdx.x * blockDim.x + threadIdx.x;
-  int num_threads = blockDim.x * gridDim.x;
-  for (; index < n; index += num_threads) {
-    des_data[index] = 1.0f / (1.0f + expf(-src_data[index]));
+__global__ void KernelRelu(const size_t n, const float *in, float *out) {
+  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n;
+       i += blockDim.x * gridDim.x) {
+    out[i] = max(in[i], 0.0f);
   }
 }
 
-__global__ void kernel_sigmoid_grad(const float *src_data, float *des_data,
-                                    int n) {
-  int index = blockIdx.x * blockDim.x + threadIdx.x;
-  int num_threads = blockDim.x * gridDim.x;
-  for (; index < n; index += num_threads) {
-    des_data[index] = src_data[index] * (1.0f - src_data[index]);
+__global__ void KernelAbs(const size_t n, const float *in, float *out) {
+  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n;
+       i += blockDim.x * gridDim.x) {
+    out[i] =  max(in[i], -in[i]);
   }
 }
 
-__global__ void kernel_relu(const float *src_data, float *des_data, int n) {
-  int index = blockIdx.x * blockDim.x + threadIdx.x;
-  int num_threads = blockDim.x * gridDim.x;
-  for (; index < n; index += num_threads) {
-    des_data[index] = max(src_data[index], 0.0f);
+__global__ void KernelTanh(const size_t n, const float *in, float *out) {
+  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n;
+       i += blockDim.x * gridDim.x) {
+    out[i] = tanhf(in[i]);
   }
 }
 
-__global__ void kernel_relu_grad(const float *src_data, float *des_data,
-                                 int n) {
-  int index = blockIdx.x * blockDim.x + threadIdx.x;
-  int num_threads = blockDim.x * gridDim.x;
-  for (; index < n; index += num_threads) {
-    des_data[index] = src_data[index] > 0.0f ? 1.0f : 0.0f;
+__global__ void KernelSoftplus(const size_t n, const float *in, float *out) {
+  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n;
+       i += blockDim.x * gridDim.x) {
+    out[i] = logf(1 + expf(in[i]));
   }
 }
-
-__global__ void kernel_tanh(const float *src_data, float *des_data, int n) {
-  int index = blockIdx.x * blockDim.x + threadIdx.x;
-  int num_threads = blockDim.x * gridDim.x;
-  for (; index < n; index += num_threads) {
-    des_data[index] = tanhf(src_data[index]);
+__global__ void KernelSquare(const size_t n, const float *in, float *out) {
+  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n;
+       i += blockDim.x * gridDim.x) {
+    out[i] = in[i] * in[i];
   }
 }
-
-__global__ void kernel_tanh_grad(const float *src_data, float *des_data,
-                                 int n) {
-  int index = blockIdx.x * blockDim.x + threadIdx.x;
-  int num_threads = blockDim.x * gridDim.x;
-  for (; index < n; index += num_threads) {
-    des_data[index] = (1.0f - src_data[index] * src_data[index]);
+__global__ void KernelSqrt(const size_t n, const float *in, float *out) {
+  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n;
+       i += blockDim.x * gridDim.x) {
+    out[i] = std::sqrt(in[i]);
   }
 }
 
-__global__ void kernel_softplus(const float *src_data, float *des_data, int n) {
-  int index = blockIdx.x * blockDim.x + threadIdx.x;
-  int num_threads = blockDim.x * gridDim.x;
-  for (; index < n; index += num_threads) {
-    des_data[index] = logf(1 + expf(src_data[index]));
+__global__ void KernelPow(const size_t n, const float *in1, const float *in2,
+                          float *out) {
+  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n;
+       i += blockDim.x * gridDim.x) {
+    out[i] = std::pow(in1[i], in2[i]);
   }
 }
 
-__global__ void kernel_softplus_grad(const float *src_data, float *des_data,
-                                     int n) {
-  int index = blockIdx.x * blockDim.x + threadIdx.x;
-  int num_threads = blockDim.x * gridDim.x;
-  for (; index < n; index += num_threads) {
-    des_data[index] = 1.0f / (1.0f + expf(-src_data[index]));
+__global__ void KernelPow(const size_t n, const float *in, const float x,
+                          float *out) {
+  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n;
+       i += blockDim.x * gridDim.x) {
+    out[i] = std::pow(in[i], x);
   }
 }
 
-__global__ void kernel_square(const float *src_data, float *des_data, int n) {
-  int index = blockIdx.x * blockDim.x + threadIdx.x;
-  int num_threads = blockDim.x * gridDim.x;
-  for (; index < n; index += num_threads) {
-    des_data[index] = src_data[index] * src_data[index];
+__global__ void KernelMult(const size_t n, const float *in1, const float *in2,
+                           float *out) {
+  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n;
+       i += blockDim.x * gridDim.x) {
+    out[i] = in1[i] * in2[i];
   }
 }
 
-__global__ void kernel_square_grad(const float *src_data, float *des_data,
-                                   int n) {
-  int index = blockIdx.x * blockDim.x + threadIdx.x;
-  int num_threads = blockDim.x * gridDim.x;
-  for (; index < n; index += num_threads) {
-    des_data[index] = 2 * src_data[index];
+__global__ void KernelMult(const size_t n, const float *in, const float x,
+                           float *out) {
+  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n;
+       i += blockDim.x * gridDim.x) {
+    out[i] = in[i] * x;
   }
 }
 
-__global__ void kernel_sqrt(const float *src_data, float *des_data, int n) {
-  int index = blockIdx.x * blockDim.x + threadIdx.x;
-  int num_threads = blockDim.x * gridDim.x;
-  for (; index < n; index += num_threads) {
-    des_data[index] = std::sqrt(src_data[index]);
+__global__ void KernelDiv(const size_t n, const float *in1, const float *in2,
+                          float *out) {
+  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n;
+       i += blockDim.x * gridDim.x) {
+    out[i] = in1[i] / in2[i];
   }
 }
-
-__global__ void kernel_pow(const float *src_data_a, const float *src_data_b,
-                           float *des_data, int n) {
-  int index = blockIdx.x * blockDim.x + threadIdx.x;
-  int num_threads = blockDim.x * gridDim.x;
-  for (; index < n; index += num_threads) {
-    des_data[index] = std::pow(src_data_a[index], src_data_b[index]);
+__global__ void KernelDiv(const size_t n, const float x, const float *in,
+                          float *out) {
+  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n;
+       i += blockDim.x * gridDim.x) {
+    out[i] = x / in[i];
   }
 }
-
-__global__ void kernel_mult(const float *src_data_a, const float *src_data_b,
-                            float *des_data, int n) {
-  int index = blockIdx.x * blockDim.x + threadIdx.x;
-  int num_threads = blockDim.x * gridDim.x;
-  for (; index < n; index += num_threads) {
-    des_data[index] = src_data_a[index] * src_data_b[index];
+__global__ static void KernelSet(const size_t n, const float x, float *out) {
+  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n;
+       i += blockDim.x * gridDim.x) {
+    out[i] = x;
   }
 }
 
-__global__ void kernel_mult(const float *src_data_a, const float x,
-                            float *des_data, int n) {
-  int index = blockIdx.x * blockDim.x + threadIdx.x;
-  int num_threads = blockDim.x * gridDim.x;
-  for (; index < n; index += num_threads) {
-    des_data[index] = src_data_a[index] * x;
+__global__ void KernelThreshold(const size_t n, const float x, const float *in,
+                                float *out) {
+  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n;
+       i += blockDim.x * gridDim.x) {
+    out[i] = in[i] < x ? 1.0f : 0.0f;
   }
 }
 
-__global__ void kernel_div(const float *src_data_a, const float *src_data_b,
-                           float *des_data, int n) {
-  int index = blockIdx.x * blockDim.x + threadIdx.x;
-  int num_threads = blockDim.x * gridDim.x;
-  for (; index < n; index += num_threads) {
-    des_data[index] = src_data_a[index] / src_data_b[index];
+__global__ void KernelGE(const int num, const float *in, const float x,
+                         float *out) {
+  for (size_t idx = blockIdx.x * blockDim.x + threadIdx.x; idx < num;
+       idx += blockDim.x * gridDim.x) {
+    out[idx] = in[idx] >= x ? 1.0f : 0.0f;
   }
 }
-
-__global__ static void kernel_set_value(float *data, float value, int n) {
-  int index = blockIdx.x * blockDim.x + threadIdx.x;
-  int num_threads = blockDim.x * gridDim.x;
-  for (; index < n; index += num_threads) {
-    data[index] = value;
+__global__ void KernelGT(const int num, const float *in, const float x,
+                         float *out) {
+  for (size_t idx = blockIdx.x * blockDim.x + threadIdx.x; idx < num;
+       idx += blockDim.x * gridDim.x) {
+    out[idx] = in[idx] > x ? 1.0f : 0.0f;
   }
 }
-
-__global__ void kernel_threshold(const float *src_data, float *des_data,
-                                 float alpha, int n) {
-  int index = blockIdx.x * blockDim.x + threadIdx.x;
-  int num_threads = blockDim.x * gridDim.x;
-  for (; index < n; index += num_threads) {
-    des_data[index] = src_data[index] < alpha ? 1.0f : 0.0f;
+__global__ void KernelLE(const int num, const float *in, const float x,
+                         float *out) {
+  for (size_t idx = blockIdx.x * blockDim.x + threadIdx.x; idx < num;
+       idx += blockDim.x * gridDim.x) {
+    out[idx] = in[idx] <= x ? 1.0f : 0.0f;
   }
 }
-void sum(int n, const float *in, float *out) {
-  int threads_per_block = n > CU1DBLOCK ? CU1DBLOCK : n;
-  //  here, we only need one block
-  int num_blocks = 1;
 
-  kernel_sum_vec << <num_blocks, threads_per_block>>> (in, out, n);
+__global__ void KernelLT(const int num, const float *in, const float x,
+                         float *out) {
+  for (size_t idx = blockIdx.x * blockDim.x + threadIdx.x; idx < num;
+       idx += blockDim.x * gridDim.x) {
+    out[idx] = in[idx] < x ? 1.0f : 0.0f;
+  }
 }
 
-void sum_row(int rows, int cols, int stride, const float *in, float *out) {
-  int threads_per_block = rows > CU1DBLOCK ? CU1DBLOCK : rows;
-  int num_blocks = cols;
+// ********************************
+// Functions call kernels
+// ********************************
 
-  kernel_sum_row << <num_blocks, threads_per_block>>>
-      (in, out, rows, cols, stride);
+void set(const size_t n, const float v, float *out, cudaStream_t s) {
+  KernelSet <<<ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (n, v, out);
 }
 
-void sum_col(int rows, int cols, int stride, const float *in, float *out) {
-  int threads_per_block = cols > CU1DBLOCK ? CU1DBLOCK : cols;
-  int num_blocks = rows;
+void abs(const size_t n, const float *in, float *out, cudaStream_t s) {
+  KernelAbs <<<ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (n, in, out);
+}
 
-  kernel_sum_col << <num_blocks, threads_per_block>>>
-      (in, out, rows, cols, stride);
+void sign(const size_t n, const float *in, float *out, cudaStream_t s) {
+  KernelSign <<<ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (n, in, out);
 }
-void add_row(int rows, int cols, int stride, const float *in_row,
-             const float *in_mat, float *out) {
-  dim3 threads_per_block(CU2DBLOCK_X, CU2DBLOCK_Y);
-  dim3 num_blocks(
-      cols / threads_per_block.x + (cols % threads_per_block.x == 0 ? 0 : 1),
-      rows / threads_per_block.y + (rows % threads_per_block.y == 0 ? 0 : 1));
-  kernel_add_vec_row << <num_blocks, threads_per_block>>>
-      (in_row, in_mat, out, rows, cols, stride);
+
+void exp(const size_t n, const float *in, float *out, cudaStream_t s) {
+  KernelExp <<<ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (n, in, out);
 }
-void add(int n, const float *a, const float *b, float *out) {
-  kernel_add << <ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (a, b, out, n);
+
+void log(const size_t n, const float *in, float *out, cudaStream_t s) {
+  KernelLog <<<ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (n, in, out);
 }
-void sub(int n, const float *a, const float *b, float *out) {
-  kernel_sub << <ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (a, b, out, n);
+
+void sqrt(const size_t n, const float *in, float *out, cudaStream_t s) {
+  KernelSqrt <<<ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (n, in, out);
 }
-void exp(int n, const float *in, float *out) {
-  kernel_exp << <ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (in, out, n);
+
+void square(const size_t n, const float *in, float *out, cudaStream_t s) {
+  KernelSquare <<<ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (n, in, out);
 }
 
-void log(int n, const float *in, float *out) {
-  kernel_log << <ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (in, out, n);
+void tanh(const size_t n, const float *in, float *out, cudaStream_t s) {
+  KernelTanh <<<ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (n, in, out);
 }
 
-void sigmoid(int n, const float *in, float *out) {
-  kernel_sigmoid << <ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (in, out, n);
+void relu(const size_t n, const float *in, float *out, cudaStream_t s) {
+  KernelRelu <<<ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (n, in, out);
+}
+void sigmoid(const int n, const float *in, float *out, cudaStream_t s) {
+  KernelSigmoid <<<ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (n, in, out);
+}
+void softplus(const size_t n, const float *in, float *out, cudaStream_t s) {
+  KernelSoftplus <<<ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (n, in, out);
+}
+void clamp(const size_t n, const float low, const float high, const float *in,
+           float *out, cudaStream_t s) {
+  KernelClamp <<<ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (n, low, high, in, out);
 }
 
-void sigmoid_grad(int n, const float *in, float *out) {
-  kernel_sigmoid_grad << <ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (in, out, n);
+void pow(const size_t n, const float *in, const float x, float *out,
+         cudaStream_t s) {
+  KernelPow <<<ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (n, in, x, out);
 }
 
-void relu(int n, const float *in, float *out) {
-  kernel_relu << <ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (in, out, n);
+void add(const size_t n, const float *in, const float x, float *out,
+         cudaStream_t s) {
+  KernelAdd <<<ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (n, in, x, out);
 }
 
-void relu_grad(int n, const float *in, float *out) {
-  kernel_relu_grad << <ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (in, out, n);
+void mult(const size_t n, const float *in, const float x, float *out,
+          cudaStream_t s) {
+  KernelMult <<<ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (n, in, x, out);
 }
 
-void tanh(int n, const float *in, float *out) {
-  kernel_tanh << <ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (in, out, n);
+void div(const size_t n, const float x, const float *in, float *out,
+          cudaStream_t s) {
+  KernelDiv <<<ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (n, x, in, out);
 }
 
-void tanh_grad(int n, const float *in, float *out) {
-  kernel_tanh_grad << <ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (in, out, n);
+void threshold(const size_t n, const float x, const float *in, float *out,
+               cudaStream_t s) {
+  KernelThreshold <<<ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (n, x, in, out);
 }
 
-void softplus(int n, const float *in, float *out) {
-  kernel_softplus << <ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (in, out, n);
+void gt(const size_t num, const float *in, const float x, float *out,
+        cudaStream_t s) {
+  KernelGT <<<ceil(num / CU1DBLOCKF), CU1DBLOCKF>>> (num, in, x, out);
+}
+void ge(const size_t num, const float *in, const float x, float *out,
+        cudaStream_t s) {
+  KernelGE <<<ceil(num / CU1DBLOCKF), CU1DBLOCKF>>> (num, in, x, out);
+}
+void lt(const size_t num, const float *in, const float x, float *out,
+        cudaStream_t s) {
+  KernelLT <<<ceil(num / CU1DBLOCKF), CU1DBLOCKF>>> (num, in, x, out);
+}
+void le(const size_t num, const float *in, const float x, float *out,
+        cudaStream_t s) {
+  KernelLE <<<ceil(num / CU1DBLOCKF), CU1DBLOCKF>>> (num, in, x, out);
 }
 
-void softplus_grad(int n, const float *in, float *out) {
-  kernel_softplus_grad << <ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (in, out, n);
+void pow(const size_t n, const float *in1, const float *in2, float *out,
+         cudaStream_t s) {
+  KernelPow <<<ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (n, in1, in2, out);
 }
 
-void square(int n, const float *in, float *out) {
-  kernel_square << <ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (in, out, n);
+void add(const size_t n, const float *in1, const float *in2, float *out,
+         cudaStream_t s) {
+  KernelAdd <<<ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (n, in1, in2, out);
 }
 
-void square_grad(int n, const float *in, float *out) {
-  kernel_square_grad << <ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (in, out, n);
+void sub(const size_t n, const float *in1, const float *in2, float *out,
+         cudaStream_t s) {
+  KernelSub <<<ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (n, in1, in2, out);
 }
 
-void sqrt(int n, const float *in, float *out) {
-  kernel_sqrt << <ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (in, out, n);
+void mult(const size_t n, const float *in1, const float *in2, float *out,
+          cudaStream_t s) {
+  KernelMult <<<ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (n, in1, in2, out);
 }
 
-void pow(int n, const float *a, const float *b, float *out) {
-  kernel_pow << <ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (a, b, out, n);
+void div(const size_t n, const float *in1, const float *in2, float *out,
+         cudaStream_t s) {
+  KernelDiv <<<ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (n, in1, in2, out);
 }
 
-void mult(int n, const float *a, const float *b, float *out) {
-  kernel_mult << <ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (a, b, out, n);
+void sum(const size_t n, const float *in, float *out, cudaStream_t s) {
+  int threads_per_block = n > CU1DBLOCK ? CU1DBLOCK : n;
+  //  here, we only need one block
+  int num_blocks = 1;
+  KernelSum <<<num_blocks, threads_per_block>>> (n, in, out);
+}
+/*
+void square_grad(int n, const float *in, float *out, cudaStream_t s) {
+  kernel_square_grad <<<ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (in, out, n);
 }
 
-void mult(int n, const float *a, const float x, float *out) {
-  kernel_mult << <ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (a, x, out, n);
+void tanh_grad(int n, const float *in, float *out, cudaStream_t s) {
+  kernel_tanh_grad <<<ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (in, out, n);
 }
 
-void div(int n, const float *a, const float *b, float *out) {
-  kernel_div << <ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (a, b, out, n);
+
+void relu_grad(int n, const float *in, float *out, cudaStream_t s) {
+  kernel_relu_grad <<<ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (in, out, n);
 }
 
-void set_value(int n, float v, float *out) {
-  kernel_set_value << <ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (out, v, n);
+
+void sigmoid_grad(int n, const float *in, float *out, cudaStream_t s) {
+  kernel_sigmoid_grad <<<ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (in, out, n);
 }
 
-void threshold(int n, float alpha, const float *in, float *out) {
-  kernel_threshold << <ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (in, out, alpha, n);
+void softplus_grad(int n, const float *in, float *out, cudaStream_t s) {
+  kernel_softplus_grad <<<ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (in, out, n);
 }
 
-// follow the consistency guide for math API
-__global__ void KernelDiv(const size_t num, const float alpha, const float *in,
-                          float *out) {
-  for (size_t idx = blockIdx.x * blockDim.x + threadIdx.x; idx < num;
-       idx += blockDim.x * gridDim.x) {
-    out[idx] = alpha / in[idx];
+
+__global__ void kernel_sum_col(const float *src_mat_data, float *dst_vec_data,
+                               int rows, int cols, int stride) {
+  int index = blockIdx.x * blockDim.x + threadIdx.x;
+  int num_threads = blockDim.x * gridDim.x;
+  for (; index < rows; index += num_threads) {
+    dst_vec_data[index] = 0.0f;
+    for (int k = 0; k < cols; k++) {
+      dst_vec_data[index] += src_mat_data[index * stride + k];
+    }
   }
 }
 
-__global__ void KernelGE(const int num, const float *in, const float x,
-                         float *out) {
-  for (size_t idx = blockIdx.x * blockDim.x + threadIdx.x; idx < num;
-       idx += blockDim.x * gridDim.x) {
-    out[idx] = in[idx] >= x ? 1.0f : 0.0f;
+__global__ void kernel_sum_row(const float *src_mat_data, float *dst_vec_data,
+                               int rows, int cols, int stride) {
+  int j = blockIdx.x;
+  int THREADS = blockDim.x;
+  if (j >= cols) {
+    return;
   }
-}
-__global__ void KernelGT(const int num, const float *in, const float x,
-                         float *out) {
-  for (size_t idx = blockIdx.x * blockDim.x + threadIdx.x; idx < num;
-       idx += blockDim.x * gridDim.x) {
-    out[idx] = in[idx] > x ? 1.0f : 0.0f;
+
+  __shared__ float aux[CU1DBLOCK];
+  int steps = (rows - 1) / THREADS + 1;
+  aux[threadIdx.x] = src_mat_data[j + threadIdx.x * stride];
+  for (int i = 1; i < steps; ++i) {
+    if (threadIdx.x + i * THREADS < rows) {
+      aux[threadIdx.x] +=
+          src_mat_data[j + (threadIdx.x + i * THREADS) * stride];
+    }
   }
-}
-__global__ void KernelLE(const int num, const float *in, const float x,
-                         float *out) {
-  for (size_t idx = blockIdx.x * blockDim.x + threadIdx.x; idx < num;
-       idx += blockDim.x * gridDim.x) {
-    out[idx] = in[idx] <= x ? 1.0f : 0.0f;
+
+  int total_threads = THREADS;
+  __syncthreads();
+  while (total_threads > 1) {
+    int half_point = ((1 + total_threads) >> 1);
+    if (threadIdx.x < half_point) {
+      if (threadIdx.x + half_point < total_threads) {
+        aux[threadIdx.x] += aux[threadIdx.x + half_point];
+      }
+    }
+    __syncthreads();
+    total_threads = ((total_threads + 1) >> 1);
   }
+
+  __syncthreads();
+  dst_vec_data[j] = aux[0];
 }
 
-__global__ void KernelLT(const int num, const float *in, const float x,
-                         float *out) {
-  for (size_t idx = blockIdx.x * blockDim.x + threadIdx.x; idx < num;
-       idx += blockDim.x * gridDim.x) {
-    out[idx] = in[idx] < x ? 1.0f : 0.0f;
+
+__global__ void kernel_add_vec_row(const float *src_vec_data,
+                                   const float *src_mat_data,
+                                   float *des_mat_data, int rows, int cols,
+                                   int stride) {
+  int i = blockIdx.x * blockDim.x + threadIdx.x;
+  int j = blockIdx.y * blockDim.y + threadIdx.y;
+  int num_threads_x = blockDim.x * gridDim.x;
+  int num_threads_y = blockDim.y * gridDim.y;
+  int index = 0;
+  for (; i < cols && j < rows; i += num_threads_x, j += num_threads_y) {
+    index = j * stride + i;
+    des_mat_data[index] = src_mat_data[index] + src_vec_data[i];
   }
 }
 
-__global__ void KernelSet(const size_t num, const float x, float *out) {
-  for (size_t idx = blockIdx.x * blockDim.x + threadIdx.x; idx < num;
-       idx += blockDim.x * gridDim.x) {
-    out[idx] = x;
+__global__ void kernel_sigmoid_grad(const float *src_data, float *des_data,
+                                    int n) {
+  int index = blockIdx.x * blockDim.x + threadIdx.x;
+  int num_threads = blockDim.x * gridDim.x;
+  for (; index < n; index += num_threads) {
+    des_data[index] = src_data[index] * (1.0f - src_data[index]);
   }
 }
 
-void Set(const size_t num, const float x, float *out, cudaStream_t s) {
-  KernelSet << <ceil(num / CU1DBLOCKF), CU1DBLOCKF>>> (num, x, out);
+
+__global__ void kernel_relu_grad(const float *src_data, float *des_data,
+                                 int n) {
+  int index = blockIdx.x * blockDim.x + threadIdx.x;
+  int num_threads = blockDim.x * gridDim.x;
+  for (; index < n; index += num_threads) {
+    des_data[index] = src_data[index] > 0.0f ? 1.0f : 0.0f;
+  }
 }
-void Div(const size_t num, float alpha, const float *in, float *out,
-         cudaStream_t s) {
-  KernelDiv << <ceil(num / CU1DBLOCKF), CU1DBLOCKF>>> (num, alpha, in, out);
+
+__global__ void kernel_tanh_grad(const float *src_data, float *des_data,
+                                 int n) {
+  int index = blockIdx.x * blockDim.x + threadIdx.x;
+  int num_threads = blockDim.x * gridDim.x;
+  for (; index < n; index += num_threads) {
+    des_data[index] = (1.0f - src_data[index] * src_data[index]);
+  }
 }
 
-void GT(const size_t num, const float *in, const float x, float *out,
-        cudaStream_t s) {
-  KernelGT << <ceil(num / CU1DBLOCKF), CU1DBLOCKF>>> (num, in, x, out);
+
+__global__ void kernel_softplus_grad(const float *src_data, float *des_data,
+                                     int n) {
+  int index = blockIdx.x * blockDim.x + threadIdx.x;
+  int num_threads = blockDim.x * gridDim.x;
+  for (; index < n; index += num_threads) {
+    des_data[index] = 1.0f / (1.0f + expf(-src_data[index]));
+  }
 }
-void GE(const size_t num, const float *in, const float x, float *out,
-        cudaStream_t s) {
-  KernelGE << <ceil(num / CU1DBLOCKF), CU1DBLOCKF>>> (num, in, x, out);
+__global__ void KernelSquareGrad(const float *src_data, float *des_data,
+                                   int n) {
+  int index = blockIdx.x * blockDim.x + threadIdx.x;
+  int num_threads = blockDim.x * gridDim.x;
+  for (; index < n; index += num_threads) {
+    des_data[index] = 2 * src_data[index];
+  }
 }
-void LT(const size_t num, const float *in, const float x, float *out,
-        cudaStream_t s) {
-  KernelLT << <ceil(num / CU1DBLOCKF), CU1DBLOCKF>>> (num, in, x, out);
+__global__ void kernel_softmax_loss(const float *prob, const int *label,
+                                    float *loss, int n, int dim) {
+  int index = blockIdx.x * blockDim.x + threadIdx.x;
+  int num_threads = blockDim.x * gridDim.x;
+  for (; index < n; index += num_threads) {
+    float prob_of_truth = prob[index * dim + label[index]];
+    loss[index] -= std::log(max(prob_of_truth, FLT_MIN));
+  }
 }
-void LE(const size_t num, const float *in, const float x, float *out,
-        cudaStream_t s) {
-  KernelLE << <ceil(num / CU1DBLOCKF), CU1DBLOCKF>>> (num, in, x, out);
+__global__ void kernel_softmax_gradient(float *grad, const int *label, int n,
+                                        int dim, float scale) {
+  int index = blockIdx.x * blockDim.x + threadIdx.x;
+  int num_threads = blockDim.x * gridDim.x;
+  for (; index < n; index += num_threads) {
+    int pos = index * dim + label[index];
+    grad[pos] = (grad[pos] - 1.0f) * scale;
+  }
 }
+*/
+
 
 }  // namespace cuda
 }  // namespace singa
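
One implementation note on KernelSum above: the sum() wrapper launches it with
a single block, the strided first phase folds all n inputs into at most
CU1DBLOCK shared-memory slots, and a tree reduction then halves the active
thread count each step. A minimal sketch of that second phase in isolation
(BlockReduce and BLOCK are illustrative names, not part of this commit):

  // Single-block shared-memory tree reduction, mirroring KernelSum's
  // second phase. Assumes exactly BLOCK inputs; KernelSum's first phase
  // is what folds larger n down to BLOCK partial sums.
  template <int BLOCK>
  __global__ void BlockReduce(const float *in, float *out) {
    __shared__ float aux[BLOCK];
    aux[threadIdx.x] = in[threadIdx.x];
    __syncthreads();
    for (int active = BLOCK; active > 1; active = (active + 1) / 2) {
      int half = (active + 1) / 2;
      if (threadIdx.x < half && threadIdx.x + half < active)
        aux[threadIdx.x] += aux[threadIdx.x + half];
      __syncthreads();
    }
    if (threadIdx.x == 0) *out = aux[0];
  }
  // e.g. BlockReduce<256><<<1, 256>>>(d_in, d_out);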

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/6d69047a/src/core/tensor/math_kernel.h
----------------------------------------------------------------------
diff --git a/src/core/tensor/math_kernel.h b/src/core/tensor/math_kernel.h
index 5c906a9..d8a58a5 100644
--- a/src/core/tensor/math_kernel.h
+++ b/src/core/tensor/math_kernel.h
@@ -31,65 +31,66 @@ namespace singa {
 
 // TODO(wangwei) make all function templates.
 namespace cuda {
-void sum(int n, const float *in, float *out);
 
-void sum_row(int rows, int cols, int stride, const float *in, float *out);
-
-void sum_col(int rows, int cols, int stride, const float *in, float *out);
-
-void add_row(int rows, int cols, int stride, const float *in_row,
-             const float *in_mat, float *out);
-
-void add(int n, const float *a, const float *b, float *out);
-
-void sub(int n, const float *a, const float *b, float *out);
-
-void exp(int n, const float *in, float *out);
-
-void log(int n, const float *in, float *out);
-
-void sigmoid(int n, const float *in, float *out);
-
-void sigmoid_grad(int n, const float *in, float *out);
-
-void relu(int n, const float *in, float *out);
-
-void relu_grad(int n, const float *in, float *out);
-
-void tanh(int n, const float *in, float *out);
-
-void tanh_grad(int n, const float *in, float *out);
+// 0 input
+void set(const size_t n, const float v, float *out, cudaStream_t s);
+
+// 1 input
+void abs(const size_t n, const float *in, float *out, cudaStream_t s);
+void sign(const size_t n, const float *in, float *out, cudaStream_t s);
+void exp(const size_t n, const float *in, float *out, cudaStream_t s);
+void log(const size_t n, const float *in, float *out, cudaStream_t s);
+void sqrt(const size_t n, const float *in, float *out, cudaStream_t s);
+void square(const size_t n, const float *in, float *out, cudaStream_t s);
+void tanh(const size_t n, const float *in, float *out, cudaStream_t s);
+void relu(const size_t n, const float *in, float *out, cudaStream_t s);
+void sigmoid(const int n, const float *in, float *out, cudaStream_t s);
+void softplus(const size_t n, const float *in, float *out, cudaStream_t s);
+void clamp(const size_t n, const float low, const float high, const float *in,
+           float *out, cudaStream_t s);
+
+void pow(const size_t n, const float *in, const float x, float *out,
+         cudaStream_t s);
 
-void softplus(int n, const float *in, float *out);
+void add(const size_t n, const float *in, const float x, float *out,
+         cudaStream_t s);
 
-void softplus_grad(int n, const float *in, float *out);
+void mult(const size_t n, const float *in, const float x, float *out,
+          cudaStream_t s);
 
-void square(int n, const float *in, float *out);
+void div(const size_t n, const float x, const float *in, float *out,
+         cudaStream_t s);
 
-void square_grad(int n, const float *in, float *out);
+void threshold(const size_t n, const float x, const float *in, float *out,
+               cudaStream_t s);
 
-void sqrt(int n, const float *in, float *out);
+void gt(const size_t num, const float *in, const float x, float *out,
+        cudaStream_t s);
+void ge(const size_t num, const float *in, const float x, float *out,
+        cudaStream_t s);
+void lt(const size_t num, const float *in, const float x, float *out,
+        cudaStream_t s);
+void le(const size_t num, const float *in, const float x, float *out,
+        cudaStream_t s);
 
-void pow(int n, const float *a, const float *b, float *out);
+// 2 inputs
+void pow(const size_t n, const float *in1, const float *in2, float *out,
+         cudaStream_t s);
 
-void mult(int n, const float *a, const float *b, float *out);
+void add(const size_t n, const float *in1, const float *in2, float *out,
+         cudaStream_t s);
 
-void mult(int n, const float *a, const float x, float *out);
+void sub(const size_t n, const float *in1, const float *in2, float *out,
+         cudaStream_t s);
 
-void div(int n, const float *a, const float *b, float *out);
+void mult(const size_t n, const float *in1, const float *in2, float *out,
+          cudaStream_t s);
 
-void set_value(int n, float v, float *out);
+void div(const size_t n, const float *in1, const float *in2, float *out,
+         cudaStream_t s);
 
-void threshold(int n, float alpha, const float *in, float *out);
+void sum(const size_t n, const float *in, float *out, cudaStream_t s);
 
-// follow the consistency guide for math API
-void Div(const size_t num, const float x, const float *in, float *out,
-         cudaStream_t s);
-void Set(const size_t num, const float x, float *out, cudaStream_t s);
-void GT(size_t num, const float *in, const float x, float *out, cudaStream_t s);
-void GE(size_t num, const float *in, const float x, float *out, cudaStream_t s);
-void LT(size_t num, const float *in, const float x, float *out, cudaStream_t s);
-void LE(size_t num, const float *in, const float x, float *out, cudaStream_t s);
 }  // cuda
 
 }  // namespace singa
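
A hedged usage sketch of the declarations above (buffer names are
hypothetical, and USE_CUDA is assumed to be defined; note that in this commit
the wrappers accept a cudaStream_t but still launch kernels on the default
stream):

  #include <cuda_runtime.h>
  #include "./math_kernel.h"  // relative path inside src/core/tensor

  // d_a, d_b, d_out are device pointers holding at least n floats each.
  void Example(const float *d_a, const float *d_b, float *d_out, size_t n,
               cudaStream_t stream) {
    singa::cuda::add(n, d_a, d_b, d_out, stream);    // d_out[i] = d_a[i] + d_b[i]
    singa::cuda::ge(n, d_out, 0.5f, d_out, stream);  // d_out[i] = d_out[i] >= 0.5
  }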

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/6d69047a/src/core/tensor/tensor.cc
----------------------------------------------------------------------
diff --git a/src/core/tensor/tensor.cc b/src/core/tensor/tensor.cc
index f4e9da2..e62386a 100644
--- a/src/core/tensor/tensor.cc
+++ b/src/core/tensor/tensor.cc
@@ -219,6 +219,8 @@ GenUnaryScalarArgMemberFn(operator+=, Add);
 GenUnaryScalarArgMemberFn(operator*=, EltwiseMult);
 GenUnaryScalarArgMemberFn(operator/=, Div);
 
+
+
 // ====================Tensor Operations=======================================
 void CopyDataToFrom(Tensor *dst, const Tensor &src, const size_t num,
                     const size_t dst_offset, const size_t src_offset) {
@@ -309,6 +311,18 @@ void CopyDataToFrom(Tensor *dst, const Tensor &src, const size_t num,
   } while (0)
 
 // =============Element-wise operations====================================
+/// L2 norm. Do not use the name Nrm2 (name conflict).
+float Tensor::L2() const {
+  float nrm = 0.0f;
+  TYPE_LANG_SWITCH(data_type_, DType, device_->lang(), Lang, {
+    device_->Exec([&nrm, this](Context *ctx) {
+      DType ret;
+      Nrm2<DType, Lang>(this->Size(), this->blob(), &ret, ctx);
+      nrm = TypeCast<DType, float>(ret);
+    }, {this->blob()}, {});
+  });
+  return nrm;
+}
 template <typename SType>
 void Tensor::SetValue(const SType x) {
   CHECK_EQ(sizeof(SType), SizeOf(data_type_));
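
A usage sketch for the new member (values are illustrative; it assumes the
Tensor(Shape) constructor and the SetValue member shown elsewhere in this
tree):

  singa::Tensor t(singa::Shape{4});
  t.SetValue(2.0f);        // t = [2, 2, 2, 2]
  float nrm = t.L2();      // Nrm2 via cblas/cublas: sqrt(4 * 2 * 2) = 4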

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/6d69047a/src/core/tensor/tensor_math.h
----------------------------------------------------------------------
diff --git a/src/core/tensor/tensor_math.h b/src/core/tensor/tensor_math.h
index b5d0ba9..b86e1cb 100644
--- a/src/core/tensor/tensor_math.h
+++ b/src/core/tensor/tensor_math.h
@@ -48,41 +48,45 @@ namespace singa {
 /// 7. Use size_t for the number of elements, rows or columns.
 /// 8. Use the same name for the Tensor and Blob level math functions.
 
-// =============Element-wise operations====================================
+// **************************************
+// Element-wise functions
+// **************************************
+
 /// out[i] = |in[i]|
 template <typename DType, typename Lang>
 void Abs(const size_t num, const Blob *in, Blob *out, Context *ctx) {
   LOG(FATAL) << "Abs Not Implemented";
 }
 
-/// out = in + x
+/// out[i] = in[i] + x
 template <typename DType, typename Lang>
 void Add(const size_t num, const Blob *in, const DType x, Blob *out,
          Context *ctx) {
   LOG(FATAL) << "Add Not Implemented";
 }
 
-/// out = in1 + in2
+/// out[i] = in1[i] + in2[i]
 template <typename DType, typename Lang>
 void Add(const size_t num, const Blob *in1, const Blob *in2, Blob *out,
          Context *ctx) {
   LOG(FATAL) << "Add-Pair Not Implemented";
 }
-/// Element-wise operation, clamp every element into [low, high]
-/// if x>high, then x=high; if x<low, then x=low.
+/// Clamp every element into [low, high]
+/// if in[i]>high, then out[i]=high; if in[i]<low, then out[i]=low.
 template <typename DType, typename Lang>
 void Clamp(const size_t num, const DType low, const DType high, const Blob *in,
            Blob *out, Context *ctx) {
   LOG(FATAL) << "Clamp Not Implemented";
 }
 
-/// out = x / in
+/// out[i] = x / in[i]
 template <typename DType, typename Lang>
 void Div(const size_t num, const DType x, const Blob *in, Blob *out,
          Context *ctx) {
   LOG(FATAL) << "Div Not Implemented";
 }
 
+/// out[i] = in[i] / x
 template <typename DType, typename Lang>
 void Div(const size_t num, const Blob *in, const DType x, Blob *out,
          Context *ctx) {
@@ -90,21 +94,21 @@ void Div(const size_t num, const Blob *in, const DType x, Blob *out,
   EltwiseMult<DType, Lang>(num, in, DType(1) / x, out, ctx);
 }
 
-/// out = in1 / in2
+/// out[i] = in1[i] / in2[i]
 template <typename DType, typename Lang>
 void Div(const size_t num, const Blob *in1, const Blob *in2, Blob *out,
          Context *ctx) {
   LOG(FATAL) << "Div-Pair Not Implemented";
 }
 
-/// out = in * x
+/// out[i] = in[i] * x
 template <typename DType, typename Lang>
 void EltwiseMult(const size_t num, const Blob *in, const DType x, Blob *out,
                  Context *ctx) {
   LOG(FATAL) << "EltwiseMult Not Implemented";
 }
 
-/// out = in2 * in2
+/// out[i] = in1[i] * in2[i]
 template <typename DType, typename Lang>
 void EltwiseMult(const size_t num, const Blob *in1, const Blob *in2, Blob *out,
                  Context *ctx) {
@@ -146,31 +150,32 @@ void GT(const size_t num, const Blob *in, const DType x, Blob *out,
         Context *ctx) {
   LOG(FATAL) << "GT Not Implemented";
 }
-/// Element-wise operation, do v^x for every v from the in tensor
+/// out[i] = pow(in[i], x)
 template <typename DType, typename Lang>
 void Pow(const size_t num, const Blob *in, const DType x, Blob *out,
          Context *ctx) {
   LOG(FATAL) << "Pow Not Implemented";
 }
 
-/// Element-wise operation, do v^x for every v from the lhs and every x from rhs
+/// out[i]=pow(in1[i], in2[i])
 template <typename DType, typename Lang>
 void Pow(const size_t num, const Blob *in1, const Blob *in2, Blob *out,
          Context *ctx) {
   LOG(FATAL) << "Pow-Pair Not Implemented";
 }
 
-/// Element-wise operation, out[i]=max(0, in[i])
+/// out[i]=max(0, in[i])
 template <typename DType, typename Lang>
 void ReLU(const size_t num, const Blob *in, Blob *out, Context *ctx) {
   LOG(FATAL) << "ReLU Not Implemented";
 }
 
+/// out[i] = x
 template <typename DType, typename Lang>
 void Set(const size_t num, const DType x, Blob *out, Context *ctx) {
   LOG(FATAL) << "Set Not Implemented";
 }
-/// Element-wise operation, out[i]=sigmoid([in[i])
+/// out[i]=sigmoid(in[i])
 template <typename DType, typename Lang>
 void Sigmoid(const size_t num, const Blob *in, Blob *out, Context *ctx) {
   LOG(FATAL) << "Sigmoid Not Implemented";
@@ -181,85 +186,47 @@ template <typename DType, typename Lang>
 void Sign(const size_t num, const Blob *in, Blob *out, Context *ctx) {
   LOG(FATAL) << "Sign Not Implemented";
 }
-/// Element-wise operation, out[i]=sqrt([in[i])
+/// out[i]=sqrt(in[i])
 template <typename DType, typename Lang>
 void Sqrt(const size_t num, const Blob *in, Blob *out, Context *ctx) {
   LOG(FATAL) << "Sqrt Not Implemented";
 }
 
-/// Element-wise operation, out[i]=square([in[i])
+/// out[i]=square(in[i])
 template <typename DType, typename Lang>
 void Square(const size_t num, const Blob *in, Blob *out, Context *ctx) {
-  LOG(FATAL) << "Square Not Implemented";
+  EltwiseMult<DType, Lang>(num, in, in, out, ctx);
 }
 
-/// out =  in - x
+/// out[i] =  in[i] - x
 template <typename DType, typename Lang>
 void Sub(const size_t num, const Blob *in, const DType x, Blob *out,
          Context *ctx) {
   Add<DType, Lang>(num, in, -x, out, ctx);
 }
 
-/// out = in1 - in2
+/// out[i] = in1[i] - in2[i]
 template <typename DType, typename Lang>
 void Sub(const size_t num, const Blob *in1, const Blob *in2, Blob *out,
          Context *ctx) {
   LOG(FATAL) << "Sub-Pair Not Implemented";
 }
+
 /// sum all elements of in into out
 template <typename DType, typename Lang>
 void Sum(const size_t num, const Blob *in, DType *out, Context *ctx) {
   LOG(FATAL) << "Sum Not Implemented";
 }
 
-/// Element-wise operation, out[i]=tanh([in[i])
+/// out[i]=tanh(in[i])
 template <typename DType, typename Lang>
 void Tanh(const size_t num, const Blob *in, Blob *out, Context *ctx) {
   LOG(FATAL) << "Tanh Not Implemented";
 }
 
-// =========== Matrix operations ===========================================
-/// Add the vector v to every column of A as the column of out
-template <typename DType, typename Lang>
-void AddCol(const size_t nrow, const size_t ncol, const Blob *A, const Blob *v,
-            Blob *out, Context *ctx) {
-  LOG(FATAL) << "AddCol Not Implemented";
-}
-// TODO(wangwei) unify AddRow and AddCol.
-/// Add the vector v to every row of A as the row of out
-template <typename DType, typename Lang>
-void AddRow(const size_t nrow, const size_t ncol, const Blob *A, const Blob *v,
-            Blob *out, Context *ctx) {
-  LOG(FATAL) << "AddRow Not Implemented";
-}
-/// outer-product.
-/// in1 and in2 are vectors of len m and n. out is matrix of shape m * n
-template <typename DType, typename Lang>
-void Outer(const size_t m, const size_t n, const Blob *in1, const Blob *in2,
-           Blob *out, Context *ctx) {
-  LOG(FATAL) << "Outer Not Implemented";
-}
-// Do softmax for each row invidually
-template <typename DType, typename Lang>
-void Softmax(const size_t nrow, const size_t ncol, const Blob *in, Blob *out,
-             Context *ctx) {
-  LOG(FATAL) << "Softmax Not Implemented";
-}
-/// Sum the columns of the in matrix into a vector
-template <typename DType, typename Lang>
-void SumColumns(const size_t nrow, const size_t ncol, const Blob *in, Blob *out,
-                Context *ctx) {
-  LOG(FATAL) << "SumColumns Not Implemented";
-}
-// TODO(wangwei) unify SumRow and SumCol.
-/// Sum the rows of the in matrix into a vector
-template <typename DType, typename Lang>
-void SumRows(const size_t nrow, const size_t ncol, const Blob *in, Blob *out,
-             Context *ctx) {
-  LOG(FATAL) << "SumRows Not Implemented";
-}
-
-// ================Random functions===========================================
+// **************************************
+// Random functions
+// **************************************
 /// Each element of out would be 1 with prob p and 0 with 1-p. 0<= p <= 1
 // Get the random generator from 'ctx'
 // If DType is not float, then convert the threshold to DType
@@ -282,7 +249,10 @@ void Uniform(const size_t num, const float low, const float high, Blob *out,
   LOG(FATAL) << "Uniform Not Implemented";
 }
 
-// ===== BLAS functions, ref to http://docs.nvidia.com/cuda/cublas
+// *********************************************************
+// BLAS functions, ref to http://docs.nvidia.com/cuda/cublas
+// *********************************************************
+
 /// return the index of the element with the max value.
 template <typename DType, typename Lang>
 void Amax(const size_t num, const Blob *in, size_t *out, Context *ctx) {
@@ -307,12 +277,19 @@ void Axpy(const size_t num, const DType alpha, const Blob *in, Blob *out,
   LOG(FATAL) << "Axpy Not Implemented";
 }
 
+/// out = ||in||_2, i.e., the L2 norm.
+template <typename DType, typename Lang>
+void Nrm2(const size_t num, const Blob *in, float *out, Context *ctx) {
+  LOG(FATAL) << "Nrm2 Not Implemented";
+}
+
 /// out *= x
 template <typename DType, typename Lang>
 void Scale(const size_t num, const DType x, Blob *out, Context *ctx) {
   LOG(FATAL) << "Scale Not Implemented";
 }
 
+/// inner product of array in1 and in2
 template <typename DType, typename Lang>
 void Dot(const size_t num, const Blob *in1, const Blob *in2, DType *out,
          Context *ctx) {
@@ -346,5 +323,44 @@ void GEMM(const bool transA, const bool transB, const size_t nrowA,
   LOG(FATAL) << "GEMM Not Implemented";
 }
 
+// **************************************
+// Matrix functions
+// **************************************
+/*
+/// Add the vector v to every column of A as the column of out
+template <typename DType, typename Lang>
+void AddCol(const size_t nrow, const size_t ncol, const Blob *A, const Blob *v,
+            Blob *out, Context *ctx) {
+  LOG(FATAL) << "AddCol Not Implemented";
+}
+// TODO(wangwei) unify AddRow and AddCol.
+/// Add the vector v to every row of A as the row of out
+template <typename DType, typename Lang>
+void AddRow(const size_t nrow, const size_t ncol, const Blob *A, const Blob *v,
+            Blob *out, Context *ctx) {
+  LOG(FATAL) << "AddRow Not Implemented";
+}
+/// outer-product.
+/// in1 and in2 are vectors of len m and n. out is matrix of shape m * n
+template <typename DType, typename Lang>
+void Outer(const size_t m, const size_t n, const Blob *in1, const Blob *in2,
+           Blob *out, Context *ctx) {
+  LOG(FATAL) << "Outer Not Implemented";
+}
+
+/// Sum the columns of the in matrix into a vector
+template <typename DType, typename Lang>
+void SumColumns(const size_t nrow, const size_t ncol, const Blob *in, Blob *out,
+                Context *ctx) {
+  LOG(FATAL) << "SumColumns Not Implemented";
+}
+// TODO(wangwei) unify SumRow and SumCol.
+/// Sum the rows of the in matrix into a vector
+template <typename DType, typename Lang>
+void SumRows(const size_t nrow, const size_t ncol, const Blob *in, Blob *out,
+             Context *ctx) {
+  LOG(FATAL) << "SumRows Not Implemented";
+}
+*/
 }  // namespace singa
 #endif  // SINGA_CORE_MATH_H_
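
The stubs above are the fallback half of a compile-time dispatch: callers such
as Tensor::L2() pick <DType, Lang> via TYPE_LANG_SWITCH, and each backend
overrides a stub with a full template specialization (tensor_math_cpp.h does
exactly that for Nrm2 below, delegating to cblas_snrm2). A naive sketch of
such a specialization, for illustration only:

  // Hypothetical scalar specialization; the committed version calls
  // cblas_snrm2 instead of looping by hand.
  template <>
  void Nrm2<float, lang::Cpp>(const size_t num, const Blob *in, float *out,
                              Context *ctx) {
    const float *inPtr = static_cast<const float *>(in->data());
    float sum = 0.f;
    for (size_t i = 0; i < num; i++) sum += inPtr[i] * inPtr[i];
    *out = std::sqrt(sum);
  }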

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/6d69047a/src/core/tensor/tensor_math_cpp.h
----------------------------------------------------------------------
diff --git a/src/core/tensor/tensor_math_cpp.h b/src/core/tensor/tensor_math_cpp.h
index 2c5c272..0b280a3 100644
--- a/src/core/tensor/tensor_math_cpp.h
+++ b/src/core/tensor/tensor_math_cpp.h
@@ -241,7 +241,7 @@ void Sqrt<float, lang::Cpp>(const size_t num, const Blob *in, Blob *out,
     outPtr[i] = sqrt(inPtr[i]);
   }
 }
-
+/*
 template <>
 void Square<float, lang::Cpp>(const size_t num, const Blob *in, Blob *out,
                               Context *ctx) {
@@ -251,6 +251,7 @@ void Square<float, lang::Cpp>(const size_t num, const Blob *in, Blob *out,
     outPtr[i] = inPtr[i] * inPtr[i];
   }
 }
+*/
 
 template <>
 void Sub<float, lang::Cpp>(const size_t num, const Blob *in1, const Blob *in2,
@@ -287,101 +288,6 @@ void Tanh<float, lang::Cpp>(const size_t num, const Blob *in, Blob *out,
   }
 }
 
-// =========Matrix operations ================================================
-
-template <>
-void AddCol<float, lang::Cpp>(const size_t nrow, const size_t ncol,
-                              const Blob *A, const Blob *v, Blob *out,
-                              Context *ctx) {
-  float *outPtr = static_cast<float *>(out->mutable_data());
-  const float *APtr = static_cast<const float *>(A->data());
-  const float *vPtr = static_cast<const float *>(v->data());
-  for (size_t r = 0; r < nrow; r++) {
-    size_t offset = r * ncol;
-    for (size_t c = 0; c < ncol; c++) {
-      outPtr[offset + c] = APtr[offset + c] + vPtr[r];
-    }
-  }
-}
-
-template <>
-void AddRow<float, lang::Cpp>(const size_t nrow, const size_t ncol,
-                              const Blob *A, const Blob *v, Blob *out,
-                              Context *ctx) {
-  float *outPtr = static_cast<float *>(out->mutable_data());
-  const float *APtr = static_cast<const float *>(A->data());
-  const float *vPtr = static_cast<const float *>(v->data());
-  for (size_t r = 0; r < nrow; r++) {
-    size_t offset = r * ncol;
-    for (size_t c = 0; c < ncol; c++) {
-      outPtr[offset + c] = APtr[offset + c] + vPtr[c];
-    }
-  }
-}
-template <>
-void Outer<float, lang::Cpp>(const size_t m, const size_t n, const Blob *in1,
-                             const Blob *in2, Blob *out, Context *ctx) {
-  float *outPtr = static_cast<float *>(out->mutable_data());
-  const float *in1Ptr = static_cast<const float *>(in1->data());
-  const float *in2Ptr = static_cast<const float *>(in2->data());
-  for (size_t r = 0; r < m; r++) {
-    size_t offset = r * n;
-    for (size_t c = 0; c < n; c++) {
-      outPtr[offset + c] = in1Ptr[r] * in2Ptr[c];
-    }
-  }
-}
-template <>
-void Softmax<float, lang::Cpp>(const size_t nrow, const size_t ncol,
-                               const Blob *in, Blob *out, Context *ctx) {
-  float *outPtr = static_cast<float *>(out->mutable_data());
-  const float *inPtr = static_cast<const float *>(in->data());
-  float *bPtr = new float[ncol];
-  for (size_t r = 0; r < nrow; r++) {
-    size_t offset = r * ncol;
-    float denom = 0.f;
-    for (size_t c = 0; c < ncol; c++) {
-      bPtr[c] = exp(inPtr[offset + c]);
-      denom += bPtr[c];
-    }
-    for (size_t c = 0; c < ncol; c++) {
-      size_t idx = offset + c;
-      outPtr[idx] = bPtr[c] / denom;
-    }
-  }
-  delete bPtr;
-}
-
-template <>
-void SumColumns<float, lang::Cpp>(const size_t nrow, const size_t ncol,
-                                  const Blob *in, Blob *out, Context *ctx) {
-  float *outPtr = static_cast<float *>(out->mutable_data());
-  const float *inPtr = static_cast<const float *>(in->data());
-  for (size_t c = 0; c < ncol; c++) {
-    outPtr[c] = 0.f;
-  }
-  for (size_t r = 0; r < nrow; r++) {
-    size_t offset = r * ncol;
-    for (size_t c = 0; c < ncol; c++) {
-      outPtr[c] += inPtr[offset + c];
-    }
-  }
-}
-
-template <>
-void SumRows<float, lang::Cpp>(const size_t nrow, const size_t ncol,
-                               const Blob *in, Blob *out, Context *ctx) {
-  float *outPtr = static_cast<float *>(out->mutable_data());
-  const float *inPtr = static_cast<const float *>(in->data());
-  for (size_t r = 0; r < nrow; r++) {
-    size_t offset = r * ncol;
-    outPtr[r] = 0.f;
-    for (size_t c = 0; c < ncol; c++) {
-      outPtr[r] += inPtr[offset + c];
-    }
-  }
-}
-
 // ===============Random operations==========================================
 template <>
 void Bernoulli<float, lang::Cpp>(const size_t num, const float p, Blob *out,
@@ -440,18 +346,26 @@ void DGMM<float, lang::Cpp>(const bool side_right, const size_t nrow,
 
 #ifdef USE_CBLAS
 template <>
+void Amax<float, lang::Cpp>(const size_t num, const Blob *in, size_t *out,
+                            Context *ctx) {
+  const float *inPtr = static_cast<const float *>(in->data());
+  *out = cblas_isamax(num, inPtr, 1);
+}
+
+template <>
+void Asum<float, lang::Cpp>(const size_t num, const Blob *in, float *out,
+                            Context *ctx) {
+  const float *inPtr = static_cast<const float *>(in->data());
+  *out = cblas_sasum(num, inPtr, 1);
+}
+
+template <>
 void Axpy<float, lang::Cpp>(const size_t num, const float alpha, const Blob *in,
                             Blob *out, Context *ctx) {
   const float *inPtr = static_cast<const float *>(in->data());
   float *outPtr = static_cast<float *>(out->mutable_data());
   cblas_saxpy(num, alpha, inPtr, 1, outPtr, 1);
 }
-template <>
-void Scale<float, lang::Cpp>(const size_t num, const float x, Blob *out,
-                             Context *ctx) {
-  float *outPtr = static_cast<float *>(out->mutable_data());
-  cblas_sscal(num, x, outPtr, 1);
-}
 
 template <>
 void Dot<float, lang::Cpp>(const size_t num, const Blob *in1, const Blob *in2,
@@ -461,6 +375,19 @@ void Dot<float, lang::Cpp>(const size_t num, const Blob *in1, const Blob *in2,
   *out = cblas_sdot(num, in1Ptr, 1, in2Ptr, 1);
 }
 template <>
+void Scale<float, lang::Cpp>(const size_t num, const float x, Blob *out,
+                             Context *ctx) {
+  float *outPtr = static_cast<float *>(out->mutable_data());
+  cblas_sscal(num, x, outPtr, 1);
+}
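+/// out = sqrt(\sum_i in[i]^2), i.e., the L2 norm of in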
+template <>
+void Nrm2<float, lang::Cpp>(const size_t num, const Blob *in, float *out,
+                            Context *ctx) {
+  const float *inPtr = static_cast<const float *>(in->data());
+  *out = cblas_snrm2(num, inPtr, 1);
+}
+
+template <>
 void GEMV<float, lang::Cpp>(bool trans, const size_t m, const size_t n,
                             const float alpha, const Blob *A, const Blob *v,
                             const float beta, Blob *out, Context *ctx) {
@@ -587,6 +514,102 @@ void GEMV<float, lang::Cpp>(bool trans, const size_t m, const size_t n,
 }
 
 #endif  // USE_CBLAS
+
+// =========Matrix operations ================================================
+/*
+template <>
+void AddCol<float, lang::Cpp>(const size_t nrow, const size_t ncol,
+                              const Blob *A, const Blob *v, Blob *out,
+                              Context *ctx) {
+  float *outPtr = static_cast<float *>(out->mutable_data());
+  const float *APtr = static_cast<const float *>(A->data());
+  const float *vPtr = static_cast<const float *>(v->data());
+  for (size_t r = 0; r < nrow; r++) {
+    size_t offset = r * ncol;
+    for (size_t c = 0; c < ncol; c++) {
+      outPtr[offset + c] = APtr[offset + c] + vPtr[r];
+    }
+  }
+}
+
+template <>
+void AddRow<float, lang::Cpp>(const size_t nrow, const size_t ncol,
+                              const Blob *A, const Blob *v, Blob *out,
+                              Context *ctx) {
+  float *outPtr = static_cast<float *>(out->mutable_data());
+  const float *APtr = static_cast<const float *>(A->data());
+  const float *vPtr = static_cast<const float *>(v->data());
+  for (size_t r = 0; r < nrow; r++) {
+    size_t offset = r * ncol;
+    for (size_t c = 0; c < ncol; c++) {
+      outPtr[offset + c] = APtr[offset + c] + vPtr[c];
+    }
+  }
+}
+template <>
+void Outer<float, lang::Cpp>(const size_t m, const size_t n, const Blob *in1,
+                             const Blob *in2, Blob *out, Context *ctx) {
+  float *outPtr = static_cast<float *>(out->mutable_data());
+  const float *in1Ptr = static_cast<const float *>(in1->data());
+  const float *in2Ptr = static_cast<const float *>(in2->data());
+  for (size_t r = 0; r < m; r++) {
+    size_t offset = r * n;
+    for (size_t c = 0; c < n; c++) {
+      outPtr[offset + c] = in1Ptr[r] * in2Ptr[c];
+    }
+  }
+}
+template <>
+void Softmax<float, lang::Cpp>(const size_t nrow, const size_t ncol,
+                               const Blob *in, Blob *out, Context *ctx) {
+  float *outPtr = static_cast<float *>(out->mutable_data());
+  const float *inPtr = static_cast<const float *>(in->data());
+  float *bPtr = new float[ncol];
+  for (size_t r = 0; r < nrow; r++) {
+    size_t offset = r * ncol;
+    float denom = 0.f;
+    for (size_t c = 0; c < ncol; c++) {
+      bPtr[c] = exp(inPtr[offset + c]);
+      denom += bPtr[c];
+    }
+    for (size_t c = 0; c < ncol; c++) {
+      size_t idx = offset + c;
+      outPtr[idx] = bPtr[c] / denom;
+    }
+  }
+  delete[] bPtr;
+}
+
+template <>
+void SumColumns<float, lang::Cpp>(const size_t nrow, const size_t ncol,
+                                  const Blob *in, Blob *out, Context *ctx) {
+  float *outPtr = static_cast<float *>(out->mutable_data());
+  const float *inPtr = static_cast<const float *>(in->data());
+  for (size_t c = 0; c < ncol; c++) {
+    outPtr[c] = 0.f;
+  }
+  for (size_t r = 0; r < nrow; r++) {
+    size_t offset = r * ncol;
+    for (size_t c = 0; c < ncol; c++) {
+      outPtr[c] += inPtr[offset + c];
+    }
+  }
+}
+
+template <>
+void SumRows<float, lang::Cpp>(const size_t nrow, const size_t ncol,
+                               const Blob *in, Blob *out, Context *ctx) {
+  float *outPtr = static_cast<float *>(out->mutable_data());
+  const float *inPtr = static_cast<const float *>(in->data());
+  for (size_t r = 0; r < nrow; r++) {
+    size_t offset = r * ncol;
+    outPtr[r] = 0.f;
+    for (size_t c = 0; c < ncol; c++) {
+      outPtr[r] += inPtr[offset + c];
+    }
+  }
+}
+*/
 }  // namespace singa
 
 #endif  // SINGA_CORE_TENSOR_TENSOR_MATH_CPP_H_
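
For reference, the relocated CBLAS wrappers above (Amax, Asum, Dot, Scale and
Nrm2) are thin adapters over level-1 BLAS routines. The following minimal,
standalone sketch shows the underlying calls and their expected results; it
assumes a CBLAS installation and omits the Blob/Context plumbing of the real
backend:

    #include <cblas.h>
    #include <cstdio>

    int main() {
      float x[4] = {3.f, -4.f, 0.f, 12.f};
      float y[4] = {1.f, 1.f, 1.f, 1.f};
      size_t imax = cblas_isamax(4, x, 1);      // 3: index of max |x[i]| (0-based)
      float asum  = cblas_sasum(4, x, 1);       // 19 = 3 + 4 + 0 + 12
      float dot   = cblas_sdot(4, x, 1, y, 1);  // 11 = 3 - 4 + 0 + 12
      float nrm2  = cblas_snrm2(4, x, 1);       // 13 = sqrt(9 + 16 + 0 + 144)
      cblas_sscal(4, 0.5f, y, 1);               // scale y in place: y[i] = 0.5
      printf("%zu %f %f %f %f\n", imax, asum, dot, nrm2, y[0]);
      return 0;
    }

Note that cblas_isamax returns a 0-based index, while cublasIsamax (used by
the CUDA backend below) returns a 1-based one, hence the "idx - 1" there.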

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/6d69047a/src/core/tensor/tensor_math_cuda.h
----------------------------------------------------------------------
diff --git a/src/core/tensor/tensor_math_cuda.h b/src/core/tensor/tensor_math_cuda.h
index f9841a3..e2597d5 100644
--- a/src/core/tensor/tensor_math_cuda.h
+++ b/src/core/tensor/tensor_math_cuda.h
@@ -24,105 +24,336 @@
 #include "./math_kernel.h"
 #include "singa/utils/cuda_utils.h"
 #include "singa/core/common.h"
+#include <cuda_runtime.h>
+#include <cublas_v2.h>
+#include "singa/utils/cuda_utils.h"
 
 namespace singa {
-// =================Elementwise operations===================================
+
+/// out[i] = |in[i]|
+template <>
+void Abs<float, lang::Cuda>(const size_t num, const Blob* in, Blob* out,
+                            Context* ctx) {
+  const float* inPtr = static_cast<const float*>(in->data());
+  float* outPtr = static_cast<float*>(out->mutable_data());
+  cuda::abs(num, inPtr, outPtr, ctx->stream);
+}
+/// out = in + x
+template <>
+void Add<float, lang::Cuda>(const size_t num, const Blob* in, const float x,
+                            Blob* out, Context* ctx) {
+  const float* inPtr = static_cast<const float*>(in->data());
+  float* outPtr = static_cast<float*>(out->mutable_data());
+  cuda::add(num, inPtr, x, outPtr, ctx->stream);
+}
+/// out = in1 + in2
+template <>
+void Add<float, lang::Cuda>(const size_t num, const Blob* in1, const Blob* in2,
+                            Blob* out, Context* ctx) {
+  const float* inPtr1 = static_cast<const float*>(in1->data());
+  const float* inPtr2 = static_cast<const float*>(in2->data());
+  float* outPtr = static_cast<float*>(out->mutable_data());
+  cuda::add(num, inPtr1, inPtr2, outPtr, ctx->stream);
+}
+/// Element-wise operation, clamp every element into [low, high]
+/// if x>high, then x=high; if x<low, then x=low.
+template <>
+void Clamp<float, lang::Cuda>(const size_t num, const float low,
+                              const float high, const Blob* in, Blob* out,
+                              Context* ctx) {
+  const float* inPtr = static_cast<const float*>(in->data());
+  float* outPtr = static_cast<float*>(out->mutable_data());
+  cuda::clamp(num, low, high, inPtr, outPtr, ctx->stream);
+}
+/// out = in1 / in2
+template <>
+void Div<float, lang::Cuda>(const size_t num, const Blob* in1, const Blob* in2,
+                            Blob* out, Context* ctx) {
+  const float* inPtr1 = static_cast<const float*>(in1->data());
+  const float* inPtr2 = static_cast<const float*>(in2->data());
+  float* outPtr = static_cast<float*>(out->mutable_data());
+  cuda::div(num, inPtr1, inPtr2, outPtr, ctx->stream);
+}
+
+template <>
+void Div<float, lang::Cuda>(const size_t num, const float x, const Blob* in,
+                            Blob* out, Context* ctx) {
+  const float* inPtr = static_cast<const float*>(in->data());
+  float* outPtr = static_cast<float*>(out->mutable_data());
+  cuda::div(num, x, inPtr, outPtr, ctx->stream);
+}
+
+/// out = in * x
+template <>
+void EltwiseMult<float, lang::Cuda>(const size_t num, const Blob* in,
+                                    const float x, Blob* out, Context* ctx) {
+  const float* inPtr = static_cast<const float*>(in->data());
+  float* outPtr = static_cast<float*>(out->mutable_data());
+  cuda::mult(num, inPtr, x, outPtr, ctx->stream);
+}
+/// out = in1 * in2
+template <>
+void EltwiseMult<float, lang::Cuda>(const size_t num, const Blob* in1,
+                                    const Blob* in2, Blob* out, Context* ctx) {
+  const float* inPtr1 = static_cast<const float*>(in1->data());
+  const float* inPtr2 = static_cast<const float*>(in2->data());
+  float* outPtr = static_cast<float*>(out->mutable_data());
+  cuda::mult(num, inPtr1, inPtr2, outPtr, ctx->stream);
+}
+/// Base is e. out[i]=e^in[i]
+template <>
+void Exp<float, lang::Cuda>(const size_t num, const Blob* in, Blob* out,
+                            Context* ctx) {
+  const float* inPtr = static_cast<const float*>(in->data());
+  float* outPtr = static_cast<float*>(out->mutable_data());
+  cuda::exp(num, inPtr, outPtr, ctx->stream);
+}
+
+template <>
+void GE<float, lang::Cuda>(const size_t num, const Blob* in, const float x,
+                           Blob* out, Context* ctx) {
+  float* outPtr = static_cast<float*>(out->mutable_data());
+  const float* inPtr = static_cast<const float*>(in->data());
+  cuda::ge(num, inPtr, x, outPtr, ctx->stream);
+}
+
+template <>
+void GT<float, lang::Cuda>(const size_t num, const Blob* in, const float x,
+                           Blob* out, Context* ctx) {
+  float* outPtr = static_cast<float*>(out->mutable_data());
+  const float* inPtr = static_cast<const float*>(in->data());
+  cuda::gt(num, inPtr, x, outPtr, ctx->stream);
+}
+
+template <>
+void LE<float, lang::Cuda>(const size_t num, const Blob* in, const float x,
+                           Blob* out, Context* ctx) {
+  float* outPtr = static_cast<float*>(out->mutable_data());
+  const float* inPtr = static_cast<const float*>(in->data());
+  cuda::le(num, inPtr, x, outPtr, ctx->stream);
+}
+
+/// Natural logarithm, base e (Euler's number): out[i] = ln(in[i]).
+template <>
+void Log<float, lang::Cuda>(const size_t num, const Blob* in, Blob* out,
+                            Context* ctx) {
+  const float* inPtr = static_cast<const float*>(in->data());
+  float* outPtr = static_cast<float*>(out->mutable_data());
+  cuda::log(num, inPtr, outPtr, ctx->stream);
+}
+template <>
+void LT<float, lang::Cuda>(const size_t num, const Blob* in, const float x,
+                           Blob* out, Context* ctx) {
+  float* outPtr = static_cast<float*>(out->mutable_data());
+  const float* inPtr = static_cast<const float*>(in->data());
+  cuda::lt(num, inPtr, x, outPtr, ctx->stream);
+}
+
+/// Element-wise operation, out[i] = in[i]^x
+template <>
+void Pow<float, lang::Cuda>(const size_t num, const Blob* in, const float x,
+                            Blob* out, Context* ctx) {
+  const float* inPtr = static_cast<const float*>(in->data());
+  float* outPtr = static_cast<float*>(out->mutable_data());
+  cuda::pow(num, inPtr, x, outPtr, ctx->stream);
+}
+/// Element-wise operation, out[i] = in1[i]^in2[i]
 template <>
-void Add<float, lang::Cuda>(const size_t num, const Blob *in1, const Blob *in2,
-                            Blob *out, Context *ctx) {
-  const float *in1Ptr = static_cast<const float *>(in1->data());
-  const float *in2Ptr = static_cast<const float *>(in2->data());
-  float *outPtr = static_cast<float *>(out->mutable_data());
-  cuda::add(num, in1Ptr, in2Ptr, outPtr);
+void Pow<float, lang::Cuda>(const size_t num, const Blob* in1, const Blob* in2,
+                            Blob* out, Context* ctx) {
+  const float* inPtr1 = static_cast<const float*>(in1->data());
+  const float* inPtr2 = static_cast<const float*>(in2->data());
+  float* outPtr = static_cast<float*>(out->mutable_data());
+  cuda::pow(num, inPtr1, inPtr2, outPtr, ctx->stream);
 }
 
-// follow the consistency guide of math API
+/// Element-wise operation, out[i]=max(0, in[i])
 template <>
-void Div<float, lang::Cuda>(const size_t num, const float x, const Blob *in,
-                            Blob *out, Context *ctx) {
-  float *outPtr = static_cast<float *>(out->mutable_data());
-  const float *inPtr = static_cast<const float *>(in->data());
-  cuda::Div(num, x, inPtr, outPtr, ctx->stream);
+void ReLU<float, lang::Cuda>(const size_t num, const Blob* in, Blob* out,
+                             Context* ctx) {
+  const float* inPtr = static_cast<const float*>(in->data());
+  float* outPtr = static_cast<float*>(out->mutable_data());
+  cuda::relu(num, inPtr, outPtr, ctx->stream);
 }
 
+/// out[i] = x
 template <>
-void EltwiseMult<float, lang::Cuda>(const size_t num, const Blob *in,
-                                    const float x, Blob *out, Context *ctx) {
-  float *outPtr = static_cast<float *>(out->mutable_data());
-  const float *inPtr = static_cast<const float *>(in->data());
-  cuda::mult(num, inPtr, x, outPtr);
+void Set<float, lang::Cuda>(const size_t num, const float x, Blob* out,
+                            Context* ctx) {
+  float* outPtr = static_cast<float*>(out->mutable_data());
+  cuda::set(num, x, outPtr, ctx->stream);
 }
+/// Element-wise operation, out[i] = sigmoid(in[i])
 template <>
-void GE<float, lang::Cuda>(const size_t num, const Blob *in, const float x,
-                           Blob *out, Context *ctx) {
-  float *outPtr = static_cast<float *>(out->mutable_data());
-  const float *inPtr = static_cast<const float *>(in->data());
-  cuda::GE(num, inPtr, x, outPtr, ctx->stream);
+void Sigmoid<float, lang::Cuda>(const size_t num, const Blob* in, Blob* out,
+                                Context* ctx) {
+  const float* inPtr = static_cast<const float*>(in->data());
+  float* outPtr = static_cast<float*>(out->mutable_data());
+  cuda::sigmoid(num, inPtr, outPtr, ctx->stream);
 }
+/// out[i] = sign(in[i])
 template <>
-void GT<float, lang::Cuda>(const size_t num, const Blob *in, const float x,
-                           Blob *out, Context *ctx) {
-  float *outPtr = static_cast<float *>(out->mutable_data());
-  const float *inPtr = static_cast<const float *>(in->data());
-  cuda::GT(num, inPtr, x, outPtr, ctx->stream);
+void Sign<float, lang::Cuda>(const size_t num, const Blob* in, Blob* out,
+                             Context* ctx) {
+  const float* inPtr = static_cast<const float*>(in->data());
+  float* outPtr = static_cast<float*>(out->mutable_data());
+  cuda::sign(num, inPtr, outPtr, ctx->stream);
 }
+
+/// Element-wise operation, out[i] = sqrt(in[i])
+template <>
+void Sqrt<float, lang::Cuda>(const size_t num, const Blob* in, Blob* out,
+                             Context* ctx) {
+  const float* inPtr = static_cast<const float*>(in->data());
+  float* outPtr = static_cast<float*>(out->mutable_data());
+  cuda::sqrt(num, inPtr, outPtr, ctx->stream);
+}
+
+/// Element-wise operation, out[i]=in[i]^2
 template <>
-void LE<float, lang::Cuda>(const size_t num, const Blob *in, const float x,
-                           Blob *out, Context *ctx) {
-  float *outPtr = static_cast<float *>(out->mutable_data());
-  const float *inPtr = static_cast<const float *>(in->data());
-  cuda::LE(num, inPtr, x, outPtr, ctx->stream);
+void Square<float, lang::Cuda>(const size_t num, const Blob* in, Blob* out,
+                               Context* ctx) {
+  const float* inPtr = static_cast<const float*>(in->data());
+  float* outPtr = static_cast<float*>(out->mutable_data());
+  cuda::square(num, inPtr, outPtr, ctx->stream);
 }
+/// out = in1 - in2
 template <>
-void LT<float, lang::Cuda>(const size_t num, const Blob *in, const float x,
-                           Blob *out, Context *ctx) {
-  float *outPtr = static_cast<float *>(out->mutable_data());
-  const float *inPtr = static_cast<const float *>(in->data());
-  cuda::LT(num, inPtr, x, outPtr, ctx->stream);
+void Sub<float, lang::Cuda>(const size_t num, const Blob* in1, const Blob* in2,
+                            Blob* out, Context* ctx) {
+  const float* inPtr1 = static_cast<const float*>(in1->data());
+  const float* inPtr2 = static_cast<const float*>(in2->data());
+  float* outPtr = static_cast<float*>(out->mutable_data());
+  cuda::sub(num, inPtr1, inPtr2, outPtr, ctx->stream);
 }
+
+/// Sum all elements of in into out
 template <>
-void Set<float, lang::Cuda>(const size_t num, const float x, Blob *out,
-                            Context *ctx) {
-  float *outPtr = static_cast<float *>(out->mutable_data());
-  cuda::Set(num, x, outPtr, ctx->stream);
+void Sum<float, lang::Cuda>(const size_t num, const Blob* in, float* out,
+                            Context* ctx) {
+  const float* inPtr = static_cast<const float*>(in->data());
+  cuda::sum(num, inPtr, out, ctx->stream);
 }
-// TODO(wangwei) optimize using stream
+
+/// Element-wise operation, out[i] = tanh(in[i])
 template <>
-void Square<float, lang::Cuda>(const size_t num, const Blob *in, Blob *out,
-                               Context *ctx) {
-  float *outPtr = static_cast<float *>(out->mutable_data());
-  const float *inPtr = static_cast<const float *>(in->data());
-  cuda::square(num, inPtr, outPtr);
+void Tanh<float, lang::Cuda>(const size_t num, const Blob* in, Blob* out,
+                             Context* ctx) {
+  const float* inPtr = static_cast<const float*>(in->data());
+  float* outPtr = static_cast<float*>(out->mutable_data());
+  cuda::tanh(num, inPtr, outPtr, ctx->stream);
 }
-// TODO(wangwei) optimize using stream
+
+// ================Random functions===========================================
+/// Each element of out is 1 with probability p and 0 with probability 1-p, 0 <= p <= 1
+// Get the random generator from 'ctx'
+// If DType is not float, then convert the threshold to DType
 template <>
-void Sub<float, lang::Cuda>(const size_t num, const Blob *in1, const Blob *in2,
-                            Blob *out, Context *ctx) {
-  float *outPtr = static_cast<float *>(out->mutable_data());
-  const float *in1Ptr = static_cast<const float *>(in1->data());
-  const float *in2Ptr = static_cast<const float *>(in2->data());
-  cuda::sub(num, in1Ptr, in2Ptr, outPtr);
+void Bernoulli<float, lang::Cuda>(const size_t num, const float p, Blob* out,
+                                  Context* ctx) {
+  auto rgen = ctx->curand_generator;
+  float* outPtr = static_cast<float*>(out->mutable_data());
+  CURAND_CHECK(curandGenerateUniform(rgen, outPtr, num));
+  cuda::threshold(num, p, outPtr, outPtr, ctx->stream);
 }
-// sum all elements of input into ret
-// TODO(wangwei) optimize using stream
+
+// The random generator should be extracted from ctx.
+// If DType is not float, then convert the low and high to DType
 template <>
-void Sum<float, lang::Cuda>(const size_t num, const Blob *in, float *out,
-                            Context *ctx) {
-  const float *inPtr = static_cast<const float *>(in->data());
-  cuda::sum(num, inPtr, out);
+void Uniform<float, lang::Cuda>(const size_t num, const float low,
+                                const float high, Blob* out, Context* ctx) {
+  auto rgen = ctx->curand_generator;
+  float* outPtr = static_cast<float*>(out->mutable_data());
+  CURAND_CHECK(curandGenerateUniform(rgen, outPtr, num));
+  cuda::mult(num, outPtr, high - low, outPtr, ctx->stream);
+  cuda::add(num, outPtr, low, outPtr, ctx->stream);
+}
+
+// The random generator should be extracted from ctx.
+// If DType is not float, then convert the mean and std to DType
+template <>
+void Gaussian<float, lang::Cuda>(const size_t num, const float mean,
+                                 const float std, Blob* out, Context* ctx) {
+  auto rgen = ctx->curand_generator;
+  float* outPtr = static_cast<float*>(out->mutable_data());
+  CURAND_CHECK(curandGenerateNormal(rgen, outPtr, num, mean, std));
 }
 
 // =========================Blas operations==================================
+// ref to http://docs.nvidia.com/cuda/cublas
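+/// Return the (0-based) index of the element with the largest absolute value.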
+template <>
+void Amax<float, lang::Cuda>(const size_t num, const Blob* in, size_t* out,
+                             Context* ctx) {
+  const float* inPtr = static_cast<const float*>(in->data());
+  auto handle = ctx->cublas_handle;  // TODO(wangwei) set cudastream
+  int idx = 1;
+  CUBLAS_CHECK(cublasIsamax(handle, num, inPtr, 1, &idx));
+  *out = idx - 1;  // cublas index starts from 1
+}
+
+/// Return the (0-based) index of the element with the smallest absolute value.
+template <>
+void Amin<float, lang::Cuda>(const size_t num, const Blob* in, size_t* out,
+                             Context* ctx) {
+  const float* inPtr = static_cast<const float*>(in->data());
+  auto handle = ctx->cublas_handle;  // TODO(wangwei) set cudastream
+  int idx = 1;
+  CUBLAS_CHECK(cublasIsamin(handle, num, inPtr, 1, &idx));
+  *out = idx - 1;
+}
+
+/// out = \sum_i |in[i]|
+template <>
+void Asum<float, lang::Cuda>(const size_t num, const Blob* in, float* out,
+                             Context* ctx) {
+  const float* inPtr = static_cast<const float*>(in->data());
+  auto handle = ctx->cublas_handle;  // TODO(wangwei) set cudastream
+  CUBLAS_CHECK(cublasSasum(handle, num, inPtr, 1, out));
+}
+
+/// out = alpha * in + out
+template <>
+void Axpy<float, lang::Cuda>(const size_t num, const float alpha,
+                             const Blob* in, Blob* out, Context* ctx) {
+  const float* inPtr = static_cast<const float*>(in->data());
+  float* outPtr = static_cast<float*>(out->mutable_data());
+  auto handle = ctx->cublas_handle;  // TODO(wangwei) set cudastream
+  CUBLAS_CHECK(cublasSaxpy(handle, num, &alpha, inPtr, 1, outPtr, 1));
+}
+
+/// out = \sum_i in1[i] * in2[i]
+template <>
+void Dot<float, lang::Cuda>(const size_t num, const Blob* in1, const Blob* in2,
+                            float* out, Context* ctx) {
+  const float* inPtr1 = static_cast<const float*>(in1->data());
+  const float* inPtr2 = static_cast<const float*>(in2->data());
+  auto handle = ctx->cublas_handle;  // TODO(wangwei) set cudastream
+  CUBLAS_CHECK(cublasSdot(handle, num, inPtr1, 1, inPtr2, 1, out));
+}
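+/// out = sqrt(\sum_i in[i]^2), i.e., the L2 norm of in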
+template <>
+void Nrm2<float, lang::Cuda>(const size_t num, const Blob* in, float* out,
+                             Context* ctx) {
+  auto handle = ctx->cublas_handle;  // TODO(wangwei) set cudastream
+  const float* inPtr = static_cast<const float*>(in->data());
+  CUBLAS_CHECK(cublasSnrm2(handle, num, inPtr, 1, out));
+}
+template <>
+void Scale<float, lang::Cuda>(const size_t num, const float x, Blob* out,
+                              Context* ctx) {
+  auto handle = ctx->cublas_handle;  // TODO(wangwei) set cudastream
+  float* outPtr = static_cast<float*>(out->mutable_data());
+  CUBLAS_CHECK(cublasSscal(handle, num, &x, outPtr, 1));
+}
 // NOTE: cublas uses column major order.
 // http://peterwittek.com/cublas-matrix-c-style.html
 template <>
 void DGMM<float, lang::Cuda>(const bool side_right, const size_t nrow,
-                             const size_t ncol, const Blob *M, const Blob *v,
-                             Blob *out, Context *ctx) {
+                             const size_t ncol, const Blob* M, const Blob* v,
+                             Blob* out, Context* ctx) {
   auto handle = ctx->cublas_handle;  // TODO(wangwei) set cudastream
-  const float *MPtr = static_cast<const float *>(M->data());
-  const float *vPtr = static_cast<const float *>(v->data());
-  float *outPtr = static_cast<float *>(out->mutable_data());
+  const float* MPtr = static_cast<const float*>(M->data());
+  const float* vPtr = static_cast<const float*>(v->data());
+  float* outPtr = static_cast<float*>(out->mutable_data());
   if (side_right) {
     CUBLAS_CHECK(cublasSdgmm(handle, CUBLAS_SIDE_LEFT, ncol, nrow, MPtr, ncol,
                              vPtr, 1, outPtr, ncol));
@@ -133,11 +364,11 @@ void DGMM<float, lang::Cuda>(const bool side_right, const size_t nrow,
 }
 template <>
 void GEMV<float, lang::Cuda>(bool trans, const size_t m, const size_t n,
-                             const float alpha, const Blob *A, const Blob *v,
-                             const float beta, Blob *out, Context *ctx) {
-  const float *APtr = static_cast<const float *>(A->data());
-  const float *vPtr = static_cast<const float *>(v->data());
-  float *outPtr = static_cast<float *>(out->mutable_data());
+                             const float alpha, const Blob* A, const Blob* v,
+                             const float beta, Blob* out, Context* ctx) {
+  const float* APtr = static_cast<const float*>(A->data());
+  const float* vPtr = static_cast<const float*>(v->data());
+  float* outPtr = static_cast<float*>(out->mutable_data());
   auto handle = ctx->cublas_handle;  // TODO(wangwei) set cudastream
   if (!trans)
     CUBLAS_CHECK(cublasSgemv(handle, CUBLAS_OP_T, n, m, &alpha, APtr, n, vPtr,
@@ -152,16 +383,16 @@ template <>
 void GEMM<float, lang::Cuda>(const bool transA, const bool transB,
                              const size_t nrowA, const size_t ncolB,
                              const size_t ncolA, const float alpha,
-                             const Blob *A, const Blob *B, const float beta,
-                             Blob *C, Context *ctx) {
+                             const Blob* A, const Blob* B, const float beta,
+                             Blob* C, Context* ctx) {
   auto transa = transA ? CUBLAS_OP_T : CUBLAS_OP_N;
   auto transb = transB ? CUBLAS_OP_T : CUBLAS_OP_N;
   int lda = transA ? nrowA : ncolA;
   int ldb = transB ? ncolA : ncolB;
   int ldc = ncolB;
-  const float *APtr = static_cast<const float *>(A->data());
-  const float *BPtr = static_cast<const float *>(B->data());
-  float *CPtr = static_cast<float *>(C->mutable_data());
+  const float* APtr = static_cast<const float*>(A->data());
+  const float* BPtr = static_cast<const float*>(B->data());
+  float* CPtr = static_cast<float*>(C->mutable_data());
   auto handle = ctx->cublas_handle;  // TODO(wangwei) set cudastream
   CUBLAS_CHECK(cublasSgemm(handle, transb, transa, ncolB, nrowA, ncolA, &alpha,
                            BPtr, ldb, APtr, lda, &beta, CPtr, ldc));
@@ -171,4 +402,3 @@ void GEMM<float, lang::Cuda>(const bool transA, const bool transB,
 
 #endif  // USE_CUDA
 #endif  // SINGA_CORE_TENSOR_TENSOR_MATH_CUDA_H_
-
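
As the NOTE in the CUDA file says, cuBLAS assumes column-major storage while
the tensors here are laid out row-major (C style), so GEMM<float, lang::Cuda>
computes the row-major product C = A * B as C^T = B^T * A^T by swapping the
operand order and dimensions. A standalone sketch (not SINGA code) that
verifies the swapped call, assuming CUDA and cuBLAS are installed, with error
checks trimmed for brevity:

    #include <cublas_v2.h>
    #include <cuda_runtime.h>
    #include <cstdio>

    int main() {
      // Row-major A (2x3) and B (3x2); expected C = A * B = {{22, 28}, {49, 64}}.
      const float hA[6] = {1, 2, 3, 4, 5, 6};
      const float hB[6] = {1, 2, 3, 4, 5, 6};
      float hC[4] = {0.f};
      float *dA, *dB, *dC;
      cudaMalloc(&dA, sizeof(hA));
      cudaMalloc(&dB, sizeof(hB));
      cudaMalloc(&dC, sizeof(hC));
      cudaMemcpy(dA, hA, sizeof(hA), cudaMemcpyHostToDevice);
      cudaMemcpy(dB, hB, sizeof(hB), cudaMemcpyHostToDevice);

      cublasHandle_t handle;
      cublasCreate(&handle);
      const float alpha = 1.f, beta = 0.f;
      const int m = 2, k = 3, n = 2;
      // Operands swapped: the (n x m) column-major result occupies the same
      // memory as the row-major (m x n) C, mirroring the GEMM wrapper above.
      cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, n, m, k, &alpha, dB, n,
                  dA, k, &beta, dC, n);

      cudaMemcpy(hC, dC, sizeof(hC), cudaMemcpyDeviceToHost);
      printf("C = [%g %g; %g %g]\n", hC[0], hC[1], hC[2], hC[3]);  // 22 28 49 64
      cublasDestroy(handle);
      cudaFree(dA); cudaFree(dB); cudaFree(dC);
      return 0;
    }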