Posted to commits@singa.apache.org by ka...@apache.org on 2015/11/16 07:08:53 UTC

[07/19] incubator-singa git commit: SINGA-80 New Blob Level and Address Level Math Operation Interface

SINGA-80 New Blob Level and Address Level Math Operation Interface

* Move files from the blob/ folder into the utils/ folder. Some of them may be split out of utils/ later.
* Remove Blob-level functions such as relu/softplus/sigmoid, since each is a single line of code that callers can write directly.
* CPU code now compiles successfully.

TODO: rebase onto the latest master and run the tests.
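
For illustration of the second bullet above: instead of the removed one-line wrappers, callers apply the generic Map with an op functor. A minimal sketch (my own, not part of the commit, and untested per the TODO; it assumes utils/singa_op.h still provides an op::Relu functor template):

    #include "singa/utils/blob.h"
    #include "singa/utils/math_blob.h"

    void ReluInline() {
      singa::Blob<float> src(4), dst(4);
      float* in = src.mutable_cpu_data();
      for (int i = 0; i < 4; ++i) in[i] = i - 2.0f;  // {-2, -1, 0, 1}
      // Replaces the removed Relu(xpu, A, B) wrapper with a direct Map call.
      singa::Map<singa::op::Relu<float>>(singa::cpu, src, &dst);  // dst = {0, 0, 0, 1}
    }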


Project: http://git-wip-us.apache.org/repos/asf/incubator-singa/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-singa/commit/d3379cba
Tree: http://git-wip-us.apache.org/repos/asf/incubator-singa/tree/d3379cba
Diff: http://git-wip-us.apache.org/repos/asf/incubator-singa/diff/d3379cba

Branch: refs/heads/master
Commit: d3379cbaff99fce26f9b06301cc0d63a56b062ee
Parents: 4b84dbe
Author: Wei Wang <wa...@comp.nus.edu.sg>
Authored: Mon Nov 9 16:57:33 2015 +0800
Committer: Wei Wang <wa...@comp.nus.edu.sg>
Committed: Mon Nov 9 17:04:48 2015 +0800

----------------------------------------------------------------------
 include/singa/blob/math_addr.h    | 206 ------------
 include/singa/blob/math_blob.h    | 597 ---------------------------------
 include/singa/blob/math_kernel.h  |  59 ----
 include/singa/blob/singa_op.h     | 350 -------------------
 include/singa/utils/blob.h        |  55 +--
 include/singa/utils/math_addr.h   | 206 ++++++++++++
 include/singa/utils/math_blob.h   | 487 +++++++++++++++++++++++++++
 include/singa/utils/math_kernel.h |  59 ++++
 include/singa/utils/singa_op.h    | 265 +++++++++++++++
 src/blob/math_kernel.cu           | 439 ------------------------
 src/blob/test.cc                  | 165 ---------
 src/test/test_math.cc             |  31 +-
 src/utils/blob.cc                 |  10 +-
 src/utils/math_kernel.cu          | 439 ++++++++++++++++++++++++
 14 files changed, 1512 insertions(+), 1856 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/d3379cba/include/singa/blob/math_addr.h
----------------------------------------------------------------------
diff --git a/include/singa/blob/math_addr.h b/include/singa/blob/math_addr.h
deleted file mode 100644
index 25cef07..0000000
--- a/include/singa/blob/math_addr.h
+++ /dev/null
@@ -1,206 +0,0 @@
-/************************************************************
-*
-* Licensed to the Apache Software Foundation (ASF) under one
-* or more contributor license agreements.  See the NOTICE file
-* distributed with this work for additional information
-* regarding copyright ownership.  The ASF licenses this file
-* to you under the Apache License, Version 2.0 (the
-* "License"); you may not use this file except in compliance
-* with the License.  You may obtain a copy of the License at
-*
-*   http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an
-* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-* KIND, either express or implied.  See the License for the
-* specific language governing permissions and limitations
-* under the License.
-*
-*************************************************************/
-
-#ifndef SINGA_BLOB_MATH_ADDR_H_
-#define SINGA_BLOB_MATH_ADDR_H_
-extern "C" {
-    #include <cblas.h>
-}
-#ifdef USE_GPU
-#include <cuda_runtime.h>
-#endif
-#include "singa/blob/singa_op.h"
-#ifdef USE_GPU
-#include "cublas_v2.h"
-#endif
-
-
-namespace singa {
-
-template<typename Dtype>
-void cpu_gemm(const Dtype * A, const Dtype * B,
-    const int m, const int n, const int k, const Dtype alpha, const Dtype beta,
-    const bool TranA, const bool TranB, Dtype * C) {
-  int lda, ldb;
-  CBLAS_TRANSPOSE tA, tB;
-  lda = TranA ? m : k;
-  ldb = TranB ? k : n;
-  tA = TranA ? CblasTrans : CblasNoTrans;
-  tB = TranB ? CblasTrans : CblasNoTrans;
-  cblas_sgemm(CblasRowMajor, tA, tB, m, n, k, alpha, A, lda,
-      B, ldb, beta, C, n);
-}
-
-// should be very careful:
-// m is the length of B, and n is the length of C , A is a n*m matrix
-template<typename Dtype>
-void cpu_gemv(const Dtype * A, const Dtype * B, const int m, const int n,
-    const Dtype alpha, const Dtype beta, const bool TranA, Dtype * C) {
-  int lda, ldb;
-  CBLAS_TRANSPOSE tA, tB;
-  lda = TranA ? m : k;
-  ldb = TranB ? k : n;
-  tA = TranA ? CblasTrans : CblasNoTrans;
-  tB = TranB ? CblasTrans : CblasNoTrans;
-  cblas_sgemm(CblasRowMajor, tA, tB, m, n, k, alpha, A, lda,
-      B, ldb, beta, C, n);
-
-}
-
-template<typename Dtype>
-void cpu_axpy(const Dtype * A, const int n, const Dtype alpha, Dtype * B) {
-  cblas_saxpy(n, alpha, A, 1, B, 1);
-}
-
-template<typename Dtype>
-Dtype cpu_dot(const Dtype * A, const Dtype * B, const int n) {
-  Dtype sum = 0;
-  for (int i = 0 ; i < n ; i++)
-    sum += A[i] * B[i];
-  return sum;
-}
-
-// element-wise
-template<typename Op, typename Dtype>
-void cpu_e_f(const int n, const Dtype alpha, Dtype * A) {
-  for (int i = 0 ; i < n ; i++) {
-    Op::Map(alpha, &A[i]);
-  }
-}
-
-template<typename Op, typename Dtype>
-void cpu_e_f(const int n, const Dtype * A, const Dtype alpha, Dtype * B) {
-  for (int i = 0 ; i < n ; i++) {
-    Op::Map(alpha, A[i], &B[i]);
-  }
-}
-
-template<typename Op, typename Dtype>
-void cpu_e_f(const int n, const Dtype * A, const Dtype * B,
-    const Dtype alpha, const Dtype beta, Dtype * C) {
-  for (int i = 0 ; i < n ; i++) {
-    Op::Map(alpha, beta, A[i], B[i], &C[i]);
-  }
-}
-// element-wise generalized operation defined in Op
-
-
-// matrix/vector expand/reduce
-
-template<typename Op, typename Dtype>
-void cpu_reduce_f(const Dtype * A, const int m, const int n, Dtype * B) {
-  for (int i = 0 ; i < m ; i++) {
-    Op::Map(A+i*n, n, B[i]);
-  }
-}
-// reduce each row of A to an element of B e.g. the sum operation in softmax
-template<typename Op, typename Dtype>
-void cpu_expand_f(const Dtype * A, const int m, const int n, Dtype * B) {
-  for (int i = 0 ; i < m ; i++) {
-    Op::Map(A[i], n, B+i*n);
-  }
-}
-// expand each element in A into a row of B
-
-#ifdef USE_GPU
-template<typename Dtype>
-void gpu_gemm(const Dtype * A, const Dtype * B, const int m, const int n,
-    const int k, const Dtype alpha, const Dtype beta, const bool TranA,
-    const bool TranB, Dtype * C) {
-  int lda = TranA ? m : k;
-  int ldb = TranB ? k : n;
-  int ldc = n;
-  cublasOperation_t tA = (TranA == false) ? CUBLAS_OP_N : CUBLAS_OP_T;
-  cublasOperation_t tB = (TranB == false) ? CUBLAS_OP_N : CUBLAS_OP_T;
-  cublasHandle_t handle;
-  cublasCreate(&handle);
-  cublasSgemm(handle, tB, tA, n, m, k, &alpha, B, ldb,
-      A, lda, &beta, C, ldc);
-  cublasDestroy(handle);
-}
-
-template<typename Dtype>
-void gpu_gemv(const Dtype * A, const Dtype * B, const int m, const int n,
-    const Dtype alpha, const Dtype beta, const bool TranA, Dtype * C) {
-  int lda = n;
-  cublasOperation_t tA = (TranA == true) ? CUBLAS_OP_N : CUBLAS_OP_T;
-  cublasHandle_t handle;
-  cublasCreate(&handle);
-  cublasSgemv(handle, tA, n, m, &alpha , A, lda, B, 1, &beta, C, 1);
-  cublasDestroy(handle);
-}
-
-template<typename Dtype>
-void gpu_axpy(const Dtype * A, const int n, const Dtype alpha, Dtype * B) {
-  cublasHandle_t handle;
-  cublasCreate(&handle);
-  cublasSaxpy(handle, n, &alpha, A, 1, B, 1);
-  cublasDestroy(handle);
-}
-
-template<typename Dtype>
-Dtype gpu_dot(const Dtype * A, const Dtype * B, const int n) {
-  cublasHandle_t handle;
-  cublasCreate(&handle);
-  Dtype result = 0.0;
-  cublasSdot(handle, n, A, 1, B, 1, &result);
-  cublasDestroy(handle);
-  return result;
-}
-
-// element-wise
-template<typename Op, typename Dtype>
-void gpu_e_f(const int n, const Dtype alpha, Dtype * A) {
-  Op::CudaMap(alpha, A, n);
-}
-
-template<typename Op, typename Dtype>
-void gpu_e_f(const int n, const Dtype * A, const Dtype alpha, Dtype * B) {
-  Op::CudaMap(alpha, A, B, n);
-}
-
-template<typename Op, typename Dtype>
-void gpu_e_f(const int n, const Dtype * A, const Dtype * B,
-    const Dtype alpha, const Dtype beta, Dtype * C) {
-  Op::CudaMap(alpha, beta, A, B, C, n);
-}
-// element-wise generalized operation defined in Op
-
-// matrix/vector expand/reduce
-
-template<typename Op, typename Dtype>
-void gpu_reduce_f(const Dtype * A, const int m, const int n, Dtype * B) {
-  for (int i = 0 ; i < m ; i++) {
-    Op::CudaMap(A+i*n, n, B[i]);
-  }
-}
-// reduce each row of A to an element of B e.g. the sum operation in softmax
-template<typename Op, typename Dtype>
-void gpu_expand_f(const Dtype * A, const int m, const int n, Dtype * B) {
-  for (int i = 0 ; i < m ; i++) {
-    Op::CudaMap(A[i], n, B+i*n);
-  }
-}
-// expand each element in A into a row of B
-#endif  // USE_GPU
-
-}  // namespace singa
-#endif  // SINGA_BLOB_MATH_ADDR_H_
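
This file moves verbatim to include/singa/utils/math_addr.h later in this diff. As a usage sketch of the address-level helpers (my own example, not from the commit; it assumes a CBLAS implementation is linked):

    #include "singa/utils/math_addr.h"

    void AxpyDotExample() {
      const float a[3] = {1, 2, 3};
      float b[3] = {1, 1, 1};
      singa::cpu_axpy<float>(a, 3, 2.0f, b);     // b = 2*a + b = {3, 5, 7}
      float d = singa::cpu_dot<float>(a, b, 3);  // 1*3 + 2*5 + 3*7 = 34
    }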

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/d3379cba/include/singa/blob/math_blob.h
----------------------------------------------------------------------
diff --git a/include/singa/blob/math_blob.h b/include/singa/blob/math_blob.h
deleted file mode 100644
index f3147e8..0000000
--- a/include/singa/blob/math_blob.h
+++ /dev/null
@@ -1,597 +0,0 @@
-/************************************************************
-*
-* Licensed to the Apache Software Foundation (ASF) under one
-* or more contributor license agreements.  See the NOTICE file
-* distributed with this work for additional information
-* regarding copyright ownership.  The ASF licenses this file
-* to you under the Apache License, Version 2.0 (the
-* "License"); you may not use this file except in compliance
-* with the License.  You may obtain a copy of the License at
-*
-*   http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an
-* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-* KIND, either express or implied.  See the License for the
-* specific language governing permissions and limitations
-* under the License.
-*
-*************************************************************/
-
-#ifndef SINGA_BLOB_MATH_BLOB_H_
-#define SINGA_BLOB_MATH_BLOB_H_
-
-#include <vector>
-#include "singa/utils/blob.h"
-#include "singa/blob/singa_op.h"
-#include "singa/blob/math_addr.h"
-
-
-namespace singa {
-enum XPU {cpu, gpu, any};
-
-/************* BLAS level 1 *****************/
-/**
- * Scale each element of A with alpha, and put the result into B.
- * Bi = alpha*Ai
- * Use blas scale internally.
- */
-template<typename Dtype>
-void Scale(xpu xpu, Dtype alpha, const Blob<Dtype> & A, Blob<Dtype> * B) {
-  CHECK_EQ(A.count(), B->count());
-  if (xpu == cpu)
-    cpu_scale(A.count(), alpha, A.cpu_data(), B->mutable_cpu_data());
-#ifdef USE_GPU
-#endif
-}
-
-/**
- * Element-wise operation: Bi = alpha*Ai+Bi. A and B should have the same size
- */
-template<typename Dtype>
-void AXPY(XPU xpu, Dtype alpha, const Blob<Dtype> & A, Blob<Dtype> * B) {
-  CHECK_EQ(A.count(), B.count());
-  if (xpu == cpu) {
-    cpu_axpy(A.cpu_data(), A.count(),
-        alpha, B->mutable_cpu_data());
-  }
-#ifdef USE_GPU
-  if (xpu == gpu) {
-    gpu_axpy(A.gpu_data(), A.count(),
-        alpha, B->mutable_gpu_data());
-  }
-#endif  // USE_GPU
-}
-
-/************* BLAS level 2 *****************/
-/**
- * Matrix vector multiplication, C = alpha A(.T) * B + beta C.
- * Strict shape checking:
- * - dim of A ==2
- *   columsn of A(.T) == B.count()
- * - rows of A(.T) == C.count()
- *
- * @param[in] alpha
- * @param[in] beta
- * @param[in] A, matrix
- * @param[in] B, vector
- * @param[in, out] C, vector
- */
-template<typename Dtype>
-void GEMV(XPU, xpu, Dtype alpha, Dtype beta, const Blob<Dtype>& A,
-    const Blob<Dtype>& B, Blob<Dtype>* C) {
-  CHECK_EQ(A.shape().size(), 2) << "A must be a matrix";
-  int a1, a2, m, n;
-  a1 = A.transpose() ? A.shape(1) : A.shape(0);
-  a2 = A.transpose() ? A.shape(0) : A.shape(1);
-  m = B.count();
-  n = C->count();
-  CHECK_EQ(a2, m) << "# columns of A(.T) must = length of B";
-  CHECK_EQ(a1, n) << "# rows of A(.T) must = length of C";
-
-  bool TranA = A.transpose();
-  if (xpu == cpu) {
-    cpu_gemv(A.cpu_data(), B.cpu_data(), m, n, alpha, beta, TranA,
-        C->mutable_cpu_data());
-  }
-#ifdef USE_GPU
-  if (xpu == gpu) {
-    // gpu part
-    gpu_gemv(A.gpu_data(), B.gpu_data(), m, n, alpha, beta, TranA,
-        C->mutable_gpu_data());
-  }
-#endif  // USE_GPU
-}
-/**
- * Matrix vector multiplication, C = A(.T) * B, transpose is considered.
- * Loose shape checking:
- * - dim of A >=2
- * - A.count() % B.count() == 0
- * - B.count() == C.count()
- *
- * @param[in] A input matrix
- * @param[in] B input vector
- * @param[out] C output vector
- */
-template <typename Dtype>
-void MVDot(XPU xpu, const Blob<Dtype>& A, const Blob<Dtype>& B, Blob<Dtype>* C)
-{
-  GEMV(xpu, Dtype(1), Dtype(0), A, B, C);
-}
-
-/************* BLAS level 3 *****************/
-/**
- * Matrix multiplication, C = alpha A*B + beta C, A, B and C are matrix.
- *
- * Tranpose is considered for A and B.
- * Strict shape checking:
- * - all are matrix
- * - shapes match for matrix multiplication
- *
- * @param[in] alpha
- * @param[in] beta
- * @param[in] A, matrix
- * @param[in] B, matrix
- * @param[in, out] C, matrix
- */
-template <typename Dtype>
-void GEMM(XPU xpu, Dtype alpha, Dtype beta, const Blob<Dtype>& A,
-    const Blob<Dtype> & B, Blob<Dtype> * C) {
-  CHECK_EQ(A.shape().size(), 2);
-  CHECK_EQ(B.shape().size(), 2);
-  CHECK_EQ(C.shape().size(), 2);
-  int a1, a2, b1, b2, m, n;
-  CHECK(!C->transpose());
-  a1 = A.transpose() ? A.shape(1) : A.shape(0);
-  a2 = A.transpose() ? A.shape(0) : A.shape(1);
-  b1 = B.transpose() ? B.shape(1) : B.shape(0);
-  b2 = B.transpose() ? B.shape(0) : B.shape(1);
-  m = C->shape(0);
-  n = C->shape(1);
-  CHECK__EQ(a2, b1);
-  CHECK__EQ(a1, m);
-  CHECK__EQ(b2, n);
-
-  int k = A.transpose() ? A.shape(0) : A.shape(1);
-  bool TranA = A.transpose();
-  bool TranB = B.transpose();
-  if (xpu == cpu) {
-    cpu_gemm(A.cpu_data(), B.cpu_data(), m, n, k, alpha, beta,
-        TranA, TranB, C->mutable_cpu_data());
-  }
-#ifdef USE_GPU
-  if (xpu == gpu) {
-    // gpu part
-    gpu_gemm(A.gpu_data(), B.gpu_data(), m, n, k, alpha, beta,
-        TranA, TranB, C->mutable_gpu_data());
-  }
-#endif  // USE_GPU
-}
-/**
- * Matrix multiplication, C = A(.T) * B(.T), transpose is considered.
- * Strict shape checking:
- * - all are matrix
- * - shapes match for matrix multiplication
- *
- * @param[in] A input matrix
- * @param[in] B input matrix
- * @param[out] C output matrix
- */
-template <typename Dtype>
-void MMDot(XPU xpu, const Blob<Dtype>& A, const Blob<Dtype>& B, Blob<Dtype>* C)
-{
-  GEMM(xpu, Dtype(1), Dtype(0), A, B, C);
-}
-
-
-/*********************** Inner and Outer product****************************/
-/**
- * Inner product for two vectors.
- * Loose shape checking, A.count() == B.count.
- *
- * @param[in] A, input vector (shape checking using A.count()).
- * @param[in] B, input vector (shape checking using B.count()).
- * @return inner product value.
- */
-template <typename Dtype>
-Dtype VVDot(XPU xpu, const Blob<Dtype> & A, const Blob<Dtype> & B) {
-  Dtype res = 0;
-  CHECK_EQ(A.count(), B.count());
-  int n = A.count();
-  if (xpu == cpu) {
-    res = cpu_dot(A.cpu_data(), B.cpu_data(), n);
-  }
-#ifdef USE_GPU
-  if (xpu == gpu) {
-    // gpu part
-    res = gpu_dot(A.gpu_data(), B.gpu_data(), n);
-  }
-#endif  // USE_GPU
-  return res;
-}
-
-/**
- * Outer product, C = A ** B, transpose is disabled.
- * Loose shape checking, A.count() * B.count() == C.count()
- *
- * @param[in] A, input vector
- * @param[in] B, input vector
- * @param[out] C, output matrix
- */
-template <typename Dtype>
-void OuterProduct(XPU xpu, const Blob<Dtype>& A, const Blob<Dtype>& B,
-    Blob<Dtype> * C) {
-  CHECK(!C.transpose());  // do not support C.T now.
-
-  int m = A.count();
-  int n = B.count();
-  CHECK_EQ(C->count(), m * n);
-
-  if (xpu == cpu) {
-    cpu_gemm(A.cpu_data(), B.cpu_data(), m, n, 1, 1, 0,
-        false, false, C->mutable_cpu_data());
-  }
-#ifdef USE_GPU
-  if (xpu == gpu) {
-    // gpu part
-    gpu_gemm(A.gpu_data(), B.gpu_data(), m, n, 1, 1, 0,
-        false, false, C->mutable_gpu_data());
-  }
-#endif  // USE_GPU
-}
-/*********************** Element-wise functions ***********************/
-/**
- * Apply the function from Op for each element in A and put the result into B,
- * i.e., Bi = Op(Ai).
- * Loose shape checking, A.count() == B.count().
- */
-template<typename Op, typename Dtype>
-void Map(XPU xpu, const Blob<Dtype> & A, Blob<Dtype> * B) {
-  CHECK_EQ(A.count(), B->count()) << "Blobs must have the same size";
-  if (xpu == cpu) {
-    cpu_e_f<Op>(A.count(), A.cpu_data(), B->mutable_cpu_data());
-  }
-#ifdef SINGA_GPU
-  if (xpu == gpu) {
-    // gpu part
-    gpu_e_f<Op>(A.count(), A.gpu_data(), B->mutable_gpu_data());
-  }
-#endif  // SINGA_GPU
-}
-
-/**
- * Apply the function from Op for each element in A and B, and put the result
- * into C, i.e., Ci = Op(Ai, Bi).
- * Loose shape checking, A, B and C are of the same size.
- */
-template<typename Op, typename Dtype>
-void Map(XPU xpu, const Blob<Dtype> & A, const Blob<Dtype> & B,
-    Blob<Dtype> * C) {
-  CHECK_EQ(A.count(), B.count()) << "Blobs must have the same size";
-  CHECK_EQ(A.count(), C->count()) << "Blobs must have the same size";
-  if (xpu == cpu) {
-    cpu_e_f<Op>(A.count(), A.cpu_data(), B.cpu_data(), C->mutable_cpu_data());
-  }
-#ifdef SINGA_GPU
-  if (xpu == gpu) {
-    // gpu part
-    gpu_e_f<Op>(A.count(), A.gpu_data(), B.gpu_data(), C->mutable_gpu_data());
-  }
-#endif  // SINGA_GPU
-}
-
-/**
- * Bi = Op(alpha, Ai)
- * Loose shape checking, A.count() == B.count().
- */
-template<typename Op, typename Dtype>
-void Map(XPU xpu, Dtype alpha, const Blob<Dtype>& A, const Blob<Dtype>* B) {
-  CHECK_EQ(A.count(), B->count()) << "Blobs must have the same size";
-  if (xpu == cpu) {
-    cpu_e_f<Op>(A.count(), alpha, A.cpu_data(), B->mutable_cpu_data());
-  }
-#ifdef SINGA_GPU
-#endif  // SINGA_GPU
-}
-/**
- * Ci = Op(alpha, Ai, Bi)
- * Loose shape checking, A, B and C are of the same size.
- */
-template<typename Op, typename Dtype>
-void Map(XPU xpu, Dtype alpha, const Blob<Dtype>& A, const Blob<Dtype>& B,
-    Blob<Dtype>* C) {
-  CHECK_EQ(A.count(), B->count()) << "Blobs must have the same size";
-  if (xpu == cpu) {
-    cpu_e_f<Op>(A.count(), alpha, A.cpu_data(), B->cpu_data(),
-        C->mutable_cpu_data());
-  }
-#ifdef SINGA_GPU
-#endif  // SINGA_GPU
-}
-
-/**
- * Currently use std::copy which has shown better performance than memcpy.
- * http://stackoverflow.com/questions/4707012/c-memcpy-vs-stdcopy
- * TODO(wangwei) test blas copy vs std::copy.
- *
- * Loose shape checking, A.count() == B.count().
- */
-template<typename Dtype>
-void Copy(XPU xpu, const Blob<Dtype>& A, const Blob<Dtype>* B) {
-  CHECK_EQ(A.count(), B->count()) << "Blobs must have the same size";
-  if (xpu == cpu)
-    std::copy(A.cpu_data(), A.cpu_data() + A.count(), B->mutable_cpu_data());
-  else {
-    LOG(FATAL) << "Not implemented";
-  }
-}
-
-/**
- * C = A + B
- * Implemented using Copy and AXPY.
- */
-template<typename Dtype>
-void Add(XPU xpu, const Blob<Dtype> & A, const Blob<Dtype> & B,
-    Blob<Dtype> * C) {
-  Copy(A, C);
-  AXPY(B, C, 1);
-}
-
-/**
- * C = A - B
- * Implemented using Copy and AXPY.
- */
-template<typename Dtype>
-void Sub(XPU xpu, const Blob<Dtype> & A, const Blob<Dtype> & B,
-    Blob<Dtype> * C) {
-  Copy(xpu, A, C);
-  AXPY(xpu, B, C, -1);
-}
-
-/**
- * C = A * B, implemented using
- * Map(XPU, const Blob<Dtype>&, const Blob<Dtype>&, Blob<Dtype>*).
- */
-template<typename Dtype>
-void Mult(XPU xpu, const Blob<Dtype> & A, const Blob<Dtype> & B,
-    Blob<Dtype> * C) {
-  Map<singa::op::Mult>(xpu, A, B, C);
-  // TODO(wangwei) use MKL's vector func
-}
-
-/**
- * C = A / B, implemented using
- * Map(XPU, const Blob<Dtype>&, const Blob<Dtype>&, Blob<Dtype>*).
- */
-template<typename Dtype>
-void Div(XPU xpu, const Blob<Dtype> & A, const Blob<Dtype> & B,
-    Blob<Dtype> * C) {
-  Map<singa::op::Div>(xpu, A, B, C);
-  // TODO(wangwei) use MKL's vector func
-}
-/*************************1D<-->2D op/transform***************************/
-/**
- * Add each row of B with A, i.e., Bij = alpha*Ai + beta*Bij
- * Loose shape checking, B.count() % A.count() == 0.
- * # rows of B = B.count() / A.count().
- * Transpose is disabled.
- */
-template<typename Dtype>
-void MVAdd(XPU xpu, Dtype alpha, Dtype beta, const Blob<Dtype> & A,
-    Blob<Dtype> * B) {
-  CHECK_EQ(B.count() % A.count(), 0) << "#col of B not match length of A";
-  int m = A.count(), n = B->count() / m;
-  if (xpu == cpu) {
-    Blob<Dtype> one(n);
-    one.SetValue(1);
-    cpu_gemm(A.cpu_data(), one.cpu_data(), m, n, 1, alpha, beta,
-        false, false, B->mutable_cpu_data());
-  }
-#ifdef USE_GPU
-  if (xpu == gpu) {
-    singa_gpu_add_vec_row(B->gpu_data(),
-        A.gpu_data(), A.gpu_data(), m, n, n);
-    // gpu part
-  }
-#endif  // USE_GPU
-}
-/**
- * Add each row of B with A, i.e., Bij = Ai + Bij
- * Loose shape checking, B.count() % A.count() == 0.
- * # rows of B = B.count() / A.count().
- * Transpose is disabled.
- */
-template<typename Dtype>
-void MVAdd(XPU xpu, const Blob<Dtype> & A, Blob<Dtype>* B) {
-  MVAdd(xpu, Dtype(1), Dtype(1), A, B);
-}
-
-/**
- * Copy A to each row of B
- * Loose shape checking, B.count() % A.count() == 0,
- * # rows of B = B.count() / A.count().
- * Transpose is disabled.
- */
-template<typename Dtype>
-void Repmat(XPU xpu, const Blob<Dtype> & A, Blob<Dtype> * B) {
-  MVAdd(xpu, Dtype(1), Dtype(0), A, B);
-}
-
-/**
- * Add each col of matrix A to vector B, i.e., Bi = \sum_j {alpha*Aij}+beta*Bi
- * Loose shape checking, A.count() % B.count() == 0.
- * # rows of A = A.count() / B.count().
- * Transpose is disabled.
- */
-template<typename Dtype>
-void MVSum(XPU xpu, Dtype alpha, Dtype beta, const Blob<Dtype> & A,
-    Blob<Dtype> * B) {
-  CHECK_EQ(A.count() % B->count(), 0) << "length of B must = # of cols of A";
-
-  int m = B->count(), n = A.count() / m;
-  if (xpu == cpu) {
-    Blob<Dtype> one(n);
-    one.SetValue(1);
-    cpu_gemm(A.cpu_data(), one.cpu_data(), m, 1, n, alpha, beta,
-        false, false, B->mutable_cpu_data());
-  }
-#ifdef USE_GPU
-  if (xpu == gpu) {
-    singa_gpu_sum_col(A.gpu_data(), B->gpu_data(), m, n, n);
-    // gpu part
-  }
-#endif  // USE_GPU
-}
-/**
- * Reduce each row of A to an element of B.
- * Loose shape checking, A.count() % B.count() == 0.
- * # rows of A = A.count() / B.count().
- */
-template<typename Op, typename Dtype>
-void Reduce2D(XPU xpu, const Blob<Dtype> & A, Blob<Dtype> * B) {
-  CHECK_EQ(A.count() % B.count(), 0) << "Row size not match B length";
-  int m = B->count(), n = A.count() / m;
-  if (xpu == cpu) {
-    cpu_reduce_f<Op>(A.cpu_data(), m, n, B->mutable_cpu_data());
-  }
-#ifdef SINGA_GPU
-  if (xpu == gpu) {
-    // gpu part
-    gpu_reduce_f<Op>(A.gpu_data(), m, n, B->mutable_gpu_data());
-  }
-#endif  // SINGA_GPU
-}
-/**
- * Duplicate each element of A into a row of B.
- * Loose shape checking, B.count() % A.count() == 0.
- * # rows of B = B.count() / A.count().
- */
-template<typename Op, typename Dtype>
-void Expand2D(XPU xpu, const Blob<Dtype> & A, Blob<Dtype> * B) {
-  CHECK_EQ(B.count() % A.count(), 0) << "Row size of B not match length of A";
-  int m = A.count(), n = B->count() / m;
-  if (xpu == cpu) {
-    cpu_expand_f<Op>(A.cpu_data(), m, n, B->mutable_cpu_data());
-  }
-#ifdef SINGA_GPU
-  if (xpu == gpu) {
-    // gpu part
-    gpu_expand_f<Op>(A.gpu_data(), m, n, B->mutable_gpu_data());
-  }
-#endif  // SINGA_GPU
-}
-
-/***********************************************************/
-/**
- * Apply the function from Op for each element in A, Op(Ai).
- * @param A
- */
-template<typename Op, typename Dtype>
-void Map(XPU xpu, Blob<Dtype>* A) {
-  if (xpu == cpu) {
-    cpu_e_f<Op>(A->count(), A->mutable_cpu_data());
-  }
-#ifdef SINGA_GPU
-  if (xpu == gpu) {
-    // gpu part
-    gpu_e_f<Op>(A->count(), A->mutable_gpu_data());
-  }
-#endif  // SINGA_GPU
-}
-
-/**
- * B = e ^ A
- */
-template<typename Dtype>
-void Exp(XPU xpu, const Blob<Dtype> & A, Blob<Dtype> * B) {
-  Map<singa::op::Exp>(xpu, A, B);
-}
-
-/**
- * element-wise operation: b = 1.0f / (1.0f + expf(-a));
- */
-template<typename Dtype>
-inline void Sigmoid(XPU xpu, const Blob<Dtype> & A, Blob<Dtype> * B) {
-    Map<singa::op::Sigmoid>(xpu, A, B);
-}
-
-/**
- * element-wise operation: b = a * ( 1.0f - a );
- */
-inline void SigmoidGrad(XPU xpu, const Blob<Dtype> & A, Blob<Dtype> * B) {
-    Map<singa::op::SigmoidGrad>(xpu, A, B);
-}
-
-/**
- * element-wise operation: b = std::max( a, 0)
- */
-inline void Relu(XPU xpu, const Blob<Dtype> & A, Blob<Dtype> * B) {
-    Map<singa::op::Relu>(xpu, A, B);
-}
-
-/**
- * element-wise operation: b = a > 0 ? 1: 0;
- */
-template<typename Dtype>
-inline void ReluGrad(XPU xpu, const Blob<Dtype> & A, Blob<Dtype> * B) {
-    Map<singa::op::ReluGrad>(xpu, A, B);
-}
-
-/**
- * element-wise operation: b = tanh(a);
- */
-template<typename Dtype>
-inline void Tanh(XPU xpu, const Blob<Dtype> & A, Blob<Dtype> * B) {
-    Map<singa::op::Tanh>(xpu, A, B);
-}
-
-/**
- * B = 1- A^2
- */
-template<typename Dtype>
-inline void TanhGrad(XPU xpu, const Blob<Dtype> & A, Blob<Dtype> * B) {
-    Map<singa::op::TanhGrad>(xpu, A, B);
-}
-/**
- * B = log(1+exp(A))
- */
-template<typename Dtype>
-inline void Softplus(XPU xpu, const Blob<Dtype> & A, Blob<Dtype> * B) {
-    Map<singa::op::Softplus>(xpu, A, B);
-}
-
-/**
- * B = 1.0f / (1.0f + expf(-A));
- */
-template<typename Dtype>
-inline void SoftplusGrad(XPU xpu, const Blob<Dtype> & A, Blob<Dtype> * B) {
-    Map<singa::op::SoftplusGrad>(xpu, A, B);
-}
-
-template<typename Dtype>
-inline void Square(XPU xpu, const Blob<Dtype> & A, Blob<Dtype> * B) {
-    Map<singa::op::Square>(xpu, A, B);
-}
-
-template<typename Dtype>
-inline void SquareGrad(XPU xpu, const Blob<Dtype> & A, Blob<Dtype> * B) {
-    Map<singa::op::Square_grad>(xpu, A, B);
-}
-
-template<typename Dtype>
-inline void Sqrt(XPU xpu, const Blob<Dtype> & A, Blob<Dtype> * B) {
-    Map<singa::op::Sqrt>(xpu, A, B);
-}
-
-/**
- * B = A < alpha ? 1:0;
- */
-template<typename Dtype>
-inline void Threshold(XPU xpu, Dtype alpha, const Blob<Dtype> & A,
-    Blob<Dtype> * B) {
-  Map<singa::op::Threshold>(xpu, alpha, A, B);
-}
-}  // end of namespace singa
-
-#endif  // SINGA_BLOB_MATH_BLOB_H_
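
The Blob-level interface above reappears under include/singa/utils/math_blob.h later in this diff. A shape sketch for MVDot (my own example, untested like the rest of this commit; it assumes Blob::SetValue fills every element, as used elsewhere in this file):

    #include "singa/utils/blob.h"
    #include "singa/utils/math_blob.h"

    void MVDotExample() {
      singa::Blob<float> A(4, 3), B(3), C(4);  // 4x3 matrix, length-3 and length-4 vectors
      A.SetValue(1);                           // every Aij = 1
      B.SetValue(2);                           // every Bi  = 2
      singa::MVDot(singa::cpu, A, B, &C);      // each Ci = sum_j Aij * Bj = 6
    }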

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/d3379cba/include/singa/blob/math_kernel.h
----------------------------------------------------------------------
diff --git a/include/singa/blob/math_kernel.h b/include/singa/blob/math_kernel.h
deleted file mode 100644
index f5d3e34..0000000
--- a/include/singa/blob/math_kernel.h
+++ /dev/null
@@ -1,59 +0,0 @@
-#ifndef MATH_KERNEL_H
-#define MATH_KERNEL_H
-
-namespace singa{
-
-extern "C" {
-	void singa_gpu_sum_vec(float *data, float *sum , long n);
-
-	void singa_gpu_sum_col(const float *src_mat_data, float *dst_vec_data, long rows, long cols, long stride);
-
-	void singa_gpu_add_vec_row(const float *src_vec_data, const float *src_mat_data, float *des_mat_data, long rows, long cols, long stride);
-
-	void singa_gpu_set_value(float *data, float value, long n);
-
-	void singa_gpu_scale(const float *src_data, float *des_data, float alpha, long n);
-
-	void singa_gpu_scale_grad(float *data, float alpha, long n);
-
-	void singa_gpu_exp(const float *src_data, float *des_data, float alpha, long n);
-
-	void singa_gpu_exp_grad(const float *src_data, float *des_data, float alpha, long n);
-
-	void singa_gpu_sigmoid(const float *src_data, float *des_data, float alpha, long n);
-
-	void singa_gpu_sigmoid_grad(const float *src_data, float *des_data, float alpha, long n);
-
-	void singa_gpu_relu(const float *src_data, float *des_data, float alpha, long n);
-
-	void singa_gpu_relu_grad(const float *src_data, float *des_data, float alpha, long n);
-
-	void singa_gpu_tanh(const float *src_data, float *des_data, float alpha, long n);
-
-	void singa_gpu_tanh_grad(const float *src_data, float *des_data, float alpha, long n);
-
-	void singa_gpu_softplus(const float *src_data, float *des_data, float alpha, long n);
-
-	void singa_gpu_softplus_grad(const float *src_data, float *des_data, float alpha, long n);
-
-	void singa_gpu_square(const float *src_data, float *des_data, float alpha, long n);
-
-	void singa_gpu_square_grad(const float *src_data, float *des_data, float alpha, long n);
-
-	void singa_gpu_sqrt(const float *src_data, float *des_data, float alpha, long n);
-
-	void singa_gpu_threshold(const float *src_data, float *des_data, float alpha, long n);
-
-	void singa_gpu_add(const float *src_data_a, const float *src_data_b, float *des_data, float alpha, float beta, long n);
-
-	void singa_gpu_sub(const float *src_data_a, const float *src_data_b, float *des_data, float alpha, float beta, long n);
-
-	void singa_gpu_mult(const float *src_data_a, const float *src_data_b, float *des_data, float alpha, float beta, long n);
-
-	void singa_gpu_div(const float *src_data_a, const float *src_data_b, float *des_data, float alpha, float beta, long n);
-
-};
-
-}
-
-#endif

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/d3379cba/include/singa/blob/singa_op.h
----------------------------------------------------------------------
diff --git a/include/singa/blob/singa_op.h b/include/singa/blob/singa_op.h
deleted file mode 100644
index 1131c5d..0000000
--- a/include/singa/blob/singa_op.h
+++ /dev/null
@@ -1,350 +0,0 @@
-/************************************************************
-*
-* Licensed to the Apache Software Foundation (ASF) under one
-* or more contributor license agreements.  See the NOTICE file
-* distributed with this work for additional information
-* regarding copyright ownership.  The ASF licenses this file
-* to you under the Apache License, Version 2.0 (the
-* "License"); you may not use this file except in compliance
-* with the License.  You may obtain a copy of the License at
-*
-*   http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an
-* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-* KIND, either express or implied.  See the License for the
-* specific language governing permissions and limitations
-* under the License.
-*
-*************************************************************/
-
-#ifndef SINGA_BLOB_SINGA_OP_H_
-#define SINGA_BLOB_SINGA_OP_H_
-
-#ifdef SINGA_GPU
-#include <cuda_runtime.h>
-#endif  // SINGA_GPU
-#include <cmath>
-#include <algorithm>
-#ifdef SINGA_GPU
-#include "cublas_v2.h"
-#include "singa/blob/math_kernel.h"
-#endif  // SINGA_GPU
-
-namespace singa {
-
-namespace op {
-
-/**
- * b = e^a
- */
-template<Dtype>
-struct Exp {
-  inline static void Map(const float & a, float * b) {
-    *b = exp(a);
-  }
-#ifdef SINGA_GPU
-  inline static void CudaMap(float alpha,  const float * a,
-      float * b, int n) {
-    singa::singa_gpu_exp(a, b, alpha, n);
-  }
-#endif  // SINGA_GPU
-};
-/**
- * b = log(a), base is e
- */
-template<Dtype>
-struct Log {
-  inline static void Map(const float & a, float *b) {
-    *b = log(a);
-  }
-}
-
-template<Dtype>
-struct Sigmoid {
-  inline static void Map(const float & a, float * b) {
-    *b = 1.0f / (1.0f + expf(-a * alpha));
-  }
-#ifdef SINGA_GPU
-  inline static void CudaMap(const float * a,
-      float * b, int n) {
-    singa::singa_gpu_sigmoid(a, b, 1, n);
-  }
-#endif  // SINGA_GPU
-};
-template<Dtype>
-struct SigmoidGrad {
-  inline static void Map(const float & a, float * b) {
-    *b = a * (1.0f - a);
-  }
-#ifdef SINGA_GPU
-  inline static void CudaMap(float alpha,  const float * a, float * b, int n) {
-    singa::singa_gpu_sigmoid_grad(a, b, 1, n);
-  }
-#endif  // SINGA_GPU
-};
-
-template<Dtype>
-struct Relu {
-  inline static void Map(const float & a, float * b) {
-    *b = std::max(a, 0.0f);
-  }
-#ifdef SINGA_GPU
-  inline static void CudaMap(const float * a, float * b, int n) {
-    singa::singa_gpu_relu(a, b, 1, n);
-  }
-#endif  // SINGA_GPU
-};
-
-template<Dtype>
-struct ReluGrad {
-  inline static void Map(const float & a, float * b) {
-    *b = a > 0 ? 1 : 0;
-  }
-#ifdef SINGA_GPU
-  inline static void CudaMap(const float * a, float * b, int n) {
-    singa::singa_gpu_relu_grad(a, b, 1, n);
-  }
-#endif  // SINGA_GPU
-};
-
-template<Dtype>
-struct Tanh {
-  inline static void Map(const float & a, float * b) {
-    *b = tanhf(a);
-  }
-#ifdef SINGA_GPU
-  inline static void CudaMap(float alpha,  const float * a, float * b, int n) {
-    singa::singa_gpu_tanh(a, b, alpha, n);
-  }
-#endif  // SINGA_GPU
-};
-
-template<Dtype>
-struct TanhGrad {
-  inline static void Map(const float & a, float * b) {
-    *b = 1 - a * a;
-  }
-#ifdef SINGA_GPU
-  inline static void CudaMap(float alpha,  const float * a, float * b, int n) {
-    singa::singa_gpu_tanh_grad(a, b, alpha, n);
-  }
-#endif  // SINGA_GPU
-};
-
-template<Dtype>
-struct Softplus {
-  inline static void Map(const float & a, float * b) {
-    *b = logf(1 + expf(a));
-  }
-#ifdef SINGA_GPU
-  inline static void CudaMap(const float * a, float * b, int n) {
-    singa::singa_gpu_softplus(a, b, 1, n);
-  }
-#endif  // SINGA_GPU
-};
-
-template<Dtype>
-struct SoftplusGrad {
-  inline static void Map(const float & a, float * b) {
-    *b = 1.0f / (1.0f + expf(-a));
-  }
-#ifdef SINGA_GPU
-  inline static void CudaMap(const float * a,
-      float * b, int n) {
-    singa::singa_gpu_softplus_grad(a, b, n);
-  }
-#endif  // SINGA_GPU
-};
-
-template<Dtype>
-struct Square {
-  inline static void Map(const float & a, float * b) {
-    *b = a * a;
-  }
-#ifdef SINGA_GPU
-  inline static void CudaMap(const float * a,
-      float * b, int n) {
-    singa::singa_gpu_square(a, b, n);
-  }
-#endif  // SINGA_GPU
-};
-
-template<Dtype>
-struct SquareGrad {
-  inline static void Map(const float & a, float * b) {
-    *b = 2 * sqrt(a);
-  }
-#ifdef SINGA_GPU
-  inline static void CudaMap(const float * a,
-      float * b, int n) {
-    singa::singa_gpu_square_grad(a, b, 1, n);
-  }
-#endif  // SINGA_GPU
-};
-
-template<Dtype>
-struct Sqrt {
-  inline static void Map(const float & a, float * b) {
-    *b = sqrt(a);
-  }
-#ifdef SINGA_GPU
-  inline static void CudaMap(const float * a,
-      float * b, int n) {
-    singa::singa_gpu_sqrt(a, b, n);
-  }
-#endif  // SINGA_GPU
-};
-
-/*********************************************************************/
-/**
- * c = pow(a, b), i.e., c = a^b
- */
-template<Dtype>
-struct Pow {
-  inline static void Map(const float & a, const float &b, float * c) {
-    *c = pow(a, b);
-  }
-}
-template<Dtype>
-struct Mult {
-  inline static void Map(const float & a, const float & b, float * c) {
-    *c =  a * b;
-  }
-#ifdef SINGA_GPU
-  inline static void CudaMap(const float* a, const float* b, float* c, int n) {
-    singa::singa_gpu_mult(a, b, c, 1, 1, n);
-  }
-#endif  // SINGA_GPU
-};
-
-template<Dtype>
-struct Div {
-  inline static void Map(const float & a, const float & b, float * c) {
-    *c =  a / b;
-  }
-#ifdef SINGA_GPU
-  inline static void CudaMap(const float * a,
-      const float * b, float * c, int n) {
-    singa::singa_gpu_div(a, b, c, 1, 1, n);
-  }
-#endif  // SINGA_GPU
-};
-
-
-/*********************************************************************/
-template<Dtype>
-struct Set {
-  inline static void Map(float alpha, float * a) {
-    *a = alpha;
-  }
-#ifdef SINGA_GPU
-  inline static void CudaMap(float alpha, float * a, int n) {
-    singa::singa_gpu_set_value(a, alpha, n);
-  }
-#endif  // SINGA_GPU
-};
-
-template<Dtype>
-struct Threshold {
-  inline static void Map(float alpha, const float & a, float * b) {
-    *b =  a < alpha ? 1.0f : 0.0f;
-  }
-#ifdef SINGA_GPU
-  inline static void CudaMap(float alpha,  const float * a,
-      float * b, int n) {
-    singa::singa_gpu_threshold(a, b, alpha, n);
-  }
-#endif  // SINGA_GPU
-};
-
-/**********************************/
-struct Expand_Div {
-  inline static void Map(const float & a, int n, float * b) {
-    for (int i = 0 ; i < n ; i++) {
-      b[i] /= a;
-    }
-  }
-#ifdef SINGA_GPU
-  inline static void CudaMap(const float & a, int n, float * b) {
-    singa::singa_gpu_scale(b, b, a, n);
-  }
-#endif  // SINGA_GPU
-};
-
-struct Repmat {
-  inline static void Map(const float & a, int n, float * b) {
-    for (int i = 0 ; i < n ; i++) {
-      b[i] = a;
-    }
-  }
-#ifdef SINGA_GPU
-  inline static void CudaMap(const float & a, int n, float * b) {
-    singa::singa_gpu_set_value(b, a, n);
-  }
-#endif  // SINGA_GPU
-};
-
-
-struct Scale {
-    inline static void Map(float alpha,  const float & a, float * b) {
-        *b = alpha * a;
-    }
-    #ifdef SINGA_GPU
-    inline static void CudaMap(float alpha,  const float * a,
-    float * b, int n) {
-        singa::singa_gpu_scale(a, b, alpha, n);
-    }
-    #endif  // SINGA_GPU
-};
-
-struct Scale_grad {
-    inline static void Map(float alpha,  float * output) {
-        *output = alpha;
-    }
-    #ifdef SINGA_GPU
-    inline static void CudaMap(float alpha,  float * output, int n) {
-        singa::singa_gpu_scale_grad(output, alpha, n);
-    }
-    #endif  // SINGA_GPU
-};
-
-struct ExpGrad {
-  inline static void Map(float alpha,  const float & a, float * b) {
-    // log is the natrual log based on e
-    *b = a * log(alpha);
-  }
-#ifdef SINGA_GPU
-  inline static void CudaMap(float alpha,  const float * a,
-      float * b, int n) {
-    singa::singa_gpu_exp_grad(a, b, alpha, n);
-  }
-#endif  // SINGA_GPU
-};
-
-struct Sum {
-  inline static void Map(const float * a, int n, float * b) {
-    *b = 0;
-    for (int i = 0 ; i < n ; i++) {
-      *b += a[i];
-    }
-  }
-#ifdef SINGA_GPU
-  inline static void CudaMap(const float * a, int n, float * b) {
-    float *sum = NULL;
-    cudaMalloc(<void**>(&sum), n*sizeof(float));
-
-    singa::singa_gpu_sum_vec(a, sum, n);
-
-    cudaMemcpyAsync(b, sum, sizeof(float), cudaMemcpyDeviceToDevice);
-    cudaFree(sum);
-  }
-#endif  // SINGA_GPU
-};
-
-};  // namespace op
-
-};  // namespace singa
-
-#endif  // SINGA_BLOB_SINGA_OP_H_
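
The op functors above are consumed by the element-wise drivers (cpu_e_f/gpu_e_f) in math_addr.h. A sketch of that plumbing (my own example; it assumes the relocated utils/singa_op.h, not shown in this part of the diff, declares op::Set as a valid template<typename Dtype> functor):

    #include "singa/utils/math_addr.h"
    #include "singa/utils/singa_op.h"

    void SetExample() {
      float data[4] = {0, 0, 0, 0};
      // cpu_e_f applies Op::Map(alpha, &data[i]) to every element.
      singa::cpu_e_f<singa::op::Set<float>>(4, 1.5f, data);  // data = {1.5, 1.5, 1.5, 1.5}
    }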

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/d3379cba/include/singa/utils/blob.h
----------------------------------------------------------------------
diff --git a/include/singa/utils/blob.h b/include/singa/utils/blob.h
index eecb674..f7c29af 100644
--- a/include/singa/utils/blob.h
+++ b/include/singa/utils/blob.h
@@ -121,10 +121,12 @@ class Blob {
  public:
   Blob() {}
   explicit Blob(const std::vector<int>& shape) { Reshape(shape); }
-  explicit Blob(int count) { Reshape(count); }
-  explicit Blob(int a, int b) { Reshape(a, b); }
-  explicit Blob(int a, int b, int c) { Reshape(a, b, c); }
-  explicit Blob(int a, int b, int c, int d) { Reshape(a, b, c, d); }
+  explicit Blob(int dim0) { Reshape(dim0); }
+  explicit Blob(int dim0, int dim1) { Reshape(dim0, dim1); }
+  explicit Blob(int dim0, int dim1, int dim2) { Reshape(dim0, dim1, dim2); }
+  explicit Blob(int dim0, int dim1, int dim2, int dim3) {
+    Reshape(dim0, dim1, dim2, dim3);
+  }
   /**
   * Change the shape of the blob, re-allocate memory if Blob size() changes.
    *
@@ -136,34 +138,43 @@ class Blob {
    * Helper for Reshape(const std::vector<int>& shape) with shape.size() = 1.
    *
    * @see Reshape(const std::vector<int>&).
-   * @param[in] count total num of elements.
+   * @param[in] dim0 total num of elements.
    */
-  void Reshape(int count);
+  void Reshape(int dim0) {
+    Reshape(std::vector<int>{dim0});
+  }
   /**
    * Helper for Reshape(const std::vector<int>& shape) with shape.size() = 2.
    *
-   * @param a the highest dimension size, i.e., a = shape[0]. E.g., a could the
-   * batchsize.
-   * @param[in] b, b = shape[1], e.g., b could be the length of the feature vector.
+   * @param dim0 the highest dimension size, i.e., dim0 = shape[0]. E.g., dim0
+   * could be the batchsize.
+   * @param[in] dim1, dim1 = shape[1], e.g., dim1 could be the length of the
+   * feature vector.
    */
-  void Reshape(int a, int b);
+  void Reshape(int dim0, int dim1) {
+    Reshape(std::vector<int>{dim0, dim1});
+  }
   /**
    * Helper for Reshape(const std::vector<int>& shape) with shape.size() = 3.
    *
-   * @param[in] a, a = shape[0]
-   * @param[in] b, b = shape[1]
-   * @param[in] c, c = shape[2]
+   * @param[in] dim0, dim0 = shape[0]
+   * @param[in] dim1, dim1 = shape[1]
+   * @param[in] dim2, dim2 = shape[2]
    */
-  void Reshape(int a, int b, int c);
+  void Reshape(int dim0, int dim1, int dim2) {
+    Reshape(std::vector<int>{dim0, dim1, dim2});
+  }
   /**
    * Helper for Reshape(const std::vector<int>& shape) with shape.size() = 4.
    *
-   * @param[in] a, a = shape[0]
-   * @param[in] b, b = shape[1]
-   * @param[in] c, c = shape[2]
-   * @param[in] d, d = shape[3]
+   * @param[in] dim0, dim0 = shape[0]
+   * @param[in] dim1, dim1 = shape[1]
+   * @param[in] dim2, dim2 = shape[2]
+   * @param[in] dim3, dim3 = shape[3]
    */
-  void Reshape(int a, int b, int c, int d);
+  void Reshape(int dim0, int dim1, int dim2, int dim3) {
+    Reshape(std::vector<int>{dim0, dim1, dim2, dim3});
+  }
   /**
    * Reshape as the shape of *other* Blob.
    * @param[in] other
@@ -274,7 +285,7 @@ Blob<Dtype>* Reshape(const Blob<Dtype> & A, const std::vector<int>& shape) {
 template <typename Dtype>
 Blob<Dtype>* Reshape(const Blob<Dtype> & A, int count) {
   std::vector<int> tmpshape;
-  tmpshape.push_back(dim1);
+  tmpshape.push_back(count);
   return Reshape(A, tmpshape);
 }
 /**
@@ -302,8 +313,8 @@ Blob<Dtype>* Reshape(const Blob<Dtype> & A, int dim0, int dim1, int dim2) {
  * Helper of Reshape(const Blob<Dtype>, const std::vector<int>*).
  */
 template <typename Dtype>
-Blob<Dtype>* Reshape(const Blob<Dtype> & A, int dim0, int dim1,
-    int dim2, int dim3) {
+Blob<Dtype>* Reshape(const Blob<Dtype> & A, int dim0, int dim1, int dim2,
+    int dim3) {
   std::vector<int> tmpshape;
   tmpshape.push_back(dim0);
   tmpshape.push_back(dim1);
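
As a quick usage sketch of the renamed Reshape helpers (my own example, not from the commit): each helper forwards to the vector-based Reshape, so the two forms below are equivalent:

    #include <vector>
    #include "singa/utils/blob.h"

    void ReshapeExample() {
      singa::Blob<float> b(2, 3);           // shape = {2, 3}, 6 elements
      b.Reshape(std::vector<int>{3, 2});    // explicit vector form
      b.Reshape(3, 2);                      // helper form, same effect
    }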

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/d3379cba/include/singa/utils/math_addr.h
----------------------------------------------------------------------
diff --git a/include/singa/utils/math_addr.h b/include/singa/utils/math_addr.h
new file mode 100644
index 0000000..59f89ea
--- /dev/null
+++ b/include/singa/utils/math_addr.h
@@ -0,0 +1,206 @@
+/************************************************************
+*
+* Licensed to the Apache Software Foundation (ASF) under one
+* or more contributor license agreements.  See the NOTICE file
+* distributed with this work for additional information
+* regarding copyright ownership.  The ASF licenses this file
+* to you under the Apache License, Version 2.0 (the
+* "License"); you may not use this file except in compliance
+* with the License.  You may obtain a copy of the License at
+*
+*   http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing,
+* software distributed under the License is distributed on an
+* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+* KIND, either express or implied.  See the License for the
+* specific language governing permissions and limitations
+* under the License.
+*
+*************************************************************/
+
+#ifndef SINGA_UTILS_MATH_ADDR_H_
+#define SINGA_UTILS_MATH_ADDR_H_
+extern "C" {
+    #include <cblas.h>
+}
+#ifdef USE_GPU
+#include <cuda_runtime.h>
+#endif
+#include "singa/utils/singa_op.h"
+#ifdef USE_GPU
+#include "cublas_v2.h"
+#endif
+
+
+namespace singa {
+
+template<typename Dtype>
+void cpu_gemm(const Dtype * A, const Dtype * B,
+    const int m, const int n, const int k, const Dtype alpha, const Dtype beta,
+    const bool TranA, const bool TranB, Dtype * C) {
+  int lda, ldb;
+  CBLAS_TRANSPOSE tA, tB;
+  lda = TranA ? m : k;
+  ldb = TranB ? k : n;
+  tA = TranA ? CblasTrans : CblasNoTrans;
+  tB = TranB ? CblasTrans : CblasNoTrans;
+  cblas_sgemm(CblasRowMajor, tA, tB, m, n, k, alpha, A, lda,
+      B, ldb, beta, C, n);
+}
+
+// Note: m is the length of B and n is the length of C;
+// A is an n*m matrix when TranA is false, and an m*n matrix otherwise.
+template<typename Dtype>
+void cpu_gemv(const Dtype * A, const Dtype * B, const int m, const int n,
+    const Dtype alpha, const Dtype beta, const bool TranA, Dtype * C) {
+  int M = TranA ? m : n;  // rows of the matrix as stored
+  int N = TranA ? n : m;  // columns of the matrix as stored
+  CBLAS_TRANSPOSE tA = TranA ? CblasTrans : CblasNoTrans;
+  // Row-major storage: the leading dimension is the number of stored columns.
+  cblas_sgemv(CblasRowMajor, tA, M, N, alpha, A, N, B, 1, beta, C, 1);
+}
+
+template<typename Dtype>
+void cpu_axpy(const Dtype * A, const int n, const Dtype alpha, Dtype * B) {
+  cblas_saxpy(n, alpha, A, 1, B, 1);
+}
+
+template<typename Dtype>
+Dtype cpu_dot(const Dtype * A, const Dtype * B, const int n) {
+  Dtype sum = 0;
+  for (int i = 0 ; i < n ; i++)
+    sum += A[i] * B[i];
+  return sum;
+}
+
+// element-wise
+template<typename Op, typename Dtype>
+void cpu_e_f(const int n, const Dtype alpha, Dtype * A) {
+  for (int i = 0 ; i < n ; i++) {
+    Op::Map(alpha, &A[i]);
+  }
+}
+
+template<typename Op, typename Dtype>
+void cpu_e_f(const int n, const Dtype * A, const Dtype alpha, Dtype * B) {
+  for (int i = 0 ; i < n ; i++) {
+    Op::Map(alpha, A[i], &B[i]);
+  }
+}
+
+template<typename Op, typename Dtype>
+void cpu_e_f(const int n, const Dtype * A, const Dtype * B,
+    const Dtype alpha, const Dtype beta, Dtype * C) {
+  for (int i = 0 ; i < n ; i++) {
+    Op::Map(alpha, beta, A[i], B[i], &C[i]);
+  }
+}
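+
+// Assumed additions (labeled as such, not in the committed file): scalar-free
+// unary and binary drivers, so that Map(XPU, A, B) and Map(XPU, A, B, C) in
+// math_blob.h resolve to a matching overload for functors like op::Relu/op::Mult.
+template<typename Op, typename Dtype>
+void cpu_e_f(const int n, const Dtype * A, Dtype * B) {
+  for (int i = 0 ; i < n ; i++) {
+    Op::Map(A[i], &B[i]);
+  }
+}
+
+template<typename Op, typename Dtype>
+void cpu_e_f(const int n, const Dtype * A, const Dtype * B, Dtype * C) {
+  for (int i = 0 ; i < n ; i++) {
+    Op::Map(A[i], B[i], &C[i]);
+  }
+}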
+// element-wise generalized operation defined in Op
+
+
+// matrix/vector expand/reduce
+
+template<typename Op, typename Dtype>
+void cpu_reduce_f(const Dtype * A, const int m, const int n, Dtype * B) {
+  for (int i = 0 ; i < m ; i++) {
+    Op::Map(A+i*n, n, B[i]);
+  }
+}
+// reduce each row of A to an element of B e.g. the sum operation in softmax
+template<typename Op, typename Dtype>
+void cpu_expand_f(const Dtype * A, const int m, const int n, Dtype * B) {
+  for (int i = 0 ; i < m ; i++) {
+    Op::Map(A[i], n, B+i*n);
+  }
+}
+// expand each element in A into a row of B
+
+#ifdef USE_GPU
+template<typename Dtype>
+void gpu_gemm(const Dtype * A, const Dtype * B, const int m, const int n,
+    const int k, const Dtype alpha, const Dtype beta, const bool TranA,
+    const bool TranB, Dtype * C) {
+  int lda = TranA ? m : k;
+  int ldb = TranB ? k : n;
+  int ldc = n;
+  cublasOperation_t tA = (TranA == false) ? CUBLAS_OP_N : CUBLAS_OP_T;
+  cublasOperation_t tB = (TranB == false) ? CUBLAS_OP_N : CUBLAS_OP_T;
+  cublasHandle_t handle;
+  cublasCreate(&handle);
+  cublasSgemm(handle, tB, tA, n, m, k, &alpha, B, ldb,
+      A, lda, &beta, C, ldc);
+  cublasDestroy(handle);
+}
+
+template<typename Dtype>
+void gpu_gemv(const Dtype * A, const Dtype * B, const int m, const int n,
+    const Dtype alpha, const Dtype beta, const bool TranA, Dtype * C) {
+  int lda = n;
+  cublasOperation_t tA = (TranA == true) ? CUBLAS_OP_N : CUBLAS_OP_T;
+  cublasHandle_t handle;
+  cublasCreate(&handle);
+  cublasSgemv(handle, tA, n, m, &alpha , A, lda, B, 1, &beta, C, 1);
+  cublasDestroy(handle);
+}
+
+template<typename Dtype>
+void gpu_axpy(const Dtype * A, const int n, const Dtype alpha, Dtype * B) {
+  cublasHandle_t handle;
+  cublasCreate(&handle);
+  cublasSaxpy(handle, n, &alpha, A, 1, B, 1);
+  cublasDestroy(handle);
+}
+
+template<typename Dtype>
+Dtype gpu_dot(const Dtype * A, const Dtype * B, const int n) {
+  cublasHandle_t handle;
+  cublasCreate(&handle);
+  Dtype result = 0.0;
+  cublasSdot(handle, n, A, 1, B, 1, &result);
+  cublasDestroy(handle);
+  return result;
+}
+
+// element-wise
+template<typename Op, typename Dtype>
+void gpu_e_f(const int n, const Dtype alpha, Dtype * A) {
+  Op::CudaMap(alpha, A, n);
+}
+
+template<typename Op, typename Dtype>
+void gpu_e_f(const int n, const Dtype * A, const Dtype alpha, Dtype * B) {
+  Op::CudaMap(alpha, A, B, n);
+}
+
+template<typename Op, typename Dtype>
+void gpu_e_f(const int n, const Dtype * A, const Dtype * B,
+    const Dtype alpha, const Dtype beta, Dtype * C) {
+  Op::CudaMap(alpha, beta, A, B, C, n);
+}
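+
+// Assumed GPU counterparts of the drivers above (labeled assumptions),
+// mirroring the CudaMap signatures used by the functors in singa_op.h.
+template<typename Op, typename Dtype>
+void gpu_e_f(const int n, const Dtype * A, Dtype * B) {
+  Op::CudaMap(A, B, n);
+}
+
+template<typename Op, typename Dtype>
+void gpu_e_f(const int n, const Dtype * A, const Dtype * B, Dtype * C) {
+  Op::CudaMap(A, B, C, n);
+}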
+// element-wise generalized operation defined in Op
+
+// matrix/vector expand/reduce
+
+template<typename Op, typename Dtype>
+void gpu_reduce_f(const Dtype * A, const int m, const int n, Dtype * B) {
+  for (int i = 0 ; i < m ; i++) {
+    Op::CudaMap(A+i*n, n, B[i]);
+  }
+}
+// reduce each row of A to an element of B e.g. the sum operation in softmax
+template<typename Op, typename Dtype>
+void gpu_expand_f(const Dtype * A, const int m, const int n, Dtype * B) {
+  for (int i = 0 ; i < m ; i++) {
+    Op::CudaMap(A[i], n, B+i*n);
+  }
+}
+// expand each element in A into a row of B
+#endif  // USE_GPU
+
+}  // namespace singa
+#endif  // SINGA_UTILS_MATH_ADDR_H_
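
A worked cpu_gemm sketch (my own example, not from the commit; assumes CBLAS is linked) showing the row-major m/n/k convention:

    #include "singa/utils/math_addr.h"

    void GemmExample() {
      const float A[6] = {1, 2, 3,
                          4, 5, 6};   // 2x3, row-major
      const float B[6] = {1, 0,
                          0, 1,
                          1, 1};      // 3x2, row-major
      float C[4] = {0, 0, 0, 0};      // 2x2 result
      // C = 1.0 * A * B + 0.0 * C with m = 2, n = 2, k = 3, no transposes.
      singa::cpu_gemm<float>(A, B, 2, 2, 3, 1.0f, 0.0f, false, false, C);
      // C is now {4, 5, 10, 11}.
    }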

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/d3379cba/include/singa/utils/math_blob.h
----------------------------------------------------------------------
diff --git a/include/singa/utils/math_blob.h b/include/singa/utils/math_blob.h
new file mode 100644
index 0000000..2d53d32
--- /dev/null
+++ b/include/singa/utils/math_blob.h
@@ -0,0 +1,487 @@
+/************************************************************
+*
+* Licensed to the Apache Software Foundation (ASF) under one
+* or more contributor license agreements.  See the NOTICE file
+* distributed with this work for additional information
+* regarding copyright ownership.  The ASF licenses this file
+* to you under the Apache License, Version 2.0 (the
+* "License"); you may not use this file except in compliance
+* with the License.  You may obtain a copy of the License at
+*
+*   http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing,
+* software distributed under the License is distributed on an
+* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+* KIND, either express or implied.  See the License for the
+* specific language governing permissions and limitations
+* under the License.
+*
+*************************************************************/
+
+#ifndef SINGA_UTILS_MATH_BLOB_H_
+#define SINGA_UTILS_MATH_BLOB_H_
+
+#include <algorithm>
+#include <vector>
+#include "singa/utils/blob.h"
+#include "singa/utils/singa_op.h"
+#include "singa/utils/math_addr.h"
+
+
+namespace singa {
+enum XPU {cpu, gpu, any};
+
+/************* BLAS level 1 *****************/
+/**
+ * Scale each element of A with alpha, and put the result into B.
+ * Bi = alpha*Ai
+ * Use blas scale internally.
+ */
+template<typename Dtype>
+void Scale(XPU xpu, Dtype alpha, const Blob<Dtype> & A, Blob<Dtype> * B) {
+  CHECK_EQ(A.count(), B->count());
+  if (xpu == cpu)
+    cpu_scale(A.count(), alpha, A.cpu_data(), B->mutable_cpu_data());
+#ifdef USE_GPU
+#endif
+}
+
+/**
+ * Element-wise operation: Bi = alpha*Ai+Bi. A and B should have the same size
+ */
+template<typename Dtype>
+void AXPY(XPU xpu, Dtype alpha, const Blob<Dtype> & A, Blob<Dtype> * B) {
+  CHECK_EQ(A.count(), B->count());
+  if (xpu == cpu) {
+    cpu_axpy(A.cpu_data(), A.count(),
+        alpha, B->mutable_cpu_data());
+  }
+#ifdef USE_GPU
+  if (xpu == gpu) {
+    gpu_axpy(A.gpu_data(), A.count(),
+        alpha, B->mutable_gpu_data());
+  }
+#endif  // USE_GPU
+}
+
+/************* BLAS level 2 *****************/
+/**
+ * Matrix vector multiplication, C = alpha A(.T) * B + beta C.
+ * Strict shape checking:
+ * - dim of A == 2
+ * - columns of A(.T) == B.count()
+ * - rows of A(.T) == C.count()
+ *
+ * @param[in] alpha
+ * @param[in] beta
+ * @param[in] A, matrix
+ * @param[in] B, vector
+ * @param[in, out] C, vector
+ */
+template<typename Dtype>
+void GEMV(XPU xpu, Dtype alpha, Dtype beta, const Blob<Dtype>& A,
+    const Blob<Dtype>& B, Blob<Dtype>* C) {
+  CHECK_EQ(A.shape().size(), 2) << "A must be a matrix";
+  int a1, a2, m, n;
+  a1 = A.transpose() ? A.shape(1) : A.shape(0);
+  a2 = A.transpose() ? A.shape(0) : A.shape(1);
+  m = B.count();
+  n = C->count();
+  CHECK_EQ(a2, m) << "# columns of A(.T) must = length of B";
+  CHECK_EQ(a1, n) << "# rows of A(.T) must = length of C";
+
+  bool TranA = A.transpose();
+  if (xpu == cpu) {
+    cpu_gemv(A.cpu_data(), B.cpu_data(), m, n, alpha, beta, TranA,
+        C->mutable_cpu_data());
+  }
+#ifdef USE_GPU
+  if (xpu == gpu) {
+    // gpu part
+    gpu_gemv(A.gpu_data(), B.gpu_data(), m, n, alpha, beta, TranA,
+        C->mutable_gpu_data());
+  }
+#endif  // USE_GPU
+}
+/**
+ * Matrix vector multiplication, C = A(.T) * B, transpose is considered.
+ * Loose shape checking:
+ * - dim of A >=2
+ * - A.count() % B.count() == 0
+ * - A.count() / B.count() == C.count()
+ *
+ * @param[in] A input matrix
+ * @param[in] B input vector
+ * @param[out] C output vector
+ */
+template <typename Dtype>
+void MVDot(XPU xpu, const Blob<Dtype>& A, const Blob<Dtype>& B, Blob<Dtype>* C)
+{
+  GEMV(xpu, Dtype(1), Dtype(0), A, B, C);
+}
+
+/************* BLAS level 3 *****************/
+/**
+ * Matrix multiplication, C = alpha A*B + beta C, A, B and C are matrix.
+ *
+ * Transpose is considered for A and B.
+ * Strict shape checking:
+ * - all are matrix
+ * - shapes match for matrix multiplication
+ *
+ * @param[in] alpha
+ * @param[in] beta
+ * @param[in] A, matrix
+ * @param[in] B, matrix
+ * @param[in, out] C, matrix
+ */
+template <typename Dtype>
+void GEMM(XPU xpu, Dtype alpha, Dtype beta, const Blob<Dtype>& A,
+    const Blob<Dtype> & B, Blob<Dtype> * C) {
+  CHECK_EQ(A.shape().size(), 2);
+  CHECK_EQ(B.shape().size(), 2);
+  CHECK_EQ(C->shape().size(), 2);
+  int a1, a2, b1, b2, m, n;
+  CHECK(!C->transpose());
+  a1 = A.transpose() ? A.shape(1) : A.shape(0);
+  a2 = A.transpose() ? A.shape(0) : A.shape(1);
+  b1 = B.transpose() ? B.shape(1) : B.shape(0);
+  b2 = B.transpose() ? B.shape(0) : B.shape(1);
+  m = C->shape(0);
+  n = C->shape(1);
+  CHECK_EQ(a2, b1);
+  CHECK_EQ(a1, m);
+  CHECK_EQ(b2, n);
+
+  int k = A.transpose() ? A.shape(0) : A.shape(1);
+  bool TranA = A.transpose();
+  bool TranB = B.transpose();
+  if (xpu == cpu) {
+    cpu_gemm(A.cpu_data(), B.cpu_data(), m, n, k, alpha, beta,
+        TranA, TranB, C->mutable_cpu_data());
+  }
+#ifdef USE_GPU
+  if (xpu == gpu) {
+    // gpu part
+    gpu_gemm(A.gpu_data(), B.gpu_data(), m, n, k, alpha, beta,
+        TranA, TranB, C->mutable_gpu_data());
+  }
+#endif  // USE_GPU
+}
+/**
+ * Matrix multiplication, C = A(.T) * B(.T), transpose is considered.
+ * Strict shape checking:
+ * - all are matrix
+ * - shapes match for matrix multiplication
+ *
+ * @param[in] A input matrix
+ * @param[in] B input matrix
+ * @param[out] C output matrix
+ */
+template <typename Dtype>
+void MMDot(XPU xpu, const Blob<Dtype>& A, const Blob<Dtype>& B, Blob<Dtype>* C)
+{
+  GEMM(xpu, Dtype(1), Dtype(0), A, B, C);
+}
+
+
+/*********************** Inner and Outer product****************************/
+/**
+ * Inner product for two vectors.
+ * Loose shape checking, A.count() == B.count().
+ *
+ * @param[in] A, input vector (shape checking using A.count()).
+ * @param[in] B, input vector (shape checking using B.count()).
+ * @return inner product value.
+ */
+template <typename Dtype>
+Dtype VVDot(XPU xpu, const Blob<Dtype> & A, const Blob<Dtype> & B) {
+  Dtype res = 0;
+  CHECK_EQ(A.count(), B.count());
+  int n = A.count();
+  if (xpu == cpu) {
+    res = cpu_dot(A.cpu_data(), B.cpu_data(), n);
+  }
+#ifdef USE_GPU
+  if (xpu == gpu) {
+    // gpu part
+    res = gpu_dot(A.gpu_data(), B.gpu_data(), n);
+  }
+#endif  // USE_GPU
+  return res;
+}
+
+/**
+ * Outer product, C = A ** B, transpose is disabled.
+ * Loose shape checking, A.count() * B.count() == C.count()
+ *
+ * @param[in] A, input vector
+ * @param[in] B, input vector
+ * @param[out] C, output matrix
+ */
+template <typename Dtype>
+void OuterProduct(XPU xpu, const Blob<Dtype>& A, const Blob<Dtype>& B,
+    Blob<Dtype> * C) {
+  CHECK(!C->transpose());  // do not support C.T now.
+
+  int m = A.count();
+  int n = B.count();
+  CHECK_EQ(C->count(), m * n);
+
+  if (xpu == cpu) {
+    cpu_gemm(A.cpu_data(), B.cpu_data(), m, n, 1, 1, 0,
+        false, false, C->mutable_cpu_data());
+  }
+#ifdef USE_GPU
+  if (xpu == gpu) {
+    // gpu part
+    gpu_gemm(A.gpu_data(), B.gpu_data(), m, n, 1, 1, 0,
+        false, false, C->mutable_gpu_data());
+  }
+#endif  // USE_GPU
+}
+/*********************** Element-wise functions ***********************/
+/**
+ * Apply the function from Op for each element in A and put the result into B,
+ * i.e., Bi = Op(Ai).
+ * Loose shape checking, A.count() == B.count().
+ */
+template<typename Op, typename Dtype>
+void Map(XPU xpu, const Blob<Dtype> & A, Blob<Dtype> * B) {
+  CHECK_EQ(A.count(), B->count()) << "Blobs must have the same size";
+  if (xpu == cpu) {
+    cpu_e_f<Op>(A.count(), A.cpu_data(), B->mutable_cpu_data());
+  }
+#ifdef USE_GPU
+  if (xpu == gpu) {
+    // gpu part
+    gpu_e_f<Op>(A.count(), A.gpu_data(), B->mutable_gpu_data());
+  }
+#endif  // USE_GPU
+}
+
+/**
+ * Apply the function from Op for each element in A and B, and put the result
+ * into C, i.e., Ci = Op(Ai, Bi).
+ * Loose shape checking, A, B and C are of the same size.
+ */
+template<typename Op, typename Dtype>
+void Map(XPU xpu, const Blob<Dtype> & A, const Blob<Dtype> & B,
+    Blob<Dtype> * C) {
+  CHECK_EQ(A.count(), B.count()) << "Blobs must have the same size";
+  CHECK_EQ(A.count(), C->count()) << "Blobs must have the same size";
+  if (xpu == cpu) {
+    cpu_e_f<Op>(A.count(), A.cpu_data(), B.cpu_data(), C->mutable_cpu_data());
+  }
+#ifdef USE_GPU
+  if (xpu == gpu) {
+    // gpu part
+    gpu_e_f<Op>(A.count(), A.gpu_data(), B.gpu_data(), C->mutable_gpu_data());
+  }
+#endif  // USE_GPU
+}
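
cpu_e_f is defined in math_addr.h; the sketch below shows the dispatch shape these Map overloads assume, with the Op type supplying a static per-element Map (illustrative only):

    // Apply Op element-wise over two input arrays; e.g.
    // apply_binary<singa::op::Mult>(n, a, b, c) yields c[i] = a[i] * b[i].
    template<typename Op>
    void apply_binary(int n, const float* A, const float* B, float* C) {
      for (int i = 0; i < n; ++i)
        Op::Map(A[i], B[i], &C[i]);
    }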
+
+/**
+ * Bi = Op(alpha, Ai)
+ * Loose shape checking, A.count() == B.count().
+ */
+template<typename Op, typename Dtype>
+void Map(XPU xpu, Dtype alpha, const Blob<Dtype>& A, Blob<Dtype>* B) {
+  CHECK_EQ(A.count(), B->count()) << "Blobs must have the same size";
+  if (xpu == cpu) {
+    cpu_e_f<Op>(A.count(), alpha, A.cpu_data(), B->mutable_cpu_data());
+  }
+#ifdef USE_GPU
+  // TODO(wangwei) add the gpu version
+#endif  // USE_GPU
+}
+/**
+ * Ci = Op(alpha, Ai, Bi)
+ * Loose shape checking, A, B and C are of the same size.
+ */
+template<typename Op, typename Dtype>
+void Map(XPU xpu, Dtype alpha, const Blob<Dtype>& A, const Blob<Dtype>& B,
+    Blob<Dtype>* C) {
+  CHECK_EQ(A.count(), B.count()) << "Blobs must have the same size";
+  CHECK_EQ(A.count(), C->count()) << "Blobs must have the same size";
+  if (xpu == cpu) {
+    cpu_e_f<Op>(A.count(), alpha, A.cpu_data(), B.cpu_data(),
+        C->mutable_cpu_data());
+  }
+#ifdef USE_GPU
+  // TODO(wangwei) add the gpu version
+#endif  // USE_GPU
+}
+
+/**
+ * Currently use std::copy which has shown better performance than memcpy.
+ * http://stackoverflow.com/questions/4707012/c-memcpy-vs-stdcopy
+ * TODO(wangwei) test blas copy vs std::copy.
+ *
+ * Loose shape checking, A.count() == B.count().
+ */
+template<typename Dtype>
+void Copy(XPU xpu, const Blob<Dtype>& A, Blob<Dtype>* B) {
+  CHECK_EQ(A.count(), B->count()) << "Blobs must have the same size";
+  if (xpu == cpu)
+    std::copy(A.cpu_data(), A.cpu_data() + A.count(), B->mutable_cpu_data());
+  else {
+    LOG(FATAL) << "Not implemented";
+  }
+}
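
For the TODO above, the BLAS candidate for the float specialization is cblas_scopy; a minimal sketch, assuming a CBLAS implementation is linked:

    #include <cblas.h>

    // Copy n contiguous floats from src to dst with unit strides.
    void blas_copy(int n, const float* src, float* dst) {
      cblas_scopy(n, src, 1, dst, 1);
    }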
+
+/**
+ * C = A + B
+ * Implemented using Copy and AXPY.
+ */
+template<typename Dtype>
+void Add(XPU xpu, const Blob<Dtype> & A, const Blob<Dtype> & B,
+    Blob<Dtype> * C) {
+  Copy(xpu, A, C);
+  AXPY(xpu, B, C, 1);
+}
+
+/**
+ * C = A - B
+ * Implemented using Copy and AXPY.
+ */
+template<typename Dtype>
+void Sub(XPU xpu, const Blob<Dtype> & A, const Blob<Dtype> & B,
+    Blob<Dtype> * C) {
+  Copy(xpu, A, C);
+  AXPY(xpu, B, C, -1);
+}
+
+/**
+ * C = A * B, implemented using
+ * Map(XPU, const Blob<Dtype>&, const Blob<Dtype>&, Blob<Dtype>*).
+ */
+template<typename Dtype>
+void Mult(XPU xpu, const Blob<Dtype> & A, const Blob<Dtype> & B,
+    Blob<Dtype> * C) {
+  Map<singa::op::Mult>(xpu, A, B, C);
+  // TODO(wangwei) use MKL's vector func
+}
+
+/**
+ * C = A / B, implemented using
+ * Map(XPU, const Blob<Dtype>&, const Blob<Dtype>&, Blob<Dtype>*).
+ */
+template<typename Dtype>
+void Div(XPU xpu, const Blob<Dtype> & A, const Blob<Dtype> & B,
+    Blob<Dtype> * C) {
+  Map<singa::op::Div>(xpu, A, B, C);
+  // TODO(wangwei) use MKL's vector func
+}
+/*************************1D<-->2D op/transform***************************/
+/**
+ * Add A to each row of B, i.e., Bij = alpha * Aj + beta * Bij.
+ * Loose shape checking, B.count() % A.count() == 0.
+ * # rows of B = B.count() / A.count().
+ * Transpose is disabled.
+ */
+template<typename Dtype>
+void MVAdd(XPU xpu, Dtype alpha, Dtype beta, const Blob<Dtype> & A,
+    Blob<Dtype> * B) {
+  CHECK_EQ(B->count() % A.count(), 0) << "#col of B not match length of A";
+  int m = A.count(), n = B->count() / m;
+  if (xpu == cpu) {
+    Blob<Dtype> one(n);
+    one.SetValue(1);
+    cpu_gemm(one.cpu_data(), A.cpu_data(), n, m, 1, alpha, beta,
+        false, false, B->mutable_cpu_data());
+  }
+#ifdef USE_GPU
+  if (xpu == gpu) {
+    // gpu part; note the kernel applies a plain add (alpha/beta ignored)
+    singa_gpu_add_vec_row(A.gpu_data(), B->gpu_data(),
+        B->mutable_gpu_data(), n, m, m);
+  }
+#endif  // USE_GPU
+}
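
The ones-vector trick above expresses the broadcast as a rank-1 GEMM: one (n x 1) times A (1 x m) is an n x m matrix whose every row equals A. Loop equivalent of the CPU branch (illustrative only):

    // B (n x m, row-major): row j becomes alpha * A + beta * row j.
    void naive_mv_add(float alpha, float beta, const float* A,
                      float* B, int m, int n) {
      for (int j = 0; j < n; ++j)
        for (int i = 0; i < m; ++i)
          B[j * m + i] = alpha * A[i] + beta * B[j * m + i];
    }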
+/**
+ * Add A to each row of B, i.e., Bij = Aj + Bij.
+ * Loose shape checking, B.count() % A.count() == 0.
+ * # rows of B = B.count() / A.count().
+ * Transpose is disabled.
+ */
+template<typename Dtype>
+void MVAdd(XPU xpu, const Blob<Dtype> & A, Blob<Dtype>* B) {
+  MVAdd(xpu, Dtype(1), Dtype(1), A, B);
+}
+
+/**
+ * Copy A to each row of B
+ * Loose shape checking, B.count() % A.count() == 0,
+ * # rows of B = B.count() / A.count().
+ * Transpose is disabled.
+ */
+template<typename Dtype>
+void Repmat(XPU xpu, const Blob<Dtype> & A, Blob<Dtype> * B) {
+  MVAdd(xpu, Dtype(1), Dtype(0), A, B);
+}
+
+/**
+ * Sum the columns of A into B, i.e., Bi = alpha * \sum_j Aji + beta * Bi.
+ * Loose shape checking, A.count() % B.count() == 0.
+ * # rows of A = A.count() / B.count().
+ * Transpose is disabled.
+ */
+template<typename Dtype>
+void MVSum(XPU xpu, Dtype alpha, Dtype beta, const Blob<Dtype> & A,
+    Blob<Dtype> * B) {
+  CHECK_EQ(A.count() % B->count(), 0) << "length of B must = # of cols of A";
+
+  int m = B->count(), n = A.count() / m;
+  if (xpu == cpu) {
+    Blob<Dtype> one(n);
+    one.SetValue(1);
+    cpu_gemm(one.cpu_data(), A.cpu_data(), 1, m, n, alpha, beta,
+        false, false, B->mutable_cpu_data());
+  }
+#ifdef USE_GPU
+  if (xpu == gpu) {
+    // gpu part
+    singa_gpu_sum_col(A.gpu_data(), B->mutable_gpu_data(), n, m, m);
+  }
+#endif  // USE_GPU
+}
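
Here the ones vector sits on the left instead: one (1 x n) times A (n x m) yields the length-m vector of column sums. Loop equivalent (illustrative only):

    // B (length m) = alpha * column sums of A (n x m, row-major) + beta * B.
    void naive_mv_sum(float alpha, float beta, const float* A,
                      float* B, int m, int n) {
      for (int i = 0; i < m; ++i) {
        float sum = 0.0f;
        for (int j = 0; j < n; ++j)
          sum += A[j * m + i];
        B[i] = alpha * sum + beta * B[i];
      }
    }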
+/**
+ * Reduce each row of A to an element of B.
+ * Loose shape checking, A.count() % B.count() == 0.
+ * # cols of A = A.count() / B.count().
+ */
+template<typename Op, typename Dtype>
+void Reduce2D(XPU xpu, const Blob<Dtype> & A, Blob<Dtype> * B) {
+  CHECK_EQ(A.count() % B->count(), 0) << "Row size not match B length";
+  int m = B->count(), n = A.count() / m;
+  if (xpu == cpu) {
+    cpu_reduce_f<Op>(A.cpu_data(), m, n, B->mutable_cpu_data());
+  }
+#ifdef USE_GPU
+  if (xpu == gpu) {
+    // gpu part
+    gpu_reduce_f<Op>(A.gpu_data(), m, n, B->mutable_gpu_data());
+  }
+#endif  // USE_GPU
+}
+/**
+ * Duplicate each element of A into a row of B.
+ * Loose shape checking, B.count() % A.count() == 0.
+ * # cols of B = B.count() / A.count().
+ */
+template<typename Op, typename Dtype>
+void Expand2D(XPU xpu, const Blob<Dtype> & A, Blob<Dtype> * B) {
+  CHECK_EQ(B->count() % A.count(), 0) << "Row size of B not match length of A";
+  int m = A.count(), n = B->count() / m;
+  if (xpu == cpu) {
+    cpu_expand_f<Op>(A.cpu_data(), m, n, B->mutable_cpu_data());
+  }
+#ifdef USE_GPU
+  if (xpu == gpu) {
+    // gpu part
+    gpu_expand_f<Op>(A.gpu_data(), m, n, B->mutable_gpu_data());
+  }
+#endif  // USE_GPU
+}
+
+}  // end of namespace singa
+
+#endif  // SINGA_BLOB_MATH_BLOB_H_

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/d3379cba/include/singa/utils/math_kernel.h
----------------------------------------------------------------------
diff --git a/include/singa/utils/math_kernel.h b/include/singa/utils/math_kernel.h
new file mode 100644
index 0000000..f5d3e34
--- /dev/null
+++ b/include/singa/utils/math_kernel.h
@@ -0,0 +1,59 @@
+#ifndef SINGA_UTILS_MATH_KERNEL_H_
+#define SINGA_UTILS_MATH_KERNEL_H_
+
+namespace singa {
+
+extern "C" {
+  void singa_gpu_sum_vec(float *data, float *sum, long n);
+
+  void singa_gpu_sum_col(const float *src_mat_data, float *dst_vec_data,
+      long rows, long cols, long stride);
+
+  void singa_gpu_add_vec_row(const float *src_vec_data,
+      const float *src_mat_data, float *des_mat_data,
+      long rows, long cols, long stride);
+
+  void singa_gpu_set_value(float *data, float value, long n);
+
+  void singa_gpu_scale(const float *src_data, float *des_data,
+      float alpha, long n);
+
+  void singa_gpu_scale_grad(float *data, float alpha, long n);
+
+  void singa_gpu_exp(const float *src_data, float *des_data,
+      float alpha, long n);
+
+  void singa_gpu_exp_grad(const float *src_data, float *des_data,
+      float alpha, long n);
+
+  void singa_gpu_sigmoid(const float *src_data, float *des_data,
+      float alpha, long n);
+
+  void singa_gpu_sigmoid_grad(const float *src_data, float *des_data,
+      float alpha, long n);
+
+  void singa_gpu_relu(const float *src_data, float *des_data,
+      float alpha, long n);
+
+  void singa_gpu_relu_grad(const float *src_data, float *des_data,
+      float alpha, long n);
+
+  void singa_gpu_tanh(const float *src_data, float *des_data,
+      float alpha, long n);
+
+  void singa_gpu_tanh_grad(const float *src_data, float *des_data,
+      float alpha, long n);
+
+  void singa_gpu_softplus(const float *src_data, float *des_data,
+      float alpha, long n);
+
+  void singa_gpu_softplus_grad(const float *src_data, float *des_data,
+      float alpha, long n);
+
+  void singa_gpu_square(const float *src_data, float *des_data,
+      float alpha, long n);
+
+  void singa_gpu_square_grad(const float *src_data, float *des_data,
+      float alpha, long n);
+
+  void singa_gpu_sqrt(const float *src_data, float *des_data,
+      float alpha, long n);
+
+  void singa_gpu_threshold(const float *src_data, float *des_data,
+      float alpha, long n);
+
+  void singa_gpu_add(const float *src_data_a, const float *src_data_b,
+      float *des_data, float alpha, float beta, long n);
+
+  void singa_gpu_sub(const float *src_data_a, const float *src_data_b,
+      float *des_data, float alpha, float beta, long n);
+
+  void singa_gpu_mult(const float *src_data_a, const float *src_data_b,
+      float *des_data, float alpha, float beta, long n);
+
+  void singa_gpu_div(const float *src_data_a, const float *src_data_b,
+      float *des_data, float alpha, float beta, long n);
+}  // extern "C"
+
+}  // namespace singa
+
+#endif  // SINGA_UTILS_MATH_KERNEL_H_
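
The declarations above use extern "C" so the nvcc-compiled kernels link cleanly from ordinary C++ translation units. A host-side usage sketch (device pointers assumed already allocated with cudaMalloc; note the add kernel currently ignores alpha and beta):

    #include <cuda_runtime.h>
    #include "singa/utils/math_kernel.h"

    // Element-wise c = a + b over n floats resident on the GPU.
    void gpu_add_example(const float* d_a, const float* d_b,
                         float* d_c, long n) {
      singa::singa_gpu_add(d_a, d_b, d_c, 1.0f, 1.0f, n);
      cudaDeviceSynchronize();  // block until the kernel finishes
    }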

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/d3379cba/include/singa/utils/singa_op.h
----------------------------------------------------------------------
diff --git a/include/singa/utils/singa_op.h b/include/singa/utils/singa_op.h
new file mode 100644
index 0000000..ff5aba4
--- /dev/null
+++ b/include/singa/utils/singa_op.h
@@ -0,0 +1,265 @@
+/************************************************************
+*
+* Licensed to the Apache Software Foundation (ASF) under one
+* or more contributor license agreements.  See the NOTICE file
+* distributed with this work for additional information
+* regarding copyright ownership.  The ASF licenses this file
+* to you under the Apache License, Version 2.0 (the
+* "License"); you may not use this file except in compliance
+* with the License.  You may obtain a copy of the License at
+*
+*   http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing,
+* software distributed under the License is distributed on an
+* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+* KIND, either express or implied.  See the License for the
+* specific language governing permissions and limitations
+* under the License.
+*
+*************************************************************/
+
+#ifndef SINGA_UTILS_SINGA_OP_H_
+#define SINGA_UTILS_SINGA_OP_H_
+
+#include <cmath>
+#include <algorithm>
+
+#ifdef USE_GPU
+#include <cuda_runtime.h>
+#include "cublas_v2.h"
+#include "singa/utils/math_kernel.h"
+#endif  // USE_GPU
+
+namespace singa {
+
+namespace op {
+
+/**
+ * b = e^a
+ */
+struct Exp {
+  inline static void Map(const float & a, float * b) {
+    *b = exp(a);
+  }
+#ifdef USE_GPU
+  inline static void CudaMap(float alpha,  const float * a,
+      float * b, int n) {
+    singa::singa_gpu_exp(a, b, alpha, n);
+  }
+#endif  // USE_GPU
+};
+/**
+ * b = log(a), base is e
+ */
+struct Log {
+  inline static void Map(const float & a, float *b) {
+    *b = log(a);
+  }
+};
+
+struct Sigmoid {
+  inline static void Map(const float & a, float * b) {
+    *b = 1.0f / (1.0f + expf(-a));
+  }
+#ifdef USE_GPU
+  inline static void CudaMap(const float * a,
+      float * b, int n) {
+    singa::singa_gpu_sigmoid(a, b, 1, n);
+  }
+#endif  // USE_GPU
+};
+struct SigmoidGrad {
+  inline static void Map(const float & a, float * b) {
+    *b = a * (1.0f - a);
+  }
+#ifdef USE_GPU
+  inline static void CudaMap(float alpha,  const float * a, float * b, int n) {
+    singa::singa_gpu_sigmoid_grad(a, b, 1, n);
+  }
+#endif  // USE_GPU
+};
+
+struct Relu {
+  inline static void Map(const float & a, float * b) {
+    *b = std::max(a, 0.0f);
+  }
+#ifdef USE_GPU
+  inline static void CudaMap(const float * a, float * b, int n) {
+    singa::singa_gpu_relu(a, b, 1, n);
+  }
+#endif  // USE_GPU
+};
+
+struct ReluGrad {
+  inline static void Map(const float & a, float * b) {
+    *b = a > 0 ? 1 : 0;
+  }
+#ifdef USE_GPU
+  inline static void CudaMap(const float * a, float * b, int n) {
+    singa::singa_gpu_relu_grad(a, b, 1, n);
+  }
+#endif  // USE_GPU
+};
+
+struct Tanh {
+  inline static void Map(const float & a, float * b) {
+    *b = tanhf(a);
+  }
+#ifdef USE_GPU
+  inline static void CudaMap(float alpha,  const float * a, float * b, int n) {
+    singa::singa_gpu_tanh(a, b, alpha, n);
+  }
+#endif  // USE_GPU
+};
+
+struct TanhGrad {
+  inline static void Map(const float & a, float * b) {
+    *b = 1 - a * a;
+  }
+#ifdef USE_GPU
+  inline static void CudaMap(float alpha,  const float * a, float * b, int n) {
+    singa::singa_gpu_tanh_grad(a, b, alpha, n);
+  }
+#endif  // USE_GPU
+};
+
+struct Softplus {
+  inline static void Map(const float & a, float * b) {
+    *b = logf(1 + expf(a));
+  }
+#ifdef USE_GPU
+  inline static void CudaMap(const float * a, float * b, int n) {
+    singa::singa_gpu_softplus(a, b, 1, n);
+  }
+#endif  // USE_GPU
+};
+
+struct SoftplusGrad {
+  inline static void Map(const float & a, float * b) {
+    *b = 1.0f / (1.0f + expf(-a));
+  }
+#ifdef USE_GPU
+  inline static void CudaMap(const float * a,
+      float * b, int n) {
+    singa::singa_gpu_softplus_grad(a, b, 1, n);
+  }
+#endif  // USE_GPU
+};
+
+struct Square {
+  inline static void Map(const float & a, float * b) {
+    *b = a * a;
+  }
+#ifdef USE_GPU
+  inline static void CudaMap(const float * a,
+      float * b, int n) {
+    singa::singa_gpu_square(a, b, 1, n);
+  }
+#endif  // USE_GPU
+};
+
+struct SquareGrad {
+  inline static void Map(const float & a, float * b) {
+    // a is the output of Square, i.e., a = x * x, so dx = 2 * x = 2 * sqrt(a)
+    *b = 2 * sqrt(a);
+  }
+#ifdef USE_GPU
+  inline static void CudaMap(const float * a,
+      float * b, int n) {
+    singa::singa_gpu_square_grad(a, b, 1, n);
+  }
+#endif  // USE_GPU
+};
+
+struct Sqrt {
+  inline static void Map(const float & a, float * b) {
+    *b = sqrt(a);
+  }
+#ifdef USE_GPU
+  inline static void CudaMap(const float * a,
+      float * b, int n) {
+    singa::singa_gpu_sqrt(a, b, 1, n);
+  }
+#endif  // USE_GPU
+};
+
+/*********************************************************************/
+/**
+ * c = pow(a, b), i.e., c = a^b
+ */
+struct Pow {
+  inline static void Map(const float & a, const float &b, float * c) {
+    *c = pow(a, b);
+  }
+};
+struct Mult {
+  inline static void Map(const float & a, const float & b, float * c) {
+    *c =  a * b;
+  }
+#ifdef USE_GPU
+  inline static void CudaMap(const float* a, const float* b, float* c, int n) {
+    singa::singa_gpu_mult(a, b, c, 1, 1, n);
+  }
+#endif  // USE_GPU
+};
+
+struct Div {
+  inline static void Map(const float & a, const float & b, float * c) {
+    *c =  a / b;
+  }
+#ifdef USE_GPU
+  inline static void CudaMap(const float * a,
+      const float * b, float * c, int n) {
+    singa::singa_gpu_div(a, b, c, 1, 1, n);
+  }
+#endif  // USE_GPU
+};
+
+
+/*********************************************************************/
+struct Set {
+  inline static void Map(float alpha, float * a) {
+    *a = alpha;
+  }
+#ifdef USE_GPU
+  inline static void CudaMap(float alpha, float * a, int n) {
+    singa::singa_gpu_set_value(a, alpha, n);
+  }
+#endif  // USE_GPU
+};
+
+struct Threshold {
+  inline static void Map(float alpha, const float & a, float * b) {
+    *b =  a < alpha ? 1.0f : 0.0f;
+  }
+#ifdef USE_GPU
+  inline static void CudaMap(float alpha,  const float * a,
+      float * b, int n) {
+    singa::singa_gpu_threshold(a, b, alpha, n);
+  }
+#endif  // USE_GPU
+};
+
+}  // namespace op
+
+}  // namespace singa
+
+#endif  // SINGA_UTILS_SINGA_OP_H_
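
A minimal sketch of how these functors plug into the Blob-level Map from math_blob.h (assumes the cpu enumerator of XPU is visible as singa::cpu; illustrative only):

    #include "singa/utils/math_blob.h"
    #include "singa/utils/singa_op.h"

    // B[i] = 1 / (1 + exp(-A[i])) on the CPU path.
    void sigmoid_blob(const singa::Blob<float>& A, singa::Blob<float>* B) {
      singa::Map<singa::op::Sigmoid>(singa::cpu, A, B);
    }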

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/d3379cba/src/blob/math_kernel.cu
----------------------------------------------------------------------
diff --git a/src/blob/math_kernel.cu b/src/blob/math_kernel.cu
deleted file mode 100644
index 4c828d5..0000000
--- a/src/blob/math_kernel.cu
+++ /dev/null
@@ -1,439 +0,0 @@
-#include <cmath>
-#include "singa/blob/math_kernel.h"
-
-#define CU2DBLOCK_X 32
-#define CU2DBLOCK_Y 32
-
-#define CU1DBLOCK 1024 
-#define CU1DBLOCKF 1024.0 
-
-
-//Cuda Kernel Functions
-
-__global__
-void kernel_sum_vec(float *data, float *sum , long n)
-{
-	int THREADS = blockDim.x;
-	
-	__shared__ float aux[CU1DBLOCK];
-	int steps = (n - 1) / THREADS + 1;
-	aux[threadIdx.x] = data[threadIdx.x];
-
-	for(int i=1; i<steps; ++i) {
-		if(threadIdx.x+i*THREADS < n) {
-			aux[threadIdx.x] += data[threadIdx.x+i*THREADS];
-		}   
-	}
-
-	int total_threads = THREADS;
-	__syncthreads();
-
-	while(total_threads > 1) {
-		int half_point = ((1+total_threads) >> 1); 
-		if (threadIdx.x < half_point) {
-			if(threadIdx.x+half_point < total_threads) {
-				aux[threadIdx.x] += aux[threadIdx.x + half_point];
-			}
-		}
-		__syncthreads();
-		total_threads = ((total_threads+1) >> 1);
-	}
-
-	__syncthreads();
-	*sum = aux[0];
-}
-
-__global__
-void kernel_sum_col(const float *src_mat_data, float *dst_vec_data, long rows, long cols, long stride)
-{
-	    int j = blockIdx.x;
-		int THREADS = blockDim.x;
-		if(j >= cols) {
-			return;
-		}
-
-		__shared__ float aux[CU1DBLOCK];
-		int steps = (rows - 1) / THREADS + 1;
-		aux[threadIdx.x] = src_mat_data[j+threadIdx.x*stride];
-		for(int i=1; i<steps; ++i) {
-			if(threadIdx.x+i*THREADS < rows) {
-				aux[threadIdx.x] += src_mat_data[j+(threadIdx.x+i*THREADS)*stride];
-			}
-		}
-
-		int total_threads = THREADS;
-		__syncthreads();
-		while(total_threads > 1) {
-			int half_point = ((1+total_threads) >> 1);
-			if (threadIdx.x < half_point) {
-				if(threadIdx.x+half_point < total_threads) {
-					aux[threadIdx.x] += aux[threadIdx.x + half_point];															            
-				}
-			}
-			__syncthreads();
-			total_threads = ((total_threads+1) >> 1);
-		}
-
-		__syncthreads();
-		dst_vec_data[j] = aux[0];
-}
-
-__global__ 
-void kernel_add_vec_row(const float *src_vec_data, const float *src_mat_data, float* des_mat_data,long rows, long cols, long stride)
-{
-	long i = blockIdx.x * blockDim.x + threadIdx.x;
-	long j = blockIdx.y * blockDim.y + threadIdx.y;
-	long num_threads_x = blockDim.x * gridDim.x;
-	long num_threads_y = blockDim.y * gridDim.y;
-	long index = 0;
-	for(; i<cols && j<rows; i+=num_threads_x, j+=num_threads_y) {
-		index = j * stride + i;
-		des_mat_data[index] = src_mat_data[index] + src_vec_data[i];
-	}
-}
-
-__global__ static
-void kernel_set_value(float *data, float value, long n)
-{
-	long index = blockIdx.x * blockDim.x + threadIdx.x;
-	long num_threads = blockDim.x * gridDim.x;
-	for(; index<n; index+=num_threads) {
-		data[index] = value;
-	}  
-}
-
-__global__
-void kernel_scale(const float *src_data, float *des_data, float alpha, long n)
-{
-	long index = blockIdx.x * blockDim.x + threadIdx.x;
-	long num_threads = blockDim.x * gridDim.x;
-	for(; index<n; index+=num_threads) {
-		des_data[index] = src_data[index] * alpha;
-	}  
-}
-
-__global__
-void kernel_scale_grad(float *data, float alpha, long n)
-{
-	long index = blockIdx.x * blockDim.x + threadIdx.x;
-	long num_threads = blockDim.x * gridDim.x;
-	for(; index<n; index+=num_threads) {
-		data[index] = alpha;
-	}  
-}
-
-__global__
-void kernel_exp(const float *src_data, float *des_data, float alpha, long n)
-{
-	long index = blockIdx.x * blockDim.x + threadIdx.x;
-	long num_threads = blockDim.x * gridDim.x;
-	for(; index<n; index+=num_threads) {
-		des_data[index] = pow(-src_data[index],alpha);
-	}  
-}
-
-__global__
-void kernel_exp_grad(const float *src_data, float *des_data, float alpha, long n)
-{
-	long index = blockIdx.x * blockDim.x + threadIdx.x;
-	long num_threads = blockDim.x * gridDim.x;
-	for(; index<n; index+=num_threads) {
-		des_data[index] = src_data[index] * log(alpha);
-	}  
-}
-
-__global__
-void kernel_sigmoid(const float *src_data, float *des_data, float alpha, long n)
-{
-	long index = blockIdx.x * blockDim.x + threadIdx.x;
-	long num_threads = blockDim.x * gridDim.x;
-	for(; index<n; index+=num_threads) {
-		des_data[index] = 1.0f / (1.0f + expf(-src_data[index]) * alpha);
-	}  
-}
-
-__global__
-void kernel_sigmoid_grad(const float *src_data, float *des_data, float alpha, long n)
-{
-	long index = blockIdx.x * blockDim.x + threadIdx.x;
-	long num_threads = blockDim.x * gridDim.x;
-	for(; index<n; index+=num_threads) {
-		des_data[index] = src_data[index] * (1.0f - src_data[index]) * alpha;
-	}  
-}
-
-__global__
-void kernel_relu(const float *src_data, float *des_data, float alpha, long n)
-{
-	long index = blockIdx.x * blockDim.x + threadIdx.x;
-	long num_threads = blockDim.x * gridDim.x;
-	for(; index<n; index+=num_threads) {
-		des_data[index] = 1.0f / ( 1 - alpha ) * max( src_data[index], 0.0f ) + alpha * src_data[index];
-	}  
-}
-
-__global__
-void kernel_relu_grad(const float *src_data, float *des_data, float alpha, long n)
-{
-	long index = blockIdx.x * blockDim.x + threadIdx.x;
-	long num_threads = blockDim.x * gridDim.x;
-	for(; index<n; index+=num_threads) {
-		des_data[index] = src_data[index] > 0.0f ? 1.0f : alpha;
-	}  
-}
-
-
-__global__
-void kernel_tanh(const float *src_data, float *des_data, float alpha, long n)
-{
-	long index = blockIdx.x * blockDim.x + threadIdx.x;
-	long num_threads = blockDim.x * gridDim.x;
-	for(; index<n; index+=num_threads) {
-		des_data[index] = tanhf( src_data[index] * alpha );
-	}  
-}
-
-__global__
-void kernel_tanh_grad(const float *src_data, float *des_data, float alpha, long n)
-{
-	long index = blockIdx.x * blockDim.x + threadIdx.x;
-	long num_threads = blockDim.x * gridDim.x;
-	for(; index<n; index+=num_threads) {
-		des_data[index] = alpha * (1.0f - src_data[index] * src_data[index] );
-	}  
-}
-
-__global__
-void kernel_softplus(const float *src_data, float *des_data, float alpha, long n)
-{
-	long index = blockIdx.x * blockDim.x + threadIdx.x;
-	long num_threads = blockDim.x * gridDim.x;
-	for(; index<n; index+=num_threads) {
-		des_data[index] = logf(1 + expf(src_data[index]));
-	}  
-}
-
-__global__
-void kernel_softplus_grad(const float *src_data, float *des_data, float alpha, long n)
-{
-	long index = blockIdx.x * blockDim.x + threadIdx.x;
-	long num_threads = blockDim.x * gridDim.x;
-	for(; index<n; index+=num_threads) {
-		des_data[index] = 1.0f / (1.0f + expf(-src_data[index]));
-	}  
-}
-
-__global__
-void kernel_square(const float *src_data, float *des_data, float alpha, long n)
-{
-	long index = blockIdx.x * blockDim.x + threadIdx.x;
-	long num_threads = blockDim.x * gridDim.x;
-	for(; index<n; index+=num_threads) {
-		des_data[index] = src_data[index] * src_data[index];
-	}  
-}
-
-__global__
-void kernel_square_grad(const float *src_data, float *des_data, float alpha, long n)
-{
-	long index = blockIdx.x * blockDim.x + threadIdx.x;
-	long num_threads = blockDim.x * gridDim.x;
-	for(; index<n; index+=num_threads) {
-		des_data[index] = 2 * sqrt(src_data[index]);
-	}  
-}
-
-__global__
-void kernel_sqrt(const float *src_data, float *des_data, float alpha, long n)
-{
-	long index = blockIdx.x * blockDim.x + threadIdx.x;
-	long num_threads = blockDim.x * gridDim.x;
-	for(; index<n; index+=num_threads) {
-		des_data[index] = sqrt(src_data[index]);
-	}  
-}
-
-__global__
-void kernel_threshold(const float *src_data, float *des_data, float alpha, long n)
-{
-	long index = blockIdx.x * blockDim.x + threadIdx.x;
-	long num_threads = blockDim.x * gridDim.x;
-	for(; index<n; index+=num_threads) {
-		des_data[index] = src_data[index] < alpha ? 1.0f : 0.0f;
-	}  
-}
-
-__global__
-void kernel_add(const float *src_data_a, const float *src_data_b, float *des_data, float alpha, float beta, long n)
-{
-	long index = blockIdx.x * blockDim.x + threadIdx.x;
-	long num_threads = blockDim.x * gridDim.x;
-	for(; index<n; index+=num_threads) {
-		des_data[index] = src_data_a[index] + src_data_b[index];
-	}  
-}
-
-__global__
-void kernel_sub(const float *src_data_a, const float *src_data_b, float *des_data, float alpha, float beta, long n)
-{
-	long index = blockIdx.x * blockDim.x + threadIdx.x;
-	long num_threads = blockDim.x * gridDim.x;
-	for(; index<n; index+=num_threads) {
-		des_data[index] = src_data_a[index] - src_data_b[index];
-	}  
-}
-
-__global__
-void kernel_mult(const float *src_data_a, const float *src_data_b, float *des_data, float alpha, float beta, long n)
-{
-	long index = blockIdx.x * blockDim.x + threadIdx.x;
-	long num_threads = blockDim.x * gridDim.x;
-	for(; index<n; index+=num_threads) {
-		des_data[index] = src_data_a[index] * src_data_b[index];
-	}  
-}
-
-__global__
-void kernel_div(const float *src_data_a, const float *src_data_b, float *des_data, float alpha, float beta, long n)
-{
-	long index = blockIdx.x * blockDim.x + threadIdx.x;
-	long num_threads = blockDim.x * gridDim.x;
-	for(; index<n; index+=num_threads) {
-		des_data[index] = src_data_a[index] / src_data_b[index];
-	}  
-}
-
-//
-namespace singa{
-
-void singa_gpu_sum_vec(float *data, float *sum , long n)
-{
-	long threads_per_block = n > CU1DBLOCK ? CU1DBLOCK : n;
-	// here, we only need one block
-	long num_blocks = 1;
-
-	kernel_sum_vec<<<num_blocks, threads_per_block>>>(data, sum, n);
-}
-
-void singa_gpu_sum_col(const float *src_mat_data, float *dst_vec_data, long rows, long cols, long stride)
-{
-	long threads_per_block = rows > CU1DBLOCK ? CU1DBLOCK : rows;
-	long num_blocks = cols;
-
-	kernel_sum_col<<<num_blocks, threads_per_block>>>(src_mat_data, dst_vec_data, rows, cols, stride);
-}
-
-void singa_gpu_add_vec_row(const float *src_vec_data, const float *src_mat_data, float *des_mat_data ,long rows, long cols, long stride)
-{
-	dim3 threads_per_block(CU2DBLOCK_X, CU2DBLOCK_Y);
-	dim3 num_blocks(cols/threads_per_block.x + (cols%threads_per_block.x == 0 ? 0 : 1), rows/threads_per_block.y + (rows%threads_per_block.y == 0 ? 0 : 1));
-	kernel_add_vec_row<<<num_blocks, threads_per_block>>>(src_vec_data, src_mat_data, des_mat_data,rows, cols, stride);
-}
-
-void singa_gpu_set_value(float *data, float value, long n)
-{
-	kernel_set_value<<<ceil(n/CU1DBLOCKF), CU1DBLOCKF>>>(data, value, n);
-}
-
-void singa_gpu_scale(const float *src_data, float *des_data, float alpha, long n)
-{
-	kernel_scale<<<ceil(n/CU1DBLOCKF), CU1DBLOCKF>>>(src_data, des_data, alpha, n);
-}
-
-void singa_gpu_scale_grad(float *data, float alpha, long n)
-{
-	kernel_scale_grad<<<ceil(n/CU1DBLOCKF), CU1DBLOCKF>>>(data, alpha, n);
-}
-
-void singa_gpu_exp(const float *src_data, float *des_data, float alpha, long n)
-{
-	kernel_exp<<<ceil(n/CU1DBLOCKF), CU1DBLOCKF>>>(src_data, des_data, alpha, n);
-}
-
-void singa_gpu_exp_grad(const float *src_data, float *des_data, float alpha, long n)
-{
-	kernel_exp_grad<<<ceil(n/CU1DBLOCKF), CU1DBLOCKF>>>(src_data, des_data, alpha, n);
-}
-
-void singa_gpu_sigmoid(const float *src_data, float *des_data, float alpha, long n)
-{
-	kernel_sigmoid<<<ceil(n/CU1DBLOCKF), CU1DBLOCKF>>>(src_data, des_data, alpha, n);
-}
-
-void singa_gpu_sigmoid_grad(const float *src_data, float *des_data, float alpha, long n)
-{
-	kernel_sigmoid_grad<<<ceil(n/CU1DBLOCKF), CU1DBLOCKF>>>(src_data, des_data, alpha, n);
-}
-
-void singa_gpu_relu(const float *src_data, float *des_data, float alpha, long n)
-{
-	kernel_relu<<<ceil(n/CU1DBLOCKF), CU1DBLOCKF>>>(src_data, des_data, alpha, n);
-}
-
-void singa_gpu_relu_grad(const float *src_data, float *des_data, float alpha, long n)
-{
-	kernel_relu_grad<<<ceil(n/CU1DBLOCKF), CU1DBLOCKF>>>(src_data, des_data, alpha, n);
-}
-
-void singa_gpu_tanh(const float *src_data, float *des_data, float alpha, long n)
-{
-	kernel_tanh<<<ceil(n/CU1DBLOCKF), CU1DBLOCKF>>>(src_data, des_data, alpha, n);
-}
-
-void singa_gpu_tanh_grad(const float *src_data, float *des_data, float alpha, long n)
-{
-	kernel_tanh_grad<<<ceil(n/CU1DBLOCKF), CU1DBLOCKF>>>(src_data, des_data, alpha, n);
-}
-
-void singa_gpu_softplus(const float *src_data, float *des_data, float alpha, long n)
-{
-	kernel_softplus<<<ceil(n/CU1DBLOCKF), CU1DBLOCKF>>>(src_data, des_data, alpha, n);
-}
-
-void singa_gpu_softplus_grad(const float *src_data, float *des_data, float alpha, long n)
-{
-	kernel_softplus_grad<<<ceil(n/CU1DBLOCKF), CU1DBLOCKF>>>(src_data, des_data, alpha, n);
-}
-
-void singa_gpu_square(const float *src_data, float *des_data, float alpha, long n)
-{
-	kernel_square<<<ceil(n/CU1DBLOCKF), CU1DBLOCKF>>>(src_data, des_data, alpha, n);
-}
-
-void singa_gpu_square_grad(const float *src_data, float *des_data, float alpha, long n)
-{
-	kernel_square_grad<<<ceil(n/CU1DBLOCKF), CU1DBLOCKF>>>(src_data, des_data, alpha, n);
-}
-
-void singa_gpu_sqrt(const float *src_data, float *des_data, float alpha, long n)
-{
-	kernel_sqrt<<<ceil(n/CU1DBLOCKF), CU1DBLOCKF>>>(src_data, des_data, alpha, n);
-}
-
-void singa_gpu_threshold(const float *src_data, float *des_data, float alpha, long n)
-{
-	kernel_threshold<<<ceil(n/CU1DBLOCKF), CU1DBLOCKF>>>(src_data, des_data, alpha, n);
-}
-
-void singa_gpu_add(const float *src_data_a, const float *src_data_b, float *des_data, float alpha, float beta, long n)
-{
-	kernel_add<<<ceil(n/CU1DBLOCKF), CU1DBLOCKF>>>(src_data_a, src_data_b, des_data, alpha, beta, n);
-}
-
-void singa_gpu_sub(const float *src_data_a, const float *src_data_b, float *des_data, float alpha, float beta, long n)
-{
-	kernel_sub<<<ceil(n/CU1DBLOCKF), CU1DBLOCKF>>>(src_data_a, src_data_b, des_data, alpha, beta, n);
-}
-
-void singa_gpu_mult(const float *src_data_a, const float *src_data_b, float *des_data, float alpha, float beta, long n)
-{
-	kernel_mult<<<ceil(n/CU1DBLOCKF), CU1DBLOCKF>>>(src_data_a, src_data_b, des_data, alpha, beta, n);
-}
-
-void singa_gpu_div(const float *src_data_a, const float *src_data_b, float *des_data, float alpha, float beta, long n)
-{
-	kernel_div<<<ceil(n/CU1DBLOCKF), CU1DBLOCKF>>>(src_data_a, src_data_b, des_data, alpha, beta, n);
-}
-
-
-}//namespace singa_gpu