You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@singa.apache.org by ka...@apache.org on 2015/11/16 07:08:53 UTC
[07/19] incubator-singa git commit: SINGA-80 New Blob Level and
Address Level Math Operation Interface
SINGA-80 New Blob Level and Address Level Math Operation Interface
* Move files from blob/ folder into utils folder. May separate some files from utils later.
* Remove functions like relu/softplus/sigmoid for Blobs, since they only have one line of code, which can be written directly when called.
* Compile cpu code successfully.
TODO rebase to latest master and do test.
Project: http://git-wip-us.apache.org/repos/asf/incubator-singa/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-singa/commit/d3379cba
Tree: http://git-wip-us.apache.org/repos/asf/incubator-singa/tree/d3379cba
Diff: http://git-wip-us.apache.org/repos/asf/incubator-singa/diff/d3379cba
Branch: refs/heads/master
Commit: d3379cbaff99fce26f9b06301cc0d63a56b062ee
Parents: 4b84dbe
Author: Wei Wang <wa...@comp.nus.edu.sg>
Authored: Mon Nov 9 16:57:33 2015 +0800
Committer: Wei Wang <wa...@comp.nus.edu.sg>
Committed: Mon Nov 9 17:04:48 2015 +0800
----------------------------------------------------------------------
include/singa/blob/math_addr.h | 206 ------------
include/singa/blob/math_blob.h | 597 ---------------------------------
include/singa/blob/math_kernel.h | 59 ----
include/singa/blob/singa_op.h | 350 -------------------
include/singa/utils/blob.h | 55 +--
include/singa/utils/math_addr.h | 206 ++++++++++++
include/singa/utils/math_blob.h | 487 +++++++++++++++++++++++++++
include/singa/utils/math_kernel.h | 59 ++++
include/singa/utils/singa_op.h | 265 +++++++++++++++
src/blob/math_kernel.cu | 439 ------------------------
src/blob/test.cc | 165 ---------
src/test/test_math.cc | 31 +-
src/utils/blob.cc | 10 +-
src/utils/math_kernel.cu | 439 ++++++++++++++++++++++++
14 files changed, 1512 insertions(+), 1856 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/d3379cba/include/singa/blob/math_addr.h
----------------------------------------------------------------------
diff --git a/include/singa/blob/math_addr.h b/include/singa/blob/math_addr.h
deleted file mode 100644
index 25cef07..0000000
--- a/include/singa/blob/math_addr.h
+++ /dev/null
@@ -1,206 +0,0 @@
-/************************************************************
-*
-* Licensed to the Apache Software Foundation (ASF) under one
-* or more contributor license agreements. See the NOTICE file
-* distributed with this work for additional information
-* regarding copyright ownership. The ASF licenses this file
-* to you under the Apache License, Version 2.0 (the
-* "License"); you may not use this file except in compliance
-* with the License. You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an
-* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-* KIND, either express or implied. See the License for the
-* specific language governing permissions and limitations
-* under the License.
-*
-*************************************************************/
-
-#ifndef SINGA_BLOB_MATH_ADDR_H_
-#define SINGA_BLOB_MATH_ADDR_H_
-extern "C" {
- #include <cblas.h>
-}
-#ifdef USE_GPU
-#include <cuda_runtime.h>
-#endif
-#include "singa/blob/singa_op.h"
-#ifdef USE_GPU
-#include "cublas_v2.h"
-#endif
-
-
-namespace singa {
-
-template<typename Dtype>
-void cpu_gemm(const Dtype * A, const Dtype * B,
- const int m, const int n, const int k, const Dtype alpha, const Dtype beta,
- const bool TranA, const bool TranB, Dtype * C) {
- int lda, ldb;
- CBLAS_TRANSPOSE tA, tB;
- lda = TranA ? m : k;
- ldb = TranB ? k : n;
- tA = TranA ? CblasTrans : CblasNoTrans;
- tB = TranB ? CblasTrans : CblasNoTrans;
- cblas_sgemm(CblasRowMajor, tA, tB, m, n, k, alpha, A, lda,
- B, ldb, beta, C, n);
-}
-
-// should be very careful:
-// m is the length of B, and n is the length of C , A is a n*m matrix
-template<typename Dtype>
-void cpu_gemv(const Dtype * A, const Dtype * B, const int m, const int n,
- const Dtype alpha, const Dtype beta, const bool TranA, Dtype * C) {
- int lda, ldb;
- CBLAS_TRANSPOSE tA, tB;
- lda = TranA ? m : k;
- ldb = TranB ? k : n;
- tA = TranA ? CblasTrans : CblasNoTrans;
- tB = TranB ? CblasTrans : CblasNoTrans;
- cblas_sgemm(CblasRowMajor, tA, tB, m, n, k, alpha, A, lda,
- B, ldb, beta, C, n);
-
-}
-
-template<typename Dtype>
-void cpu_axpy(const Dtype * A, const int n, const Dtype alpha, Dtype * B) {
- cblas_saxpy(n, alpha, A, 1, B, 1);
-}
-
-template<typename Dtype>
-Dtype cpu_dot(const Dtype * A, const Dtype * B, const int n) {
- Dtype sum = 0;
- for (int i = 0 ; i < n ; i++)
- sum += A[i] * B[i];
- return sum;
-}
-
-// element-wise
-template<typename Op, typename Dtype>
-void cpu_e_f(const int n, const Dtype alpha, Dtype * A) {
- for (int i = 0 ; i < n ; i++) {
- Op::Map(alpha, &A[i]);
- }
-}
-
-template<typename Op, typename Dtype>
-void cpu_e_f(const int n, const Dtype * A, const Dtype alpha, Dtype * B) {
- for (int i = 0 ; i < n ; i++) {
- Op::Map(alpha, A[i], &B[i]);
- }
-}
-
-template<typename Op, typename Dtype>
-void cpu_e_f(const int n, const Dtype * A, const Dtype * B,
- const Dtype alpha, const Dtype beta, Dtype * C) {
- for (int i = 0 ; i < n ; i++) {
- Op::Map(alpha, beta, A[i], B[i], &C[i]);
- }
-}
-// element-wise generalized operation defined in Op
-
-
-// matrix/vector expand/reduce
-
-template<typename Op, typename Dtype>
-void cpu_reduce_f(const Dtype * A, const int m, const int n, Dtype * B) {
- for (int i = 0 ; i < m ; i++) {
- Op::Map(A+i*n, n, B[i]);
- }
-}
-// reduce each row of A to an element of B e.g. the sum operation in softmax
-template<typename Op, typename Dtype>
-void cpu_expand_f(const Dtype * A, const int m, const int n, Dtype * B) {
- for (int i = 0 ; i < m ; i++) {
- Op::Map(A[i], n, B+i*n);
- }
-}
-// expand each element in A into a row of B
-
-#ifdef USE_GPU
-template<typename Dtype>
-void gpu_gemm(const Dtype * A, const Dtype * B, const int m, const int n,
- const int k, const Dtype alpha, const Dtype beta, const bool TranA,
- const bool TranB, Dtype * C) {
- int lda = TranA ? m : k;
- int ldb = TranB ? k : n;
- int ldc = n;
- cublasOperation_t tA = (TranA == false) ? CUBLAS_OP_N : CUBLAS_OP_T;
- cublasOperation_t tB = (TranB == false) ? CUBLAS_OP_N : CUBLAS_OP_T;
- cublasHandle_t handle;
- cublasCreate(&handle);
- cublasSgemm(handle, tB, tA, n, m, k, &alpha, B, ldb,
- A, lda, &beta, C, ldc);
- cublasDestroy(handle);
-}
-
-template<typename Dtype>
-void gpu_gemv(const Dtype * A, const Dtype * B, const int m, const int n,
- const Dtype alpha, const Dtype beta, const bool TranA, Dtype * C) {
- int lda = n;
- cublasOperation_t tA = (TranA == true) ? CUBLAS_OP_N : CUBLAS_OP_T;
- cublasHandle_t handle;
- cublasCreate(&handle);
- cublasSgemv(handle, tA, n, m, &alpha , A, lda, B, 1, &beta, C, 1);
- cublasDestroy(handle);
-}
-
-template<typename Dtype>
-void gpu_axpy(const Dtype * A, const int n, const Dtype alpha, Dtype * B) {
- cublasHandle_t handle;
- cublasCreate(&handle);
- cublasSaxpy(handle, n, &alpha, A, 1, B, 1);
- cublasDestroy(handle);
-}
-
-template<typename Dtype>
-Dtype gpu_dot(const Dtype * A, const Dtype * B, const int n) {
- cublasHandle_t handle;
- cublasCreate(&handle);
- Dtype result = 0.0;
- cublasSdot(handle, n, A, 1, B, 1, &result);
- cublasDestroy(handle);
- return result;
-}
-
-// element-wise
-template<typename Op, typename Dtype>
-void gpu_e_f(const int n, const Dtype alpha, Dtype * A) {
- Op::CudaMap(alpha, A, n);
-}
-
-template<typename Op, typename Dtype>
-void gpu_e_f(const int n, const Dtype * A, const Dtype alpha, Dtype * B) {
- Op::CudaMap(alpha, A, B, n);
-}
-
-template<typename Op, typename Dtype>
-void gpu_e_f(const int n, const Dtype * A, const Dtype * B,
- const Dtype alpha, const Dtype beta, Dtype * C) {
- Op::CudaMap(alpha, beta, A, B, C, n);
-}
-// element-wise generalized operation defined in Op
-
-// matrix/vector expand/reduce
-
-template<typename Op, typename Dtype>
-void gpu_reduce_f(const Dtype * A, const int m, const int n, Dtype * B) {
- for (int i = 0 ; i < m ; i++) {
- Op::CudaMap(A+i*n, n, B[i]);
- }
-}
-// reduce each row of A to an element of B e.g. the sum operation in softmax
-template<typename Op, typename Dtype>
-void gpu_expand_f(const Dtype * A, const int m, const int n, Dtype * B) {
- for (int i = 0 ; i < m ; i++) {
- Op::CudaMap(A[i], n, B+i*n);
- }
-}
-// expand each element in A into a row of B
-#endif // USE_GPU
-
-} // namespace singa
-#endif // SINGA_BLOB_MATH_ADDR_H_
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/d3379cba/include/singa/blob/math_blob.h
----------------------------------------------------------------------
diff --git a/include/singa/blob/math_blob.h b/include/singa/blob/math_blob.h
deleted file mode 100644
index f3147e8..0000000
--- a/include/singa/blob/math_blob.h
+++ /dev/null
@@ -1,597 +0,0 @@
-/************************************************************
-*
-* Licensed to the Apache Software Foundation (ASF) under one
-* or more contributor license agreements. See the NOTICE file
-* distributed with this work for additional information
-* regarding copyright ownership. The ASF licenses this file
-* to you under the Apache License, Version 2.0 (the
-* "License"); you may not use this file except in compliance
-* with the License. You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an
-* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-* KIND, either express or implied. See the License for the
-* specific language governing permissions and limitations
-* under the License.
-*
-*************************************************************/
-
-#ifndef SINGA_BLOB_MATH_BLOB_H_
-#define SINGA_BLOB_MATH_BLOB_H_
-
-#include <vector>
-#include "singa/utils/blob.h"
-#include "singa/blob/singa_op.h"
-#include "singa/blob/math_addr.h"
-
-
-namespace singa {
-enum XPU {cpu, gpu, any};
-
-/************* BLAS level 1 *****************/
-/**
- * Scale each element of A with alpha, and put the result into B.
- * Bi = alpha*Ai
- * Use blas scale internally.
- */
-template<typename Dtype>
-void Scale(xpu xpu, Dtype alpha, const Blob<Dtype> & A, Blob<Dtype> * B) {
- CHECK_EQ(A.count(), B->count());
- if (xpu == cpu)
- cpu_scale(A.count(), alpha, A.cpu_data(), B->mutable_cpu_data());
-#ifdef USE_GPU
-#endif
-}
-
-/**
- * Element-wise operation: Bi = alpha*Ai+Bi. A and B should have the same size
- */
-template<typename Dtype>
-void AXPY(XPU xpu, Dtype alpha, const Blob<Dtype> & A, Blob<Dtype> * B) {
- CHECK_EQ(A.count(), B.count());
- if (xpu == cpu) {
- cpu_axpy(A.cpu_data(), A.count(),
- alpha, B->mutable_cpu_data());
- }
-#ifdef USE_GPU
- if (xpu == gpu) {
- gpu_axpy(A.gpu_data(), A.count(),
- alpha, B->mutable_gpu_data());
- }
-#endif // USE_GPU
-}
-
-/************* BLAS level 2 *****************/
-/**
- * Matrix vector multiplication, C = alpha A(.T) * B + beta C.
- * Strict shape checking:
- * - dim of A ==2
- * columsn of A(.T) == B.count()
- * - rows of A(.T) == C.count()
- *
- * @param[in] alpha
- * @param[in] beta
- * @param[in] A, matrix
- * @param[in] B, vector
- * @param[in, out] C, vector
- */
-template<typename Dtype>
-void GEMV(XPU, xpu, Dtype alpha, Dtype beta, const Blob<Dtype>& A,
- const Blob<Dtype>& B, Blob<Dtype>* C) {
- CHECK_EQ(A.shape().size(), 2) << "A must be a matrix";
- int a1, a2, m, n;
- a1 = A.transpose() ? A.shape(1) : A.shape(0);
- a2 = A.transpose() ? A.shape(0) : A.shape(1);
- m = B.count();
- n = C->count();
- CHECK_EQ(a2, m) << "# columns of A(.T) must = length of B";
- CHECK_EQ(a1, n) << "# rows of A(.T) must = length of C";
-
- bool TranA = A.transpose();
- if (xpu == cpu) {
- cpu_gemv(A.cpu_data(), B.cpu_data(), m, n, alpha, beta, TranA,
- C->mutable_cpu_data());
- }
-#ifdef USE_GPU
- if (xpu == gpu) {
- // gpu part
- gpu_gemv(A.gpu_data(), B.gpu_data(), m, n, alpha, beta, TranA,
- C->mutable_gpu_data());
- }
-#endif // USE_GPU
-}
-/**
- * Matrix vector multiplication, C = A(.T) * B, transpose is considered.
- * Loose shape checking:
- * - dim of A >=2
- * - A.count() % B.count() == 0
- * - B.count() == C.count()
- *
- * @param[in] A input matrix
- * @param[in] B input vector
- * @param[out] C output vector
- */
-template <typename Dtype>
-void MVDot(XPU xpu, const Blob<Dtype>& A, const Blob<Dtype>& B, Blob<Dtype>* C)
-{
- GEMV(xpu, Dtype(1), Dtype(0), A, B, C);
-}
-
-/************* BLAS level 3 *****************/
-/**
- * Matrix multiplication, C = alpha A*B + beta C, A, B and C are matrix.
- *
- * Tranpose is considered for A and B.
- * Strict shape checking:
- * - all are matrix
- * - shapes match for matrix multiplication
- *
- * @param[in] alpha
- * @param[in] beta
- * @param[in] A, matrix
- * @param[in] B, matrix
- * @param[in, out] C, matrix
- */
-template <typename Dtype>
-void GEMM(XPU xpu, Dtype alpha, Dtype beta, const Blob<Dtype>& A,
- const Blob<Dtype> & B, Blob<Dtype> * C) {
- CHECK_EQ(A.shape().size(), 2);
- CHECK_EQ(B.shape().size(), 2);
- CHECK_EQ(C.shape().size(), 2);
- int a1, a2, b1, b2, m, n;
- CHECK(!C->transpose());
- a1 = A.transpose() ? A.shape(1) : A.shape(0);
- a2 = A.transpose() ? A.shape(0) : A.shape(1);
- b1 = B.transpose() ? B.shape(1) : B.shape(0);
- b2 = B.transpose() ? B.shape(0) : B.shape(1);
- m = C->shape(0);
- n = C->shape(1);
- CHECK__EQ(a2, b1);
- CHECK__EQ(a1, m);
- CHECK__EQ(b2, n);
-
- int k = A.transpose() ? A.shape(0) : A.shape(1);
- bool TranA = A.transpose();
- bool TranB = B.transpose();
- if (xpu == cpu) {
- cpu_gemm(A.cpu_data(), B.cpu_data(), m, n, k, alpha, beta,
- TranA, TranB, C->mutable_cpu_data());
- }
-#ifdef USE_GPU
- if (xpu == gpu) {
- // gpu part
- gpu_gemm(A.gpu_data(), B.gpu_data(), m, n, k, alpha, beta,
- TranA, TranB, C->mutable_gpu_data());
- }
-#endif // USE_GPU
-}
-/**
- * Matrix multiplication, C = A(.T) * B(.T), transpose is considered.
- * Strict shape checking:
- * - all are matrix
- * - shapes match for matrix multiplication
- *
- * @param[in] A input matrix
- * @param[in] B input matrix
- * @param[out] C output matrix
- */
-template <typename Dtype>
-void MMDot(XPU xpu, const Blob<Dtype>& A, const Blob<Dtype>& B, Blob<Dtype>* C)
-{
- GEMM(xpu, Dtype(1), Dtype(0), A, B, C);
-}
-
-
-/*********************** Inner and Outer product****************************/
-/**
- * Inner product for two vectors.
- * Loose shape checking, A.count() == B.count.
- *
- * @param[in] A, input vector (shape checking using A.count()).
- * @param[in] B, input vector (shape checking using B.count()).
- * @return inner product value.
- */
-template <typename Dtype>
-Dtype VVDot(XPU xpu, const Blob<Dtype> & A, const Blob<Dtype> & B) {
- Dtype res = 0;
- CHECK_EQ(A.count(), B.count());
- int n = A.count();
- if (xpu == cpu) {
- res = cpu_dot(A.cpu_data(), B.cpu_data(), n);
- }
-#ifdef USE_GPU
- if (xpu == gpu) {
- // gpu part
- res = gpu_dot(A.gpu_data(), B.gpu_data(), n);
- }
-#endif // USE_GPU
- return res;
-}
-
-/**
- * Outer product, C = A ** B, transpose is disabled.
- * Loose shape checking, A.count() * B.count() == C.count()
- *
- * @param[in] A, input vector
- * @param[in] B, input vector
- * @param[out] C, output matrix
- */
-template <typename Dtype>
-void OuterProduct(XPU xpu, const Blob<Dtype>& A, const Blob<Dtype>& B,
- Blob<Dtype> * C) {
- CHECK(!C.transpose()); // do not support C.T now.
-
- int m = A.count();
- int n = B.count();
- CHECK_EQ(C->count(), m * n);
-
- if (xpu == cpu) {
- cpu_gemm(A.cpu_data(), B.cpu_data(), m, n, 1, 1, 0,
- false, false, C->mutable_cpu_data());
- }
-#ifdef USE_GPU
- if (xpu == gpu) {
- // gpu part
- gpu_gemm(A.gpu_data(), B.gpu_data(), m, n, 1, 1, 0,
- false, false, C->mutable_gpu_data());
- }
-#endif // USE_GPU
-}
-/*********************** Element-wise functions ***********************/
-/**
- * Apply the function from Op for each element in A and put the result into B,
- * i.e., Bi = Op(Ai).
- * Loose shape checking, A.count() == B.count().
- */
-template<typename Op, typename Dtype>
-void Map(XPU xpu, const Blob<Dtype> & A, Blob<Dtype> * B) {
- CHECK_EQ(A.count(), B->count()) << "Blobs must have the same size";
- if (xpu == cpu) {
- cpu_e_f<Op>(A.count(), A.cpu_data(), B->mutable_cpu_data());
- }
-#ifdef SINGA_GPU
- if (xpu == gpu) {
- // gpu part
- gpu_e_f<Op>(A.count(), A.gpu_data(), B->mutable_gpu_data());
- }
-#endif // SINGA_GPU
-}
-
-/**
- * Apply the function from Op for each element in A and B, and put the result
- * into C, i.e., Ci = Op(Ai, Bi).
- * Loose shape checking, A, B and C are of the same size.
- */
-template<typename Op, typename Dtype>
-void Map(XPU xpu, const Blob<Dtype> & A, const Blob<Dtype> & B,
- Blob<Dtype> * C) {
- CHECK_EQ(A.count(), B.count()) << "Blobs must have the same size";
- CHECK_EQ(A.count(), C->count()) << "Blobs must have the same size";
- if (xpu == cpu) {
- cpu_e_f<Op>(A.count(), A.cpu_data(), B.cpu_data(), C->mutable_cpu_data());
- }
-#ifdef SINGA_GPU
- if (xpu == gpu) {
- // gpu part
- gpu_e_f<Op>(A.count(), A.gpu_data(), B.gpu_data(), C->mutable_gpu_data());
- }
-#endif // SINGA_GPU
-}
-
-/**
- * Bi = Op(alpha, Ai)
- * Loose shape checking, A.count() == B.count().
- */
-template<typename Op, typename Dtype>
-void Map(XPU xpu, Dtype alpha, const Blob<Dtype>& A, const Blob<Dtype>* B) {
- CHECK_EQ(A.count(), B->count()) << "Blobs must have the same size";
- if (xpu == cpu) {
- cpu_e_f<Op>(A.count(), alpha, A.cpu_data(), B->mutable_cpu_data());
- }
-#ifdef SINGA_GPU
-#endif // SINGA_GPU
-}
-/**
- * Ci = Op(alpha, Ai, Bi)
- * Loose shape checking, A, B and C are of the same size.
- */
-template<typename Op, typename Dtype>
-void Map(XPU xpu, Dtype alpha, const Blob<Dtype>& A, const Blob<Dtype>& B,
- Blob<Dtype>* C) {
- CHECK_EQ(A.count(), B->count()) << "Blobs must have the same size";
- if (xpu == cpu) {
- cpu_e_f<Op>(A.count(), alpha, A.cpu_data(), B->cpu_data(),
- C->mutable_cpu_data());
- }
-#ifdef SINGA_GPU
-#endif // SINGA_GPU
-}
-
-/**
- * Currently use std::copy which has shown better performance than memcpy.
- * http://stackoverflow.com/questions/4707012/c-memcpy-vs-stdcopy
- * TODO(wangwei) test blas copy vs std::copy.
- *
- * Loose shape checking, A.count() == B.count().
- */
-template<typename Dtype>
-void Copy(XPU xpu, const Blob<Dtype>& A, const Blob<Dtype>* B) {
- CHECK_EQ(A.count(), B->count()) << "Blobs must have the same size";
- if (xpu == cpu)
- std::copy(A.cpu_data(), A.cpu_data() + A.count(), B->mutable_cpu_data());
- else {
- LOG(FATAL) << "Not implemented";
- }
-}
-
-/**
- * C = A + B
- * Implemented using Copy and AXPY.
- */
-template<typename Dtype>
-void Add(XPU xpu, const Blob<Dtype> & A, const Blob<Dtype> & B,
- Blob<Dtype> * C) {
- Copy(A, C);
- AXPY(B, C, 1);
-}
-
-/**
- * C = A - B
- * Implemented using Copy and AXPY.
- */
-template<typename Dtype>
-void Sub(XPU xpu, const Blob<Dtype> & A, const Blob<Dtype> & B,
- Blob<Dtype> * C) {
- Copy(xpu, A, C);
- AXPY(xpu, B, C, -1);
-}
-
-/**
- * C = A * B, implemented using
- * Map(XPU, const Blob<Dtype>&, const Blob<Dtype>&, Blob<Dtype>*).
- */
-template<typename Dtype>
-void Mult(XPU xpu, const Blob<Dtype> & A, const Blob<Dtype> & B,
- Blob<Dtype> * C) {
- Map<singa::op::Mult>(xpu, A, B, C);
- // TODO(wangwei) use MKL's vector func
-}
-
-/**
- * C = A / B, implemented using
- * Map(XPU, const Blob<Dtype>&, const Blob<Dtype>&, Blob<Dtype>*).
- */
-template<typename Dtype>
-void Div(XPU xpu, const Blob<Dtype> & A, const Blob<Dtype> & B,
- Blob<Dtype> * C) {
- Map<singa::op::Div>(xpu, A, B, C);
- // TODO(wangwei) use MKL's vector func
-}
-/*************************1D<-->2D op/transform***************************/
-/**
- * Add each row of B with A, i.e., Bij = alpha*Ai + beta*Bij
- * Loose shape checking, B.count() % A.count() == 0.
- * # rows of B = B.count() / A.count().
- * Transpose is disabled.
- */
-template<typename Dtype>
-void MVAdd(XPU xpu, Dtype alpha, Dtype beta, const Blob<Dtype> & A,
- Blob<Dtype> * B) {
- CHECK_EQ(B.count() % A.count(), 0) << "#col of B not match length of A";
- int m = A.count(), n = B->count() / m;
- if (xpu == cpu) {
- Blob<Dtype> one(n);
- one.SetValue(1);
- cpu_gemm(A.cpu_data(), one.cpu_data(), m, n, 1, alpha, beta,
- false, false, B->mutable_cpu_data());
- }
-#ifdef USE_GPU
- if (xpu == gpu) {
- singa_gpu_add_vec_row(B->gpu_data(),
- A.gpu_data(), A.gpu_data(), m, n, n);
- // gpu part
- }
-#endif // USE_GPU
-}
-/**
- * Add each row of B with A, i.e., Bij = Ai + Bij
- * Loose shape checking, B.count() % A.count() == 0.
- * # rows of B = B.count() / A.count().
- * Transpose is disabled.
- */
-template<typename Dtype>
-void MVAdd(XPU xpu, const Blob<Dtype> & A, Blob<Dtype>* B) {
- MVAdd(xpu, Dtype(1), Dtype(1), A, B);
-}
-
-/**
- * Copy A to each row of B
- * Loose shape checking, B.count() % A.count() == 0,
- * # rows of B = B.count() / A.count().
- * Transpose is disabled.
- */
-template<typename Dtype>
-void Repmat(XPU xpu, const Blob<Dtype> & A, Blob<Dtype> * B) {
- MVAdd(xpu, Dtype(1), Dtype(0), A, B);
-}
-
-/**
- * Add each col of matrix A to vector B, i.e., Bi = \sum_j {alpha*Aij}+beta*Bi
- * Loose shape checking, A.count() % B.count() == 0.
- * # rows of A = A.count() / B.count().
- * Transpose is disabled.
- */
-template<typename Dtype>
-void MVSum(XPU xpu, Dtype alpha, Dtype beta, const Blob<Dtype> & A,
- Blob<Dtype> * B) {
- CHECK_EQ(A.count() % B->count(), 0) << "length of B must = # of cols of A";
-
- int m = B->count(), n = A.count() / m;
- if (xpu == cpu) {
- Blob<Dtype> one(n);
- one.SetValue(1);
- cpu_gemm(A.cpu_data(), one.cpu_data(), m, 1, n, alpha, beta,
- false, false, B->mutable_cpu_data());
- }
-#ifdef USE_GPU
- if (xpu == gpu) {
- singa_gpu_sum_col(A.gpu_data(), B->gpu_data(), m, n, n);
- // gpu part
- }
-#endif // USE_GPU
-}
-/**
- * Reduce each row of A to an element of B.
- * Loose shape checking, A.count() % B.count() == 0.
- * # rows of A = A.count() / B.count().
- */
-template<typename Op, typename Dtype>
-void Reduce2D(XPU xpu, const Blob<Dtype> & A, Blob<Dtype> * B) {
- CHECK_EQ(A.count() % B.count(), 0) << "Row size not match B length";
- int m = B->count(), n = A.count() / m;
- if (xpu == cpu) {
- cpu_reduce_f<Op>(A.cpu_data(), m, n, B->mutable_cpu_data());
- }
-#ifdef SINGA_GPU
- if (xpu == gpu) {
- // gpu part
- gpu_reduce_f<Op>(A.gpu_data(), m, n, B->mutable_gpu_data());
- }
-#endif // SINGA_GPU
-}
-/**
- * Duplicate each element of A into a row of B.
- * Loose shape checking, B.count() % A.count() == 0.
- * # rows of B = B.count() / A.count().
- */
-template<typename Op, typename Dtype>
-void Expand2D(XPU xpu, const Blob<Dtype> & A, Blob<Dtype> * B) {
- CHECK_EQ(B.count() % A.count(), 0) << "Row size of B not match length of A";
- int m = A.count(), n = B->count() / m;
- if (xpu == cpu) {
- cpu_expand_f<Op>(A.cpu_data(), m, n, B->mutable_cpu_data());
- }
-#ifdef SINGA_GPU
- if (xpu == gpu) {
- // gpu part
- gpu_expand_f<Op>(A.gpu_data(), m, n, B->mutable_gpu_data());
- }
-#endif // SINGA_GPU
-}
-
-/***********************************************************/
-/**
- * Apply the function from Op for each element in A, Op(Ai).
- * @param A
- */
-template<typename Op, typename Dtype>
-void Map(XPU xpu, Blob<Dtype>* A) {
- if (xpu == cpu) {
- cpu_e_f<Op>(A->count(), A->mutable_cpu_data());
- }
-#ifdef SINGA_GPU
- if (xpu == gpu) {
- // gpu part
- gpu_e_f<Op>(A->count(), A->mutable_gpu_data());
- }
-#endif // SINGA_GPU
-}
-
-/**
- * B = e ^ A
- */
-template<typename Dtype>
-void Exp(XPU xpu, const Blob<Dtype> & A, Blob<Dtype> * B) {
- Map<singa::op::Exp>(xpu, A, B);
-}
-
-/**
- * element-wise operation: b = 1.0f / (1.0f + expf(-a));
- */
-template<typename Dtype>
-inline void Sigmoid(XPU xpu, const Blob<Dtype> & A, Blob<Dtype> * B) {
- Map<singa::op::Sigmoid>(xpu, A, B);
-}
-
-/**
- * element-wise operation: b = a * ( 1.0f - a );
- */
-inline void SigmoidGrad(XPU xpu, const Blob<Dtype> & A, Blob<Dtype> * B) {
- Map<singa::op::SigmoidGrad>(xpu, A, B);
-}
-
-/**
- * element-wise operation: b = std::max( a, 0)
- */
-inline void Relu(XPU xpu, const Blob<Dtype> & A, Blob<Dtype> * B) {
- Map<singa::op::Relu>(xpu, A, B);
-}
-
-/**
- * element-wise operation: b = a > 0 ? 1: 0;
- */
-template<typename Dtype>
-inline void ReluGrad(XPU xpu, const Blob<Dtype> & A, Blob<Dtype> * B) {
- Map<singa::op::ReluGrad>(xpu, A, B);
-}
-
-/**
- * element-wise operation: b = tanh(a);
- */
-template<typename Dtype>
-inline void Tanh(XPU xpu, const Blob<Dtype> & A, Blob<Dtype> * B) {
- Map<singa::op::Tanh>(xpu, A, B);
-}
-
-/**
- * B = 1- A^2
- */
-template<typename Dtype>
-inline void TanhGrad(XPU xpu, const Blob<Dtype> & A, Blob<Dtype> * B) {
- Map<singa::op::TanhGrad>(xpu, A, B);
-}
-/**
- * B = log(1+exp(A))
- */
-template<typename Dtype>
-inline void Softplus(XPU xpu, const Blob<Dtype> & A, Blob<Dtype> * B) {
- Map<singa::op::Softplus>(xpu, A, B);
-}
-
-/**
- * B = 1.0f / (1.0f + expf(-A));
- */
-template<typename Dtype>
-inline void SoftplusGrad(XPU xpu, const Blob<Dtype> & A, Blob<Dtype> * B) {
- Map<singa::op::SoftplusGrad>(xpu, A, B);
-}
-
-template<typename Dtype>
-inline void Square(XPU xpu, const Blob<Dtype> & A, Blob<Dtype> * B) {
- Map<singa::op::Square>(xpu, A, B);
-}
-
-template<typename Dtype>
-inline void SquareGrad(XPU xpu, const Blob<Dtype> & A, Blob<Dtype> * B) {
- Map<singa::op::Square_grad>(xpu, A, B);
-}
-
-template<typename Dtype>
-inline void Sqrt(XPU xpu, const Blob<Dtype> & A, Blob<Dtype> * B) {
- Map<singa::op::Sqrt>(xpu, A, B);
-}
-
-/**
- * B = A < alpha ? 1:0;
- */
-template<typename Dtype>
-inline void Threshold(XPU xpu, Dtype alpha, const Blob<Dtype> & A,
- Blob<Dtype> * B) {
- Map<singa::op::Threshold>(xpu, alpha, A, B);
-}
-} // end of namespace singa
-
-#endif // SINGA_BLOB_MATH_BLOB_H_
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/d3379cba/include/singa/blob/math_kernel.h
----------------------------------------------------------------------
diff --git a/include/singa/blob/math_kernel.h b/include/singa/blob/math_kernel.h
deleted file mode 100644
index f5d3e34..0000000
--- a/include/singa/blob/math_kernel.h
+++ /dev/null
@@ -1,59 +0,0 @@
-#ifndef MATH_KERNEL_H
-#define MATH_KERNEL_H
-
-namespace singa{
-
-extern "C" {
- void singa_gpu_sum_vec(float *data, float *sum , long n);
-
- void singa_gpu_sum_col(const float *src_mat_data, float *dst_vec_data, long rows, long cols, long stride);
-
- void singa_gpu_add_vec_row(const float *src_vec_data, const float *src_mat_data, float *des_mat_data, long rows, long cols, long stride);
-
- void singa_gpu_set_value(float *data, float value, long n);
-
- void singa_gpu_scale(const float *src_data, float *des_data, float alpha, long n);
-
- void singa_gpu_scale_grad(float *data, float alpha, long n);
-
- void singa_gpu_exp(const float *src_data, float *des_data, float alpha, long n);
-
- void singa_gpu_exp_grad(const float *src_data, float *des_data, float alpha, long n);
-
- void singa_gpu_sigmoid(const float *src_data, float *des_data, float alpha, long n);
-
- void singa_gpu_sigmoid_grad(const float *src_data, float *des_data, float alpha, long n);
-
- void singa_gpu_relu(const float *src_data, float *des_data, float alpha, long n);
-
- void singa_gpu_relu_grad(const float *src_data, float *des_data, float alpha, long n);
-
- void singa_gpu_tanh(const float *src_data, float *des_data, float alpha, long n);
-
- void singa_gpu_tanh_grad(const float *src_data, float *des_data, float alpha, long n);
-
- void singa_gpu_softplus(const float *src_data, float *des_data, float alpha, long n);
-
- void singa_gpu_softplus_grad(const float *src_data, float *des_data, float alpha, long n);
-
- void singa_gpu_square(const float *src_data, float *des_data, float alpha, long n);
-
- void singa_gpu_square_grad(const float *src_data, float *des_data, float alpha, long n);
-
- void singa_gpu_sqrt(const float *src_data, float *des_data, float alpha, long n);
-
- void singa_gpu_threshold(const float *src_data, float *des_data, float alpha, long n);
-
- void singa_gpu_add(const float *src_data_a, const float *src_data_b, float *des_data, float alpha, float beta, long n);
-
- void singa_gpu_sub(const float *src_data_a, const float *src_data_b, float *des_data, float alpha, float beta, long n);
-
- void singa_gpu_mult(const float *src_data_a, const float *src_data_b, float *des_data, float alpha, float beta, long n);
-
- void singa_gpu_div(const float *src_data_a, const float *src_data_b, float *des_data, float alpha, float beta, long n);
-
-};
-
-}
-
-#endif
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/d3379cba/include/singa/blob/singa_op.h
----------------------------------------------------------------------
diff --git a/include/singa/blob/singa_op.h b/include/singa/blob/singa_op.h
deleted file mode 100644
index 1131c5d..0000000
--- a/include/singa/blob/singa_op.h
+++ /dev/null
@@ -1,350 +0,0 @@
-/************************************************************
-*
-* Licensed to the Apache Software Foundation (ASF) under one
-* or more contributor license agreements. See the NOTICE file
-* distributed with this work for additional information
-* regarding copyright ownership. The ASF licenses this file
-* to you under the Apache License, Version 2.0 (the
-* "License"); you may not use this file except in compliance
-* with the License. You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an
-* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-* KIND, either express or implied. See the License for the
-* specific language governing permissions and limitations
-* under the License.
-*
-*************************************************************/
-
-#ifndef SINGA_BLOB_SINGA_OP_H_
-#define SINGA_BLOB_SINGA_OP_H_
-
-#ifdef SINGA_GPU
-#include <cuda_runtime.h>
-#endif // SINGA_GPU
-#include <cmath>
-#include <algorithm>
-#ifdef SINGA_GPU
-#include "cublas_v2.h"
-#include "singa/blob/math_kernel.h"
-#endif // SINGA_GPU
-
-namespace singa {
-
-namespace op {
-
-/**
- * b = e^a
- */
-template<Dtype>
-struct Exp {
- inline static void Map(const float & a, float * b) {
- *b = exp(a);
- }
-#ifdef SINGA_GPU
- inline static void CudaMap(float alpha, const float * a,
- float * b, int n) {
- singa::singa_gpu_exp(a, b, alpha, n);
- }
-#endif // SINGA_GPU
-};
-/**
- * b = log(a), base is e
- */
-template<Dtype>
-struct Log {
- inline static void Map(const float & a, float *b) {
- *b = log(a);
- }
-}
-
-template<Dtype>
-struct Sigmoid {
- inline static void Map(const float & a, float * b) {
- *b = 1.0f / (1.0f + expf(-a * alpha));
- }
-#ifdef SINGA_GPU
- inline static void CudaMap(const float * a,
- float * b, int n) {
- singa::singa_gpu_sigmoid(a, b, 1, n);
- }
-#endif // SINGA_GPU
-};
-template<Dtype>
-struct SigmoidGrad {
- inline static void Map(const float & a, float * b) {
- *b = a * (1.0f - a);
- }
-#ifdef SINGA_GPU
- inline static void CudaMap(float alpha, const float * a, float * b, int n) {
- singa::singa_gpu_sigmoid_grad(a, b, 1, n);
- }
-#endif // SINGA_GPU
-};
-
-template<Dtype>
-struct Relu {
- inline static void Map(const float & a, float * b) {
- *b = std::max(a, 0.0f);
- }
-#ifdef SINGA_GPU
- inline static void CudaMap(const float * a, float * b, int n) {
- singa::singa_gpu_relu(a, b, 1, n);
- }
-#endif // SINGA_GPU
-};
-
-template<Dtype>
-struct ReluGrad {
- inline static void Map(const float & a, float * b) {
- *b = a > 0 ? 1 : 0;
- }
-#ifdef SINGA_GPU
- inline static void CudaMap(const float * a, float * b, int n) {
- singa::singa_gpu_relu_grad(a, b, 1, n);
- }
-#endif // SINGA_GPU
-};
-
-template<Dtype>
-struct Tanh {
- inline static void Map(const float & a, float * b) {
- *b = tanhf(a);
- }
-#ifdef SINGA_GPU
- inline static void CudaMap(float alpha, const float * a, float * b, int n) {
- singa::singa_gpu_tanh(a, b, alpha, n);
- }
-#endif // SINGA_GPU
-};
-
-template<Dtype>
-struct TanhGrad {
- inline static void Map(const float & a, float * b) {
- *b = 1 - a * a;
- }
-#ifdef SINGA_GPU
- inline static void CudaMap(float alpha, const float * a, float * b, int n) {
- singa::singa_gpu_tanh_grad(a, b, alpha, n);
- }
-#endif // SINGA_GPU
-};
-
-template<Dtype>
-struct Softplus {
- inline static void Map(const float & a, float * b) {
- *b = logf(1 + expf(a));
- }
-#ifdef SINGA_GPU
- inline static void CudaMap(const float * a, float * b, int n) {
- singa::singa_gpu_softplus(a, b, 1, n);
- }
-#endif // SINGA_GPU
-};
-
-template<Dtype>
-struct SoftplusGrad {
- inline static void Map(const float & a, float * b) {
- *b = 1.0f / (1.0f + expf(-a));
- }
-#ifdef SINGA_GPU
- inline static void CudaMap(const float * a,
- float * b, int n) {
- singa::singa_gpu_softplus_grad(a, b, n);
- }
-#endif // SINGA_GPU
-};
-
-template<Dtype>
-struct Square {
- inline static void Map(const float & a, float * b) {
- *b = a * a;
- }
-#ifdef SINGA_GPU
- inline static void CudaMap(const float * a,
- float * b, int n) {
- singa::singa_gpu_square(a, b, n);
- }
-#endif // SINGA_GPU
-};
-
-template<Dtype>
-struct SquareGrad {
- inline static void Map(const float & a, float * b) {
- *b = 2 * sqrt(a);
- }
-#ifdef SINGA_GPU
- inline static void CudaMap(const float * a,
- float * b, int n) {
- singa::singa_gpu_square_grad(a, b, 1, n);
- }
-#endif // SINGA_GPU
-};
-
-template<Dtype>
-struct Sqrt {
- inline static void Map(const float & a, float * b) {
- *b = sqrt(a);
- }
-#ifdef SINGA_GPU
- inline static void CudaMap(const float * a,
- float * b, int n) {
- singa::singa_gpu_sqrt(a, b, n);
- }
-#endif // SINGA_GPU
-};
-
-/*********************************************************************/
-/**
- * c = pow(a, b), i.e., c = a^b
- */
-template<Dtype>
-struct Pow {
- inline static void Map(const float & a, const float &b, float * c) {
- *c = pow(a, b);
- }
-}
-template<Dtype>
-struct Mult {
- inline static void Map(const float & a, const float & b, float * c) {
- *c = a * b;
- }
-#ifdef SINGA_GPU
- inline static void CudaMap(const float* a, const float* b, float* c, int n) {
- singa::singa_gpu_mult(a, b, c, 1, 1, n);
- }
-#endif // SINGA_GPU
-};
-
-template<Dtype>
-struct Div {
- inline static void Map(const float & a, const float & b, float * c) {
- *c = a / b;
- }
-#ifdef SINGA_GPU
- inline static void CudaMap(const float * a,
- const float * b, float * c, int n) {
- singa::singa_gpu_div(a, b, c, 1, 1, n);
- }
-#endif // SINGA_GPU
-};
-
-
-/*********************************************************************/
-template<Dtype>
-struct Set {
- inline static void Map(float alpha, float * a) {
- *a = alpha;
- }
-#ifdef SINGA_GPU
- inline static void CudaMap(float alpha, float * a, int n) {
- singa::singa_gpu_set_value(a, alpha, n);
- }
-#endif // SINGA_GPU
-};
-
-template<Dtype>
-struct Threshold {
- inline static void Map(float alpha, const float & a, float * b) {
- *b = a < alpha ? 1.0f : 0.0f;
- }
-#ifdef SINGA_GPU
- inline static void CudaMap(float alpha, const float * a,
- float * b, int n) {
- singa::singa_gpu_threshold(a, b, alpha, n);
- }
-#endif // SINGA_GPU
-};
-
-/**********************************/
-struct Expand_Div {
- inline static void Map(const float & a, int n, float * b) {
- for (int i = 0 ; i < n ; i++) {
- b[i] /= a;
- }
- }
-#ifdef SINGA_GPU
- inline static void CudaMap(const float & a, int n, float * b) {
- singa::singa_gpu_scale(b, b, a, n);
- }
-#endif // SINGA_GPU
-};
-
-struct Repmat {
- inline static void Map(const float & a, int n, float * b) {
- for (int i = 0 ; i < n ; i++) {
- b[i] = a;
- }
- }
-#ifdef SINGA_GPU
- inline static void CudaMap(const float & a, int n, float * b) {
- singa::singa_gpu_set_value(b, a, n);
- }
-#endif // SINGA_GPU
-};
-
-
-struct Scale {
- inline static void Map(float alpha, const float & a, float * b) {
- *b = alpha * a;
- }
- #ifdef SINGA_GPU
- inline static void CudaMap(float alpha, const float * a,
- float * b, int n) {
- singa::singa_gpu_scale(a, b, alpha, n);
- }
- #endif // SINGA_GPU
-};
-
-struct Scale_grad {
- inline static void Map(float alpha, float * output) {
- *output = alpha;
- }
- #ifdef SINGA_GPU
- inline static void CudaMap(float alpha, float * output, int n) {
- singa::singa_gpu_scale_grad(output, alpha, n);
- }
- #endif // SINGA_GPU
-};
-
-struct ExpGrad {
- inline static void Map(float alpha, const float & a, float * b) {
- // log is the natrual log based on e
- *b = a * log(alpha);
- }
-#ifdef SINGA_GPU
- inline static void CudaMap(float alpha, const float * a,
- float * b, int n) {
- singa::singa_gpu_exp_grad(a, b, alpha, n);
- }
-#endif // SINGA_GPU
-};
-
-struct Sum {
- inline static void Map(const float * a, int n, float * b) {
- *b = 0;
- for (int i = 0 ; i < n ; i++) {
- *b += a[i];
- }
- }
-#ifdef SINGA_GPU
- inline static void CudaMap(const float * a, int n, float * b) {
- float *sum = NULL;
- cudaMalloc(<void**>(&sum), n*sizeof(float));
-
- singa::singa_gpu_sum_vec(a, sum, n);
-
- cudaMemcpyAsync(b, sum, sizeof(float), cudaMemcpyDeviceToDevice);
- cudaFree(sum);
- }
-#endif // SINGA_GPU
-};
-
-}; // namespace op
-
-}; // namespace singa
-
-#endif // SINGA_BLOB_SINGA_OP_H_
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/d3379cba/include/singa/utils/blob.h
----------------------------------------------------------------------
diff --git a/include/singa/utils/blob.h b/include/singa/utils/blob.h
index eecb674..f7c29af 100644
--- a/include/singa/utils/blob.h
+++ b/include/singa/utils/blob.h
@@ -121,10 +121,12 @@ class Blob {
public:
Blob() {}
explicit Blob(const std::vector<int>& shape) { Reshape(shape); }
- explicit Blob(int count) { Reshape(count); }
- explicit Blob(int a, int b) { Reshape(a, b); }
- explicit Blob(int a, int b, int c) { Reshape(a, b, c); }
- explicit Blob(int a, int b, int c, int d) { Reshape(a, b, c, d); }
+ explicit Blob(int dim0) { Reshape(dim0); }
+ explicit Blob(int dim0, int dim1) { Reshape(dim0, dim1); }
+ explicit Blob(int dim0, int dim1, int dim2) { Reshape(dim0, dim1, dim2); }
+ explicit Blob(int dim0, int dim1, int dim2, int dim3) {
+ Reshape(dim0, dim1, dim2, dim3);
+ }
/**
* Change the shape of the blob, re-allocat memory if Blob size() changes.
*
@@ -136,34 +138,43 @@ class Blob {
* Helper for Reshape(const std::vector<int>& shape) with shape.size() = 1.
*
* @see Reshape(const std::vector<int>&).
- * @param[in] count total num of elements.
+ * @param[in] dim0 total num of elements.
*/
- void Reshape(int count);
+ void Reshape(int dim0) {
+ Reshape(std::vector<int>{dim0});
+ }
/**
* Helper for Reshape(const std::vector<int>& shape) with shape.size() = 2.
*
- * @param a the highest dimension size, i.e., a = shape[0]. E.g., a could the
- * batchsize.
- * @param[in] b, b = shape[1], e.g., b could be the length of the feature vector.
+ * @param dim0 the highest dimension size, i.e., dim0 = shape[0]. E.g., dim0
+ * could the batchsize.
+ * @param[in] dim1, dim1 = shape[1], e.g., dim1 could be the length of the
+ * feature vector.
*/
- void Reshape(int a, int b);
+ void Reshape(int dim0, int dim1) {
+ Reshape(std::vector<int>{dim0, dim1});
+ }
/**
* Helper for Reshape(const std::vector<int>& shape) with shape.size() = 3.
*
- * @param[in] a, a = shape[0]
- * @param[in] b, b = shape[1]
- * @param[in] c, c = shape[2]
+ * @param[in] dim0, dim0 = shape[0]
+ * @param[in] dim1, dim1 = shape[1]
+ * @param[in] dim2, dim2 = shape[2]
*/
- void Reshape(int a, int b, int c);
+ void Reshape(int dim0, int dim1, int dim2) {
+ Reshape(std::vector<int>{dim0, dim1, dim2});
+ }
/**
* Helper for Reshape(const std::vector<int>& shape) with shape.size() = 4.
*
- * @param[in] a, a = shape[0]
- * @param[in] b, b = shape[1]
- * @param[in] c, c = shape[2]
- * @param[in] d, d = shape[3]
+ * @param[in] dim0, dim0 = shape[0]
+ * @param[in] dim1, dim1 = shape[1]
+ * @param[in] dim2, dim2 = shape[2]
+ * @param[in] dim3, dim3 = shape[3]
*/
- void Reshape(int a, int b, int c, int d);
+ void Reshape(int dim0, int dim1, int dim2, int dim3) {
+ Reshape(std::vector<int>{dim0, dim1, dim2, dim3});
+ }
/**
* Reshape as the shape of *other* Blob.
* @param[in] other
@@ -274,7 +285,7 @@ Blob<Dtype>* Reshape(const Blob<Dtype> & A, const std::vector<int>& shape) {
template <typename Dtype>
Blob<Dtype>* Reshape(const Blob<Dtype> & A, int count) {
std::vector<int> tmpshape;
- tmpshape.push_back(dim1);
+ tmpshape.push_back(count);
return Reshape(A, tmpshape);
}
/**
@@ -302,8 +313,8 @@ Blob<Dtype>* Reshape(const Blob<Dtype> & A, int dim0, int dim1, int dim2) {
* Helper of Reshape(const Blob<Dtype>, const std::vector<int>*).
*/
template <typename Dtype>
-Blob<Dtype>* Reshape(const Blob<Dtype> & A, int dim0, int dim1,
- int dim2, int dim3) {
+Blob<Dtype>* Reshape(const Blob<Dtype> & A, int dim0, int dim1, int dim2,
+ int dim3) {
std::vector<int> tmpshape;
tmpshape.push_back(dim0);
tmpshape.push_back(dim1);
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/d3379cba/include/singa/utils/math_addr.h
----------------------------------------------------------------------
diff --git a/include/singa/utils/math_addr.h b/include/singa/utils/math_addr.h
new file mode 100644
index 0000000..59f89ea
--- /dev/null
+++ b/include/singa/utils/math_addr.h
@@ -0,0 +1,206 @@
+/************************************************************
+*
+* Licensed to the Apache Software Foundation (ASF) under one
+* or more contributor license agreements. See the NOTICE file
+* distributed with this work for additional information
+* regarding copyright ownership. The ASF licenses this file
+* to you under the Apache License, Version 2.0 (the
+* "License"); you may not use this file except in compliance
+* with the License. You may obtain a copy of the License at
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing,
+* software distributed under the License is distributed on an
+* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+* KIND, either express or implied. See the License for the
+* specific language governing permissions and limitations
+* under the License.
+*
+*************************************************************/
+
+#ifndef SINGA_UTILS_MATH_ADDR_H_
+#define SINGA_UTILS_MATH_ADDR_H_
+extern "C" {
+ #include <cblas.h>
+}
+#ifdef USE_GPU
+#include <cuda_runtime.h>
+#endif
+#include "singa/utils/singa_op.h"
+#ifdef USE_GPU
+#include "cublas_v2.h"
+#endif
+
+
+namespace singa {
+
+template<typename Dtype>
+void cpu_gemm(const Dtype * A, const Dtype * B,
+ const int m, const int n, const int k, const Dtype alpha, const Dtype beta,
+ const bool TranA, const bool TranB, Dtype * C) {
+ int lda, ldb;
+ CBLAS_TRANSPOSE tA, tB;
+ lda = TranA ? m : k;
+ ldb = TranB ? k : n;
+ tA = TranA ? CblasTrans : CblasNoTrans;
+ tB = TranB ? CblasTrans : CblasNoTrans;
+ cblas_sgemm(CblasRowMajor, tA, tB, m, n, k, alpha, A, lda,
+ B, ldb, beta, C, n);
+}
+
+// should be very careful:
+// m is the length of B, and n is the length of C , A is a n*m matrix
+template<typename Dtype>
+void cpu_gemv(const Dtype * A, const Dtype * B, const int m, const int n,
+ const Dtype alpha, const Dtype beta, const bool TranA, Dtype * C) {
+ int lda, ldb;
+ CBLAS_TRANSPOSE tA, tB;
+ lda = TranA ? m : k;
+ ldb = TranB ? k : n;
+ tA = TranA ? CblasTrans : CblasNoTrans;
+ tB = TranB ? CblasTrans : CblasNoTrans;
+ cblas_sgemm(CblasRowMajor, tA, tB, m, n, k, alpha, A, lda,
+ B, ldb, beta, C, n);
+
+}
+
+template<typename Dtype>
+void cpu_axpy(const Dtype * A, const int n, const Dtype alpha, Dtype * B) {
+ cblas_saxpy(n, alpha, A, 1, B, 1);
+}
+
+template<typename Dtype>
+Dtype cpu_dot(const Dtype * A, const Dtype * B, const int n) {
+ Dtype sum = 0;
+ for (int i = 0 ; i < n ; i++)
+ sum += A[i] * B[i];
+ return sum;
+}
+
+// element-wise
+template<typename Op, typename Dtype>
+void cpu_e_f(const int n, const Dtype alpha, Dtype * A) {
+ for (int i = 0 ; i < n ; i++) {
+ Op::Map(alpha, &A[i]);
+ }
+}
+
+template<typename Op, typename Dtype>
+void cpu_e_f(const int n, const Dtype * A, const Dtype alpha, Dtype * B) {
+ for (int i = 0 ; i < n ; i++) {
+ Op::Map(alpha, A[i], &B[i]);
+ }
+}
+
+template<typename Op, typename Dtype>
+void cpu_e_f(const int n, const Dtype * A, const Dtype * B,
+ const Dtype alpha, const Dtype beta, Dtype * C) {
+ for (int i = 0 ; i < n ; i++) {
+ Op::Map(alpha, beta, A[i], B[i], &C[i]);
+ }
+}
+// element-wise generalized operation defined in Op
+
+
+// matrix/vector expand/reduce
+
+template<typename Op, typename Dtype>
+void cpu_reduce_f(const Dtype * A, const int m, const int n, Dtype * B) {
+ for (int i = 0 ; i < m ; i++) {
+ Op::Map(A+i*n, n, B[i]);
+ }
+}
+// reduce each row of A to an element of B e.g. the sum operation in softmax
+template<typename Op, typename Dtype>
+void cpu_expand_f(const Dtype * A, const int m, const int n, Dtype * B) {
+ for (int i = 0 ; i < m ; i++) {
+ Op::Map(A[i], n, B+i*n);
+ }
+}
+// expand each element in A into a row of B
+
+#ifdef USE_GPU
+template<typename Dtype>
+void gpu_gemm(const Dtype * A, const Dtype * B, const int m, const int n,
+ const int k, const Dtype alpha, const Dtype beta, const bool TranA,
+ const bool TranB, Dtype * C) {
+ int lda = TranA ? m : k;
+ int ldb = TranB ? k : n;
+ int ldc = n;
+ cublasOperation_t tA = (TranA == false) ? CUBLAS_OP_N : CUBLAS_OP_T;
+ cublasOperation_t tB = (TranB == false) ? CUBLAS_OP_N : CUBLAS_OP_T;
+ cublasHandle_t handle;
+ cublasCreate(&handle);
+ cublasSgemm(handle, tB, tA, n, m, k, &alpha, B, ldb,
+ A, lda, &beta, C, ldc);
+ cublasDestroy(handle);
+}
+
+template<typename Dtype>
+void gpu_gemv(const Dtype * A, const Dtype * B, const int m, const int n,
+ const Dtype alpha, const Dtype beta, const bool TranA, Dtype * C) {
+ int lda = n;
+ cublasOperation_t tA = (TranA == true) ? CUBLAS_OP_N : CUBLAS_OP_T;
+ cublasHandle_t handle;
+ cublasCreate(&handle);
+ cublasSgemv(handle, tA, n, m, &alpha , A, lda, B, 1, &beta, C, 1);
+ cublasDestroy(handle);
+}
+
+template<typename Dtype>
+void gpu_axpy(const Dtype * A, const int n, const Dtype alpha, Dtype * B) {
+ cublasHandle_t handle;
+ cublasCreate(&handle);
+ cublasSaxpy(handle, n, &alpha, A, 1, B, 1);
+ cublasDestroy(handle);
+}
+
+template<typename Dtype>
+Dtype gpu_dot(const Dtype * A, const Dtype * B, const int n) {
+ cublasHandle_t handle;
+ cublasCreate(&handle);
+ Dtype result = 0.0;
+ cublasSdot(handle, n, A, 1, B, 1, &result);
+ cublasDestroy(handle);
+ return result;
+}
+
+// element-wise
+template<typename Op, typename Dtype>
+void gpu_e_f(const int n, const Dtype alpha, Dtype * A) {
+ Op::CudaMap(alpha, A, n);
+}
+
+template<typename Op, typename Dtype>
+void gpu_e_f(const int n, const Dtype * A, const Dtype alpha, Dtype * B) {
+ Op::CudaMap(alpha, A, B, n);
+}
+
+template<typename Op, typename Dtype>
+void gpu_e_f(const int n, const Dtype * A, const Dtype * B,
+ const Dtype alpha, const Dtype beta, Dtype * C) {
+ Op::CudaMap(alpha, beta, A, B, C, n);
+}
+// element-wise generalized operation defined in Op
+
+// matrix/vector expand/reduce
+
+template<typename Op, typename Dtype>
+void gpu_reduce_f(const Dtype * A, const int m, const int n, Dtype * B) {
+ for (int i = 0 ; i < m ; i++) {
+ Op::CudaMap(A+i*n, n, B[i]);
+ }
+}
+// reduce each row of A to an element of B e.g. the sum operation in softmax
+template<typename Op, typename Dtype>
+void gpu_expand_f(const Dtype * A, const int m, const int n, Dtype * B) {
+ for (int i = 0 ; i < m ; i++) {
+ Op::CudaMap(A[i], n, B+i*n);
+ }
+}
+// expand each element in A into a row of B
+#endif // USE_GPU
+
+} // namespace singa
+#endif // SINGA_UTILS_MATH_ADDR_H_
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/d3379cba/include/singa/utils/math_blob.h
----------------------------------------------------------------------
diff --git a/include/singa/utils/math_blob.h b/include/singa/utils/math_blob.h
new file mode 100644
index 0000000..2d53d32
--- /dev/null
+++ b/include/singa/utils/math_blob.h
@@ -0,0 +1,487 @@
+/************************************************************
+*
+* Licensed to the Apache Software Foundation (ASF) under one
+* or more contributor license agreements. See the NOTICE file
+* distributed with this work for additional information
+* regarding copyright ownership. The ASF licenses this file
+* to you under the Apache License, Version 2.0 (the
+* "License"); you may not use this file except in compliance
+* with the License. You may obtain a copy of the License at
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing,
+* software distributed under the License is distributed on an
+* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+* KIND, either express or implied. See the License for the
+* specific language governing permissions and limitations
+* under the License.
+*
+*************************************************************/
+
+#ifndef SINGA_UTILS_MATH_BLOB_H_
+#define SINGA_UTILS_MATH_BLOB_H_
+
+#include <vector>
+#include "singa/utils/blob.h"
+#include "singa/utils/singa_op.h"
+#include "singa/utils/math_addr.h"
+
+
+namespace singa {
+enum XPU {cpu, gpu, any};
+
+/************* BLAS level 1 *****************/
+/**
+ * Scale each element of A with alpha, and put the result into B.
+ * Bi = alpha*Ai
+ * Use blas scale internally.
+ */
+template<typename Dtype>
+void Scale(xpu xpu, Dtype alpha, const Blob<Dtype> & A, Blob<Dtype> * B) {
+ CHECK_EQ(A.count(), B->count());
+ if (xpu == cpu)
+ cpu_scale(A.count(), alpha, A.cpu_data(), B->mutable_cpu_data());
+#ifdef USE_GPU
+#endif
+}
+
+/**
+ * Element-wise operation: Bi = alpha*Ai+Bi. A and B should have the same size
+ */
+template<typename Dtype>
+void AXPY(XPU xpu, Dtype alpha, const Blob<Dtype> & A, Blob<Dtype> * B) {
+ CHECK_EQ(A.count(), B.count());
+ if (xpu == cpu) {
+ cpu_axpy(A.cpu_data(), A.count(),
+ alpha, B->mutable_cpu_data());
+ }
+#ifdef USE_GPU
+ if (xpu == gpu) {
+ gpu_axpy(A.gpu_data(), A.count(),
+ alpha, B->mutable_gpu_data());
+ }
+#endif // USE_GPU
+}
+
+/************* BLAS level 2 *****************/
+/**
+ * Matrix vector multiplication, C = alpha A(.T) * B + beta C.
+ * Strict shape checking:
+ * - dim of A ==2
+ * columsn of A(.T) == B.count()
+ * - rows of A(.T) == C.count()
+ *
+ * @param[in] alpha
+ * @param[in] beta
+ * @param[in] A, matrix
+ * @param[in] B, vector
+ * @param[in, out] C, vector
+ */
+template<typename Dtype>
+void GEMV(XPU, xpu, Dtype alpha, Dtype beta, const Blob<Dtype>& A,
+ const Blob<Dtype>& B, Blob<Dtype>* C) {
+ CHECK_EQ(A.shape().size(), 2) << "A must be a matrix";
+ int a1, a2, m, n;
+ a1 = A.transpose() ? A.shape(1) : A.shape(0);
+ a2 = A.transpose() ? A.shape(0) : A.shape(1);
+ m = B.count();
+ n = C->count();
+ CHECK_EQ(a2, m) << "# columns of A(.T) must = length of B";
+ CHECK_EQ(a1, n) << "# rows of A(.T) must = length of C";
+
+ bool TranA = A.transpose();
+ if (xpu == cpu) {
+ cpu_gemv(A.cpu_data(), B.cpu_data(), m, n, alpha, beta, TranA,
+ C->mutable_cpu_data());
+ }
+#ifdef USE_GPU
+ if (xpu == gpu) {
+ // gpu part
+ gpu_gemv(A.gpu_data(), B.gpu_data(), m, n, alpha, beta, TranA,
+ C->mutable_gpu_data());
+ }
+#endif // USE_GPU
+}
+/**
+ * Matrix vector multiplication, C = A(.T) * B, transpose is considered.
+ * Loose shape checking:
+ * - dim of A >=2
+ * - A.count() % B.count() == 0
+ * - B.count() == C.count()
+ *
+ * @param[in] A input matrix
+ * @param[in] B input vector
+ * @param[out] C output vector
+ */
+template <typename Dtype>
+void MVDot(XPU xpu, const Blob<Dtype>& A, const Blob<Dtype>& B, Blob<Dtype>* C)
+{
+ GEMV(xpu, Dtype(1), Dtype(0), A, B, C);
+}
+
+/************* BLAS level 3 *****************/
+/**
+ * Matrix multiplication, C = alpha A*B + beta C, A, B and C are matrix.
+ *
+ * Tranpose is considered for A and B.
+ * Strict shape checking:
+ * - all are matrix
+ * - shapes match for matrix multiplication
+ *
+ * @param[in] alpha
+ * @param[in] beta
+ * @param[in] A, matrix
+ * @param[in] B, matrix
+ * @param[in, out] C, matrix
+ */
+template <typename Dtype>
+void GEMM(XPU xpu, Dtype alpha, Dtype beta, const Blob<Dtype>& A,
+ const Blob<Dtype> & B, Blob<Dtype> * C) {
+ CHECK_EQ(A.shape().size(), 2);
+ CHECK_EQ(B.shape().size(), 2);
+ CHECK_EQ(C.shape().size(), 2);
+ int a1, a2, b1, b2, m, n;
+ CHECK(!C->transpose());
+ a1 = A.transpose() ? A.shape(1) : A.shape(0);
+ a2 = A.transpose() ? A.shape(0) : A.shape(1);
+ b1 = B.transpose() ? B.shape(1) : B.shape(0);
+ b2 = B.transpose() ? B.shape(0) : B.shape(1);
+ m = C->shape(0);
+ n = C->shape(1);
+ CHECK__EQ(a2, b1);
+ CHECK__EQ(a1, m);
+ CHECK__EQ(b2, n);
+
+ int k = A.transpose() ? A.shape(0) : A.shape(1);
+ bool TranA = A.transpose();
+ bool TranB = B.transpose();
+ if (xpu == cpu) {
+ cpu_gemm(A.cpu_data(), B.cpu_data(), m, n, k, alpha, beta,
+ TranA, TranB, C->mutable_cpu_data());
+ }
+#ifdef USE_GPU
+ if (xpu == gpu) {
+ // gpu part
+ gpu_gemm(A.gpu_data(), B.gpu_data(), m, n, k, alpha, beta,
+ TranA, TranB, C->mutable_gpu_data());
+ }
+#endif // USE_GPU
+}
+/**
+ * Matrix multiplication, C = A(.T) * B(.T), transpose is considered.
+ * Strict shape checking:
+ * - all are matrix
+ * - shapes match for matrix multiplication
+ *
+ * @param[in] A input matrix
+ * @param[in] B input matrix
+ * @param[out] C output matrix
+ */
+template <typename Dtype>
+void MMDot(XPU xpu, const Blob<Dtype>& A, const Blob<Dtype>& B, Blob<Dtype>* C)
+{
+ GEMM(xpu, Dtype(1), Dtype(0), A, B, C);
+}
+
+
+/*********************** Inner and Outer product****************************/
+/**
+ * Inner product for two vectors.
+ * Loose shape checking, A.count() == B.count.
+ *
+ * @param[in] A, input vector (shape checking using A.count()).
+ * @param[in] B, input vector (shape checking using B.count()).
+ * @return inner product value.
+ */
+template <typename Dtype>
+Dtype VVDot(XPU xpu, const Blob<Dtype> & A, const Blob<Dtype> & B) {
+ Dtype res = 0;
+ CHECK_EQ(A.count(), B.count());
+ int n = A.count();
+ if (xpu == cpu) {
+ res = cpu_dot(A.cpu_data(), B.cpu_data(), n);
+ }
+#ifdef USE_GPU
+ if (xpu == gpu) {
+ // gpu part
+ res = gpu_dot(A.gpu_data(), B.gpu_data(), n);
+ }
+#endif // USE_GPU
+ return res;
+}
+
+/**
+ * Outer product, C = A ** B, transpose is disabled.
+ * Loose shape checking, A.count() * B.count() == C.count()
+ *
+ * @param[in] A, input vector
+ * @param[in] B, input vector
+ * @param[out] C, output matrix
+ */
+template <typename Dtype>
+void OuterProduct(XPU xpu, const Blob<Dtype>& A, const Blob<Dtype>& B,
+ Blob<Dtype> * C) {
+ CHECK(!C.transpose()); // do not support C.T now.
+
+ int m = A.count();
+ int n = B.count();
+ CHECK_EQ(C->count(), m * n);
+
+ if (xpu == cpu) {
+ cpu_gemm(A.cpu_data(), B.cpu_data(), m, n, 1, 1, 0,
+ false, false, C->mutable_cpu_data());
+ }
+#ifdef USE_GPU
+ if (xpu == gpu) {
+ // gpu part
+ gpu_gemm(A.gpu_data(), B.gpu_data(), m, n, 1, 1, 0,
+ false, false, C->mutable_gpu_data());
+ }
+#endif // USE_GPU
+}
+/*********************** Element-wise functions ***********************/
+/**
+ * Apply the function from Op for each element in A and put the result into B,
+ * i.e., Bi = Op(Ai).
+ * Loose shape checking, A.count() == B.count().
+ */
+template<typename Op, typename Dtype>
+void Map(XPU xpu, const Blob<Dtype> & A, Blob<Dtype> * B) {
+ CHECK_EQ(A.count(), B->count()) << "Blobs must have the same size";
+ if (xpu == cpu) {
+ cpu_e_f<Op>(A.count(), A.cpu_data(), B->mutable_cpu_data());
+ }
+#ifdef SINGA_GPU
+ if (xpu == gpu) {
+ // gpu part
+ gpu_e_f<Op>(A.count(), A.gpu_data(), B->mutable_gpu_data());
+ }
+#endif // SINGA_GPU
+}
+
+/**
+ * Apply the function from Op for each element in A and B, and put the result
+ * into C, i.e., Ci = Op(Ai, Bi).
+ * Loose shape checking, A, B and C are of the same size.
+ */
+template<typename Op, typename Dtype>
+void Map(XPU xpu, const Blob<Dtype> & A, const Blob<Dtype> & B,
+ Blob<Dtype> * C) {
+ CHECK_EQ(A.count(), B.count()) << "Blobs must have the same size";
+ CHECK_EQ(A.count(), C->count()) << "Blobs must have the same size";
+ if (xpu == cpu) {
+ cpu_e_f<Op>(A.count(), A.cpu_data(), B.cpu_data(), C->mutable_cpu_data());
+ }
+#ifdef SINGA_GPU
+ if (xpu == gpu) {
+ // gpu part
+ gpu_e_f<Op>(A.count(), A.gpu_data(), B.gpu_data(), C->mutable_gpu_data());
+ }
+#endif // SINGA_GPU
+}
+
+/**
+ * Bi = Op(alpha, Ai)
+ * Loose shape checking, A.count() == B.count().
+ */
+template<typename Op, typename Dtype>
+void Map(XPU xpu, Dtype alpha, const Blob<Dtype>& A, const Blob<Dtype>* B) {
+ CHECK_EQ(A.count(), B->count()) << "Blobs must have the same size";
+ if (xpu == cpu) {
+ cpu_e_f<Op>(A.count(), alpha, A.cpu_data(), B->mutable_cpu_data());
+ }
+#ifdef SINGA_GPU
+#endif // SINGA_GPU
+}
+/**
+ * Ci = Op(alpha, Ai, Bi)
+ * Loose shape checking, A, B and C are of the same size.
+ */
+template<typename Op, typename Dtype>
+void Map(XPU xpu, Dtype alpha, const Blob<Dtype>& A, const Blob<Dtype>& B,
+ Blob<Dtype>* C) {
+ CHECK_EQ(A.count(), B->count()) << "Blobs must have the same size";
+ if (xpu == cpu) {
+ cpu_e_f<Op>(A.count(), alpha, A.cpu_data(), B->cpu_data(),
+ C->mutable_cpu_data());
+ }
+#ifdef SINGA_GPU
+#endif // SINGA_GPU
+}
+
+/**
+ * Currently use std::copy which has shown better performance than memcpy.
+ * http://stackoverflow.com/questions/4707012/c-memcpy-vs-stdcopy
+ * TODO(wangwei) test blas copy vs std::copy.
+ *
+ * Loose shape checking, A.count() == B.count().
+ */
+template<typename Dtype>
+void Copy(XPU xpu, const Blob<Dtype>& A, const Blob<Dtype>* B) {
+ CHECK_EQ(A.count(), B->count()) << "Blobs must have the same size";
+ if (xpu == cpu)
+ std::copy(A.cpu_data(), A.cpu_data() + A.count(), B->mutable_cpu_data());
+ else {
+ LOG(FATAL) << "Not implemented";
+ }
+}
+
+/**
+ * C = A + B
+ * Implemented using Copy and AXPY.
+ */
+template<typename Dtype>
+void Add(XPU xpu, const Blob<Dtype> & A, const Blob<Dtype> & B,
+ Blob<Dtype> * C) {
+ Copy(A, C);
+ AXPY(B, C, 1);
+}
+
+/**
+ * C = A - B
+ * Implemented using Copy and AXPY.
+ */
+template<typename Dtype>
+void Sub(XPU xpu, const Blob<Dtype> & A, const Blob<Dtype> & B,
+ Blob<Dtype> * C) {
+ Copy(xpu, A, C);
+ AXPY(xpu, B, C, -1);
+}
+
+/**
+ * C = A * B, implemented using
+ * Map(XPU, const Blob<Dtype>&, const Blob<Dtype>&, Blob<Dtype>*).
+ */
+template<typename Dtype>
+void Mult(XPU xpu, const Blob<Dtype> & A, const Blob<Dtype> & B,
+ Blob<Dtype> * C) {
+ Map<singa::op::Mult>(xpu, A, B, C);
+ // TODO(wangwei) use MKL's vector func
+}
+
+/**
+ * C = A / B, implemented using
+ * Map(XPU, const Blob<Dtype>&, const Blob<Dtype>&, Blob<Dtype>*).
+ */
+template<typename Dtype>
+void Div(XPU xpu, const Blob<Dtype> & A, const Blob<Dtype> & B,
+ Blob<Dtype> * C) {
+ Map<singa::op::Div>(xpu, A, B, C);
+ // TODO(wangwei) use MKL's vector func
+}
+/*************************1D<-->2D op/transform***************************/
+/**
+ * Add each row of B with A, i.e., Bij = alpha*Ai + beta*Bij
+ * Loose shape checking, B.count() % A.count() == 0.
+ * # rows of B = B.count() / A.count().
+ * Transpose is disabled.
+ */
+template<typename Dtype>
+void MVAdd(XPU xpu, Dtype alpha, Dtype beta, const Blob<Dtype> & A,
+ Blob<Dtype> * B) {
+ CHECK_EQ(B.count() % A.count(), 0) << "#col of B not match length of A";
+ int m = A.count(), n = B->count() / m;
+ if (xpu == cpu) {
+ Blob<Dtype> one(n);
+ one.SetValue(1);
+ cpu_gemm(A.cpu_data(), one.cpu_data(), m, n, 1, alpha, beta,
+ false, false, B->mutable_cpu_data());
+ }
+#ifdef USE_GPU
+ if (xpu == gpu) {
+ singa_gpu_add_vec_row(B->gpu_data(),
+ A.gpu_data(), A.gpu_data(), m, n, n);
+ // gpu part
+ }
+#endif // USE_GPU
+}
+/**
+ * Add each row of B with A, i.e., Bij = Ai + Bij
+ * Loose shape checking, B.count() % A.count() == 0.
+ * # rows of B = B.count() / A.count().
+ * Transpose is disabled.
+ */
+template<typename Dtype>
+void MVAdd(XPU xpu, const Blob<Dtype> & A, Blob<Dtype>* B) {
+ MVAdd(xpu, Dtype(1), Dtype(1), A, B);
+}
+
+/**
+ * Copy A to each row of B
+ * Loose shape checking, B.count() % A.count() == 0,
+ * # rows of B = B.count() / A.count().
+ * Transpose is disabled.
+ */
+template<typename Dtype>
+void Repmat(XPU xpu, const Blob<Dtype> & A, Blob<Dtype> * B) {
+ MVAdd(xpu, Dtype(1), Dtype(0), A, B);
+}
+
+/**
+ * Add each col of matrix A to vector B, i.e., Bi = \sum_j {alpha*Aij}+beta*Bi
+ * Loose shape checking, A.count() % B.count() == 0.
+ * # rows of A = A.count() / B.count().
+ * Transpose is disabled.
+ */
+template<typename Dtype>
+void MVSum(XPU xpu, Dtype alpha, Dtype beta, const Blob<Dtype> & A,
+ Blob<Dtype> * B) {
+ CHECK_EQ(A.count() % B->count(), 0) << "length of B must = # of cols of A";
+
+ int m = B->count(), n = A.count() / m;
+ if (xpu == cpu) {
+ Blob<Dtype> one(n);
+ one.SetValue(1);
+ cpu_gemm(A.cpu_data(), one.cpu_data(), m, 1, n, alpha, beta,
+ false, false, B->mutable_cpu_data());
+ }
+#ifdef USE_GPU
+ if (xpu == gpu) {
+ singa_gpu_sum_col(A.gpu_data(), B->gpu_data(), m, n, n);
+ // gpu part
+ }
+#endif // USE_GPU
+}
+/**
+ * Reduce each row of A to an element of B.
+ * Loose shape checking, A.count() % B.count() == 0.
+ * # rows of A = A.count() / B.count().
+ */
+template<typename Op, typename Dtype>
+void Reduce2D(XPU xpu, const Blob<Dtype> & A, Blob<Dtype> * B) {
+ CHECK_EQ(A.count() % B.count(), 0) << "Row size not match B length";
+ int m = B->count(), n = A.count() / m;
+ if (xpu == cpu) {
+ cpu_reduce_f<Op>(A.cpu_data(), m, n, B->mutable_cpu_data());
+ }
+#ifdef SINGA_GPU
+ if (xpu == gpu) {
+ // gpu part
+ gpu_reduce_f<Op>(A.gpu_data(), m, n, B->mutable_gpu_data());
+ }
+#endif // SINGA_GPU
+}
+/**
+ * Duplicate each element of A into a row of B.
+ * Loose shape checking, B.count() % A.count() == 0.
+ * # rows of B = B.count() / A.count().
+ */
+template<typename Op, typename Dtype>
+void Expand2D(XPU xpu, const Blob<Dtype> & A, Blob<Dtype> * B) {
+ CHECK_EQ(B.count() % A.count(), 0) << "Row size of B not match length of A";
+ int m = A.count(), n = B->count() / m;
+ if (xpu == cpu) {
+ cpu_expand_f<Op>(A.cpu_data(), m, n, B->mutable_cpu_data());
+ }
+#ifdef SINGA_GPU
+ if (xpu == gpu) {
+ // gpu part
+ gpu_expand_f<Op>(A.gpu_data(), m, n, B->mutable_gpu_data());
+ }
+#endif // SINGA_GPU
+}
+
+} // end of namespace singa
+
+#endif // SINGA_BLOB_MATH_BLOB_H_
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/d3379cba/include/singa/utils/math_kernel.h
----------------------------------------------------------------------
diff --git a/include/singa/utils/math_kernel.h b/include/singa/utils/math_kernel.h
new file mode 100644
index 0000000..f5d3e34
--- /dev/null
+++ b/include/singa/utils/math_kernel.h
@@ -0,0 +1,59 @@
+#ifndef MATH_KERNEL_H
+#define MATH_KERNEL_H
+
+namespace singa{
+
+extern "C" {
+ void singa_gpu_sum_vec(float *data, float *sum , long n);
+
+ void singa_gpu_sum_col(const float *src_mat_data, float *dst_vec_data, long rows, long cols, long stride);
+
+ void singa_gpu_add_vec_row(const float *src_vec_data, const float *src_mat_data, float *des_mat_data, long rows, long cols, long stride);
+
+ void singa_gpu_set_value(float *data, float value, long n);
+
+ void singa_gpu_scale(const float *src_data, float *des_data, float alpha, long n);
+
+ void singa_gpu_scale_grad(float *data, float alpha, long n);
+
+ void singa_gpu_exp(const float *src_data, float *des_data, float alpha, long n);
+
+ void singa_gpu_exp_grad(const float *src_data, float *des_data, float alpha, long n);
+
+ void singa_gpu_sigmoid(const float *src_data, float *des_data, float alpha, long n);
+
+ void singa_gpu_sigmoid_grad(const float *src_data, float *des_data, float alpha, long n);
+
+ void singa_gpu_relu(const float *src_data, float *des_data, float alpha, long n);
+
+ void singa_gpu_relu_grad(const float *src_data, float *des_data, float alpha, long n);
+
+ void singa_gpu_tanh(const float *src_data, float *des_data, float alpha, long n);
+
+ void singa_gpu_tanh_grad(const float *src_data, float *des_data, float alpha, long n);
+
+ void singa_gpu_softplus(const float *src_data, float *des_data, float alpha, long n);
+
+ void singa_gpu_softplus_grad(const float *src_data, float *des_data, float alpha, long n);
+
+ void singa_gpu_square(const float *src_data, float *des_data, float alpha, long n);
+
+ void singa_gpu_square_grad(const float *src_data, float *des_data, float alpha, long n);
+
+ void singa_gpu_sqrt(const float *src_data, float *des_data, float alpha, long n);
+
+ void singa_gpu_threshold(const float *src_data, float *des_data, float alpha, long n);
+
+ void singa_gpu_add(const float *src_data_a, const float *src_data_b, float *des_data, float alpha, float beta, long n);
+
+ void singa_gpu_sub(const float *src_data_a, const float *src_data_b, float *des_data, float alpha, float beta, long n);
+
+ void singa_gpu_mult(const float *src_data_a, const float *src_data_b, float *des_data, float alpha, float beta, long n);
+
+ void singa_gpu_div(const float *src_data_a, const float *src_data_b, float *des_data, float alpha, float beta, long n);
+
+};
+
+}
+
+#endif
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/d3379cba/include/singa/utils/singa_op.h
----------------------------------------------------------------------
diff --git a/include/singa/utils/singa_op.h b/include/singa/utils/singa_op.h
new file mode 100644
index 0000000..ff5aba4
--- /dev/null
+++ b/include/singa/utils/singa_op.h
@@ -0,0 +1,265 @@
+/************************************************************
+*
+* Licensed to the Apache Software Foundation (ASF) under one
+* or more contributor license agreements. See the NOTICE file
+* distributed with this work for additional information
+* regarding copyright ownership. The ASF licenses this file
+* to you under the Apache License, Version 2.0 (the
+* "License"); you may not use this file except in compliance
+* with the License. You may obtain a copy of the License at
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing,
+* software distributed under the License is distributed on an
+* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+* KIND, either express or implied. See the License for the
+* specific language governing permissions and limitations
+* under the License.
+*
+*************************************************************/
+
+#ifndef SINGA_UTILS_SINGA_OP_H_
+#define SINGA_UTILS_SINGA_OP_H_
+
+#include <cmath>
+#include <algorithm>
+
+#ifdef USE_GPU
+#include <cuda_runtime.h>
+#include "cublas_v2.h"
+#include "singa/utils/math_kernel.h"
+#endif // USE_GPU
+
+namespace singa {
+
+namespace op {
+
+/**
+ * b = e^a
+ */
+template<Dtype>
+struct Exp {
+ inline static void Map(const float & a, float * b) {
+ *b = exp(a);
+ }
+#ifdef USE_GPU
+ inline static void CudaMap(float alpha, const float * a,
+ float * b, int n) {
+ singa::singa_gpu_exp(a, b, alpha, n);
+ }
+#endif // USE_GPU
+};
+/**
+ * b = log(a), base is e
+ */
+template<Dtype>
+struct Log {
+ inline static void Map(const float & a, float *b) {
+ *b = log(a);
+ }
+}
+
+template<Dtype>
+struct Sigmoid {
+ inline static void Map(const float & a, float * b) {
+ *b = 1.0f / (1.0f + expf(-a * alpha));
+ }
+#ifdef USE_GPU
+ inline static void CudaMap(const float * a,
+ float * b, int n) {
+ singa::singa_gpu_sigmoid(a, b, 1, n);
+ }
+#endif // USE_GPU
+};
+template<Dtype>
+struct SigmoidGrad {
+ inline static void Map(const float & a, float * b) {
+ *b = a * (1.0f - a);
+ }
+#ifdef USE_GPU
+ inline static void CudaMap(float alpha, const float * a, float * b, int n) {
+ singa::singa_gpu_sigmoid_grad(a, b, 1, n);
+ }
+#endif // USE_GPU
+};
+
+template<Dtype>
+struct Relu {
+ inline static void Map(const float & a, float * b) {
+ *b = std::max(a, 0.0f);
+ }
+#ifdef USE_GPU
+ inline static void CudaMap(const float * a, float * b, int n) {
+ singa::singa_gpu_relu(a, b, 1, n);
+ }
+#endif // USE_GPU
+};
+
+template<Dtype>
+struct ReluGrad {
+ inline static void Map(const float & a, float * b) {
+ *b = a > 0 ? 1 : 0;
+ }
+#ifdef USE_GPU
+ inline static void CudaMap(const float * a, float * b, int n) {
+ singa::singa_gpu_relu_grad(a, b, 1, n);
+ }
+#endif // USE_GPU
+};
+
+template<Dtype>
+struct Tanh {
+ inline static void Map(const float & a, float * b) {
+ *b = tanhf(a);
+ }
+#ifdef USE_GPU
+ inline static void CudaMap(float alpha, const float * a, float * b, int n) {
+ singa::singa_gpu_tanh(a, b, alpha, n);
+ }
+#endif // USE_GPU
+};
+
+template<Dtype>
+struct TanhGrad {
+ inline static void Map(const float & a, float * b) {
+ *b = 1 - a * a;
+ }
+#ifdef USE_GPU
+ inline static void CudaMap(float alpha, const float * a, float * b, int n) {
+ singa::singa_gpu_tanh_grad(a, b, alpha, n);
+ }
+#endif // USE_GPU
+};
+
+template<Dtype>
+struct Softplus {
+ inline static void Map(const float & a, float * b) {
+ *b = logf(1 + expf(a));
+ }
+#ifdef USE_GPU
+ inline static void CudaMap(const float * a, float * b, int n) {
+ singa::singa_gpu_softplus(a, b, 1, n);
+ }
+#endif // USE_GPU
+};
+
+template<Dtype>
+struct SoftplusGrad {
+ inline static void Map(const float & a, float * b) {
+ *b = 1.0f / (1.0f + expf(-a));
+ }
+#ifdef USE_GPU
+ inline static void CudaMap(const float * a,
+ float * b, int n) {
+ singa::singa_gpu_softplus_grad(a, b, n);
+ }
+#endif // USE_GPU
+};
+
+template<Dtype>
+struct Square {
+ inline static void Map(const float & a, float * b) {
+ *b = a * a;
+ }
+#ifdef USE_GPU
+ inline static void CudaMap(const float * a,
+ float * b, int n) {
+ singa::singa_gpu_square(a, b, n);
+ }
+#endif // USE_GPU
+};
+
+template<Dtype>
+struct SquareGrad {
+ inline static void Map(const float & a, float * b) {
+ *b = 2 * sqrt(a);
+ }
+#ifdef USE_GPU
+ inline static void CudaMap(const float * a,
+ float * b, int n) {
+ singa::singa_gpu_square_grad(a, b, 1, n);
+ }
+#endif // USE_GPU
+};
+
+template<Dtype>
+struct Sqrt {
+ inline static void Map(const float & a, float * b) {
+ *b = sqrt(a);
+ }
+#ifdef USE_GPU
+ inline static void CudaMap(const float * a,
+ float * b, int n) {
+ singa::singa_gpu_sqrt(a, b, n);
+ }
+#endif // USE_GPU
+};
+
+/*********************************************************************/
+/**
+ * c = pow(a, b), i.e., c = a^b
+ */
+template<Dtype>
+struct Pow {
+ inline static void Map(const float & a, const float &b, float * c) {
+ *c = pow(a, b);
+ }
+}
+template<Dtype>
+struct Mult {
+ inline static void Map(const float & a, const float & b, float * c) {
+ *c = a * b;
+ }
+#ifdef USE_GPU
+ inline static void CudaMap(const float* a, const float* b, float* c, int n) {
+ singa::singa_gpu_mult(a, b, c, 1, 1, n);
+ }
+#endif // USE_GPU
+};
+
+template<Dtype>
+struct Div {
+ inline static void Map(const float & a, const float & b, float * c) {
+ *c = a / b;
+ }
+#ifdef USE_GPU
+ inline static void CudaMap(const float * a,
+ const float * b, float * c, int n) {
+ singa::singa_gpu_div(a, b, c, 1, 1, n);
+ }
+#endif // USE_GPU
+};
+
+
+/*********************************************************************/
+template<Dtype>
+struct Set {
+ inline static void Map(float alpha, float * a) {
+ *a = alpha;
+ }
+#ifdef USE_GPU
+ inline static void CudaMap(float alpha, float * a, int n) {
+ singa::singa_gpu_set_value(a, alpha, n);
+ }
+#endif // USE_GPU
+};
+
+template<Dtype>
+struct Threshold {
+ inline static void Map(float alpha, const float & a, float * b) {
+ *b = a < alpha ? 1.0f : 0.0f;
+ }
+#ifdef USE_GPU
+ inline static void CudaMap(float alpha, const float * a,
+ float * b, int n) {
+ singa::singa_gpu_threshold(a, b, alpha, n);
+ }
+#endif // USE_GPU
+};
+
+}; // namespace op
+
+}; // namespace singa
+
+#endif // SINGA_UTILS_SINGA_OP_H_
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/d3379cba/src/blob/math_kernel.cu
----------------------------------------------------------------------
diff --git a/src/blob/math_kernel.cu b/src/blob/math_kernel.cu
deleted file mode 100644
index 4c828d5..0000000
--- a/src/blob/math_kernel.cu
+++ /dev/null
@@ -1,439 +0,0 @@
-#include <cmath>
-#include "singa/blob/math_kernel.h"
-
-#define CU2DBLOCK_X 32
-#define CU2DBLOCK_Y 32
-
-#define CU1DBLOCK 1024
-#define CU1DBLOCKF 1024.0
-
-
-//Cuda Kernel Functions
-
-__global__
-void kernel_sum_vec(float *data, float *sum , long n)
-{
- int THREADS = blockDim.x;
-
- __shared__ float aux[CU1DBLOCK];
- int steps = (n - 1) / THREADS + 1;
- aux[threadIdx.x] = data[threadIdx.x];
-
- for(int i=1; i<steps; ++i) {
- if(threadIdx.x+i*THREADS < n) {
- aux[threadIdx.x] += data[threadIdx.x+i*THREADS];
- }
- }
-
- int total_threads = THREADS;
- __syncthreads();
-
- while(total_threads > 1) {
- int half_point = ((1+total_threads) >> 1);
- if (threadIdx.x < half_point) {
- if(threadIdx.x+half_point < total_threads) {
- aux[threadIdx.x] += aux[threadIdx.x + half_point];
- }
- }
- __syncthreads();
- total_threads = ((total_threads+1) >> 1);
- }
-
- __syncthreads();
- *sum = aux[0];
-}
-
-__global__
-void kernel_sum_col(const float *src_mat_data, float *dst_vec_data, long rows, long cols, long stride)
-{
- int j = blockIdx.x;
- int THREADS = blockDim.x;
- if(j >= cols) {
- return;
- }
-
- __shared__ float aux[CU1DBLOCK];
- int steps = (rows - 1) / THREADS + 1;
- aux[threadIdx.x] = src_mat_data[j+threadIdx.x*stride];
- for(int i=1; i<steps; ++i) {
- if(threadIdx.x+i*THREADS < rows) {
- aux[threadIdx.x] += src_mat_data[j+(threadIdx.x+i*THREADS)*stride];
- }
- }
-
- int total_threads = THREADS;
- __syncthreads();
- while(total_threads > 1) {
- int half_point = ((1+total_threads) >> 1);
- if (threadIdx.x < half_point) {
- if(threadIdx.x+half_point < total_threads) {
- aux[threadIdx.x] += aux[threadIdx.x + half_point];
- }
- }
- __syncthreads();
- total_threads = ((total_threads+1) >> 1);
- }
-
- __syncthreads();
- dst_vec_data[j] = aux[0];
-}
-
-__global__
-void kernel_add_vec_row(const float *src_vec_data, const float *src_mat_data, float* des_mat_data,long rows, long cols, long stride)
-{
- long i = blockIdx.x * blockDim.x + threadIdx.x;
- long j = blockIdx.y * blockDim.y + threadIdx.y;
- long num_threads_x = blockDim.x * gridDim.x;
- long num_threads_y = blockDim.y * gridDim.y;
- long index = 0;
- for(; i<cols && j<rows; i+=num_threads_x, j+=num_threads_y) {
- index = j * stride + i;
- des_mat_data[index] = src_mat_data[index] + src_vec_data[i];
- }
-}
-
-__global__ static
-void kernel_set_value(float *data, float value, long n)
-{
- long index = blockIdx.x * blockDim.x + threadIdx.x;
- long num_threads = blockDim.x * gridDim.x;
- for(; index<n; index+=num_threads) {
- data[index] = value;
- }
-}
-
-__global__
-void kernel_scale(const float *src_data, float *des_data, float alpha, long n)
-{
- long index = blockIdx.x * blockDim.x + threadIdx.x;
- long num_threads = blockDim.x * gridDim.x;
- for(; index<n; index+=num_threads) {
- des_data[index] = src_data[index] * alpha;
- }
-}
-
-__global__
-void kernel_scale_grad(float *data, float alpha, long n)
-{
- long index = blockIdx.x * blockDim.x + threadIdx.x;
- long num_threads = blockDim.x * gridDim.x;
- for(; index<n; index+=num_threads) {
- data[index] = alpha;
- }
-}
-
-__global__
-void kernel_exp(const float *src_data, float *des_data, float alpha, long n)
-{
- long index = blockIdx.x * blockDim.x + threadIdx.x;
- long num_threads = blockDim.x * gridDim.x;
- for(; index<n; index+=num_threads) {
- des_data[index] = pow(-src_data[index],alpha);
- }
-}
-
-__global__
-void kernel_exp_grad(const float *src_data, float *des_data, float alpha, long n)
-{
- long index = blockIdx.x * blockDim.x + threadIdx.x;
- long num_threads = blockDim.x * gridDim.x;
- for(; index<n; index+=num_threads) {
- des_data[index] = src_data[index] * log(alpha);
- }
-}
-
-__global__
-void kernel_sigmoid(const float *src_data, float *des_data, float alpha, long n)
-{
- long index = blockIdx.x * blockDim.x + threadIdx.x;
- long num_threads = blockDim.x * gridDim.x;
- for(; index<n; index+=num_threads) {
- des_data[index] = 1.0f / (1.0f + expf(-src_data[index]) * alpha);
- }
-}
-
-__global__
-void kernel_sigmoid_grad(const float *src_data, float *des_data, float alpha, long n)
-{
- long index = blockIdx.x * blockDim.x + threadIdx.x;
- long num_threads = blockDim.x * gridDim.x;
- for(; index<n; index+=num_threads) {
- des_data[index] = src_data[index] * (1.0f - src_data[index]) * alpha;
- }
-}
-
-__global__
-void kernel_relu(const float *src_data, float *des_data, float alpha, long n)
-{
- long index = blockIdx.x * blockDim.x + threadIdx.x;
- long num_threads = blockDim.x * gridDim.x;
- for(; index<n; index+=num_threads) {
- des_data[index] = 1.0f / ( 1 - alpha ) * max( src_data[index], 0.0f ) + alpha * src_data[index];
- }
-}
-
-__global__
-void kernel_relu_grad(const float *src_data, float *des_data, float alpha, long n)
-{
- long index = blockIdx.x * blockDim.x + threadIdx.x;
- long num_threads = blockDim.x * gridDim.x;
- for(; index<n; index+=num_threads) {
- des_data[index] = src_data[index] > 0.0f ? 1.0f : alpha;
- }
-}
-
-
-__global__
-void kernel_tanh(const float *src_data, float *des_data, float alpha, long n)
-{
- long index = blockIdx.x * blockDim.x + threadIdx.x;
- long num_threads = blockDim.x * gridDim.x;
- for(; index<n; index+=num_threads) {
- des_data[index] = tanhf( src_data[index] * alpha );
- }
-}
-
-__global__
-void kernel_tanh_grad(const float *src_data, float *des_data, float alpha, long n)
-{
- long index = blockIdx.x * blockDim.x + threadIdx.x;
- long num_threads = blockDim.x * gridDim.x;
- for(; index<n; index+=num_threads) {
- des_data[index] = alpha * (1.0f - src_data[index] * src_data[index] );
- }
-}
-
-__global__
-void kernel_softplus(const float *src_data, float *des_data, float alpha, long n)
-{
- long index = blockIdx.x * blockDim.x + threadIdx.x;
- long num_threads = blockDim.x * gridDim.x;
- for(; index<n; index+=num_threads) {
- des_data[index] = logf(1 + expf(src_data[index]));
- }
-}
-
-__global__
-void kernel_softplus_grad(const float *src_data, float *des_data, float alpha, long n)
-{
- long index = blockIdx.x * blockDim.x + threadIdx.x;
- long num_threads = blockDim.x * gridDim.x;
- for(; index<n; index+=num_threads) {
- des_data[index] = 1.0f / (1.0f + expf(-src_data[index]));
- }
-}
-
-__global__
-void kernel_square(const float *src_data, float *des_data, float alpha, long n)
-{
- long index = blockIdx.x * blockDim.x + threadIdx.x;
- long num_threads = blockDim.x * gridDim.x;
- for(; index<n; index+=num_threads) {
- des_data[index] = src_data[index] * src_data[index];
- }
-}
-
-__global__
-void kernel_square_grad(const float *src_data, float *des_data, float alpha, long n)
-{
- long index = blockIdx.x * blockDim.x + threadIdx.x;
- long num_threads = blockDim.x * gridDim.x;
- for(; index<n; index+=num_threads) {
- des_data[index] = 2 * sqrt(src_data[index]);
- }
-}
-
-__global__
-void kernel_sqrt(const float *src_data, float *des_data, float alpha, long n)
-{
- long index = blockIdx.x * blockDim.x + threadIdx.x;
- long num_threads = blockDim.x * gridDim.x;
- for(; index<n; index+=num_threads) {
- des_data[index] = sqrt(src_data[index]);
- }
-}
-
-__global__
-void kernel_threshold(const float *src_data, float *des_data, float alpha, long n)
-{
- long index = blockIdx.x * blockDim.x + threadIdx.x;
- long num_threads = blockDim.x * gridDim.x;
- for(; index<n; index+=num_threads) {
- des_data[index] = src_data[index] < alpha ? 1.0f : 0.0f;
- }
-}
-
-__global__
-void kernel_add(const float *src_data_a, const float *src_data_b, float *des_data, float alpha, float beta, long n)
-{
- long index = blockIdx.x * blockDim.x + threadIdx.x;
- long num_threads = blockDim.x * gridDim.x;
- for(; index<n; index+=num_threads) {
- des_data[index] = src_data_a[index] + src_data_b[index];
- }
-}
-
-__global__
-void kernel_sub(const float *src_data_a, const float *src_data_b, float *des_data, float alpha, float beta, long n)
-{
- long index = blockIdx.x * blockDim.x + threadIdx.x;
- long num_threads = blockDim.x * gridDim.x;
- for(; index<n; index+=num_threads) {
- des_data[index] = src_data_a[index] - src_data_b[index];
- }
-}
-
-__global__
-void kernel_mult(const float *src_data_a, const float *src_data_b, float *des_data, float alpha, float beta, long n)
-{
- long index = blockIdx.x * blockDim.x + threadIdx.x;
- long num_threads = blockDim.x * gridDim.x;
- for(; index<n; index+=num_threads) {
- des_data[index] = src_data_a[index] * src_data_b[index];
- }
-}
-
-__global__
-void kernel_div(const float *src_data_a, const float *src_data_b, float *des_data, float alpha, float beta, long n)
-{
- long index = blockIdx.x * blockDim.x + threadIdx.x;
- long num_threads = blockDim.x * gridDim.x;
- for(; index<n; index+=num_threads) {
- des_data[index] = src_data_a[index] / src_data_b[index];
- }
-}
-
-//
-namespace singa{
-
-void singa_gpu_sum_vec(float *data, float *sum , long n)
-{
- long threads_per_block = n > CU1DBLOCK ? CU1DBLOCK : n;
- // here, we only need one block
- long num_blocks = 1;
-
- kernel_sum_vec<<<num_blocks, threads_per_block>>>(data, sum, n);
-}
-
-void singa_gpu_sum_col(const float *src_mat_data, float *dst_vec_data, long rows, long cols, long stride)
-{
- long threads_per_block = rows > CU1DBLOCK ? CU1DBLOCK : rows;
- long num_blocks = cols;
-
- kernel_sum_col<<<num_blocks, threads_per_block>>>(src_mat_data, dst_vec_data, rows, cols, stride);
-}
-
-void singa_gpu_add_vec_row(const float *src_vec_data, const float *src_mat_data, float *des_mat_data ,long rows, long cols, long stride)
-{
- dim3 threads_per_block(CU2DBLOCK_X, CU2DBLOCK_Y);
- dim3 num_blocks(cols/threads_per_block.x + (cols%threads_per_block.x == 0 ? 0 : 1), rows/threads_per_block.y + (rows%threads_per_block.y == 0 ? 0 : 1));
- kernel_add_vec_row<<<num_blocks, threads_per_block>>>(src_vec_data, src_mat_data, des_mat_data,rows, cols, stride);
-}
-
-void singa_gpu_set_value(float *data, float value, long n)
-{
- kernel_set_value<<<ceil(n/CU1DBLOCKF), CU1DBLOCKF>>>(data, value, n);
-}
-
-void singa_gpu_scale(const float *src_data, float *des_data, float alpha, long n)
-{
- kernel_scale<<<ceil(n/CU1DBLOCKF), CU1DBLOCKF>>>(src_data, des_data, alpha, n);
-}
-
-void singa_gpu_scale_grad(float *data, float alpha, long n)
-{
- kernel_scale_grad<<<ceil(n/CU1DBLOCKF), CU1DBLOCKF>>>(data, alpha, n);
-}
-
-void singa_gpu_exp(const float *src_data, float *des_data, float alpha, long n)
-{
- kernel_exp<<<ceil(n/CU1DBLOCKF), CU1DBLOCKF>>>(src_data, des_data, alpha, n);
-}
-
-void singa_gpu_exp_grad(const float *src_data, float *des_data, float alpha, long n)
-{
- kernel_exp_grad<<<ceil(n/CU1DBLOCKF), CU1DBLOCKF>>>(src_data, des_data, alpha, n);
-}
-
-void singa_gpu_sigmoid(const float *src_data, float *des_data, float alpha, long n)
-{
- kernel_sigmoid<<<ceil(n/CU1DBLOCKF), CU1DBLOCKF>>>(src_data, des_data, alpha, n);
-}
-
-void singa_gpu_sigmoid_grad(const float *src_data, float *des_data, float alpha, long n)
-{
- kernel_sigmoid_grad<<<ceil(n/CU1DBLOCKF), CU1DBLOCKF>>>(src_data, des_data, alpha, n);
-}
-
-void singa_gpu_relu(const float *src_data, float *des_data, float alpha, long n)
-{
- kernel_relu<<<ceil(n/CU1DBLOCKF), CU1DBLOCKF>>>(src_data, des_data, alpha, n);
-}
-
-void singa_gpu_relu_grad(const float *src_data, float *des_data, float alpha, long n)
-{
- kernel_relu_grad<<<ceil(n/CU1DBLOCKF), CU1DBLOCKF>>>(src_data, des_data, alpha, n);
-}
-
-void singa_gpu_tanh(const float *src_data, float *des_data, float alpha, long n)
-{
- kernel_tanh<<<ceil(n/CU1DBLOCKF), CU1DBLOCKF>>>(src_data, des_data, alpha, n);
-}
-
-void singa_gpu_tanh_grad(const float *src_data, float *des_data, float alpha, long n)
-{
- kernel_tanh_grad<<<ceil(n/CU1DBLOCKF), CU1DBLOCKF>>>(src_data, des_data, alpha, n);
-}
-
-void singa_gpu_softplus(const float *src_data, float *des_data, float alpha, long n)
-{
- kernel_softplus<<<ceil(n/CU1DBLOCKF), CU1DBLOCKF>>>(src_data, des_data, alpha, n);
-}
-
-void singa_gpu_softplus_grad(const float *src_data, float *des_data, float alpha, long n)
-{
- kernel_softplus_grad<<<ceil(n/CU1DBLOCKF), CU1DBLOCKF>>>(src_data, des_data, alpha, n);
-}
-
-void singa_gpu_square(const float *src_data, float *des_data, float alpha, long n)
-{
- kernel_square<<<ceil(n/CU1DBLOCKF), CU1DBLOCKF>>>(src_data, des_data, alpha, n);
-}
-
-void singa_gpu_square_grad(const float *src_data, float *des_data, float alpha, long n)
-{
- kernel_square_grad<<<ceil(n/CU1DBLOCKF), CU1DBLOCKF>>>(src_data, des_data, alpha, n);
-}
-
-void singa_gpu_sqrt(const float *src_data, float *des_data, float alpha, long n)
-{
- kernel_sqrt<<<ceil(n/CU1DBLOCKF), CU1DBLOCKF>>>(src_data, des_data, alpha, n);
-}
-
-void singa_gpu_threshold(const float *src_data, float *des_data, float alpha, long n)
-{
- kernel_threshold<<<ceil(n/CU1DBLOCKF), CU1DBLOCKF>>>(src_data, des_data, alpha, n);
-}
-
-void singa_gpu_add(const float *src_data_a, const float *src_data_b, float *des_data, float alpha, float beta, long n)
-{
- kernel_add<<<ceil(n/CU1DBLOCKF), CU1DBLOCKF>>>(src_data_a, src_data_b, des_data, alpha, beta, n);
-}
-
-void singa_gpu_sub(const float *src_data_a, const float *src_data_b, float *des_data, float alpha, float beta, long n)
-{
- kernel_sub<<<ceil(n/CU1DBLOCKF), CU1DBLOCKF>>>(src_data_a, src_data_b, des_data, alpha, beta, n);
-}
-
-void singa_gpu_mult(const float *src_data_a, const float *src_data_b, float *des_data, float alpha, float beta, long n)
-{
- kernel_mult<<<ceil(n/CU1DBLOCKF), CU1DBLOCKF>>>(src_data_a, src_data_b, des_data, alpha, beta, n);
-}
-
-void singa_gpu_div(const float *src_data_a, const float *src_data_b, float *des_data, float alpha, float beta, long n)
-{
- kernel_div<<<ceil(n/CU1DBLOCKF), CU1DBLOCKF>>>(src_data_a, src_data_b, des_data, alpha, beta, n);
-}
-
-
-}//namespace singa_gpu