You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@singa.apache.org by ka...@apache.org on 2015/11/16 07:08:52 UTC
[06/19] incubator-singa git commit: SINGA-80 New Blob Level and
Address Level Math Operation Interface
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/d3379cba/src/blob/test.cc
----------------------------------------------------------------------
diff --git a/src/blob/test.cc b/src/blob/test.cc
deleted file mode 100644
index d13ed5e..0000000
--- a/src/blob/test.cc
+++ /dev/null
@@ -1,165 +0,0 @@
-#include <iostream>
-
-#include "singa_op.h"
-#include "math_addr.h"
-
-using namespace std;
-
-void test_gemm1()
-{
- float A[3][2] = {};
- float B[3][2] = {};
- float C[2][2] = {};
- for(int i = 0; i < 3; i++)
- for(int j = 0; j < 2; j++)
- {
- A[i][j] = i+j;
- B[i][j] = i+j - i*j;
- }
- cpu_gemm(A[0], B[0], 2, 2, 3 , 1, 0, true, false, C[0]);
- float D[2][2] = {};
- for(int i = 0; i < 2; i++)
- for(int j = 0; j < 2; j++)
- {
- D[i][j] = 0;
- for(int k = 0; k < 3; k++)
- D[i][j] += A[k][i]*B[k][j];
- }
- for(int i = 0; i < 2; i++)
- for(int j = 0; j < 2; j++)
- {
- cout<<C[i][j] - D[i][j]<<endl;
- }
-}
-
-
-void test_gemm2()
-{
- float A[2][3] = {};
- float B[3][2] = {};
- float C[2][2] = {};
- for(int i = 0; i < 3; i++)
- for(int j = 0; j < 2; j++)
- {
- A[j][i] = i-j;
- B[i][j] = i+j + i*j;
- }
- cpu_gemm(A[0], B[0], 2, 2, 3 , 1, 0, false, false, C[0]);
- float D[2][2] = {};
- for(int i = 0; i < 2; i++)
- for(int j = 0; j < 2; j++)
- {
- D[i][j] = 0;
- for(int k = 0; k < 3; k++)
- D[i][j] += A[i][k]*B[k][j];
- }
- for(int i = 0; i < 2; i++)
- for(int j = 0; j < 2; j++)
- {
- cout<<C[i][j] - D[i][j]<<endl;
- }
-}
-
-
-void test_gemv()
-{
- float A[4][3] = {};
- float B[4]= {};
- float C[3] = {};
- float D[3] = {};
- for(int i = 0; i < 4; i++)
- {
- for(int j = 0; j < 3; j++)
- {
- A[j][i] = i-j + i*j;
- }
- }
- for(int i = 0; i < 4; i++)B[i] = i;
- for(int i = 0; i < 3; i++)C[i] = 10;
- cpu_gemv(A[0], B, 4, 3, 1, 1, true, C);
- for(int i = 0; i < 3; i++)
- for(int j = 0; j < 4; j++)
- {
- D[i] += A[j][i]*B[j];
- }
- for(int i = 0; i < 3; i++)cout<<C[i] - D[i] - 10<<endl;
-}
-
-void test_axpy()
-{
- float A[4][3] = {};
- float C[4][3] = {};
- float B[3][4] = {};
- float D[3][4] = {};
- for(int i = 0; i < 4; i++)
- {
- for(int j = 0; j < 3; j++)
- {
- A[i][j] = i-j + i*j;
- B[j][i] = i-j + i*j;
- C[i][j] = A[i][j];
- D[j][i] = B[j][i];
- }
- }
- cpu_axpy(A[0], 12, 2, B[0]);
- for(int i = 0; i < 12; i++)D[0][i] += 2*C[0][i];
- for(int i = 0; i < 3; i++)
- {
- for(int j = 0; j < 4; j++)
- {
- cout<<B[i][j] - D[i][j]<<endl;
- }
- }
-}
-
-void test_eop()
-{
- float A[10] = {};
- float B[10] = {};
- float C[10] = {};
- float D[10] = {};
- float O[10] = {};
- for(int i = 0; i < 10; i++)
- {
- A[i] = i;
- B[i] = -i;
- C[i] = i;
- }
- cpu_e_f<op::Set>(5, 15, O);
- for(int i = 0; i < 5; i++)cout<<O[i] - 15<<endl;
- for(int i = 5; i < 10; i++)cout<<O[i]<<endl;
- cpu_e_f<op::Scale>(10, C, 2, C);
- for(int i = 0; i < 10; i++)cout<<C[i] - 2* i<<endl;
- cpu_e_f<op::Add>(10, A, B, 0, 0, O);
- for(int i = 0; i < 10; i++)cout<<O[i]<<endl;
-}
-
-void test_exrd()
-{
- float A[3][10] = {};
- float B[3] = {};
- for(int i = 0; i < 3; i++)
- for(int j = 0; j < 10; j++)
- {
- A[i][j] = (i + 1)*j;
- }
- cpu_reduce_f<op::Sum>(A[0], 3, 10, B);
- for(int i = 0; i < 3; i++) B[i] -= 45*(i+1);
- for(int i = 0; i < 3; i++)cout<<B[i]<<endl;
- cpu_expand_f<op::Repmat>(B, 3, 10, A[0]);
- cpu_reduce_f<op::Sum>(A[0], 3, 10, B);
- for(int i = 0; i < 3; i++)cout<<B[i]<<endl;
-}
-
-int main()
-{
- test_gemm1() ;
- test_gemm2();
- test_gemv();
- test_axpy();
- test_eop();
- test_exrd();
- return 0;
-}
-
-
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/d3379cba/src/test/test_math.cc
----------------------------------------------------------------------
diff --git a/src/test/test_math.cc b/src/test/test_math.cc
index a8a9490..8043168 100644
--- a/src/test/test_math.cc
+++ b/src/test/test_math.cc
@@ -1,8 +1,7 @@
#include "gtest/gtest.h"
-#include "singa/blob/math_addr.h"
-#include "singa/blob/math_blob.h"
-#include "singa/blob/math_kernel.h"
-#include "singa/blob/singa_op.h"
+#include "singa/utils/math_addr.h"
+#include "singa/utils/math_kernel.h"
+#include "singa/utils/singa_op.h"
#include <cuda_runtime.h>
#include "cublas_v2.h"
@@ -37,10 +36,10 @@ TEST(MathTest, TestGemmCPU) {
}
TEST(MathTest, TestGemvCPU) {
- float A[4][3] = {};
- float B[4]= {};
- float C[3] = {};
- float D[3] = {};
+ float A[4][3] = {};
+ float B[4]= {};
+ float C[3] = {};
+ float D[3] = {};
for(int i = 0; i < 4; i++)
{
@@ -51,8 +50,8 @@ TEST(MathTest, TestGemvCPU) {
}
for(int i = 0; i < 4; i++)B[i] = i;
- for(int i = 0; i < 3; i++)C[i] = 10;
- cpu_gemv(A[0], B, 4, 3, 1, 1, true, C);
+ for(int i = 0; i < 3; i++)C[i] = 10;
+ cpu_gemv(A[0], B, 4, 3, 1, 1, true, C);
for(int i = 0; i < 3; i++)
{
@@ -69,9 +68,9 @@ TEST(MathTest, TestGemvCPU) {
TEST(MathTest, TestAxpyCPU) {
- float A[4][3] = {};
- float C[4][3] = {};
- float B[3][4] = {};
+ float A[4][3] = {};
+ float C[4][3] = {};
+ float B[3][4] = {};
float D[3][4] = {};
for(int i = 0; i < 4; i++)
@@ -113,7 +112,7 @@ TEST(MathTest, TestEopCPU) {
A[i] = i;
B[i] = -i;
C[i] = i;
-
+
}
cpu_e_f<singa_op::Set>(5, 15, O);
@@ -336,7 +335,7 @@ TEST(MathTest, TestSingaSumColGPU) {
A[i][j]=i+j;
}
}
-
+
for(int i = 0; i < 4; i++)
{
B[i]=0.0f;
@@ -462,7 +461,7 @@ TEST(MathTest, TestEopGPU) {
B[i] = -i;
C[i] = i;
O[i] = 0.0f;
-
+
}
float* A_gpu=NULL;
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/d3379cba/src/utils/blob.cc
----------------------------------------------------------------------
diff --git a/src/utils/blob.cc b/src/utils/blob.cc
index f720fae..cd164e7 100644
--- a/src/utils/blob.cc
+++ b/src/utils/blob.cc
@@ -7,9 +7,9 @@
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
-*
+*
* http://www.apache.org/licenses/LICENSE-2.0
-*
+*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
@@ -277,6 +277,12 @@ void Blob<Dtype>::ToProto(singa::BlobProto* proto) const {
}
template <typename Dtype>
+void Blob<Dtype>::SetValue(Dtype v) {
+ Dtype* ptr = mutable_cpu_data();
+ for (int i =0; i < count(); i++)
+ ptr[i] = v;
+}
+template <typename Dtype>
void Blob<Dtype>::ShareData(const Blob& other) {
CHECK_EQ(count_, other.count());
data_ = other.data_;
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/d3379cba/src/utils/math_kernel.cu
----------------------------------------------------------------------
diff --git a/src/utils/math_kernel.cu b/src/utils/math_kernel.cu
new file mode 100644
index 0000000..203f261
--- /dev/null
+++ b/src/utils/math_kernel.cu
@@ -0,0 +1,439 @@
+#include <cmath>
+#include "singa/utils/math_kernel.h"
+
+#define CU2DBLOCK_X 32
+#define CU2DBLOCK_Y 32
+
+#define CU1DBLOCK 1024
+#define CU1DBLOCKF 1024.0
+
+
+//Cuda Kernel Functions
+
// Reduce the n-element device vector `data` into the single scalar *sum.
// Launch contract (see singa_gpu_sum_vec): exactly ONE block with
// blockDim.x = min(n, CU1DBLOCK); the unguarded initial load below relies
// on threadIdx.x < n, which that contract guarantees.
__global__
void kernel_sum_vec(float *data, float *sum , long n)
{
  int THREADS = blockDim.x;

  // One shared-memory slot per thread for the partial sums.
  __shared__ float aux[CU1DBLOCK];
  int steps = (n - 1) / THREADS + 1;  // ceil(n / THREADS)
  aux[threadIdx.x] = data[threadIdx.x];

  // Each thread strides through the remainder of the input, accumulating
  // into its own slot (no races: slot index == threadIdx.x).
  for(int i=1; i<steps; ++i) {
    if(threadIdx.x+i*THREADS < n) {
      aux[threadIdx.x] += data[threadIdx.x+i*THREADS];
    }
  }

  int total_threads = THREADS;
  __syncthreads();

  // Pairwise tree reduction over the live slots; the (total+1)>>1
  // halving handles odd slot counts, and the inner bounds check stops
  // the upper half from reading past the live region.
  while(total_threads > 1) {
    int half_point = ((1+total_threads) >> 1);
    if (threadIdx.x < half_point) {
      if(threadIdx.x+half_point < total_threads) {
        aux[threadIdx.x] += aux[threadIdx.x + half_point];
      }
    }
    __syncthreads();
    total_threads = ((total_threads+1) >> 1);
  }

  __syncthreads();
  // Every thread stores the same final value from aux[0].
  *sum = aux[0];
}
+
// Column-wise sum of a rows x cols row-major matrix (leading dimension
// `stride`): dst_vec_data[j] = sum over i of src_mat_data[i*stride + j].
// Launch contract (see singa_gpu_sum_col): one block per column
// (gridDim.x == cols) with blockDim.x = min(rows, CU1DBLOCK); the
// unguarded initial load relies on threadIdx.x < rows.
__global__
void kernel_sum_col(const float *src_mat_data, float *dst_vec_data, long rows, long cols, long stride)
{
  int j = blockIdx.x;       // this block's column
  int THREADS = blockDim.x;
  if(j >= cols) {
    return;
  }

  __shared__ float aux[CU1DBLOCK];
  int steps = (rows - 1) / THREADS + 1;  // ceil(rows / THREADS)
  aux[threadIdx.x] = src_mat_data[j+threadIdx.x*stride];
  // Stride down the column, accumulating per-thread partials.
  for(int i=1; i<steps; ++i) {
    if(threadIdx.x+i*THREADS < rows) {
      aux[threadIdx.x] += src_mat_data[j+(threadIdx.x+i*THREADS)*stride];
    }
  }

  int total_threads = THREADS;
  __syncthreads();
  // Same pairwise tree reduction as kernel_sum_vec.
  while(total_threads > 1) {
    int half_point = ((1+total_threads) >> 1);
    if (threadIdx.x < half_point) {
      if(threadIdx.x+half_point < total_threads) {
        aux[threadIdx.x] += aux[threadIdx.x + half_point];
      }
    }
    __syncthreads();
    total_threads = ((total_threads+1) >> 1);
  }

  __syncthreads();
  dst_vec_data[j] = aux[0];
}
+
// des_mat[j][i] = src_mat[j][i] + src_vec[i]: broadcast-add a length-cols
// row vector to every row of a rows x cols row-major matrix whose leading
// dimension is `stride`.
//
// The original body advanced i and j in lockstep in a single loop, so it
// only touched every element when the grid already spanned the full
// matrix (as singa_gpu_add_vec_row's launch guarantees). The independent
// nested grid-stride loops below produce identical results under that
// launch AND stay correct for any smaller 2D grid.
__global__
void kernel_add_vec_row(const float *src_vec_data, const float *src_mat_data, float* des_mat_data,long rows, long cols, long stride)
{
  long col0 = blockIdx.x * blockDim.x + threadIdx.x;
  long row0 = blockIdx.y * blockDim.y + threadIdx.y;
  long num_threads_x = blockDim.x * gridDim.x;
  long num_threads_y = blockDim.y * gridDim.y;
  for (long j = row0; j < rows; j += num_threads_y) {
    for (long i = col0; i < cols; i += num_threads_x) {
      long index = j * stride + i;
      des_mat_data[index] = src_mat_data[index] + src_vec_data[i];
    }
  }
}
+
// Fill data[0..n-1] with `value` via a 1D grid-stride loop, so any
// grid/block configuration covers the whole range.
__global__ static
void kernel_set_value(float *data, float value, long n)
{
  long stride = blockDim.x * gridDim.x;
  for (long idx = blockIdx.x * blockDim.x + threadIdx.x; idx < n; idx += stride) {
    data[idx] = value;
  }
}
+
// des[i] = alpha * src[i] for i in [0, n), one element per stride step.
__global__
void kernel_scale(const float *src_data, float *des_data, float alpha, long n)
{
  long stride = blockDim.x * gridDim.x;
  for (long idx = blockIdx.x * blockDim.x + threadIdx.x; idx < n; idx += stride) {
    des_data[idx] = alpha * src_data[idx];
  }
}

// Gradient of the scale op: d(alpha*x)/dx = alpha, written to every
// element of `data` (the input values are not needed).
__global__
void kernel_scale_grad(float *data, float alpha, long n)
{
  long stride = blockDim.x * gridDim.x;
  for (long idx = blockIdx.x * blockDim.x + threadIdx.x; idx < n; idx += stride) {
    data[idx] = alpha;
  }
}
+
// Element-wise base-alpha exponential: des[i] = alpha ^ src[i].
// The original computed pow(-src[i], alpha), which yields NaN for any
// positive input with fractional alpha and does not match
// kernel_exp_grad below — src * ln(alpha) is the derivative of alpha^x
// expressed via the forward output. powf keeps the math in single
// precision.
__global__
void kernel_exp(const float *src_data, float *des_data, float alpha, long n)
{
  long index = blockIdx.x * blockDim.x + threadIdx.x;
  long num_threads = blockDim.x * gridDim.x;
  for(; index<n; index+=num_threads) {
    des_data[index] = powf(alpha, src_data[index]);
  }
}

// Gradient of y = alpha^x given the forward OUTPUT y in src:
// dy/dx = y * ln(alpha). logf (not the double log overload) avoids a
// float->double->float round trip.
__global__
void kernel_exp_grad(const float *src_data, float *des_data, float alpha, long n)
{
  long index = blockIdx.x * blockDim.x + threadIdx.x;
  long num_threads = blockDim.x * gridDim.x;
  for(; index<n; index+=num_threads) {
    des_data[index] = src_data[index] * logf(alpha);
  }
}
+
// Scaled sigmoid: des[i] = 1 / (1 + exp(-alpha * src[i])).
// The original wrote 1/(1 + expf(-x)*alpha) — alpha multiplied the exp
// term instead of scaling the argument — which is inconsistent with
// kernel_sigmoid_grad below: alpha * s * (1 - s) is the derivative of
// sigmoid(alpha*x), not of that expression.
__global__
void kernel_sigmoid(const float *src_data, float *des_data, float alpha, long n)
{
  long index = blockIdx.x * blockDim.x + threadIdx.x;
  long num_threads = blockDim.x * gridDim.x;
  for(; index<n; index+=num_threads) {
    des_data[index] = 1.0f / (1.0f + expf(-alpha * src_data[index]));
  }
}

// Gradient of s = sigmoid(alpha*x) given the forward OUTPUT s in src:
// ds/dx = alpha * s * (1 - s).
__global__
void kernel_sigmoid_grad(const float *src_data, float *des_data, float alpha, long n)
{
  long index = blockIdx.x * blockDim.x + threadIdx.x;
  long num_threads = blockDim.x * gridDim.x;
  for(; index<n; index+=num_threads) {
    des_data[index] = src_data[index] * (1.0f - src_data[index]) * alpha;
  }
}
+
// Leaky-ReLU-style activation. As written this computes
//   x >  0:  (1/(1-alpha)) * x + alpha * x
//   x <= 0:  alpha * x
// NOTE(review): the 1/(1-alpha) factor on the positive branch does not
// match kernel_relu_grad below, whose positive-side derivative is
// exactly 1 (plain leaky ReLU would be x > 0 ? x : alpha*x) — confirm
// the intended formula. Also requires alpha != 1 to avoid divide-by-zero.
__global__
void kernel_relu(const float *src_data, float *des_data, float alpha, long n)
{
  long index = blockIdx.x * blockDim.x + threadIdx.x;
  long num_threads = blockDim.x * gridDim.x;
  for(; index<n; index+=num_threads) {
    des_data[index] = 1.0f / ( 1 - alpha ) * max( src_data[index], 0.0f ) + alpha * src_data[index];
  }
}

// Gradient of leaky ReLU wrt its input: 1 on the positive side, alpha
// (the leak slope) elsewhere. src here is the forward INPUT — only its
// sign is inspected.
__global__
void kernel_relu_grad(const float *src_data, float *des_data, float alpha, long n)
{
  long index = blockIdx.x * blockDim.x + threadIdx.x;
  long num_threads = blockDim.x * gridDim.x;
  for(; index<n; index+=num_threads) {
    des_data[index] = src_data[index] > 0.0f ? 1.0f : alpha;
  }
}
+
+
// des[i] = tanh(alpha * src[i]), single-precision.
__global__
void kernel_tanh(const float *src_data, float *des_data, float alpha, long n)
{
  long stride = blockDim.x * gridDim.x;
  for (long idx = blockIdx.x * blockDim.x + threadIdx.x; idx < n; idx += stride) {
    des_data[idx] = tanhf(alpha * src_data[idx]);
  }
}

// Gradient of t = tanh(alpha*x) given the forward OUTPUT t in src:
// dt/dx = alpha * (1 - t^2).
__global__
void kernel_tanh_grad(const float *src_data, float *des_data, float alpha, long n)
{
  long stride = blockDim.x * gridDim.x;
  for (long idx = blockIdx.x * blockDim.x + threadIdx.x; idx < n; idx += stride) {
    float t = src_data[idx];
    des_data[idx] = alpha * (1.0f - t * t);
  }
}
+
// Softplus: des[i] = log(1 + exp(src[i])). The alpha parameter is
// accepted for signature uniformity with the other unary kernels but
// is not used.
__global__
void kernel_softplus(const float *src_data, float *des_data, float alpha, long n)
{
  long stride = blockDim.x * gridDim.x;
  for (long idx = blockIdx.x * blockDim.x + threadIdx.x; idx < n; idx += stride) {
    des_data[idx] = logf(1.0f + expf(src_data[idx]));
  }
}

// Gradient of softplus wrt its input x: sigmoid(x) = 1/(1+exp(-x)).
// src is the forward INPUT; alpha is unused.
__global__
void kernel_softplus_grad(const float *src_data, float *des_data, float alpha, long n)
{
  long stride = blockDim.x * gridDim.x;
  for (long idx = blockIdx.x * blockDim.x + threadIdx.x; idx < n; idx += stride) {
    des_data[idx] = 1.0f / (1.0f + expf(-src_data[idx]));
  }
}
+
// des[i] = src[i]^2. alpha is accepted for signature uniformity and is
// not used.
__global__
void kernel_square(const float *src_data, float *des_data, float alpha, long n)
{
  long index = blockIdx.x * blockDim.x + threadIdx.x;
  long num_threads = blockDim.x * gridDim.x;
  for(; index<n; index+=num_threads) {
    des_data[index] = src_data[index] * src_data[index];
  }
}

// Gradient of y = x^2 expressed via the forward OUTPUT y in src:
// dy/dx = 2x = 2*sqrt(y) (valid for x >= 0; same output-based
// convention as the other *_grad kernels in this file). sqrtf and the
// 2.0f literal keep the computation in single precision — the original
// promoted through the double sqrt() overload.
__global__
void kernel_square_grad(const float *src_data, float *des_data, float alpha, long n)
{
  long index = blockIdx.x * blockDim.x + threadIdx.x;
  long num_threads = blockDim.x * gridDim.x;
  for(; index<n; index+=num_threads) {
    des_data[index] = 2.0f * sqrtf(src_data[index]);
  }
}
+
// des[i] = sqrt(src[i]). alpha is unused (signature uniformity).
// sqrtf avoids the float->double->float round trip of the double
// sqrt() overload the original used.
__global__
void kernel_sqrt(const float *src_data, float *des_data, float alpha, long n)
{
  long index = blockIdx.x * blockDim.x + threadIdx.x;
  long num_threads = blockDim.x * gridDim.x;
  for(; index<n; index+=num_threads) {
    des_data[index] = sqrtf(src_data[index]);
  }
}
+
// Below-threshold indicator: des[i] = 1.0f when src[i] < alpha,
// otherwise 0.0f.
__global__
void kernel_threshold(const float *src_data, float *des_data, float alpha, long n)
{
  long stride = blockDim.x * gridDim.x;
  for (long idx = blockIdx.x * blockDim.x + threadIdx.x; idx < n; idx += stride) {
    if (src_data[idx] < alpha) {
      des_data[idx] = 1.0f;
    } else {
      des_data[idx] = 0.0f;
    }
  }
}
+
// Element-wise binary kernels: des[i] = a[i] (op) b[i]. The alpha and
// beta parameters exist for signature uniformity with the host-side
// wrappers; none of the four operations uses them.

__global__
void kernel_add(const float *src_data_a, const float *src_data_b, float *des_data, float alpha, float beta, long n)
{
  long stride = blockDim.x * gridDim.x;
  for (long idx = blockIdx.x * blockDim.x + threadIdx.x; idx < n; idx += stride) {
    des_data[idx] = src_data_a[idx] + src_data_b[idx];
  }
}

__global__
void kernel_sub(const float *src_data_a, const float *src_data_b, float *des_data, float alpha, float beta, long n)
{
  long stride = blockDim.x * gridDim.x;
  for (long idx = blockIdx.x * blockDim.x + threadIdx.x; idx < n; idx += stride) {
    des_data[idx] = src_data_a[idx] - src_data_b[idx];
  }
}

__global__
void kernel_mult(const float *src_data_a, const float *src_data_b, float *des_data, float alpha, float beta, long n)
{
  long stride = blockDim.x * gridDim.x;
  for (long idx = blockIdx.x * blockDim.x + threadIdx.x; idx < n; idx += stride) {
    des_data[idx] = src_data_a[idx] * src_data_b[idx];
  }
}

// Note: no guard against src_data_b[idx] == 0 — caller's responsibility.
__global__
void kernel_div(const float *src_data_a, const float *src_data_b, float *des_data, float alpha, float beta, long n)
{
  long stride = blockDim.x * gridDim.x;
  for (long idx = blockIdx.x * blockDim.x + threadIdx.x; idx < n; idx += stride) {
    des_data[idx] = src_data_a[idx] / src_data_b[idx];
  }
}
+
+//
namespace singa{

// Number of CU1DBLOCK-thread blocks needed to cover n elements,
// computed with integer ceil-division. The original used
// ceil(n/CU1DBLOCKF), building launch dimensions through double
// arithmetic and implicit float->dim3 conversions; integer math is the
// conventional form and cannot mis-round.
static inline long num_blocks_1d(long n)
{
  return (n + CU1DBLOCK - 1) / CU1DBLOCK;
}

// Sum the n-element device vector `data` into the device scalar *sum.
// kernel_sum_vec requires a single block, with every launched thread
// having a valid initial element (blockDim.x <= n).
void singa_gpu_sum_vec(float *data, float *sum , long n)
{
  long threads_per_block = n > CU1DBLOCK ? CU1DBLOCK : n;
  // here, we only need one block
  long num_blocks = 1;

  kernel_sum_vec<<<num_blocks, threads_per_block>>>(data, sum, n);
}

// Column sums of a rows x cols matrix: one block per column, as
// kernel_sum_col's indexing requires (blockIdx.x selects the column).
void singa_gpu_sum_col(const float *src_mat_data, float *dst_vec_data, long rows, long cols, long stride)
{
  long threads_per_block = rows > CU1DBLOCK ? CU1DBLOCK : rows;
  long num_blocks = cols;

  kernel_sum_col<<<num_blocks, threads_per_block>>>(src_mat_data, dst_vec_data, rows, cols, stride);
}

// Broadcast-add a row vector to every row of a rows x cols matrix.
// The 2D grid is sized to cover the full matrix (ceil-div per axis).
void singa_gpu_add_vec_row(const float *src_vec_data, const float *src_mat_data, float *des_mat_data ,long rows, long cols, long stride)
{
  dim3 threads_per_block(CU2DBLOCK_X, CU2DBLOCK_Y);
  dim3 num_blocks((cols + threads_per_block.x - 1) / threads_per_block.x,
                  (rows + threads_per_block.y - 1) / threads_per_block.y);
  kernel_add_vec_row<<<num_blocks, threads_per_block>>>(src_vec_data, src_mat_data, des_mat_data, rows, cols, stride);
}

// data[0..n-1] = value.
void singa_gpu_set_value(float *data, float value, long n)
{
  kernel_set_value<<<num_blocks_1d(n), CU1DBLOCK>>>(data, value, n);
}

// des = alpha * src, element-wise.
void singa_gpu_scale(const float *src_data, float *des_data, float alpha, long n)
{
  kernel_scale<<<num_blocks_1d(n), CU1DBLOCK>>>(src_data, des_data, alpha, n);
}

// Writes the constant gradient alpha into data.
void singa_gpu_scale_grad(float *data, float alpha, long n)
{
  kernel_scale_grad<<<num_blocks_1d(n), CU1DBLOCK>>>(data, alpha, n);
}

// Element-wise exponential (see kernel_exp for the exact formula).
void singa_gpu_exp(const float *src_data, float *des_data, float alpha, long n)
{
  kernel_exp<<<num_blocks_1d(n), CU1DBLOCK>>>(src_data, des_data, alpha, n);
}

void singa_gpu_exp_grad(const float *src_data, float *des_data, float alpha, long n)
{
  kernel_exp_grad<<<num_blocks_1d(n), CU1DBLOCK>>>(src_data, des_data, alpha, n);
}

void singa_gpu_sigmoid(const float *src_data, float *des_data, float alpha, long n)
{
  kernel_sigmoid<<<num_blocks_1d(n), CU1DBLOCK>>>(src_data, des_data, alpha, n);
}

void singa_gpu_sigmoid_grad(const float *src_data, float *des_data, float alpha, long n)
{
  kernel_sigmoid_grad<<<num_blocks_1d(n), CU1DBLOCK>>>(src_data, des_data, alpha, n);
}

void singa_gpu_relu(const float *src_data, float *des_data, float alpha, long n)
{
  kernel_relu<<<num_blocks_1d(n), CU1DBLOCK>>>(src_data, des_data, alpha, n);
}

void singa_gpu_relu_grad(const float *src_data, float *des_data, float alpha, long n)
{
  kernel_relu_grad<<<num_blocks_1d(n), CU1DBLOCK>>>(src_data, des_data, alpha, n);
}

void singa_gpu_tanh(const float *src_data, float *des_data, float alpha, long n)
{
  kernel_tanh<<<num_blocks_1d(n), CU1DBLOCK>>>(src_data, des_data, alpha, n);
}

void singa_gpu_tanh_grad(const float *src_data, float *des_data, float alpha, long n)
{
  kernel_tanh_grad<<<num_blocks_1d(n), CU1DBLOCK>>>(src_data, des_data, alpha, n);
}

void singa_gpu_softplus(const float *src_data, float *des_data, float alpha, long n)
{
  kernel_softplus<<<num_blocks_1d(n), CU1DBLOCK>>>(src_data, des_data, alpha, n);
}

void singa_gpu_softplus_grad(const float *src_data, float *des_data, float alpha, long n)
{
  kernel_softplus_grad<<<num_blocks_1d(n), CU1DBLOCK>>>(src_data, des_data, alpha, n);
}

void singa_gpu_square(const float *src_data, float *des_data, float alpha, long n)
{
  kernel_square<<<num_blocks_1d(n), CU1DBLOCK>>>(src_data, des_data, alpha, n);
}

void singa_gpu_square_grad(const float *src_data, float *des_data, float alpha, long n)
{
  kernel_square_grad<<<num_blocks_1d(n), CU1DBLOCK>>>(src_data, des_data, alpha, n);
}

void singa_gpu_sqrt(const float *src_data, float *des_data, float alpha, long n)
{
  kernel_sqrt<<<num_blocks_1d(n), CU1DBLOCK>>>(src_data, des_data, alpha, n);
}

// des = (src < alpha) ? 1 : 0, element-wise.
void singa_gpu_threshold(const float *src_data, float *des_data, float alpha, long n)
{
  kernel_threshold<<<num_blocks_1d(n), CU1DBLOCK>>>(src_data, des_data, alpha, n);
}

// Element-wise binary ops; alpha/beta are forwarded but currently
// unused by the kernels.
void singa_gpu_add(const float *src_data_a, const float *src_data_b, float *des_data, float alpha, float beta, long n)
{
  kernel_add<<<num_blocks_1d(n), CU1DBLOCK>>>(src_data_a, src_data_b, des_data, alpha, beta, n);
}

void singa_gpu_sub(const float *src_data_a, const float *src_data_b, float *des_data, float alpha, float beta, long n)
{
  kernel_sub<<<num_blocks_1d(n), CU1DBLOCK>>>(src_data_a, src_data_b, des_data, alpha, beta, n);
}

void singa_gpu_mult(const float *src_data_a, const float *src_data_b, float *des_data, float alpha, float beta, long n)
{
  kernel_mult<<<num_blocks_1d(n), CU1DBLOCK>>>(src_data_a, src_data_b, des_data, alpha, beta, n);
}

void singa_gpu_div(const float *src_data_a, const float *src_data_b, float *des_data, float alpha, float beta, long n)
{
  kernel_div<<<num_blocks_1d(n), CU1DBLOCK>>>(src_data_a, src_data_b, des_data, alpha, beta, n);
}


}  // namespace singa