You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@singa.apache.org by ka...@apache.org on 2015/11/16 07:08:49 UTC
[03/19] incubator-singa git commit: SINGA-80 New Blob Level and
Address Level Math Operation Interface
SINGA-80 New Blob Level and Address Level Math Operation Interface
Project: http://git-wip-us.apache.org/repos/asf/incubator-singa/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-singa/commit/c13e0370
Tree: http://git-wip-us.apache.org/repos/asf/incubator-singa/tree/c13e0370
Diff: http://git-wip-us.apache.org/repos/asf/incubator-singa/diff/c13e0370
Branch: refs/heads/master
Commit: c13e0370906371ca001e755268c51e656dfde0fa
Parents: bbb7dbc
Author: seaok <se...@gmail.com>
Authored: Fri Oct 30 10:02:22 2015 +0800
Committer: Wei Wang <wa...@comp.nus.edu.sg>
Committed: Mon Nov 9 17:04:48 2015 +0800
----------------------------------------------------------------------
Makefile.gpu | 153 ++++++++++++++
include/singa/blob/math_kernel.h | 14 ++
src/blob/math_addr.cc | 64 ++++++
src/blob/math_blob.cc | 148 +++++++------
src/blob/math_kernel.cu | 78 +++++++
src/test/test_math.cc | 385 ++++++++++++++++++++++++++++++++++
6 files changed, 775 insertions(+), 67 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/c13e0370/Makefile.gpu
----------------------------------------------------------------------
diff --git a/Makefile.gpu b/Makefile.gpu
new file mode 100644
index 0000000..9c9051c
--- /dev/null
+++ b/Makefile.gpu
@@ -0,0 +1,153 @@
+#/**
+# * Copyright 2015 The Apache Software Foundation
+# *
+# * Licensed to the Apache Software Foundation (ASF) under one
+# * or more contributor license agreements. See the NOTICE file
+# * distributed with this work for additional information
+# * regarding copyright ownership. The ASF licenses this file
+# * to you under the Apache License, Version 2.0 (the
+# * "License"); you may not use this file except in compliance
+# * with the License. You may obtain a copy of the License at
+# *
+# * http://www.apache.org/licenses/LICENSE-2.0
+# *
+# * Unless required by applicable law or agreed to in writing, software
+# * distributed under the License is distributed on an "AS IS" BASIS,
+# * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# * See the License for the specific language governing permissions and
+# * limitations under the License.
+# */
+
+###################User Config Varaibles #############################
+# third-party library installation folder
+HOME_DIR := /usr
+
+CUDA_DIR := /usr/local/cuda
+
+#CUDA_DIR :=
+
+# Lib folder for system and external libs. You may need to change it.
+LIBRARY_DIRS := $(HOME_DIR)/lib64 $(HOME_DIR)/lib $(HOME_DIR)/local/lib $(CUDA_DIR)/lib $(CUDA_DIR)/lib64
+# Header folder for system and external libs. You may need to change it.
+INCLUDE_DIRS := $(HOME_DIR)/include ./include $(HOME_DIR)/local/include/zookeeper $(CUDA_DIR)/include
+# g++ location, should support c++11, tested with 4.8.1
+CXX := g++
+CUCXX := nvcc
+
+######################Setting Varialbes#######################################
+LIBRARIES := glog protobuf openblas zmq czmq zookeeper_mt
+
+ifneq ($(CUDA_DIR),)
+ LIBRARIES := $(LIBRARIES) cublas cudart curand
+endif
+
+LDFLAGS := $(foreach librarydir, $(LIBRARY_DIRS), -L$(librarydir))\
+ $(foreach library, $(LIBRARIES), -l$(library))
+# Folder to store compiled files
+BUILD_DIR := .libs
+MSHADOW_FLAGS :=-DMSHADOW_USE_CUDA=0 -DMSHADOW_USE_CBLAS=1 -DMSHADOW_USE_MKL=0
+ZK_FLAGS :=-DTHREADED -fpermissive
+CXXFLAGS := -O2 -msse3 -Wall -pthread -fPIC -std=c++11 -Wno-unknown-pragmas \
+ $(MSHADOW_FLAGS) -DCPU_ONLY=1 $(ZK_FLAGS)\
+ -funroll-loops $(foreach includedir, $(INCLUDE_DIRS), -I$(includedir))
+CUCXXFLAGS := $(MSHADOW_FLAGS) -std=c++11 -G $(CUDA_ARCH) \
+ $(foreach includedir, $(INCLUDE_DIRS), -I$(includedir))
+
+#Add device compile option
+ifeq ($(CUDA_DIR),)
+ MSHADOW_FLAGS := $(MSHADOW_FLAGS) -DCPU_ONLY
+ CXXFLAGS := $(CXXFLAGS) -DCPU_ONLY
+endif
+
+# find user defined .proto file, and then compute the corresponding .h, .cc
+# files, which cannot be found by shell find, because they haven't been
+# generated currently
+PROTOS := $(shell find src/proto/ -name "*.proto")
+PROTO_SRCS :=$(PROTOS:.proto=.pb.cc)
+PROTO_HDRS :=$(patsubst src%, include%, $(PROTOS:.proto=.pb.h))
+PROTO_OBJS :=$(addprefix $(BUILD_DIR)/, $(PROTO_SRCS:.cc=.o))
+
+# each singa src file will generate a .o file
+SINGA_SRCS := $(shell find src/ \( -path "src/test" -o -path "src/main.cc" -o -path "src/utils/tool.cc" \) \
+ -prune -o \( -name "*.cc" -type f \) -print )
+SINGA_OBJS := $(sort $(addprefix $(BUILD_DIR)/, $(SINGA_SRCS:.cc=.o)) \
+ $(PROTO_OBJS) )
+-include $(SINGA_OBJS:%.o=%.P)
+
+TEST_SRCS :=$(shell find src/test/ -maxdepth 1 -name "*.cc")
+TEST_OBJS := $(sort $(addprefix $(BUILD_DIR)/, $(TEST_SRCS:.cc=.o)))
+-include $(TEST_OBJS:%.o=%.P)
+
+TEST_CUDA_SRCS :=$(shell find src/test/ -maxdepth 1 -name "*.cu")
+TEST_CUDA_OBJS := $(sort $(addprefix $(BUILD_DIR)/, $(TEST_CUDA_SRCS:.cu=.o)))
+-include $(TEST_CUDA_OBJS:%.o=%.P)
+
+SINGA_CUDA_SRCS :=$(shell find src/ -maxdepth 2 -name "*.cu")
+SINGA_CUDA_OBJS := $(sort $(addprefix $(BUILD_DIR)/, $(SINGA_CUDA_SRCS:.cu=.o)))
+-include $(SINGA_CUDA_OBJS:%.o=%.P)
+
+GTEST_SRC := include/gtest/gtest-all.cc
+GTEST_HDR := include/gtest/gtest.h
+GTEST_LIB := $(BUILD_DIR)/libgtest.a
+
+OBJS := $(sort $(SINGA_OBJS) $(TEST_OBJS) )
+CUOBJS := $(sort $(SINGA_CUDA_OBJS) $(TEST_CUDA_OBJS) )
+
+########################Compilation Section###################################
+.PHONY: singa test
+
+singa: $(PROTO_OBJS) $(SINGA_OBJS) $(SINGA_CUDA_OBJS)
+ $(CXX) -shared -o $(BUILD_DIR)/libsinga.so $(SINGA_OBJS)
+ $(CXX) $(SINGA_OBJS) $(SINGA_CUDA_OBJS) src/main.cc -o singa $(CXXFLAGS) $(LDFLAGS)
+ @echo
+ $(CXX) $(SINGA_OBJS) $(SINGA_CUDA_OBJS) src/utils/tool.cc -o singatool $(CXXFLAGS) $(LDFLAGS)
+ @echo
+
+loader: proto $(LOADER_OBJS)
+ $(CXX) $(LOADER_OBJS) -o $(BUILD_DIR)/loader $(CXXFLAGS) $(LDFLAGS)
+ @echo
+
+test: proto $(GTEST_LIB) $(TEST_OBJS) $(TEST_CUDA_OBJS) $(SINGA_OBJS) $(SINGA_CUDA_OBJS)
+ $(CXX) $(TEST_OBJS) $(TEST_CUDA_OBJS) include/gtest/gtest_main.cc $(GTEST_LIB) \
+ $(SINGA_OBJS) $(SINGA_CUDA_OBJS) -o $(BUILD_DIR)/test $(CXXFLAGS) $(LDFLAGS)
+ @echo
+
+$(GTEST_LIB): $(GTEST_HDR) $(GTEST_SRC)
+ $(CXX) $(GTEST_SRC) -c -o $(BUILD_DIR)/gtest-all.o $(CXXFLAGS)
+ ar -rv $(GTEST_LIB) $(BUILD_DIR)/gtest-all.o
+
+# compile all files
+$(OBJS):$(BUILD_DIR)/%.o : %.cc
+ @mkdir -p $(dir $@)
+ $(CXX) $< $(CXXFLAGS) -MMD -c -o $@
+ cp $(BUILD_DIR)/$*.d $(BUILD_DIR)/$*.P; \
+ sed -e 's/#.*//' -e 's/^[^:]*: *//' -e 's/ *\\$$//' \
+ -e '/^$$/ d' -e 's/$$/ :/' < $(BUILD_DIR)/$*.d >> $(BUILD_DIR)/$*.P; \
+ rm -f $*.d
+
+$(CUOBJS):$(BUILD_DIR)/%.o : %.cu
+ @mkdir -p $(dir $@)
+ $(CUCXX) $< -c -o $@ $(CUCXXFLAGS)
+ cp $(BUILD_DIR)/$*.d $(BUILD_DIR)/$*.P; \
+ sed -e 's/#.*//' -e 's/^[^:]*: *//' -e 's/ *\\$$//' \
+ -e '/^$$/ d' -e 's/$$/ :/' < $(BUILD_DIR)/$*.d >> $(BUILD_DIR)/$*.P; \
+ rm -f $*.d
+
+proto: $(PROTO_OBJS)
+
+$(PROTO_SRCS): $(PROTOS)
+ protoc --proto_path=src/proto --cpp_out=src/proto $(PROTOS)
+ mkdir -p include/proto/
+ cp src/proto/*.pb.h include/proto/
+ mkdir -p tool/pb2/
+ touch tool/pb2/__init__.py
+ protoc --proto_path=src/proto --python_out=tool/pb2/ $(PROTOS)
+ @echo
+
+clean:
+ rm -rf *.a *.so
+ rm -rf include/proto/*
+ rm -rf src/proto/*.pb.h src/proto/*.pb.cc
+ rm -rf tool/pb2/*
+ rm -rf $(BUILD_DIR)
+ @echo
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/c13e0370/include/singa/blob/math_kernel.h
----------------------------------------------------------------------
diff --git a/include/singa/blob/math_kernel.h b/include/singa/blob/math_kernel.h
new file mode 100644
index 0000000..9aaf4c2
--- /dev/null
+++ b/include/singa/blob/math_kernel.h
@@ -0,0 +1,14 @@
+#ifndef MATH_KERNEL_H
+#define MATH_KERNEL_H
+
+namespace singa{
+
+extern "C" {
+ void singa_sum_col(float *src_mat_data, float *dst_vec_data, long rows, long cols, long stride);
+
+ void singa_add_vec_row(float *src_vec_data, float *src_mat_data, float *des_mat_data, long rows, long cols, long stride);
+};
+
+}
+
+#endif
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/c13e0370/src/blob/math_addr.cc
----------------------------------------------------------------------
diff --git a/src/blob/math_addr.cc b/src/blob/math_addr.cc
index f28fdcb..799a749 100644
--- a/src/blob/math_addr.cc
+++ b/src/blob/math_addr.cc
@@ -3,6 +3,9 @@ extern "C"
#include <cblas.h>
}
+#include <cuda_runtime.h>
+#include "cublas_v2.h"
+
#include "singa/blob/math_addr.h"
#include "singa/blob/singa_op.h"
@@ -47,5 +50,66 @@ float cpu_dot(const float * A, const float * B, const int n)
return sum;
}
+//Trick: swap A and B
+//
+void gpu_gemm(const float * A, const float * B, const int m, const int n, const int k, const float alpha, const float beta, const bool TranA, const bool TranB, float * C)
+{
+ int lda = TranA ? m : k;
+ int ldb = TranB ? k : n;
+ int ldc = n;
+
+ cublasOperation_t tA= (TranA==false) ? CUBLAS_OP_N : CUBLAS_OP_T;
+ cublasOperation_t tB= (TranB==false) ? CUBLAS_OP_N : CUBLAS_OP_T;
+
+ cublasHandle_t handle;
+ cublasCreate(&handle);
+
+ cublasSgemm(handle, tB, tA, n, m, k, &alpha, B, ldb, A, lda, &beta, C, ldc);
+
+ cublasDestroy(handle);
+}
+
+void gpu_gemv(const float * A, const float * B, const int m, const int n, const float alpha, const float beta, const bool TranA, float * C)
+{
+ int lda = n ;
+ cublasOperation_t tA= (TranA==true) ? CUBLAS_OP_N : CUBLAS_OP_T;
+
+ cublasHandle_t handle;
+ cublasCreate(&handle);
+
+ cublasSgemv(handle, tA , n , m ,&alpha , A , lda , B , 1 ,&beta , C , 1);
+
+ cublasDestroy(handle);
+
+}
+
+
+void gpu_axpy(const float * A, const int n, const float alpha, float * B)
+{
+
+ cublasHandle_t handle;
+ cublasCreate(&handle);
+
+ cublasSaxpy(handle,n,&alpha,A,1,B,1);
+
+ cublasDestroy(handle);
+
+}
+
+
+float gpu_dot(const float * A, const float * B, const int n)
+{
+ cublasHandle_t handle;
+ cublasCreate(&handle);
+
+ float result=0.0;
+
+ cublasSdot(handle,n,A,1,B,1,&result);
+
+ cublasDestroy(handle);
+
+ return result;
+
+}
} // namespace singa
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/c13e0370/src/blob/math_blob.cc
----------------------------------------------------------------------
diff --git a/src/blob/math_blob.cc b/src/blob/math_blob.cc
index ff81667..9421367 100644
--- a/src/blob/math_blob.cc
+++ b/src/blob/math_blob.cc
@@ -1,4 +1,5 @@
#include "singa/blob/math_blob.h"
+#include "singa/blob/math_kernel.h"
namespace singa {
@@ -18,24 +19,26 @@ int get_size(const std::vector<int>& shape)
void GEMM(XPU xpu, const Blob<float> & A, const Blob<float> & B, Blob<float> * C, float alpha, float beta)
{
- if(xpu == cpu)
+ if(check_shape_mmm(A, B, *C))
{
- if(check_shape_mmm(A, B, *C))
- {
- int m = C->shape().at(0);
+ int m = C->shape().at(0);
int n = C->shape().at(1);
int k = A.isTranspose() ? A.shape().at(0) : A.shape().at(1);
bool TranA = A.isTranspose();
bool TranB = B.isTranspose();
- cpu_gemm(A.cpu_data(), B.cpu_data(), m, n, k, alpha, beta, TranA, TranB, C->mutable_cpu_data());
- }
- else{
- // report errors here
- }
+
+ if(xpu == cpu)
+ {
+ cpu_gemm(A.cpu_data(), B.cpu_data(), m, n, k, alpha, beta, TranA, TranB, C->mutable_cpu_data());
+ }
+ if(xpu == gpu)
+ {
+ //gpu part
+ gpu_gemm(A.gpu_data(), B.gpu_data(), m, n, k, alpha, beta, TranA, TranB, C->mutable_gpu_data());
+ }
}
- if(xpu == gpu)
- {
- //gpu part
+ else{
+ // report errors here
}
}
//C = alpha*A*B+beta*C, A, B and C are matrix
@@ -50,22 +53,24 @@ void MMDot(XPU xpu, const Blob<float> & A, const Blob<float> & B, Blob<float> *
void MVDot(XPU xpu, const Blob<float> & A, const Blob<float> & B, Blob<float> * C)
{
- if(xpu == cpu)
+ if(check_shape_mvv(A, B, *C))
{
- if(check_shape_mvv(A, B, *C))
+ int m = B.shape().at(0);
+ int n = C->shape().at(0);
+ bool TranA = A.isTranspose();
+
+ if(xpu == cpu)
{
- int m = B.shape().at(0);
- int n = C->shape().at(0);
- bool TranA = A.isTranspose();
cpu_gemv(A.cpu_data(), B.cpu_data(), m, n, 1, 0, TranA, C->mutable_cpu_data());
}
- else{
- // report errors here
+ if(xpu == gpu)
+ {
+ //gpu part
+ gpu_gemv(A.gpu_data(), B.gpu_data(), m, n, 1, 0, TranA, C->mutable_gpu_data());
}
}
- if(xpu == gpu)
- {
- //gpu part
+ else{
+ // report errors here
}
}
@@ -74,21 +79,23 @@ void MVDot(XPU xpu, const Blob<float> & A, const Blob<float> & B, Blob<float> *
void VVDot(XPU xpu, const Blob<float> & A, const Blob<float> & B, Blob<float> * C)
{
- if(xpu == cpu)
+ if(check_shape_vvm(A, B, *C))
{
- if(check_shape_vvm(A, B, *C))
+ int m = C->shape().at(0);
+ int n = C->shape().at(1);
+
+ if(xpu == cpu)
{
- int m = C->shape().at(0);
- int n = C->shape().at(1);
cpu_gemm(A.cpu_data(), B.cpu_data(), m, n, 1, 1, 0, false, false, C->mutable_cpu_data());
}
- else{
- // report errors here
+ if(xpu == gpu)
+ {
+ //gpu part
+ gpu_gemm(A.gpu_data(), B.gpu_data(), m, n, 1, 1, 0, false, false, C->mutable_gpu_data());
}
}
- if(xpu == gpu)
- {
- //gpu part
+ else{
+ // report errors here
}
}
// C is matrix,A and B are vector
@@ -97,20 +104,21 @@ void VVDot(XPU xpu, const Blob<float> & A, const Blob<float> & B, Blob<float> *
float VVdot(XPU xpu, const Blob<float> & A, const Blob<float> & B)
{
float res = 0;
- if(xpu == cpu)
+ if(check_shape_equal(A, B, B))
{
- if(check_shape_equal(A, B, B))
- {
- int n = get_size(A.shape());
+ int n = get_size(A.shape());
+ if(xpu == cpu)
+ {
res = cpu_dot(A.cpu_data(), B.cpu_data(), n);
}
- else{
- // report errors here
+ if(xpu == gpu)
+ {
+ //gpu part
+ res = gpu_dot(A.gpu_data(), B.gpu_data(), n);
}
}
- if(xpu == gpu)
- {
- //gpu part
+ else{
+ // report errors here
}
return res;
}
@@ -118,19 +126,20 @@ float VVdot(XPU xpu, const Blob<float> & A, const Blob<float> & B)
void AXPY(XPU xpu, const Blob<float> & A, Blob<float> * B, float alpha)
{
- if(xpu == cpu)
+ if(check_shape_equal(A, *B, *B))
{
- if(check_shape_equal(A, *B, *B))
+
+ if(xpu == cpu)
{
cpu_axpy(A.cpu_data(), get_size(A.shape()), alpha, B->mutable_cpu_data());
}
- else{
- // report errors here
+ if(xpu == gpu)
+ {
+ gpu_axpy(A.gpu_data(), get_size(A.shape()), alpha, B->mutable_gpu_data());
}
}
- if(xpu == gpu)
- {
- //gpu part
+ else{
+ // report errors here
}
}
// element-wise operation: Bi = alpha*Ai+Bi A and B should have the same size
@@ -143,47 +152,52 @@ inline void Repmat(XPU xpu, const Blob<float> & A, Blob<float> * B)
void MVAdd(XPU xpu, const Blob<float> & A, Blob<float> * B, float alpha, float beta)
{
- if(xpu == cpu)
+ if(check_shape_mv(*B, A))
{
- if(check_shape_mv(*B, A))
+ int m = get_size(A.shape());
+ int n = get_size(B->shape()) / m;
+
+ if(xpu == cpu)
{
- int m = get_size(A.shape());
- int n = get_size(B->shape()) / m;
const float * univ = cpu_uni_vec(n);
cpu_gemm(A.cpu_data(), univ, m, n, 1, alpha, beta, false, false, B->mutable_cpu_data());
delete univ;
}
- else{
- // report errors here
- }
+
+ if(xpu == gpu)
+ {
+ singa_add_vec_row(B->gpu_data(),A.gpu_data(),A.gpu_data(),m,n,n);
+ //gpu part
+ }
+ }
+ else{
+ // report errors here
}
- if(xpu == gpu)
- {
- //gpu part
- }
}
// A is a vector, B is a matrix , Bij = alpha*Ai+beta*Bij
// will use gemm. faster than general expand_f
void MVSum(XPU xpu, const Blob<float> & A, Blob<float> * B, float alpha, float beta)
{
- if(xpu == cpu)
+ if(check_shape_mv(A, *B))
{
- if(check_shape_mv(A, *B))
+ int m = get_size(B->shape());
+ int n = get_size(A.shape()) / m;
+
+ if(xpu == cpu)
{
- int m = get_size(B->shape());
- int n = get_size(A.shape()) / m;
const float * univ = cpu_uni_vec(n);
cpu_gemm(A.cpu_data(), univ, m, 1, n, alpha, beta, false, false, B->mutable_cpu_data());
delete univ;
}
- else{
- // report errors here
+ if(xpu == gpu)
+ {
+ singa_sum_col(A.gpu_data(),B->gpu_data(),m,n,n);
+ //gpu part
}
}
- if(xpu == gpu)
- {
- //gpu part
+ else{
+ // report errors here
}
}
// B is a vector, A is a matrix , Bi = \sigma_j_{alpha*Aij}+beta*Bi
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/c13e0370/src/blob/math_kernel.cu
----------------------------------------------------------------------
diff --git a/src/blob/math_kernel.cu b/src/blob/math_kernel.cu
new file mode 100644
index 0000000..6b2a709
--- /dev/null
+++ b/src/blob/math_kernel.cu
@@ -0,0 +1,78 @@
+#include "singa/blob/math_kernel.h"
+
+#define CU2DBLOCK_X 32
+#define CU2DBLOCK_Y 32
+
+#define CU1DBLOCK 1024
+#define CU1DBLOCKF 1024.0
+
+
+//Cuda Kernel Functions
+__global__
+void kernel_sum_col(float *src_mat_data, float *dst_vec_data, long rows, long cols, long stride)
+{
+ int j = blockIdx.x;
+ int THREADS = blockDim.x;
+ if(j >= cols) {
+ return;
+ }
+
+ __shared__ float aux[CU1DBLOCK];
+ int steps = (rows - 1) / THREADS + 1;
+ aux[threadIdx.x] = src_mat_data[j+threadIdx.x*stride];
+ for(int i=1; i<steps; ++i) {
+ if(threadIdx.x+i*THREADS < rows) {
+ aux[threadIdx.x] += src_mat_data[j+(threadIdx.x+i*THREADS)*stride];
+ }
+ }
+
+ int total_threads = THREADS;
+ __syncthreads();
+ while(total_threads > 1) {
+ int half_point = ((1+total_threads) >> 1);
+ if (threadIdx.x < half_point) {
+ if(threadIdx.x+half_point < total_threads) {
+ aux[threadIdx.x] += aux[threadIdx.x + half_point];
+ }
+ }
+ __syncthreads();
+ total_threads = ((total_threads+1) >> 1);
+ }
+
+ __syncthreads();
+ dst_vec_data[j] = aux[0];
+}
+
+__global__
+void kernel_add_vec_row(float *src_vec_data, float *src_mat_data, float* des_mat_data,long rows, long cols, long stride)
+{
+ long i = blockIdx.x * blockDim.x + threadIdx.x;
+ long j = blockIdx.y * blockDim.y + threadIdx.y;
+ long num_threads_x = blockDim.x * gridDim.x;
+ long num_threads_y = blockDim.y * gridDim.y;
+ long index = 0;
+ for(; i<cols && j<rows; i+=num_threads_x, j+=num_threads_y) {
+ index = j * stride + i;
+ des_mat_data[index] = src_mat_data[index] + src_vec_data[i];
+ }
+}
+
+//
+namespace singa{
+
+void singa_sum_col(float *src_mat_data, float *dst_vec_data, long rows, long cols, long stride)
+{
+ long threads_per_block = rows > CU1DBLOCK ? CU1DBLOCK : rows;
+ long num_blocks = cols;
+
+ kernel_sum_col<<<num_blocks, threads_per_block>>>(src_mat_data, dst_vec_data, rows, cols, stride);
+}
+
+void singa_add_vec_row(float *src_vec_data, float *src_mat_data, float *des_mat_data ,long rows, long cols, long stride)
+{
+ dim3 threads_per_block(CU2DBLOCK_X, CU2DBLOCK_Y);
+ dim3 num_blocks(cols/threads_per_block.x + (cols%threads_per_block.x == 0 ? 0 : 1), rows/threads_per_block.y + (rows%threads_per_block.y == 0 ? 0 : 1));
+ kernel_add_vec_row<<<num_blocks, threads_per_block>>>(src_vec_data, src_mat_data, des_mat_data,rows, cols, stride);
+}
+
+}//namespace singa
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/c13e0370/src/test/test_math.cc
----------------------------------------------------------------------
diff --git a/src/test/test_math.cc b/src/test/test_math.cc
new file mode 100644
index 0000000..3856d1d
--- /dev/null
+++ b/src/test/test_math.cc
@@ -0,0 +1,385 @@
+#include "gtest/gtest.h"
+#include "singa/blob/math_addr.h"
+#include "singa/blob/math_blob.h"
+#include "singa/blob/math_kernel.h"
+#include "singa/blob/singa_op.h"
+
+#include <cuda_runtime.h>
+#include "cublas_v2.h"
+
+using namespace singa;
+using namespace std;
+
+TEST(MathTest, TestGemmCPU) {
+ float A[3][2] = {};
+ float B[3][2] = {};
+ float C[2][2] = {};
+ for(int i = 0; i < 3; i++)
+ for(int j = 0; j < 2; j++)
+ {
+ A[i][j] = i+j;
+ B[i][j] = i+j - i*j;
+ }
+ cpu_gemm(A[0], B[0], 2, 2, 3 , 1, 0, true, false, C[0]);
+ float D[2][2] = {};
+ for(int i = 0; i < 2; i++)
+ for(int j = 0; j < 2; j++)
+ {
+ D[i][j] = 0;
+ for(int k = 0; k < 3; k++)
+ D[i][j] += A[k][i]*B[k][j];
+ }
+ for(int i = 0; i < 2; i++)
+ for(int j = 0; j < 2; j++)
+ {
+ ASSERT_EQ(C[i][j], D[i][j]);
+ }
+}
+
+TEST(MathTest, TestGemvCPU) {
+ float A[4][3] = {};
+ float B[4]= {};
+ float C[3] = {};
+ float D[3] = {};
+
+ for(int i = 0; i < 4; i++)
+ {
+ for(int j = 0; j < 3; j++)
+ {
+ A[j][i] = i-j + i*j;
+ }
+ }
+
+ for(int i = 0; i < 4; i++)B[i] = i;
+ for(int i = 0; i < 3; i++)C[i] = 10;
+ cpu_gemv(A[0], B, 4, 3, 1, 1, true, C);
+
+ for(int i = 0; i < 3; i++)
+ {
+ for(int j = 0; j < 4; j++)
+ {
+ D[i] += A[j][i]*B[j];
+ }
+ }
+ for(int i = 0; i < 3; i++)
+ {
+ ASSERT_EQ(C[i], D[i]+10);
+ }
+}
+
+
+TEST(MathTest, TestAxpyCPU) {
+ float A[4][3] = {};
+ float C[4][3] = {};
+ float B[3][4] = {};
+ float D[3][4] = {};
+
+ for(int i = 0; i < 4; i++)
+ {
+ for(int j = 0; j < 3; j++)
+ {
+ A[i][j] = i-j + i*j;
+ B[j][i] = i-j + i*j;
+ C[i][j] = A[i][j];
+ D[j][i] = B[j][i];
+ }
+ }
+
+ cpu_axpy(A[0], 12, 2, B[0]);
+ for(int i = 0; i < 12; i++)
+ {
+ D[0][i] += 2*C[0][i];
+ }
+
+ for(int i = 0; i < 3; i++)
+ {
+ for(int j = 0; j < 4; j++)
+ {
+ ASSERT_EQ(B[i][j],D[i][j]);
+ }
+ }
+}
+
+
+TEST(MathTest, TestGemmGPU) {
+ float A[3][2] = {};
+ float B[3][2] = {};
+ float C[2][2] = {};
+ for(int i = 0; i < 3; i++)
+ {
+ for(int j = 0; j < 2; j++)
+ {
+ A[i][j] = i+j;
+ B[i][j] = i+j - i*j;
+ }
+ }
+
+ float* A_gpu=NULL;
+ float* B_gpu=NULL;
+ float* C_gpu=NULL;
+
+ cudaMalloc((void**)&A_gpu, 3*2*sizeof(float));
+ cudaMalloc((void**)&B_gpu, 3*2*sizeof(float));
+ cudaMalloc((void**)&C_gpu, 2*2*sizeof(float));
+
+ cudaMemcpy(A_gpu,A,3*2*sizeof(float),cudaMemcpyHostToDevice);
+ cudaMemcpy(B_gpu,B,3*2*sizeof(float),cudaMemcpyHostToDevice);
+
+ gpu_gemm(A_gpu, B_gpu, 2, 2, 3 , 1, 0, true, false, C_gpu);
+
+ cudaMemcpy(C,C_gpu,2*2*sizeof(float),cudaMemcpyDeviceToHost);
+
+ float D[2][2] = {};
+ for(int i = 0; i < 2; i++)
+ {
+ for(int j = 0; j < 2; j++)
+ {
+ D[i][j] = 0;
+ for(int k = 0; k < 3; k++)
+ {
+ D[i][j] += A[k][i]*B[k][j];
+ }
+ }
+ }
+
+ for(int i = 0; i < 2; i++)
+ {
+ for(int j = 0; j < 2; j++)
+ {
+ ASSERT_EQ(C[i][j],D[i][j]);
+ }
+ }
+
+ cudaFree(A_gpu);
+ cudaFree(B_gpu);
+ cudaFree(C_gpu);
+}
+
+
+TEST(MathTest, TestGemvGPU) {
+ float A[4][3] = {};
+ float B[4]= {};
+ float C[3] = {};
+ float D[3] = {};
+
+ for(int i = 0; i < 4; i++)
+ {
+ for(int j = 0; j < 3; j++)
+ {
+ A[i][j] = i-j + i*j;
+ }
+ }
+
+ for(int i = 0; i < 4; i++)B[i] = i;
+ for(int i = 0; i < 3; i++)C[i] = 10;
+
+ float* A_gpu=NULL;
+ float* B_gpu=NULL;
+ float* C_gpu=NULL;
+
+ cudaMalloc((void**)&A_gpu, 4*3*sizeof(float));
+ cudaMalloc((void**)&B_gpu, 4*sizeof(float));
+ cudaMalloc((void**)&C_gpu, 3*sizeof(float));
+
+ cudaMemcpy(A_gpu,A,4*3*sizeof(float),cudaMemcpyHostToDevice);
+ cudaMemcpy(B_gpu,B,4*sizeof(float),cudaMemcpyHostToDevice);
+ cudaMemcpy(C_gpu,C,3*sizeof(float),cudaMemcpyHostToDevice);
+
+ gpu_gemv(A_gpu, B_gpu, 4, 3, 1, 1, true, C_gpu);
+
+ cudaMemcpy(C,C_gpu,3*sizeof(float),cudaMemcpyDeviceToHost);
+
+ for(int i = 0; i < 3; i++)
+ {
+ for(int j = 0; j < 4; j++)
+ {
+ D[i] += A[j][i]*B[j];
+ }
+ }
+
+ for(int i = 0; i < 3; i++)
+ {
+ ASSERT_EQ(C[i],D[i]+10);
+ }
+
+ cudaFree(A_gpu);
+ cudaFree(B_gpu);
+ cudaFree(C_gpu);
+}
+
+
+TEST(MathTest, TestAxpyGPU) {
+ float A[4][3] = {};
+ float C[4][3] = {};
+ float B[3][4] = {};
+ float D[3][4] = {};
+
+ for(int i = 0; i < 4; i++)
+ {
+ for(int j = 0; j < 3; j++)
+ {
+ A[i][j] = i-j + i*j;
+ B[j][i] = i-j + i*j;
+ C[i][j] = A[i][j];
+ D[j][i] = B[j][i];
+ }
+ }
+
+ float* A_gpu=NULL;
+ float* B_gpu=NULL;
+
+ cudaMalloc((void**)&A_gpu, 4*3*sizeof(float));
+ cudaMalloc((void**)&B_gpu, 3*4*sizeof(float));
+
+ cudaMemcpy(A_gpu,A,4*3*sizeof(float),cudaMemcpyHostToDevice);
+ cudaMemcpy(B_gpu,B,3*4*sizeof(float),cudaMemcpyHostToDevice);
+
+ gpu_axpy(A_gpu, 12, 2, B_gpu);
+
+ cudaMemcpy(A,A_gpu,4*3*sizeof(float),cudaMemcpyDeviceToHost);
+ cudaMemcpy(B,B_gpu,3*4*sizeof(float),cudaMemcpyDeviceToHost);
+
+ for(int i = 0; i < 12; i++)D[0][i] += 2*C[0][i];
+
+ for(int i = 0; i < 3; i++)
+ {
+ for(int j = 0; j < 4; j++)
+ {
+ ASSERT_EQ(B[i][j],D[i][j]);
+ }
+ }
+
+ cudaFree(A_gpu);
+ cudaFree(B_gpu);
+}
+
+
+TEST(MathTest, TestDotGPU) {
+ float A[12];
+ float B[12];
+
+ for(int i = 0; i < 12; i++)
+ {
+ A[i]=i-1;
+ B[i]=i+1;
+ }
+
+ float* A_gpu=NULL;
+ float* B_gpu=NULL;
+
+ cudaMalloc((void**)&A_gpu, 12*sizeof(float));
+ cudaMalloc((void**)&B_gpu, 12*sizeof(float));
+
+ cudaMemcpy(A_gpu,A,12*sizeof(float),cudaMemcpyHostToDevice);
+ cudaMemcpy(B_gpu,B,12*sizeof(float),cudaMemcpyHostToDevice);
+ float gpu_ret=gpu_dot(A_gpu,B_gpu,12);
+
+ float cpu_ret=0.0f;
+ for(int i = 0; i < 12; i++)
+ {
+ cpu_ret+=A[i]*B[i];
+ }
+
+ ASSERT_EQ(gpu_ret,cpu_ret);
+
+ cudaFree(A_gpu);
+ cudaFree(B_gpu);
+
+}
+
+TEST(MathTest, TestSingaSumColGPU) {
+
+ float A[3][4];
+ float B[4];
+ float C[4];
+
+ for(int i = 0; i < 3; i++)
+ {
+ for(int j = 0; j < 4; j++)
+ {
+ A[i][j]=i+j;
+ }
+ }
+
+ for(int i = 0; i < 4; i++)
+ {
+ B[i]=0.0f;
+ C[i]=0.0f;
+ }
+
+ float* A_gpu=NULL;
+ float* B_gpu=NULL;
+
+ cudaMalloc((void**)&A_gpu, 12*sizeof(float));
+ cudaMalloc((void**)&B_gpu, 4*sizeof(float));
+ cudaMemcpy(A_gpu,A,12*sizeof(float),cudaMemcpyHostToDevice);
+
+ singa_sum_col(A_gpu,B_gpu,3,4,4);
+
+ cudaMemcpy(B,B_gpu,4*sizeof(float),cudaMemcpyDeviceToHost);
+
+ for(int i = 0; i < 4; i++)
+ {
+ for(int j = 0; j < 3; j++)
+ {
+ C[i]+=A[j][i];
+ }
+ }
+
+ for(int i = 0; i <4; i++)
+ {
+ ASSERT_EQ(B[i],C[i]);
+ }
+
+ cudaFree(A_gpu);
+ cudaFree(B_gpu);
+}
+
+TEST(MathTest, TestSingaAddVecRowGPU) {
+
+ float A[3][4];
+ float B[4];
+ float C[3][4];
+ float D[3][4];
+
+ for(int i = 0; i < 4; i++)
+ {
+ B[i]=i;
+ }
+
+ for(int i = 0; i < 3; i++)
+ {
+ for(int j = 0; j < 4; j++)
+ {
+ A[i][j]=i+j;
+ D[i][j]=A[i][j]+B[j];
+ }
+ }
+
+
+ float* A_gpu=NULL;
+ float* B_gpu=NULL;
+ float* C_gpu=NULL;
+
+ cudaMalloc((void**)&A_gpu, 3*4*sizeof(float));
+ cudaMalloc((void**)&B_gpu, 4*sizeof(float));
+ cudaMalloc((void**)&C_gpu, 3*4*sizeof(float));
+ cudaMemcpy(A_gpu,A,3*4*sizeof(float),cudaMemcpyHostToDevice);
+ cudaMemcpy(B_gpu,B,4*sizeof(float),cudaMemcpyHostToDevice);
+
+ singa_add_vec_row(B_gpu,A_gpu,C_gpu,3,4,4);
+
+ cudaMemcpy(C,C_gpu,3*4*sizeof(float),cudaMemcpyDeviceToHost);
+
+ for(int i = 0; i < 3; i++)
+ {
+ for(int j = 0; j < 4; j++)
+ {
+ ASSERT_EQ(C[i][j],D[i][j]);
+ }
+ }
+
+ cudaFree(A_gpu);
+ cudaFree(B_gpu);
+ cudaFree(C_gpu);
+}