Posted to commits@singa.apache.org by ka...@apache.org on 2015/11/16 07:08:49 UTC

[03/19] incubator-singa git commit: SINGA-80 New Blob Level and Address Level Math Operation Interface

SINGA-80 New Blob Level and Address Level Math Operation Interface


Project: http://git-wip-us.apache.org/repos/asf/incubator-singa/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-singa/commit/c13e0370
Tree: http://git-wip-us.apache.org/repos/asf/incubator-singa/tree/c13e0370
Diff: http://git-wip-us.apache.org/repos/asf/incubator-singa/diff/c13e0370

Branch: refs/heads/master
Commit: c13e0370906371ca001e755268c51e656dfde0fa
Parents: bbb7dbc
Author: seaok <se...@gmail.com>
Authored: Fri Oct 30 10:02:22 2015 +0800
Committer: Wei Wang <wa...@comp.nus.edu.sg>
Committed: Mon Nov 9 17:04:48 2015 +0800

----------------------------------------------------------------------
 Makefile.gpu                     | 153 ++++++++++++++
 include/singa/blob/math_kernel.h |  14 ++
 src/blob/math_addr.cc            |  64 ++++++
 src/blob/math_blob.cc            | 148 +++++++------
 src/blob/math_kernel.cu          |  78 +++++++
 src/test/test_math.cc            | 385 ++++++++++++++++++++++++++++++++++
 6 files changed, 775 insertions(+), 67 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/c13e0370/Makefile.gpu
----------------------------------------------------------------------
diff --git a/Makefile.gpu b/Makefile.gpu
new file mode 100644
index 0000000..9c9051c
--- /dev/null
+++ b/Makefile.gpu
@@ -0,0 +1,153 @@
+#/**
+# * Copyright 2015 The Apache Software Foundation
+# *
+# * Licensed to the Apache Software Foundation (ASF) under one
+# * or more contributor license agreements.  See the NOTICE file
+# * distributed with this work for additional information
+# * regarding copyright ownership.  The ASF licenses this file
+# * to you under the Apache License, Version 2.0 (the
+# * "License"); you may not use this file except in compliance
+# * with the License.  You may obtain a copy of the License at
+# *
+# *     http://www.apache.org/licenses/LICENSE-2.0
+# *
+# * Unless required by applicable law or agreed to in writing, software
+# * distributed under the License is distributed on an "AS IS" BASIS,
+# * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# * See the License for the specific language governing permissions and
+# * limitations under the License.
+# */
+
+################### User Config Variables #############################
+# third-party library installation folder
+HOME_DIR := /usr
+
+CUDA_DIR := /usr/local/cuda
+
+#CUDA_DIR :=
+
+# Lib folder for system and external libs. You may need to change it.
+LIBRARY_DIRS := $(HOME_DIR)/lib64 $(HOME_DIR)/lib $(HOME_DIR)/local/lib $(CUDA_DIR)/lib $(CUDA_DIR)/lib64
+# Header folder for system and external libs. You may need to change it.
+INCLUDE_DIRS := $(HOME_DIR)/include ./include $(HOME_DIR)/local/include/zookeeper $(CUDA_DIR)/include
+# g++ location, should support c++11, tested with 4.8.1
+CXX := g++
+CUCXX := nvcc
+
+###################### Setting Variables #######################################
+LIBRARIES := glog protobuf openblas zmq czmq zookeeper_mt
+
+ifneq ($(CUDA_DIR),)
+	LIBRARIES := $(LIBRARIES) cublas cudart curand
+endif
+
+LDFLAGS := $(foreach librarydir, $(LIBRARY_DIRS), -L$(librarydir))\
+	$(foreach library, $(LIBRARIES), -l$(library))
+# Folder to store compiled files
+BUILD_DIR := .libs
+MSHADOW_FLAGS :=-DMSHADOW_USE_CUDA=0 -DMSHADOW_USE_CBLAS=1 -DMSHADOW_USE_MKL=0
+ZK_FLAGS :=-DTHREADED -fpermissive
+CXXFLAGS := -O2 -msse3 -Wall -pthread -fPIC -std=c++11 -Wno-unknown-pragmas \
+	$(MSHADOW_FLAGS) -DCPU_ONLY=1 $(ZK_FLAGS)\
+	-funroll-loops $(foreach includedir, $(INCLUDE_DIRS), -I$(includedir))
+CUCXXFLAGS := $(MSHADOW_FLAGS) -std=c++11 -G $(CUDA_ARCH) \
+	$(foreach includedir, $(INCLUDE_DIRS), -I$(includedir))
+
+#Add device compile option
+ifeq ($(CUDA_DIR),)
+	MSHADOW_FLAGS := $(MSHADOW_FLAGS) -DCPU_ONLY
+	CXXFLAGS := $(CXXFLAGS) -DCPU_ONLY
+endif
+
+# find user-defined .proto files, then derive the corresponding .pb.h and
+# .pb.cc file names; shell find cannot locate these because they have not
+# been generated yet
+PROTOS := $(shell find src/proto/ -name "*.proto")
+PROTO_SRCS :=$(PROTOS:.proto=.pb.cc)
+PROTO_HDRS :=$(patsubst src%, include%, $(PROTOS:.proto=.pb.h))
+PROTO_OBJS :=$(addprefix $(BUILD_DIR)/, $(PROTO_SRCS:.cc=.o))
+
+# each singa src file will generate a .o file
+SINGA_SRCS := $(shell find src/ \( -path "src/test" -o -path "src/main.cc" -o -path "src/utils/tool.cc" \) \
+	-prune -o \( -name "*.cc" -type f \) -print )
+SINGA_OBJS := $(sort $(addprefix $(BUILD_DIR)/, $(SINGA_SRCS:.cc=.o)) \
+	$(PROTO_OBJS) )
+-include $(SINGA_OBJS:%.o=%.P)
+
+TEST_SRCS :=$(shell find src/test/ -maxdepth 1 -name "*.cc")
+TEST_OBJS := $(sort $(addprefix $(BUILD_DIR)/, $(TEST_SRCS:.cc=.o)))
+-include $(TEST_OBJS:%.o=%.P)
+
+TEST_CUDA_SRCS :=$(shell find src/test/ -maxdepth 1 -name "*.cu")
+TEST_CUDA_OBJS := $(sort $(addprefix $(BUILD_DIR)/, $(TEST_CUDA_SRCS:.cu=.o)))
+-include $(TEST_CUDA_OBJS:%.o=%.P)
+
+SINGA_CUDA_SRCS :=$(shell find src/ -maxdepth 2 -name "*.cu")
+SINGA_CUDA_OBJS := $(sort $(addprefix $(BUILD_DIR)/, $(SINGA_CUDA_SRCS:.cu=.o)))
+-include $(SINGA_CUDA_OBJS:%.o=%.P)
+
+GTEST_SRC := include/gtest/gtest-all.cc
+GTEST_HDR := include/gtest/gtest.h
+GTEST_LIB := $(BUILD_DIR)/libgtest.a
+
+OBJS := $(sort $(SINGA_OBJS) $(TEST_OBJS) )
+CUOBJS := $(sort $(SINGA_CUDA_OBJS) $(TEST_CUDA_OBJS) )
+
+########################Compilation Section###################################
+.PHONY: singa test
+
+singa: $(PROTO_OBJS) $(SINGA_OBJS) $(SINGA_CUDA_OBJS)
+	$(CXX) -shared -o $(BUILD_DIR)/libsinga.so $(SINGA_OBJS)
+	$(CXX) $(SINGA_OBJS) $(SINGA_CUDA_OBJS) src/main.cc -o singa $(CXXFLAGS) $(LDFLAGS)
+	@echo
+	$(CXX) $(SINGA_OBJS) $(SINGA_CUDA_OBJS) src/utils/tool.cc -o singatool $(CXXFLAGS) $(LDFLAGS)
+	@echo
+
+loader: proto $(LOADER_OBJS)
+	$(CXX) $(LOADER_OBJS) -o $(BUILD_DIR)/loader $(CXXFLAGS) $(LDFLAGS)
+	@echo
+
+test:  proto $(GTEST_LIB) $(TEST_OBJS) $(TEST_CUDA_OBJS) $(SINGA_OBJS) $(SINGA_CUDA_OBJS)
+	$(CXX) $(TEST_OBJS) $(TEST_CUDA_OBJS) include/gtest/gtest_main.cc $(GTEST_LIB) \
+		$(SINGA_OBJS) $(SINGA_CUDA_OBJS) -o $(BUILD_DIR)/test $(CXXFLAGS) $(LDFLAGS)
+	@echo
+
+$(GTEST_LIB): $(GTEST_HDR) $(GTEST_SRC)
+	$(CXX) $(GTEST_SRC) -c -o $(BUILD_DIR)/gtest-all.o $(CXXFLAGS)
+	ar -rv $(GTEST_LIB) $(BUILD_DIR)/gtest-all.o
+
+# compile all files
+$(OBJS):$(BUILD_DIR)/%.o : %.cc
+	@mkdir -p $(dir $@)
+	$(CXX) $<  $(CXXFLAGS) -MMD -c -o $@
+	cp $(BUILD_DIR)/$*.d $(BUILD_DIR)/$*.P; \
+	sed -e 's/#.*//' -e 's/^[^:]*: *//' -e 's/ *\\$$//' \
+		-e '/^$$/ d' -e 's/$$/ :/' < $(BUILD_DIR)/$*.d >> $(BUILD_DIR)/$*.P; \
+	rm -f $*.d
+
+$(CUOBJS):$(BUILD_DIR)/%.o : %.cu
+	@mkdir -p $(dir $@)
+	$(CUCXX) $< -c -o $@ $(CUCXXFLAGS)
+	cp $(BUILD_DIR)/$*.d $(BUILD_DIR)/$*.P; \
+	sed -e 's/#.*//' -e 's/^[^:]*: *//' -e 's/ *\\$$//' \
+		-e '/^$$/ d' -e 's/$$/ :/' < $(BUILD_DIR)/$*.d >> $(BUILD_DIR)/$*.P; \
+	rm -f $*.d
+
+proto: $(PROTO_OBJS)
+
+$(PROTO_SRCS): $(PROTOS)
+	protoc --proto_path=src/proto --cpp_out=src/proto $(PROTOS)
+	mkdir -p include/proto/
+	cp src/proto/*.pb.h include/proto/
+	mkdir -p tool/pb2/
+	touch tool/pb2/__init__.py
+	protoc --proto_path=src/proto --python_out=tool/pb2/ $(PROTOS)
+	@echo
+
+clean:
+	rm -rf *.a *.so
+	rm -rf include/proto/*
+	rm -rf src/proto/*.pb.h src/proto/*.pb.cc
+	rm -rf tool/pb2/*
+	rm -rf $(BUILD_DIR)
+	@echo

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/c13e0370/include/singa/blob/math_kernel.h
----------------------------------------------------------------------
diff --git a/include/singa/blob/math_kernel.h b/include/singa/blob/math_kernel.h
new file mode 100644
index 0000000..9aaf4c2
--- /dev/null
+++ b/include/singa/blob/math_kernel.h
@@ -0,0 +1,14 @@
+#ifndef MATH_KERNEL_H
+#define MATH_KERNEL_H
+
+namespace singa{
+
+extern "C" {
+	void singa_sum_col(float *src_mat_data, float *dst_vec_data, long rows, long cols, long stride);
+
+	void singa_add_vec_row(float *src_vec_data, float *src_mat_data, float *des_mat_data, long rows, long cols, long stride);
+};
+
+}
+
+#endif

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/c13e0370/src/blob/math_addr.cc
----------------------------------------------------------------------
diff --git a/src/blob/math_addr.cc b/src/blob/math_addr.cc
index f28fdcb..799a749 100644
--- a/src/blob/math_addr.cc
+++ b/src/blob/math_addr.cc
@@ -3,6 +3,9 @@ extern "C"
    #include <cblas.h>
 }
 
+#include <cuda_runtime.h>
+#include "cublas_v2.h"
+
 #include "singa/blob/math_addr.h"
 #include "singa/blob/singa_op.h"
 
@@ -47,5 +50,66 @@ float cpu_dot(const float * A, const float * B, const int n)
 	return sum;
 }
 
+// Trick: cuBLAS assumes column-major storage while these buffers are row-major,
+// so compute C^T = op(B)^T * op(A)^T by swapping A and B (and n, m); the column-major C^T it writes is exactly the row-major C.
+void gpu_gemm(const float * A, const float * B, const int m, const int n, const int k, const float alpha, const float beta, const bool TranA, const bool TranB, float * C)
+{
+    int lda = TranA ? m : k;
+    int ldb = TranB ? k : n;
+    int ldc = n;
+
+    cublasOperation_t tA= (TranA==false) ? CUBLAS_OP_N : CUBLAS_OP_T;
+	cublasOperation_t tB= (TranB==false) ? CUBLAS_OP_N : CUBLAS_OP_T;
+
+    cublasHandle_t handle;
+    cublasCreate(&handle);
+
+    cublasSgemm(handle, tB, tA, n, m, k, &alpha, B, ldb, A, lda, &beta, C, ldc);
+
+    cublasDestroy(handle);
+}
+
+void gpu_gemv(const float * A, const float * B, const int m, const int n, const float alpha, const float beta, const bool TranA, float * C)
+{
+	int lda = n ;
+	cublasOperation_t tA= (TranA==true) ? CUBLAS_OP_N : CUBLAS_OP_T;
+
+	cublasHandle_t handle;
+	cublasCreate(&handle);
+
+	cublasSgemv(handle, tA , n , m ,&alpha , A , lda , B , 1 ,&beta , C , 1);
+
+	cublasDestroy(handle);
+
+}
+
+
+void gpu_axpy(const float * A, const int n, const float alpha, float * B)
+{
+
+	cublasHandle_t handle;
+	cublasCreate(&handle);
+
+	cublasSaxpy(handle,n,&alpha,A,1,B,1);
+
+	cublasDestroy(handle);
+
+}
+
+
+float gpu_dot(const float * A, const float * B, const int n)
+{
+	cublasHandle_t handle;
+	cublasCreate(&handle);
+
+	float result=0.0;
+
+	cublasSdot(handle,n,A,1,B,1,&result);
+
+	cublasDestroy(handle);
+
+	return result;
+
+}
 
 } // namespace singa

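For readers unfamiliar with the trick noted in gpu_gemm and gpu_gemv above: cuBLAS assumes column-major storage, while SINGA's buffers are row-major. Reinterpreting a row-major matrix as column-major yields its transpose, so the wrappers compute the transposed product instead:

    C = alpha * op(A) * op(B) + beta * C
        is equivalent to
    C^T = alpha * op(B)^T * op(A)^T + beta * C^T

Passing B before A and swapping the m and n arguments therefore makes cublasSgemm write a column-major C^T whose memory layout is exactly the row-major C. gpu_gemv relies on the same identity: a row-major m x n matrix is a column-major n x m matrix, hence the (n, m) dimensions, lda = n, and the inverted transpose flag.
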
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/c13e0370/src/blob/math_blob.cc
----------------------------------------------------------------------
diff --git a/src/blob/math_blob.cc b/src/blob/math_blob.cc
index ff81667..9421367 100644
--- a/src/blob/math_blob.cc
+++ b/src/blob/math_blob.cc
@@ -1,4 +1,5 @@
 #include "singa/blob/math_blob.h"
+#include "singa/blob/math_kernel.h"
 
 namespace singa {
 
@@ -18,24 +19,26 @@ int get_size(const std::vector<int>& shape)
 
 void GEMM(XPU xpu, const Blob<float> & A, const Blob<float> & B, Blob<float> * C, float alpha, float beta)
 {
-	if(xpu == cpu)
+	if(check_shape_mmm(A, B, *C))
 	{
-	  if(check_shape_mmm(A, B, *C))
-	  {
-	    int m = C->shape().at(0);
+		int m = C->shape().at(0);
 	    int n = C->shape().at(1);
 	    int k = A.isTranspose() ? A.shape().at(0) : A.shape().at(1);
 	    bool TranA = A.isTranspose();
 	    bool TranB = B.isTranspose();
-	    cpu_gemm(A.cpu_data(), B.cpu_data(), m, n, k, alpha, beta, TranA, TranB, C->mutable_cpu_data());
-	  }
-	  else{
-	  // report errors here
-	  }
+
+		if(xpu == cpu)
+		{
+			cpu_gemm(A.cpu_data(), B.cpu_data(), m, n, k, alpha, beta, TranA, TranB, C->mutable_cpu_data());
+		}
+		if(xpu == gpu)
+		{
+			//gpu part
+			gpu_gemm(A.gpu_data(), B.gpu_data(), m, n, k, alpha, beta, TranA, TranB, C->mutable_gpu_data());
+		}
 	}
-	if(xpu == gpu)
-	{
-	  //gpu part
+	else{
+	  // report errors here
 	}
 }
 //C = alpha*A*B + beta*C; A, B and C are matrices
@@ -50,22 +53,24 @@ void MMDot(XPU xpu, const Blob<float> & A, const Blob<float> & B, Blob<float> *
 
 void MVDot(XPU xpu, const Blob<float> & A, const Blob<float> & B, Blob<float> * C)
 {
-	if(xpu == cpu)
+	if(check_shape_mvv(A, B, *C))
 	{
-		if(check_shape_mvv(A, B, *C))
+		int m = B.shape().at(0);
+		int n = C->shape().at(0);
+		bool TranA = A.isTranspose();
+
+		if(xpu == cpu)
 		{
-			int m = B.shape().at(0);
-			int n = C->shape().at(0);
-			bool TranA = A.isTranspose();
 			cpu_gemv(A.cpu_data(), B.cpu_data(), m, n, 1, 0, TranA, C->mutable_cpu_data());
 		}
-		else{
-			// report errors here
+		if(xpu == gpu)
+		{
+			//gpu part
+			gpu_gemv(A.gpu_data(), B.gpu_data(), m, n, 1, 0, TranA, C->mutable_gpu_data());
 		}
 	}
-	if(xpu == gpu)
-	{
-	  //gpu part
+	else{
+		// report errors here
 	}
 	
 }
@@ -74,21 +79,23 @@ void MVDot(XPU xpu, const Blob<float> & A, const Blob<float> & B, Blob<float> *
  
 void VVDot(XPU xpu, const Blob<float> & A, const Blob<float> & B, Blob<float> * C)
 {
-	if(xpu == cpu)
+	if(check_shape_vvm(A, B, *C))
 	{
-		if(check_shape_vvm(A, B, *C))
+		int m = C->shape().at(0);
+		int n = C->shape().at(1);
+
+		if(xpu == cpu)
 		{
-			int m = C->shape().at(0);
-			int n = C->shape().at(1);
 			cpu_gemm(A.cpu_data(), B.cpu_data(), m, n, 1, 1, 0, false, false, C->mutable_cpu_data());
 		}
-		else{
-		// report errors here
+		if(xpu == gpu)
+		{
+			//gpu part
+			gpu_gemm(A.gpu_data(), B.gpu_data(), m, n, 1, 1, 0, false, false, C->mutable_gpu_data());
 		}
 	}
-	if(xpu == gpu)
-	{
-	  //gpu part
+	else{
+	// report errors here
 	}
 }
 // C is a matrix, A and B are vectors
@@ -97,20 +104,21 @@ void VVDot(XPU xpu, const Blob<float> & A, const Blob<float> & B, Blob<float> *
 float VVdot(XPU xpu, const Blob<float> & A, const Blob<float> & B)
 {
 	float res = 0;
-	if(xpu == cpu)
+	if(check_shape_equal(A, B, B))
 	{
-		if(check_shape_equal(A, B, B))
-		{
-			int n = get_size(A.shape());
+		int n = get_size(A.shape());
+		if(xpu == cpu)
+		{	
 			res = cpu_dot(A.cpu_data(), B.cpu_data(), n);
 		}
-		else{
-		// report errors here
+		if(xpu == gpu)
+		{
+		//gpu part
+			res = gpu_dot(A.gpu_data(), B.gpu_data(), n);
 		}
 	}
-	if(xpu == gpu)
-	{
-	  //gpu part
+	else{
+	// report errors here
 	}
 	return res;
 }
@@ -118,19 +126,20 @@ float VVdot(XPU xpu, const Blob<float> & A, const Blob<float> & B)
 
 void AXPY(XPU xpu, const Blob<float> & A, Blob<float> * B, float alpha)
 {
-	if(xpu == cpu)
+	if(check_shape_equal(A, *B, *B))
 	{
-		if(check_shape_equal(A, *B, *B))
+
+		if(xpu == cpu)
 		{
 			cpu_axpy(A.cpu_data(), get_size(A.shape()), alpha, B->mutable_cpu_data());
 		}
-		else{
-		// report errors here
+		if(xpu == gpu)
+		{
+			gpu_axpy(A.gpu_data(), get_size(A.shape()), alpha, B->mutable_gpu_data());
 		}
 	}
-	if(xpu == gpu)
-	{
-	  //gpu part
+	else{
+	// report errors here
 	}
 }
 // element-wise operation: Bi = alpha*Ai+Bi  A and B should have the same size
@@ -143,47 +152,52 @@ inline void Repmat(XPU xpu, const Blob<float> & A, Blob<float> * B)
 
 void MVAdd(XPU xpu, const Blob<float> & A, Blob<float> * B, float alpha, float beta)
 {
-	if(xpu == cpu)
+	if(check_shape_mv(*B, A))
 	{
-		if(check_shape_mv(*B, A))
+		int m = get_size(A.shape());
+		int n = get_size(B->shape()) / m;
+
+		if(xpu == cpu)
 		{
-			int m = get_size(A.shape());
-			int n = get_size(B->shape()) / m;
 			const float * univ = cpu_uni_vec(n);
 			cpu_gemm(A.cpu_data(), univ, m, n, 1, alpha, beta, false, false, B->mutable_cpu_data());
 			delete univ;
 		}
-		else{
-		// report errors here
-		}
+
+		if(xpu == gpu)
+		{
+			singa_add_vec_row(B->gpu_data(),A.gpu_data(),A.gpu_data(),m,n,n);
+		//gpu part
+		}	
+	}
+	else{
+	// report errors here
 	}
-	if(xpu == gpu)
-	{
-	  //gpu part
-	}	
 }
 // A is a vector, B is a matrix, Bij = alpha*Ai + beta*Bij
 // implemented with gemm, which is faster than a general expand_f
 
 void MVSum(XPU xpu, const Blob<float> & A, Blob<float> * B, float alpha, float beta)
 {
-	if(xpu == cpu)
+	if(check_shape_mv(A, *B))
 	{
-		if(check_shape_mv(A, *B))
+		int m = get_size(B->shape());
+		int n = get_size(A.shape()) / m;
+
+		if(xpu == cpu)
 		{
-			int m = get_size(B->shape());
-			int n = get_size(A.shape()) / m;
 			const float * univ = cpu_uni_vec(n);
 			cpu_gemm(A.cpu_data(), univ, m, 1, n, alpha, beta, false, false, B->mutable_cpu_data());
 			delete univ;
 		}
-		else{
-		// report errors here
+		if(xpu == gpu)
+		{
+			singa_sum_col(A.gpu_data(),B->gpu_data(),m,n,n);
+		//gpu part
 		}
 	}
-	if(xpu == gpu)
-	{
-	  //gpu part
+	else{
+	// report errors here
 	}
 }
 // B is a vector, A is a matrix, Bi = \sum_j(alpha*Aij) + beta*Bi

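As a quick illustration of how the blob-level wrappers above are meant to be called (a minimal sketch, assuming the Blob<float> objects are already shaped and filled elsewhere; Blob allocation is outside this patch):

    #include "singa/blob/math_blob.h"

    // Run C = A * B on whichever device the caller selects.
    // GEMM() checks the shapes itself and dispatches to cpu_gemm or gpu_gemm.
    void MatMulOnDevice(singa::XPU xpu,
                        const singa::Blob<float>& A,
                        const singa::Blob<float>& B,
                        singa::Blob<float>* C) {
      singa::GEMM(xpu, A, B, C, 1.0f, 0.0f);  // alpha = 1, beta = 0
    }

MatMulOnDevice is just a hypothetical caller for illustration; only GEMM and the XPU flag come from the patch.
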
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/c13e0370/src/blob/math_kernel.cu
----------------------------------------------------------------------
diff --git a/src/blob/math_kernel.cu b/src/blob/math_kernel.cu
new file mode 100644
index 0000000..6b2a709
--- /dev/null
+++ b/src/blob/math_kernel.cu
@@ -0,0 +1,78 @@
+#include "singa/blob/math_kernel.h"
+
+#define CU2DBLOCK_X 32
+#define CU2DBLOCK_Y 32
+
+#define CU1DBLOCK 1024 
+#define CU1DBLOCKF 1024.0 
+
+
+//Cuda Kernel Functions
+__global__
+void kernel_sum_col(float *src_mat_data, float *dst_vec_data, long rows, long cols, long stride)
+{
+	    int j = blockIdx.x;
+		int THREADS = blockDim.x;
+		if(j >= cols) {
+			return;
+		}
+
+		__shared__ float aux[CU1DBLOCK];
+		int steps = (rows - 1) / THREADS + 1;
+		aux[threadIdx.x] = src_mat_data[j+threadIdx.x*stride];
+		for(int i=1; i<steps; ++i) {
+			if(threadIdx.x+i*THREADS < rows) {
+				aux[threadIdx.x] += src_mat_data[j+(threadIdx.x+i*THREADS)*stride];
+			}
+		}
+
+		int total_threads = THREADS;
+		__syncthreads();
+		while(total_threads > 1) {
+			int half_point = ((1+total_threads) >> 1);
+			if (threadIdx.x < half_point) {
+				if(threadIdx.x+half_point < total_threads) {
+					aux[threadIdx.x] += aux[threadIdx.x + half_point];															            
+				}
+			}
+			__syncthreads();
+			total_threads = ((total_threads+1) >> 1);
+		}
+
+		__syncthreads();
+		dst_vec_data[j] = aux[0];
+}
+
+__global__ 
+void kernel_add_vec_row(float *src_vec_data, float *src_mat_data, float* des_mat_data,long rows, long cols, long stride)
+{
+	long i = blockIdx.x * blockDim.x + threadIdx.x;
+	long j = blockIdx.y * blockDim.y + threadIdx.y;
+	long num_threads_x = blockDim.x * gridDim.x;
+	long num_threads_y = blockDim.y * gridDim.y;
+	long index = 0;
+	for(; i<cols && j<rows; i+=num_threads_x, j+=num_threads_y) {
+		index = j * stride + i;
+		des_mat_data[index] = src_mat_data[index] + src_vec_data[i];
+	}
+}
+
+//
+namespace singa{
+
+void singa_sum_col(float *src_mat_data, float *dst_vec_data, long rows, long cols, long stride)
+{
+	long threads_per_block = rows > CU1DBLOCK ? CU1DBLOCK : rows;
+	long num_blocks = cols;
+
+	kernel_sum_col<<<num_blocks, threads_per_block>>>(src_mat_data, dst_vec_data, rows, cols, stride);
+}
+
+void singa_add_vec_row(float *src_vec_data, float *src_mat_data, float *des_mat_data ,long rows, long cols, long stride)
+{
+	dim3 threads_per_block(CU2DBLOCK_X, CU2DBLOCK_Y);
+	dim3 num_blocks(cols/threads_per_block.x + (cols%threads_per_block.x == 0 ? 0 : 1), rows/threads_per_block.y + (rows%threads_per_block.y == 0 ? 0 : 1));
+	kernel_add_vec_row<<<num_blocks, threads_per_block>>>(src_vec_data, src_mat_data, des_mat_data,rows, cols, stride);
+}
+
+}//namespace singa

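To make the kernel semantics easy to check against the tests below, here is a plain C++ reference of what singa_sum_col and singa_add_vec_row compute on a row-major rows x cols matrix (an illustrative sketch, not part of the patch; ref_sum_col and ref_add_vec_row are hypothetical names):

    // dst_vec[j] = sum_i src_mat[i*stride + j]  -- per-column sums, as in TestSingaSumColGPU
    void ref_sum_col(const float *src_mat, float *dst_vec,
                     long rows, long cols, long stride) {
        for (long j = 0; j < cols; ++j) {
            float s = 0.0f;
            for (long i = 0; i < rows; ++i)
                s += src_mat[i * stride + j];
            dst_vec[j] = s;
        }
    }

    // des_mat[i*stride + j] = src_mat[i*stride + j] + src_vec[j]  -- broadcast a row
    // vector over every row, as in TestSingaAddVecRowGPU
    void ref_add_vec_row(const float *src_vec, const float *src_mat, float *des_mat,
                         long rows, long cols, long stride) {
        for (long i = 0; i < rows; ++i)
            for (long j = 0; j < cols; ++j)
                des_mat[i * stride + j] = src_mat[i * stride + j] + src_vec[j];
    }
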
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/c13e0370/src/test/test_math.cc
----------------------------------------------------------------------
diff --git a/src/test/test_math.cc b/src/test/test_math.cc
new file mode 100644
index 0000000..3856d1d
--- /dev/null
+++ b/src/test/test_math.cc
@@ -0,0 +1,385 @@
+#include "gtest/gtest.h"
+#include "singa/blob/math_addr.h"
+#include "singa/blob/math_blob.h"
+#include "singa/blob/math_kernel.h"
+#include "singa/blob/singa_op.h"
+
+#include <cuda_runtime.h>
+#include "cublas_v2.h"
+
+using namespace singa;
+using namespace std;
+
+TEST(MathTest, TestGemmCPU) {
+	float A[3][2] = {};
+	float B[3][2] = {};
+	float C[2][2] = {};
+	for(int i = 0; i < 3; i++)
+		for(int j = 0; j < 2; j++)
+		{
+			A[i][j] = i+j;
+			B[i][j] = i+j - i*j;
+		}
+	cpu_gemm(A[0], B[0], 2, 2, 3 , 1, 0, true, false, C[0]);
+	float D[2][2] = {};
+	for(int i = 0; i < 2; i++)
+		for(int j = 0; j < 2; j++)
+		{
+			D[i][j] = 0;
+			for(int k = 0; k < 3; k++)
+				D[i][j] += A[k][i]*B[k][j];
+		}
+    for(int i = 0; i < 2; i++)
+        for(int j = 0; j < 2; j++)
+        {
+			ASSERT_EQ(C[i][j], D[i][j]);
+		}
+}
+
+TEST(MathTest, TestGemvCPU) {
+	float A[4][3] = {}; 
+	float B[4]= {}; 
+	float C[3] = {}; 
+	float D[3] = {}; 
+
+	for(int i = 0; i < 4; i++)
+	{
+		for(int j = 0; j < 3; j++)
+		{
+			A[j][i] = i-j + i*j;
+		}
+	}
+
+	for(int i = 0; i < 4; i++)B[i] = i;
+	for(int i = 0; i < 3; i++)C[i] = 10; 
+	cpu_gemv(A[0], B, 4, 3, 1, 1, true, C); 
+
+	for(int i = 0; i < 3; i++)
+	{
+		for(int j = 0; j < 4; j++)
+		{
+			D[i] += A[j][i]*B[j];
+		}
+	}
+	for(int i = 0; i < 3; i++)
+	{
+		ASSERT_EQ(C[i], D[i]+10);
+	}
+}
+
+
+TEST(MathTest, TestAxpyCPU) {
+	float A[4][3] = {}; 
+	float C[4][3] = {}; 
+	float B[3][4] = {}; 
+	float D[3][4] = {};
+
+	for(int i = 0; i < 4; i++)
+	{
+		for(int j = 0; j < 3; j++)
+		{
+			A[i][j] = i-j + i*j;
+			B[j][i] = i-j + i*j;
+			C[i][j] = A[i][j];
+			D[j][i] = B[j][i];
+		}
+	}
+
+	cpu_axpy(A[0], 12, 2, B[0]);
+	for(int i = 0; i < 12; i++)
+	{
+		D[0][i] += 2*C[0][i];
+	}
+
+	for(int i = 0; i < 3; i++)
+	{
+		for(int j = 0; j < 4; j++)
+		{
+			ASSERT_EQ(B[i][j],D[i][j]);
+		}
+	}
+}
+
+
+TEST(MathTest, TestGemmGPU) {
+	float A[3][2] = {};
+	float B[3][2] = {};
+	float C[2][2] = {};
+	for(int i = 0; i < 3; i++)
+	{
+		for(int j = 0; j < 2; j++)
+		{
+			A[i][j] = i+j;
+			B[i][j] = i+j - i*j;
+		}
+	}
+
+	float* A_gpu=NULL;
+	float* B_gpu=NULL;
+	float* C_gpu=NULL;
+
+	cudaMalloc((void**)&A_gpu, 3*2*sizeof(float));
+	cudaMalloc((void**)&B_gpu, 3*2*sizeof(float));
+	cudaMalloc((void**)&C_gpu, 2*2*sizeof(float));
+
+	cudaMemcpy(A_gpu,A,3*2*sizeof(float),cudaMemcpyHostToDevice);
+	cudaMemcpy(B_gpu,B,3*2*sizeof(float),cudaMemcpyHostToDevice);
+
+	gpu_gemm(A_gpu, B_gpu, 2, 2, 3 , 1, 0, true, false, C_gpu);
+
+	cudaMemcpy(C,C_gpu,2*2*sizeof(float),cudaMemcpyDeviceToHost);
+
+	float D[2][2] = {};
+	for(int i = 0; i < 2; i++)
+	{
+		for(int j = 0; j < 2; j++)
+		{
+			D[i][j] = 0;
+			for(int k = 0; k < 3; k++)
+			{
+				D[i][j] += A[k][i]*B[k][j];
+			}
+		}
+	}
+
+	for(int i = 0; i < 2; i++)
+	{
+		for(int j = 0; j < 2; j++)
+		{
+			ASSERT_EQ(C[i][j],D[i][j]);
+		}
+	}
+
+	cudaFree(A_gpu);
+	cudaFree(B_gpu);
+	cudaFree(C_gpu);
+}
+
+
+TEST(MathTest, TestGemvGPU) {
+	float A[4][3] = {};
+	float B[4]= {};
+	float C[3] = {};
+	float D[3] = {};
+
+	for(int i = 0; i < 4; i++)
+	{
+		for(int j = 0; j < 3; j++)
+		{
+			A[i][j] = i-j + i*j;
+		}
+	}
+
+	for(int i = 0; i < 4; i++)B[i] = i;
+	for(int i = 0; i < 3; i++)C[i] = 10;
+
+	float* A_gpu=NULL;
+	float* B_gpu=NULL;
+	float* C_gpu=NULL;
+
+	cudaMalloc((void**)&A_gpu, 4*3*sizeof(float));
+	cudaMalloc((void**)&B_gpu, 4*sizeof(float));
+	cudaMalloc((void**)&C_gpu, 3*sizeof(float));
+
+	cudaMemcpy(A_gpu,A,4*3*sizeof(float),cudaMemcpyHostToDevice);
+	cudaMemcpy(B_gpu,B,4*sizeof(float),cudaMemcpyHostToDevice);
+	cudaMemcpy(C_gpu,C,3*sizeof(float),cudaMemcpyHostToDevice);
+
+	gpu_gemv(A_gpu, B_gpu, 4, 3, 1, 1, true, C_gpu);
+
+	cudaMemcpy(C,C_gpu,3*sizeof(float),cudaMemcpyDeviceToHost);
+
+	for(int i = 0; i < 3; i++)
+	{
+		for(int j = 0; j < 4; j++)
+		{
+			D[i] += A[j][i]*B[j];
+		}
+	}
+
+	for(int i = 0; i < 3; i++)
+	{
+		ASSERT_EQ(C[i],D[i]+10);
+	}
+
+	cudaFree(A_gpu);
+	cudaFree(B_gpu);
+	cudaFree(C_gpu);
+}
+
+
+TEST(MathTest, TestAxpyGPU) {
+	float A[4][3] = {};
+	float C[4][3] = {};
+	float B[3][4] = {};
+	float D[3][4] = {};
+
+	for(int i = 0; i < 4; i++)
+	{
+		for(int j = 0; j < 3; j++)
+		{
+			A[i][j] = i-j + i*j;
+			B[j][i] = i-j + i*j;
+			C[i][j] = A[i][j];
+			D[j][i] = B[j][i];
+		}
+	}
+
+	float* A_gpu=NULL;
+	float* B_gpu=NULL;
+
+	cudaMalloc((void**)&A_gpu, 4*3*sizeof(float));
+	cudaMalloc((void**)&B_gpu, 3*4*sizeof(float));
+
+	cudaMemcpy(A_gpu,A,4*3*sizeof(float),cudaMemcpyHostToDevice);
+	cudaMemcpy(B_gpu,B,3*4*sizeof(float),cudaMemcpyHostToDevice);
+
+	gpu_axpy(A_gpu, 12, 2, B_gpu);
+
+	cudaMemcpy(A,A_gpu,4*3*sizeof(float),cudaMemcpyDeviceToHost);
+	cudaMemcpy(B,B_gpu,3*4*sizeof(float),cudaMemcpyDeviceToHost);
+
+	for(int i = 0; i < 12; i++)D[0][i] += 2*C[0][i];
+
+	for(int i = 0; i < 3; i++)
+	{
+		for(int j = 0; j < 4; j++)
+		{
+			ASSERT_EQ(B[i][j],D[i][j]);
+		}
+	}
+
+	cudaFree(A_gpu);
+	cudaFree(B_gpu);
+}
+
+
+TEST(MathTest, TestDotGPU) {
+	float A[12];
+	float B[12];
+
+	for(int i = 0; i < 12; i++)
+	{
+		A[i]=i-1;
+		B[i]=i+1;
+	}
+
+	float* A_gpu=NULL;
+	float* B_gpu=NULL;
+
+	cudaMalloc((void**)&A_gpu, 12*sizeof(float));
+	cudaMalloc((void**)&B_gpu, 12*sizeof(float));
+
+	cudaMemcpy(A_gpu,A,12*sizeof(float),cudaMemcpyHostToDevice);
+	cudaMemcpy(B_gpu,B,12*sizeof(float),cudaMemcpyHostToDevice);
+	float gpu_ret=gpu_dot(A_gpu,B_gpu,12);
+
+	float cpu_ret=0.0f;
+	for(int i = 0; i < 12; i++)
+	{
+		cpu_ret+=A[i]*B[i];
+	}
+
+	ASSERT_EQ(gpu_ret,cpu_ret);
+
+	cudaFree(A_gpu);
+	cudaFree(B_gpu);
+
+}
+
+TEST(MathTest, TestSingaSumColGPU) {
+
+	float A[3][4];
+	float B[4];
+	float C[4];
+
+	for(int i = 0; i < 3; i++)
+	{
+		for(int j = 0; j < 4; j++)
+		{
+			A[i][j]=i+j;
+		}
+	}
+	
+	for(int i = 0; i < 4; i++)
+	{
+		B[i]=0.0f;
+		C[i]=0.0f;
+	}
+
+	float* A_gpu=NULL;
+	float* B_gpu=NULL;
+
+	cudaMalloc((void**)&A_gpu, 12*sizeof(float));
+	cudaMalloc((void**)&B_gpu, 4*sizeof(float));
+	cudaMemcpy(A_gpu,A,12*sizeof(float),cudaMemcpyHostToDevice);
+
+	singa_sum_col(A_gpu,B_gpu,3,4,4);
+
+	cudaMemcpy(B,B_gpu,4*sizeof(float),cudaMemcpyDeviceToHost);
+
+	for(int i = 0; i < 4; i++)
+	{
+		for(int j = 0; j < 3; j++)
+		{
+			C[i]+=A[j][i];
+		}
+	}
+
+	for(int i = 0; i <4; i++)
+	{
+		ASSERT_EQ(B[i],C[i]);
+	}
+
+	cudaFree(A_gpu);
+	cudaFree(B_gpu);
+}
+
+TEST(MathTest, TestSingaAddVecRowGPU) {
+
+	float A[3][4];
+	float B[4];
+	float C[3][4];
+	float D[3][4];
+
+	for(int i = 0; i < 4; i++)
+	{
+		B[i]=i;
+	}
+
+	for(int i = 0; i < 3; i++)
+	{
+		for(int j = 0; j < 4; j++)
+		{
+			A[i][j]=i+j;
+			D[i][j]=A[i][j]+B[j];
+		}
+	}
+
+
+	float* A_gpu=NULL;
+	float* B_gpu=NULL;
+	float* C_gpu=NULL;
+
+	cudaMalloc((void**)&A_gpu, 3*4*sizeof(float));
+	cudaMalloc((void**)&B_gpu, 4*sizeof(float));
+	cudaMalloc((void**)&C_gpu, 3*4*sizeof(float));
+	cudaMemcpy(A_gpu,A,3*4*sizeof(float),cudaMemcpyHostToDevice);
+	cudaMemcpy(B_gpu,B,4*sizeof(float),cudaMemcpyHostToDevice);
+
+	singa_add_vec_row(B_gpu,A_gpu,C_gpu,3,4,4);
+
+	cudaMemcpy(C,C_gpu,3*4*sizeof(float),cudaMemcpyDeviceToHost);
+
+	for(int i = 0; i < 3; i++)
+	{
+		for(int j = 0; j < 4; j++)
+		{
+			ASSERT_EQ(C[i][j],D[i][j]);
+		}
+	}
+
+	cudaFree(A_gpu);
+	cudaFree(B_gpu);
+	cudaFree(C_gpu);
+}