You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@systemds.apache.org by ma...@apache.org on 2022/05/09 21:45:08 UTC

[systemds] branch main updated (81de34ac30 -> 72082ebe23)

This is an automated email from the ASF dual-hosted git repository.

markd pushed a change to branch main
in repository https://gitbox.apache.org/repos/asf/systemds.git


    from 81de34ac30 [MINOR] Cleaning pipelines minor cleanups (refactor function name)
     new d418766c1e [MINOR] Update jitify
     new 14d095efe5 [SYSTEMDS-3362] CUDA code gen stream synchronization (bugfix)
     new 72082ebe23 [SYSTEMDS-3362] CUDA code generation binaries for latest bugfix

The 3 revisions listed above as "new" are entirely new to this
repository and will be described in separate emails.  The revisions
listed as "add" were already present in the repository and have only
been added to this reference.


Summary of changes:
 .../cpp/lib/libsystemds_spoof_cuda-Linux-x86_64.so | Bin 285976 -> 290480 bytes
 src/main/cuda/ext/jitify                           |   2 +-
 src/main/cuda/spoof-launcher/SpoofCUDAContext.h    |   9 +++++--
 src/main/cuda/spoof-launcher/SpoofCellwise.h       |  30 ++++++++++-----------
 src/main/cuda/spoof-launcher/SpoofOperator.h       |   6 ++---
 src/main/cuda/spoof-launcher/SpoofRowwise.h        |   6 ++---
 6 files changed, 28 insertions(+), 25 deletions(-)


[systemds] 02/03: [SYSTEMDS-3362] CUDA code gen stream synchronization (bugfix)

Posted by ma...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

markd pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/systemds.git

commit 14d095efe5bc98d120bc0dd34270c3f12747b3cc
Author: Mark Dokter <ma...@dokter.cc>
AuthorDate: Thu Apr 28 14:06:53 2022 +0200

    [SYSTEMDS-3362] CUDA code gen stream synchronization (bugfix)
    
    The CUDA code generation launcher handles streams per operator at the moment. This is wrong since a read before write can happen on a certain device allocation. Switching to a central stream object for now.
    
    Closes #1600
---
 src/main/cuda/spoof-launcher/SpoofCUDAContext.h |  9 ++++++--
 src/main/cuda/spoof-launcher/SpoofCellwise.h    | 30 ++++++++++++-------------
 src/main/cuda/spoof-launcher/SpoofOperator.h    |  6 ++---
 src/main/cuda/spoof-launcher/SpoofRowwise.h     |  6 ++---
 4 files changed, 27 insertions(+), 24 deletions(-)

diff --git a/src/main/cuda/spoof-launcher/SpoofCUDAContext.h b/src/main/cuda/spoof-launcher/SpoofCUDAContext.h
index e4b80c5e40..c902c38382 100644
--- a/src/main/cuda/spoof-launcher/SpoofCUDAContext.h
+++ b/src/main/cuda/spoof-launcher/SpoofCUDAContext.h
@@ -55,9 +55,14 @@ public:
 	size_t current_mem_size = 0; // the actual staging buffer size (should be default unless there was a resize)
 	std::byte* staging_buffer{}; // pinned host mem for async transfers
 	std::byte* device_buffer{};  // this buffer holds the pointers to the data buffers
+	cudaStream_t stream{};
 
 	explicit SpoofCUDAContext(const char* resource_path_, std::vector<std::string>  include_paths_) : reductions(nullptr),
-			resource_path(resource_path_), include_paths(std::move(include_paths_)) { }
+			resource_path(resource_path_), include_paths(std::move(include_paths_)) {
+			    CHECK_CUDART(cudaStreamCreate(&stream));
+            }
+
+    virtual ~SpoofCUDAContext() { CHECK_CUDART(cudaStreamDestroy(stream)); }
 
 	static size_t initialize_cuda(uint32_t device_id, const char* resource_path_);
 
@@ -70,7 +75,7 @@ public:
 
 		DataBufferWrapper dbw(staging_buffer, device_buffer);
 		SpoofOperator* op = compiled_ops[dbw.op_id()].get();
-		dbw.toDevice(op->stream);
+		dbw.toDevice(stream);
 
 		CALL::exec(this, op, &dbw);
 
diff --git a/src/main/cuda/spoof-launcher/SpoofCellwise.h b/src/main/cuda/spoof-launcher/SpoofCellwise.h
index 9077840020..68b176b6f2 100644
--- a/src/main/cuda/spoof-launcher/SpoofCellwise.h
+++ b/src/main/cuda/spoof-launcher/SpoofCellwise.h
@@ -27,7 +27,7 @@
 template<typename T>
 struct SpoofCellwiseFullAgg {
 	
-	static void exec(SpoofCellwiseOp* op, uint32_t NT, uint32_t N, const std::string& op_name, DataBufferWrapper* dbw) {
+	static void exec(SpoofCellwiseOp* op, uint32_t NT, uint32_t N, const std::string& op_name, DataBufferWrapper* dbw, SpoofCUDAContext* ctx) {
 		T value_type;
 		
 		// num ctas
@@ -46,7 +46,7 @@ struct SpoofCellwiseFullAgg {
 #endif
 		CHECK_CUDA(op->program.get()->kernel(op_name)
 						   .instantiate(type_of(value_type), std::max(static_cast<uint32_t>(1u), dbw->num_sides()))
-						   .configure(grid, block, shared_mem_size, op->stream)
+						   .configure(grid, block, shared_mem_size, ctx->stream)
 						   .launch(dbw->d_in<T>(0), dbw->d_sides<T>(), dbw->d_out<T>(), dbw->d_scalars<T>(), N, dbw->grix()));
 		
 		if(NB > 1) {
@@ -64,7 +64,7 @@ struct SpoofCellwiseFullAgg {
                     << N << " elements"
                     << std::endl;
 #endif
-				CHECK_CUDA(cuLaunchKernel(op->agg_kernel,NB, 1, 1, NT, 1, 1, shared_mem_size, op->stream, args, nullptr));
+				CHECK_CUDA(cuLaunchKernel(op->agg_kernel,NB, 1, 1, NT, 1, 1, shared_mem_size, ctx->stream, args, nullptr));
 				N = NB;
 			}
 		}
@@ -74,7 +74,7 @@ struct SpoofCellwiseFullAgg {
 
 template<typename T>
 struct SpoofCellwiseRowAgg {
-	static void exec(SpoofOperator *op, uint32_t NT, uint32_t N, const std::string &op_name, DataBufferWrapper* dbw) {
+	static void exec(SpoofOperator *op, uint32_t NT, uint32_t N, const std::string &op_name, DataBufferWrapper* dbw, SpoofCUDAContext* ctx) {
 		T value_type;
 		
 		// num ctas
@@ -90,7 +90,7 @@ struct SpoofCellwiseRowAgg {
 #endif
 		CHECK_CUDA(op->program->kernel(op_name)
 						   .instantiate(type_of(value_type), std::max(static_cast<uint32_t>(1u), dbw->num_sides()))
-						   .configure(grid, block, shared_mem_size, op->stream)
+						   .configure(grid, block, shared_mem_size, ctx->stream)
 						   .launch(dbw->d_in<T>(0), dbw->d_sides<T>(), dbw->d_out<T>(), dbw->d_scalars<T>(), N, dbw->grix()));
 		
 	}
@@ -99,7 +99,7 @@ struct SpoofCellwiseRowAgg {
 
 template<typename T>
 struct SpoofCellwiseColAgg {
-	static void exec(SpoofOperator* op, uint32_t NT, uint32_t N, const std::string& op_name, DataBufferWrapper* dbw) {
+	static void exec(SpoofOperator* op, uint32_t NT, uint32_t N, const std::string& op_name, DataBufferWrapper* dbw, SpoofCUDAContext* ctx) {
 		T value_type;
 		
 		// num ctas
@@ -115,7 +115,7 @@ struct SpoofCellwiseColAgg {
 #endif
 		CHECK_CUDA(op->program->kernel(op_name)
 						   .instantiate(type_of(value_type), std::max(static_cast<uint32_t>(1u), dbw->num_sides()))
-						   .configure(grid, block, shared_mem_size, op->stream)
+						   .configure(grid, block, shared_mem_size, ctx->stream)
 						   .launch(dbw->d_in<T>(0), dbw->d_sides<T>(), dbw->d_out<T>(), dbw->d_scalars<T>(), N, dbw->grix()));
 		
 	}
@@ -124,7 +124,7 @@ struct SpoofCellwiseColAgg {
 
 template<typename T>
 struct SpoofCellwiseNoAgg {
-	static void exec(SpoofOperator *op, uint32_t NT, uint32_t N, const std::string &op_name, DataBufferWrapper* dbw) {
+	static void exec(SpoofOperator *op, uint32_t NT, uint32_t N, const std::string &op_name, DataBufferWrapper* dbw, SpoofCUDAContext* ctx) {
 		T value_type;
 		bool sparse_input = dbw->h_in<T>(0)->row_ptr != nullptr;
 		
@@ -155,16 +155,16 @@ struct SpoofCellwiseNoAgg {
 #endif
 		CHECK_CUDA(op->program->kernel(op_name)
 						   .instantiate(type_of(value_type), std::max(static_cast<uint32_t>(1u), dbw->num_sides()))
-						   .configure(grid, block, shared_mem_size, op->stream)
+						   .configure(grid, block, shared_mem_size, ctx->stream)
 						   .launch(dbw->d_in<T>(0), dbw->d_sides<T>(), dbw->d_out<T>(), dbw->d_scalars<T>(), N, dbw->grix()));
 
 		// copy over row indices from input to output if appropriate
 		if (op->isSparseSafe() && dbw->h_in<T>(0)->row_ptr != nullptr) {
 			// src/dst information (pointer address) is stored in *host* buffer!
 			CHECK_CUDART(cudaMemcpyAsync(dbw->h_out<T>()->row_ptr, dbw->h_in<T>(0)->row_ptr,
-				(dbw->h_in<T>(0)->rows+1) * sizeof(uint32_t), cudaMemcpyDeviceToDevice, op->stream));
+				(dbw->h_in<T>(0)->rows+1) * sizeof(uint32_t), cudaMemcpyDeviceToDevice, ctx->stream));
 			CHECK_CUDART(cudaMemcpyAsync(dbw->h_out<T>()->col_idx, dbw->h_in<T>(0)->col_idx,
-										 (dbw->h_in<T>(0)->nnz) * sizeof(uint32_t), cudaMemcpyDeviceToDevice, op->stream));
+										 (dbw->h_in<T>(0)->nnz) * sizeof(uint32_t), cudaMemcpyDeviceToDevice, ctx->stream));
 		}
 	}
 };
@@ -186,16 +186,16 @@ struct SpoofCellwise {
 		switch(op->agg_type) {
 			case SpoofOperator::AggType::FULL_AGG:
 				op->agg_kernel = ctx->template getReductionKernel<T>(std::make_pair(op->agg_type, op->agg_op));
-				SpoofCellwiseFullAgg<T>::exec(op, NT, N, op_name, dbw);
+				SpoofCellwiseFullAgg<T>::exec(op, NT, N, op_name, dbw, ctx);
 				break;
 			case SpoofOperator::AggType::ROW_AGG:
-				SpoofCellwiseRowAgg<T>::exec(op, NT, N, op_name, dbw);
+				SpoofCellwiseRowAgg<T>::exec(op, NT, N, op_name, dbw, ctx);
 				break;
 			case SpoofOperator::AggType::COL_AGG:
-				SpoofCellwiseColAgg<T>::exec(op, NT, N, op_name, dbw);
+				SpoofCellwiseColAgg<T>::exec(op, NT, N, op_name, dbw, ctx);
 				break;
 			case SpoofOperator::AggType::NO_AGG:
-				SpoofCellwiseNoAgg<T>::exec(op, NT, N, op_name, dbw);
+				SpoofCellwiseNoAgg<T>::exec(op, NT, N, op_name, dbw, ctx);
 				break;
 			default:
 				throw std::runtime_error("unknown cellwise agg type" + std::to_string(static_cast<int>(op->agg_type)));
diff --git a/src/main/cuda/spoof-launcher/SpoofOperator.h b/src/main/cuda/spoof-launcher/SpoofOperator.h
index f256e817db..045dcfdb80 100644
--- a/src/main/cuda/spoof-launcher/SpoofOperator.h
+++ b/src/main/cuda/spoof-launcher/SpoofOperator.h
@@ -42,10 +42,8 @@ struct SpoofOperator {
 	
 	[[nodiscard]] virtual bool isSparseSafe() const = 0;
 
-	cudaStream_t stream{};
-	
-	SpoofOperator() { CHECK_CUDART(cudaStreamCreate(&stream));}
-	virtual ~SpoofOperator() {CHECK_CUDART(cudaStreamDestroy(stream));}
+	SpoofOperator() = default;
+	virtual ~SpoofOperator() = default;
 };
 
 struct SpoofCellwiseOp : public SpoofOperator {
diff --git a/src/main/cuda/spoof-launcher/SpoofRowwise.h b/src/main/cuda/spoof-launcher/SpoofRowwise.h
index 01ec5206aa..a9a656fbb7 100644
--- a/src/main/cuda/spoof-launcher/SpoofRowwise.h
+++ b/src/main/cuda/spoof-launcher/SpoofRowwise.h
@@ -39,7 +39,7 @@ struct SpoofRowwise {
 			if(op->isSparseSafe() && dbw->h_out<T>()->nnz > 0)
 				out_num_elements = dbw->h_out<T>()->nnz;
 		//ToDo: only memset output when there is an output operation that *adds* to the buffer
-		CHECK_CUDART(cudaMemsetAsync(dbw->h_out<T>()->data, 0, out_num_elements * sizeof(T), op->stream));
+		CHECK_CUDART(cudaMemsetAsync(dbw->h_out<T>()->data, 0, out_num_elements * sizeof(T), ctx->stream));
 
 		//ToDo: handle this in JVM
 		uint32_t tmp_len = 0;
@@ -52,7 +52,7 @@ struct SpoofRowwise {
 			std::cout << "num_temp_vect: " << op->num_temp_vectors << " temp_buf_size: " << temp_buf_size << " tmp_len: " << tmp_len << std::endl;
 #endif
 			CHECK_CUDART(cudaMalloc(reinterpret_cast<void**>(&d_temp), temp_buf_size));
-			CHECK_CUDART(cudaMemsetAsync(d_temp, 0, temp_buf_size, op->stream));
+			CHECK_CUDART(cudaMemsetAsync(d_temp, 0, temp_buf_size, ctx->stream));
 		}
 
 		std::string op_name(op->name + "_DENSE");
@@ -68,7 +68,7 @@ struct SpoofRowwise {
 #endif
 		CHECK_CUDA(op->program->kernel(op_name)
 						   .instantiate(type_of(value_type), std::max(static_cast<uint32_t>(1), dbw->num_sides()), op->num_temp_vectors, tmp_len)
-						   .configure(grid, block, shared_mem_size, op->stream)
+						   .configure(grid, block, shared_mem_size, ctx->stream)
 						   .launch(dbw->d_in<T>(0), dbw->d_sides<T>(), dbw->d_out<T>(), dbw->d_scalars<T>(), d_temp, dbw->grix()));
 		
 		if(op->num_temp_vectors > 0)


[systemds] 01/03: [MINOR] Update jitify

Posted by ma...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

markd pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/systemds.git

commit d418766c1eff6d5f8f16c965f5991e27f33a0d87
Author: Mark Dokter <ma...@dokter.cc>
AuthorDate: Thu Apr 28 14:05:51 2022 +0200

    [MINOR] Update jitify
    
    This change pulls in the latest commits from upstream jitify. This includes a local fix of an ignored PR that was eventually fixed by someone else.
---
 src/main/cuda/ext/jitify | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/main/cuda/ext/jitify b/src/main/cuda/ext/jitify
index d9aed99c6d..4da8c56bb0 160000
--- a/src/main/cuda/ext/jitify
+++ b/src/main/cuda/ext/jitify
@@ -1 +1 @@
-Subproject commit d9aed99c6d022005ee15a8ec50deab391911f600
+Subproject commit 4da8c56bb0f65a6dd993e978b1818098b0929d9a


[systemds] 03/03: [SYSTEMDS-3362] CUDA code generation binaries for latest bugfix

Posted by ma...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

markd pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/systemds.git

commit 72082ebe23e49694dc2aeaf0e229c2f7c1b2395c
Author: Mark Dokter <ma...@dokter.cc>
AuthorDate: Thu Apr 28 14:12:33 2022 +0200

    [SYSTEMDS-3362] CUDA code generation binaries for latest bugfix
    
    Code gen native support compiled on Ubuntu 20 LTS/CUDA-10.2
---
 .../cpp/lib/libsystemds_spoof_cuda-Linux-x86_64.so | Bin 285976 -> 290480 bytes
 1 file changed, 0 insertions(+), 0 deletions(-)

diff --git a/src/main/cpp/lib/libsystemds_spoof_cuda-Linux-x86_64.so b/src/main/cpp/lib/libsystemds_spoof_cuda-Linux-x86_64.so
index 81d1184b18..70f0d81275 100644
Binary files a/src/main/cpp/lib/libsystemds_spoof_cuda-Linux-x86_64.so and b/src/main/cpp/lib/libsystemds_spoof_cuda-Linux-x86_64.so differ