You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@systemml.apache.org by ni...@apache.org on 2019/03/29 05:44:40 UTC

[systemml] branch master updated: [SYSTEMML-540] Optimized sparse-to-dense conversion on GPU and added a flag to disable forced memset0

This is an automated email from the ASF dual-hosted git repository.

niketanpansare pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/systemml.git


The following commit(s) were added to refs/heads/master by this push:
     new 70bf610  [SYSTEMML-540] Optimized sparse-to-dense conversion on GPU and added a flag to disable forced memset0
70bf610 is described below

commit 70bf61093dc3814ccbec867de4e4753cb9f3e086
Author: Niketan Pansare <np...@us.ibm.com>
AuthorDate: Thu Mar 28 22:44:24 2019 -0700

    [SYSTEMML-540] Optimized sparse-to-dense conversion on GPU and added a flag to disable forced memset0
    
    - Improved the performance of sparse-to-dense conversion of empty matrices.
    - Added a flag sysml.gpu.force.memSetZero that allows the user to disable forced memset0.
    - This flag is enabled by default for now; after exhaustive testing, it will be disabled by default.
---
 conf/SystemML-config.xml.template                  |  3 +++
 src/main/java/org/apache/sysml/conf/DMLConfig.java |  4 +++-
 .../instructions/gpu/context/CSRPointer.java       |  3 +++
 .../instructions/gpu/context/GPUMemoryManager.java | 20 ++++++++++++++++----
 .../instructions/gpu/context/GPUObject.java        | 22 ++++++++++++++++++----
 5 files changed, 43 insertions(+), 9 deletions(-)

diff --git a/conf/SystemML-config.xml.template b/conf/SystemML-config.xml.template
index 17cc2cc..cd0d311 100644
--- a/conf/SystemML-config.xml.template
+++ b/conf/SystemML-config.xml.template
@@ -121,4 +121,7 @@
    
    <!-- Should SystemML runtime force the lstm builtin functions to use the CuDNN kernels (default: true) -->
    <sysml.gpu.lstm.force.cudnn>true</sysml.gpu.lstm.force.cudnn>
+   
+   <!-- Should SystemML GPU memory manager force memSet(0) for the allocated arrays (default: true) -->
+   <sysml.gpu.force.memSetZero>true</sysml.gpu.force.memSetZero>
 </root>
\ No newline at end of file
diff --git a/src/main/java/org/apache/sysml/conf/DMLConfig.java b/src/main/java/org/apache/sysml/conf/DMLConfig.java
index 0b5ed78..e435c77 100644
--- a/src/main/java/org/apache/sysml/conf/DMLConfig.java
+++ b/src/main/java/org/apache/sysml/conf/DMLConfig.java
@@ -96,6 +96,7 @@ public class DMLConfig
 	public static final String GPU_MEMORY_ALLOCATOR = "sysml.gpu.memory.allocator"; // String to specify the memory allocator to use. Supported values are: cuda, unified_memory
 	public static final String FLOATING_POINT_PRECISION = "sysml.floating.point.precision"; // String to specify the datatype to use internally: supported values are double, single
 	public static final String PRINT_GPU_MEMORY_INFO = "sysml.gpu.print.memoryInfo";
+	public static final String GPU_FORCE_MEMSET_ZERO = "sysml.gpu.force.memSetZero";
 	public static final String EVICTION_SHADOW_BUFFERSIZE = "sysml.gpu.eviction.shadow.bufferSize";
 	public static final String GPU_RECOMPUTE_ACTIVATIONS = "sysml.gpu.recompute.activations";
 
@@ -140,6 +141,7 @@ public class DMLConfig
 		_defaultVals.put(NATIVE_BLAS_DIR,        "none" );
 		_defaultVals.put(EXTRA_FINEGRAINED_STATS,"false" );
 		_defaultVals.put(PRINT_GPU_MEMORY_INFO,  "false" );
+		_defaultVals.put(GPU_FORCE_MEMSET_ZERO,  "true" );
 		_defaultVals.put(EVICTION_SHADOW_BUFFERSIZE,  "0.5" );
 		_defaultVals.put(STATS_MAX_WRAP_LEN,     "30" );
 		_defaultVals.put(GPU_MEMORY_UTILIZATION_FACTOR,      "0.9" );
@@ -431,7 +433,7 @@ public class DMLConfig
 				YARN_APPMASTER, YARN_APPMASTERMEM, YARN_MAPREDUCEMEM, 
 				CP_PARALLEL_OPS, CP_PARALLEL_IO, NATIVE_BLAS, NATIVE_BLAS_DIR,
 				COMPRESSED_LINALG, 
-				CODEGEN, CODEGEN_COMPILER, CODEGEN_OPTIMIZER, CODEGEN_PLANCACHE, CODEGEN_LITERALS,
+				CODEGEN, CODEGEN_COMPILER, CODEGEN_OPTIMIZER, CODEGEN_PLANCACHE, CODEGEN_LITERALS, GPU_FORCE_MEMSET_ZERO,
 				EXTRA_FINEGRAINED_STATS, STATS_MAX_WRAP_LEN, PRINT_GPU_MEMORY_INFO, CACHING_BUFFER_SIZE,
 				AVAILABLE_GPUS, SYNCHRONIZE_GPU, EAGER_CUDA_FREE, FLOATING_POINT_PRECISION, GPU_EVICTION_POLICY, EVICTION_SHADOW_BUFFERSIZE,
 				GPU_MEMORY_ALLOCATOR, GPU_MEMORY_UTILIZATION_FACTOR, GPU_RECOMPUTE_ACTIVATIONS, FORCE_LSTM_CUDNN
diff --git a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/CSRPointer.java b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/CSRPointer.java
index b3ec497..d7bd295 100644
--- a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/CSRPointer.java
+++ b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/CSRPointer.java
@@ -303,6 +303,9 @@ public class CSRPointer {
 		r.val = gCtx.allocate(null, getDataTypeSizeOf(nnz2));
 		r.rowPtr = gCtx.allocate(null, getIntSizeOf(rows + 1));
 		r.colInd = gCtx.allocate(null, getIntSizeOf(nnz2));
+		GPUMemoryManager.postAllocateMemset0(r.val, getDataTypeSizeOf(nnz2), null);
+		GPUMemoryManager.postAllocateMemset0(r.rowPtr, getIntSizeOf(rows + 1), null);
+		GPUMemoryManager.postAllocateMemset0(r.colInd, getIntSizeOf(nnz2), null);
 		return r;
 	}
 
diff --git a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUMemoryManager.java b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUMemoryManager.java
index cf579ec..d15b953 100644
--- a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUMemoryManager.java
+++ b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUMemoryManager.java
@@ -57,6 +57,7 @@ public class GPUMemoryManager {
 	private static final int [] DEBUG_MEMORY_LEAK_STACKTRACE_DEPTH = {5, 6, 7, 8, 9, 10, 11}; // Avoids printing too much text while debugging
 	
 	private final boolean PRINT_GPU_MEMORY_INFO = ConfigurationManager.getDMLConfig().getBooleanValue(DMLConfig.PRINT_GPU_MEMORY_INFO);
+	public static boolean GPU_FORCE_MEMSET_ZERO = ConfigurationManager.getDMLConfig().getBooleanValue(DMLConfig.GPU_FORCE_MEMSET_ZERO);
 	
 	protected final GPUMemoryAllocator allocator;
 	/*****************************************************************************************/
@@ -141,6 +142,7 @@ public class GPUMemoryManager {
 	private static final double WARN_UTILIZATION_FACTOR = 0.7;
 	
 	public GPUMemoryManager(GPUContext gpuCtx) {
+		GPU_FORCE_MEMSET_ZERO = ConfigurationManager.getDMLConfig().getBooleanValue(DMLConfig.GPU_FORCE_MEMSET_ZERO);
 		matrixMemoryManager = new GPUMatrixMemoryManager(this);
 		lazyCudaFreeMemoryManager = new GPULazyCudaFreeMemoryManager(this);
 		String allocatorType = ConfigurationManager.getDMLConfig().getTextValue(DMLConfig.GPU_MEMORY_ALLOCATOR);
@@ -361,12 +363,22 @@ public class GPUMemoryManager {
 					+ toString());
 		}
 		
-		long t0 = ConfigurationManager.isStatistics() ? System.nanoTime() : 0;
-		cudaMemset(A, 0, size);
-		addMiscTime(opcode, GPUStatistics.cudaMemSet0Time, GPUStatistics.cudaMemSet0Count, GPUInstruction.MISC_TIMER_SET_ZERO, t0);
+		if(GPU_FORCE_MEMSET_ZERO) {
+			long t0 = ConfigurationManager.isStatistics() ? System.nanoTime() : 0;
+			cudaMemset(A, 0, size);
+			addMiscTime(opcode, GPUStatistics.cudaMemSet0Time, GPUStatistics.cudaMemSet0Count, GPUInstruction.MISC_TIMER_SET_ZERO, t0);
+		}
 		return A;
 	}
 	
+	public static void postAllocateMemset0(Pointer A, long size, String opcode) {
+		if(!GPU_FORCE_MEMSET_ZERO) {
+			long t0 = ConfigurationManager.isStatistics() ? System.nanoTime() : 0;
+			cudaMemset(A, 0, size);
+			addMiscTime(opcode, GPUStatistics.cudaMemSet0Time, GPUStatistics.cudaMemSet0Count, GPUInstruction.MISC_TIMER_SET_ZERO, t0);
+		}
+	}
+	
 	private int worstCaseContiguousMemorySizeCompare(GPUObject o1, GPUObject o2) {
 		long ret = o1.getWorstCaseContiguousMemorySize() - o2.getWorstCaseContiguousMemorySize();
 		return ret < 0 ? -1 : (ret == 0 ? 0 : 1);
@@ -553,7 +565,7 @@ public class GPUMemoryManager {
 	 * @param instructionLevelTimer member of GPUInstruction
 	 * @param startTime start time
 	 */
-	private void addMiscTime(String opcode, LongAdder globalGPUTimer, LongAdder globalGPUCounter, String instructionLevelTimer, long startTime) {
+	private static void addMiscTime(String opcode, LongAdder globalGPUTimer, LongAdder globalGPUCounter, String instructionLevelTimer, long startTime) {
 		if(ConfigurationManager.isStatistics()) {
 			long totalTime = System.nanoTime() - startTime;
 			globalGPUTimer.add(totalTime);
diff --git a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUObject.java b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUObject.java
index 9d263aa..254c9d7 100644
--- a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUObject.java
+++ b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUObject.java
@@ -434,9 +434,15 @@ public class GPUObject {
 			start = System.nanoTime();
 		if (getJcudaSparseMatrixPtr() == null || !isAllocated())
 			throw new DMLRuntimeException("Expected allocated sparse matrix before sparseToDense() call");
-
-		sparseToColumnMajorDense();
-		denseColumnMajorToRowMajor();
+		if(getJcudaSparseMatrixPtr().nnz == 0) {
+			long size = ((long) mat.getNumRows()) * getDataTypeSizeOf(mat.getNumColumns());
+			setDensePointer(allocate(size));
+			GPUMemoryManager.postAllocateMemset0(getDensePointer(), size, instructionName);
+		}
+		else {
+			sparseToColumnMajorDense();
+			denseColumnMajorToRowMajor();
+		}
 		if (ConfigurationManager.isStatistics())
 			end = System.nanoTime();
 		if (instructionName != null && ConfigurationManager.isFinegrainedStatistics())
@@ -446,6 +452,10 @@ public class GPUObject {
 		if (ConfigurationManager.isStatistics())
 			GPUStatistics.cudaSparseToDenseCount.add(1);
 	}
+	
+	private static long getDataTypeSizeOf(long numElems) {
+		return numElems * ((long) LibMatrixCUDA.sizeOfDataType);
+	}
 
 	/**
 	 * More efficient method to convert sparse to dense but returns dense in column major format
@@ -521,10 +531,14 @@ public class GPUObject {
 		setDensePointer(allocate(size));
 		// The "fill" kernel is called which treats the matrix "jcudaDensePtr" like a vector and fills it with value "v"
 		// If the fill value is 0, no need to call the special kernel, the allocate memsets the allocated region to 0
-		if (v != 0)
+		if (v != 0) {
 			getGPUContext().getKernels()
 			.launchKernel("fill", ExecutionConfig.getConfigForSimpleVectorOperations(numElems),
 					getDensePointer(), v, numElems);
+		}
+		else {
+			GPUMemoryManager.postAllocateMemset0(getDensePointer(), size, null);
+		}
 	}
 
 	/**