You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@systemml.apache.org by ni...@apache.org on 2019/03/29 05:44:40 UTC
[systemml] branch master updated: [SYSTEMML-540] Optimized
sparse-to-dense conversion on GPU and added a flag to disable forced
memset0
This is an automated email from the ASF dual-hosted git repository.
niketanpansare pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/systemml.git
The following commit(s) were added to refs/heads/master by this push:
new 70bf610 [SYSTEMML-540] Optimized sparse-to-dense conversion on GPU and added a flag to disable forced memset0
70bf610 is described below
commit 70bf61093dc3814ccbec867de4e4753cb9f3e086
Author: Niketan Pansare <np...@us.ibm.com>
AuthorDate: Thu Mar 28 22:44:24 2019 -0700
[SYSTEMML-540] Optimized sparse-to-dense conversion on GPU and added a flag to disable forced memset0
- Improved the performance of sparse-to-dense conversion of empty matrices.
- Added a flag sysml.gpu.force.memSetZero that allows the user to disable forced memset0.
- This flag is enabled by default for now; after exhaustive testing, the default will be changed to disabled.
---
conf/SystemML-config.xml.template | 3 +++
src/main/java/org/apache/sysml/conf/DMLConfig.java | 4 +++-
.../instructions/gpu/context/CSRPointer.java | 3 +++
.../instructions/gpu/context/GPUMemoryManager.java | 20 ++++++++++++++++----
.../instructions/gpu/context/GPUObject.java | 22 ++++++++++++++++++----
5 files changed, 43 insertions(+), 9 deletions(-)
diff --git a/conf/SystemML-config.xml.template b/conf/SystemML-config.xml.template
index 17cc2cc..cd0d311 100644
--- a/conf/SystemML-config.xml.template
+++ b/conf/SystemML-config.xml.template
@@ -121,4 +121,7 @@
<!-- Should SystemML runtime force the lstm builtin functions to use the CuDNN kernels (default: true) -->
<sysml.gpu.lstm.force.cudnn>true</sysml.gpu.lstm.force.cudnn>
+
+ <!-- Should SystemML GPU memory manager force memSet(0) for the allocated arrays (default: true) -->
+ <sysml.gpu.force.memSetZero>true</sysml.gpu.force.memSetZero>
</root>
\ No newline at end of file
diff --git a/src/main/java/org/apache/sysml/conf/DMLConfig.java b/src/main/java/org/apache/sysml/conf/DMLConfig.java
index 0b5ed78..e435c77 100644
--- a/src/main/java/org/apache/sysml/conf/DMLConfig.java
+++ b/src/main/java/org/apache/sysml/conf/DMLConfig.java
@@ -96,6 +96,7 @@ public class DMLConfig
public static final String GPU_MEMORY_ALLOCATOR = "sysml.gpu.memory.allocator"; // String to specify the memory allocator to use. Supported values are: cuda, unified_memory
public static final String FLOATING_POINT_PRECISION = "sysml.floating.point.precision"; // String to specify the datatype to use internally: supported values are double, single
public static final String PRINT_GPU_MEMORY_INFO = "sysml.gpu.print.memoryInfo";
+ public static final String GPU_FORCE_MEMSET_ZERO = "sysml.gpu.force.memSetZero";
public static final String EVICTION_SHADOW_BUFFERSIZE = "sysml.gpu.eviction.shadow.bufferSize";
public static final String GPU_RECOMPUTE_ACTIVATIONS = "sysml.gpu.recompute.activations";
@@ -140,6 +141,7 @@ public class DMLConfig
_defaultVals.put(NATIVE_BLAS_DIR, "none" );
_defaultVals.put(EXTRA_FINEGRAINED_STATS,"false" );
_defaultVals.put(PRINT_GPU_MEMORY_INFO, "false" );
+ _defaultVals.put(GPU_FORCE_MEMSET_ZERO, "true" );
_defaultVals.put(EVICTION_SHADOW_BUFFERSIZE, "0.5" );
_defaultVals.put(STATS_MAX_WRAP_LEN, "30" );
_defaultVals.put(GPU_MEMORY_UTILIZATION_FACTOR, "0.9" );
@@ -431,7 +433,7 @@ public class DMLConfig
YARN_APPMASTER, YARN_APPMASTERMEM, YARN_MAPREDUCEMEM,
CP_PARALLEL_OPS, CP_PARALLEL_IO, NATIVE_BLAS, NATIVE_BLAS_DIR,
COMPRESSED_LINALG,
- CODEGEN, CODEGEN_COMPILER, CODEGEN_OPTIMIZER, CODEGEN_PLANCACHE, CODEGEN_LITERALS,
+ CODEGEN, CODEGEN_COMPILER, CODEGEN_OPTIMIZER, CODEGEN_PLANCACHE, CODEGEN_LITERALS, GPU_FORCE_MEMSET_ZERO,
EXTRA_FINEGRAINED_STATS, STATS_MAX_WRAP_LEN, PRINT_GPU_MEMORY_INFO, CACHING_BUFFER_SIZE,
AVAILABLE_GPUS, SYNCHRONIZE_GPU, EAGER_CUDA_FREE, FLOATING_POINT_PRECISION, GPU_EVICTION_POLICY, EVICTION_SHADOW_BUFFERSIZE,
GPU_MEMORY_ALLOCATOR, GPU_MEMORY_UTILIZATION_FACTOR, GPU_RECOMPUTE_ACTIVATIONS, FORCE_LSTM_CUDNN
diff --git a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/CSRPointer.java b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/CSRPointer.java
index b3ec497..d7bd295 100644
--- a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/CSRPointer.java
+++ b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/CSRPointer.java
@@ -303,6 +303,9 @@ public class CSRPointer {
r.val = gCtx.allocate(null, getDataTypeSizeOf(nnz2));
r.rowPtr = gCtx.allocate(null, getIntSizeOf(rows + 1));
r.colInd = gCtx.allocate(null, getIntSizeOf(nnz2));
+ GPUMemoryManager.postAllocateMemset0(r.val, getDataTypeSizeOf(nnz2), null);
+ GPUMemoryManager.postAllocateMemset0(r.rowPtr, getIntSizeOf(rows + 1), null);
+ GPUMemoryManager.postAllocateMemset0(r.colInd, getIntSizeOf(nnz2), null);
return r;
}
diff --git a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUMemoryManager.java b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUMemoryManager.java
index cf579ec..d15b953 100644
--- a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUMemoryManager.java
+++ b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUMemoryManager.java
@@ -57,6 +57,7 @@ public class GPUMemoryManager {
private static final int [] DEBUG_MEMORY_LEAK_STACKTRACE_DEPTH = {5, 6, 7, 8, 9, 10, 11}; // Avoids printing too much text while debugging
private final boolean PRINT_GPU_MEMORY_INFO = ConfigurationManager.getDMLConfig().getBooleanValue(DMLConfig.PRINT_GPU_MEMORY_INFO);
+ public static boolean GPU_FORCE_MEMSET_ZERO = ConfigurationManager.getDMLConfig().getBooleanValue(DMLConfig.GPU_FORCE_MEMSET_ZERO);
protected final GPUMemoryAllocator allocator;
/*****************************************************************************************/
@@ -141,6 +142,7 @@ public class GPUMemoryManager {
private static final double WARN_UTILIZATION_FACTOR = 0.7;
public GPUMemoryManager(GPUContext gpuCtx) {
+ GPU_FORCE_MEMSET_ZERO = ConfigurationManager.getDMLConfig().getBooleanValue(DMLConfig.GPU_FORCE_MEMSET_ZERO);
matrixMemoryManager = new GPUMatrixMemoryManager(this);
lazyCudaFreeMemoryManager = new GPULazyCudaFreeMemoryManager(this);
String allocatorType = ConfigurationManager.getDMLConfig().getTextValue(DMLConfig.GPU_MEMORY_ALLOCATOR);
@@ -361,12 +363,22 @@ public class GPUMemoryManager {
+ toString());
}
- long t0 = ConfigurationManager.isStatistics() ? System.nanoTime() : 0;
- cudaMemset(A, 0, size);
- addMiscTime(opcode, GPUStatistics.cudaMemSet0Time, GPUStatistics.cudaMemSet0Count, GPUInstruction.MISC_TIMER_SET_ZERO, t0);
+ if(GPU_FORCE_MEMSET_ZERO) {
+ long t0 = ConfigurationManager.isStatistics() ? System.nanoTime() : 0;
+ cudaMemset(A, 0, size);
+ addMiscTime(opcode, GPUStatistics.cudaMemSet0Time, GPUStatistics.cudaMemSet0Count, GPUInstruction.MISC_TIMER_SET_ZERO, t0);
+ }
return A;
}
+ public static void postAllocateMemset0(Pointer A, long size, String opcode) {
+ if(!GPU_FORCE_MEMSET_ZERO) {
+ long t0 = ConfigurationManager.isStatistics() ? System.nanoTime() : 0;
+ cudaMemset(A, 0, size);
+ addMiscTime(opcode, GPUStatistics.cudaMemSet0Time, GPUStatistics.cudaMemSet0Count, GPUInstruction.MISC_TIMER_SET_ZERO, t0);
+ }
+ }
+
private int worstCaseContiguousMemorySizeCompare(GPUObject o1, GPUObject o2) {
long ret = o1.getWorstCaseContiguousMemorySize() - o2.getWorstCaseContiguousMemorySize();
return ret < 0 ? -1 : (ret == 0 ? 0 : 1);
@@ -553,7 +565,7 @@ public class GPUMemoryManager {
* @param instructionLevelTimer member of GPUInstruction
* @param startTime start time
*/
- private void addMiscTime(String opcode, LongAdder globalGPUTimer, LongAdder globalGPUCounter, String instructionLevelTimer, long startTime) {
+ private static void addMiscTime(String opcode, LongAdder globalGPUTimer, LongAdder globalGPUCounter, String instructionLevelTimer, long startTime) {
if(ConfigurationManager.isStatistics()) {
long totalTime = System.nanoTime() - startTime;
globalGPUTimer.add(totalTime);
diff --git a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUObject.java b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUObject.java
index 9d263aa..254c9d7 100644
--- a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUObject.java
+++ b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUObject.java
@@ -434,9 +434,15 @@ public class GPUObject {
start = System.nanoTime();
if (getJcudaSparseMatrixPtr() == null || !isAllocated())
throw new DMLRuntimeException("Expected allocated sparse matrix before sparseToDense() call");
-
- sparseToColumnMajorDense();
- denseColumnMajorToRowMajor();
+ if(getJcudaSparseMatrixPtr().nnz == 0) {
+ long size = ((long) mat.getNumRows()) * getDataTypeSizeOf(mat.getNumColumns());
+ setDensePointer(allocate(size));
+ GPUMemoryManager.postAllocateMemset0(getDensePointer(), size, instructionName);
+ }
+ else {
+ sparseToColumnMajorDense();
+ denseColumnMajorToRowMajor();
+ }
if (ConfigurationManager.isStatistics())
end = System.nanoTime();
if (instructionName != null && ConfigurationManager.isFinegrainedStatistics())
@@ -446,6 +452,10 @@ public class GPUObject {
if (ConfigurationManager.isStatistics())
GPUStatistics.cudaSparseToDenseCount.add(1);
}
+
+ private static long getDataTypeSizeOf(long numElems) {
+ return numElems * ((long) LibMatrixCUDA.sizeOfDataType);
+ }
/**
* More efficient method to convert sparse to dense but returns dense in column major format
@@ -521,10 +531,14 @@ public class GPUObject {
setDensePointer(allocate(size));
// The "fill" kernel is called which treats the matrix "jcudaDensePtr" like a vector and fills it with value "v"
// If the fill value is 0, no need to call the special kernel: the region is zeroed during allocation when sysml.gpu.force.memSetZero is enabled, and by postAllocateMemset0 below otherwise
- if (v != 0)
+ if (v != 0) {
getGPUContext().getKernels()
.launchKernel("fill", ExecutionConfig.getConfigForSimpleVectorOperations(numElems),
getDensePointer(), v, numElems);
+ }
+ else {
+ GPUMemoryManager.postAllocateMemset0(getDensePointer(), size, null);
+ }
}
/**