Posted to commits@systemml.apache.org by ni...@apache.org on 2018/09/13 18:22:02 UTC
systemml git commit: [SYSTEMML-445] Removed unnecessary long-to-int conversion in LSTM
Repository: systemml
Updated Branches:
refs/heads/master 77c98d693 -> e2dc85688
[SYSTEMML-445] Removed unnecessary long-to-int conversion in LSTM
- Minor cleanup of the GPUObject class.
- Also fixed an incorrect forced-GPU configuration flag.
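For context on the long-to-int change: narrowing a long product of matrix dimensions to int silently wraps around once it exceeds Integer.MAX_VALUE, so the LSTM code paths in this commit keep dimensions such as M and D as long and narrow only at the kernel-launch boundary. A minimal sketch of a guarded narrowing helper, assuming overflow should fail fast (illustrative only, not necessarily SystemML's exact toInt):

    public static int toInt(long val) {
        // Fail fast instead of silently wrapping around like a plain (int) cast.
        if (val > Integer.MAX_VALUE || val < Integer.MIN_VALUE)
            throw new IllegalArgumentException("Cannot narrow " + val + " to int");
        return (int) val;
    }

    // Example: D = 50000 features and M = 25000 hidden units give
    // (D+M+2)*(4*M) = 75002 * 100000 = 7,500,200,000 weight-buffer cells,
    // which exceeds Integer.MAX_VALUE (2,147,483,647); formed in int
    // arithmetic the product would wrap, so it is computed in long first.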
Project: http://git-wip-us.apache.org/repos/asf/systemml/repo
Commit: http://git-wip-us.apache.org/repos/asf/systemml/commit/e2dc8568
Tree: http://git-wip-us.apache.org/repos/asf/systemml/tree/e2dc8568
Diff: http://git-wip-us.apache.org/repos/asf/systemml/diff/e2dc8568
Branch: refs/heads/master
Commit: e2dc8568855d353265ac4e0755b9ac3d2b30b1d8
Parents: 77c98d6
Author: Niketan Pansare <np...@us.ibm.com>
Authored: Thu Sep 13 11:17:33 2018 -0700
Committer: Niketan Pansare <np...@us.ibm.com>
Committed: Thu Sep 13 11:17:33 2018 -0700
----------------------------------------------------------------------
.../apache/sysml/conf/ConfigurationManager.java | 2 +-
.../instructions/gpu/DnnGPUInstruction.java | 20 +++---
.../instructions/gpu/context/CSRPointer.java | 8 ---
.../gpu/context/ExecutionConfig.java | 4 +-
.../gpu/context/GPUMemoryManager.java | 12 +++-
.../instructions/gpu/context/GPUObject.java | 72 ++++++++++----------
.../runtime/matrix/data/LibMatrixCuDNN.java | 38 +++++++----
.../matrix/data/LibMatrixCuDNNRnnAlgorithm.java | 56 ++++-----------
.../sysml/runtime/matrix/data/MatrixBlock.java | 3 +-
9 files changed, 100 insertions(+), 115 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/systemml/blob/e2dc8568/src/main/java/org/apache/sysml/conf/ConfigurationManager.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/conf/ConfigurationManager.java b/src/main/java/org/apache/sysml/conf/ConfigurationManager.java
index d9f1906..96c3885 100644
--- a/src/main/java/org/apache/sysml/conf/ConfigurationManager.java
+++ b/src/main/java/org/apache/sysml/conf/ConfigurationManager.java
@@ -258,7 +258,7 @@ public class ConfigurationManager
* @return true if GPU is enabled in forced mode
*/
public static boolean isForcedGPU() {
- return _ldmlOptions.get().isGPU();
+ return _ldmlOptions.get().isForceGPU();
}
/**
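The one-line fix above matters because isGPU() merely reports that GPU execution is enabled, while isForceGPU() reports the separate option that pins every supported operator to the GPU; before this commit, enabling the GPU at all was misreported as forcing it. In short (accessor names from the diff, semantics paraphrased):

    // isGPU()      -> GPU execution is allowed; the compiler may still pick
    //                 CPU operators where they are estimated to be cheaper.
    // isForceGPU() -> GPU execution is forced for all supported operators.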
http://git-wip-us.apache.org/repos/asf/systemml/blob/e2dc8568/src/main/java/org/apache/sysml/runtime/instructions/gpu/DnnGPUInstruction.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/instructions/gpu/DnnGPUInstruction.java b/src/main/java/org/apache/sysml/runtime/instructions/gpu/DnnGPUInstruction.java
index d620de9..6094b6c 100644
--- a/src/main/java/org/apache/sysml/runtime/instructions/gpu/DnnGPUInstruction.java
+++ b/src/main/java/org/apache/sysml/runtime/instructions/gpu/DnnGPUInstruction.java
@@ -595,18 +595,18 @@ public class DnnGPUInstruction extends GPUInstruction {
private void processLstmBackwardInstruction(ExecutionContext ec) throws DMLRuntimeException {
MatrixObject out0 = getMatrixInputForGPUInstruction(ec, _input4.getName());
- int M = toInt(out0.getNumColumns()); // hiddenSize .. since out0: (N, M)
+ long M = out0.getNumColumns(); // hiddenSize .. since out0: (N, M)
Pointer out0Pointer = LibMatrixCUDA.getDensePointer(gCtx, out0, instName);
MatrixObject W = getMatrixInputForGPUInstruction(ec, _input2.getName());
MatrixObject bias = getMatrixInputForGPUInstruction(ec, _input3.getName());
long numRowsW = W.getNumRows();
- int D = toInt(numRowsW) - M; // since W:(D+M, 4M) ... numFeatures
+ long D = numRowsW - M; // since W:(D+M, 4M) ... numFeatures
Pointer sysmlWPointer = LibMatrixCuDNN.getDensePointerForCuDNN(gCtx, W, instName, D+M, 4*M);
Pointer sysmlBiasPointer = LibMatrixCuDNN.getDensePointerForCuDNN(gCtx, bias, instName, 1, 4*M);
Pointer cudnnWPointer = gCtx.allocate(instName, (D+M+2)*(4*M)*LibMatrixCUDA.sizeOfDataType);
LibMatrixCUDA.getCudaKernels(gCtx).launchKernel("prepare_lstm_weight",
- ExecutionConfig.getConfigForSimpleVectorOperations((D+M+2)*(4*M)),
+ ExecutionConfig.getConfigForSimpleVectorOperations(toInt((D+M+2)*(4*M))),
sysmlWPointer, sysmlBiasPointer, cudnnWPointer, D, M);
ec.releaseMatrixInputForGPUInstruction(_input2.getName());
ec.releaseMatrixInputForGPUInstruction(_input3.getName());
@@ -619,7 +619,7 @@ public class DnnGPUInstruction extends GPUInstruction {
int T = toInt(numColsX/ D); // since X:(N, T*D) ... seqLength
Pointer cudnnInput = gCtx.allocate(instName, (N*T*D)*LibMatrixCUDA.sizeOfDataType);
LibMatrixCUDA.getCudaKernels(gCtx).launchKernel("prepare_lstm_input",
- ExecutionConfig.getConfigForSimpleVectorOperations(N*T*D),
+ ExecutionConfig.getConfigForSimpleVectorOperations(toInt(N*T*D)),
xPointer, cudnnInput, N, D, T*D, N*T*D);
ec.releaseMatrixInputForGPUInstruction(_input1.getName());
@@ -656,18 +656,19 @@ public class DnnGPUInstruction extends GPUInstruction {
// previous output out0 (also represented by hx) and cell state c0 (also represented by cx): (N, M) ==> (1, M, N)
// out: (N, T*M) or (N, M) ==> (T, M, N)
MatrixObject out0 = getMatrixInputForGPUInstruction(ec, _input4.getName());
- int M = toInt(out0.getNumColumns()); // hiddenSize .. since out0: (N, M)
+ long M = out0.getNumColumns(); // hiddenSize .. since out0: (N, M)
Pointer out0Pointer = LibMatrixCUDA.getDensePointer(gCtx, out0, instName);
MatrixObject W = getMatrixInputForGPUInstruction(ec, _input2.getName());
MatrixObject bias = getMatrixInputForGPUInstruction(ec, _input3.getName());
long numRowsW = W.getNumRows();
- int D = toInt(numRowsW) - M; // since W:(D+M, 4M) ... numFeatures
+ long D = numRowsW - M; // since W:(D+M, 4M) ... numFeatures
+
Pointer sysmlWPointer = LibMatrixCuDNN.getDensePointerForCuDNN(gCtx, W, instName, D+M, 4*M);
Pointer sysmlBiasPointer = LibMatrixCuDNN.getDensePointerForCuDNN(gCtx, bias, instName, 1, 4*M);
Pointer cudnnWPointer = gCtx.allocate(instName, (D+M+2)*(4*M)*LibMatrixCUDA.sizeOfDataType);
LibMatrixCUDA.getCudaKernels(gCtx).launchKernel("prepare_lstm_weight",
- ExecutionConfig.getConfigForSimpleVectorOperations((D+M+2)*(4*M)),
+ ExecutionConfig.getConfigForSimpleVectorOperations(toInt((D+M+2)*(4*M))),
sysmlWPointer, sysmlBiasPointer, cudnnWPointer, D, M);
ec.releaseMatrixInputForGPUInstruction(_input2.getName());
ec.releaseMatrixInputForGPUInstruction(_input3.getName());
@@ -682,13 +683,14 @@ public class DnnGPUInstruction extends GPUInstruction {
int T = toInt(numColsX/ D); // since X:(N, T*D) ... seqLength
Pointer cudnnInput = gCtx.allocate(instName, (N*T*D)*LibMatrixCUDA.sizeOfDataType);
LibMatrixCUDA.getCudaKernels(gCtx).launchKernel("prepare_lstm_input",
- ExecutionConfig.getConfigForSimpleVectorOperations(N*T*D),
+ ExecutionConfig.getConfigForSimpleVectorOperations(toInt(N*T*D)),
xPointer, cudnnInput, N, D, T*D, N*T*D);
ec.releaseMatrixInputForGPUInstruction(_input1.getName());
Pointer c0Pointer = LibMatrixCUDA.getDensePointer(gCtx, getMatrixInputForGPUInstruction(ec, _input5.getName()), instName);
- LibMatrixCuDNN.lstm(ec, gCtx, instName, cudnnInput, cudnnWPointer, out0Pointer, c0Pointer, return_sequences, _output.getName(), _output2.getName(), N, M, D, T);
+ LibMatrixCuDNN.lstm(ec, gCtx, instName, cudnnInput, cudnnWPointer, out0Pointer, c0Pointer, return_sequences, _output.getName(), _output2.getName(),
+ toInt(N), toInt(M), toInt(D), toInt(T));
gCtx.cudaFreeHelper(instName, cudnnWPointer, gCtx.EAGER_CUDA_FREE);
gCtx.cudaFreeHelper(instName, cudnnInput, gCtx.EAGER_CUDA_FREE);
http://git-wip-us.apache.org/repos/asf/systemml/blob/e2dc8568/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/CSRPointer.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/CSRPointer.java b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/CSRPointer.java
index d7e38b9..135e0b1 100644
--- a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/CSRPointer.java
+++ b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/CSRPointer.java
@@ -476,14 +476,6 @@ public class CSRPointer {
}
/**
- * Calls cudaFree lazily on the allocated {@link Pointer} instances
- *
- */
- public void deallocate() {
- deallocate(getGPUContext().EAGER_CUDA_FREE);
- }
-
- /**
* Calls cudaFree lazily or eagerly on the allocated {@link Pointer} instances
*
* @param eager whether to do eager or lazy cudaFrees
http://git-wip-us.apache.org/repos/asf/systemml/blob/e2dc8568/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/ExecutionConfig.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/ExecutionConfig.java b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/ExecutionConfig.java
index 872fef7..d35e813 100644
--- a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/ExecutionConfig.java
+++ b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/ExecutionConfig.java
@@ -68,8 +68,8 @@ public class ExecutionConfig {
* @return execution configuration
*/
public static ExecutionConfig getConfigForSimpleVectorOperations(int numCells) {
- if(numCells == 0)
- throw new DMLRuntimeException("Attempting to invoke a kernel with 0 threads");
+ if(numCells <= 0)
+ throw new DMLRuntimeException("Attempting to invoke a kernel with " + numCells + " threads");
int deviceNumber = 0;
int blockDimX = getMaxBlockDim(deviceNumber);
int gridDimX = (int) Math.ceil((double) numCells / blockDimX);
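As a worked example of the launch configuration computed above (numbers illustrative): with numCells = 1,000,000 and a maximum block dimension of 1,024 threads,

    int numCells  = 1_000_000;
    int blockDimX = 1_024;                                          // getMaxBlockDim(0)
    int gridDimX  = (int) Math.ceil((double) numCells / blockDimX); // 977 blocks

the kernel launches 977 * 1024 = 1,000,448 threads, and each thread must guard against indices >= numCells. The relaxed check (numCells <= 0 instead of == 0) also catches long products at the toInt(...) call sites elsewhere in this commit that would wrap to a negative int if narrowed unchecked.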
http://git-wip-us.apache.org/repos/asf/systemml/blob/e2dc8568/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUMemoryManager.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUMemoryManager.java b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUMemoryManager.java
index e01c71a..509aafe 100644
--- a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUMemoryManager.java
+++ b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUMemoryManager.java
@@ -53,7 +53,7 @@ public class GPUMemoryManager {
// Developer flag: Use this flag to check for GPU memory leak in SystemML.
// This has an additional overhead of maintaining stack trace of all the allocated GPU pointers via PointerInfo class.
private static final boolean DEBUG_MEMORY_LEAK = false;
- private static final int [] DEBUG_MEMORY_LEAK_STACKTRACE_DEPTH = {5, 6, 7, 8, 9, 10}; // Avoids printing too much text while debuggin
+ private static final int [] DEBUG_MEMORY_LEAK_STACKTRACE_DEPTH = {5, 6, 7, 8, 9, 10, 11}; // Avoids printing too much text while debugging
private final boolean PRINT_GPU_MEMORY_INFO = ConfigurationManager.getDMLConfig().getBooleanValue(DMLConfig.PRINT_GPU_MEMORY_INFO);
@@ -86,7 +86,15 @@ public class GPUMemoryManager {
private Set<Pointer> getNonMatrixLockedPointers() {
Set<Pointer> managedPointers = matrixMemoryManager.getPointers();
managedPointers.addAll(lazyCudaFreeMemoryManager.getAllPointers());
- return nonIn(allPointers.keySet(), managedPointers);
+ Set<Pointer> superSet = allPointers.keySet();
+ Set<Pointer> ret = nonIn(superSet, managedPointers);
+ if(DEBUG_MEMORY_LEAK) {
+ System.out.println(
+ ret.stream().map(p -> p.toString()).collect(Collectors.joining(",")) + " = notIn(>>>" +
+ superSet.stream().map(p -> p.toString()).collect(Collectors.joining(",")) + ">>>, <<<" +
+ managedPointers.stream().map(p -> p.toString()).collect(Collectors.joining(",")) + ">>>)");
+ }
+ return ret;
}
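For readers of the debug output added above: nonIn(superSet, managedPointers) evidently computes the set difference superSet \ managedPointers, i.e. the tracked pointers owned by neither the matrix memory manager nor the lazy-free pool. A minimal sketch of such a helper under that assumption (illustrative, not the project's exact implementation):

    static <T> java.util.Set<T> nonIn(java.util.Set<T> superSet, java.util.Set<T> subSet) {
        // Elements of superSet that are not in subSet.
        java.util.Set<T> ret = new java.util.HashSet<>(superSet);
        ret.removeAll(subSet);
        return ret;
    }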
http://git-wip-us.apache.org/repos/asf/systemml/blob/e2dc8568/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUObject.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUObject.java b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUObject.java
index cfab0d4..1564f48 100644
--- a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUObject.java
+++ b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUObject.java
@@ -91,11 +91,6 @@ public class GPUObject {
AtomicLong timestamp = new AtomicLong();
/**
- * Whether this block is in sparse format
- */
- protected boolean isSparse = false;
-
- /**
* Enclosing {@link MatrixObject} instance
*/
MatrixObject mat = null;
@@ -131,10 +126,29 @@ public class GPUObject {
/**
* Removes the dense pointer and potential soft reference
+ *
+ * @param opcode opcode of the instruction
+ * @param eager whether to delete eagerly
*/
- public void clearDensePointer() {
- jcudaDenseMatrixPtr = null;
+ public void clearDensePointer(String opcode, boolean eager) {
+ if (!isDensePointerNull()) {
+ getGPUContext().cudaFreeHelper(opcode, getDensePointer(), eager);
+ }
shadowBuffer.clearShadowPointer();
+ jcudaDenseMatrixPtr = null;
+ }
+
+ /**
+ * Removes the sparse pointer
+ *
+ * @param opcode opcode of the instruction
+ * @param eager whether to delete eagerly
+ */
+ public void clearSparsePointer(String opcode, boolean eager) {
+ if (getJcudaSparseMatrixPtr() != null) {
+ getJcudaSparseMatrixPtr().deallocate(eager);
+ }
+ jcudaSparseMatrixPtr = null;
}
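The two methods above fold the cudaFree call into the pointer-clearing step, so call sites that previously paired cudaFreeHelper(...) with clearDensePointer() (see the transpose hunks later in this diff) collapse to a single call:

    // Before this commit, two steps that were easy to get out of order:
    //   cudaFreeHelper(getDensePointer());
    //   clearDensePointer();
    // After, one step; eager=true frees immediately, opcode may be null:
    clearDensePointer(null, true);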
@@ -147,14 +161,14 @@ public class GPUObject {
if (!this.isDensePointerNull()) {
throw new DMLRuntimeException("jcudaDenseMatrixPtr was already allocated for " + this + ", this will cause a memory leak on the GPU");
}
+ clearSparsePointer(null, true);
this.jcudaDenseMatrixPtr = densePtr;
- this.isSparse = false;
if(LOG.isDebugEnabled()) {
LOG.debug("Setting dense pointer of size " + getGPUContext().getMemoryManager().getSizeAllocatedGPUPointer(densePtr));
}
- if (getJcudaSparseMatrixPtr() != null) {
- getJcudaSparseMatrixPtr().deallocate();
- jcudaSparseMatrixPtr = null;
+ if(!gpuContext.getMemoryManager().getGPUMatrixMemoryManager().gpuObjects.contains(this)) {
+ // Double-check if the matrix manager still has the current GPU object in case of eviction.
+ gpuContext.getMemoryManager().getGPUMatrixMemoryManager().addGPUObject(this);
}
}
// ----------------------------------------------------------------------
@@ -170,7 +184,6 @@ public class GPUObject {
that.writeLock = false;
that.timestamp = new AtomicLong(me.timestamp.get());
- that.isSparse = me.isSparse;
try {
if (!me.isDensePointerNull()) {
@@ -197,10 +210,6 @@ public class GPUObject {
return getGPUContext().allocate(null, size);
}
- private void cudaFreeHelper(Pointer toFree) throws DMLRuntimeException {
- getGPUContext().cudaFreeHelper(null, toFree, gpuContext.EAGER_CUDA_FREE);
- }
-
public GPUContext getGPUContext() {
return gpuContext;
}
@@ -300,11 +309,11 @@ public class GPUObject {
if (this.jcudaSparseMatrixPtr != null) {
throw new DMLRuntimeException("jcudaSparseMatrixPtr was already allocated for " + this + ", this will cause a memory leak on the GPU");
}
+ clearDensePointer(null, true);
this.jcudaSparseMatrixPtr = sparseMatrixPtr;
- this.isSparse = true;
- if (!isDensePointerNull() && !shadowBuffer.isBuffered()) {
- cudaFreeHelper(getDensePointer());
- clearDensePointer();
+ if(!gpuContext.getMemoryManager().getGPUMatrixMemoryManager().gpuObjects.contains(this)) {
+ // Double-check if the matrix manager still has the current GPU object in case of eviction.
+ gpuContext.getMemoryManager().getGPUMatrixMemoryManager().addGPUObject(this);
}
}
@@ -354,8 +363,7 @@ public class GPUObject {
}
Pointer tmp = transpose(getGPUContext(), getDensePointer(), m, n, lda, ldc);
- cudaFreeHelper(getDensePointer());
- clearDensePointer();
+ clearDensePointer(null, true);
setDensePointer(tmp);
}
@@ -376,8 +384,7 @@ public class GPUObject {
}
Pointer tmp = transpose(getGPUContext(), getDensePointer(), m, n, lda, ldc);
- cudaFreeHelper(getDensePointer());
- clearDensePointer();
+ clearDensePointer(null, true);
setDensePointer(tmp);
}
@@ -446,7 +453,7 @@ public class GPUObject {
}
public boolean isSparse() {
- return isSparse;
+ return jcudaSparseMatrixPtr != null;
}
private static long getDatatypeSizeOf(long numElems) {
@@ -602,7 +609,6 @@ public class GPUObject {
LOG.trace("GPU : acquireDeviceModifySparse on " + this + ", GPUContext=" + getGPUContext());
}
boolean allocated = false;
- isSparse = true;
if (!isAllocated()) {
if(LOG.isTraceEnabled()) {
LOG.trace("GPU : data is not allocated, allocating a sparse block, on " + this);
@@ -995,22 +1001,15 @@ public class GPUObject {
* Clears the data associated with this {@link GPUObject} instance
*
* @param opcode opcode of the instruction
- * @param eager whether to be done synchronously or asynchronously
+ * @param eager whether to delete eagerly
* @throws DMLRuntimeException if error occurs
*/
public void clearData(String opcode, boolean eager) throws DMLRuntimeException {
if(LOG.isTraceEnabled()) {
LOG.trace("GPU : clearData on " + this + ", GPUContext=" + getGPUContext());
}
- if (!isDensePointerNull()) {
- getGPUContext().cudaFreeHelper(opcode, getDensePointer(), eager);
- }
- if (getJcudaSparseMatrixPtr() != null) {
- getJcudaSparseMatrixPtr().deallocate(eager);
- }
- clearDensePointer();
- shadowBuffer.clearShadowPointer();
- jcudaSparseMatrixPtr = null;
+ clearDensePointer(opcode, eager);
+ clearSparsePointer(opcode, eager);
resetReadWriteLock();
getGPUContext().getMemoryManager().removeGPUObject(this);
}
@@ -1039,7 +1038,6 @@ public class GPUObject {
sb.append(", dirty=").append(dirty);
sb.append(", readLocks=").append(readLocks.longValue());
sb.append(", writeLock=").append(writeLock);
- sb.append(", sparse? ").append(isSparse);
sb.append(", dims=[").append(mat.getNumRows()).append(",").append(mat.getNumColumns()).append("]");
if(!isDensePointerNull())
sb.append(", densePtr=").append(getDensePointer());
http://git-wip-us.apache.org/repos/asf/systemml/blob/e2dc8568/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuDNN.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuDNN.java b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuDNN.java
index e7955e1..8051cbc 100644
--- a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuDNN.java
+++ b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuDNN.java
@@ -849,14 +849,14 @@ public class LibMatrixCuDNN extends LibMatrixCUDA {
static Pointer getDenseInputPointer(ExecutionContext ec, GPUContext gCtx, String instName, String inputName,
long numRows, long numCols) throws DMLRuntimeException {
MatrixObject output = ec.getMatrixInputForGPUInstruction(inputName, instName);
- return LibMatrixCuDNN.getDensePointerForCuDNN(gCtx, output, instName, toInt(numRows), toInt(numCols));
+ return LibMatrixCuDNN.getDensePointerForCuDNN(gCtx, output, instName, numRows, numCols);
}
static Pointer getDenseOutputPointer(ExecutionContext ec, GPUContext gCtx, String instName, String outputName,
long numRows, long numCols) throws DMLRuntimeException {
MatrixObject output = ec.getMatrixObject(outputName);
getDenseMatrixOutputForGPUInstruction(ec, instName, outputName, numRows, numCols); // Allocated the dense output matrix
- return getDensePointerForCuDNN(gCtx, output, instName, toInt(numRows), toInt(numCols));
+ return getDensePointerForCuDNN(gCtx, output, instName, numRows, numCols);
}
/**
@@ -890,9 +890,14 @@ public class LibMatrixCuDNN extends LibMatrixCUDA {
String outputName, String cyName, // output
String rnnMode, boolean return_sequences, int N, int M, int D, int T) throws DMLRuntimeException {
boolean hasCarry = rnnMode.equalsIgnoreCase("lstm");
+ if(LOG.isDebugEnabled()) {
+ long memRequired = (N*T*M + 2*N*M + N*T*M)*sizeOfDataType;
+ LOG.debug("Memory required for invoking lstmForward is " + memRequired + " bytes + workspace + reserve space + memory for descriptors.");
+ }
+
// Get output pointers
Pointer cudnnYPointer = gCtx.allocate(instName, N*T*M*sizeOfDataType);
- Pointer hyPointer = !return_sequences ? getDenseOutputPointer(ec, gCtx, instName, outputName, N, M) : gCtx.allocate(instName, N*M*sizeOfDataType);
+ Pointer hyPointer = return_sequences ? gCtx.allocate(instName, N*M*sizeOfDataType) : getDenseOutputPointer(ec, gCtx, instName, outputName, N, M);
Pointer cyPointer = hasCarry ? getDenseOutputPointer(ec, gCtx, instName, cyName, N, M) : new Pointer();
// Pointer wPointer = getDensePointerForCuDNN(gCtx, w, instName, D+M+2, 4*M);
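To make the new debug estimate above concrete (shapes illustrative): with N = 64 sequences, T = 50 time steps, M = 256 hidden units, and 8-byte doubles,

    int N = 64, T = 50, M = 256;                        // illustrative shapes
    long memRequired = (N*T*M + 2L*N*M + N*T*M) * 8L;   // sizeOfDataType = 8 for double
    // = (819,200 + 32,768 + 819,200) * 8
    // = 1,671,168 * 8 = 13,369,344 bytes (~12.75 MiB),
    // excluding workspace, reserve space, and descriptor memory.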
@@ -922,20 +927,27 @@ public class LibMatrixCuDNN extends LibMatrixCUDA {
public static void lstmBackward(ExecutionContext ec, GPUContext gCtx, String instName,
Pointer x, Pointer hx, Pointer cx, Pointer wPointer, String doutName, String dcyName, // input
String dxName, String dwName, String dbName, String dhxName, String dcxName, // output
- boolean return_sequences, int N, int M, int D, int T) throws DMLRuntimeException {
+ boolean return_sequences, long N, long M, long D, long T) throws DMLRuntimeException {
+
+ if(LOG.isDebugEnabled()) {
+ long memRequired = (N*T*M + (return_sequences ? T*M : M) + N*T*M + 2*N*T*D + (D+M+2)*(4*M))*sizeOfDataType;
+ LOG.debug("Memory required for invoking lstmBackward is " + memRequired + " bytes + workspace + reserve space + memory for descriptors.");
+ }
+
// Transform the input dout and prepare them for cudnnRNNBackwardData
Pointer dy = gCtx.allocate(instName, N*T*M*sizeOfDataType);
- int size = return_sequences ? N*T*M : N*M;
+ long size = return_sequences ? N*T*M : N*M;
LibMatrixCUDA.getCudaKernels(gCtx).launchKernel("prepare_lstm_backward_gradients",
- ExecutionConfig.getConfigForSimpleVectorOperations(size),
+ ExecutionConfig.getConfigForSimpleVectorOperations(toInt(size)),
getDenseInputPointer(ec, gCtx, instName, doutName, N, return_sequences ? T*M : M),
dy, N, T, M, size, return_sequences ? 1 : 0);
ec.releaseMatrixInputForGPUInstruction(doutName);
// Allocate intermediate pointers computed by forward
Pointer yPointer = gCtx.allocate(instName, N*T*M*sizeOfDataType);
- try(LibMatrixCuDNNRnnAlgorithm algo = new LibMatrixCuDNNRnnAlgorithm(ec, gCtx, instName, "lstm", N, T, M, D, true, wPointer)) {
- JCudnn.cudnnRNNForwardTraining(gCtx.getCudnnHandle(), algo.rnnDesc, T,
+ try(LibMatrixCuDNNRnnAlgorithm algo = new LibMatrixCuDNNRnnAlgorithm(ec, gCtx, instName, "lstm", toInt(N), toInt(T),
+ toInt(M), toInt(D), true, wPointer)) {
+ JCudnn.cudnnRNNForwardTraining(gCtx.getCudnnHandle(), algo.rnnDesc, toInt(T),
algo.xDesc, x,
algo.hxDesc, hx,
algo.cxDesc, cx,
@@ -947,7 +959,7 @@ public class LibMatrixCuDNN extends LibMatrixCUDA {
algo.reserveSpace, algo.reserveSpaceSizeInBytes);
Pointer cudnnDx = gCtx.allocate(instName, N*T*D*LibMatrixCUDA.sizeOfDataType);
- JCudnn.cudnnRNNBackwardData(gCtx.getCudnnHandle(), algo.rnnDesc, T,
+ JCudnn.cudnnRNNBackwardData(gCtx.getCudnnHandle(), algo.rnnDesc, toInt(T),
algo.yDesc, yPointer,
// ----------------------
// Additional inputs:
@@ -973,14 +985,14 @@ public class LibMatrixCuDNN extends LibMatrixCUDA {
Pointer smlDx = getDenseOutputPointer(ec, gCtx, instName, dxName, N, T*D);
LibMatrixCUDA.getCudaKernels(gCtx).launchKernel("prepare_lstm_dinput",
- ExecutionConfig.getConfigForSimpleVectorOperations(N*T*D),
+ ExecutionConfig.getConfigForSimpleVectorOperations(toInt(N*T*D)),
smlDx, cudnnDx, N, D, T*D, N*T*D);
ec.releaseMatrixOutputForGPUInstruction(dxName);
gCtx.cudaFreeHelper(instName, cudnnDx, gCtx.EAGER_CUDA_FREE);
// -------------------------------------------------------------------------------------------
Pointer cudnnDwPointer = gCtx.allocate(instName, (D+M+2)*(4*M)*LibMatrixCUDA.sizeOfDataType);
- JCudnn.cudnnRNNBackwardWeights(gCtx.getCudnnHandle(), algo.rnnDesc, T,
+ JCudnn.cudnnRNNBackwardWeights(gCtx.getCudnnHandle(), algo.rnnDesc, toInt(T),
algo.xDesc, x,
algo.hxDesc, hx,
algo.yDesc, yPointer,
@@ -988,7 +1000,7 @@ public class LibMatrixCuDNN extends LibMatrixCUDA {
algo.dwDesc, cudnnDwPointer,
algo.reserveSpace, algo.reserveSpaceSizeInBytes);
LibMatrixCUDA.getCudaKernels(gCtx).launchKernel("prepare_lstm_dweight",
- ExecutionConfig.getConfigForSimpleVectorOperations((D+M+2)*(4*M)),
+ ExecutionConfig.getConfigForSimpleVectorOperations(toInt((D+M+2)*(4*M))),
getDenseOutputPointer(ec, gCtx, instName, dwName, D+M, 4*M),
getDenseOutputPointer(ec, gCtx, instName, dbName, 1, 4*M), cudnnDwPointer, D, M);
gCtx.cudaFreeHelper(instName, cudnnDwPointer, gCtx.EAGER_CUDA_FREE);
@@ -1242,7 +1254,7 @@ public class LibMatrixCuDNN extends LibMatrixCUDA {
* @return jcuda pointer
* @throws DMLRuntimeException if error occurs while sparse to dense conversion
*/
- public static Pointer getDensePointerForCuDNN(GPUContext gCtx, MatrixObject image, String instName, int numRows, int numCols) throws DMLRuntimeException {
+ public static Pointer getDensePointerForCuDNN(GPUContext gCtx, MatrixObject image, String instName, long numRows, long numCols) throws DMLRuntimeException {
long numElems = image.getNumRows()*image.getNumColumns();
if(image.getNumRows() != numRows || image.getNumColumns() != numCols) {
throw new DMLRuntimeException("Expected input of size:[" + numRows + ", " + numCols + "], but found [" + image.getNumRows() + ", " + image.getNumColumns() + "].");
http://git-wip-us.apache.org/repos/asf/systemml/blob/e2dc8568/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuDNNRnnAlgorithm.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuDNNRnnAlgorithm.java b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuDNNRnnAlgorithm.java
index 7b2c601..a1d799d 100644
--- a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuDNNRnnAlgorithm.java
+++ b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuDNNRnnAlgorithm.java
@@ -32,6 +32,8 @@ import static jcuda.jcudnn.cudnnRNNInputMode.CUDNN_LINEAR_INPUT;
import static jcuda.jcudnn.cudnnDirectionMode.CUDNN_UNIDIRECTIONAL;
import static jcuda.jcudnn.cudnnRNNAlgo.CUDNN_RNN_ALGO_STANDARD;
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
import org.apache.sysml.runtime.DMLRuntimeException;
import org.apache.sysml.runtime.controlprogram.context.ExecutionContext;
import org.apache.sysml.runtime.instructions.gpu.context.GPUContext;
@@ -44,6 +46,7 @@ import jcuda.jcudnn.cudnnRNNDescriptor;
import jcuda.jcudnn.cudnnTensorDescriptor;
public class LibMatrixCuDNNRnnAlgorithm implements java.lang.AutoCloseable {
+ private static final Log LOG = LogFactory.getLog(LibMatrixCuDNNRnnAlgorithm.class.getName());
GPUContext gCtx;
String instName;
cudnnDropoutDescriptor dropoutDesc;
@@ -87,8 +90,11 @@ public class LibMatrixCuDNNRnnAlgorithm implements java.lang.AutoCloseable {
JCudnn.cudnnDropoutGetStatesSize(gCtx.getCudnnHandle(), _dropOutSizeInBytes);
dropOutSizeInBytes = _dropOutSizeInBytes[0];
dropOutStateSpace = new Pointer();
- if (dropOutSizeInBytes != 0)
+ if (dropOutSizeInBytes != 0) {
+ if(LOG.isDebugEnabled())
+ LOG.debug("Allocating " + dropOutSizeInBytes + " bytes for lstm dropout space.");
dropOutStateSpace = gCtx.allocate(instName, dropOutSizeInBytes);
+ }
JCudnn.cudnnSetDropoutDescriptor(dropoutDesc, gCtx.getCudnnHandle(), 0, dropOutStateSpace, dropOutSizeInBytes, 12345);
// Initialize RNN descriptor
@@ -109,55 +115,20 @@ public class LibMatrixCuDNNRnnAlgorithm implements java.lang.AutoCloseable {
// Setup workspace
workSpace = new Pointer(); reserveSpace = new Pointer();
sizeInBytes = getWorkspaceSize(T);
- if(sizeInBytes != 0)
+ if(sizeInBytes != 0) {
+ if(LOG.isDebugEnabled())
+ LOG.debug("Allocating " + sizeInBytes + " bytes for lstm workspace.");
workSpace = gCtx.allocate(instName, sizeInBytes);
+ }
reserveSpaceSizeInBytes = 0;
if(isTraining) {
reserveSpaceSizeInBytes = getReservespaceSize(T);
if (reserveSpaceSizeInBytes != 0) {
+ if(LOG.isDebugEnabled())
+ LOG.debug("Allocating " + reserveSpaceSizeInBytes + " bytes for lstm reserve space.");
reserveSpace = gCtx.allocate(instName, reserveSpaceSizeInBytes);
}
}
- /*
- int numLinearLayers = getNumLinearLayers(rnnMode);
- for(int i = 0; i < numLinearLayers; i++) {
- cudnnFilterDescriptor linLayerMatDesc = new cudnnFilterDescriptor();
- cudnnCreateFilterDescriptor(linLayerMatDesc);
- Pointer linLayerMat = new Pointer();
- JCudnn.cudnnGetRNNLinLayerMatrixParams(gCtx.getCudnnHandle(), rnnDesc, 0,
- xDesc[0], wDesc, w, i, linLayerMatDesc, linLayerMat);
- int[] dataType = new int[] {-1};
- int[] format = new int[] {-1};
- int[] nbDims = new int[] {-1};
- int[] filterDimA = new int[3];
- JCudnn.cudnnGetFilterNdDescriptor(linLayerMatDesc, 3, dataType, format, nbDims, filterDimA);
-
- int filterDims = filterDimA[0] * filterDimA[1] * filterDimA[2];
- double [] tmp = new double[filterDims];
- LibMatrixCUDA.cudaSupportFunctions.deviceToHost(gCtx, linLayerMat, tmp, instName, false);
- System.out.println();
- for(int j = 0 ; j < tmp.length; j++) {
- System.out.print(" " + tmp[j]);
- }
- System.out.println();
- LibMatrixCUDA.getCudaKernels(gCtx).launchKernel("fill",
- org.apache.sysml.runtime.instructions.gpu.context.ExecutionConfig.getConfigForSimpleVectorOperations(filterDims),
- linLayerMat, Math.pow(filterDims, -1), filterDims);
- JCudnn.cudnnDestroyFilterDescriptor(linLayerMatDesc);
-
- cudnnFilterDescriptor linLayerBiasDesc = new cudnnFilterDescriptor();
- cudnnCreateFilterDescriptor(linLayerBiasDesc);
- Pointer linLayerBias = new Pointer();
- JCudnn.cudnnGetRNNLinLayerBiasParams(gCtx.getCudnnHandle(), rnnDesc, 0,
- xDesc[0], wDesc, w, i, linLayerBiasDesc, linLayerBias);
- JCudnn.cudnnGetFilterNdDescriptor(linLayerBiasDesc, 3, dataType, format, nbDims, filterDimA);
- filterDims = filterDimA[0] * filterDimA[1] * filterDimA[2];
- LibMatrixCUDA.getCudaKernels(gCtx).launchKernel("fill",
- org.apache.sysml.runtime.instructions.gpu.context.ExecutionConfig.getConfigForSimpleVectorOperations(filterDims),
- linLayerBias, Math.pow(filterDims, -1), filterDims);
- JCudnn.cudnnDestroyFilterDescriptor(linLayerBiasDesc);
- }
- */
}
@SuppressWarnings("unused")
@@ -321,5 +292,6 @@ public class LibMatrixCuDNNRnnAlgorithm implements java.lang.AutoCloseable {
throw new RuntimeException(e);
}
}
+ dropOutStateSpace = null;
}
}
http://git-wip-us.apache.org/repos/asf/systemml/blob/e2dc8568/src/main/java/org/apache/sysml/runtime/matrix/data/MatrixBlock.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/matrix/data/MatrixBlock.java b/src/main/java/org/apache/sysml/runtime/matrix/data/MatrixBlock.java
index 7af164e..25423c1 100644
--- a/src/main/java/org/apache/sysml/runtime/matrix/data/MatrixBlock.java
+++ b/src/main/java/org/apache/sysml/runtime/matrix/data/MatrixBlock.java
@@ -516,7 +516,8 @@ public class MatrixBlock extends MatrixValue implements CacheBlock, Externalizab
//this method is used as a short-hand for all operations that
//guaranteed only deal with dense blocks of a single block.
if( denseBlock != null && denseBlock.numBlocks() > 1 ) {
- throw new RuntimeException("Large dense in-memory block (with numblocks="+denseBlock.numBlocks()+") "
+ throw new RuntimeException("Large dense in-memory block (with numblocks="+denseBlock.numBlocks()+ ") with "
+ + "dimensions [" + getNumRows() + ", " + getNumColumns() + "] "
+ "allocated but operation access to first block only, which might cause incorrect results.");
}
return (denseBlock != null) ? denseBlock.valuesAt(0) : null;