Posted to commits@systemml.apache.org by ni...@apache.org on 2018/09/13 18:22:02 UTC

systemml git commit: [SYSTEMML-445] Removed unnecessary long-to-int conversion in LSTM

Repository: systemml
Updated Branches:
  refs/heads/master 77c98d693 -> e2dc85688


[SYSTEMML-445] Removed unnecessary long-to-int conversion in LSTM

- Minor cleanup of the GPUObject class.
- Also fixed an incorrect forced-GPU configuration flag: ConfigurationManager.isForcedGPU() delegated to isGPU() instead of isForceGPU().
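
For readers following the diff: the change does not drop range checking, it defers it. LSTM dimensions now stay long through the size arithmetic and are narrowed once, at the int-typed kernel-launch and cuDNN boundaries, by a checked helper. A minimal sketch of that pattern (an assumption, not SystemML's verbatim toInt; Math.toIntExact is the JDK equivalent):

    public class CheckedCast {
        // Checked narrowing: fail fast instead of silently wrapping around.
        public static int toInt(long val) {
            if (val != (int) val)
                throw new RuntimeException("Cannot convert " + val + " to int without overflow");
            return (int) val;
        }

        public static void main(String[] args) {
            System.out.println(toInt(4096L * 1024));        // 4194304: fits
            System.out.println(toInt(4096L * 1024 * 1024)); // 2^32: throws
        }
    }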


Project: http://git-wip-us.apache.org/repos/asf/systemml/repo
Commit: http://git-wip-us.apache.org/repos/asf/systemml/commit/e2dc8568
Tree: http://git-wip-us.apache.org/repos/asf/systemml/tree/e2dc8568
Diff: http://git-wip-us.apache.org/repos/asf/systemml/diff/e2dc8568

Branch: refs/heads/master
Commit: e2dc8568855d353265ac4e0755b9ac3d2b30b1d8
Parents: 77c98d6
Author: Niketan Pansare <np...@us.ibm.com>
Authored: Thu Sep 13 11:17:33 2018 -0700
Committer: Niketan Pansare <np...@us.ibm.com>
Committed: Thu Sep 13 11:17:33 2018 -0700

----------------------------------------------------------------------
 .../apache/sysml/conf/ConfigurationManager.java |  2 +-
 .../instructions/gpu/DnnGPUInstruction.java     | 20 +++---
 .../instructions/gpu/context/CSRPointer.java    |  8 ---
 .../gpu/context/ExecutionConfig.java            |  4 +-
 .../gpu/context/GPUMemoryManager.java           | 12 +++-
 .../instructions/gpu/context/GPUObject.java     | 72 ++++++++++----------
 .../runtime/matrix/data/LibMatrixCuDNN.java     | 38 +++++++----
 .../matrix/data/LibMatrixCuDNNRnnAlgorithm.java | 56 ++++-----------
 .../sysml/runtime/matrix/data/MatrixBlock.java  |  3 +-
 9 files changed, 100 insertions(+), 115 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/systemml/blob/e2dc8568/src/main/java/org/apache/sysml/conf/ConfigurationManager.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/conf/ConfigurationManager.java b/src/main/java/org/apache/sysml/conf/ConfigurationManager.java
index d9f1906..96c3885 100644
--- a/src/main/java/org/apache/sysml/conf/ConfigurationManager.java
+++ b/src/main/java/org/apache/sysml/conf/ConfigurationManager.java
@@ -258,7 +258,7 @@ public class ConfigurationManager
 	 * @return true if GPU is enabled in forced mode
 	 */
 	public static boolean isForcedGPU() {
-		return _ldmlOptions.get().isGPU();
+		return _ldmlOptions.get().isForceGPU();
 	}
 	
 	/**

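The one-line fix above matters because the two flags answer different questions. A hypothetical stand-in for DMLOptions (field names and semantics here are illustrative assumptions) shows the distinction:

    class Options {
        boolean gpu;      // GPU enabled: eligible operators may run on the GPU
        boolean forceGPU; // forced mode: eligible operators must run on the GPU

        boolean isGPU()      { return gpu; }
        boolean isForceGPU() { return forceGPU; }
    }
    // With gpu=true and forceGPU=false, the old isForcedGPU() (delegating to
    // isGPU()) wrongly reported forced mode; delegating to isForceGPU() does not.
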
http://git-wip-us.apache.org/repos/asf/systemml/blob/e2dc8568/src/main/java/org/apache/sysml/runtime/instructions/gpu/DnnGPUInstruction.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/instructions/gpu/DnnGPUInstruction.java b/src/main/java/org/apache/sysml/runtime/instructions/gpu/DnnGPUInstruction.java
index d620de9..6094b6c 100644
--- a/src/main/java/org/apache/sysml/runtime/instructions/gpu/DnnGPUInstruction.java
+++ b/src/main/java/org/apache/sysml/runtime/instructions/gpu/DnnGPUInstruction.java
@@ -595,18 +595,18 @@ public class DnnGPUInstruction extends GPUInstruction {
 	
 	private void processLstmBackwardInstruction(ExecutionContext ec) throws DMLRuntimeException {
 		MatrixObject out0 = getMatrixInputForGPUInstruction(ec, _input4.getName());
-		int M = toInt(out0.getNumColumns()); // hiddenSize .. since out0: (N, M)
+		long M = out0.getNumColumns(); // hiddenSize .. since out0: (N, M)
 		Pointer out0Pointer =  LibMatrixCUDA.getDensePointer(gCtx, out0, instName);
 		
 		MatrixObject W = getMatrixInputForGPUInstruction(ec, _input2.getName());
 		MatrixObject bias = getMatrixInputForGPUInstruction(ec, _input3.getName());
 		long numRowsW = W.getNumRows();
-		int D = toInt(numRowsW) - M; // since W:(D+M, 4M) ... numFeatures 
+		long D = numRowsW - M; // since W:(D+M, 4M) ... numFeatures 
 		Pointer sysmlWPointer = LibMatrixCuDNN.getDensePointerForCuDNN(gCtx, W, instName, D+M, 4*M);
 		Pointer sysmlBiasPointer = LibMatrixCuDNN.getDensePointerForCuDNN(gCtx, bias, instName, 1, 4*M);
 		Pointer cudnnWPointer = gCtx.allocate(instName, (D+M+2)*(4*M)*LibMatrixCUDA.sizeOfDataType);
 		LibMatrixCUDA.getCudaKernels(gCtx).launchKernel("prepare_lstm_weight",
-				ExecutionConfig.getConfigForSimpleVectorOperations((D+M+2)*(4*M)),
+				ExecutionConfig.getConfigForSimpleVectorOperations(toInt((D+M+2)*(4*M))),
 				sysmlWPointer, sysmlBiasPointer, cudnnWPointer, D, M);
 		ec.releaseMatrixInputForGPUInstruction(_input2.getName());
 		ec.releaseMatrixInputForGPUInstruction(_input3.getName());
@@ -619,7 +619,7 @@ public class DnnGPUInstruction extends GPUInstruction {
 		int T = toInt(numColsX/ D); // since X:(N, T*D) ... seqLength
 		Pointer cudnnInput = gCtx.allocate(instName, (N*T*D)*LibMatrixCUDA.sizeOfDataType);
 		LibMatrixCUDA.getCudaKernels(gCtx).launchKernel("prepare_lstm_input",
-				ExecutionConfig.getConfigForSimpleVectorOperations(N*T*D),
+				ExecutionConfig.getConfigForSimpleVectorOperations(toInt(N*T*D)),
 				xPointer, cudnnInput, N, D, T*D, N*T*D);
 		ec.releaseMatrixInputForGPUInstruction(_input1.getName());
 		
@@ -656,18 +656,19 @@ public class DnnGPUInstruction extends GPUInstruction {
 		// previous output out0 (also represented by hx) and cell state c0 (also represented by cx): (N, M) ==> (1, M, N)
 		// out: (N, T*M) or (N, M) ==> (T, M, N)
 		MatrixObject out0 = getMatrixInputForGPUInstruction(ec, _input4.getName());
-		int M = toInt(out0.getNumColumns()); // hiddenSize .. since out0: (N, M)
+		long M = out0.getNumColumns(); // hiddenSize .. since out0: (N, M)
 		Pointer out0Pointer =  LibMatrixCUDA.getDensePointer(gCtx, out0, instName);
 		
 		MatrixObject W = getMatrixInputForGPUInstruction(ec, _input2.getName());
 		MatrixObject bias = getMatrixInputForGPUInstruction(ec, _input3.getName());
 		long numRowsW = W.getNumRows();
-		int D = toInt(numRowsW) - M; // since W:(D+M, 4M) ... numFeatures 
+		long D = numRowsW - M; // since W:(D+M, 4M) ... numFeatures
+		
 		Pointer sysmlWPointer = LibMatrixCuDNN.getDensePointerForCuDNN(gCtx, W, instName, D+M, 4*M);
 		Pointer sysmlBiasPointer = LibMatrixCuDNN.getDensePointerForCuDNN(gCtx, bias, instName, 1, 4*M);
 		Pointer cudnnWPointer = gCtx.allocate(instName, (D+M+2)*(4*M)*LibMatrixCUDA.sizeOfDataType);
 		LibMatrixCUDA.getCudaKernels(gCtx).launchKernel("prepare_lstm_weight",
-				ExecutionConfig.getConfigForSimpleVectorOperations((D+M+2)*(4*M)),
+				ExecutionConfig.getConfigForSimpleVectorOperations(toInt((D+M+2)*(4*M))),
 				sysmlWPointer, sysmlBiasPointer, cudnnWPointer, D, M);
 		ec.releaseMatrixInputForGPUInstruction(_input2.getName());
 		ec.releaseMatrixInputForGPUInstruction(_input3.getName());
@@ -682,13 +683,14 @@ public class DnnGPUInstruction extends GPUInstruction {
 		int T = toInt(numColsX/ D); // since X:(N, T*D) ... seqLength
 		Pointer cudnnInput = gCtx.allocate(instName, (N*T*D)*LibMatrixCUDA.sizeOfDataType);
 		LibMatrixCUDA.getCudaKernels(gCtx).launchKernel("prepare_lstm_input",
-				ExecutionConfig.getConfigForSimpleVectorOperations(N*T*D),
+				ExecutionConfig.getConfigForSimpleVectorOperations(toInt(N*T*D)),
 				xPointer, cudnnInput, N, D, T*D, N*T*D);
 		ec.releaseMatrixInputForGPUInstruction(_input1.getName());
 		
 		Pointer c0Pointer = LibMatrixCUDA.getDensePointer(gCtx, getMatrixInputForGPUInstruction(ec, _input5.getName()), instName); 
 		
-		LibMatrixCuDNN.lstm(ec, gCtx, instName, cudnnInput, cudnnWPointer, out0Pointer, c0Pointer, return_sequences, _output.getName(), _output2.getName(), N, M, D, T);
+		LibMatrixCuDNN.lstm(ec, gCtx, instName, cudnnInput, cudnnWPointer, out0Pointer, c0Pointer, return_sequences, _output.getName(), _output2.getName(), 
+				toInt(N), toInt(M), toInt(D), toInt(T));
 		gCtx.cudaFreeHelper(instName, cudnnWPointer, gCtx.EAGER_CUDA_FREE);
 		gCtx.cudaFreeHelper(instName, cudnnInput, gCtx.EAGER_CUDA_FREE);
 		

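This file carries the core of the change: M and D are now held as long, products such as (D+M+2)*(4*M) and N*T*D are computed in 64-bit arithmetic, and the single checked narrowing happens at the kernel-launch argument. A sketch of the failure mode being avoided (dimensions are illustrative; Math.toIntExact stands in for toInt):

    public class LongDims {
        public static void main(String[] args) {
            int  N = 1024, T = 512, D = 8192;               // illustrative LSTM sizes
            int  badCells  = N * T * D;                     // 32-bit product: wraps to 0
            long goodCells = (long) N * T * D;              // 64-bit product: exact
            System.out.println(badCells);                   // 0: silent overflow
            System.out.println(Math.toIntExact(goodCells)); // throws: 2^32 > Integer.MAX_VALUE
        }
    }
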
http://git-wip-us.apache.org/repos/asf/systemml/blob/e2dc8568/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/CSRPointer.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/CSRPointer.java b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/CSRPointer.java
index d7e38b9..135e0b1 100644
--- a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/CSRPointer.java
+++ b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/CSRPointer.java
@@ -476,14 +476,6 @@ public class CSRPointer {
 	}
 
 	/**
-	 * Calls cudaFree lazily on the allocated {@link Pointer} instances
-	 *
-	 */
-	public void deallocate() {
-		deallocate(getGPUContext().EAGER_CUDA_FREE);
-	}
-
-	/**
 	 * Calls cudaFree lazily or eagerly on the allocated {@link Pointer} instances
 	 *
 	 * @param eager whether to do eager or lazy cudaFrees

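Removing the zero-argument overload makes the free policy explicit at every call site; the deleted method silently chose eager freeing. Roughly:

    // Before: csr.deallocate();                      // implicitly eager
    // After:  csr.deallocate(gCtx.EAGER_CUDA_FREE);  // policy stated at the call site
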
http://git-wip-us.apache.org/repos/asf/systemml/blob/e2dc8568/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/ExecutionConfig.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/ExecutionConfig.java b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/ExecutionConfig.java
index 872fef7..d35e813 100644
--- a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/ExecutionConfig.java
+++ b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/ExecutionConfig.java
@@ -68,8 +68,8 @@ public class ExecutionConfig {
 	 * @return execution configuration
 	 */
 	public static ExecutionConfig getConfigForSimpleVectorOperations(int numCells) {
-		if(numCells == 0)
-			throw new DMLRuntimeException("Attempting to invoke a kernel with 0 threads");
+		if(numCells <= 0)
+			throw new DMLRuntimeException("Attempting to invoke a kernel with " + numCells + " threads");
 		int deviceNumber = 0;
 		int blockDimX = getMaxBlockDim(deviceNumber);
 		int gridDimX = (int) Math.ceil((double) numCells / blockDimX);

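The tightened guard matters because numCells <= 0 is exactly what an upstream int overflow produces; the old == 0 check would let a negative count through to a nonsensical grid size. The launch-shape arithmetic from the method, for reference (1024 as the max block dimension is an assumption about the device):

    public class LaunchShape {
        public static void main(String[] args) {
            int blockDimX = 1024;                 // e.g., device max threads per block
            int numCells  = 1_000_000;
            int gridDimX  = (int) Math.ceil((double) numCells / blockDimX);
            System.out.println(gridDimX + " blocks x " + blockDimX + " threads"); // 977 x 1024
        }
    }
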
http://git-wip-us.apache.org/repos/asf/systemml/blob/e2dc8568/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUMemoryManager.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUMemoryManager.java b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUMemoryManager.java
index e01c71a..509aafe 100644
--- a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUMemoryManager.java
+++ b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUMemoryManager.java
@@ -53,7 +53,7 @@ public class GPUMemoryManager {
 	// Developer flag: Use this flag to check for GPU memory leak in SystemML.  
 	// This has an additional overhead of maintaining stack trace of all the allocated GPU pointers via PointerInfo class.
 	private static final boolean DEBUG_MEMORY_LEAK = false;
-	private static final int [] DEBUG_MEMORY_LEAK_STACKTRACE_DEPTH = {5, 6, 7, 8, 9, 10}; // Avoids printing too much text while debuggin
+	private static final int [] DEBUG_MEMORY_LEAK_STACKTRACE_DEPTH = {5, 6, 7, 8, 9, 10, 11}; // Avoids printing too much text while debugging
 	
 	private final boolean PRINT_GPU_MEMORY_INFO = ConfigurationManager.getDMLConfig().getBooleanValue(DMLConfig.PRINT_GPU_MEMORY_INFO);
 	
@@ -86,7 +86,15 @@ public class GPUMemoryManager {
 	private Set<Pointer> getNonMatrixLockedPointers() {
 		Set<Pointer> managedPointers = matrixMemoryManager.getPointers();
 		managedPointers.addAll(lazyCudaFreeMemoryManager.getAllPointers());
-		return nonIn(allPointers.keySet(), managedPointers);
+		Set<Pointer> superSet = allPointers.keySet();
+		Set<Pointer> ret = nonIn(superSet, managedPointers);
+		if(DEBUG_MEMORY_LEAK) {
+			System.out.println(
+				ret.stream().map(p -> p.toString()).collect(Collectors.joining(",")) + " = notIn(>>>" + 
+				superSet.stream().map(p -> p.toString()).collect(Collectors.joining(",")) + ">>>, <<<" + 
+				managedPointers.stream().map(p -> p.toString()).collect(Collectors.joining(",")) + ">>>)");
+		}
+		return ret;
 	}
 	
 	

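nonIn computes the set difference allPointers \ managedPointers, i.e. tracked pointers owned by neither the matrix memory manager nor the lazy-free pool, which is what the new leak trace prints. A sketch of the operation (nonIn's exact implementation is an assumption inferred from its use above):

    import java.util.HashSet;
    import java.util.Set;
    import java.util.stream.Collectors;

    public class SetDiffDemo {
        // Assumption: nonIn(A, B) returns A \ B, matching its use above.
        static <T> Set<T> nonIn(Set<T> superSet, Set<T> toRemove) {
            Set<T> ret = new HashSet<>(superSet);
            ret.removeAll(toRemove);
            return ret;
        }

        public static void main(String[] args) {
            Set<String> all = new HashSet<>(Set.of("p1", "p2", "p3"));
            Set<String> managed = Set.of("p2");
            // Same Collectors.joining style as the DEBUG_MEMORY_LEAK branch:
            System.out.println(nonIn(all, managed).stream()
                .map(Object::toString).collect(Collectors.joining(","))); // p1,p3 (order may vary)
        }
    }
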
http://git-wip-us.apache.org/repos/asf/systemml/blob/e2dc8568/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUObject.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUObject.java b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUObject.java
index cfab0d4..1564f48 100644
--- a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUObject.java
+++ b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUObject.java
@@ -91,11 +91,6 @@ public class GPUObject {
 	AtomicLong timestamp = new AtomicLong();
 
 	/**
-	 * Whether this block is in sparse format
-	 */
-	protected boolean isSparse = false;
-
-	/**
 	 * Enclosing {@link MatrixObject} instance
 	 */
 	MatrixObject mat = null;
@@ -131,10 +126,29 @@ public class GPUObject {
 	
 	/**
 	 * Removes the dense pointer and potential soft reference
+	 * 
+	 * @param opcode opcode of the instruction
+	 * @param eager whether to delete eagerly
 	 */
-	public void clearDensePointer() {
-		jcudaDenseMatrixPtr = null;
+	public void clearDensePointer(String opcode, boolean eager) {
+		if (!isDensePointerNull()) {
+			getGPUContext().cudaFreeHelper(opcode, getDensePointer(), eager);
+		}
 		shadowBuffer.clearShadowPointer();
+		jcudaDenseMatrixPtr = null;
+	}
+	
+	/**
+	 * Removes the sparse pointer
+	 * 
+	 * @param opcode opcode of the instruction
+	 * @param eager whether to delete eagerly
+	 */
+	public void clearSparsePointer(String opcode, boolean eager) {
+		if (getJcudaSparseMatrixPtr() != null) {
+			getJcudaSparseMatrixPtr().deallocate(eager);
+		}
+		jcudaSparseMatrixPtr = null;
 	}
 	
 	
@@ -147,14 +161,14 @@ public class GPUObject {
 		if (!this.isDensePointerNull()) {
 			throw new DMLRuntimeException("jcudaDenseMatrixPtr was already allocated for " + this + ", this will cause a memory leak on the GPU");
 		}
+		clearSparsePointer(null, true);
 		this.jcudaDenseMatrixPtr = densePtr;
-		this.isSparse = false;
 		if(LOG.isDebugEnabled()) {
 			LOG.debug("Setting dense pointer of size " + getGPUContext().getMemoryManager().getSizeAllocatedGPUPointer(densePtr));
 		}
-		if (getJcudaSparseMatrixPtr() != null) {
-			getJcudaSparseMatrixPtr().deallocate();
-			jcudaSparseMatrixPtr = null;
+		if(!gpuContext.getMemoryManager().getGPUMatrixMemoryManager().gpuObjects.contains(this)) {
+			// Double-check if the matrix manager still has the current GPU object in case of eviction.
+			gpuContext.getMemoryManager().getGPUMatrixMemoryManager().addGPUObject(this);
 		}
 	}
 	// ----------------------------------------------------------------------
@@ -170,7 +184,6 @@ public class GPUObject {
 		that.writeLock = false;
 		
 		that.timestamp = new AtomicLong(me.timestamp.get());
-		that.isSparse = me.isSparse;
 
 		try {
 			if (!me.isDensePointerNull()) {
@@ -197,10 +210,6 @@ public class GPUObject {
 		return getGPUContext().allocate(null, size);
 	}
 
-	private void cudaFreeHelper(Pointer toFree) throws DMLRuntimeException {
-		getGPUContext().cudaFreeHelper(null, toFree, gpuContext.EAGER_CUDA_FREE);
-	}
-
 	public GPUContext getGPUContext() {
 		return gpuContext;
 	}
@@ -300,11 +309,11 @@ public class GPUObject {
 		if (this.jcudaSparseMatrixPtr != null) {
 			throw new DMLRuntimeException("jcudaSparseMatrixPtr was already allocated for " + this + ", this will cause a memory leak on the GPU");
 		}
+		clearDensePointer(null, true);
 		this.jcudaSparseMatrixPtr = sparseMatrixPtr;
-		this.isSparse = true;
-		if (!isDensePointerNull() && !shadowBuffer.isBuffered()) {
-			cudaFreeHelper(getDensePointer());
-			clearDensePointer();
+		if(!gpuContext.getMemoryManager().getGPUMatrixMemoryManager().gpuObjects.contains(this)) {
+			// Double-check if the matrix manager still has the current GPU object in case of eviction.
+			gpuContext.getMemoryManager().getGPUMatrixMemoryManager().addGPUObject(this);
 		}
 	}
 	
@@ -354,8 +363,7 @@ public class GPUObject {
 		}
 
 		Pointer tmp = transpose(getGPUContext(), getDensePointer(), m, n, lda, ldc);
-		cudaFreeHelper(getDensePointer());
-		clearDensePointer();
+		clearDensePointer(null, true);
 		setDensePointer(tmp);
 	}
 
@@ -376,8 +384,7 @@ public class GPUObject {
 		}
 
 		Pointer tmp = transpose(getGPUContext(), getDensePointer(), m, n, lda, ldc);
-		cudaFreeHelper(getDensePointer());
-		clearDensePointer();
+		clearDensePointer(null, true);
 		setDensePointer(tmp);
 	}
 
@@ -446,7 +453,7 @@ public class GPUObject {
 	}
 
 	public boolean isSparse() {
-		return isSparse;
+		return jcudaSparseMatrixPtr != null;
 	}
 	
 	private static long getDatatypeSizeOf(long numElems) {
@@ -602,7 +609,6 @@ public class GPUObject {
 			LOG.trace("GPU : acquireDeviceModifySparse on " + this + ", GPUContext=" + getGPUContext());
 		}
 		boolean allocated = false;
-		isSparse = true;
 		if (!isAllocated()) {
 			if(LOG.isTraceEnabled()) {
 				LOG.trace("GPU : data is not allocated, allocating a sparse block, on " + this);
@@ -995,22 +1001,15 @@ public class GPUObject {
 	 * Clears the data associated with this {@link GPUObject} instance
 	 *
 	 * @param opcode opcode of the instruction
-	 * @param eager whether to be done synchronously or asynchronously
+	 * @param eager whether to delete eagerly
 	 * @throws DMLRuntimeException if error occurs
 	 */
 	public void clearData(String opcode, boolean eager) throws DMLRuntimeException {
 		if(LOG.isTraceEnabled()) {
 			LOG.trace("GPU : clearData on " + this + ", GPUContext=" + getGPUContext());
 		}
-		if (!isDensePointerNull()) {
-			getGPUContext().cudaFreeHelper(opcode, getDensePointer(), eager);
-		}
-		if (getJcudaSparseMatrixPtr() != null) {
-			getJcudaSparseMatrixPtr().deallocate(eager);
-		}
-		clearDensePointer();
-		shadowBuffer.clearShadowPointer();
-		jcudaSparseMatrixPtr = null;
+		clearDensePointer(opcode, eager);
+		clearSparsePointer(opcode, eager);
 		resetReadWriteLock();
 		getGPUContext().getMemoryManager().removeGPUObject(this);
 	}
@@ -1039,7 +1038,6 @@ public class GPUObject {
 		sb.append(", dirty=").append(dirty);
 		sb.append(", readLocks=").append(readLocks.longValue());
 		sb.append(", writeLock=").append(writeLock);
-		sb.append(", sparse? ").append(isSparse);
 		sb.append(", dims=[").append(mat.getNumRows()).append(",").append(mat.getNumColumns()).append("]");
 		if(!isDensePointerNull())
 			sb.append(", densePtr=").append(getDensePointer());

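The GPUObject cleanup replaces the cached isSparse flag with a derivation from pointer state, so the recorded format can never disagree with what is actually allocated, and each setter now frees the opposite format before installing the new pointer. A condensed sketch of the invariant (names abbreviated from the class above):

    class GpuObjectSketch {
        Object densePtr;   // stands in for jcudaDenseMatrixPtr
        Object sparsePtr;  // stands in for jcudaSparseMatrixPtr

        boolean isSparse() { return sparsePtr != null; }  // derived, never cached

        void setDensePointer(Object p)  { clearSparsePointer(); densePtr  = p; }
        void setSparsePointer(Object p) { clearDensePointer();  sparsePtr = p; }
        void clearDensePointer()  { /* cudaFree densePtr  */ densePtr  = null; }
        void clearSparsePointer() { /* cudaFree sparsePtr */ sparsePtr = null; }
    }
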
http://git-wip-us.apache.org/repos/asf/systemml/blob/e2dc8568/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuDNN.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuDNN.java b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuDNN.java
index e7955e1..8051cbc 100644
--- a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuDNN.java
+++ b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuDNN.java
@@ -849,14 +849,14 @@ public class LibMatrixCuDNN extends LibMatrixCUDA {
 	static Pointer getDenseInputPointer(ExecutionContext ec, GPUContext gCtx, String instName, String inputName,
 			long numRows, long numCols) throws DMLRuntimeException {
 		MatrixObject output = ec.getMatrixInputForGPUInstruction(inputName, instName);
-		return LibMatrixCuDNN.getDensePointerForCuDNN(gCtx, output, instName, toInt(numRows), toInt(numCols));
+		return LibMatrixCuDNN.getDensePointerForCuDNN(gCtx, output, instName, numRows, numCols);
 	}
 	
 	static Pointer getDenseOutputPointer(ExecutionContext ec, GPUContext gCtx, String instName, String outputName,
 			long numRows, long numCols) throws DMLRuntimeException {
 		MatrixObject output = ec.getMatrixObject(outputName);
 		getDenseMatrixOutputForGPUInstruction(ec, instName, outputName, numRows, numCols); // Allocated the dense output matrix
-		return getDensePointerForCuDNN(gCtx, output, instName, toInt(numRows), toInt(numCols));
+		return getDensePointerForCuDNN(gCtx, output, instName, numRows, numCols);
 	}
 	
 	/**
@@ -890,9 +890,14 @@ public class LibMatrixCuDNN extends LibMatrixCUDA {
 			String outputName, String cyName,  					 // output
 			String rnnMode, boolean return_sequences, int N, int M, int D, int T) throws DMLRuntimeException {
 		boolean hasCarry = rnnMode.equalsIgnoreCase("lstm");
+		if(LOG.isDebugEnabled()) {
+			long memRequired = (N*T*M + 2*N*M + N*T*M)*sizeOfDataType;
+			LOG.debug("Memory required for invoking lstmForward is " + memRequired + " bytes + workspace + reserve space + memory for descriptors.");
+		}
+		
 		// Get output pointers
 		Pointer cudnnYPointer = gCtx.allocate(instName, N*T*M*sizeOfDataType);
-		Pointer hyPointer = !return_sequences ? getDenseOutputPointer(ec, gCtx, instName, outputName, N, M) : gCtx.allocate(instName, N*M*sizeOfDataType);
+		Pointer hyPointer = return_sequences ? gCtx.allocate(instName, N*M*sizeOfDataType) : getDenseOutputPointer(ec, gCtx, instName, outputName, N, M);
 		Pointer cyPointer = hasCarry ? getDenseOutputPointer(ec, gCtx, instName, cyName, N, M) : new Pointer();
 		// Pointer wPointer = getDensePointerForCuDNN(gCtx, w, instName, D+M+2, 4*M);
 		
@@ -922,20 +927,27 @@ public class LibMatrixCuDNN extends LibMatrixCUDA {
 	public static void lstmBackward(ExecutionContext ec, GPUContext gCtx, String instName,
 			Pointer x, Pointer hx, Pointer cx, Pointer wPointer, String doutName, String dcyName,  // input
 			String dxName, String dwName, String dbName, String dhxName, String dcxName,  	// output
-			boolean return_sequences, int N, int M, int D, int T) throws DMLRuntimeException {
+			boolean return_sequences, long N, long M, long D, long T) throws DMLRuntimeException {
+		
+		if(LOG.isDebugEnabled()) {
+			long memRequired = (N*T*M + (return_sequences ? T*M : M) + N*T*M + 2*N*T*D + (D+M+2)*(4*M))*sizeOfDataType;
+			LOG.debug("Memory required for invoking lstmBackward is " + memRequired + " bytes + workspace + reserve space + memory for descriptors.");
+		}
+		
 		// Transform the input dout and prepare them for cudnnRNNBackwardData
 		Pointer dy = gCtx.allocate(instName, N*T*M*sizeOfDataType);
-		int size = return_sequences ? N*T*M : N*M;
+		long size = return_sequences ? N*T*M : N*M;
 		LibMatrixCUDA.getCudaKernels(gCtx).launchKernel("prepare_lstm_backward_gradients",
-				ExecutionConfig.getConfigForSimpleVectorOperations(size),
+				ExecutionConfig.getConfigForSimpleVectorOperations(toInt(size)),
 				getDenseInputPointer(ec, gCtx, instName, doutName, N, return_sequences ? T*M : M),
 				dy, N, T, M, size, return_sequences ? 1 : 0);
 		ec.releaseMatrixInputForGPUInstruction(doutName);
 				
 		// Allocate intermediate pointers computed by forward
 		Pointer yPointer = gCtx.allocate(instName, N*T*M*sizeOfDataType);
-		try(LibMatrixCuDNNRnnAlgorithm algo = new LibMatrixCuDNNRnnAlgorithm(ec, gCtx, instName, "lstm", N, T, M, D, true, wPointer)) {
-			JCudnn.cudnnRNNForwardTraining(gCtx.getCudnnHandle(), algo.rnnDesc, T, 
+		try(LibMatrixCuDNNRnnAlgorithm algo = new LibMatrixCuDNNRnnAlgorithm(ec, gCtx, instName, "lstm", toInt(N), toInt(T), 
+				toInt(M), toInt(D), true, wPointer)) {
+			JCudnn.cudnnRNNForwardTraining(gCtx.getCudnnHandle(), algo.rnnDesc, toInt(T), 
 					algo.xDesc, x, 
 					algo.hxDesc, hx, 
 					algo.cxDesc, cx, 
@@ -947,7 +959,7 @@ public class LibMatrixCuDNN extends LibMatrixCUDA {
 					algo.reserveSpace, algo.reserveSpaceSizeInBytes);
 			
 			Pointer cudnnDx = gCtx.allocate(instName, N*T*D*LibMatrixCUDA.sizeOfDataType);
-			JCudnn.cudnnRNNBackwardData(gCtx.getCudnnHandle(), algo.rnnDesc, T, 
+			JCudnn.cudnnRNNBackwardData(gCtx.getCudnnHandle(), algo.rnnDesc, toInt(T), 
 					algo.yDesc, yPointer,
 					// ----------------------
 					// Additional inputs:
@@ -973,14 +985,14 @@ public class LibMatrixCuDNN extends LibMatrixCUDA {
 			
 			Pointer smlDx = getDenseOutputPointer(ec, gCtx, instName, dxName, N, T*D);
 			LibMatrixCUDA.getCudaKernels(gCtx).launchKernel("prepare_lstm_dinput",
-					ExecutionConfig.getConfigForSimpleVectorOperations(N*T*D),
+					ExecutionConfig.getConfigForSimpleVectorOperations(toInt(N*T*D)),
 					smlDx, cudnnDx, N, D, T*D, N*T*D);
 			ec.releaseMatrixOutputForGPUInstruction(dxName);
 			gCtx.cudaFreeHelper(instName, cudnnDx, gCtx.EAGER_CUDA_FREE);
 			
 			// -------------------------------------------------------------------------------------------
 			Pointer cudnnDwPointer = gCtx.allocate(instName, (D+M+2)*(4*M)*LibMatrixCUDA.sizeOfDataType);
-			JCudnn.cudnnRNNBackwardWeights(gCtx.getCudnnHandle(), algo.rnnDesc, T, 
+			JCudnn.cudnnRNNBackwardWeights(gCtx.getCudnnHandle(), algo.rnnDesc, toInt(T), 
 					algo.xDesc, x, 
 					algo.hxDesc, hx, 
 					algo.yDesc, yPointer, 
@@ -988,7 +1000,7 @@ public class LibMatrixCuDNN extends LibMatrixCUDA {
 					algo.dwDesc, cudnnDwPointer, 
 					algo.reserveSpace, algo.reserveSpaceSizeInBytes);
 			LibMatrixCUDA.getCudaKernels(gCtx).launchKernel("prepare_lstm_dweight",
-					ExecutionConfig.getConfigForSimpleVectorOperations((D+M+2)*(4*M)),
+					ExecutionConfig.getConfigForSimpleVectorOperations(toInt((D+M+2)*(4*M))),
 					getDenseOutputPointer(ec, gCtx, instName, dwName, D+M, 4*M), 
 					getDenseOutputPointer(ec, gCtx, instName, dbName, 1, 4*M), cudnnDwPointer, D, M);
 			gCtx.cudaFreeHelper(instName, cudnnDwPointer, gCtx.EAGER_CUDA_FREE);
@@ -1242,7 +1254,7 @@ public class LibMatrixCuDNN extends LibMatrixCUDA {
 	 * @return jcuda pointer
 	 * @throws DMLRuntimeException if error occurs while sparse to dense conversion
 	 */
-	public static Pointer getDensePointerForCuDNN(GPUContext gCtx, MatrixObject image, String instName, int numRows, int numCols) throws DMLRuntimeException {
+	public static Pointer getDensePointerForCuDNN(GPUContext gCtx, MatrixObject image, String instName, long numRows, long numCols) throws DMLRuntimeException {
 		long numElems = image.getNumRows()*image.getNumColumns();
 		if(image.getNumRows() != numRows || image.getNumColumns() != numCols) {
 			throw new DMLRuntimeException("Expected input of size:[" +  numRows + ", " + numCols + "], but found [" + image.getNumRows() + ", " + image.getNumColumns() + "]."); 

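The new debug messages make an LSTM call's footprint visible before the first allocation, so out-of-memory failures can be anticipated from the log. Plugging illustrative sizes into the forward-pass formula from the hunk (a sketch):

    public class LstmMemEstimate {
        public static void main(String[] args) {
            long N = 64, T = 128, M = 1024;  // illustrative batch, seq-length, hidden size
            long sizeOfDataType = 8;         // bytes per double
            long memRequired = (N*T*M + 2*N*M + N*T*M) * sizeOfDataType;
            System.out.println(memRequired + " bytes + workspace + reserve space");
            // -> 135266304 bytes (129 MB) for this configuration
        }
    }
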
http://git-wip-us.apache.org/repos/asf/systemml/blob/e2dc8568/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuDNNRnnAlgorithm.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuDNNRnnAlgorithm.java b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuDNNRnnAlgorithm.java
index 7b2c601..a1d799d 100644
--- a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuDNNRnnAlgorithm.java
+++ b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuDNNRnnAlgorithm.java
@@ -32,6 +32,8 @@ import static jcuda.jcudnn.cudnnRNNInputMode.CUDNN_LINEAR_INPUT;
 import static jcuda.jcudnn.cudnnDirectionMode.CUDNN_UNIDIRECTIONAL;
 import static jcuda.jcudnn.cudnnRNNAlgo.CUDNN_RNN_ALGO_STANDARD;
 
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
 import org.apache.sysml.runtime.DMLRuntimeException;
 import org.apache.sysml.runtime.controlprogram.context.ExecutionContext;
 import org.apache.sysml.runtime.instructions.gpu.context.GPUContext;
@@ -44,6 +46,7 @@ import jcuda.jcudnn.cudnnRNNDescriptor;
 import jcuda.jcudnn.cudnnTensorDescriptor;
 
 public class LibMatrixCuDNNRnnAlgorithm implements java.lang.AutoCloseable {
+	private static final Log LOG = LogFactory.getLog(LibMatrixCuDNNRnnAlgorithm.class.getName());
 	GPUContext gCtx;
 	String instName;
 	cudnnDropoutDescriptor dropoutDesc;
@@ -87,8 +90,11 @@ public class LibMatrixCuDNNRnnAlgorithm implements java.lang.AutoCloseable {
 		JCudnn.cudnnDropoutGetStatesSize(gCtx.getCudnnHandle(), _dropOutSizeInBytes);
 		dropOutSizeInBytes = _dropOutSizeInBytes[0];
 		dropOutStateSpace = new Pointer();
-		if (dropOutSizeInBytes != 0)
+		if (dropOutSizeInBytes != 0) {
+			if(LOG.isDebugEnabled()) 
+				LOG.debug("Allocating " +  dropOutSizeInBytes + " bytes for lstm dropout space.");
 			dropOutStateSpace = gCtx.allocate(instName, dropOutSizeInBytes);
+		}
 		JCudnn.cudnnSetDropoutDescriptor(dropoutDesc, gCtx.getCudnnHandle(), 0, dropOutStateSpace, dropOutSizeInBytes, 12345);
 		
 		// Initialize RNN descriptor
@@ -109,55 +115,20 @@ public class LibMatrixCuDNNRnnAlgorithm implements java.lang.AutoCloseable {
 		// Setup workspace
 		workSpace = new Pointer(); reserveSpace = new Pointer();
 		sizeInBytes = getWorkspaceSize(T);
-		if(sizeInBytes != 0)
+		if(sizeInBytes != 0) {
+			if(LOG.isDebugEnabled()) 
+				LOG.debug("Allocating " +  sizeInBytes + " bytes for lstm workspace.");
 			workSpace = gCtx.allocate(instName, sizeInBytes);
+		}
 		reserveSpaceSizeInBytes = 0;
 		if(isTraining) {
 			reserveSpaceSizeInBytes = getReservespaceSize(T);
 			if (reserveSpaceSizeInBytes != 0) {
+				if(LOG.isDebugEnabled()) 
+					LOG.debug("Allocating " +  reserveSpaceSizeInBytes + " bytes for lstm reserve space.");
 				reserveSpace = gCtx.allocate(instName, reserveSpaceSizeInBytes);
 			}
 		}
-		/*
-		int numLinearLayers = getNumLinearLayers(rnnMode); 
-		for(int i = 0; i < numLinearLayers; i++) {
-			cudnnFilterDescriptor  linLayerMatDesc = new cudnnFilterDescriptor();
-			cudnnCreateFilterDescriptor(linLayerMatDesc);
-			Pointer linLayerMat = new Pointer();
-			JCudnn.cudnnGetRNNLinLayerMatrixParams(gCtx.getCudnnHandle(), rnnDesc, 0, 
-					xDesc[0], wDesc, w, i, linLayerMatDesc, linLayerMat);
-			int[] dataType = new int[] {-1};
-			int[] format = new int[] {-1};
-			int[] nbDims = new int[] {-1};
-			int[] filterDimA = new int[3];
-			JCudnn.cudnnGetFilterNdDescriptor(linLayerMatDesc, 3, dataType, format, nbDims, filterDimA);
-			
-			int filterDims = filterDimA[0] * filterDimA[1] * filterDimA[2];
-			double [] tmp = new double[filterDims];
-			LibMatrixCUDA.cudaSupportFunctions.deviceToHost(gCtx, linLayerMat, tmp, instName, false);
-			System.out.println();
-			for(int j = 0 ; j < tmp.length; j++) {
-				System.out.print(" " + tmp[j]);
-			}
-			System.out.println();
-			LibMatrixCUDA.getCudaKernels(gCtx).launchKernel("fill", 
-					org.apache.sysml.runtime.instructions.gpu.context.ExecutionConfig.getConfigForSimpleVectorOperations(filterDims), 
-					linLayerMat, Math.pow(filterDims, -1), filterDims);
-			JCudnn.cudnnDestroyFilterDescriptor(linLayerMatDesc);
-			
-			cudnnFilterDescriptor  linLayerBiasDesc = new cudnnFilterDescriptor();
-			cudnnCreateFilterDescriptor(linLayerBiasDesc);
-			Pointer linLayerBias = new Pointer();
-			JCudnn.cudnnGetRNNLinLayerBiasParams(gCtx.getCudnnHandle(), rnnDesc, 0,
-					xDesc[0], wDesc, w, i, linLayerBiasDesc, linLayerBias);
-			JCudnn.cudnnGetFilterNdDescriptor(linLayerBiasDesc, 3, dataType, format, nbDims, filterDimA);
-			filterDims = filterDimA[0] * filterDimA[1] * filterDimA[2];
-			LibMatrixCUDA.getCudaKernels(gCtx).launchKernel("fill", 
-					org.apache.sysml.runtime.instructions.gpu.context.ExecutionConfig.getConfigForSimpleVectorOperations(filterDims), 
-					linLayerBias, Math.pow(filterDims, -1), filterDims);
-			JCudnn.cudnnDestroyFilterDescriptor(linLayerBiasDesc);
-		}
-		*/
 	}
 	
 	@SuppressWarnings("unused")
@@ -321,5 +292,6 @@ public class LibMatrixCuDNNRnnAlgorithm implements java.lang.AutoCloseable {
 				throw new RuntimeException(e);
 			}
 		}
+		dropOutStateSpace = null;
 	}
 }

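The workspace changes above follow cuDNN's query-then-allocate discipline: ask the library how many bytes a call needs, allocate only when the answer is nonzero, and now log the size first at debug level. A stripped-down sketch of that discipline (the query below is a stand-in, not a JCudnn signature):

    public class QueryThenAllocate {
        // Stand-in for a cuDNN size query such as the workspace/reserve-space ones above.
        static long queryRequiredBytes() { return 4L * 1024 * 1024; }

        public static void main(String[] args) {
            long sizeInBytes = queryRequiredBytes();
            byte[] workSpace = null;  // stand-in for a device Pointer
            if (sizeInBytes != 0) {
                System.out.println("Allocating " + sizeInBytes + " bytes for lstm workspace.");
                workSpace = new byte[(int) sizeInBytes];  // stand-in for gCtx.allocate(...)
            }
            // Free it on close() and null the reference, as the patch now does
            // for dropOutStateSpace.
        }
    }
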
http://git-wip-us.apache.org/repos/asf/systemml/blob/e2dc8568/src/main/java/org/apache/sysml/runtime/matrix/data/MatrixBlock.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/matrix/data/MatrixBlock.java b/src/main/java/org/apache/sysml/runtime/matrix/data/MatrixBlock.java
index 7af164e..25423c1 100644
--- a/src/main/java/org/apache/sysml/runtime/matrix/data/MatrixBlock.java
+++ b/src/main/java/org/apache/sysml/runtime/matrix/data/MatrixBlock.java
@@ -516,7 +516,8 @@ public class MatrixBlock extends MatrixValue implements CacheBlock, Externalizab
 		//this method is used as a short-hand for all operations that
 		//guaranteed only deal with dense blocks of a single block.
 		if( denseBlock != null && denseBlock.numBlocks() > 1 ) {
-			throw new RuntimeException("Large dense in-memory block (with numblocks="+denseBlock.numBlocks()+") "
+			throw new RuntimeException("Large dense in-memory block (with numblocks="+denseBlock.numBlocks()+ ") with "
+				+ "dimensions [" + getNumRows() + ", " + getNumColumns() + "] "
 				+ "allocated but operation access to first block only, which might cause incorrect results.");
 		}
 		return (denseBlock != null) ? denseBlock.valuesAt(0) : null;