You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@systemml.apache.org by ni...@apache.org on 2017/10/12 17:38:21 UTC

systemml git commit: [SYSTEMML-445] Change the default CuDNN algorithm selector for conv2d_backward_data

Repository: systemml
Updated Branches:
  refs/heads/master a97bc53f7 -> ea2a6e491


[SYSTEMML-445] Change the default CuDNN algorithm selector for conv2d_backward_data

CuDNN's cudnnGetConvolutionBackwardDataAlgorithm returns
CUDNN_CONVOLUTION_BWD_DATA_ALGO_1 for at least one scenario:
- sentence CNN (N=1, C=1, H=2060, W=300, F=500, Hf=5, Wf=300, sparsity=0.1)
where that algorithm is 200x slower than CUDNN_CONVOLUTION_BWD_DATA_ALGO_0.

Since it is difficult to debug the closed-source method
cudnnGetConvolutionBackwardDataAlgorithm, we will always prefer the
memory-less operator CUDNN_CONVOLUTION_BWD_DATA_ALGO_0 for
conv2d_backward_data. This is not an ideal solution, but it is the
simplest one available for now; I welcome discussion on any alternative
solutions you may have. We can revisit this decision for the next CuDNN
version.

For more details, please see https://github.com/apache/systemml/pull/682

Closes #682.


Project: http://git-wip-us.apache.org/repos/asf/systemml/repo
Commit: http://git-wip-us.apache.org/repos/asf/systemml/commit/ea2a6e49
Tree: http://git-wip-us.apache.org/repos/asf/systemml/tree/ea2a6e49
Diff: http://git-wip-us.apache.org/repos/asf/systemml/diff/ea2a6e49

Branch: refs/heads/master
Commit: ea2a6e4917e85c12784e3794fb5b5da214da8103
Parents: a97bc53
Author: Niketan Pansare <np...@us.ibm.com>
Authored: Thu Oct 12 10:35:27 2017 -0700
Committer: Niketan Pansare <np...@us.ibm.com>
Committed: Thu Oct 12 10:37:14 2017 -0700

----------------------------------------------------------------------
 .../LibMatrixCuDNNConvolutionAlgorithm.java     | 101 ++++++++-----------
 1 file changed, 43 insertions(+), 58 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/systemml/blob/ea2a6e49/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuDNNConvolutionAlgorithm.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuDNNConvolutionAlgorithm.java b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuDNNConvolutionAlgorithm.java
index 871194e..0378c7a 100644
--- a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuDNNConvolutionAlgorithm.java
+++ b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuDNNConvolutionAlgorithm.java
@@ -130,24 +130,17 @@ public class LibMatrixCuDNNConvolutionAlgorithm implements java.lang.AutoCloseab
 		long t1 = GPUStatistics.DISPLAY_STATISTICS ? System.nanoTime() : 0;
 		LibMatrixCuDNNConvolutionAlgorithm ret = new LibMatrixCuDNNConvolutionAlgorithm(gCtx, instName, N, C, H, W, K, R, S, 
 				pad_h, pad_w, stride_h, stride_w, P, Q);
-		if(workspaceLimit <= 0) {
-			// If overhead is greater than intermediate allocated memory, prefer the cudnn operator with no memory requirement, 
-			// i.e. CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM
-			ret.algo = jcuda.jcudnn.cudnnConvolutionFwdAlgo.CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM;
-		}
-		else {
-			int[] algos = {-1};
-			long sizeInBytesArray[] = {Math.min(workspaceLimit, MAX_WORKSPACE_LIMIT_BYTES)};
-			jcuda.jcudnn.JCudnn.cudnnGetConvolutionForwardAlgorithm(LibMatrixCuDNN.getCudnnHandle(gCtx), 
-					ret.nchwTensorDesc, ret.filterDesc, ret.convDesc, ret.nkpqTensorDesc,
-					cudnnConvolutionFwdPreference.CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT, sizeInBytesArray[0], algos);
-			jcuda.jcudnn.JCudnn.cudnnGetConvolutionForwardWorkspaceSize(LibMatrixCuDNN.getCudnnHandle(gCtx), 
-					ret.nchwTensorDesc, ret.filterDesc, ret.convDesc, ret.nkpqTensorDesc, algos[0], sizeInBytesArray);
-			if (sizeInBytesArray[0] != 0)
-				ret.workSpace = gCtx.allocate(sizeInBytesArray[0]);
-			ret.sizeInBytes = sizeInBytesArray[0];
-			ret.algo = algos[0];
-		}
+		int[] algos = {-1};
+		long sizeInBytesArray[] = {Math.min(workspaceLimit, MAX_WORKSPACE_LIMIT_BYTES)};
+		jcuda.jcudnn.JCudnn.cudnnGetConvolutionForwardAlgorithm(LibMatrixCuDNN.getCudnnHandle(gCtx), 
+				ret.nchwTensorDesc, ret.filterDesc, ret.convDesc, ret.nkpqTensorDesc,
+				cudnnConvolutionFwdPreference.CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT, sizeInBytesArray[0], algos);
+		jcuda.jcudnn.JCudnn.cudnnGetConvolutionForwardWorkspaceSize(LibMatrixCuDNN.getCudnnHandle(gCtx), 
+				ret.nchwTensorDesc, ret.filterDesc, ret.convDesc, ret.nkpqTensorDesc, algos[0], sizeInBytesArray);
+		if (sizeInBytesArray[0] != 0)
+			ret.workSpace = gCtx.allocate(sizeInBytesArray[0]);
+		ret.sizeInBytes = sizeInBytesArray[0];
+		ret.algo = algos[0];
 		if (GPUStatistics.DISPLAY_STATISTICS)
 			GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_CUDNN_INIT, System.nanoTime() - t1);
 		return ret;
@@ -182,25 +175,19 @@ public class LibMatrixCuDNNConvolutionAlgorithm implements java.lang.AutoCloseab
 		LibMatrixCuDNNConvolutionAlgorithm ret = new LibMatrixCuDNNConvolutionAlgorithm(gCtx, instName, N, C, H, W, K, R, S, 
 				pad_h, pad_w, stride_h, stride_w, P, Q);
 		
-		if(workspaceLimit <= 0) {
-			// If overhead is greater than intermediate allocated memory, prefer the cudnn operator with no memory requirement
-			// i.e. CUDNN_CONVOLUTION_BWD_FILTER_ALGO_0
-			ret.algo = jcuda.jcudnn.cudnnConvolutionBwdFilterAlgo.CUDNN_CONVOLUTION_BWD_FILTER_ALGO_0;
-		}
-		else {
-			int[] algos = {-1};
-			long sizeInBytesArray[] = {Math.min(workspaceLimit, MAX_WORKSPACE_LIMIT_BYTES)};
-			jcuda.jcudnn.JCudnn.cudnnGetConvolutionBackwardFilterAlgorithm(
-					LibMatrixCuDNN.getCudnnHandle(gCtx), 
-					ret.nchwTensorDesc, ret.nkpqTensorDesc, ret.convDesc, ret.filterDesc, 
-					cudnnConvolutionBwdFilterPreference.CUDNN_CONVOLUTION_BWD_FILTER_SPECIFY_WORKSPACE_LIMIT, sizeInBytesArray[0], algos);
-			jcuda.jcudnn.JCudnn.cudnnGetConvolutionBackwardFilterWorkspaceSize(LibMatrixCuDNN.getCudnnHandle(gCtx), 
-					ret.nchwTensorDesc, ret.nkpqTensorDesc, ret.convDesc, ret.filterDesc, algos[0], sizeInBytesArray);
-			if (sizeInBytesArray[0] != 0)
-				ret.workSpace = gCtx.allocate(sizeInBytesArray[0]);
-			ret.sizeInBytes = sizeInBytesArray[0];
-			ret.algo = algos[0];
-		}
+		int[] algos = {-1};
+		long sizeInBytesArray[] = {Math.min(workspaceLimit, MAX_WORKSPACE_LIMIT_BYTES)};
+		jcuda.jcudnn.JCudnn.cudnnGetConvolutionBackwardFilterAlgorithm(
+				LibMatrixCuDNN.getCudnnHandle(gCtx), 
+				ret.nchwTensorDesc, ret.nkpqTensorDesc, ret.convDesc, ret.filterDesc, 
+				cudnnConvolutionBwdFilterPreference.CUDNN_CONVOLUTION_BWD_FILTER_SPECIFY_WORKSPACE_LIMIT, sizeInBytesArray[0], algos);
+		jcuda.jcudnn.JCudnn.cudnnGetConvolutionBackwardFilterWorkspaceSize(LibMatrixCuDNN.getCudnnHandle(gCtx), 
+				ret.nchwTensorDesc, ret.nkpqTensorDesc, ret.convDesc, ret.filterDesc, algos[0], sizeInBytesArray);
+		if (sizeInBytesArray[0] != 0)
+			ret.workSpace = gCtx.allocate(sizeInBytesArray[0]);
+		ret.sizeInBytes = sizeInBytesArray[0];
+		ret.algo = algos[0];
+		
 		if (GPUStatistics.DISPLAY_STATISTICS)
 			GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_CUDNN_INIT, System.nanoTime() - t1);
 		return ret;
@@ -235,27 +222,25 @@ public class LibMatrixCuDNNConvolutionAlgorithm implements java.lang.AutoCloseab
 		LibMatrixCuDNNConvolutionAlgorithm ret = new LibMatrixCuDNNConvolutionAlgorithm(gCtx, instName, N, C, H, W, K, R, S, 
 				pad_h, pad_w, stride_h, stride_w, P, Q);
 		
-		if(workspaceLimit <= 0) {
-			// If overhead is greater than intermediate allocated memory, prefer the cudnn operator with no memory requirement
-			// i.e. CUDNN_CONVOLUTION_BWD_DATA_ALGO_0
-			ret.algo = jcuda.jcudnn.cudnnConvolutionBwdDataAlgo.CUDNN_CONVOLUTION_BWD_DATA_ALGO_0;
-		}
-		else {
-			int[] algos = {-1};
-			long sizeInBytesArray[] = {Math.min(workspaceLimit, MAX_WORKSPACE_LIMIT_BYTES)};
-			jcuda.jcudnn.JCudnn.cudnnGetConvolutionBackwardDataAlgorithm(
-					LibMatrixCuDNN.getCudnnHandle(gCtx), 
-					ret.filterDesc, ret.nkpqTensorDesc, ret.convDesc, ret.nchwTensorDesc,
-					cudnnConvolutionBwdDataPreference.CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT, sizeInBytesArray[0], algos);
-			jcuda.jcudnn.JCudnn.cudnnGetConvolutionBackwardDataWorkspaceSize(LibMatrixCuDNN.getCudnnHandle(gCtx), 
-					ret.filterDesc, ret.nkpqTensorDesc, ret.convDesc, ret.nchwTensorDesc, algos[0], sizeInBytesArray);
-			if (sizeInBytesArray[0] != 0)
-				ret.workSpace = gCtx.allocate(sizeInBytesArray[0]);
-			ret.sizeInBytes = sizeInBytesArray[0];
-			ret.algo = algos[0];
-		}
-		if (GPUStatistics.DISPLAY_STATISTICS)
-			GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_CUDNN_INIT, System.nanoTime() - t1);
+		// CuDNN's cudnnGetConvolutionBackwardDataAlgorithm returns CUDNN_CONVOLUTION_BWD_DATA_ALGO_1 for atleast one scenario 
+		// for sentence CNN (N=1, C=1, H=2060, W=300, F=500, Hf=5, Wf=300, sparsity=0.1).
+		// This causes more than 100x slowdown when compared with CUDNN_CONVOLUTION_BWD_DATA_ALGO_0.
+		// To keep things simple for now, we will always prefer to use memory-less operator: CUDNN_CONVOLUTION_BWD_DATA_ALGO_0
+		ret.algo = jcuda.jcudnn.cudnnConvolutionBwdDataAlgo.CUDNN_CONVOLUTION_BWD_DATA_ALGO_0;
+//		int[] algos = {-1};
+//		long sizeInBytesArray[] = {Math.min(workspaceLimit, MAX_WORKSPACE_LIMIT_BYTES)};
+//		jcuda.jcudnn.JCudnn.cudnnGetConvolutionBackwardDataAlgorithm(
+//				LibMatrixCuDNN.getCudnnHandle(gCtx), 
+//				ret.filterDesc, ret.nkpqTensorDesc, ret.convDesc, ret.nchwTensorDesc,
+//				cudnnConvolutionBwdDataPreference.CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT, sizeInBytesArray[0], algos);
+//		jcuda.jcudnn.JCudnn.cudnnGetConvolutionBackwardDataWorkspaceSize(LibMatrixCuDNN.getCudnnHandle(gCtx), 
+//				ret.filterDesc, ret.nkpqTensorDesc, ret.convDesc, ret.nchwTensorDesc, algos[0], sizeInBytesArray);
+//		if (sizeInBytesArray[0] != 0)
+//			ret.workSpace = gCtx.allocate(sizeInBytesArray[0]);
+//		ret.sizeInBytes = sizeInBytesArray[0];
+//		ret.algo = algos[0];
+//		if (GPUStatistics.DISPLAY_STATISTICS)
+//			GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_CUDNN_INIT, System.nanoTime() - t1);
 		return ret;
 	}