Posted to commits@systemml.apache.org by ni...@apache.org on 2017/10/12 17:38:21 UTC
systemml git commit: [SYSTEMML-445] Change the default CuDNN algorithm selector for conv2d_backward_data
Repository: systemml
Updated Branches:
refs/heads/master a97bc53f7 -> ea2a6e491
[SYSTEMML-445] Change the default CuDNN algorithm selector for conv2d_backward_data
CuDNN's cudnnGetConvolutionBackwardDataAlgorithm returns
CUDNN_CONVOLUTION_BWD_DATA_ALGO_1 for at least one scenario:
- sentence CNN (N=1, C=1, H=2060, W=300, F=500, Hf=5, Wf=300, sparsity=0.1)
where it is 200x slower than CUDNN_CONVOLUTION_BWD_DATA_ALGO_0.
Since it is difficult to debug the closed-source method
cudnnGetConvolutionBackwardDataAlgorithm, we will always prefer the
memory-less algorithm CUDNN_CONVOLUTION_BWD_DATA_ALGO_0. This is not an
ideal solution, but it is the simplest one I could think of; I welcome
discussion on any alternative solutions, and we can revisit this choice
for the next CuDNN version.
For more details, please see https://github.com/apache/systemml/pull/682
Closes #682.
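
For readers skimming the diff below, here is a minimal Java sketch of the
new policy for conv2d_backward_data. Identifiers such as ret and its fields
are borrowed from the patched class for illustration; this is a sketch of
the idea, not the exact patched method.

    // Old policy (now dropped): ask cuDNN's heuristic for an algorithm
    // within the workspace limit, then query and allocate its workspace.
    //   int[] algos = {-1};
    //   jcuda.jcudnn.JCudnn.cudnnGetConvolutionBackwardDataAlgorithm(..., algos);
    //   ret.algo = algos[0];

    // New policy: unconditionally pin the workspace-free algorithm, which
    // sidesteps the pathological ALGO_1 pick and needs no workspace query
    // or allocation.
    ret.algo = jcuda.jcudnn.cudnnConvolutionBwdDataAlgo.CUDNN_CONVOLUTION_BWD_DATA_ALGO_0;
    ret.sizeInBytes = 0; // no workspace required for ALGO_0 (the patch simply leaves this field at its default)

The trade-off is that ALGO_0 may be slower than a well-chosen
workspace-backed algorithm on some shapes, but its behavior is predictable
and allocation-free.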
Project: http://git-wip-us.apache.org/repos/asf/systemml/repo
Commit: http://git-wip-us.apache.org/repos/asf/systemml/commit/ea2a6e49
Tree: http://git-wip-us.apache.org/repos/asf/systemml/tree/ea2a6e49
Diff: http://git-wip-us.apache.org/repos/asf/systemml/diff/ea2a6e49
Branch: refs/heads/master
Commit: ea2a6e4917e85c12784e3794fb5b5da214da8103
Parents: a97bc53
Author: Niketan Pansare <np...@us.ibm.com>
Authored: Thu Oct 12 10:35:27 2017 -0700
Committer: Niketan Pansare <np...@us.ibm.com>
Committed: Thu Oct 12 10:37:14 2017 -0700
----------------------------------------------------------------------
.../LibMatrixCuDNNConvolutionAlgorithm.java | 101 ++++++++-----------
1 file changed, 43 insertions(+), 58 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/systemml/blob/ea2a6e49/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuDNNConvolutionAlgorithm.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuDNNConvolutionAlgorithm.java b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuDNNConvolutionAlgorithm.java
index 871194e..0378c7a 100644
--- a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuDNNConvolutionAlgorithm.java
+++ b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuDNNConvolutionAlgorithm.java
@@ -130,24 +130,17 @@ public class LibMatrixCuDNNConvolutionAlgorithm implements java.lang.AutoCloseab
long t1 = GPUStatistics.DISPLAY_STATISTICS ? System.nanoTime() : 0;
LibMatrixCuDNNConvolutionAlgorithm ret = new LibMatrixCuDNNConvolutionAlgorithm(gCtx, instName, N, C, H, W, K, R, S,
pad_h, pad_w, stride_h, stride_w, P, Q);
- if(workspaceLimit <= 0) {
- // If overhead is greater than intermediate allocated memory, prefer the cudnn operator with no memory requirement,
- // i.e. CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM
- ret.algo = jcuda.jcudnn.cudnnConvolutionFwdAlgo.CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM;
- }
- else {
- int[] algos = {-1};
- long sizeInBytesArray[] = {Math.min(workspaceLimit, MAX_WORKSPACE_LIMIT_BYTES)};
- jcuda.jcudnn.JCudnn.cudnnGetConvolutionForwardAlgorithm(LibMatrixCuDNN.getCudnnHandle(gCtx),
- ret.nchwTensorDesc, ret.filterDesc, ret.convDesc, ret.nkpqTensorDesc,
- cudnnConvolutionFwdPreference.CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT, sizeInBytesArray[0], algos);
- jcuda.jcudnn.JCudnn.cudnnGetConvolutionForwardWorkspaceSize(LibMatrixCuDNN.getCudnnHandle(gCtx),
- ret.nchwTensorDesc, ret.filterDesc, ret.convDesc, ret.nkpqTensorDesc, algos[0], sizeInBytesArray);
- if (sizeInBytesArray[0] != 0)
- ret.workSpace = gCtx.allocate(sizeInBytesArray[0]);
- ret.sizeInBytes = sizeInBytesArray[0];
- ret.algo = algos[0];
- }
+ int[] algos = {-1};
+ long sizeInBytesArray[] = {Math.min(workspaceLimit, MAX_WORKSPACE_LIMIT_BYTES)};
+ jcuda.jcudnn.JCudnn.cudnnGetConvolutionForwardAlgorithm(LibMatrixCuDNN.getCudnnHandle(gCtx),
+ ret.nchwTensorDesc, ret.filterDesc, ret.convDesc, ret.nkpqTensorDesc,
+ cudnnConvolutionFwdPreference.CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT, sizeInBytesArray[0], algos);
+ jcuda.jcudnn.JCudnn.cudnnGetConvolutionForwardWorkspaceSize(LibMatrixCuDNN.getCudnnHandle(gCtx),
+ ret.nchwTensorDesc, ret.filterDesc, ret.convDesc, ret.nkpqTensorDesc, algos[0], sizeInBytesArray);
+ if (sizeInBytesArray[0] != 0)
+ ret.workSpace = gCtx.allocate(sizeInBytesArray[0]);
+ ret.sizeInBytes = sizeInBytesArray[0];
+ ret.algo = algos[0];
if (GPUStatistics.DISPLAY_STATISTICS)
GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_CUDNN_INIT, System.nanoTime() - t1);
return ret;
@@ -182,25 +175,19 @@ public class LibMatrixCuDNNConvolutionAlgorithm implements java.lang.AutoCloseab
LibMatrixCuDNNConvolutionAlgorithm ret = new LibMatrixCuDNNConvolutionAlgorithm(gCtx, instName, N, C, H, W, K, R, S,
pad_h, pad_w, stride_h, stride_w, P, Q);
- if(workspaceLimit <= 0) {
- // If overhead is greater than intermediate allocated memory, prefer the cudnn operator with no memory requirement
- // i.e. CUDNN_CONVOLUTION_BWD_FILTER_ALGO_0
- ret.algo = jcuda.jcudnn.cudnnConvolutionBwdFilterAlgo.CUDNN_CONVOLUTION_BWD_FILTER_ALGO_0;
- }
- else {
- int[] algos = {-1};
- long sizeInBytesArray[] = {Math.min(workspaceLimit, MAX_WORKSPACE_LIMIT_BYTES)};
- jcuda.jcudnn.JCudnn.cudnnGetConvolutionBackwardFilterAlgorithm(
- LibMatrixCuDNN.getCudnnHandle(gCtx),
- ret.nchwTensorDesc, ret.nkpqTensorDesc, ret.convDesc, ret.filterDesc,
- cudnnConvolutionBwdFilterPreference.CUDNN_CONVOLUTION_BWD_FILTER_SPECIFY_WORKSPACE_LIMIT, sizeInBytesArray[0], algos);
- jcuda.jcudnn.JCudnn.cudnnGetConvolutionBackwardFilterWorkspaceSize(LibMatrixCuDNN.getCudnnHandle(gCtx),
- ret.nchwTensorDesc, ret.nkpqTensorDesc, ret.convDesc, ret.filterDesc, algos[0], sizeInBytesArray);
- if (sizeInBytesArray[0] != 0)
- ret.workSpace = gCtx.allocate(sizeInBytesArray[0]);
- ret.sizeInBytes = sizeInBytesArray[0];
- ret.algo = algos[0];
- }
+ int[] algos = {-1};
+ long sizeInBytesArray[] = {Math.min(workspaceLimit, MAX_WORKSPACE_LIMIT_BYTES)};
+ jcuda.jcudnn.JCudnn.cudnnGetConvolutionBackwardFilterAlgorithm(
+ LibMatrixCuDNN.getCudnnHandle(gCtx),
+ ret.nchwTensorDesc, ret.nkpqTensorDesc, ret.convDesc, ret.filterDesc,
+ cudnnConvolutionBwdFilterPreference.CUDNN_CONVOLUTION_BWD_FILTER_SPECIFY_WORKSPACE_LIMIT, sizeInBytesArray[0], algos);
+ jcuda.jcudnn.JCudnn.cudnnGetConvolutionBackwardFilterWorkspaceSize(LibMatrixCuDNN.getCudnnHandle(gCtx),
+ ret.nchwTensorDesc, ret.nkpqTensorDesc, ret.convDesc, ret.filterDesc, algos[0], sizeInBytesArray);
+ if (sizeInBytesArray[0] != 0)
+ ret.workSpace = gCtx.allocate(sizeInBytesArray[0]);
+ ret.sizeInBytes = sizeInBytesArray[0];
+ ret.algo = algos[0];
+
if (GPUStatistics.DISPLAY_STATISTICS)
GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_CUDNN_INIT, System.nanoTime() - t1);
return ret;
@@ -235,27 +222,25 @@ public class LibMatrixCuDNNConvolutionAlgorithm implements java.lang.AutoCloseab
LibMatrixCuDNNConvolutionAlgorithm ret = new LibMatrixCuDNNConvolutionAlgorithm(gCtx, instName, N, C, H, W, K, R, S,
pad_h, pad_w, stride_h, stride_w, P, Q);
- if(workspaceLimit <= 0) {
- // If overhead is greater than intermediate allocated memory, prefer the cudnn operator with no memory requirement
- // i.e. CUDNN_CONVOLUTION_BWD_DATA_ALGO_0
- ret.algo = jcuda.jcudnn.cudnnConvolutionBwdDataAlgo.CUDNN_CONVOLUTION_BWD_DATA_ALGO_0;
- }
- else {
- int[] algos = {-1};
- long sizeInBytesArray[] = {Math.min(workspaceLimit, MAX_WORKSPACE_LIMIT_BYTES)};
- jcuda.jcudnn.JCudnn.cudnnGetConvolutionBackwardDataAlgorithm(
- LibMatrixCuDNN.getCudnnHandle(gCtx),
- ret.filterDesc, ret.nkpqTensorDesc, ret.convDesc, ret.nchwTensorDesc,
- cudnnConvolutionBwdDataPreference.CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT, sizeInBytesArray[0], algos);
- jcuda.jcudnn.JCudnn.cudnnGetConvolutionBackwardDataWorkspaceSize(LibMatrixCuDNN.getCudnnHandle(gCtx),
- ret.filterDesc, ret.nkpqTensorDesc, ret.convDesc, ret.nchwTensorDesc, algos[0], sizeInBytesArray);
- if (sizeInBytesArray[0] != 0)
- ret.workSpace = gCtx.allocate(sizeInBytesArray[0]);
- ret.sizeInBytes = sizeInBytesArray[0];
- ret.algo = algos[0];
- }
- if (GPUStatistics.DISPLAY_STATISTICS)
- GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_CUDNN_INIT, System.nanoTime() - t1);
+ // CuDNN's cudnnGetConvolutionBackwardDataAlgorithm returns CUDNN_CONVOLUTION_BWD_DATA_ALGO_1 for at least one scenario:
+ // a sentence CNN (N=1, C=1, H=2060, W=300, F=500, Hf=5, Wf=300, sparsity=0.1).
+ // This causes a more than 100x slowdown compared with CUDNN_CONVOLUTION_BWD_DATA_ALGO_0.
+ // To keep things simple for now, we always prefer the memory-less algorithm CUDNN_CONVOLUTION_BWD_DATA_ALGO_0.
+ ret.algo = jcuda.jcudnn.cudnnConvolutionBwdDataAlgo.CUDNN_CONVOLUTION_BWD_DATA_ALGO_0;
+// int[] algos = {-1};
+// long sizeInBytesArray[] = {Math.min(workspaceLimit, MAX_WORKSPACE_LIMIT_BYTES)};
+// jcuda.jcudnn.JCudnn.cudnnGetConvolutionBackwardDataAlgorithm(
+// LibMatrixCuDNN.getCudnnHandle(gCtx),
+// ret.filterDesc, ret.nkpqTensorDesc, ret.convDesc, ret.nchwTensorDesc,
+// cudnnConvolutionBwdDataPreference.CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT, sizeInBytesArray[0], algos);
+// jcuda.jcudnn.JCudnn.cudnnGetConvolutionBackwardDataWorkspaceSize(LibMatrixCuDNN.getCudnnHandle(gCtx),
+// ret.filterDesc, ret.nkpqTensorDesc, ret.convDesc, ret.nchwTensorDesc, algos[0], sizeInBytesArray);
+// if (sizeInBytesArray[0] != 0)
+// ret.workSpace = gCtx.allocate(sizeInBytesArray[0]);
+// ret.sizeInBytes = sizeInBytesArray[0];
+// ret.algo = algos[0];
+// if (GPUStatistics.DISPLAY_STATISTICS)
+// GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_CUDNN_INIT, System.nanoTime() - t1);
return ret;
}