Posted to commits@systemml.apache.org by ni...@apache.org on 2017/09/28 20:15:38 UTC
systemml git commit: [MINOR] [SYSTEMML-446] Added time spent in jcuda sync to fine-grained statistics
Repository: systemml
Updated Branches:
refs/heads/master 61dcc85e4 -> 0cb2f7f68
[MINOR] [SYSTEMML-446] Added time spent in jcuda sync to fine-grained statistics
- Also added the force accelerator flag to LibMatrixCuDNN to skip the
  worst-case memory budget restriction.
Project: http://git-wip-us.apache.org/repos/asf/systemml/repo
Commit: http://git-wip-us.apache.org/repos/asf/systemml/commit/0cb2f7f6
Tree: http://git-wip-us.apache.org/repos/asf/systemml/tree/0cb2f7f6
Diff: http://git-wip-us.apache.org/repos/asf/systemml/diff/0cb2f7f6
Branch: refs/heads/master
Commit: 0cb2f7f68cb644c7fda6666bc84782e82069fb34
Parents: 61dcc85
Author: Niketan Pansare <np...@us.ibm.com>
Authored: Thu Sep 28 12:14:28 2017 -0800
Committer: Niketan Pansare <np...@us.ibm.com>
Committed: Thu Sep 28 13:14:28 2017 -0700
----------------------------------------------------------------------
.../instructions/gpu/GPUInstruction.java | 7 +++++-
.../runtime/matrix/data/LibMatrixCuDNN.java | 26 ++++++++++++--------
2 files changed, 22 insertions(+), 11 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/systemml/blob/0cb2f7f6/src/main/java/org/apache/sysml/runtime/instructions/gpu/GPUInstruction.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/instructions/gpu/GPUInstruction.java b/src/main/java/org/apache/sysml/runtime/instructions/gpu/GPUInstruction.java
index bc3ba9b..108a622 100644
--- a/src/main/java/org/apache/sysml/runtime/instructions/gpu/GPUInstruction.java
+++ b/src/main/java/org/apache/sysml/runtime/instructions/gpu/GPUInstruction.java
@@ -61,7 +61,8 @@ public abstract class GPUInstruction extends Instruction {
public final static String MISC_TIMER_ROW_TO_COLUMN_MAJOR = "r2c"; // time spent in converting data from row major to column major
public final static String MISC_TIMER_COLUMN_TO_ROW_MAJOR = "c2r"; // time spent in converting data from column major to row major
public final static String MISC_TIMER_OBJECT_CLONE = "clone";// time spent in cloning (deep copying) a GPUObject instance
-
+ public final static String MISC_TIMER_CUDA_SYNC = "sync"; // time spent in device sync
+
public final static String MISC_TIMER_CUDA_FREE = "f"; // time spent in calling cudaFree
public final static String MISC_TIMER_ALLOCATE = "a"; // time spent to allocate memory on gpu
public final static String MISC_TIMER_ALLOCATE_DENSE_OUTPUT = "ad"; // time spent to allocate dense output (recorded differently than MISC_TIMER_ALLOCATE)
@@ -198,7 +199,11 @@ public abstract class GPUInstruction extends Instruction {
throws DMLRuntimeException
{
if(DMLScript.SYNCHRONIZE_GPU) {
+ long t0 = GPUStatistics.DISPLAY_STATISTICS ? System.nanoTime() : 0;
jcuda.runtime.JCuda.cudaDeviceSynchronize();
+ if(GPUStatistics.DISPLAY_STATISTICS) {
+ GPUStatistics.maintainCPMiscTimes(getExtendedOpcode(), GPUInstruction.MISC_TIMER_CUDA_SYNC, System.nanoTime() - t0);
+ }
}
if(LOG.isDebugEnabled()) {
for(GPUContext gpuCtx : ec.getGPUContexts()) {
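Editor's note: the hunk above gates the System.nanoTime() sampling itself
behind GPUStatistics.DISPLAY_STATISTICS, so the synchronize path pays no
timing overhead when fine-grained statistics are off. Below is a minimal,
self-contained Java sketch of that idiom; the class, field, and property
names are illustrative stand-ins, not SystemML APIs.

public class GatedTimerDemo {
	// Stand-in for GPUStatistics.DISPLAY_STATISTICS.
	static final boolean DISPLAY_STATISTICS = Boolean.getBoolean("demo.stats");

	public static void main(String[] args) throws InterruptedException {
		// Sample the clock only when statistics are enabled; when they are
		// off, the hot path makes no nanoTime() call at all.
		long t0 = DISPLAY_STATISTICS ? System.nanoTime() : 0;
		Thread.sleep(5); // stand-in for jcuda.runtime.JCuda.cudaDeviceSynchronize()
		if (DISPLAY_STATISTICS) {
			// Stand-in for GPUStatistics.maintainCPMiscTimes(opcode, "sync", elapsed).
			System.out.println("sync: " + (System.nanoTime() - t0) + " ns");
		}
	}
}

Because cudaDeviceSynchronize() blocks until all previously queued GPU work
completes, the new "sync" timer effectively accounts for asynchronous kernel
time that would otherwise go unattributed.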
http://git-wip-us.apache.org/repos/asf/systemml/blob/0cb2f7f6/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuDNN.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuDNN.java b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuDNN.java
index 602edce..654bd9d 100644
--- a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuDNN.java
+++ b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuDNN.java
@@ -67,6 +67,7 @@ import jcuda.jcudnn.cudnnTensorDescriptor;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
+import org.apache.sysml.api.DMLScript;
import org.apache.sysml.hops.OptimizerUtils;
import org.apache.sysml.runtime.DMLRuntimeException;
import org.apache.sysml.runtime.controlprogram.caching.MatrixObject;
@@ -153,7 +154,8 @@ public class LibMatrixCuDNN extends LibMatrixCUDA {
long CHW = C*H*W; long KPQ = K*P*Q; long CRS = C*R*S;
long NCHW = N*CHW; long NKPQ = N*KPQ; long KCRS = K*CRS;
- if(NCHW < maxNumDoublesOfCuDNNTensor && NKPQ < maxNumDoublesOfCuDNNTensor && KCRS < maxNumDoublesOfCuDNNTensor) {
+ if(DMLScript.FORCE_ACCELERATOR ||
+ (NCHW < maxNumDoublesOfCuDNNTensor && NKPQ < maxNumDoublesOfCuDNNTensor && KCRS < maxNumDoublesOfCuDNNTensor)) {
// Filter and output are accounted as dense in the memory estimation for conv2d
double overhead = isInSparseFormat(gCtx, filter) ? OptimizerUtils.estimateSizeExactSparsity(K, CRS, 1.0) : 0;
overhead += isInSparseFormat(gCtx, image) ? OptimizerUtils.estimateSizeExactSparsity(N, CHW, 1.0) : 0;
@@ -161,7 +163,7 @@ public class LibMatrixCuDNN extends LibMatrixCUDA {
Pointer filterPointer = getDensePointerForCuDNN(gCtx, filter, instName);
Pointer dstPointer = getDensePointerForCuDNN(gCtx, outputBlock, instName);
- if(overhead <= intermediateMemoryBudget) {
+ if(DMLScript.FORCE_ACCELERATOR || overhead <= intermediateMemoryBudget) {
// Perform all-input all-channel conv2d
Pointer imagePointer = getDensePointerForCuDNN(gCtx, image, instName);
cudnnConv2d(gCtx, instName, imagePointer, filterPointer, dstPointer, N, C, H, W, K, R, S, pad_h, pad_w, stride_h, stride_w, P, Q);
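Editor's note: this hunk, and the four that follow, apply the same
short-circuit: DMLScript.FORCE_ACCELERATOR bypasses both the
maxNumDoublesOfCuDNNTensor size check and the intermediate-memory-budget
check. A small self-contained sketch of the guard follows; everything except
the short-circuit condition itself is a hypothetical name for illustration.

public class AdmissionDemo {
	// Stand-in for the commit's guard: when the accelerator is forced, the
	// worst-case overhead estimate is not consulted at all; otherwise the
	// operation is admitted only if the estimate fits the budget.
	static boolean admitOnGpu(boolean forceAccelerator, double overheadBytes, double budgetBytes) {
		return forceAccelerator || overheadBytes <= budgetBytes;
	}

	public static void main(String[] args) {
		double budget = 1e9;   // hypothetical intermediate memory budget, in bytes
		double overhead = 4e9; // hypothetical sparse-to-dense conversion estimate
		System.out.println(admitOnGpu(false, overhead, budget)); // false: over budget
		System.out.println(admitOnGpu(true, overhead, budget));  // true: forced onto GPU
	}
}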
@@ -346,11 +348,12 @@ public class LibMatrixCuDNN extends LibMatrixCUDA {
long CHW = C*H*W; long KPQ = K*P*Q; long CRS = C*R*S;
long NCHW = N*CHW; long NKPQ = N*KPQ; long KCRS = K*CRS;
- if(NCHW < maxNumDoublesOfCuDNNTensor && NKPQ < maxNumDoublesOfCuDNNTensor && KCRS < maxNumDoublesOfCuDNNTensor) {
+ if(DMLScript.FORCE_ACCELERATOR ||
+ (NCHW < maxNumDoublesOfCuDNNTensor && NKPQ < maxNumDoublesOfCuDNNTensor && KCRS < maxNumDoublesOfCuDNNTensor)) {
Pointer dwPointer = getDensePointerForCuDNN(gCtx, outputBlock, instName);
double overhead = isInSparseFormat(gCtx, image) ? OptimizerUtils.estimateSizeExactSparsity(N, CHW, 1.0) : 0;
overhead += isInSparseFormat(gCtx, dout) ? OptimizerUtils.estimateSizeExactSparsity(N, KPQ, 1.0) : 0;
- if(overhead <= intermediateMemoryBudget) {
+ if(DMLScript.FORCE_ACCELERATOR || overhead <= intermediateMemoryBudget) {
// Perform all-input all-channel conv2dBackwardFilter
Pointer imagePointer = getDensePointerForCuDNN(gCtx, image, instName);
Pointer doutPointer = getDensePointerForCuDNN(gCtx, dout, instName);
@@ -502,13 +505,14 @@ public class LibMatrixCuDNN extends LibMatrixCUDA {
long CHW = C*H*W; long KPQ = K*P*Q; long CRS = C*R*S;
long NCHW = N*CHW; long NKPQ = N*KPQ; long KCRS = K*CRS;
- if(NCHW < maxNumDoublesOfCuDNNTensor && NKPQ < maxNumDoublesOfCuDNNTensor && KCRS < maxNumDoublesOfCuDNNTensor) {
+ if(DMLScript.FORCE_ACCELERATOR ||
+ (NCHW < maxNumDoublesOfCuDNNTensor && NKPQ < maxNumDoublesOfCuDNNTensor && KCRS < maxNumDoublesOfCuDNNTensor)) {
// Filter and output are accounted as dense in the memory estimation for conv2dBackwardData
double overhead = isInSparseFormat(gCtx, filter) ? OptimizerUtils.estimateSizeExactSparsity(K, CRS, 1.0) : 0;
overhead += isInSparseFormat(gCtx, dout) ? OptimizerUtils.estimateSizeExactSparsity(N, KPQ, 1.0) : 0;
Pointer filterPointer = getDensePointerForCuDNN(gCtx, filter, instName);
Pointer dstPointer = getDensePointerForCuDNN(gCtx, output, instName);
- if(overhead <= intermediateMemoryBudget) {
+ if(DMLScript.FORCE_ACCELERATOR || overhead <= intermediateMemoryBudget) {
// Perform all-input all-channel conv2dBackwardData
Pointer doutPointer = getDensePointerForCuDNN(gCtx, dout, instName);
cudnnConv2dBackwardData(gCtx, instName, filterPointer, doutPointer, dstPointer,
@@ -638,11 +642,12 @@ public class LibMatrixCuDNN extends LibMatrixCUDA {
long CHW = C*H*W; long CPQ = C*P*Q;
long NCHW = N*CHW; long NCPQ = N*CPQ;
- if(NCHW < maxNumDoublesOfCuDNNTensor && NCPQ < maxNumDoublesOfCuDNNTensor) {
+ if(DMLScript.FORCE_ACCELERATOR ||
+ (NCHW < maxNumDoublesOfCuDNNTensor && NCPQ < maxNumDoublesOfCuDNNTensor)) {
// Filter and output are accounted as dense in the memory estimation for conv2dBackwardData
long overhead = isInSparseFormat(gCtx, image) ? OptimizerUtils.estimateSizeExactSparsity(N, CHW, 1.0) : 0;
Pointer y = getDensePointerForCuDNN(gCtx, outputBlock, instName);
- if(overhead <= intermediateMemoryBudget) {
+ if(DMLScript.FORCE_ACCELERATOR || overhead <= intermediateMemoryBudget) {
Pointer x = getDensePointerForCuDNN(gCtx, image, instName);
cudnnTensorDescriptor xDesc = allocateTensorDescriptor(gCtx, image, N, C, H, W);
cudnnMaxpooling(gCtx, instName, x, xDesc, y, N, C, H, W, K, R, S, pad_h, pad_w, stride_h, stride_w, P, Q);
@@ -780,12 +785,13 @@ public class LibMatrixCuDNN extends LibMatrixCUDA {
long CHW = C*H*W; long CPQ = C*P*Q;
long NCHW = N*CHW; long NCPQ = N*CPQ;
- if(NCHW < maxNumDoublesOfCuDNNTensor && NCPQ < maxNumDoublesOfCuDNNTensor) {
+ if(DMLScript.FORCE_ACCELERATOR ||
+ (NCHW < maxNumDoublesOfCuDNNTensor && NCPQ < maxNumDoublesOfCuDNNTensor)) {
// Filter and output are accounted as dense in the memory estimation for conv2dBackwardData
long overhead = isInSparseFormat(gCtx, image) ? OptimizerUtils.estimateSizeExactSparsity(N, CHW, 1.0) : 0;
overhead += isInSparseFormat(gCtx, dout) ? OptimizerUtils.estimateSizeExactSparsity(N, CPQ, 1.0) : 0;
Pointer dx = getDensePointerForCuDNN(gCtx, outputBlock, instName);
- if(overhead <= intermediateMemoryBudget) {
+ if(DMLScript.FORCE_ACCELERATOR || overhead <= intermediateMemoryBudget) {
Pointer x = getDensePointerForCuDNN(gCtx, image, instName);
Pointer dy = getDensePointerForCuDNN(gCtx, dout, instName);
cudnnMaxpoolingBackward(gCtx, instName, x, dy, dx, N, C, H, W, K, R, S, pad_h, pad_w, stride_h, stride_w, P, Q);
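Editor's note: for completeness, an invocation sketch. The exact option
spellings below are assumptions based on SystemML documentation of this era,
not confirmed by this commit: fine-grained GPU timers such as the new "sync"
entry are printed with the -stats option, and DMLScript.FORCE_ACCELERATOR
corresponds to forcing the GPU backend with "-gpu force" rather than plain
"-gpu".

	spark-submit SystemML.jar -f script.dml -stats -gpu force

Since forcing bypasses only the admission estimates, GPU allocation can still
fail at runtime if the tensors genuinely do not fit in device memory.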