You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@systemml.apache.org by na...@apache.org on 2017/05/04 23:27:43 UTC
incubator-systemml git commit: [HOTFIX] Bug fix for solve,
removed warnings and added instrumentation
Repository: incubator-systemml
Updated Branches:
refs/heads/master 76f3ca5d3 -> 2c5c3b14e
[HOTFIX] Bug fix for solve, removed warnings and added instrumentation
Project: http://git-wip-us.apache.org/repos/asf/incubator-systemml/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-systemml/commit/2c5c3b14
Tree: http://git-wip-us.apache.org/repos/asf/incubator-systemml/tree/2c5c3b14
Diff: http://git-wip-us.apache.org/repos/asf/incubator-systemml/diff/2c5c3b14
Branch: refs/heads/master
Commit: 2c5c3b14e1906cda70ae1581b19a5e908b3ab329
Parents: 76f3ca5
Author: Nakul Jindal <na...@gmail.com>
Authored: Thu May 4 16:26:47 2017 -0700
Committer: Nakul Jindal <na...@gmail.com>
Committed: Thu May 4 16:26:47 2017 -0700
----------------------------------------------------------------------
.../instructions/GPUInstructionParser.java | 4 +-
.../gpu/BuiltinBinaryGPUInstruction.java | 2 +
.../instructions/gpu/GPUInstruction.java | 28 ++++---
.../gpu/MatrixMatrixBuiltinGPUInstruction.java | 1 +
.../instructions/gpu/context/GPUContext.java | 2 +
.../instructions/gpu/context/GPUObject.java | 3 +-
.../runtime/matrix/data/LibMatrixCUDA.java | 77 +++++++++++++++-----
7 files changed, 86 insertions(+), 31 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/2c5c3b14/src/main/java/org/apache/sysml/runtime/instructions/GPUInstructionParser.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/instructions/GPUInstructionParser.java b/src/main/java/org/apache/sysml/runtime/instructions/GPUInstructionParser.java
index ef0412c..4a45521 100644
--- a/src/main/java/org/apache/sysml/runtime/instructions/GPUInstructionParser.java
+++ b/src/main/java/org/apache/sysml/runtime/instructions/GPUInstructionParser.java
@@ -35,9 +35,9 @@ import org.apache.sysml.runtime.instructions.gpu.AggregateUnaryGPUInstruction;
public class GPUInstructionParser extends InstructionParser
{
- public static final HashMap<String, GPUINSTRUCTION_TYPE> String2GPUInstructionType;
+ static final HashMap<String, GPUINSTRUCTION_TYPE> String2GPUInstructionType;
static {
- String2GPUInstructionType = new HashMap<String, GPUINSTRUCTION_TYPE>();
+ String2GPUInstructionType = new HashMap<>();
// Neural Network Operators
String2GPUInstructionType.put( "relu_backward", GPUINSTRUCTION_TYPE.Convolution);
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/2c5c3b14/src/main/java/org/apache/sysml/runtime/instructions/gpu/BuiltinBinaryGPUInstruction.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/instructions/gpu/BuiltinBinaryGPUInstruction.java b/src/main/java/org/apache/sysml/runtime/instructions/gpu/BuiltinBinaryGPUInstruction.java
index 372f883..24e9e79 100644
--- a/src/main/java/org/apache/sysml/runtime/instructions/gpu/BuiltinBinaryGPUInstruction.java
+++ b/src/main/java/org/apache/sysml/runtime/instructions/gpu/BuiltinBinaryGPUInstruction.java
@@ -30,7 +30,9 @@ import org.apache.sysml.runtime.matrix.operators.Operator;
public abstract class BuiltinBinaryGPUInstruction extends GPUInstruction {
+ @SuppressWarnings("unused")
private int _arity;
+
CPOperand output;
CPOperand input1, input2;
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/2c5c3b14/src/main/java/org/apache/sysml/runtime/instructions/gpu/GPUInstruction.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/instructions/gpu/GPUInstruction.java b/src/main/java/org/apache/sysml/runtime/instructions/gpu/GPUInstruction.java
index 9eef072..f4c523b 100644
--- a/src/main/java/org/apache/sysml/runtime/instructions/gpu/GPUInstruction.java
+++ b/src/main/java/org/apache/sysml/runtime/instructions/gpu/GPUInstruction.java
@@ -35,16 +35,20 @@ public abstract class GPUInstruction extends Instruction
public enum GPUINSTRUCTION_TYPE { AggregateUnary, AggregateBinary, Convolution, MMTSJ, Reorg, ArithmeticBinary, BuiltinUnary, BuiltinBinary, Builtin };
// Memory/conversions
- public final static String MISC_TIMER_HOST_TO_DEVICE = "H2D"; // time spent in bringing data to gpu (from host)
- public final static String MISC_TIMER_DEVICE_TO_HOST = "D2H"; // time spent in bringing data from gpu (to host)
- public final static String MISC_TIMER_DEVICE_TO_DEVICE = "D2D"; // time spent in copying data from one region on the device to another
- public final static String MISC_TIMER_SPARSE_TO_DENSE = "s2d"; // time spent in converting data from sparse to dense
- public final static String MISC_TIMER_DENSE_TO_SPARSE = "d2s"; // time spent in converting data from dense to sparse
- public final static String MISC_TIMER_CUDA_FREE = "f"; // time spent in calling cudaFree
- public final static String MISC_TIMER_ALLOCATE = "a"; // time spent to allocate memory on gpu
- public final static String MISC_TIMER_ALLOCATE_DENSE_OUTPUT = "ao"; // time spent to allocate dense output (recorded differently than MISC_TIMER_ALLOCATE)
- public final static String MISC_TIMER_SET_ZERO = "az"; // time spent to allocate
- public final static String MISC_TIMER_REUSE = "r"; // time spent in reusing already allocated memory on GPU (mainly for the count)
+ public final static String MISC_TIMER_HOST_TO_DEVICE = "H2D"; // time spent in bringing data to gpu (from host)
+ public final static String MISC_TIMER_DEVICE_TO_HOST = "D2H"; // time spent in bringing data from gpu (to host)
+ public final static String MISC_TIMER_DEVICE_TO_DEVICE = "D2D"; // time spent in copying data from one region on the device to another
+ public final static String MISC_TIMER_SPARSE_TO_DENSE = "s2d"; // time spent in converting data from sparse to dense
+ public final static String MISC_TIMER_DENSE_TO_SPARSE = "d2s"; // time spent in converting data from dense to sparse
+ public final static String MISC_TIMER_ROW_TO_COLUMN_MAJOR = "r2c"; // time spent in converting data from row major to column major
+ public final static String MISC_TIMER_COLUMN_TO_ROW_MAJOR = "c2r"; // time spent in converting data from column major to row major
+ public final static String MISC_TIMER_OBJECT_CLONE = "clone";// time spent in cloning (deep copying) a GPUObject instance
+
+ public final static String MISC_TIMER_CUDA_FREE = "f"; // time spent in calling cudaFree
+ public final static String MISC_TIMER_ALLOCATE = "a"; // time spent to allocate memory on gpu
+ public final static String MISC_TIMER_ALLOCATE_DENSE_OUTPUT = "ao"; // time spent to allocate dense output (recorded differently than MISC_TIMER_ALLOCATE)
+ public final static String MISC_TIMER_SET_ZERO = "az"; // time spent to allocate
+ public final static String MISC_TIMER_REUSE = "r"; // time spent in reusing already allocated memory on GPU (mainly for the count)
// Matmult instructions
public final static String MISC_TIMER_SPARSE_ALLOCATE_LIB = "Msao"; // time spend in allocating for sparse matrix output
@@ -58,6 +62,10 @@ public abstract class GPUInstruction extends Instruction
// Other BLAS instructions
public final static String MISC_TIMER_DAXPY_LIB = "daxpy"; // time spent in daxpy
+ public final static String MISC_TIMER_QR_BUFFER = "qr_buffer"; // time spent in calculating buffer needed to perform QR
+ public final static String MISC_TIMER_QR = "qr"; // time spent in doing QR
+ public final static String MISC_TIMER_ORMQR = "ormqr"; // time spent in ormqr
+ public final static String MISC_TIMER_TRSM = "trsm"; // time spent in cublas Dtrsm
// Transpose
public final static String MISC_TIMER_SPARSE_DGEAM_LIB = "sdgeaml"; // time spent in sparse transpose (and other ops of type a*op(A) + b*op(B))
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/2c5c3b14/src/main/java/org/apache/sysml/runtime/instructions/gpu/MatrixMatrixBuiltinGPUInstruction.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/instructions/gpu/MatrixMatrixBuiltinGPUInstruction.java b/src/main/java/org/apache/sysml/runtime/instructions/gpu/MatrixMatrixBuiltinGPUInstruction.java
index f492b6e..8936735 100644
--- a/src/main/java/org/apache/sysml/runtime/instructions/gpu/MatrixMatrixBuiltinGPUInstruction.java
+++ b/src/main/java/org/apache/sysml/runtime/instructions/gpu/MatrixMatrixBuiltinGPUInstruction.java
@@ -45,6 +45,7 @@ public class MatrixMatrixBuiltinGPUInstruction extends BuiltinBinaryGPUInstructi
MatrixObject mat2 = getMatrixInputForGPUInstruction(ec, input2.getName());
if(opcode.equals("solve")) {
+ ec.setMetaData(output.getName(), mat1.getNumColumns(), 1);
LibMatrixCUDA.solve(ec, ec.getGPUContext(), getExtendedOpcode(), mat1, mat2, output.getName());
} else {
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/2c5c3b14/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUContext.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUContext.java b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUContext.java
index d71f725..673601f 100644
--- a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUContext.java
+++ b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUContext.java
@@ -307,6 +307,8 @@ public class GPUContext {
freeList = new LinkedList<Pointer>();
freeCUDASpaceMap.put(size, freeList);
}
+ if (freeList.contains(toFree))
+ throw new RuntimeException("GPU : Internal state corrupted, double free");
freeList.add(toFree);
}
}
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/2c5c3b14/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUObject.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUObject.java b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUObject.java
index 1d2285d..d735e38 100644
--- a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUObject.java
+++ b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUObject.java
@@ -26,7 +26,6 @@ import static jcuda.jcudnn.cudnnDataType.CUDNN_DATA_DOUBLE;
import static jcuda.jcudnn.cudnnTensorFormat.CUDNN_TENSOR_NCHW;
import static jcuda.jcusparse.JCusparse.cusparseDdense2csr;
import static jcuda.jcusparse.JCusparse.cusparseDnnz;
-import static jcuda.runtime.JCuda.cudaMalloc;
import static jcuda.runtime.JCuda.cudaMemcpy;
import static jcuda.runtime.cudaMemcpyKind.cudaMemcpyDeviceToDevice;
import static jcuda.runtime.cudaMemcpyKind.cudaMemcpyDeviceToHost;
@@ -343,7 +342,7 @@ public class GPUObject {
/**
* Convenience method. Converts Column Major Dense Matrix to Row Major Dense Matrix
- * @throws DMLRuntimeException
+ * @throws DMLRuntimeException if error
*/
public void denseColumnMajorToRowMajor() throws DMLRuntimeException {
LOG.trace("GPU : dense Ptr row-major -> col-major on " + this + ", GPUContext=" + getGPUContext());
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/2c5c3b14/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCUDA.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCUDA.java b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCUDA.java
index 23304b5..a99571a 100644
--- a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCUDA.java
+++ b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCUDA.java
@@ -329,6 +329,7 @@ public class LibMatrixCUDA {
* @return a sparse matrix pointer
* @throws DMLRuntimeException if error occurs
*/
+ @SuppressWarnings("unused")
private static CSRPointer getSparsePointer(GPUContext gCtx, MatrixObject input, String instName) throws DMLRuntimeException {
if(!isInSparseFormat(gCtx, input)) {
input.getGPUObject(gCtx).denseToSparse();
@@ -2754,6 +2755,25 @@ public class LibMatrixCUDA {
Pointer betaPtr = pointerTo(beta);
int transa = isLeftTransposed ? CUBLAS_OP_T : CUBLAS_OP_N;
int transb = isRightTransposed ? CUBLAS_OP_T : CUBLAS_OP_N;
+
+ int lda = (int) in1.getNumColumns();
+ int ldb = (int) in2.getNumColumns();
+ int m = (int) in1.getNumColumns();
+ int n = (int) in2.getNumRows();
+ if (isLeftTransposed && isRightTransposed) {
+ m = (int) in1.getNumRows();
+ n = (int) in2.getNumColumns();
+ }
+ else if (isLeftTransposed) {
+ m = (int) in1.getNumRows();
+ } else if (isRightTransposed) {
+ n = (int) in2.getNumColumns();
+ }
+ int ldc = m;
+
+
+
+ /**
int m = (int) in1.getNumRows();
int n = (int) in1.getNumColumns();
if(!isLeftTransposed && isRightTransposed) {
@@ -2763,6 +2783,7 @@ public class LibMatrixCUDA {
int lda = isLeftTransposed ? n : m;
int ldb = isRightTransposed ? n : m;
int ldc = m;
+ **/
MatrixObject out = ec.getMatrixObject(outputName);
boolean isSparse1 = isInSparseFormat(gCtx, in1);
@@ -2963,8 +2984,10 @@ public class LibMatrixCUDA {
throw new DMLRuntimeException("GPU : Invalid internal state, the GPUContext set with the ExecutionContext is not the same used to run this LibMatrixCUDA function");
// x = solve(A, b)
+ LOG.trace("GPU : solve" + ", GPUContext=" + gCtx);
+
+ long t0 = -1;
- // Both Sparse
if (!isInSparseFormat(gCtx, in1) && !isInSparseFormat(gCtx, in2)) { // Both dense
GPUObject Aobj = in1.getGPUObject(gCtx);
GPUObject bobj = in2.getGPUObject(gCtx);
@@ -2980,55 +3003,75 @@ public class LibMatrixCUDA {
// convert dense matrices to row major
// Operation in cuSolver and cuBlas are for column major dense matrices
// and are destructive to the original input
+ if (GPUStatistics.DISPLAY_STATISTICS) t0 = System.nanoTime();
GPUObject ATobj = (GPUObject) Aobj.clone();
- ATobj.denseRowMajorToColumnMajor();
+ if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_OBJECT_CLONE, System.nanoTime() - t0);
+ if (GPUStatistics.DISPLAY_STATISTICS) t0 = System.nanoTime();
+ ATobj.denseRowMajorToColumnMajor();
+ if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_ROW_TO_COLUMN_MAJOR, System.nanoTime() - t0);
Pointer A = ATobj.getJcudaDenseMatrixPtr();
+ if (GPUStatistics.DISPLAY_STATISTICS) t0 = System.nanoTime();
GPUObject bTobj = (GPUObject) bobj.clone();
- bTobj.denseRowMajorToColumnMajor();
- Pointer b = bTobj.getJcudaDenseMatrixPtr();
+ if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_OBJECT_CLONE, System.nanoTime() - t0);
+ if (GPUStatistics.DISPLAY_STATISTICS) t0 = System.nanoTime();
+ bTobj.denseRowMajorToColumnMajor();
+ if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_ROW_TO_COLUMN_MAJOR, System.nanoTime() - t0);
+
+ Pointer b = bTobj.getJcudaDenseMatrixPtr();
// The following set of operations is done following the example in the cusolver documentation
// http://docs.nvidia.com/cuda/cusolver/#ormqr-example1
// step 3: query working space of geqrf and ormqr
- int[] lwork = {0};
+ if (GPUStatistics.DISPLAY_STATISTICS) t0 = System.nanoTime();
+ int[] lwork = {0};
JCusolverDn.cusolverDnDgeqrf_bufferSize(gCtx.getCusolverDnHandle(), m, n, A, m, lwork);
+ if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_QR_BUFFER, System.nanoTime() - t0);
+
- // step 4: compute QR factorization
- Pointer work = gCtx.allocate(lwork[0] * Sizeof.DOUBLE);
- Pointer tau = gCtx.allocate(Math.max(m, m) * Sizeof.DOUBLE);
+ // step 4: compute QR factorization
+ Pointer work = gCtx.allocate(instName, lwork[0] * Sizeof.DOUBLE);
+ Pointer tau = gCtx.allocate(instName, Math.max(m, m) * Sizeof.DOUBLE);
Pointer devInfo = gCtx.allocate(Sizeof.INT);
- JCusolverDn.cusolverDnDgeqrf(gCtx.getCusolverDnHandle(), m, n, A, m, tau, work, lwork[0], devInfo);
+ if (GPUStatistics.DISPLAY_STATISTICS) t0 = System.nanoTime();
+ JCusolverDn.cusolverDnDgeqrf(gCtx.getCusolverDnHandle(), m, n, A, m, tau, work, lwork[0], devInfo);
+ if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_QR, System.nanoTime() - t0);
- int[] qrError = {-1};
+
+ int[] qrError = {-1};
cudaMemcpy(Pointer.to(qrError), devInfo, Sizeof.INT, cudaMemcpyDeviceToHost);
if (qrError[0] != 0) {
throw new DMLRuntimeException("GPU : Error in call to geqrf (QR factorization) as part of solve, argument " + qrError[0] + " was wrong");
}
// step 5: compute Q^T*B
- JCusolverDn.cusolverDnDormqr(gCtx.getCusolverDnHandle(), cublasSideMode.CUBLAS_SIDE_LEFT, cublasOperation.CUBLAS_OP_T, m, 1, n, A, m, tau, b, m, work, lwork[0], devInfo);
- cudaMemcpy(Pointer.to(qrError), devInfo, Sizeof.INT, cudaMemcpyDeviceToHost);
+ if (GPUStatistics.DISPLAY_STATISTICS) t0 = System.nanoTime();
+ JCusolverDn.cusolverDnDormqr(gCtx.getCusolverDnHandle(), cublasSideMode.CUBLAS_SIDE_LEFT, cublasOperation.CUBLAS_OP_T, m, 1, n, A, m, tau, b, m, work, lwork[0], devInfo);
+ if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_ORMQR, System.nanoTime() - t0);
+ cudaMemcpy(Pointer.to(qrError), devInfo, Sizeof.INT, cudaMemcpyDeviceToHost);
if (qrError[0] != 0) {
throw new DMLRuntimeException("GPU : Error in call to ormqr (to compuete Q^T*B after QR factorization) as part of solve, argument " + qrError[0] + " was wrong");
}
// step 6: compute x = R \ Q^T*B
- JCublas2.cublasDtrsm(gCtx.getCublasHandle(),
+ if (GPUStatistics.DISPLAY_STATISTICS) t0 = System.nanoTime();
+ JCublas2.cublasDtrsm(gCtx.getCublasHandle(),
cublasSideMode.CUBLAS_SIDE_LEFT, cublasFillMode.CUBLAS_FILL_MODE_UPPER, cublasOperation.CUBLAS_OP_N, cublasDiagType.CUBLAS_DIAG_NON_UNIT,
n, 1, pointerTo(1.0), A, m, b, m);
+ if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_TRSM, System.nanoTime() - t0);
- bTobj.denseColumnMajorToRowMajor();
+ if (GPUStatistics.DISPLAY_STATISTICS) t0 = System.nanoTime();
+ bTobj.denseColumnMajorToRowMajor();
+ if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_COLUMN_TO_ROW_MAJOR, System.nanoTime() - t0);
// TODO : Find a way to assign bTobj directly to the output and set the correct flags so as to not crash
// There is an avoidable copy happening here
MatrixObject out = getDenseMatrixOutputForGPUInstruction(ec, instName, outputName);
cudaMemcpy(out.getGPUObject(gCtx).getJcudaDenseMatrixPtr(), bTobj.getJcudaDenseMatrixPtr(), n * 1 * Sizeof.DOUBLE, cudaMemcpyDeviceToDevice);
- gCtx.cudaFreeHelper(work);
- gCtx.cudaFreeHelper(tau);
- gCtx.cudaFreeHelper(tau);
+ gCtx.cudaFreeHelper(instName, work);
+ gCtx.cudaFreeHelper(instName, tau);
ATobj.clearData();
bTobj.clearData();