You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@systemml.apache.org by ni...@apache.org on 2017/08/24 21:41:32 UTC
[2/5] systemml git commit: [SYSTEMML-1793] Support matrix range
indexing on GPU
http://git-wip-us.apache.org/repos/asf/systemml/blob/628ffad1/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCUDA.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCUDA.java b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCUDA.java
index 62c0e0d..92a5546 100644
--- a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCUDA.java
+++ b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCUDA.java
@@ -112,6 +112,7 @@ import org.apache.sysml.runtime.matrix.operators.CMOperator;
import org.apache.sysml.runtime.matrix.operators.LeftScalarOperator;
import org.apache.sysml.runtime.matrix.operators.RightScalarOperator;
import org.apache.sysml.runtime.matrix.operators.ScalarOperator;
+import org.apache.sysml.runtime.util.IndexRange;
import org.apache.sysml.utils.GPUStatistics;
import org.apache.sysml.utils.Statistics;
@@ -148,7 +149,7 @@ public class LibMatrixCUDA {
private static final Log LOG = LogFactory.getLog(LibMatrixCUDA.class.getName());
- // Assume Compute Capability 3.0
+ // Assume Compute Capability 3.0
// MAX BLOCKS is 2^31 - 1 For compute capability > 3.0
// MAX_THREADS is 1024 For compute capability > 3.0
private static int _MAX_THREADS = -1;
@@ -163,7 +164,7 @@ public class LibMatrixCUDA {
static GPUContext gCtx throws DMLRuntimeException {
return GPUContext.gCtx;
}
- */
+ */
/**
* Utility function to get maximum number of threads supported by the active CUDA device.
@@ -336,7 +337,6 @@ public class LibMatrixCUDA {
* @return a sparse matrix pointer
* @throws DMLRuntimeException if error occurs
*/
- @SuppressWarnings("unused")
private static CSRPointer getSparsePointer(GPUContext gCtx, MatrixObject input, String instName) throws DMLRuntimeException {
if(!isInSparseFormat(gCtx, input)) {
input.getGPUObject(gCtx).denseToSparse();
@@ -405,7 +405,7 @@ public class LibMatrixCUDA {
biasAdd(instName, tmp, biasPointer, outputPointer, rows, cols, (int)k1);
cudaFreeHelper(tmp);
- */
+ */
LOG.trace("GPU : conv2dBiasAdd" + ", GPUContext=" + gCtx);
conv2d(gCtx, instName, image, filter, output, N, C, H, W, K, R, S, pad_h, pad_w, stride_h, stride_w, P, Q);
//cudaDeviceSynchronize;
@@ -413,7 +413,7 @@ public class LibMatrixCUDA {
}
public static void conv2d(GPUContext gCtx, String instName, MatrixObject image, MatrixObject filter, MatrixObject outputBlock, int N, int C, int H, int W,
- int K, int R, int S, int pad_h, int pad_w, int stride_h, int stride_w, int P, int Q)
+ int K, int R, int S, int pad_h, int pad_w, int stride_h, int stride_w, int P, int Q)
throws DMLRuntimeException {
Pointer imagePointer = getDensePointer(gCtx, image, true, instName);
Pointer filterPointer = getDensePointer(gCtx, filter, true, instName);
@@ -448,10 +448,10 @@ public class LibMatrixCUDA {
* @throws DMLRuntimeException if error
*/
public static void conv2d(GPUContext gCtx, String instName, Pointer image, Pointer filter, Pointer output, int N,
- int C, int H, int W, int K, int R, int S, int pad_h, int pad_w, int stride_h, int stride_w, int P, int Q)
+ int C, int H, int W, int K, int R, int S, int pad_h, int pad_w, int stride_h, int stride_w, int P, int Q)
throws DMLRuntimeException {
- LOG.trace("GPU : conv2d" + ", GPUContext=" + gCtx);
- cudnnFilterDescriptor filterDesc = null;
+ LOG.trace("GPU : conv2d" + ", GPUContext=" + gCtx);
+ cudnnFilterDescriptor filterDesc = null;
cudnnConvolutionDescriptor convDesc = null;
Pointer workSpace = null;
long sizeInBytes = 0;
@@ -480,7 +480,7 @@ public class LibMatrixCUDA {
// Also ensure that GPU has enough memory to allocate memory
long sizeInBytesArray[] = {0};
jcuda.jcudnn.JCudnn.cudnnGetConvolutionForwardAlgorithm(getCudnnHandle(gCtx), srcTensorDesc, filterDesc, convDesc, dstTensorDesc,
- CONVOLUTION_PREFERENCE, sizeInBytesArray[0], algos);
+ CONVOLUTION_PREFERENCE, sizeInBytesArray[0], algos);
cudnnGetConvolutionForwardWorkspaceSize(getCudnnHandle(gCtx), srcTensorDesc, filterDesc, convDesc, dstTensorDesc, algos[0], sizeInBytesArray);
if (sizeInBytesArray[0] != 0)
workSpace = gCtx.allocate(sizeInBytesArray[0]);
@@ -494,10 +494,10 @@ public class LibMatrixCUDA {
GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_CUDNN_INIT, System.nanoTime() - t1);
if (GPUStatistics.DISPLAY_STATISTICS) t2 = System.nanoTime();
int status = cudnnConvolutionForward(getCudnnHandle(gCtx), one(),
- srcTensorDesc, image,
- filterDesc, filter,
- convDesc, algo, workSpace, sizeInBytes, zero(),
- dstTensorDesc, output);
+ srcTensorDesc, image,
+ filterDesc, filter,
+ convDesc, algo, workSpace, sizeInBytes, zero(),
+ dstTensorDesc, output);
if (GPUStatistics.DISPLAY_STATISTICS)
GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_CONVOLUTION_FORWARD_LIB, System.nanoTime() - t2);
if (status != cudnnStatus.CUDNN_STATUS_SUCCESS) {
@@ -564,8 +564,8 @@ public class LibMatrixCUDA {
* @throws DMLRuntimeException if DMLRuntimeException occurs
*/
public static void reluBackward(GPUContext gCtx, String instName, MatrixObject input, MatrixObject dout, MatrixObject outputBlock) throws DMLRuntimeException {
- LOG.trace("GPU : reluBackward" + ", GPUContext=" + gCtx);
- long rows = input.getNumRows();
+ LOG.trace("GPU : reluBackward" + ", GPUContext=" + gCtx);
+ long rows = input.getNumRows();
long cols = input.getNumColumns();
Pointer imagePointer = getDensePointer(gCtx, input, instName);
Pointer doutPointer = getDensePointer(gCtx, dout, instName);
@@ -574,8 +574,8 @@ public class LibMatrixCUDA {
long t1=0;
if (GPUStatistics.DISPLAY_STATISTICS) t1 = System.nanoTime();
getCudaKernels(gCtx).launchKernel("relu_backward",
- ExecutionConfig.getConfigForSimpleMatrixOperations(toInt(rows), toInt(cols)),
- imagePointer, doutPointer, outputPointer, toInt(rows), toInt(cols));
+ ExecutionConfig.getConfigForSimpleMatrixOperations(toInt(rows), toInt(cols)),
+ imagePointer, doutPointer, outputPointer, toInt(rows), toInt(cols));
if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_BIAS_ADD_LIB, System.nanoTime() - t1);
}
@@ -593,8 +593,8 @@ public class LibMatrixCUDA {
* @throws DMLRuntimeException if DMLRuntimeException occurs
*/
public static void biasMultiply(GPUContext gCtx, String instName, MatrixObject input, MatrixObject bias, MatrixObject outputBlock) throws DMLRuntimeException {
- LOG.trace("GPU : biasMultiply" + ", GPUContext=" + gCtx);
- if(isInSparseFormat(gCtx, input)) {
+ LOG.trace("GPU : biasMultiply" + ", GPUContext=" + gCtx);
+ if(isInSparseFormat(gCtx, input)) {
input.getGPUObject(gCtx).sparseToDense(instName);
}
if(isInSparseFormat(gCtx, bias)) {
@@ -613,8 +613,8 @@ public class LibMatrixCUDA {
long t1 = 0;
if (GPUStatistics.DISPLAY_STATISTICS) t1 = System.nanoTime();
getCudaKernels(gCtx).launchKernel("bias_multiply",
- ExecutionConfig.getConfigForSimpleMatrixOperations(toInt(rows), toInt(cols)),
- imagePointer, biasPointer, outputPointer, toInt(rows), toInt(cols), toInt(PQ));
+ ExecutionConfig.getConfigForSimpleMatrixOperations(toInt(rows), toInt(cols)),
+ imagePointer, biasPointer, outputPointer, toInt(rows), toInt(cols), toInt(PQ));
if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_RELU_BACKWARD_KERNEL, System.nanoTime() - t1);
}
@@ -660,13 +660,13 @@ public class LibMatrixCUDA {
* @throws DMLRuntimeException
*/
private static void biasAdd(GPUContext gCtx, String instName, Pointer image, Pointer bias, Pointer output, int rows, int cols, int k) throws DMLRuntimeException {
- LOG.trace("GPU : biasAdd" + ", GPUContext=" + gCtx);
- int PQ = cols / k;
+ LOG.trace("GPU : biasAdd" + ", GPUContext=" + gCtx);
+ int PQ = cols / k;
long t1 = 0;
if (GPUStatistics.DISPLAY_STATISTICS) t1 = System.nanoTime();
getCudaKernels(gCtx).launchKernel("bias_add",
- ExecutionConfig.getConfigForSimpleMatrixOperations(rows, cols),
- image, bias, output, rows, cols, PQ);
+ ExecutionConfig.getConfigForSimpleMatrixOperations(rows, cols),
+ image, bias, output, rows, cols, PQ);
if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_RELU_BACKWARD_KERNEL, System.nanoTime() - t1);
}
@@ -701,8 +701,8 @@ public class LibMatrixCUDA {
public static void batchNormalizationForwardInference(GPUContext gCtx, String instName, MatrixObject image,
MatrixObject scale, MatrixObject bias, MatrixObject runningMean, MatrixObject runningVar,
MatrixObject ret, double epsilon) throws DMLRuntimeException {
- LOG.trace("GPU : batchNormalizationForwardInference" + ", GPUContext=" + gCtx);
- int mode = cudnnBatchNormMode.CUDNN_BATCHNORM_SPATIAL;
+ LOG.trace("GPU : batchNormalizationForwardInference" + ", GPUContext=" + gCtx);
+ int mode = cudnnBatchNormMode.CUDNN_BATCHNORM_SPATIAL;
int N = toInt(image.getNumRows());
int C = toInt(scale.getNumColumns());
@@ -724,8 +724,8 @@ public class LibMatrixCUDA {
checkStatus(cudnnBatchNormalizationForwardInference(getCudnnHandle(gCtx), mode, one(), zero(),
nCHWDescriptor, imagePtr, nCHWDescriptor, retPtr,
- scaleTensorDesc, scalePtr, biasPtr,
- runningMeanPtr, runningVarPtr, epsilon));
+ scaleTensorDesc, scalePtr, biasPtr,
+ runningMeanPtr, runningVarPtr, epsilon));
}
/**
@@ -747,8 +747,8 @@ public class LibMatrixCUDA {
public static void batchNormalizationForwardTraining(GPUContext gCtx, String instName, MatrixObject image,
MatrixObject scale, MatrixObject bias, MatrixObject runningMean, MatrixObject runningVar,
MatrixObject ret, MatrixObject retRunningMean, MatrixObject retRunningVar, double epsilon, double exponentialAverageFactor) throws DMLRuntimeException {
- LOG.trace("GPU : batchNormalizationForwardTraining" + ", GPUContext=" + gCtx);
- int mode = cudnnBatchNormMode.CUDNN_BATCHNORM_SPATIAL;
+ LOG.trace("GPU : batchNormalizationForwardTraining" + ", GPUContext=" + gCtx);
+ int mode = cudnnBatchNormMode.CUDNN_BATCHNORM_SPATIAL;
int N = toInt(image.getNumRows());
int C = toInt(scale.getNumColumns());
@@ -777,8 +777,8 @@ public class LibMatrixCUDA {
// ignoring resultSaveMean and resultSaveVariance as it requires state management
checkStatus(cudnnBatchNormalizationForwardTraining(getCudnnHandle(gCtx), mode, one(), zero(),
nCHWDescriptor, imagePtr, nCHWDescriptor, retPtr,
- scaleTensorDesc, scalePtr, biasPtr, exponentialAverageFactor,
- retRunningMeanPtr, retRunningVarPtr, epsilon, new Pointer(), new Pointer()));
+ scaleTensorDesc, scalePtr, biasPtr, exponentialAverageFactor,
+ retRunningMeanPtr, retRunningVarPtr, epsilon, new Pointer(), new Pointer()));
}
/**
@@ -852,8 +852,8 @@ public class LibMatrixCUDA {
public static void batchNormalizationBackward(GPUContext gCtx, String instName, MatrixObject image, MatrixObject dout,
MatrixObject scale, MatrixObject ret, MatrixObject retScale, MatrixObject retBias,
double epsilon) throws DMLRuntimeException {
- LOG.trace("GPU : batchNormalizationBackward" + ", GPUContext=" + gCtx);
- int mode = cudnnBatchNormMode.CUDNN_BATCHNORM_SPATIAL;
+ LOG.trace("GPU : batchNormalizationBackward" + ", GPUContext=" + gCtx);
+ int mode = cudnnBatchNormMode.CUDNN_BATCHNORM_SPATIAL;
int N = toInt(image.getNumRows());
int C = toInt(scale.getNumColumns());
@@ -902,11 +902,11 @@ public class LibMatrixCUDA {
* @throws DMLRuntimeException if DMLRuntimeException occurs
*/
public static void conv2dBackwardFilter(GPUContext gCtx, String instName, MatrixObject image, MatrixObject dout,
- MatrixObject outputBlock, int N, int C, int H, int W, int K, int R,
- int S, int pad_h, int pad_w, int stride_h, int stride_w, int P,
- int Q) throws DMLRuntimeException {
- LOG.trace("GPU : conv2dBackwardFilter" + ", GPUContext=" + gCtx);
- cudnnFilterDescriptor dwDesc = null;
+ MatrixObject outputBlock, int N, int C, int H, int W, int K, int R,
+ int S, int pad_h, int pad_w, int stride_h, int stride_w, int P,
+ int Q) throws DMLRuntimeException {
+ LOG.trace("GPU : conv2dBackwardFilter" + ", GPUContext=" + gCtx);
+ cudnnFilterDescriptor dwDesc = null;
cudnnConvolutionDescriptor convDesc = null;
Pointer workSpace = null;
@@ -934,13 +934,13 @@ public class LibMatrixCUDA {
workSpace = new Pointer();
cudnnGetConvolutionBackwardFilterWorkspaceSize(getCudnnHandle(gCtx),
- xTensorDesc, doutTensorDesc, convDesc, dwDesc, algo, sizeInBytesArray);
+ xTensorDesc, doutTensorDesc, convDesc, dwDesc, algo, sizeInBytesArray);
if (GPUStatistics.DISPLAY_STATISTICS)
GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_CUDNN_INIT, System.nanoTime() - t1);
if (GPUStatistics.DISPLAY_STATISTICS) t2 = System.nanoTime();
int status = cudnnConvolutionBackwardFilter(getCudnnHandle(gCtx), one(), xTensorDesc, imagePointer,
- doutTensorDesc, doutPointer, convDesc, algo, workSpace, sizeInBytes, zero(), dwDesc, dwPointer);
+ doutTensorDesc, doutPointer, convDesc, algo, workSpace, sizeInBytes, zero(), dwDesc, dwPointer);
if (GPUStatistics.DISPLAY_STATISTICS)
GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_CONVOLUTION_BACKWARD_FILTER_LIB, System.nanoTime() - t2);
@@ -948,7 +948,7 @@ public class LibMatrixCUDA {
throw new DMLRuntimeException("Could not executed cudnnConvolutionBackwardFilter: " + jcuda.jcudnn.cudnnStatus.stringFor(status));
}
} catch (CudaException e) {
- throw new DMLRuntimeException("Error in conv2d in GPUContext " + gCtx.toString() + " from Thread " + Thread.currentThread().toString(), e);
+ throw new DMLRuntimeException("Error in conv2d in GPUContext " + gCtx.toString() + " from Thread " + Thread.currentThread().toString(), e);
} finally {
long t3=0;
if (GPUStatistics.DISPLAY_STATISTICS) t3 = System.nanoTime();
@@ -989,11 +989,11 @@ public class LibMatrixCUDA {
* @throws DMLRuntimeException if DMLRuntimeException occurs
*/
public static void conv2dBackwardData(GPUContext gCtx, String instName, MatrixObject filter, MatrixObject dout,
- MatrixObject output, int N, int C, int H, int W, int K, int R,
- int S, int pad_h, int pad_w, int stride_h, int stride_w, int P,
- int Q) throws DMLRuntimeException {
- LOG.trace("GPU : conv2dBackwardData" + ", GPUContext=" + gCtx);
- cudnnFilterDescriptor wDesc = null;
+ MatrixObject output, int N, int C, int H, int W, int K, int R,
+ int S, int pad_h, int pad_w, int stride_h, int stride_w, int P,
+ int Q) throws DMLRuntimeException {
+ LOG.trace("GPU : conv2dBackwardData" + ", GPUContext=" + gCtx);
+ cudnnFilterDescriptor wDesc = null;
cudnnConvolutionDescriptor convDesc = null;
Pointer workSpace = null;
@@ -1020,12 +1020,12 @@ public class LibMatrixCUDA {
int algo = jcuda.jcudnn.cudnnConvolutionBwdDataAlgo.CUDNN_CONVOLUTION_BWD_DATA_ALGO_0;
workSpace = new Pointer();
cudnnGetConvolutionBackwardDataWorkspaceSize(getCudnnHandle(gCtx),
- wDesc, dyDesc, convDesc, dxDesc, algo, sizeInBytesArray);
+ wDesc, dyDesc, convDesc, dxDesc, algo, sizeInBytesArray);
if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_CUDNN_INIT, System.nanoTime() - t1);
if (GPUStatistics.DISPLAY_STATISTICS) t2 = System.nanoTime();
int status = cudnnConvolutionBackwardData(getCudnnHandle(gCtx), one(), wDesc, w,
- dyDesc, dy, convDesc, algo, workSpace, sizeInBytes, zero(), dxDesc, dx);
+ dyDesc, dy, convDesc, algo, workSpace, sizeInBytes, zero(), dxDesc, dx);
if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_CONVOLUTION_BACKWARD_DATA_LIB, System.nanoTime() - t2);
if(status != jcuda.jcudnn.cudnnStatus.CUDNN_STATUS_SUCCESS) {
@@ -1074,7 +1074,7 @@ public class LibMatrixCUDA {
MatrixObject outputBlock, int N, int C, int H, int W, int K, int R,
int S, int pad_h, int pad_w, int stride_h, int stride_w, int P,
int Q) throws DMLRuntimeException {
- Pointer x = getDensePointer(gCtx, image, true, instName);
+ Pointer x = getDensePointer(gCtx, image, true, instName);
cudnnTensorDescriptor xDesc = allocateTensorDescriptor(gCtx, image, N, C, H, W);
performMaxpooling(gCtx, instName, x, xDesc, outputBlock, N, C, H, W, K, R, S, pad_h, pad_w, stride_h, stride_w, P, Q);
}
@@ -1083,8 +1083,8 @@ public class LibMatrixCUDA {
MatrixObject outputBlock, int N, int C, int H, int W, int K, int R,
int S, int pad_h, int pad_w, int stride_h, int stride_w, int P,
int Q) throws DMLRuntimeException {
- LOG.trace("GPU : performMaxpooling" + ", GPUContext=" + gCtx);
- Pointer y = getDensePointer(gCtx, outputBlock, true, instName);
+ LOG.trace("GPU : performMaxpooling" + ", GPUContext=" + gCtx);
+ Pointer y = getDensePointer(gCtx, outputBlock, true, instName);
cudnnPoolingDescriptor poolingDesc = null;
try {
@@ -1138,11 +1138,11 @@ public class LibMatrixCUDA {
* @throws DMLRuntimeException if DMLRuntimeException occurs
*/
public static void maxpoolingBackward(GPUContext gCtx, String instName, MatrixObject image, MatrixObject dout,
- MatrixObject outputBlock, int N, int C, int H, int W, int K, int R,
- int S, int pad_h, int pad_w, int stride_h, int stride_w, int P,
- int Q) throws DMLRuntimeException {
- LOG.trace("GPU : maxpoolingBackward" + ", GPUContext=" + gCtx);
- Pointer y = null;
+ MatrixObject outputBlock, int N, int C, int H, int W, int K, int R,
+ int S, int pad_h, int pad_w, int stride_h, int stride_w, int P,
+ int Q) throws DMLRuntimeException {
+ LOG.trace("GPU : maxpoolingBackward" + ", GPUContext=" + gCtx);
+ Pointer y = null;
cudnnPoolingDescriptor poolingDesc = null;
try {
@@ -1202,8 +1202,8 @@ public class LibMatrixCUDA {
private static void performCuDNNReLU(GPUContext gCtx, String instName, MatrixObject in, Pointer dstData, cudnnTensorDescriptor srcTensorDesc) throws DMLRuntimeException {
long t0=0;
try {
- LOG.trace("GPU : performCuDNNReLU" + ", GPUContext=" + gCtx);
- cudnnTensorDescriptor dstTensorDesc = srcTensorDesc;
+ LOG.trace("GPU : performCuDNNReLU" + ", GPUContext=" + gCtx);
+ cudnnTensorDescriptor dstTensorDesc = srcTensorDesc;
Pointer srcData = getDensePointer(gCtx, in, true, instName);
cudnnActivationDescriptor activationDescriptor = new cudnnActivationDescriptor();
@@ -1212,8 +1212,8 @@ public class LibMatrixCUDA {
cudnnSetActivationDescriptor(activationDescriptor, CUDNN_ACTIVATION_RELU, CUDNN_PROPAGATE_NAN, dummy);
if (GPUStatistics.DISPLAY_STATISTICS) t0 = System.nanoTime();
cudnnActivationForward(getCudnnHandle(gCtx), activationDescriptor,
- one(), srcTensorDesc, srcData,
- zero(), dstTensorDesc, dstData);
+ one(), srcTensorDesc, srcData,
+ zero(), dstTensorDesc, dstData);
if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_ACTIVATION_FORWARD_LIB, System.nanoTime() - t0);
} catch (CudaException e) {
throw new DMLRuntimeException("Error in conv2d in GPUContext " + gCtx.toString() + " from Thread " + Thread.currentThread().toString(), e);
@@ -1245,14 +1245,14 @@ public class LibMatrixCUDA {
long t0=0;
cudnnTensorDescriptor srcTensorDesc = in.getGPUObject(gCtx).getTensorDescriptor();
if(N*CHW >= numDoublesIn2GB || srcTensorDesc == null) {
- LOG.trace("GPU : relu custom kernel" + ", GPUContext=" + gCtx);
- // Invokes relu(double* A, double* ret, int rlen, int clen)
+ LOG.trace("GPU : relu custom kernel" + ", GPUContext=" + gCtx);
+ // Invokes relu(double* A, double* ret, int rlen, int clen)
if (GPUStatistics.DISPLAY_STATISTICS) t0 = System.nanoTime();
Pointer dstData = getDensePointer(gCtx, output, instName);
Pointer srcData = getDensePointer(gCtx, in, instName); // TODO: FIXME: Add sparse kernel support for relu
getCudaKernels(gCtx).launchKernel("relu",
- ExecutionConfig.getConfigForSimpleMatrixOperations(toInt(N), toInt(CHW)),
- srcData, dstData, toInt(N), toInt(CHW));
+ ExecutionConfig.getConfigForSimpleMatrixOperations(toInt(N), toInt(CHW)),
+ srcData, dstData, toInt(N), toInt(CHW));
if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_RELU_KERNEL, System.nanoTime() - t0);
}
else {
@@ -1287,7 +1287,7 @@ public class LibMatrixCUDA {
* @throws DMLRuntimeException if DMLRuntimeException occurs
*/
public static void matmultTSMM(ExecutionContext ec, GPUContext gCtx, String instName, MatrixObject left, String outputName,
- boolean isLeftTransposed) throws DMLRuntimeException {
+ boolean isLeftTransposed) throws DMLRuntimeException {
LOG.trace("GPU : matmultTSMM" + ", GPUContext=" + gCtx);
if (ec.getGPUContext(0) != gCtx)
throw new DMLRuntimeException("GPU : Invalid internal state, the GPUContext set with the ExecutionContext is not the same used to run this LibMatrixCUDA function");
@@ -1303,7 +1303,7 @@ public class LibMatrixCUDA {
// Note: the dimensions are swapped
int m = toInt(isLeftTransposed ? left.getNumColumns() : left.getNumRows());
int k = toInt(isLeftTransposed ? left.getNumRows() : left.getNumColumns());
-
+
// For dense TSMM, exploit cublasDsyrk(...) and call custom kernel to flip the matrix
MatrixObject output = getDenseMatrixOutputForGPUInstruction(ec, instName, outputName, m, m); // Allocated the dense output matrix
@@ -1343,8 +1343,8 @@ public class LibMatrixCUDA {
* @throws DMLRuntimeException if DMLRuntimeException occurs
*/
private static void copyUpperToLowerTriangle(GPUContext gCtx, String instName, MatrixObject ret) throws DMLRuntimeException {
- LOG.trace("GPU : copyUpperToLowerTriangle" + ", GPUContext=" + gCtx);
- if(isInSparseFormat(gCtx, ret)) {
+ LOG.trace("GPU : copyUpperToLowerTriangle" + ", GPUContext=" + gCtx);
+ if(isInSparseFormat(gCtx, ret)) {
throw new DMLRuntimeException("Sparse GPU copyUpperToLowerTriangle is not implemented");
}
if(ret.getNumRows() != ret.getNumColumns()) {
@@ -1352,8 +1352,8 @@ public class LibMatrixCUDA {
}
int dim = toInt(ret.getNumRows());
getCudaKernels(gCtx).launchKernel("copy_u2l_dense",
- ExecutionConfig.getConfigForSimpleMatrixOperations(dim, dim),
- getDensePointer(gCtx, ret, instName), dim, dim*dim);
+ ExecutionConfig.getConfigForSimpleMatrixOperations(dim, dim),
+ getDensePointer(gCtx, ret, instName), dim, dim*dim);
}
@@ -1389,7 +1389,7 @@ public class LibMatrixCUDA {
* @return output of matrix multiply
*/
public static MatrixObject matmult(ExecutionContext ec, GPUContext gCtx, String instName, MatrixObject left, MatrixObject right, String outputName,
- boolean isLeftTransposed, boolean isRightTransposed) throws DMLRuntimeException {
+ boolean isLeftTransposed, boolean isRightTransposed) throws DMLRuntimeException {
if (ec.getGPUContext(0) != gCtx)
throw new DMLRuntimeException("GPU : Invalid internal state, the GPUContext set with the ExecutionContext is not the same used to run this LibMatrixCUDA function");
LOG.trace("GPU : matmult" + ", GPUContext=" + gCtx);
@@ -1400,7 +1400,7 @@ public class LibMatrixCUDA {
boolean bothSparse = left.getGPUObject(gCtx).isSparse() && right.getGPUObject(gCtx).isSparse();
MatrixObject output = ec.getMatrixObject(outputName);
-
+
long outRLen = isLeftTransposed ? left.getNumColumns() : left.getNumRows();
long outCLen = isRightTransposed ? right.getNumRows() : right.getNumColumns();
@@ -1436,7 +1436,7 @@ public class LibMatrixCUDA {
* @throws DMLRuntimeException if DMLRuntimeException occurs
*/
private static void eitherSparseMatmult(GPUContext gCtx, String instName, MatrixObject output, MatrixObject left, MatrixObject right,
- boolean isLeftTransposed, boolean isRightTransposed) throws DMLRuntimeException {
+ boolean isLeftTransposed, boolean isRightTransposed) throws DMLRuntimeException {
int m = toInt(isLeftTransposed ? left.getNumColumns() : left.getNumRows()) ;
int n = toInt(isRightTransposed ? right.getNumRows() : right.getNumColumns());
@@ -1476,15 +1476,15 @@ public class LibMatrixCUDA {
* @throws DMLRuntimeException if DMLRuntimeException occurs
*/
private static void denseSparseMatmult(GPUContext gCtx, String instName, MatrixObject left, MatrixObject right, MatrixObject output,
- boolean isLeftTransposed, boolean isRightTransposed, int m, int n, int k)
+ boolean isLeftTransposed, boolean isRightTransposed, int m, int n, int k)
throws DMLRuntimeException {
// right sparse, left dense
CSRPointer B = right.getGPUObject(gCtx).getJcudaSparseMatrixPtr();
Pointer ADense = getDensePointer(gCtx, left, instName);
if (B.isUltraSparse(k, n)){
- LOG.trace(" GPU : Convert d M %*% sp M --> sp M %*% sp M)" + ", GPUContext=" + gCtx);
+ LOG.trace(" GPU : Convert d M %*% sp M --> sp M %*% sp M)" + ", GPUContext=" + gCtx);
- // Convert left to CSR and do cuSparse matmul
+ // Convert left to CSR and do cuSparse matmul
int rowsA = (int)left.getNumRows();
int colsA = (int)left.getNumColumns();
@@ -1497,8 +1497,8 @@ public class LibMatrixCUDA {
CSRPointer A = GPUObject.columnMajorDenseToRowMajorSparse(gCtx, getCusparseHandle(gCtx), AT, rowsA, colsA);
if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_DENSE_TO_SPARSE, System.nanoTime() - t1);
- if (DMLScript.STATISTICS) GPUStatistics.cudaDenseToSparseTime.getAndAdd(System.nanoTime() - t0);
- if (DMLScript.STATISTICS) GPUStatistics.cudaDenseToSparseCount.getAndAdd(1);
+ if (DMLScript.STATISTICS) GPUStatistics.cudaDenseToSparseTime.add(System.nanoTime() - t0);
+ if (DMLScript.STATISTICS) GPUStatistics.cudaDenseToSparseCount.add(1);
sparseSparseMatmult(gCtx, instName, A, B, output, isLeftTransposed, isRightTransposed, m, n, k);
if (GPUStatistics.DISPLAY_STATISTICS) t2 = System.nanoTime();
@@ -1507,7 +1507,7 @@ public class LibMatrixCUDA {
if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_CUDA_FREE, System.nanoTime() - t2, 2);
} else {
- LOG.trace(" GPU : Convert d M %*% sp M --> d M %*% d M" + ", GPUContext=" + gCtx);
+ LOG.trace(" GPU : Convert d M %*% sp M --> d M %*% d M" + ", GPUContext=" + gCtx);
// Convert right to dense and do a cuBlas matmul
// BDenseTransposed is a column major matrix
// Note the arguments to denseDenseMatmult to accommodate for this.
@@ -1515,8 +1515,8 @@ public class LibMatrixCUDA {
if (DMLScript.STATISTICS) t0 = System.nanoTime();
Pointer BDenseTransposed = B.toColumnMajorDenseMatrix(getCusparseHandle(gCtx), getCublasHandle(gCtx), (int)right.getNumRows(), (int)right.getNumColumns());
if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_SPARSE_TO_DENSE, System.nanoTime() - t0);
- if (DMLScript.STATISTICS) GPUStatistics.cudaSparseToDenseTime.getAndAdd(System.nanoTime() - t0);
- if (DMLScript.STATISTICS) GPUStatistics.cudaSparseToDenseCount.getAndAdd(System.nanoTime() - t0);
+ if (DMLScript.STATISTICS) GPUStatistics.cudaSparseToDenseTime.add(System.nanoTime() - t0);
+ if (DMLScript.STATISTICS) GPUStatistics.cudaSparseToDenseCount.add(System.nanoTime() - t0);
if (GPUStatistics.DISPLAY_STATISTICS) t1 = System.nanoTime();
boolean allocated = output.getGPUObject(gCtx).acquireDeviceModifyDense(); // To allocate the dense matrix
@@ -1550,7 +1550,7 @@ public class LibMatrixCUDA {
* @throws DMLRuntimeException if DMLRuntimeException occurs
*/
private static void sparseDenseMatmult(GPUContext gCtx, String instName, MatrixObject output, MatrixObject left, MatrixObject right,
- boolean isLeftTransposed, boolean isRightTransposed, int m, int n, int k)
+ boolean isLeftTransposed, boolean isRightTransposed, int m, int n, int k)
throws DMLRuntimeException {
CSRPointer A = left.getGPUObject(gCtx).getJcudaSparseMatrixPtr();
Pointer BDense = getDensePointer(gCtx, right, instName);
@@ -1577,8 +1577,8 @@ public class LibMatrixCUDA {
CSRPointer B = GPUObject.columnMajorDenseToRowMajorSparse(gCtx, getCusparseHandle(gCtx), BT, rowsB, colsB);
if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_DENSE_TO_SPARSE, System.nanoTime() - t1);
- if (DMLScript.STATISTICS) GPUStatistics.cudaDenseToSparseTime.getAndAdd(System.nanoTime() - t0);
- if (DMLScript.STATISTICS) GPUStatistics.cudaDenseToSparseCount.getAndAdd(1);
+ if (DMLScript.STATISTICS) GPUStatistics.cudaDenseToSparseTime.add(System.nanoTime() - t0);
+ if (DMLScript.STATISTICS) GPUStatistics.cudaDenseToSparseCount.add(1);
sparseSparseMatmult(gCtx, instName, A, B, output, isLeftTransposed, isRightTransposed, m, n, k);
@@ -1588,15 +1588,15 @@ public class LibMatrixCUDA {
if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_CUDA_FREE, System.nanoTime() - t2, 2);
} else {
- LOG.trace(" GPU : Convert sp M %*% d M --> d M %*% d M" + ", GPUContext=" + gCtx);
+ LOG.trace(" GPU : Convert sp M %*% d M --> d M %*% d M" + ", GPUContext=" + gCtx);
// Convert left to dense and do a cuBlas matmul
// ADenseTransposed is a column major matrix
// Note the arguments to denseDenseMatmult to accommodate for this.
if (DMLScript.STATISTICS) t0 = System.nanoTime();
Pointer ADenseTransposed = A.toColumnMajorDenseMatrix(getCusparseHandle(gCtx), getCublasHandle(gCtx), (int)left.getNumRows(), (int)left.getNumColumns());
if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_SPARSE_TO_DENSE, System.nanoTime() - t0);
- if (DMLScript.STATISTICS) GPUStatistics.cudaSparseToDenseTime.getAndAdd(System.nanoTime() - t0);
- if (DMLScript.STATISTICS) GPUStatistics.cudaSparseToDenseCount.getAndAdd(System.nanoTime() - t0);
+ if (DMLScript.STATISTICS) GPUStatistics.cudaSparseToDenseTime.add(System.nanoTime() - t0);
+ if (DMLScript.STATISTICS) GPUStatistics.cudaSparseToDenseCount.add(System.nanoTime() - t0);
if (GPUStatistics.DISPLAY_STATISTICS) t1 = System.nanoTime();
boolean allocated = output.getGPUObject(gCtx).acquireDeviceModifyDense(); // To allocate the dense matrix
@@ -1629,13 +1629,13 @@ public class LibMatrixCUDA {
* @throws DMLRuntimeException if DMLRuntimeException occurs
*/
private static void sparseMatrixDenseVectorMult(GPUContext gCtx, String instName, MatrixObject output, CSRPointer A, Pointer B_dense, boolean isATranposed,
- int m, int k) throws DMLRuntimeException {
- LOG.trace("GPU : sp M %*% dense V" + ", GPUContext=" + gCtx);
- int transA = CUSPARSE_OPERATION_NON_TRANSPOSE;
+ int m, int k) throws DMLRuntimeException {
+ LOG.trace("GPU : sp M %*% dense V" + ", GPUContext=" + gCtx);
+ int transA = CUSPARSE_OPERATION_NON_TRANSPOSE;
long size = m * Sizeof.DOUBLE;
if (isATranposed){
size = k * Sizeof.DOUBLE;
- transA = CUSPARSE_OPERATION_TRANSPOSE;
+ transA = CUSPARSE_OPERATION_TRANSPOSE;
}
Pointer C_dense = gCtx.allocate(instName, (int)size);
long t1=0;
@@ -1662,8 +1662,8 @@ public class LibMatrixCUDA {
* @throws DMLRuntimeException if DMLRuntimeException occurs
*/
private static void bothSparseMatmult(GPUContext gCtx, String instName, MatrixObject output, MatrixObject left, MatrixObject right,
- boolean isLeftTransposed, boolean isRightTransposed) throws DMLRuntimeException {
- int m = toInt(isLeftTransposed ? left.getNumColumns() : left.getNumRows()) ;
+ boolean isLeftTransposed, boolean isRightTransposed) throws DMLRuntimeException {
+ int m = toInt(isLeftTransposed ? left.getNumColumns() : left.getNumRows()) ;
int n = toInt(isRightTransposed ? right.getNumRows() : right.getNumColumns());
int k = toInt(isLeftTransposed ? left.getNumRows() : left.getNumColumns());
int k1 = toInt(isRightTransposed ? right.getNumColumns() : right.getNumRows());
@@ -1701,7 +1701,7 @@ public class LibMatrixCUDA {
* @throws DMLRuntimeException if DMLRuntimeException occurs
*/
private static void sparseMatrixVectorMult(GPUContext gCtx, String instName, MatrixObject output, boolean isATranposed, int m, int n, int k,
- CSRPointer A, CSRPointer B) throws DMLRuntimeException {
+ CSRPointer A, CSRPointer B) throws DMLRuntimeException {
long t0=0;
if (GPUStatistics.DISPLAY_STATISTICS) t0 = System.nanoTime();
Pointer BDenseVector = B.toColumnMajorDenseMatrix(getCusparseHandle(gCtx), getCublasHandle(gCtx), k, 1);
@@ -1726,11 +1726,11 @@ public class LibMatrixCUDA {
* @throws DMLRuntimeException if DMLRuntimeException occurs
*/
private static void sparseSparseMatmult(GPUContext gCtx, String instName, CSRPointer A, CSRPointer B, MatrixObject output,
- boolean isLeftTransposed, boolean isRightTransposed, int m, int n, int k) throws DMLRuntimeException {
- LOG.trace("GPU : sp M %*% sp M" + ", GPUContext=" + gCtx);
+ boolean isLeftTransposed, boolean isRightTransposed, int m, int n, int k) throws DMLRuntimeException {
+ LOG.trace("GPU : sp M %*% sp M" + ", GPUContext=" + gCtx);
- int transA = isLeftTransposed ? CUSPARSE_OPERATION_TRANSPOSE : CUSPARSE_OPERATION_NON_TRANSPOSE;
- int transB = isRightTransposed ? CUSPARSE_OPERATION_TRANSPOSE : CUSPARSE_OPERATION_NON_TRANSPOSE;
+ int transA = isLeftTransposed ? CUSPARSE_OPERATION_TRANSPOSE : CUSPARSE_OPERATION_NON_TRANSPOSE;
+ int transB = isRightTransposed ? CUSPARSE_OPERATION_TRANSPOSE : CUSPARSE_OPERATION_NON_TRANSPOSE;
long t0=0, t1=0;
if (GPUStatistics.DISPLAY_STATISTICS) t0 = System.nanoTime();
@@ -1741,9 +1741,9 @@ public class LibMatrixCUDA {
if (GPUStatistics.DISPLAY_STATISTICS) t1 = System.nanoTime();
cusparseDcsrgemm(getCusparseHandle(gCtx), transA, transB, m, n, k,
- A.descr, (int)A.nnz, A.val, A.rowPtr, A.colInd,
- B.descr, (int)B.nnz, B.val, B.rowPtr, B.colInd,
- C.descr, C.val, C.rowPtr, C.colInd);
+ A.descr, (int)A.nnz, A.val, A.rowPtr, A.colInd,
+ B.descr, (int)B.nnz, B.val, B.rowPtr, B.colInd,
+ C.descr, C.val, C.rowPtr, C.colInd);
//cudaDeviceSynchronize;
if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_SPARSE_MATRIX_SPARSE_MATRIX_LIB, System.nanoTime() - t1);
}
@@ -1762,7 +1762,7 @@ public class LibMatrixCUDA {
* @throws DMLRuntimeException if DMLRuntimeException occurs
*/
private static void denseDenseMatmult(GPUContext gCtx, String instName, MatrixObject output, MatrixObject left, MatrixObject right,
- boolean isLeftTransposed, boolean isRightTransposed) throws DMLRuntimeException {
+ boolean isLeftTransposed, boolean isRightTransposed) throws DMLRuntimeException {
Pointer leftPtr = getDensePointer(gCtx, left, instName);
Pointer rightPtr = getDensePointer(gCtx, right, instName);
@@ -1773,7 +1773,7 @@ public class LibMatrixCUDA {
int rightCols = toInt(right.getNumColumns());
Pointer C = getDensePointer(gCtx, output, instName);
denseDenseMatmult(gCtx, instName, C, leftRows, leftCols, rightRows, rightCols, isLeftTransposed, isRightTransposed,
- leftPtr, rightPtr);
+ leftPtr, rightPtr);
}
/**
@@ -1799,9 +1799,9 @@ public class LibMatrixCUDA {
* @throws DMLRuntimeException if DMLRuntimeException occurs
*/
public static void denseDenseMatmult(GPUContext gCtx, String instName, Pointer output, int leftRows1, int leftCols1, int rightRows1,
- int rightCols1, boolean isLeftTransposed1, boolean isRightTransposed1, Pointer leftPtr, Pointer rightPtr)
+ int rightCols1, boolean isLeftTransposed1, boolean isRightTransposed1, Pointer leftPtr, Pointer rightPtr)
throws DMLRuntimeException {
- LOG.trace("GPU : d M %*% d M" + ", GPUContext=" + gCtx);
+ LOG.trace("GPU : d M %*% d M" + ", GPUContext=" + gCtx);
Pointer A = rightPtr;
Pointer B = leftPtr;
@@ -1892,7 +1892,7 @@ public class LibMatrixCUDA {
* @throws DMLRuntimeException if {@link DMLRuntimeException} occurs
*/
public static void unaryAggregate(ExecutionContext ec, GPUContext gCtx, String instName, MatrixObject in1, String output, AggregateUnaryOperator op)
- throws DMLRuntimeException {
+ throws DMLRuntimeException {
if (ec.getGPUContext(0) != gCtx)
throw new DMLRuntimeException("GPU : Invalid internal state, the GPUContext set with the ExecutionContext is not the same used to run this LibMatrixCUDA function");
LOG.trace("GPU : unaryAggregate" + ", GPUContext=" + gCtx);
@@ -1955,12 +1955,12 @@ public class LibMatrixCUDA {
} else if (aggOp.increOp.fn instanceof Builtin) {
Builtin b = (Builtin)aggOp.increOp.fn;
switch(b.bFunc) {
- case MAX: opIndex = OP_MAX; break;
- case MIN: opIndex = OP_MIN; break;
- case MAXINDEX: opIndex = OP_MAXINDEX; break;
- case MININDEX: opIndex = OP_MININDEX;break;
- default:
- new DMLRuntimeException("Internal Error - Unsupported Builtin Function for Aggregate unary being done on GPU");
+ case MAX: opIndex = OP_MAX; break;
+ case MIN: opIndex = OP_MIN; break;
+ case MAXINDEX: opIndex = OP_MAXINDEX; break;
+ case MININDEX: opIndex = OP_MININDEX;break;
+ default:
+ new DMLRuntimeException("Internal Error - Unsupported Builtin Function for Aggregate unary being done on GPU");
}
} else {
throw new DMLRuntimeException("Internal Error - Aggregate operator has invalid Value function");
@@ -1980,7 +1980,7 @@ public class LibMatrixCUDA {
// throw new DMLRuntimeException("Internal Error - Not implemented");
}
-
+
long outRLen = -1;
long outCLen = -1;
if (indexFn instanceof ReduceRow) { // COL{SUM, MAX...}
@@ -2004,210 +2004,210 @@ public class LibMatrixCUDA {
// For scalars, set the scalar output in the Execution Context object
switch (opIndex){
- case OP_PLUS: {
- switch(reductionDirection) {
- case REDUCTION_ALL : {
- double result = reduceAll(gCtx, instName, "reduce_sum", in, size);
- ec.setScalarOutput(output, new DoubleObject(result));
- break;
- }
- case REDUCTION_COL : { // The names are a bit misleading, REDUCTION_COL refers to the direction (reduce all elements in a column)
- reduceRow(gCtx, instName, "reduce_row_sum", in, out, rlen, clen);
- break;
- }
- case REDUCTION_ROW : {
- reduceCol(gCtx, instName, "reduce_col_sum", in, out, rlen, clen);
- break;
- }
- case REDUCTION_DIAG :
- throw new DMLRuntimeException("Internal Error - Row, Column and Diag summation not implemented yet");
- }
+ case OP_PLUS: {
+ switch(reductionDirection) {
+ case REDUCTION_ALL : {
+ double result = reduceAll(gCtx, instName, "reduce_sum", in, size);
+ ec.setScalarOutput(output, new DoubleObject(result));
break;
}
- case OP_PLUS_SQ : {
- // Calculate the squares in a temporary object tmp
- Pointer tmp = gCtx.allocate(instName, size * Sizeof.DOUBLE);
-
- squareMatrix(gCtx, instName, in, tmp, rlen, clen);
- // Then do the sum on the temporary object and free it
- switch(reductionDirection) {
- case REDUCTION_ALL : {
- double result = reduceAll(gCtx, instName, "reduce_sum", tmp, size);
- ec.setScalarOutput(output, new DoubleObject(result));
- break;
- }
- case REDUCTION_COL : { // The names are a bit misleading, REDUCTION_COL refers to the direction (reduce all elements in a column)
- reduceRow(gCtx, instName, "reduce_row_sum", tmp, out, rlen, clen);
- break;
- }
- case REDUCTION_ROW : {
- reduceCol(gCtx, instName, "reduce_col_sum", tmp, out, rlen, clen);
- break;
- }
- default:
- throw new DMLRuntimeException("Internal Error - Unsupported reduction direction for summation squared");
- }
- gCtx.cudaFreeHelper(instName, tmp);
+ case REDUCTION_COL : { // The names are a bit misleading, REDUCTION_COL refers to the direction (reduce all elements in a column)
+ reduceRow(gCtx, instName, "reduce_row_sum", in, out, rlen, clen);
break;
}
- case OP_MEAN:{
- switch(reductionDirection) {
- case REDUCTION_ALL: {
- double result = reduceAll(gCtx, instName, "reduce_sum", in, size);
- double mean = result / size;
- ec.setScalarOutput(output, new DoubleObject(mean));
- break;
- }
- case REDUCTION_COL: {
- reduceRow(gCtx, instName, "reduce_row_mean", in, out, rlen, clen);
- break;
- }
- case REDUCTION_ROW: {
- reduceCol(gCtx, instName, "reduce_col_mean", in, out, rlen, clen);
- break;
- }
- default:
- throw new DMLRuntimeException("Internal Error - Unsupported reduction direction for mean");
- }
+ case REDUCTION_ROW : {
+ reduceCol(gCtx, instName, "reduce_col_sum", in, out, rlen, clen);
break;
}
- case OP_MULTIPLY : {
- switch (reductionDirection) {
- case REDUCTION_ALL: {
- double result = reduceAll(gCtx, instName, "reduce_prod", in, size);
- ec.setScalarOutput(output, new DoubleObject(result));
- break;
- }
- default:
- throw new DMLRuntimeException("Internal Error - Unsupported reduction direction for multiplication");
- }
+ case REDUCTION_DIAG :
+ throw new DMLRuntimeException("Internal Error - Row, Column and Diag summation not implemented yet");
+ }
+ break;
+ }
+ case OP_PLUS_SQ : {
+ // Calculate the squares in a temporary object tmp
+ Pointer tmp = gCtx.allocate(instName, size * Sizeof.DOUBLE);
+
+ squareMatrix(gCtx, instName, in, tmp, rlen, clen);
+ // Then do the sum on the temporary object and free it
+ switch(reductionDirection) {
+ case REDUCTION_ALL : {
+ double result = reduceAll(gCtx, instName, "reduce_sum", tmp, size);
+ ec.setScalarOutput(output, new DoubleObject(result));
break;
}
- case OP_MAX :{
- switch(reductionDirection) {
- case REDUCTION_ALL: {
- double result = reduceAll(gCtx, instName, "reduce_max", in, size);
- ec.setScalarOutput(output, new DoubleObject(result));
- break;
- }
- case REDUCTION_COL: {
- reduceRow(gCtx, instName, "reduce_row_max", in, out, rlen, clen);
- break;
- }
- case REDUCTION_ROW: {
- reduceCol(gCtx, instName, "reduce_col_max", in, out, rlen, clen);
- break;
- }
- default:
- throw new DMLRuntimeException("Internal Error - Unsupported reduction direction for max");
- }
+ case REDUCTION_COL : { // The names are a bit misleading, REDUCTION_COL refers to the direction (reduce all elements in a column)
+ reduceRow(gCtx, instName, "reduce_row_sum", tmp, out, rlen, clen);
break;
}
- case OP_MIN :{
- switch(reductionDirection) {
- case REDUCTION_ALL: {
- double result = reduceAll(gCtx, instName, "reduce_min", in, size);
- ec.setScalarOutput(output, new DoubleObject(result));
- break;
- }
- case REDUCTION_COL: {
- reduceRow(gCtx, instName, "reduce_row_min", in, out, rlen, clen);
- break;
- }
- case REDUCTION_ROW: {
- reduceCol(gCtx, instName, "reduce_col_min", in, out, rlen, clen);
- break;
- }
- default:
- throw new DMLRuntimeException("Internal Error - Unsupported reduction direction for min");
- }
+ case REDUCTION_ROW : {
+ reduceCol(gCtx, instName, "reduce_col_sum", tmp, out, rlen, clen);
+ break;
+ }
+ default:
+ throw new DMLRuntimeException("Internal Error - Unsupported reduction direction for summation squared");
+ }
+ gCtx.cudaFreeHelper(instName, tmp);
+ break;
+ }
+ case OP_MEAN:{
+ switch(reductionDirection) {
+ case REDUCTION_ALL: {
+ double result = reduceAll(gCtx, instName, "reduce_sum", in, size);
+ double mean = result / size;
+ ec.setScalarOutput(output, new DoubleObject(mean));
+ break;
+ }
+ case REDUCTION_COL: {
+ reduceRow(gCtx, instName, "reduce_row_mean", in, out, rlen, clen);
+ break;
+ }
+ case REDUCTION_ROW: {
+ reduceCol(gCtx, instName, "reduce_col_mean", in, out, rlen, clen);
+ break;
+ }
+ default:
+ throw new DMLRuntimeException("Internal Error - Unsupported reduction direction for mean");
+ }
+ break;
+ }
+ case OP_MULTIPLY : {
+ switch (reductionDirection) {
+ case REDUCTION_ALL: {
+ double result = reduceAll(gCtx, instName, "reduce_prod", in, size);
+ ec.setScalarOutput(output, new DoubleObject(result));
+ break;
+ }
+ default:
+ throw new DMLRuntimeException("Internal Error - Unsupported reduction direction for multiplication");
+ }
+ break;
+ }
+ case OP_MAX :{
+ switch(reductionDirection) {
+ case REDUCTION_ALL: {
+ double result = reduceAll(gCtx, instName, "reduce_max", in, size);
+ ec.setScalarOutput(output, new DoubleObject(result));
+ break;
+ }
+ case REDUCTION_COL: {
+ reduceRow(gCtx, instName, "reduce_row_max", in, out, rlen, clen);
+ break;
+ }
+ case REDUCTION_ROW: {
+ reduceCol(gCtx, instName, "reduce_col_max", in, out, rlen, clen);
+ break;
+ }
+ default:
+ throw new DMLRuntimeException("Internal Error - Unsupported reduction direction for max");
+ }
+ break;
+ }
+ case OP_MIN :{
+ switch(reductionDirection) {
+ case REDUCTION_ALL: {
+ double result = reduceAll(gCtx, instName, "reduce_min", in, size);
+ ec.setScalarOutput(output, new DoubleObject(result));
break;
}
- case OP_VARIANCE : {
- // Temporary GPU array for
- Pointer tmp = gCtx.allocate(instName, size * Sizeof.DOUBLE);
- Pointer tmp2 = gCtx.allocate(instName, size * Sizeof.DOUBLE);
+ case REDUCTION_COL: {
+ reduceRow(gCtx, instName, "reduce_row_min", in, out, rlen, clen);
+ break;
+ }
+ case REDUCTION_ROW: {
+ reduceCol(gCtx, instName, "reduce_col_min", in, out, rlen, clen);
+ break;
+ }
+ default:
+ throw new DMLRuntimeException("Internal Error - Unsupported reduction direction for min");
+ }
+ break;
+ }
+ case OP_VARIANCE : {
+ // Temporary GPU array for
+ Pointer tmp = gCtx.allocate(instName, size * Sizeof.DOUBLE);
+ Pointer tmp2 = gCtx.allocate(instName, size * Sizeof.DOUBLE);
- switch(reductionDirection) {
+ switch(reductionDirection) {
- case REDUCTION_ALL: {
- double result = reduceAll(gCtx, instName, "reduce_sum", in, size);
- double mean = result / size;
+ case REDUCTION_ALL: {
+ double result = reduceAll(gCtx, instName, "reduce_sum", in, size);
+ double mean = result / size;
- // Subtract mean from every element in the matrix
- ScalarOperator minusOp = new RightScalarOperator(Minus.getMinusFnObject(), mean);
- matrixScalarOp(gCtx, instName, in, mean, rlen, clen, tmp, minusOp);
+ // Subtract mean from every element in the matrix
+ ScalarOperator minusOp = new RightScalarOperator(Minus.getMinusFnObject(), mean);
+ matrixScalarOp(gCtx, instName, in, mean, rlen, clen, tmp, minusOp);
- squareMatrix(gCtx, instName, tmp, tmp2, rlen, clen);
+ squareMatrix(gCtx, instName, tmp, tmp2, rlen, clen);
- double result2 = reduceAll(gCtx, instName, "reduce_sum", tmp2, size);
- double variance = result2 / (size - 1);
- ec.setScalarOutput(output, new DoubleObject(variance));
+ double result2 = reduceAll(gCtx, instName, "reduce_sum", tmp2, size);
+ double variance = result2 / (size - 1);
+ ec.setScalarOutput(output, new DoubleObject(variance));
- break;
- }
- case REDUCTION_COL: {
- reduceRow(gCtx, instName, "reduce_row_mean", in, out, rlen, clen);
- // Subtract the row-wise mean from every element in the matrix
- BinaryOperator minusOp = new BinaryOperator(Minus.getMinusFnObject());
- matrixMatrixOp(gCtx, instName, in, out, rlen, clen, VectorShape.NONE.code(), VectorShape.COLUMN.code(), tmp, minusOp);
+ break;
+ }
+ case REDUCTION_COL: {
+ reduceRow(gCtx, instName, "reduce_row_mean", in, out, rlen, clen);
+ // Subtract the row-wise mean from every element in the matrix
+ BinaryOperator minusOp = new BinaryOperator(Minus.getMinusFnObject());
+ matrixMatrixOp(gCtx, instName, in, out, rlen, clen, VectorShape.NONE.code(), VectorShape.COLUMN.code(), tmp, minusOp);
- squareMatrix(gCtx, instName, tmp, tmp2, rlen, clen);
+ squareMatrix(gCtx, instName, tmp, tmp2, rlen, clen);
- Pointer tmpRow = gCtx.allocate(instName, rlen * Sizeof.DOUBLE);
- reduceRow(gCtx, instName, "reduce_row_sum", tmp2, tmpRow, rlen, clen);
+ Pointer tmpRow = gCtx.allocate(instName, rlen * Sizeof.DOUBLE);
+ reduceRow(gCtx, instName, "reduce_row_sum", tmp2, tmpRow, rlen, clen);
- ScalarOperator divideOp = new RightScalarOperator(Divide.getDivideFnObject(), clen - 1);
- matrixScalarOp(gCtx, instName, tmpRow, clen - 1, rlen, 1, out, divideOp);
+ ScalarOperator divideOp = new RightScalarOperator(Divide.getDivideFnObject(), clen - 1);
+ matrixScalarOp(gCtx, instName, tmpRow, clen - 1, rlen, 1, out, divideOp);
- gCtx.cudaFreeHelper(instName, tmpRow);
+ gCtx.cudaFreeHelper(instName, tmpRow);
- break;
- }
- case REDUCTION_ROW: {
- reduceCol(gCtx, instName, "reduce_col_mean", in, out, rlen, clen);
- // Subtract the columns-wise mean from every element in the matrix
- BinaryOperator minusOp = new BinaryOperator(Minus.getMinusFnObject());
- matrixMatrixOp(gCtx, instName, in, out, rlen, clen, VectorShape.NONE.code(), VectorShape.ROW.code(), tmp, minusOp);
+ break;
+ }
+ case REDUCTION_ROW: {
+ reduceCol(gCtx, instName, "reduce_col_mean", in, out, rlen, clen);
+ // Subtract the columns-wise mean from every element in the matrix
+ BinaryOperator minusOp = new BinaryOperator(Minus.getMinusFnObject());
+ matrixMatrixOp(gCtx, instName, in, out, rlen, clen, VectorShape.NONE.code(), VectorShape.ROW.code(), tmp, minusOp);
- squareMatrix(gCtx, instName, tmp, tmp2, rlen, clen);
+ squareMatrix(gCtx, instName, tmp, tmp2, rlen, clen);
- Pointer tmpCol = gCtx.allocate(instName, clen * Sizeof.DOUBLE);
- reduceCol(gCtx, instName, "reduce_col_sum", tmp2, tmpCol, rlen, clen);
+ Pointer tmpCol = gCtx.allocate(instName, clen * Sizeof.DOUBLE);
+ reduceCol(gCtx, instName, "reduce_col_sum", tmp2, tmpCol, rlen, clen);
- ScalarOperator divideOp = new RightScalarOperator(Divide.getDivideFnObject(), rlen - 1);
- matrixScalarOp(gCtx, instName, tmpCol, rlen - 1, 1, clen, out, divideOp);
+ ScalarOperator divideOp = new RightScalarOperator(Divide.getDivideFnObject(), rlen - 1);
+ matrixScalarOp(gCtx, instName, tmpCol, rlen - 1, 1, clen, out, divideOp);
- gCtx.cudaFreeHelper(instName, tmpCol);
+ gCtx.cudaFreeHelper(instName, tmpCol);
- break;
- }
- default:
- throw new DMLRuntimeException("Internal Error - Unsupported reduction direction for variance");
- }
- gCtx.cudaFreeHelper(instName, tmp);
- gCtx.cudaFreeHelper(instName, tmp2);
break;
}
- case OP_MAXINDEX : {
- switch(reductionDirection) {
- case REDUCTION_COL:
- throw new DMLRuntimeException("Internal Error - Column maxindex of matrix not implemented yet for GPU ");
- default:
- throw new DMLRuntimeException("Internal Error - Unsupported reduction direction for maxindex");
- }
- // break;
+ default:
+ throw new DMLRuntimeException("Internal Error - Unsupported reduction direction for variance");
}
- case OP_MININDEX : {
- switch(reductionDirection) {
- case REDUCTION_COL:
- throw new DMLRuntimeException("Internal Error - Column minindex of matrix not implemented yet for GPU ");
- default:
- throw new DMLRuntimeException("Internal Error - Unsupported reduction direction for minindex");
- }
- // break;
+ gCtx.cudaFreeHelper(instName, tmp);
+ gCtx.cudaFreeHelper(instName, tmp2);
+ break;
+ }
+ case OP_MAXINDEX : {
+ switch(reductionDirection) {
+ case REDUCTION_COL:
+ throw new DMLRuntimeException("Internal Error - Column maxindex of matrix not implemented yet for GPU ");
+ default:
+ throw new DMLRuntimeException("Internal Error - Unsupported reduction direction for maxindex");
+ }
+ // break;
+ }
+ case OP_MININDEX : {
+ switch(reductionDirection) {
+ case REDUCTION_COL:
+ throw new DMLRuntimeException("Internal Error - Column minindex of matrix not implemented yet for GPU ");
+ default:
+ throw new DMLRuntimeException("Internal Error - Unsupported reduction direction for minindex");
}
- default : throw new DMLRuntimeException("Internal Error - Invalid GPU Unary aggregate function!");
+ // break;
+ }
+ default : throw new DMLRuntimeException("Internal Error - Invalid GPU Unary aggregate function!");
}
}
@@ -2222,7 +2222,7 @@ public class LibMatrixCUDA {
* @throws DMLRuntimeException if error
*/
private static void squareMatrix(GPUContext gCtx, String instName, Pointer in, Pointer out, int rlen, int clen) throws DMLRuntimeException {
- ScalarOperator power2op = new RightScalarOperator(Power.getPowerFnObject(), 2);
+ ScalarOperator power2op = new RightScalarOperator(Power.getPowerFnObject(), 2);
matrixScalarOp(gCtx, instName, in, 2, rlen, clen, out, power2op);
}
@@ -2236,9 +2236,9 @@ public class LibMatrixCUDA {
* @throws DMLRuntimeException if DMLRuntimeException occurs
*/
private static double reduceAll(GPUContext gCtx, String instName, String kernelFunction, Pointer in, int n) throws DMLRuntimeException {
- LOG.trace("GPU : reduceAll for " + kernelFunction + ", GPUContext=" + gCtx);
+ LOG.trace("GPU : reduceAll for " + kernelFunction + ", GPUContext=" + gCtx);
- int[] tmp = getKernelParamsForReduceAll(gCtx, n);
+ int[] tmp = getKernelParamsForReduceAll(gCtx, n);
int blocks = tmp[0], threads = tmp[1], sharedMem = tmp[2];
Pointer tempOut = gCtx.allocate(instName, n * Sizeof.DOUBLE);
@@ -2256,7 +2256,7 @@ public class LibMatrixCUDA {
blocks = tmp[0]; threads = tmp[1]; sharedMem = tmp[2];
if (GPUStatistics.DISPLAY_STATISTICS) t2 = System.nanoTime();
getCudaKernels(gCtx).launchKernel(kernelFunction, new ExecutionConfig(blocks, threads, sharedMem),
- tempOut, tempOut, s);
+ tempOut, tempOut, s);
if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_REDUCE_ALL_KERNEL, System.nanoTime() - t2);
s = (s + (threads*2-1)) / (threads*2);
}
@@ -2282,15 +2282,15 @@ public class LibMatrixCUDA {
* @throws DMLRuntimeException if DMLRuntimeException occurs
*/
private static void reduceRow(GPUContext gCtx, String instName, String kernelFunction, Pointer in, Pointer out, int rows, int cols) throws DMLRuntimeException {
- LOG.trace("GPU : reduceRow for " + kernelFunction + ", GPUContext=" + gCtx);
+ LOG.trace("GPU : reduceRow for " + kernelFunction + ", GPUContext=" + gCtx);
- int[] tmp = getKernelParamsForReduceByRow(gCtx, rows, cols);
+ int[] tmp = getKernelParamsForReduceByRow(gCtx, rows, cols);
int blocks = tmp[0], threads = tmp[1], sharedMem = tmp[2];
long t0=0;
if (GPUStatistics.DISPLAY_STATISTICS) t0 = System.nanoTime();
getCudaKernels(gCtx).launchKernel(kernelFunction, new ExecutionConfig(blocks, threads, sharedMem),
- in, out, rows, cols);
+ in, out, rows, cols);
//cudaDeviceSynchronize;
if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_REDUCE_ROW_KERNEL, System.nanoTime() - t0);
@@ -2308,15 +2308,15 @@ public class LibMatrixCUDA {
* @throws DMLRuntimeException if DMLRuntimeException occurs
*/
private static void reduceCol(GPUContext gCtx, String instName, String kernelFunction, Pointer in, Pointer out, int rows, int cols) throws DMLRuntimeException {
- LOG.trace("GPU : reduceCol for " + kernelFunction + ", GPUContext=" + gCtx);
+ LOG.trace("GPU : reduceCol for " + kernelFunction + ", GPUContext=" + gCtx);
- int[] tmp = getKernelParamsForReduceByCol(gCtx, rows, cols);
+ int[] tmp = getKernelParamsForReduceByCol(gCtx, rows, cols);
int blocks = tmp[0], threads = tmp[1], sharedMem = tmp[2];
long t0=0;
if (GPUStatistics.DISPLAY_STATISTICS) t0 = System.nanoTime();
getCudaKernels(gCtx).launchKernel(kernelFunction, new ExecutionConfig(blocks, threads, sharedMem),
- in, out, rows, cols);
+ in, out, rows, cols);
//cudaDeviceSynchronize;
if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_REDUCE_COL_KERNEL, System.nanoTime() - t0);
}
@@ -2448,51 +2448,51 @@ public class LibMatrixCUDA {
throw new DMLRuntimeException("GPU : Invalid internal state, the GPUContext set with the ExecutionContext is not the same used to run this LibMatrixCUDA function");
double constant = op.getConstant();
LOG.trace("GPU : matrixScalarArithmetic, scalar: " + constant + ", GPUContext=" + gCtx);
-
+
int outRLen = isInputTransposed ? (int) in.getNumColumns() : (int) in.getNumRows();
int outCLen = isInputTransposed ? (int) in.getNumRows() : (int) in.getNumColumns();
-
+
//boolean isCUDALibAvailable = (op.fn instanceof Multiply
// || (op.fn instanceof Divide && op instanceof RightScalarOperator && constant != 0)) && !isSparseAndEmpty(gCtx, in);
//if(!isCUDALibAvailable) {
- if(constant == 0) {
- if(op.fn instanceof Plus || (op.fn instanceof Minus && op instanceof RightScalarOperator) || op.fn instanceof Or) {
- deviceCopy(ec, gCtx, instName, in, outputName, isInputTransposed);
- }
- else if(op.fn instanceof Multiply || op.fn instanceof And) {
- setOutputToConstant(ec, gCtx, instName, 0.0, outputName, outRLen, outCLen);
- }
- else if(op.fn instanceof Power) {
- setOutputToConstant(ec, gCtx, instName, 1.0, outputName, outRLen, outCLen);
- }
- // TODO:
- // x/0.0 is either +Infinity or -Infinity according to Java.
- // In the context of a matrix, different elements of the matrix
- // could have different values.
- // If the IEEE 754 standard defines otherwise, this logic needs
- // to be re-enabled and the Java computation logic for divide by zero
- // needs to be revisited
- //else if(op.fn instanceof Divide && isSparseAndEmpty(gCtx, in)) {
- // setOutputToConstant(ec, gCtx, instName, Double.NaN, outputName);
- //}
- //else if(op.fn instanceof Divide) {
- // //For division, IEEE 754 defines x/0.0 as INFINITY and 0.0/0.0 as NaN.
- // compareAndSet(ec, gCtx, instName, in, outputName, 0.0, 1e-6, Double.NaN, Double.POSITIVE_INFINITY, Double.POSITIVE_INFINITY);
- //}
- else {
- // TODO: Potential to optimize
- matrixScalarOp(ec, gCtx, instName, in, outputName, isInputTransposed, op);
- }
+ if(constant == 0) {
+ if(op.fn instanceof Plus || (op.fn instanceof Minus && op instanceof RightScalarOperator) || op.fn instanceof Or) {
+ deviceCopy(ec, gCtx, instName, in, outputName, isInputTransposed);
}
- else if(constant == 1.0 && op.fn instanceof Or) {
- setOutputToConstant(ec, gCtx, instName, 1.0, outputName, outRLen, outCLen);
+ else if(op.fn instanceof Multiply || op.fn instanceof And) {
+ setOutputToConstant(ec, gCtx, instName, 0.0, outputName, outRLen, outCLen);
}
- else if(constant == 1.0 && (op.fn instanceof And || op.fn instanceof Power)) {
- deviceCopy(ec, gCtx, instName, in, outputName, isInputTransposed);
+ else if(op.fn instanceof Power) {
+ setOutputToConstant(ec, gCtx, instName, 1.0, outputName, outRLen, outCLen);
}
+ // TODO:
+ // x/0.0 is either +Infinity or -Infinity according to Java.
+ // In the context of a matrix, different elements of the matrix
+ // could have different values.
+ // If the IEEE 754 standard defines otherwise, this logic needs
+ // to be re-enabled and the Java computation logic for divide by zero
+ // needs to be revisited
+ //else if(op.fn instanceof Divide && isSparseAndEmpty(gCtx, in)) {
+ // setOutputToConstant(ec, gCtx, instName, Double.NaN, outputName);
+ //}
+ //else if(op.fn instanceof Divide) {
+ // //For division, IEEE 754 defines x/0.0 as INFINITY and 0.0/0.0 as NaN.
+ // compareAndSet(ec, gCtx, instName, in, outputName, 0.0, 1e-6, Double.NaN, Double.POSITIVE_INFINITY, Double.POSITIVE_INFINITY);
+ //}
else {
+ // TODO: Potential to optimize
matrixScalarOp(ec, gCtx, instName, in, outputName, isInputTransposed, op);
}
+ }
+ else if(constant == 1.0 && op.fn instanceof Or) {
+ setOutputToConstant(ec, gCtx, instName, 1.0, outputName, outRLen, outCLen);
+ }
+ else if(constant == 1.0 && (op.fn instanceof And || op.fn instanceof Power)) {
+ deviceCopy(ec, gCtx, instName, in, outputName, isInputTransposed);
+ }
+ else {
+ matrixScalarOp(ec, gCtx, instName, in, outputName, isInputTransposed, op);
+ }
// }
//else {
// double alpha = 0;
@@ -2506,8 +2506,8 @@ public class LibMatrixCUDA {
// throw new DMLRuntimeException("Unsupported op");
// }
- // TODO: Performance optimization: Call cublasDaxpy if(in.getNumRows() == 1 || in.getNumColumns() == 1)
- // C = alpha* op( A ) + beta* op ( B )
+ // TODO: Performance optimization: Call cublasDaxpy if(in.getNumRows() == 1 || in.getNumColumns() == 1)
+ // C = alpha* op( A ) + beta* op ( B )
// dgeam(ec, gCtx, instName, in, in, outputName, isInputTransposed, isInputTransposed, alpha, 0.0);
//}
}
@@ -2563,15 +2563,15 @@ public class LibMatrixCUDA {
* @throws DMLRuntimeException if DMLRuntimeException occurs
*/
public static void matrixMatrixArithmetic(ExecutionContext ec, GPUContext gCtx, String instName, MatrixObject in1, MatrixObject in2,
- String outputName, boolean isLeftTransposed, boolean isRightTransposed, BinaryOperator op) throws DMLRuntimeException {
+ String outputName, boolean isLeftTransposed, boolean isRightTransposed, BinaryOperator op) throws DMLRuntimeException {
if (ec.getGPUContext(0) != gCtx)
throw new DMLRuntimeException("GPU : Invalid internal state, the GPUContext set with the ExecutionContext is not the same used to run this LibMatrixCUDA function");
boolean isCUDALibAvailable = (op.fn instanceof Plus || op.fn instanceof Minus) && !isSparseAndEmpty(gCtx, in1) && !isSparseAndEmpty(gCtx, in2) && !isVector(in1) && !isVector(in2);
if(!isCUDALibAvailable) {
- matrixMatrixOp(ec, gCtx, instName, in1, in2, outputName, isLeftTransposed, isRightTransposed, op);
+ matrixMatrixOp(ec, gCtx, instName, in1, in2, outputName, isLeftTransposed, isRightTransposed, op);
}
else {
- double alpha;
+ double alpha;
double beta;
if(op.fn instanceof Plus) {
alpha = 1.0;
@@ -2602,7 +2602,7 @@ public class LibMatrixCUDA {
* @throws DMLRuntimeException if DMLRuntimeException occurs
*/
private static void matrixScalarOp(ExecutionContext ec, GPUContext gCtx, String instName, MatrixObject in, String outputName, boolean isInputTransposed,
- ScalarOperator op) throws DMLRuntimeException {
+ ScalarOperator op) throws DMLRuntimeException {
if (ec.getGPUContext(0) != gCtx)
throw new DMLRuntimeException("GPU : Invalid internal state, the GPUContext set with the ExecutionContext is not the same used to run this LibMatrixCUDA function");
if(isInputTransposed)
@@ -2638,11 +2638,11 @@ public class LibMatrixCUDA {
int isLeftScalar = (op instanceof LeftScalarOperator) ? 1 : 0;
int size = rlenA * clenA;
long t0=0;
- if (GPUStatistics.DISPLAY_STATISTICS) t0 = System.nanoTime();
- getCudaKernels(gCtx).launchKernel("matrix_scalar_op",
- ExecutionConfig.getConfigForSimpleVectorOperations(size),
- a, scalar, c, size, getBinaryOp(op.fn), isLeftScalar);
- if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_MATRIX_SCALAR_OP_KERNEL, System.nanoTime() - t0);
+ if (GPUStatistics.DISPLAY_STATISTICS) t0 = System.nanoTime();
+ getCudaKernels(gCtx).launchKernel("matrix_scalar_op",
+ ExecutionConfig.getConfigForSimpleVectorOperations(size),
+ a, scalar, c, size, getBinaryOp(op.fn), isLeftScalar);
+ if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_MATRIX_SCALAR_OP_KERNEL, System.nanoTime() - t0);
}
/**
@@ -2660,7 +2660,7 @@ public class LibMatrixCUDA {
* @throws DMLRuntimeException if DMLRuntimeException occurs
*/
private static void matrixMatrixOp(ExecutionContext ec, GPUContext gCtx, String instName, MatrixObject in1, MatrixObject in2,
- String outputName, boolean isLeftTransposed, boolean isRightTransposed, BinaryOperator op) throws DMLRuntimeException {
+ String outputName, boolean isLeftTransposed, boolean isRightTransposed, BinaryOperator op) throws DMLRuntimeException {
if (ec.getGPUContext(0) != gCtx)
throw new DMLRuntimeException("GPU : Invalid internal state, the GPUContext set with the ExecutionContext is not the same used to run this LibMatrixCUDA function");
boolean isEmpty1 = isSparseAndEmpty(gCtx, in1);
@@ -2671,7 +2671,7 @@ public class LibMatrixCUDA {
int clenB = toInt(in2.getNumColumns());
int vecStatusA = getVectorStatus(rlenA, clenA).code();
int vecStatusB = getVectorStatus(rlenB, clenB).code();
-
+
if(isLeftTransposed || isRightTransposed) {
throw new DMLRuntimeException("Unsupported operator: GPU transposed binary op " + isLeftTransposed + " " + isRightTransposed);
}
@@ -2742,8 +2742,8 @@ public class LibMatrixCUDA {
long t0=0;
if (GPUStatistics.DISPLAY_STATISTICS) t0 = System.nanoTime();
getCudaKernels(gCtx).launchKernel("matrix_matrix_cellwise_op",
- ExecutionConfig.getConfigForSimpleMatrixOperations(maxRlen, maxClen),
- a, b, c, maxRlen, maxClen, vecStatusA, vecStatusB, getBinaryOp(op.fn));
+ ExecutionConfig.getConfigForSimpleMatrixOperations(maxRlen, maxClen),
+ a, b, c, maxRlen, maxClen, vecStatusA, vecStatusB, getBinaryOp(op.fn));
if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_MATRIX_MATRIX_CELLWISE_OP_KERNEL, System.nanoTime() - t0);
}
@@ -2816,7 +2816,7 @@ public class LibMatrixCUDA {
@SuppressWarnings("unused")
private static void compareAndSet(ExecutionContext ec, GPUContext gCtx, String instName, MatrixObject in, String outputName, double compareVal, double tolerance,
- double ifEqualsVal, double ifLessThanVal, double ifGreaterThanVal) throws DMLRuntimeException {
+ double ifEqualsVal, double ifLessThanVal, double ifGreaterThanVal) throws DMLRuntimeException {
if (ec.getGPUContext(0) != gCtx)
throw new DMLRuntimeException("GPU : Invalid internal state, the GPUContext set with the ExecutionContext is not the same used to run this LibMatrixCUDA function");
Pointer A = getDensePointer(gCtx, in, instName); // TODO: FIXME: Implement sparse kernel
@@ -2825,14 +2825,14 @@ public class LibMatrixCUDA {
int clen = toInt(out.getNumColumns());
getDenseMatrixOutputForGPUInstruction(ec, instName, outputName, rlen, clen); // Allocated the dense output matrix
Pointer ret = getDensePointer(gCtx, out, instName);
-
+
// out.getMatrixCharacteristics().setNonZeros(rlen*clen);
// compareAndSet(double* A, double* ret, int rlen, int clen, double compareVal, double ifEqualsVal, double ifNotEqualsVal)
long t0=0;
if (GPUStatistics.DISPLAY_STATISTICS) t0 = System.nanoTime();
getCudaKernels(gCtx).launchKernel("compare_and_set",
- ExecutionConfig.getConfigForSimpleMatrixOperations(rlen, clen),
- A, ret, rlen, clen, compareVal, tolerance, ifEqualsVal, ifLessThanVal, ifGreaterThanVal);
+ ExecutionConfig.getConfigForSimpleMatrixOperations(rlen, clen),
+ A, ret, rlen, clen, compareVal, tolerance, ifEqualsVal, ifLessThanVal, ifGreaterThanVal);
if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_COMPARE_AND_SET_KERNEL, System.nanoTime() - t0);
}
@@ -2935,7 +2935,7 @@ public class LibMatrixCUDA {
* @throws DMLRuntimeException if DMLRuntimeException occurs
*/
private static void dgeam(ExecutionContext ec, GPUContext gCtx, String instName, MatrixObject in1, MatrixObject in2, String outputName,
- boolean isLeftTransposed, boolean isRightTransposed, double alpha, double beta) throws DMLRuntimeException {
+ boolean isLeftTransposed, boolean isRightTransposed, double alpha, double beta) throws DMLRuntimeException {
if (ec.getGPUContext(0) != gCtx)
throw new DMLRuntimeException("GPU : Invalid internal state, the GPUContext set with the ExecutionContext is not the same used to run this LibMatrixCUDA function");
LOG.trace("GPU : dgeam" + ", GPUContext=" + gCtx);
@@ -2944,7 +2944,7 @@ public class LibMatrixCUDA {
Pointer betaPtr = pointerTo(beta);
int transa = isLeftTransposed ? CUBLAS_OP_T : CUBLAS_OP_N;
int transb = isRightTransposed ? CUBLAS_OP_T : CUBLAS_OP_N;
-
+
long outRLen = isLeftTransposed ? in1.getNumColumns() : in1.getNumRows();
long outCLen = isLeftTransposed ? in1.getNumRows() : in1.getNumColumns();
@@ -3086,6 +3086,70 @@ public class LibMatrixCUDA {
//**************** Matrix Manipulation Functions *********************/
//********************************************************************/
+ /**
+ * Method to perform rangeReIndex operation for a given lower and upper bounds in row and column dimensions.
+ *
+ * @param ec current execution context
+ * @param gCtx current gpu context
+ * @param instName name of the instruction for maintaining statistics
+ * @param in1 input matrix object
+ * @param ixrange index range (0-based)
+ * @param outputName output matrix object
+ * @throws DMLRuntimeException if error occurs
+ */
+ public static void sliceOperations(ExecutionContext ec, GPUContext gCtx, String instName, MatrixObject in1,
+ IndexRange ixrange, String outputName) throws DMLRuntimeException {
+ if (ec.getGPUContext(0) != gCtx)
+ throw new DMLRuntimeException(
+ "GPU : Invalid internal state, the GPUContext set with the ExecutionContext is not the same used to run this LibMatrixCUDA function");
+ LOG.trace("GPU : sliceOperations" + ", GPUContext=" + gCtx);
+
+ int rl = (int) ixrange.rowStart;
+ int ru = (int) ixrange.rowEnd;
+ int cl = (int) ixrange.colStart;
+ int cu = (int) ixrange.colEnd;
+ if (rl < 0 || rl >= in1.getNumRows() || ru < rl || ru >= in1.getNumRows() || cl < 0
+ || cu >= in1.getNumColumns() || cu < cl || cu >= in1.getNumColumns()) {
+ throw new DMLRuntimeException("Invalid values for matrix indexing: [" + (rl + 1) + ":" + (ru + 1) + ","
+ + (cl + 1) + ":" + (cu + 1) + "] " + "must be within matrix dimensions [" + in1.getNumRows() + ","
+ + in1.getNumColumns() + "]");
+ }
+
+ int len1 = toInt(in1.getNumColumns());
+ int len2 = toInt(ec.getMatrixObject(outputName).getNumColumns());
+ if(isInSparseFormat(gCtx, in1)) {
+ // Input in1 is in sparse format and output is in dense format
+ MatrixObject out = getDenseMatrixOutputForGPUInstruction(ec, instName, outputName, ru - rl + 1, cu - cl + 1);
+ CSRPointer inPointer = getSparsePointer(gCtx, in1, instName);
+ Pointer outPointer = getDensePointer(gCtx, out, instName);
+ int size = ru - rl + 1;
+ long t0 = GPUStatistics.DISPLAY_STATISTICS ? System.nanoTime() : 0;
+ // Performs a slice operation where the input matrix is sparse and the output matrix is dense.
+ // This function avoids unnecessary sparse to dense conversion of the input matrix.
+ // We can generalize this later to output sparse matrix.
+ getCudaKernels(gCtx).launchKernel("slice_sparse_dense", ExecutionConfig.getConfigForSimpleVectorOperations(size),
+ inPointer.val, inPointer.rowPtr, inPointer.colInd, outPointer, rl, ru, cl, cu);
+ if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_RIX_SPARSE_DENSE_OP, System.nanoTime() - t0);
+ }
+ else {
+ // Input in1 is in dense format (see inPointer)
+ MatrixObject out = getDenseMatrixOutputForGPUInstruction(ec, instName, outputName, ru - rl + 1, cu - cl + 1);
+
+ Pointer inPointer = getDensePointer(gCtx, in1, instName);
+ Pointer outPointer = getDensePointer(gCtx, out, instName);
+ long t0 = GPUStatistics.DISPLAY_STATISTICS ? System.nanoTime() : 0;
+ if (len1 == len2) {
+ cudaMemcpy(outPointer, inPointer.withByteOffset(rl * len1 * Sizeof.DOUBLE), (ru - rl + 1) * len1
+ * Sizeof.DOUBLE, cudaMemcpyDeviceToDevice);
+ } else {
+ for (int i = rl, ix1 = rl * len1 + cl, ix2 = 0; i <= ru; i++, ix1 += len1, ix2 += len2) {
+ cudaMemcpy(outPointer.withByteOffset(ix2 * Sizeof.DOUBLE),
+ inPointer.withByteOffset(ix1 * Sizeof.DOUBLE), len2 * Sizeof.DOUBLE, cudaMemcpyDeviceToDevice);
+ }
+ }
+ if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_RIX_DENSE_OP, System.nanoTime() - t0);
+ }
+ }
public static void cbind(ExecutionContext ec, GPUContext gCtx, String instName, MatrixObject in1, MatrixObject in2, String outputName) throws DMLRuntimeException {
if (ec.getGPUContext(0) != gCtx)
@@ -3093,7 +3157,7 @@ public class LibMatrixCUDA {
LOG.trace("GPU : cbind" + ", GPUContext=" + gCtx);
long t1 = 0;
-
+
long rowsA = toInt(in1.getNumRows());
long colsA = toInt(in1.getNumColumns());
long rowsB = toInt(in2.getNumRows());
@@ -3114,8 +3178,8 @@ public class LibMatrixCUDA {
if (GPUStatistics.DISPLAY_STATISTICS) t1 = System.nanoTime();
getCudaKernels(gCtx)
- .launchKernel("cbind", ExecutionConfig.getConfigForSimpleMatrixOperations(maxRows, maxCols), A, B, C,
- rowsA, colsA, rowsB, colsB);
+ .launchKernel("cbind", ExecutionConfig.getConfigForSimpleMatrixOperations(maxRows, maxCols), A, B, C,
+ rowsA, colsA, rowsB, colsB);
if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_CBIND_KERNEL, System.nanoTime() - t1);
}
@@ -3126,7 +3190,7 @@ public class LibMatrixCUDA {
LOG.trace("GPU : rbind" + ", GPUContext=" + gCtx);
long t1 = 0;
-
+
int rowsA = toInt(in1.getNumRows());
int colsA = toInt(in1.getNumColumns());
int rowsB = toInt(in2.getNumRows());
@@ -3147,8 +3211,8 @@ public class LibMatrixCUDA {
if (GPUStatistics.DISPLAY_STATISTICS) t1 = System.nanoTime();
getCudaKernels(gCtx)
- .launchKernel("rbind", ExecutionConfig.getConfigForSimpleMatrixOperations(maxRows, maxCols), A, B, C,
- rowsA, colsA, rowsB, colsB);
+ .launchKernel("rbind", ExecutionConfig.getConfigForSimpleMatrixOperations(maxRows, maxCols), A, B, C,
+ rowsA, colsA, rowsB, colsB);
if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_RBIND_KERNEL, System.nanoTime() - t1);
}
@@ -3422,7 +3486,7 @@ public class LibMatrixCUDA {
* @throws DMLRuntimeException if DMLRuntimeException occurs
*/
public static void axpy(ExecutionContext ec, GPUContext gCtx, String instName, MatrixObject in1, MatrixObject in2,
- String outputName, double constant) throws DMLRuntimeException {
+ String outputName, double constant) throws DMLRuntimeException {
if (ec.getGPUContext(0) != gCtx)
throw new DMLRuntimeException("GPU : Invalid internal state, the GPUContext set with the ExecutionContext is not the same used to run this LibMatrixCUDA function");
Pointer A = getDensePointer(gCtx, in1, instName);
@@ -3433,9 +3497,9 @@ public class LibMatrixCUDA {
long t1=0, t2=0;
if(in1.getNumRows() == in2.getNumRows() && in1.getNumColumns() == in2.getNumColumns()) {
- LOG.trace("GPU : cublasDaxpy" + ", GPUContext=" + gCtx);
+ LOG.trace("GPU : cublasDaxpy" + ", GPUContext=" + gCtx);
- // Matrix-Matrix daxpy
+ // Matrix-Matrix daxpy
long n = in1.getNumRows()*in2.getNumColumns(); // Since A is always a matrix
Pointer alphaPtr = pointerTo(constant);
// C <- A + alpha*B
@@ -3451,9 +3515,9 @@ public class LibMatrixCUDA {
if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_DAXPY_LIB, System.nanoTime() - t2);
}
else {
- LOG.trace("GPU : daxpy_matrix_vector" + ", GPUContext=" + gCtx);
+ LOG.trace("GPU : daxpy_matrix_vector" + ", GPUContext=" + gCtx);
- // Matrix-Vector daxpy
+ // Matrix-Vector daxpy
// Note: Vector-Matrix operation is not supported
// daxpy_matrix_vector(double* A, double* B, double alpha, double* ret, int rlenA, int clenA, int rlenB, int clenB)
if (GPUStatistics.DISPLAY_STATISTICS) t1 = System.nanoTime();
@@ -3466,51 +3530,51 @@ public class LibMatrixCUDA {
}
- /**
- * Implements the "solve" function for systemml Ax = B (A is of size m*n, B is of size m*1, x is of size n*1)
- *
- * @param ec a valid {@link ExecutionContext}
- * @param gCtx a valid {@link GPUContext}
- * @param instName the invoking instruction's name for record {@link Statistics}.
- * @param in1 input matrix A
- * @param in2 input matrix B
- * @param outputName name of the output matrix
- * @throws DMLRuntimeException if an error occurs
- */
- public static void solve(ExecutionContext ec, GPUContext gCtx, String instName, MatrixObject in1, MatrixObject in2, String outputName) throws DMLRuntimeException {
- if (ec.getGPUContext(0) != gCtx)
- throw new DMLRuntimeException("GPU : Invalid internal state, the GPUContext set with the ExecutionContext is not the same used to run this LibMatrixCUDA function");
+ /**
+ * Implements the "solve" function for systemml Ax = B (A is of size m*n, B is of size m*1, x is of size n*1)
+ *
+ * @param ec a valid {@link ExecutionContext}
+ * @param gCtx a valid {@link GPUContext}
+ * @param instName the invoking instruction's name for record {@link Statistics}.
+ * @param in1 input matrix A
+ * @param in2 input matrix B
+ * @param outputName name of the output matrix
+ * @throws DMLRuntimeException if an error occurs
+ */
+ public static void solve(ExecutionContext ec, GPUContext gCtx, String instName, MatrixObject in1, MatrixObject in2, String outputName) throws DMLRuntimeException {
+ if (ec.getGPUContext(0) != gCtx)
+ throw new DMLRuntimeException("GPU : Invalid internal state, the GPUContext set with the ExecutionContext is not the same used to run this LibMatrixCUDA function");
- // x = solve(A, b)
+ // x = solve(A, b)
LOG.trace("GPU : solve" + ", GPUContext=" + gCtx);
long t0 = -1;
- if (!isInSparseFormat(gCtx, in1) && !isInSparseFormat(gCtx, in2)) { // Both dense
- GPUObject Aobj = in1.getGPUObject(gCtx);
- GPUObject bobj = in2.getGPUObject(gCtx);
- int m = toInt(in1.getNumRows());
- int n = toInt(in1.getNumColumns());
- if (in2.getNumRows() != m)
- throw new DMLRuntimeException("GPU : Incorrect input for solve(), rows in A should be the same as rows in B");
- if (in2.getNumColumns() != 1)
- throw new DMLRuntimeException("GPU : Incorrect input for solve(), columns in B should be 1");
-
-
- // Copy over matrices and
- // convert dense matrices to row major
- // Operation in cuSolver and cuBlas are for column major dense matrices
- // and are destructive to the original input
+ if (!isInSparseFormat(gCtx, in1) && !isInSparseFormat(gCtx, in2)) { // Both dense
+ GPUObject Aobj = in1.getGPUObject(gCtx);
+ GPUObject bobj = in2.getGPUObject(gCtx);
+ int m = toInt(in1.getNumRows());
+ int n = toInt(in1.getNumColumns());
+ if (in2.getNumRows() != m)
+ throw new DMLRuntimeException("GPU : Incorrect input for solve(), rows in A should be the same as rows in B");
+ if (in2.getNumColumns() != 1)
+ throw new DMLRuntimeException("GPU : Incorrect input for solve(), columns in B should be 1");
+
+
+ // Copy over matrices and
+ // convert dense matrices to row major
+ // Operation in cuSolver and cuBlas are for column major dense matrices
+ // and are destructive to the original input
if (GPUStatistics.DISPLAY_STATISTICS) t0 = System.nanoTime();
- GPUObject ATobj = (GPUObject) Aobj.clone();
+ GPUObject ATobj = (GPUObject) Aobj.clone();
if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_OBJECT_CLONE, System.nanoTime() - t0);
if (GPUStatistics.DISPLAY_STATISTICS) t0 = System.nanoTime();
ATobj.denseRowMajorToColumnMajor();
- if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_ROW_TO_COLUMN_MAJOR, System.nanoTime() - t0);
- Pointer A = ATobj.getJcudaDenseMatrixPtr();
+ if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_ROW_TO_COLUMN_MAJOR, System.nanoTime() - t0);
+ Pointer A = ATobj.getJcudaDenseMatrixPtr();
if (GPUStatistics.DISPLAY_STATISTICS) t0 = System.nanoTime();
- GPUObject bTobj = (GPUObject) bobj.clone();
+ GPUObject bTobj = (GPUObject) bobj.clone();
if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_OBJECT_CLONE, System.nanoTime() - t0);
if (GPUStatistics.DISPLAY_STATISTICS) t0 = System.nanoTime();
bTobj.denseRowMajorToColumnMajor();
@@ -3518,76 +3582,76 @@ public class LibMatrixCUDA {
Pointer b = bTobj.getJcudaDenseMatrixPtr();
- // The following set of operations is done following the example in the cusolver documentation
- // http://docs.nvidia.com/cuda/cusolver/#ormqr-example1
+ // The following set of operations is done following the example in the cusolver documentation
+ // http://docs.nvidia.com/cuda/cusolver/#ormqr-example1
- // step 3: query working space of geqrf and ormqr
+ // step 3: query working space of geqrf and ormqr
if (GPUStatistics.DISPLAY_STATISTICS) t0 = System.nanoTime();
int[] lwork = {0};
- JCusolverDn.cusolverDnDgeqrf_bufferSize(gCtx.getCusolverDnHandle(), m, n, A, m, lwork);
+ JCusolverDn.cusolverDnDgeqrf_bufferSize(gCtx.getCusolverDnHandle(), m, n, A, m, lwork);
if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_QR_BUFFER, System.nanoTime() - t0);
// step 4: compute QR factorization
- Pointer work = gCtx.allocate(instName, lwork[0] * Sizeof.DOUBLE);
- Pointer tau = gCtx.allocate(instName, m * Sizeof.DOUBLE);
- Pointer devInfo = gCtx.allocate(Sizeof.INT);
+ Pointer work = gCtx.allocate(instName, lwork[0] * Sizeof.DOUBLE);
+ Pointer tau = gCtx.allocate(instName, m * Sizeof.DOUBLE);
+ Pointer devInfo = gCtx.allocate(Sizeof.INT);
if (GPUStatistics.DISPLAY_STATISTICS) t0 = System.nanoTime();
JCusolverDn.cusolverDnDgeqrf(gCtx.getCusolverDnHandle(), m, n, A, m, tau, work, lwork[0], devInfo);
if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_QR, System.nanoTime() - t0);
int[] qrError = {-1};
- cudaMemcpy(Pointer.to(qrError), devInfo, Sizeof.INT, cudaMemcpyDeviceToHost);
- if (qrError[0] != 0) {
- throw new DMLRuntimeException("GPU : Error in call to geqrf (QR factorization) as part of solve, argument " + qrError[0] + " was wrong");
- }
+ cudaMemcpy(Pointer.to(qrError), devInfo, Sizeof.INT, cudaMemcpyDeviceToHost);
+ if (qrError[0] != 0) {
+ throw new DMLRuntimeException("GPU : Error in call to geqrf (QR factorization) as part of solve, argument " + qrError[0] + " was wrong");
+ }
- // step 5: compute Q^T*B
+ // step 5: compute Q^T*B
if (GPUStatistics.DISPLAY_STATISTICS) t0 = System.nanoTime();
JCusolverDn.cusolverDnDormqr(gCtx.getCusolverDnHandle(), cublasSideMode.CUBLAS_SIDE_LEFT, cublasOperation.CUBLAS_OP_T, m, 1, n, A, m, tau, b, m, work, lwork[0], devInfo);
if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_ORMQR, System.nanoTime() - t0);
cudaMemcpy(Pointer.to(qrError), devInfo, Sizeof.INT, cudaMemcpyDeviceToHost);
- if (qrError[0] != 0) {
- throw new DMLRuntimeException("GPU : Error in call to ormqr (to compuete Q^T*B after QR factorization) as part of solve, argument " + qrError[0] + " was wrong");
- }
+ if (qrError[0] != 0) {
+ throw new DMLRuntimeException("GPU : Error in call to ormqr (to compuete Q^T*B after QR factorization) as part of solve, argument " + qrError[0] + " was wrong");
+ }
- // step 6: compute x = R \ Q^T*B
+ // step 6: compute x = R \ Q^T*B
if (GPUStatistics.DISPLAY_STATISTICS) t0 = System.nanoTime();
JCublas2.cublasDtrsm(gCtx.getCublasHandle(),
- cublasSideMode.CUBLAS_SIDE_LEFT, cublasFillMode.CUBLAS_FILL_MODE_UPPER, cublasOperation.CUBLAS_OP_N, cublasDiagType.CUBLAS_DIAG_NON_UNIT,
- n, 1, pointerTo(1.0), A, m, b, m);
+ cublasSideMode.CUBLAS_SIDE_LEFT, cublasFillMode.CUBLAS_FILL_MODE_UPPER, cublasOperation.CUBLAS_OP_N, cublasDiagType.CUBLAS_DIAG_NON_UNIT,
+ n, 1, pointerTo(1.0), A, m, b, m);
if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_TRSM, System.nanoTime() - t0);
if (GPUStatistics.DISPLAY_STATISTICS) t0 = System.nanoTime();
bTobj.denseColumnMajorToRowMajor();
if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_COLUMN_TO_ROW_MAJOR, System.nanoTime() - t0);
- // TODO : Find a way to assign bTobj directly to the output and set the correct flags so as to not crash
- // There is an avoidable copy happening here
+ // TODO : Find a way to assign bTobj directly to the output and set the correct flags so as to not crash
+ // There is an avoidable copy happening here
MatrixObject out = getDenseMatrixOutputForGPUInstruction(ec, instName, outputName, in1.getNumColumns(), 1);
- cudaMemcpy(out.getGPUObject(gCtx).getJcudaDenseMatrixPtr(), bTobj.getJcudaDenseMatrixPtr(), n * 1 * Sizeof.DOUBLE, cudaMemcpyDeviceToDevice);
+ cudaMemcpy(out.getGPUObject(gCtx).getJcudaDenseMatrixPtr(), bTobj.getJcudaDenseMatrixPtr(), n * 1 * Sizeof.DOUBLE, cudaMemcpyDeviceToDevice);
- gCtx.cudaFreeHelper(instName, work);
- gCtx.cudaFreeHelper(instName, tau);
- ATobj.clearData();
- bTobj.clearData();
+ gCtx.cudaFreeHelper(instName, work);
+ gCtx.cudaFreeHelper(instName, tau);
+ ATobj.clearData();
+ bTobj.clearData();
- //debugPrintMatrix(b, n, 1);
+ //debugPrintMatrix(b, n, 1);
- } else if (isInSparseFormat(gCtx, in1) && isInSparseFormat(gCtx, in2)) { // Both sparse
- throw new DMLRuntimeException("GPU : solve on sparse inputs not supported");
- } else if (!isInSparseFormat(gCtx, in1) && isInSparseFormat(gCtx, in2)) { // A is dense, b is sparse
- // Pointer A = getDensePointer(gCtx, in1, instName);
- // Pointer B = getDensePointer(gCtx, in2, instName);
- throw new DMLRuntimeException("GPU : solve on sparse inputs not supported");
- } else if (isInSparseFormat(gCtx, in1) && !isInSparseFormat(gCtx, in2)) { // A is sparse, b is dense
- throw new DMLRuntimeException("GPU : solve on sparse inputs not supported");
- }
+ } else if (isInSparseFormat(gCtx, in1) && isInSparseFormat(gCtx, in2)) { // Both sparse
+ throw new DMLRuntimeException("GPU : solve on sparse inputs not supported");
+ } else if (!isInSparseFormat(gCtx, in1) && isInSparseFormat(gCtx, in2)) { // A is dense, b is sparse
+ // Pointer A = getDensePointer(gCtx, in1, instName);
+ // Pointer B = getDensePointer(gCtx, in2, instName);
+ throw new DMLRuntimeException("GPU : solv
<TRUNCATED>