You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@systemml.apache.org by ni...@apache.org on 2017/09/13 23:04:19 UTC
[3/3] systemml git commit: [SYSTEMML-445] Improved performance of GPU
right indexing
[SYSTEMML-445] Improved performance of GPU right indexing
- Added slice_dense_dense kernel for right indexing
- Fixed GPU bufferpool performance bug where we were doing unnecessary
recomputation of nnz for sparse input.
- Refactored asserts to throw exceptions to be consistent with our other
operators. Also, this allows our other APIs as well as production jar to
catch unexpected bugs.
Closes #663.
Project: http://git-wip-us.apache.org/repos/asf/systemml/repo
Commit: http://git-wip-us.apache.org/repos/asf/systemml/commit/4cf95c92
Tree: http://git-wip-us.apache.org/repos/asf/systemml/tree/4cf95c92
Diff: http://git-wip-us.apache.org/repos/asf/systemml/diff/4cf95c92
Branch: refs/heads/master
Commit: 4cf95c92e61902ebcc17a377d4c16efc108a60f4
Parents: e624d14
Author: Niketan Pansare <np...@us.ibm.com>
Authored: Wed Sep 13 15:58:56 2017 -0700
Committer: Niketan Pansare <np...@us.ibm.com>
Committed: Wed Sep 13 15:58:56 2017 -0700
----------------------------------------------------------------------
conf/SystemML-config.xml.template | 3 +
src/main/cpp/kernels/SystemML.cu | 29 +-
src/main/cpp/kernels/SystemML.ptx | 2864 +++++++++---------
.../java/org/apache/sysml/api/DMLScript.java | 2 +
.../apache/sysml/api/ScriptExecutorUtils.java | 1 +
.../java/org/apache/sysml/conf/DMLConfig.java | 6 +-
.../context/ExecutionContext.java | 16 +-
.../runtime/instructions/cp/CPInstruction.java | 1 +
.../instructions/gpu/GPUInstruction.java | 16 +-
.../instructions/gpu/context/CSRPointer.java | 3 +-
.../instructions/gpu/context/GPUContext.java | 87 +-
.../instructions/gpu/context/GPUObject.java | 170 +-
.../runtime/matrix/data/LibMatrixCUDA.java | 208 +-
.../runtime/matrix/data/LibMatrixCuDNN.java | 42 +-
.../sysml/runtime/matrix/data/MatrixBlock.java | 9 +
15 files changed, 1879 insertions(+), 1578 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/systemml/blob/4cf95c92/conf/SystemML-config.xml.template
----------------------------------------------------------------------
diff --git a/conf/SystemML-config.xml.template b/conf/SystemML-config.xml.template
index 454d0cc..aaf7316 100644
--- a/conf/SystemML-config.xml.template
+++ b/conf/SystemML-config.xml.template
@@ -84,6 +84,9 @@
<!-- sets the GPUs to use per process, -1 for all GPUs, a specific GPU number (5), a range (eg: 0-2) or a comma separated list (eg: 0,2,4)-->
<systemml.gpu.availableGPUs>-1</systemml.gpu.availableGPUs>
+ <!-- whether to synchronize GPUs after every GPU instruction -->
+ <systemml.gpu.sync.postProcess>true</systemml.gpu.sync.postProcess>
+
<!-- maximum wrap length for instruction and miscellaneous timer column of statistics -->
<systemml.stats.maxWrapLength>30</systemml.stats.maxWrapLength>
</root>
http://git-wip-us.apache.org/repos/asf/systemml/blob/4cf95c92/src/main/cpp/kernels/SystemML.cu
----------------------------------------------------------------------
diff --git a/src/main/cpp/kernels/SystemML.cu b/src/main/cpp/kernels/SystemML.cu
index bb6482d..231a32a 100644
--- a/src/main/cpp/kernels/SystemML.cu
+++ b/src/main/cpp/kernels/SystemML.cu
@@ -39,13 +39,13 @@ nvcc -ptx -arch=sm_30 SystemML.cu
* @param ru row upper
* @param cl column lower
* @param cu column upper
+ * @param retClen number of columns of output matrix
*/
extern "C"
-__global__ void slice_sparse_dense(double* inVal, int* inRowPtr, int* colInd, double* ret, int rl, int ru, int cl, int cu) {
+__global__ void slice_sparse_dense(double* inVal, int* inRowPtr, int* colInd, double* ret, int rl, int ru, int cl, int cu, int retClen) {
int index = blockIdx.x * blockDim.x + threadIdx.x;
int rowIndex = index + rl;
if (rowIndex <= ru){
- int retClen = cu - cl + 1;
// Iterate over elements of the row 'rowIndex'.
for(int i = inRowPtr[rowIndex]; i < inRowPtr[rowIndex+1]; i++) {
// Only slice if the index falls into the given range
@@ -56,6 +56,31 @@ __global__ void slice_sparse_dense(double* inVal, int* inRowPtr, int* colInd, do
}
}
+/**
+ * Performs a slice operation where the input matrix is dense and the output matrix is dense.
+ *
+ * @params in dense input pointer
+ * @params ret dense output pointer
+ * @param rl row lower
+ * @param ru row upper
+ * @param cl column lower
+ * @param cu column upper
+ * @param inClen number of columns of input matrix
+ * @param retClen number of columns of output matrix
+ */
+extern "C"
+__global__ void slice_dense_dense(double* in, double* ret, int rl, int ru, int cl, int cu, int inClen, int retClen) {
+ int index = blockIdx.x * blockDim.x + threadIdx.x;
+ int rowIndex = index + rl;
+ if (rowIndex <= ru){
+ int inIndex = rowIndex*inClen + cl;
+ int retIndex = index*retClen;
+ for(int i = retIndex; i < retIndex+retClen; i++, inIndex++) {
+ ret[i] = in[inIndex];
+ }
+ }
+}
+
/**
* Does a copy of upper to lower triangle of the given matrix