You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@systemds.apache.org by mb...@apache.org on 2021/07/25 16:28:48 UTC
[systemds] 02/02: [SYSTEMDS-3073] Performance ultra-sparse
transpose operations
This is an automated email from the ASF dual-hosted git repository.
mboehm7 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/systemds.git
commit 85c54928d78679137222dc0de1057eb75e39fa3f
Author: Matthias Boehm <mb...@gmail.com>
AuthorDate: Sun Jul 25 18:28:05 2021 +0200
[SYSTEMDS-3073] Performance ultra-sparse transpose operations
This patch adds an additional specialized code path for ultra-sparse
transpose operations for nnz < max(rlen, clen) in order to avoid
unnecessary overhead for cache blocking and preallocation. On the
following scenario it improve the cummulative transpose time from 5.061s
to 0.546s.
X = rand(rows=1e6, cols=1e6, sparsity=1e-11);
for(i in 1:100)
X = t(X)
print(sum(X))
---
.../sysds/runtime/matrix/data/LibMatrixReorg.java | 32 ++++++++++++++++------
1 file changed, 24 insertions(+), 8 deletions(-)
diff --git a/src/main/java/org/apache/sysds/runtime/matrix/data/LibMatrixReorg.java b/src/main/java/org/apache/sysds/runtime/matrix/data/LibMatrixReorg.java
index ecad23c..72d4ace 100644
--- a/src/main/java/org/apache/sysds/runtime/matrix/data/LibMatrixReorg.java
+++ b/src/main/java/org/apache/sysds/runtime/matrix/data/LibMatrixReorg.java
@@ -169,15 +169,18 @@ public class LibMatrixReorg {
out.allocateDenseBlock(false);
//execute transpose operation
+ boolean ultraSparse = (in.sparse && out.sparse && in.nonZeros < Math.max(in.rlen, in.clen));
if( !in.sparse && !out.sparse )
- transposeDenseToDense( in, out, 0, in.rlen, 0, in.clen );
+ transposeDenseToDense(in, out, 0, in.rlen, 0, in.clen);
+ else if( ultraSparse )
+ transposeUltraSparse(in, out);
else if( in.sparse && out.sparse )
- transposeSparseToSparse( in, out, 0, in.rlen, 0, in.clen,
+ transposeSparseToSparse(in, out, 0, in.rlen, 0, in.clen,
countNnzPerColumn(in, 0, in.rlen));
else if( in.sparse )
- transposeSparseToDense( in, out, 0, in.rlen, 0, in.clen );
+ transposeSparseToDense(in, out, 0, in.rlen, 0, in.clen);
else
- transposeDenseToSparse( in, out );
+ transposeDenseToSparse(in, out);
// System.out.println("r' ("+in.rlen+", "+in.clen+", "+in.sparse+", "+out.sparse+") in "+time.stop()+" ms.");
@@ -190,10 +193,11 @@ public class LibMatrixReorg {
public static MatrixBlock transpose(MatrixBlock in, MatrixBlock out, int k, boolean allowCSR) {
// redirect small or special cases to sequential execution
- if(in.isEmptyBlock(false) || ((long) in.rlen * (long) in.clen < PAR_NUMCELL_THRESHOLD) || k == 1 ||
- (SHALLOW_COPY_REORG && !in.sparse && !out.sparse && (in.rlen == 1 || in.clen == 1)) ||
- (in.sparse && !out.sparse && in.rlen == 1) || (!in.sparse && out.sparse && in.rlen == 1) ||
- (!in.sparse && out.sparse)) {
+ if(in.isEmptyBlock(false) || ((long) in.rlen * (long) in.clen < PAR_NUMCELL_THRESHOLD) || k == 1
+ || (SHALLOW_COPY_REORG && !in.sparse && !out.sparse && (in.rlen == 1 || in.clen == 1))
+ || (in.sparse && !out.sparse && in.rlen == 1) || (!in.sparse && out.sparse && in.rlen == 1)
+ || (in.sparse && out.sparse && in.nonZeros < Math.max(in.rlen, in.clen)) //ultra-sparse
+ || (!in.sparse && out.sparse) ) {
return transpose(in, out);
}
// set meta data and allocate output arrays (if required)
@@ -901,6 +905,18 @@ public class LibMatrixReorg {
}
}
+ private static void transposeUltraSparse(MatrixBlock in, MatrixBlock out) {
+ //note: applied if nnz < max(rlen, clen) - so no cache blocking
+ // but basic, naive transposition in a single-threaded context
+ Iterator<IJV> iter = in.getSparseBlockIterator();
+ SparseBlock b = out.getSparseBlock();
+ while( iter.hasNext() ) {
+ IJV cell = iter.next();
+ b.append(cell.getJ(), cell.getI(), cell.getV());
+ }
+ out.setNonZeros(in.getNonZeros());
+ }
+
private static void transposeSparseToSparse(MatrixBlock in, MatrixBlock out, int rl, int ru, int cl, int cu, int[] cnt)
{
//NOTE: called only in sequential or column-wise parallel execution