You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@systemds.apache.org by mb...@apache.org on 2021/07/25 16:28:48 UTC

[systemds] 02/02: [SYSTEMDS-3073] Performance ultra-sparse transpose operations

This is an automated email from the ASF dual-hosted git repository.

mboehm7 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/systemds.git

commit 85c54928d78679137222dc0de1057eb75e39fa3f
Author: Matthias Boehm <mb...@gmail.com>
AuthorDate: Sun Jul 25 18:28:05 2021 +0200

    [SYSTEMDS-3073] Performance ultra-sparse transpose operations
    
    This patch adds an additional specialized code path for ultra-sparse
    transpose operations for nnz < max(rlen, clen) in order to avoid
    unnecessary overhead for cache blocking and preallocation. On the
    following scenario it improve the cummulative transpose time from 5.061s
    to 0.546s.
    
    X = rand(rows=1e6, cols=1e6, sparsity=1e-11);
    for(i in 1:100)
      X = t(X)
    print(sum(X))
---
 .../sysds/runtime/matrix/data/LibMatrixReorg.java  | 32 ++++++++++++++++------
 1 file changed, 24 insertions(+), 8 deletions(-)

diff --git a/src/main/java/org/apache/sysds/runtime/matrix/data/LibMatrixReorg.java b/src/main/java/org/apache/sysds/runtime/matrix/data/LibMatrixReorg.java
index ecad23c..72d4ace 100644
--- a/src/main/java/org/apache/sysds/runtime/matrix/data/LibMatrixReorg.java
+++ b/src/main/java/org/apache/sysds/runtime/matrix/data/LibMatrixReorg.java
@@ -169,15 +169,18 @@ public class LibMatrixReorg {
 			out.allocateDenseBlock(false);
 	
 		//execute transpose operation
+		boolean ultraSparse = (in.sparse && out.sparse && in.nonZeros < Math.max(in.rlen, in.clen));
 		if( !in.sparse && !out.sparse )
-			transposeDenseToDense( in, out, 0, in.rlen, 0, in.clen );
+			transposeDenseToDense(in, out, 0, in.rlen, 0, in.clen);
+		else if( ultraSparse )
+			transposeUltraSparse(in, out);
 		else if( in.sparse && out.sparse )
-			transposeSparseToSparse( in, out, 0, in.rlen, 0, in.clen, 
+			transposeSparseToSparse(in, out, 0, in.rlen, 0, in.clen, 
 				countNnzPerColumn(in, 0, in.rlen));
 		else if( in.sparse )
-			transposeSparseToDense( in, out, 0, in.rlen, 0, in.clen );
+			transposeSparseToDense(in, out, 0, in.rlen, 0, in.clen);
 		else
-			transposeDenseToSparse( in, out );
+			transposeDenseToSparse(in, out);
 		
 		// System.out.println("r' ("+in.rlen+", "+in.clen+", "+in.sparse+", "+out.sparse+") in "+time.stop()+" ms.");
 		
@@ -190,10 +193,11 @@ public class LibMatrixReorg {
 	
 	public static MatrixBlock transpose(MatrixBlock in, MatrixBlock out, int k, boolean allowCSR) {
 		// redirect small or special cases to sequential execution
-		if(in.isEmptyBlock(false) || ((long) in.rlen * (long) in.clen < PAR_NUMCELL_THRESHOLD) || k == 1 ||
-			(SHALLOW_COPY_REORG && !in.sparse && !out.sparse && (in.rlen == 1 || in.clen == 1)) ||
-			(in.sparse && !out.sparse && in.rlen == 1) || (!in.sparse && out.sparse && in.rlen == 1) ||
-			(!in.sparse && out.sparse)) {
+		if(in.isEmptyBlock(false) || ((long) in.rlen * (long) in.clen < PAR_NUMCELL_THRESHOLD) || k == 1
+			|| (SHALLOW_COPY_REORG && !in.sparse && !out.sparse && (in.rlen == 1 || in.clen == 1))
+			|| (in.sparse && !out.sparse && in.rlen == 1) || (!in.sparse && out.sparse && in.rlen == 1)
+			|| (in.sparse && out.sparse && in.nonZeros < Math.max(in.rlen, in.clen)) //ultra-sparse
+			|| (!in.sparse && out.sparse) ) {
 			return transpose(in, out);
 		}
 		// set meta data and allocate output arrays (if required)
@@ -901,6 +905,18 @@ public class LibMatrixReorg {
 		}
 	}
 
+	private static void transposeUltraSparse(MatrixBlock in, MatrixBlock out) {
+		//note: applied if nnz < max(rlen, clen) - so no cache blocking
+		// but basic, naive transposition in a single-threaded context
+		Iterator<IJV> iter = in.getSparseBlockIterator();
+		SparseBlock b = out.getSparseBlock();
+		while( iter.hasNext() ) {
+			IJV cell = iter.next();
+			b.append(cell.getJ(), cell.getI(), cell.getV());
+		}
+		out.setNonZeros(in.getNonZeros());
+	}
+	
 	private static void transposeSparseToSparse(MatrixBlock in, MatrixBlock out, int rl, int ru, int cl, int cu, int[] cnt)
 	{
 		//NOTE: called only in sequential or column-wise parallel execution