You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@systemml.apache.org by mb...@apache.org on 2018/08/06 22:14:56 UTC

systemml git commit: [SYSTEMML-2486] Performance bitset sparsity estimator (cache-conscious)

Repository: systemml
Updated Branches:
  refs/heads/master a11933002 -> e3a51f72a


[SYSTEMML-2486] Performance bitset sparsity estimator (cache-conscious)

This patch reworks the bitset sparsity estimator into two alternative
implementations via Java's BitSet per row or a linearized long array.
The latter was a precondition for a cache-conscious, i.e., blocked,
implementation.

With this patch the measured performance improvements for estimating
C=AB were as follows:

50K x 50K, sp=0.01 (2x): 1,372ms -> 796ms
100K x 100K, sp=0.01 (2x): 11,723ms -> 5,837ms
100K x 100K, sp=0.1 (2x): 71,697ms -> 35,855ms


Project: http://git-wip-us.apache.org/repos/asf/systemml/repo
Commit: http://git-wip-us.apache.org/repos/asf/systemml/commit/e3a51f72
Tree: http://git-wip-us.apache.org/repos/asf/systemml/tree/e3a51f72
Diff: http://git-wip-us.apache.org/repos/asf/systemml/diff/e3a51f72

Branch: refs/heads/master
Commit: e3a51f72a88a6604e404e80b6ec2ad1fb7e9c689
Parents: a119330
Author: Matthias Boehm <mb...@gmail.com>
Authored: Sun Aug 5 22:59:14 2018 -0700
Committer: Matthias Boehm <mb...@gmail.com>
Committed: Mon Aug 6 15:16:23 2018 -0700

----------------------------------------------------------------------
 .../sysml/hops/estim/EstimatorBitsetMM.java     | 232 ++++++++++++++++---
 1 file changed, 197 insertions(+), 35 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/systemml/blob/e3a51f72/src/main/java/org/apache/sysml/hops/estim/EstimatorBitsetMM.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/hops/estim/EstimatorBitsetMM.java b/src/main/java/org/apache/sysml/hops/estim/EstimatorBitsetMM.java
index 9bde8c8..38de751 100644
--- a/src/main/java/org/apache/sysml/hops/estim/EstimatorBitsetMM.java
+++ b/src/main/java/org/apache/sysml/hops/estim/EstimatorBitsetMM.java
@@ -48,10 +48,10 @@ public class EstimatorBitsetMM extends SparsityEstimator {
 			estim(root.getLeft()); // obtain synopsis
 		if (!root.getRight().isLeaf())
 			estim(root.getLeft()); // obtain synopsis
-		BitsetMatrix m1Map = !root.getLeft().isLeaf() ? (BitsetMatrix) root.getLeft().getSynopsis()
-				: new BitsetMatrix(root.getLeft().getData());
-		BitsetMatrix m2Map = !root.getRight().isLeaf() ? (BitsetMatrix) root.getRight().getSynopsis()
-				: new BitsetMatrix(root.getRight().getData());
+		BitsetMatrix m1Map = !root.getLeft().isLeaf() ? (BitsetMatrix) root.getLeft().getSynopsis() :
+			new BitsetMatrix1(root.getLeft().getData());
+		BitsetMatrix m2Map = !root.getRight().isLeaf() ? (BitsetMatrix) root.getRight().getSynopsis() :
+			new BitsetMatrix1(root.getRight().getData());
 
 		// estimate output density map and sparsity via boolean matrix mult
 		BitsetMatrix outMap = m1Map.matMult(m2Map);
@@ -61,9 +61,9 @@ public class EstimatorBitsetMM extends SparsityEstimator {
 
 	@Override
 	public double estim(MatrixBlock m1, MatrixBlock m2) {
-		BitsetMatrix m1Map = new BitsetMatrix(m1);
+		BitsetMatrix m1Map = new BitsetMatrix1(m1);
 		BitsetMatrix m2Map = (m1 == m2) ? //self product
-			m1Map : new BitsetMatrix(m2);
+			m1Map : new BitsetMatrix1(m2);
 		BitsetMatrix outMap = m1Map.matMult(m2Map);
 		return OptimizerUtils.getSparsity( // aggregate output histogram
 				outMap.getNumRows(), outMap.getNumColumns(), outMap.getNonZeros());
@@ -79,24 +79,17 @@ public class EstimatorBitsetMM extends SparsityEstimator {
 		throw new NotImplementedException();
 	}
 
-	private static class BitsetMatrix {
-		private final int _rlen;
-		private final int _clen;
-		private long _nonZeros;
-		private BitSet[] _data;
-
+	private abstract static class BitsetMatrix {
+		protected final int _rlen;
+		protected final int _clen;
+		protected long _nonZeros;
+	
 		public BitsetMatrix(int rlen, int clen) {
 			_rlen = rlen;
 			_clen = clen;
-			_data = new BitSet[_rlen];
 			_nonZeros = 0;
 		}
-
-		public BitsetMatrix(MatrixBlock in) {
-			this(in.getNumRows(), in.getNumColumns());
-			init(in);
-		}
-
+		
 		public int getNumRows() {
 			return _rlen;
 		}
@@ -108,49 +101,215 @@ public class EstimatorBitsetMM extends SparsityEstimator {
 		public long getNonZeros() {
 			return _nonZeros;
 		}
-
-		private void init(MatrixBlock in) {
+		
+		protected void init(MatrixBlock in) {
 			if (in.isEmptyBlock(false))
 				return;
 			if( MULTI_THREADED_BUILD && in.getNonZeros() > MIN_PAR_THRESHOLD ) {
-				int k = 8 * InfrastructureAnalyzer.getLocalParallelism();
+				int k = 4 * InfrastructureAnalyzer.getLocalParallelism();
 				int blklen = (int)Math.ceil((double)_rlen/k);
 				IntStream.range(0, k).parallel().forEach(i ->
-					buildIntern(this, in, i*blklen, Math.min((i+1)*blklen, _rlen)));
+					buildIntern(in, i*blklen, Math.min((i+1)*blklen, _rlen)));
 			}
 			else {
 				//single-threaded bitset construction
-				buildIntern(this, in, 0, in.getNumRows());
+				buildIntern(in, 0, in.getNumRows());
 			}
 			_nonZeros = in.getNonZeros();
 		}
 
 		public BitsetMatrix matMult(BitsetMatrix m2) {
-			BitsetMatrix out = new BitsetMatrix(_rlen, m2._clen);
+			BitsetMatrix out = createBitSetMatrix(_rlen, m2._clen);
 			if( this.getNonZeros() == 0 || m2.getNonZeros() == 0 )
 				return out;
 			long size = (long)_rlen*_clen+(long)m2._rlen*m2._clen;
 			if( MULTI_THREADED_ESTIM && size > MIN_PAR_THRESHOLD ) {
-				int k = 8 * InfrastructureAnalyzer.getLocalParallelism();
+				int k = 4 * InfrastructureAnalyzer.getLocalParallelism();
 				int blklen = (int)Math.ceil((double)_rlen/k);
 				out._nonZeros = IntStream.range(0, k).parallel().mapToLong(i ->
-					matMultIntern(this, m2, out, i*blklen, Math.min((i+1)*blklen, _rlen))).sum();
+					matMultIntern(m2, out, i*blklen, Math.min((i+1)*blklen, _rlen))).sum();
 			}
 			else {
 				//single-threaded boolean matrix mult
-				out._nonZeros = matMultIntern(this, m2, out, 0, _rlen);
+				out._nonZeros = matMultIntern(m2, out, 0, _rlen);
 			}
 			return out;
 		}
 		
-		private static void buildIntern(BitsetMatrix bitset, MatrixBlock in, int rl, int ru) {
+		protected abstract BitsetMatrix createBitSetMatrix(int rlen, int clen);
+		
+		protected abstract void buildIntern(MatrixBlock in, int rl, int ru);
+		
+		protected abstract long matMultIntern(BitsetMatrix bsb, BitsetMatrix bsc, int rl, int ru);
+	}
+	
+	/**
+	 * This class represents a boolean matrix and provides key operations.
+	 * In the interest of a cache-conscious matrix multiplication and reduced
+	 * memory overhead, we use a linearized and padded array of longs instead
+	 * of Java's BitSet per row (which causes memory overheads per row and does
+	 * not allow for range ORs). However, this implies a maximum size of 16GB.
+	 * 
+	 */
+	private static class BitsetMatrix1 extends BitsetMatrix {
+		//linearized and padded data array in row-major order, where each long
+		//represents 64 boolean values, all rows are aligned at 64 for simple access
+		private final int _rowLen;
+		private final long[] _data;
+
+		public BitsetMatrix1(int rlen, int clen) {
+			super(rlen, clen);
+			_rowLen = (int) Math.ceil((double)clen / 64);
+			_data = new long[rlen * _rowLen];
+		}
+		
+		public BitsetMatrix1(MatrixBlock in) {
+			this(in.getNumRows(), in.getNumColumns());
+			init(in);
+		}
+		
+		@Override
+		protected BitsetMatrix createBitSetMatrix(int rlen, int clen) {
+			return new BitsetMatrix1(rlen, clen);
+		}
+		
+		@Override
+		protected void buildIntern(MatrixBlock in, int rl, int ru) {
+			if (in.isInSparseFormat()) {
+				SparseBlock sblock = in.getSparseBlock();
+				for (int i = rl; i < ru; i++) {
+					if (sblock.isEmpty(i))
+						continue;
+					int alen = sblock.size(i);
+					int apos = sblock.pos(i);
+					int[] aix = sblock.indexes(i);
+					for (int k = apos; k < apos + alen; k++)
+						set(i, aix[k]);
+				}
+			} else {
+				DenseBlock dblock = in.getDenseBlock();
+				for (int i = rl; i < ru; i++) {
+					double[] avals = dblock.values(i);
+					int aix = dblock.pos(i);
+					for (int j = 0; j < in.getNumColumns(); j++)
+						if (avals[aix + j] != 0)
+							set(i, j);
+				}
+			}
+		}
+		
+		@Override
+		protected long matMultIntern(BitsetMatrix bsb2, BitsetMatrix bsc2, int rl, int ru) {
+			BitsetMatrix1 bsb = (BitsetMatrix1) bsb2;
+			BitsetMatrix1 bsc = (BitsetMatrix1) bsc2;
+			final long[] b = bsb._data;
+			final long[] c = bsc._data;
+			final int cd = _clen;
+			final int n = bsb._clen;
+			final int n64 = bsb._rowLen;
+			
+			final int blocksizeI = 32;
+			final int blocksizeK = 24;
+			final int blocksizeJ = 1024 * 64;
+			
+			long lnnz = 0;
+			
+			//blocked execution (cache-conscious)
+			for( int bi = rl; bi < ru; bi+=blocksizeI ) {
+				int bimin = Math.min(ru, bi+blocksizeI);
+				for( int bk = 0; bk < cd; bk+=blocksizeK ) {
+					int bkmin = Math.min(cd, bk+blocksizeK);
+					for( int bj = 0; bj < n; bj+=blocksizeJ ) {
+						//core sub block matrix multiplication
+						int bjlen64 = (int)Math.ceil((double)(Math.min(n, bj+blocksizeJ)-bj)/64);
+						int bj64 = bj/64;
+						for( int i = bi, off=i*_rowLen; i < bimin; i++, off+=_rowLen) {
+							for( int k = bk; k < bkmin; k++ ) {
+								if( getCol(off, k) ) //implicit and
+									or(b, c, k*n64+bj64, i*n64+bj64, bjlen64);
+							}
+						}
+					}
+				}
+				// maintain nnz for entire output row block
+				lnnz += card(c, bi*n64, (bimin-bi)*n64);
+			}
+			
+			return lnnz;
+		}
+		
+		private void set(int r, int c) {
+			int off = r * _rowLen;
+			int wordIndex = wordIndex(c); //see BitSet.java
+			_data[off + wordIndex] |= (1L << c); //see BitSet.java
+		}
+		
+		@SuppressWarnings("unused")
+		private boolean get(int r, int c) {
+			int off = r * _rowLen;
+			int wordIndex = wordIndex(c); //see Bitset.java
+			return (_data[off + wordIndex] & (1L << c)) != 0; //see BitSet.java
+		}
+		
+		private boolean getCol(int off, int c) {
+			int wordIndex = wordIndex(c); //see Bitset.java
+			return (_data[off + wordIndex] & (1L << c)) != 0; //see BitSet.java
+		}
+		
+		private static int wordIndex(int bitIndex) {
+			return bitIndex >> 6; //see BitSet.java
+		}
+		
+		private static int card(long[] c, int ci, int len) {
+			int sum = 0;
+			for( int i = ci; i < ci+len; i++ )
+				sum += Long.bitCount(c[i]);
+			return sum;
+		}
+		
+		private static void or(long[] b, long[] c, int bi, int ci, int len) {
+			final int bn = len%8;
+			//compute rest
+			for( int i = 0; i < bn; i++, bi++, ci++ )
+				c[ci] |= b[bi];
+			//unrolled 8-block (for better instruction-level parallelism)
+			for( int i = bn; i < len; i+=8, bi+=8, ci+=8 ) {
+				c[ci+0] |= b[bi+0]; c[ci+1] |= b[bi+1];
+				c[ci+2] |= b[bi+2]; c[ci+3] |= b[bi+3];
+				c[ci+4] |= b[bi+4]; c[ci+5] |= b[bi+5];
+				c[ci+6] |= b[bi+4]; c[ci+7] |= b[bi+7];
+			}
+		}
+	}
+	
+	@SuppressWarnings("unused")
+	private static class BitsetMatrix2 extends BitsetMatrix {
+		private BitSet[] _data;
+
+		public BitsetMatrix2(int rlen, int clen) {
+			super(rlen, clen);
+			_data = new BitSet[_rlen];
+		}
+
+		public BitsetMatrix2(MatrixBlock in) {
+			this(in.getNumRows(), in.getNumColumns());
+			init(in);
+		}
+		
+		@Override
+		protected BitsetMatrix createBitSetMatrix(int rlen, int clen) {
+			return new BitsetMatrix2(rlen, clen);
+		}
+		
+		@Override
+		protected void buildIntern(MatrixBlock in, int rl, int ru) {
 			final int clen = in.getNumColumns();
 			if (in.isInSparseFormat()) {
 				SparseBlock sblock = in.getSparseBlock();
 				for (int i = rl; i < ru; i++) {
 					if (sblock.isEmpty(i))
 						continue;
-					BitSet lbs = bitset._data[i] = new BitSet(clen);
+					BitSet lbs = _data[i] = new BitSet(clen);
 					int alen = sblock.size(i);
 					int apos = sblock.pos(i);
 					int[] aix = sblock.indexes(i);
@@ -159,8 +318,8 @@ public class EstimatorBitsetMM extends SparsityEstimator {
 				}
 			} else {
 				DenseBlock dblock = in.getDenseBlock();
-				for (int i = 0; i < in.getNumRows(); i++) {
-					BitSet lbs = bitset._data[i] = new BitSet(clen);
+				for (int i = rl; i < ru; i++) {
+					BitSet lbs = _data[i] = new BitSet(clen);
 					double[] avals = dblock.values(i);
 					int aix = dblock.pos(i);
 					for (int j = 0; j < in.getNumColumns(); j++)
@@ -170,13 +329,16 @@ public class EstimatorBitsetMM extends SparsityEstimator {
 			}
 		}
 		
-		private static long matMultIntern(BitsetMatrix bsa, BitsetMatrix bsb, BitsetMatrix bsc, int rl, int ru) {
-			final int cd = bsa._clen;
+		@Override
+		protected long matMultIntern(BitsetMatrix bsb2, BitsetMatrix bsc2, int rl, int ru) {
+			BitsetMatrix2 bsb = (BitsetMatrix2) bsb2;
+			BitsetMatrix2 bsc = (BitsetMatrix2) bsc2;
+			final int cd = _clen;
 			final int n = bsb._clen;
 			// matrix multiply with IKJ schedule and pure OR ops in inner loop
 			long lnnz = 0;
 			for (int i = rl; i < ru; i++) {
-				BitSet a = bsa._data[i];
+				BitSet a = _data[i];
 				if( a != null ) {
 					BitSet c = bsc._data[i] = new BitSet(n);
 					for (int k = 0; k < cd; k++) {