You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@systemml.apache.org by mb...@apache.org on 2018/08/06 22:14:56 UTC
systemml git commit: [SYSTEMML-2486] Performance bitset sparsity
estimator (cache-conscious)
Repository: systemml
Updated Branches:
refs/heads/master a11933002 -> e3a51f72a
[SYSTEMML-2486] Performance bitset sparsity estimator (cache-conscious)
This patch reworks the bitset sparsity estimator into two alternative
implementations via Java's BitSet per row or a linearized long array.
The latter was a precondition for a cache-conscious, i.e., blocked,
implementation.
With this patch the measured performance improvements for estimating
C=AB were as follows:
50K x 50K, sp=0.01 (2x): 1,372ms -> 796ms
100K x 100K, sp=0.01 (2x): 11,723ms -> 5,837ms
100K x 100K, sp=0.1 (2x): 71,697ms -> 35,855ms
Project: http://git-wip-us.apache.org/repos/asf/systemml/repo
Commit: http://git-wip-us.apache.org/repos/asf/systemml/commit/e3a51f72
Tree: http://git-wip-us.apache.org/repos/asf/systemml/tree/e3a51f72
Diff: http://git-wip-us.apache.org/repos/asf/systemml/diff/e3a51f72
Branch: refs/heads/master
Commit: e3a51f72a88a6604e404e80b6ec2ad1fb7e9c689
Parents: a119330
Author: Matthias Boehm <mb...@gmail.com>
Authored: Sun Aug 5 22:59:14 2018 -0700
Committer: Matthias Boehm <mb...@gmail.com>
Committed: Mon Aug 6 15:16:23 2018 -0700
----------------------------------------------------------------------
.../sysml/hops/estim/EstimatorBitsetMM.java | 232 ++++++++++++++++---
1 file changed, 197 insertions(+), 35 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/systemml/blob/e3a51f72/src/main/java/org/apache/sysml/hops/estim/EstimatorBitsetMM.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/hops/estim/EstimatorBitsetMM.java b/src/main/java/org/apache/sysml/hops/estim/EstimatorBitsetMM.java
index 9bde8c8..38de751 100644
--- a/src/main/java/org/apache/sysml/hops/estim/EstimatorBitsetMM.java
+++ b/src/main/java/org/apache/sysml/hops/estim/EstimatorBitsetMM.java
@@ -48,10 +48,10 @@ public class EstimatorBitsetMM extends SparsityEstimator {
estim(root.getLeft()); // obtain synopsis
if (!root.getRight().isLeaf())
estim(root.getLeft()); // obtain synopsis
- BitsetMatrix m1Map = !root.getLeft().isLeaf() ? (BitsetMatrix) root.getLeft().getSynopsis()
- : new BitsetMatrix(root.getLeft().getData());
- BitsetMatrix m2Map = !root.getRight().isLeaf() ? (BitsetMatrix) root.getRight().getSynopsis()
- : new BitsetMatrix(root.getRight().getData());
+ BitsetMatrix m1Map = !root.getLeft().isLeaf() ? (BitsetMatrix) root.getLeft().getSynopsis() :
+ new BitsetMatrix1(root.getLeft().getData());
+ BitsetMatrix m2Map = !root.getRight().isLeaf() ? (BitsetMatrix) root.getRight().getSynopsis() :
+ new BitsetMatrix1(root.getRight().getData());
// estimate output density map and sparsity via boolean matrix mult
BitsetMatrix outMap = m1Map.matMult(m2Map);
@@ -61,9 +61,9 @@ public class EstimatorBitsetMM extends SparsityEstimator {
@Override
public double estim(MatrixBlock m1, MatrixBlock m2) {
- BitsetMatrix m1Map = new BitsetMatrix(m1);
+ BitsetMatrix m1Map = new BitsetMatrix1(m1);
BitsetMatrix m2Map = (m1 == m2) ? //self product
- m1Map : new BitsetMatrix(m2);
+ m1Map : new BitsetMatrix1(m2);
BitsetMatrix outMap = m1Map.matMult(m2Map);
return OptimizerUtils.getSparsity( // aggregate output histogram
outMap.getNumRows(), outMap.getNumColumns(), outMap.getNonZeros());
@@ -79,24 +79,17 @@ public class EstimatorBitsetMM extends SparsityEstimator {
throw new NotImplementedException();
}
- private static class BitsetMatrix {
- private final int _rlen;
- private final int _clen;
- private long _nonZeros;
- private BitSet[] _data;
-
+ private abstract static class BitsetMatrix {
+ protected final int _rlen;
+ protected final int _clen;
+ protected long _nonZeros;
+
public BitsetMatrix(int rlen, int clen) {
_rlen = rlen;
_clen = clen;
- _data = new BitSet[_rlen];
_nonZeros = 0;
}
-
- public BitsetMatrix(MatrixBlock in) {
- this(in.getNumRows(), in.getNumColumns());
- init(in);
- }
-
+
public int getNumRows() {
return _rlen;
}
@@ -108,49 +101,215 @@ public class EstimatorBitsetMM extends SparsityEstimator {
public long getNonZeros() {
return _nonZeros;
}
-
- private void init(MatrixBlock in) {
+
+ protected void init(MatrixBlock in) {
if (in.isEmptyBlock(false))
return;
if( MULTI_THREADED_BUILD && in.getNonZeros() > MIN_PAR_THRESHOLD ) {
- int k = 8 * InfrastructureAnalyzer.getLocalParallelism();
+ int k = 4 * InfrastructureAnalyzer.getLocalParallelism();
int blklen = (int)Math.ceil((double)_rlen/k);
IntStream.range(0, k).parallel().forEach(i ->
- buildIntern(this, in, i*blklen, Math.min((i+1)*blklen, _rlen)));
+ buildIntern(in, i*blklen, Math.min((i+1)*blklen, _rlen)));
}
else {
//single-threaded bitset construction
- buildIntern(this, in, 0, in.getNumRows());
+ buildIntern(in, 0, in.getNumRows());
}
_nonZeros = in.getNonZeros();
}
public BitsetMatrix matMult(BitsetMatrix m2) {
- BitsetMatrix out = new BitsetMatrix(_rlen, m2._clen);
+ BitsetMatrix out = createBitSetMatrix(_rlen, m2._clen);
if( this.getNonZeros() == 0 || m2.getNonZeros() == 0 )
return out;
long size = (long)_rlen*_clen+(long)m2._rlen*m2._clen;
if( MULTI_THREADED_ESTIM && size > MIN_PAR_THRESHOLD ) {
- int k = 8 * InfrastructureAnalyzer.getLocalParallelism();
+ int k = 4 * InfrastructureAnalyzer.getLocalParallelism();
int blklen = (int)Math.ceil((double)_rlen/k);
out._nonZeros = IntStream.range(0, k).parallel().mapToLong(i ->
- matMultIntern(this, m2, out, i*blklen, Math.min((i+1)*blklen, _rlen))).sum();
+ matMultIntern(m2, out, i*blklen, Math.min((i+1)*blklen, _rlen))).sum();
}
else {
//single-threaded boolean matrix mult
- out._nonZeros = matMultIntern(this, m2, out, 0, _rlen);
+ out._nonZeros = matMultIntern(m2, out, 0, _rlen);
}
return out;
}
- private static void buildIntern(BitsetMatrix bitset, MatrixBlock in, int rl, int ru) {
+ protected abstract BitsetMatrix createBitSetMatrix(int rlen, int clen);
+
+ protected abstract void buildIntern(MatrixBlock in, int rl, int ru);
+
+ protected abstract long matMultIntern(BitsetMatrix bsb, BitsetMatrix bsc, int rl, int ru);
+ }
+
+ /**
+ * This class represents a boolean matrix and provides key operations.
+ * In the interest of a cache-conscious matrix multiplication and reduced
+ * memory overhead, we use a linearized and padded array of longs instead
+ * of Java's BitSet per row (which causes memory overheads per row and does
+ * not allow for range ORs). However, this implies a maximum size of 16GB.
+ *
+ */
+ private static class BitsetMatrix1 extends BitsetMatrix {
+ //linearized and padded data array in row-major order, where each long
+ //represents 64 boolean values, all rows are aligned at 64 for simple access
+ private final int _rowLen;
+ private final long[] _data;
+
+ public BitsetMatrix1(int rlen, int clen) {
+ super(rlen, clen);
+ _rowLen = (int) Math.ceil((double)clen / 64);
+ _data = new long[rlen * _rowLen];
+ }
+
+ public BitsetMatrix1(MatrixBlock in) {
+ this(in.getNumRows(), in.getNumColumns());
+ init(in);
+ }
+
+ @Override
+ protected BitsetMatrix createBitSetMatrix(int rlen, int clen) {
+ return new BitsetMatrix1(rlen, clen);
+ }
+
+ @Override
+ protected void buildIntern(MatrixBlock in, int rl, int ru) {
+ if (in.isInSparseFormat()) {
+ SparseBlock sblock = in.getSparseBlock();
+ for (int i = rl; i < ru; i++) {
+ if (sblock.isEmpty(i))
+ continue;
+ int alen = sblock.size(i);
+ int apos = sblock.pos(i);
+ int[] aix = sblock.indexes(i);
+ for (int k = apos; k < apos + alen; k++)
+ set(i, aix[k]);
+ }
+ } else {
+ DenseBlock dblock = in.getDenseBlock();
+ for (int i = rl; i < ru; i++) {
+ double[] avals = dblock.values(i);
+ int aix = dblock.pos(i);
+ for (int j = 0; j < in.getNumColumns(); j++)
+ if (avals[aix + j] != 0)
+ set(i, j);
+ }
+ }
+ }
+
+ @Override
+ protected long matMultIntern(BitsetMatrix bsb2, BitsetMatrix bsc2, int rl, int ru) {
+ BitsetMatrix1 bsb = (BitsetMatrix1) bsb2;
+ BitsetMatrix1 bsc = (BitsetMatrix1) bsc2;
+ final long[] b = bsb._data;
+ final long[] c = bsc._data;
+ final int cd = _clen;
+ final int n = bsb._clen;
+ final int n64 = bsb._rowLen;
+
+ final int blocksizeI = 32;
+ final int blocksizeK = 24;
+ final int blocksizeJ = 1024 * 64;
+
+ long lnnz = 0;
+
+ //blocked execution (cache-conscious)
+ for( int bi = rl; bi < ru; bi+=blocksizeI ) {
+ int bimin = Math.min(ru, bi+blocksizeI);
+ for( int bk = 0; bk < cd; bk+=blocksizeK ) {
+ int bkmin = Math.min(cd, bk+blocksizeK);
+ for( int bj = 0; bj < n; bj+=blocksizeJ ) {
+ //core sub block matrix multiplication
+ int bjlen64 = (int)Math.ceil((double)(Math.min(n, bj+blocksizeJ)-bj)/64);
+ int bj64 = bj/64;
+ for( int i = bi, off=i*_rowLen; i < bimin; i++, off+=_rowLen) {
+ for( int k = bk; k < bkmin; k++ ) {
+ if( getCol(off, k) ) //implicit and
+ or(b, c, k*n64+bj64, i*n64+bj64, bjlen64);
+ }
+ }
+ }
+ }
+ // maintain nnz for entire output row block
+ lnnz += card(c, bi*n64, (bimin-bi)*n64);
+ }
+
+ return lnnz;
+ }
+
+ private void set(int r, int c) {
+ int off = r * _rowLen;
+ int wordIndex = wordIndex(c); //see BitSet.java
+ _data[off + wordIndex] |= (1L << c); //see BitSet.java
+ }
+
+ @SuppressWarnings("unused")
+ private boolean get(int r, int c) {
+ int off = r * _rowLen;
+ int wordIndex = wordIndex(c); //see Bitset.java
+ return (_data[off + wordIndex] & (1L << c)) != 0; //see BitSet.java
+ }
+
+ private boolean getCol(int off, int c) {
+ int wordIndex = wordIndex(c); //see Bitset.java
+ return (_data[off + wordIndex] & (1L << c)) != 0; //see BitSet.java
+ }
+
+ private static int wordIndex(int bitIndex) {
+ return bitIndex >> 6; //see BitSet.java
+ }
+
+ private static int card(long[] c, int ci, int len) {
+ int sum = 0;
+ for( int i = ci; i < ci+len; i++ )
+ sum += Long.bitCount(c[i]);
+ return sum;
+ }
+
+ private static void or(long[] b, long[] c, int bi, int ci, int len) {
+ final int bn = len%8;
+ //compute rest
+ for( int i = 0; i < bn; i++, bi++, ci++ )
+ c[ci] |= b[bi];
+ //unrolled 8-block (for better instruction-level parallelism)
+ for( int i = bn; i < len; i+=8, bi+=8, ci+=8 ) {
+ c[ci+0] |= b[bi+0]; c[ci+1] |= b[bi+1];
+ c[ci+2] |= b[bi+2]; c[ci+3] |= b[bi+3];
+ c[ci+4] |= b[bi+4]; c[ci+5] |= b[bi+5];
+ c[ci+6] |= b[bi+4]; c[ci+7] |= b[bi+7];
+ }
+ }
+ }
+
+ @SuppressWarnings("unused")
+ private static class BitsetMatrix2 extends BitsetMatrix {
+ private BitSet[] _data;
+
+ public BitsetMatrix2(int rlen, int clen) {
+ super(rlen, clen);
+ _data = new BitSet[_rlen];
+ }
+
+ public BitsetMatrix2(MatrixBlock in) {
+ this(in.getNumRows(), in.getNumColumns());
+ init(in);
+ }
+
+ @Override
+ protected BitsetMatrix createBitSetMatrix(int rlen, int clen) {
+ return new BitsetMatrix2(rlen, clen);
+ }
+
+ @Override
+ protected void buildIntern(MatrixBlock in, int rl, int ru) {
final int clen = in.getNumColumns();
if (in.isInSparseFormat()) {
SparseBlock sblock = in.getSparseBlock();
for (int i = rl; i < ru; i++) {
if (sblock.isEmpty(i))
continue;
- BitSet lbs = bitset._data[i] = new BitSet(clen);
+ BitSet lbs = _data[i] = new BitSet(clen);
int alen = sblock.size(i);
int apos = sblock.pos(i);
int[] aix = sblock.indexes(i);
@@ -159,8 +318,8 @@ public class EstimatorBitsetMM extends SparsityEstimator {
}
} else {
DenseBlock dblock = in.getDenseBlock();
- for (int i = 0; i < in.getNumRows(); i++) {
- BitSet lbs = bitset._data[i] = new BitSet(clen);
+ for (int i = rl; i < ru; i++) {
+ BitSet lbs = _data[i] = new BitSet(clen);
double[] avals = dblock.values(i);
int aix = dblock.pos(i);
for (int j = 0; j < in.getNumColumns(); j++)
@@ -170,13 +329,16 @@ public class EstimatorBitsetMM extends SparsityEstimator {
}
}
- private static long matMultIntern(BitsetMatrix bsa, BitsetMatrix bsb, BitsetMatrix bsc, int rl, int ru) {
- final int cd = bsa._clen;
+ @Override
+ protected long matMultIntern(BitsetMatrix bsb2, BitsetMatrix bsc2, int rl, int ru) {
+ BitsetMatrix2 bsb = (BitsetMatrix2) bsb2;
+ BitsetMatrix2 bsc = (BitsetMatrix2) bsc2;
+ final int cd = _clen;
final int n = bsb._clen;
// matrix multiply with IKJ schedule and pure OR ops in inner loop
long lnnz = 0;
for (int i = rl; i < ru; i++) {
- BitSet a = bsa._data[i];
+ BitSet a = _data[i];
if( a != null ) {
BitSet c = bsc._data[i] = new BitSet(n);
for (int k = 0; k < cd; k++) {