You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@systemml.apache.org by mb...@apache.org on 2018/08/06 04:37:05 UTC

systemml git commit: [SYSTEMML-2486] Performance bitset sparsity estimator (multi-threading)

Repository: systemml
Updated Branches:
  refs/heads/master 78e9d836e -> 11f0291d7


[SYSTEMML-2486] Performance bitset sparsity estimator (multi-threading)

This patch extends the bitset sparsity estimator (based on boolean
matrix multiply) by optional multi-threaded construction and matrix
multiplication. Similar to floating point matrix multiplication C=AB, we
parallelize over row partitions in A and C with 4*k partitions to
mitigate load imbalance.

In a single node setup of an Intel Xeon Gold 6138 (40 pcores, 80
vcores), this patch improved the end-to-end estimation time (2x build,
1x matrixmult) as follows:

a) 10K x 10K, sp=0.01 (2x): 302ms -> 12.4ms.
b) 50K x 50K, sp=0.01 (2x): 30,531ms -> 1,372ms (baseline mm: 6,233ms).

With multi-threaded matrix mult, this kernel now became memory-bandwidth
bound with 79% LLC load misses. So far, this implementation is based on
Java's BitSet per row. For the sake of a cache-conscious implementation
(that requires range ORs), we should create our own tailor-made bit
matrix representation and operations.


Project: http://git-wip-us.apache.org/repos/asf/systemml/repo
Commit: http://git-wip-us.apache.org/repos/asf/systemml/commit/11f0291d
Tree: http://git-wip-us.apache.org/repos/asf/systemml/tree/11f0291d
Diff: http://git-wip-us.apache.org/repos/asf/systemml/diff/11f0291d

Branch: refs/heads/master
Commit: 11f0291d7537c7639241789b084385196024e45e
Parents: 78e9d83
Author: Matthias Boehm <mb...@gmail.com>
Authored: Sun Aug 5 21:38:24 2018 -0700
Committer: Matthias Boehm <mb...@gmail.com>
Committed: Sun Aug 5 21:38:47 2018 -0700

----------------------------------------------------------------------
 .../sysml/hops/estim/EstimatorBitsetMM.java     | 75 ++++++++++++++------
 .../sysml/hops/estim/SparsityEstimator.java     |  5 ++
 2 files changed, 60 insertions(+), 20 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/systemml/blob/11f0291d/src/main/java/org/apache/sysml/hops/estim/EstimatorBitsetMM.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/hops/estim/EstimatorBitsetMM.java b/src/main/java/org/apache/sysml/hops/estim/EstimatorBitsetMM.java
index 011975e..9bde8c8 100644
--- a/src/main/java/org/apache/sysml/hops/estim/EstimatorBitsetMM.java
+++ b/src/main/java/org/apache/sysml/hops/estim/EstimatorBitsetMM.java
@@ -20,9 +20,11 @@
 package org.apache.sysml.hops.estim;
 
 import java.util.BitSet;
+import java.util.stream.IntStream;
 
 import org.apache.commons.lang.NotImplementedException;
 import org.apache.sysml.hops.OptimizerUtils;
+import org.apache.sysml.runtime.controlprogram.parfor.stat.InfrastructureAnalyzer;
 import org.apache.sysml.runtime.matrix.data.DenseBlock;
 import org.apache.sysml.runtime.matrix.data.MatrixBlock;
 import org.apache.sysml.runtime.matrix.data.SparseBlock;
@@ -87,8 +89,6 @@ public class EstimatorBitsetMM extends SparsityEstimator {
 			_rlen = rlen;
 			_clen = clen;
 			_data = new BitSet[_rlen];
-			for (int i = 0; i < _rlen; i++)
-				_data[i] = new BitSet(_clen);
 			_nonZeros = 0;
 		}
 
@@ -112,12 +112,45 @@ public class EstimatorBitsetMM extends SparsityEstimator {
 		private void init(MatrixBlock in) {
 			if (in.isEmptyBlock(false))
 				return;
+			if( MULTI_THREADED_BUILD && in.getNonZeros() > MIN_PAR_THRESHOLD ) {
+				int k = 8 * InfrastructureAnalyzer.getLocalParallelism();
+				int blklen = (int)Math.ceil((double)_rlen/k);
+				IntStream.range(0, k).parallel().forEach(i ->
+					buildIntern(this, in, i*blklen, Math.min((i+1)*blklen, _rlen)));
+			}
+			else {
+				//single-threaded bitset construction
+				buildIntern(this, in, 0, in.getNumRows());
+			}
+			_nonZeros = in.getNonZeros();
+		}
+
+		public BitsetMatrix matMult(BitsetMatrix m2) {
+			BitsetMatrix out = new BitsetMatrix(_rlen, m2._clen);
+			if( this.getNonZeros() == 0 || m2.getNonZeros() == 0 )
+				return out;
+			long size = (long)_rlen*_clen+(long)m2._rlen*m2._clen;
+			if( MULTI_THREADED_ESTIM && size > MIN_PAR_THRESHOLD ) {
+				int k = 8 * InfrastructureAnalyzer.getLocalParallelism();
+				int blklen = (int)Math.ceil((double)_rlen/k);
+				out._nonZeros = IntStream.range(0, k).parallel().mapToLong(i ->
+					matMultIntern(this, m2, out, i*blklen, Math.min((i+1)*blklen, _rlen))).sum();
+			}
+			else {
+				//single-threaded boolean matrix mult
+				out._nonZeros = matMultIntern(this, m2, out, 0, _rlen);
+			}
+			return out;
+		}
+		
+		private static void buildIntern(BitsetMatrix bitset, MatrixBlock in, int rl, int ru) {
+			final int clen = in.getNumColumns();
 			if (in.isInSparseFormat()) {
 				SparseBlock sblock = in.getSparseBlock();
-				for (int i = 0; i < in.getNumRows(); i++) {
+				for (int i = rl; i < ru; i++) {
 					if (sblock.isEmpty(i))
 						continue;
-					BitSet lbs = _data[i];
+					BitSet lbs = bitset._data[i] = new BitSet(clen);
 					int alen = sblock.size(i);
 					int apos = sblock.pos(i);
 					int[] aix = sblock.indexes(i);
@@ -127,7 +160,7 @@ public class EstimatorBitsetMM extends SparsityEstimator {
 			} else {
 				DenseBlock dblock = in.getDenseBlock();
 				for (int i = 0; i < in.getNumRows(); i++) {
-					BitSet lbs = _data[i];
+					BitSet lbs = bitset._data[i] = new BitSet(clen);
 					double[] avals = dblock.values(i);
 					int aix = dblock.pos(i);
 					for (int j = 0; j < in.getNumColumns(); j++)
@@ -135,25 +168,27 @@ public class EstimatorBitsetMM extends SparsityEstimator {
 							lbs.set(j);
 				}
 			}
-			_nonZeros = in.getNonZeros();
 		}
-
-		public BitsetMatrix matMult(BitsetMatrix m2) {
-			final int m = this._rlen;
-			final int cd = this._clen;
-			final int n = m2._clen;
+		
+		private static long matMultIntern(BitsetMatrix bsa, BitsetMatrix bsb, BitsetMatrix bsc, int rl, int ru) {
+			final int cd = bsa._clen;
+			final int n = bsb._clen;
 			// matrix multiply with IKJ schedule and pure OR ops in inner loop
-			BitsetMatrix out = new BitsetMatrix(m, n);
-			for (int i = 0; i < m; i++) {
-				BitSet a = this._data[i], c = out._data[i];
-				for (int k = 0; k < cd; k++) {
-					if (a.get(k))
-						c.or(m2._data[k]);
+			long lnnz = 0;
+			for (int i = rl; i < ru; i++) {
+				BitSet a = bsa._data[i];
+				if( a != null ) {
+					BitSet c = bsc._data[i] = new BitSet(n);
+					for (int k = 0; k < cd; k++) {
+						BitSet b = bsb._data[k];
+						if (a.get(k) && b != null)
+							c.or(b);
+					}
+					// maintain nnz
+					lnnz += c.cardinality();
 				}
-				// maintain nnz
-				out._nonZeros += c.cardinality();
 			}
-			return out;
+			return lnnz;
 		}
 	}
 }

http://git-wip-us.apache.org/repos/asf/systemml/blob/11f0291d/src/main/java/org/apache/sysml/hops/estim/SparsityEstimator.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/hops/estim/SparsityEstimator.java b/src/main/java/org/apache/sysml/hops/estim/SparsityEstimator.java
index 61a7207..411d9cd 100644
--- a/src/main/java/org/apache/sysml/hops/estim/SparsityEstimator.java
+++ b/src/main/java/org/apache/sysml/hops/estim/SparsityEstimator.java
@@ -27,6 +27,11 @@ public abstract class SparsityEstimator
 {
 	protected static final Log LOG = LogFactory.getLog(SparsityEstimator.class.getName());
 	
+	//internal configuration
+	public static boolean MULTI_THREADED_BUILD = false;
+	public static boolean MULTI_THREADED_ESTIM = false;
+	public static final int MIN_PAR_THRESHOLD = 10 * 1024;
+	
 	public static enum OpCode {
 		MM, 
 		MULT, PLUS, EQZERO, NEQZERO,