You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@systemml.apache.org by mb...@apache.org on 2018/08/06 04:37:05 UTC
systemml git commit: [SYSTEMML-2486] Performance bitset sparsity
estimator (multi-threading)
Repository: systemml
Updated Branches:
refs/heads/master 78e9d836e -> 11f0291d7
[SYSTEMML-2486] Performance bitset sparsity estimator (multi-threading)
This patch extends the bitset sparsity estimator (based on boolean
matrix multiply) by optional multi-threaded construction and matrix
multiplication. Similar to floating point matrix multiplication C=AB, we
parallelize over row partitions in A and C with 4*k partitions to
mitigate load imbalance.
In a single node setup of an Intel Xeon Gold 6138 (40 pcores, 80
vcores), this patch improved the end-to-end estimation time (2x build,
1x matrixmult) as follows:
a) 10K x 10K, sp=0.01 (2x): 302ms -> 12.4ms.
b) 50K x 50K, sp=0.01 (2x): 30,531ms -> 1,372ms (baseline mm: 6,233ms).
With multi-threaded matrix mult, this kernel now became memory-bandwidth
bound with 79% LLC load misses. So far, this implementation is based on
Java's BitSet per row. For the sake of a cache-conscious implementation
(that requires range ORs), we should create our own tailor-made bit
matrix representation and operations.
Project: http://git-wip-us.apache.org/repos/asf/systemml/repo
Commit: http://git-wip-us.apache.org/repos/asf/systemml/commit/11f0291d
Tree: http://git-wip-us.apache.org/repos/asf/systemml/tree/11f0291d
Diff: http://git-wip-us.apache.org/repos/asf/systemml/diff/11f0291d
Branch: refs/heads/master
Commit: 11f0291d7537c7639241789b084385196024e45e
Parents: 78e9d83
Author: Matthias Boehm <mb...@gmail.com>
Authored: Sun Aug 5 21:38:24 2018 -0700
Committer: Matthias Boehm <mb...@gmail.com>
Committed: Sun Aug 5 21:38:47 2018 -0700
----------------------------------------------------------------------
.../sysml/hops/estim/EstimatorBitsetMM.java | 75 ++++++++++++++------
.../sysml/hops/estim/SparsityEstimator.java | 5 ++
2 files changed, 60 insertions(+), 20 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/systemml/blob/11f0291d/src/main/java/org/apache/sysml/hops/estim/EstimatorBitsetMM.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/hops/estim/EstimatorBitsetMM.java b/src/main/java/org/apache/sysml/hops/estim/EstimatorBitsetMM.java
index 011975e..9bde8c8 100644
--- a/src/main/java/org/apache/sysml/hops/estim/EstimatorBitsetMM.java
+++ b/src/main/java/org/apache/sysml/hops/estim/EstimatorBitsetMM.java
@@ -20,9 +20,11 @@
package org.apache.sysml.hops.estim;
import java.util.BitSet;
+import java.util.stream.IntStream;
import org.apache.commons.lang.NotImplementedException;
import org.apache.sysml.hops.OptimizerUtils;
+import org.apache.sysml.runtime.controlprogram.parfor.stat.InfrastructureAnalyzer;
import org.apache.sysml.runtime.matrix.data.DenseBlock;
import org.apache.sysml.runtime.matrix.data.MatrixBlock;
import org.apache.sysml.runtime.matrix.data.SparseBlock;
@@ -87,8 +89,6 @@ public class EstimatorBitsetMM extends SparsityEstimator {
_rlen = rlen;
_clen = clen;
_data = new BitSet[_rlen];
- for (int i = 0; i < _rlen; i++)
- _data[i] = new BitSet(_clen);
_nonZeros = 0;
}
@@ -112,12 +112,45 @@ public class EstimatorBitsetMM extends SparsityEstimator {
private void init(MatrixBlock in) {
if (in.isEmptyBlock(false))
return;
+ if( MULTI_THREADED_BUILD && in.getNonZeros() > MIN_PAR_THRESHOLD ) {
+ int k = 8 * InfrastructureAnalyzer.getLocalParallelism();
+ int blklen = (int)Math.ceil((double)_rlen/k);
+ IntStream.range(0, k).parallel().forEach(i ->
+ buildIntern(this, in, i*blklen, Math.min((i+1)*blklen, _rlen)));
+ }
+ else {
+ //single-threaded bitset construction
+ buildIntern(this, in, 0, in.getNumRows());
+ }
+ _nonZeros = in.getNonZeros();
+ }
+
+ public BitsetMatrix matMult(BitsetMatrix m2) {
+ BitsetMatrix out = new BitsetMatrix(_rlen, m2._clen);
+ if( this.getNonZeros() == 0 || m2.getNonZeros() == 0 )
+ return out;
+ long size = (long)_rlen*_clen+(long)m2._rlen*m2._clen;
+ if( MULTI_THREADED_ESTIM && size > MIN_PAR_THRESHOLD ) {
+ int k = 8 * InfrastructureAnalyzer.getLocalParallelism();
+ int blklen = (int)Math.ceil((double)_rlen/k);
+ out._nonZeros = IntStream.range(0, k).parallel().mapToLong(i ->
+ matMultIntern(this, m2, out, i*blklen, Math.min((i+1)*blklen, _rlen))).sum();
+ }
+ else {
+ //single-threaded boolean matrix mult
+ out._nonZeros = matMultIntern(this, m2, out, 0, _rlen);
+ }
+ return out;
+ }
+
+ private static void buildIntern(BitsetMatrix bitset, MatrixBlock in, int rl, int ru) {
+ final int clen = in.getNumColumns();
if (in.isInSparseFormat()) {
SparseBlock sblock = in.getSparseBlock();
- for (int i = 0; i < in.getNumRows(); i++) {
+ for (int i = rl; i < ru; i++) {
if (sblock.isEmpty(i))
continue;
- BitSet lbs = _data[i];
+ BitSet lbs = bitset._data[i] = new BitSet(clen);
int alen = sblock.size(i);
int apos = sblock.pos(i);
int[] aix = sblock.indexes(i);
@@ -127,7 +160,7 @@ public class EstimatorBitsetMM extends SparsityEstimator {
} else {
DenseBlock dblock = in.getDenseBlock();
for (int i = 0; i < in.getNumRows(); i++) {
- BitSet lbs = _data[i];
+ BitSet lbs = bitset._data[i] = new BitSet(clen);
double[] avals = dblock.values(i);
int aix = dblock.pos(i);
for (int j = 0; j < in.getNumColumns(); j++)
@@ -135,25 +168,27 @@ public class EstimatorBitsetMM extends SparsityEstimator {
lbs.set(j);
}
}
- _nonZeros = in.getNonZeros();
}
-
- public BitsetMatrix matMult(BitsetMatrix m2) {
- final int m = this._rlen;
- final int cd = this._clen;
- final int n = m2._clen;
+
+ private static long matMultIntern(BitsetMatrix bsa, BitsetMatrix bsb, BitsetMatrix bsc, int rl, int ru) {
+ final int cd = bsa._clen;
+ final int n = bsb._clen;
// matrix multiply with IKJ schedule and pure OR ops in inner loop
- BitsetMatrix out = new BitsetMatrix(m, n);
- for (int i = 0; i < m; i++) {
- BitSet a = this._data[i], c = out._data[i];
- for (int k = 0; k < cd; k++) {
- if (a.get(k))
- c.or(m2._data[k]);
+ long lnnz = 0;
+ for (int i = rl; i < ru; i++) {
+ BitSet a = bsa._data[i];
+ if( a != null ) {
+ BitSet c = bsc._data[i] = new BitSet(n);
+ for (int k = 0; k < cd; k++) {
+ BitSet b = bsb._data[k];
+ if (a.get(k) && b != null)
+ c.or(b);
+ }
+ // maintain nnz
+ lnnz += c.cardinality();
}
- // maintain nnz
- out._nonZeros += c.cardinality();
}
- return out;
+ return lnnz;
}
}
}
http://git-wip-us.apache.org/repos/asf/systemml/blob/11f0291d/src/main/java/org/apache/sysml/hops/estim/SparsityEstimator.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/hops/estim/SparsityEstimator.java b/src/main/java/org/apache/sysml/hops/estim/SparsityEstimator.java
index 61a7207..411d9cd 100644
--- a/src/main/java/org/apache/sysml/hops/estim/SparsityEstimator.java
+++ b/src/main/java/org/apache/sysml/hops/estim/SparsityEstimator.java
@@ -27,6 +27,11 @@ public abstract class SparsityEstimator
{
protected static final Log LOG = LogFactory.getLog(SparsityEstimator.class.getName());
+ //internal configuration
+ public static boolean MULTI_THREADED_BUILD = false;
+ public static boolean MULTI_THREADED_ESTIM = false;
+ public static final int MIN_PAR_THRESHOLD = 10 * 1024;
+
public static enum OpCode {
MM,
MULT, PLUS, EQZERO, NEQZERO,