You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@systemml.apache.org by mb...@apache.org on 2017/07/12 05:27:56 UTC
systemml git commit: [SYSTEMML-1761] Performance wsloss w/o weights
(sparsity exploitation)
Repository: systemml
Updated Branches:
refs/heads/master b67f18641 -> 083cc77f8
[SYSTEMML-1761] Performance wsloss w/o weights (sparsity exploitation)
So far the wsloss operator w/o weights for sum((X-U%*%t(V))^2) was not
sparsity-exploiting due to the missing sparse driver (sparse matrix with
sparse-safe operation such as multiply or divide). However, this
expression can be rewritten into a sparsity-exploiting form with
sum((X-U%*%t(V))^2) -> sum(X^2) - sum(2*X*(U%*%t(V))) +
sum((t(U)%*%U)*(t(V)%*%V)).
This patch leverages this rewrite for a much more efficient,
sparsity-exploiting and cache-conscious block-level implementation. The
performance improvements of the entire wsloss operation were as follows:
100K x 100K, sparsity=0.1, rank=100: 92.5s -> 8.5s
100K x 100K, sparsity=0.01, rank=100: 92.2s -> 1.3s
100K x 100K, sparsity=0.001, rank=100: 92.1s -> 0.4s
Project: http://git-wip-us.apache.org/repos/asf/systemml/repo
Commit: http://git-wip-us.apache.org/repos/asf/systemml/commit/083cc77f
Tree: http://git-wip-us.apache.org/repos/asf/systemml/tree/083cc77f
Diff: http://git-wip-us.apache.org/repos/asf/systemml/diff/083cc77f
Branch: refs/heads/master
Commit: 083cc77f82b70748b70f8cc311fde040034bcc7c
Parents: b67f186
Author: Matthias Boehm <mb...@gmail.com>
Authored: Tue Jul 11 21:41:20 2017 -0700
Committer: Matthias Boehm <mb...@gmail.com>
Committed: Tue Jul 11 22:28:52 2017 -0700
----------------------------------------------------------------------
.../runtime/matrix/data/LibMatrixMult.java | 126 +++++++++++++------
1 file changed, 88 insertions(+), 38 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/systemml/blob/083cc77f/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixMult.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixMult.java b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixMult.java
index 5996a51..da3b12b 100644
--- a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixMult.java
+++ b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixMult.java
@@ -390,7 +390,7 @@ public class LibMatrixMult
//check no parallelization benefit (fallback to sequential)
//check too small workload in terms of flops (fallback to sequential too)
- if( ret.rlen == 1
+ if( ret.rlen == 1 || k <= 1
|| leftTranspose && 1L * m1.rlen * m1.clen * m1.clen < PAR_MINFLOP_THRESHOLD
|| !leftTranspose && 1L * m1.clen * m1.rlen * m1.rlen < PAR_MINFLOP_THRESHOLD)
{
@@ -533,6 +533,10 @@ public class LibMatrixMult
else
matrixMultWSLossGeneric(mX, mU, mV, mW, ret, wt, 0, mX.rlen);
+ //add correction for sparse wsloss w/o weight
+ if( mX.sparse && wt==WeightsType.NONE )
+ addMatrixMultWSLossNoWeightCorrection(mU, mV, ret, 1);
+
//System.out.println("MMWSLoss " +wt.toString()+ " ("+mX.isInSparseFormat()+","+mX.getNumRows()+","+mX.getNumColumns()+","+mX.getNonZeros()+")x" +
// "("+mV.isInSparseFormat()+","+mV.getNumRows()+","+mV.getNumColumns()+","+mV.getNonZeros()+") in "+time.stop());
}
@@ -572,6 +576,10 @@ public class LibMatrixMult
throw new DMLRuntimeException(e);
}
+ //add correction for sparse wsloss w/o weight
+ if( mX.sparse && wt==WeightsType.NONE )
+ addMatrixMultWSLossNoWeightCorrection(mU, mV, ret, k);
+
//System.out.println("MMWSLoss "+wt.toString()+" k="+k+" ("+mX.isInSparseFormat()+","+mX.getNumRows()+","+mX.getNumColumns()+","+mX.getNonZeros()+")x" +
// "("+mV.isInSparseFormat()+","+mV.getNumRows()+","+mV.getNumColumns()+","+mV.getNonZeros()+") in "+time.stop());
}
@@ -2163,36 +2171,34 @@ public class LibMatrixMult
// Pattern 3) sum ((X - (U %*% t(V))) ^ 2) (no weighting)
else if( wt==WeightsType.NONE )
{
- // approach: iterate over all cells of X and
- for( int i=rl, uix=rl*cd; i<ru; i++, uix+=cd )
- {
- if( x.isEmpty(i) ) { //empty row
- for( int j=0, vix=0; j<n; j++, vix+=cd) {
- double uvij = dotProduct(u, v, uix, vix, cd);
- wsloss += (-uvij)*(-uvij);
- }
- }
- else { //non-empty row
- int xpos = x.pos(i);
- int xlen = x.size(i);
- int[] xix = x.indexes(i);
- double[] xval = x.values(i);
- int last = -1;
- for( int k=xpos; k<xpos+xlen; k++ ) {
- //process last nnz til current nnz
- for( int k2=last+1; k2<xix[k]; k2++ ){
- double uvij = dotProduct(u, v, uix, k2*cd, cd);
- wsloss += (-uvij)*(-uvij);
+ //approach: use sparsity-exploiting pattern rewrite sum((X-(U%*%t(V)))^2)
+//-> sum(X^2)-sum(2*X*(U%*%t(V)))+sum((t(U)%*%U)*(t(V)%*%V)), where each
+//parallel task computes sum(X^2)-sum(2*X*(U%*%t(V))) and the last term
+ //sum((t(U)%*%U)*(t(V)%*%V)) is computed once via two tsmm operations.
+
+ final int blocksizeIJ = (int) (8L*mX.rlen*mX.clen/mX.nonZeros);
+ int[] curk = new int[blocksizeIJ];
+
+ for( int bi=rl; bi<ru; bi+=blocksizeIJ ) {
+ int bimin = Math.min(ru, bi+blocksizeIJ);
+ //prepare starting indexes for block row
+ Arrays.fill(curk, 0);
+ //blocked execution over column blocks
+ for( int bj=0; bj<n; bj+=blocksizeIJ ) {
+ int bjmin = Math.min(n, bj+blocksizeIJ);
+ for( int i=bi, uix=bi*cd; i<bimin; i++, uix+=cd ) {
+ if( x.isEmpty(i) ) continue;
+ int xpos = x.pos(i);
+ int xlen = x.size(i);
+ int[] xix = x.indexes(i);
+ double[] xval = x.values(i);
+ int k = xpos + curk[i-bi];
+ for( ; k<xpos+xlen && xix[k]<bjmin; k++ ) {
+ double xij = xval[k];
+ double uvij = dotProduct(u, v, uix, xix[k]*cd, cd);
+ wsloss += xij * xij - 2 * xij * uvij;
}
- //process current nnz
- double uvij = dotProduct(u, v, uix, xix[k]*cd, cd);
- wsloss += (xval[k]-uvij)*(xval[k]-uvij);
- last = xix[k];
- }
- //process last nnz til end of row
- for( int k2=last+1; k2<n; k2++ ) {
- double uvij = dotProduct(u, v, uix, k2*cd, cd);
- wsloss += (-uvij)*(-uvij);
+ curk[i-bi] = k - xpos;
}
}
}
@@ -2291,18 +2297,52 @@ public class LibMatrixMult
// Pattern 3) sum ((X - (U %*% t(V))) ^ 2) (no weighting)
else if( wt==WeightsType.NONE )
{
- // approach: iterate over all cells of X and
- for( int i=rl; i<ru; i++ )
- for( int j=0; j<n; j++)
- {
- double xij = mX.quickGetValue(i, j);
- double uvij = dotProductGeneric(mU, mV, i, j, cd);
- wsloss += (xij-uvij)*(xij-uvij);
- }
+ //approach: use sparsity-exploiting pattern rewrite sum((X-(U%*%t(V)))^2)
+ //-> sum(X^2)-sum(2*X*(U%*%t(V)))+sum((t(U)%*%U)*(t(V)%*%V)), where each
+ //parallel task computes sum(X^2)-sum(2*X*(U%*%t(V))) and the last term
+ //sum((t(U)%*%U)*(t(V)%*%V)) is computed once via two tsmm operations.
+
+ if( mW.sparse ) { //SPARSE
+ SparseBlock x = mX.sparseBlock;
+ for( int i=rl; i<ru; i++ ) {
+ if( x.isEmpty(i) ) continue;
+ int xpos = x.pos(i);
+ int xlen = x.size(i);
+ int[] xix = x.indexes(i);
+ double[] xval = x.values(i);
+ for( int k=xpos; k<xpos+xlen; k++ ) {
+ double xij = xval[k];
+ double uvij = dotProductGeneric(mU, mV, i, xix[k], cd);
+ wsloss += xij * xij - 2 * xij * uvij;
+ }
+ }
+ }
+ else { //DENSE
+ double[] x = mX.denseBlock;
+ for( int i=rl, xix=rl*n; i<ru; i++, xix+=n )
+ for( int j=0; j<n; j++)
+ if( x[xix+j] != 0 ) {
+ double xij = x[xix+j];
+ double uvij = dotProductGeneric(mU, mV, i, j, cd);
+ wsloss += xij * xij - 2 * xij * uvij;
+ }
+ }
}
ret.quickSetValue(0, 0, wsloss);
}
+
+ private static void addMatrixMultWSLossNoWeightCorrection(MatrixBlock mU, MatrixBlock mV, MatrixBlock ret, int k)
+ throws DMLRuntimeException
+ {
+ MatrixBlock tmp1 = new MatrixBlock(mU.clen, mU.clen, false);
+ MatrixBlock tmp2 = new MatrixBlock(mU.clen, mU.clen, false);
+ matrixMultTransposeSelf(mU, tmp1, true, k);
+ matrixMultTransposeSelf(mV, tmp2, true, k);
+ ret.quickSetValue(0, 0, ret.quickGetValue(0, 0) +
+ ((tmp1.sparse || tmp2.sparse) ? dotProductGeneric(tmp1, tmp2) :
+ dotProduct(tmp1.denseBlock, tmp2.denseBlock, mU.clen*mU.clen)));
+ }
private static void matrixMultWSigmoidDense(MatrixBlock mW, MatrixBlock mU, MatrixBlock mV, MatrixBlock ret, WSigmoidType wt, int rl, int ru)
throws DMLRuntimeException
@@ -3405,6 +3445,16 @@ public class LibMatrixMult
return val;
}
+ private static double dotProductGeneric(MatrixBlock a, MatrixBlock b)
+ {
+ double val = 0;
+ for( int i=0; i<a.getNumRows(); i++ )
+ for( int j=0; j<a.getNumColumns(); j++ )
+ val += a.quickGetValue(i, j) * b.quickGetValue(i, j);
+
+ return val;
+ }
+
/**
* Used for all version of TSMM where the result is known to be symmetric.
* Hence, we compute only the upper triangular matrix and copy this partial