You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@systemml.apache.org by mb...@apache.org on 2017/10/15 09:52:53 UTC
[2/3] systemml git commit: [MINOR] Refactoring lib matrixmult/bincell
(instruction footprint)
[MINOR] Refactoring lib matrixmult/bincell (instruction footprint)
This patch makes a minor refactoring of the libraries for matrix
multiplications and binary cell-wise operations in order to reduce the
instruction footprint and simplify JIT compilation. Specifically, the
methods for dense-dense mm, sparse-dense mm, and safe binary operations
have been split into methods for the major individual cases.
On an end-to-end CNN application, this patch reduced the number of
L1-icache misses from 6,055,257,066 to 5,663,272,573 and the number of
iTLB misses from 289,601,812 to 161,268,707.
Project: http://git-wip-us.apache.org/repos/asf/systemml/repo
Commit: http://git-wip-us.apache.org/repos/asf/systemml/commit/a347af3b
Tree: http://git-wip-us.apache.org/repos/asf/systemml/tree/a347af3b
Diff: http://git-wip-us.apache.org/repos/asf/systemml/diff/a347af3b
Branch: refs/heads/master
Commit: a347af3b7b488ac3b296b9b9692f7172d60ac6f5
Parents: 17c5d5a
Author: Matthias Boehm <mb...@gmail.com>
Authored: Sun Oct 15 02:22:55 2017 -0700
Committer: Matthias Boehm <mb...@gmail.com>
Committed: Sun Oct 15 02:22:55 2017 -0700
----------------------------------------------------------------------
.../runtime/matrix/data/LibMatrixBincell.java | 446 +++++++------
.../runtime/matrix/data/LibMatrixMult.java | 665 ++++++++++---------
2 files changed, 580 insertions(+), 531 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/systemml/blob/a347af3b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixBincell.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixBincell.java b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixBincell.java
index 7622137..2878b3b 100644
--- a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixBincell.java
+++ b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixBincell.java
@@ -210,12 +210,9 @@ public class LibMatrixBincell
{
return;
}
-
- int rlen = m1.rlen;
- int clen = m1.clen;
- BinaryAccessType atype = getBinaryAccessType(m1, m2);
- if( atype == BinaryAccessType.MATRIX_COL_VECTOR //MATRIX - VECTOR
+ BinaryAccessType atype = getBinaryAccessType(m1, m2);
+ if( atype == BinaryAccessType.MATRIX_COL_VECTOR //MATRIX - VECTOR
|| atype == BinaryAccessType.MATRIX_ROW_VECTOR)
{
//note: m2 vector and hence always dense
@@ -232,213 +229,24 @@ public class LibMatrixBincell
}
else //MATRIX - MATRIX
{
- if(m1.sparse && m2.sparse)
- {
- if(ret.sparse)
- ret.allocateSparseRowsBlock();
-
- //both sparse blocks existing
- if(m1.sparseBlock!=null && m2.sparseBlock!=null)
- {
- SparseBlock lsblock = m1.sparseBlock;
- SparseBlock rsblock = m2.sparseBlock;
-
- if( ret.sparse && lsblock.isAligned(rsblock) )
- {
- SparseBlock c = ret.sparseBlock;
- for(int r=0; r<rlen; r++)
- if( !lsblock.isEmpty(r) ) {
- int alen = lsblock.size(r);
- int apos = lsblock.pos(r);
- int[] aix = lsblock.indexes(r);
- double[] avals = lsblock.values(r);
- double[] bvals = rsblock.values(r);
- c.allocate(r, alen);
- for( int j=apos; j<apos+alen; j++ ) {
- double tmp = op.fn.execute(avals[j], bvals[j]);
- c.append(r, aix[j], tmp);
- }
- ret.nonZeros += c.size(r);
- }
- }
- else //general case
- {
- for(int r=0; r<rlen; r++)
- {
- if( !lsblock.isEmpty(r) && !rsblock.isEmpty(r) ) {
- mergeForSparseBinary(op, lsblock.values(r), lsblock.indexes(r), lsblock.pos(r), lsblock.size(r),
- rsblock.values(r), rsblock.indexes(r), rsblock.pos(r), rsblock.size(r), r, ret);
- }
- else if( !rsblock.isEmpty(r) ) {
- appendRightForSparseBinary(op, rsblock.values(r), rsblock.indexes(r),
- rsblock.pos(r), rsblock.size(r), 0, r, ret);
- }
- else if( !lsblock.isEmpty(r) ){
- appendLeftForSparseBinary(op, lsblock.values(r), lsblock.indexes(r),
- lsblock.pos(r), lsblock.size(r), 0, r, ret);
- }
- // do nothing if both not existing
- }
- }
- }
- //right sparse block existing
- else if( m2.sparseBlock!=null )
- {
- SparseBlock rsblock = m2.sparseBlock;
-
- for(int r=0; r<Math.min(rlen, rsblock.numRows()); r++)
- if( !rsblock.isEmpty(r) )
- {
- appendRightForSparseBinary(op, rsblock.values(r), rsblock.indexes(r),
- rsblock.pos(r), rsblock.size(r), 0, r, ret);
- }
- }
- //left sparse block existing
- else
- {
- SparseBlock lsblock = m1.sparseBlock;
-
- for(int r=0; r<rlen; r++)
- if( !lsblock.isEmpty(r) )
- {
- appendLeftForSparseBinary(op, lsblock.values(r), lsblock.indexes(r),
- lsblock.pos(r), lsblock.size(r), 0, r, ret);
- }
- }
+ if(m1.sparse && m2.sparse) {
+ safeBinaryMMSparseSparse(m1, m2, ret, op);
}
else if( !ret.sparse && (m1.sparse || m2.sparse) &&
- (op.fn instanceof Plus || op.fn instanceof Minus ||
- op.fn instanceof PlusMultiply || op.fn instanceof MinusMultiply ||
- (op.fn instanceof Multiply && !m2.sparse )))
- {
- //specific case in order to prevent binary search on sparse inputs (see quickget and quickset)
- ret.allocateDenseBlock();
- final int m = ret.rlen;
- final int n = ret.clen;
- double[] c = ret.denseBlock;
-
- //1) process left input: assignment
-
- if( m1.sparse ) //SPARSE left
- {
- if( m1.sparseBlock != null )
- {
- SparseBlock a = m1.sparseBlock;
-
- for( int i=0, ix=0; i<m; i++, ix+=n ) {
- if( !a.isEmpty(i) )
- {
- int apos = a.pos(i);
- int alen = a.size(i);
- int[] aix = a.indexes(i);
- double[] avals = a.values(i);
- for(int k = apos; k < apos+alen; k++)
- c[ix+aix[k]] = avals[k];
- }
- }
- }
- }
- else //DENSE left
- {
- if( !m1.isEmptyBlock(false) )
- System.arraycopy(m1.denseBlock, 0, c, 0, m*n);
- else
- Arrays.fill(ret.denseBlock, 0, m*n, 0);
- }
-
- //2) process right input: op.fn (+,-,*), * only if dense
- long lnnz = 0;
- if( m2.sparse ) //SPARSE right
- {
- if(m2.sparseBlock!=null)
- {
- SparseBlock a = m2.sparseBlock;
-
- for( int i=0, ix=0; i<m; i++, ix+=n ) {
- if( !a.isEmpty(i) ) {
- int apos = a.pos(i);
- int alen = a.size(i);
- int[] aix = a.indexes(i);
- double[] avals = a.values(i);
- for(int k = apos; k < apos+alen; k++)
- c[ix+aix[k]] = op.fn.execute(c[ix+aix[k]], avals[k]);
- }
- //exploit temporal locality of rows
- lnnz += ret.recomputeNonZeros(i, i, 0, clen-1);
- }
- }
- }
- else //DENSE right
- {
- if( !m2.isEmptyBlock(false) ) {
- double[] a = m2.denseBlock;
- for( int i=0; i<m*n; i++ ) {
- c[i] = op.fn.execute(c[i], a[i]);
- lnnz += (c[i]!=0) ? 1 : 0;
- }
- }
- else if(op.fn instanceof Multiply)
- Arrays.fill(ret.denseBlock, 0, m*n, 0);
- }
-
- //3) recompute nnz
- ret.setNonZeros(lnnz);
+ (op.fn instanceof Plus || op.fn instanceof Minus ||
+ op.fn instanceof PlusMultiply || op.fn instanceof MinusMultiply ||
+ (op.fn instanceof Multiply && !m2.sparse ))) {
+ safeBinaryMMSparseDenseDense(m1, m2, ret, op);
}
else if( !ret.sparse && !m1.sparse && !m2.sparse
- && m1.denseBlock!=null && m2.denseBlock!=null )
- {
- ret.allocateDenseBlock();
- final int m = ret.rlen;
- final int n = ret.clen;
- double[] a = m1.denseBlock;
- double[] b = m2.denseBlock;
- double[] c = ret.denseBlock;
- ValueFunction fn = op.fn;
-
- //compute dense-dense binary, maintain nnz on-the-fly
- int lnnz = 0;
- for( int i=0; i<m*n; i++ ) {
- c[i] = fn.execute(a[i], b[i]);
- lnnz += (c[i]!=0)? 1 : 0;
- }
- ret.setNonZeros(lnnz);
+ && m1.denseBlock!=null && m2.denseBlock!=null ) {
+ safeBinaryMMDenseDenseDense(m1, m2, ret, op);
}
- else if( skipEmpty && (m1.sparse || m2.sparse) )
- {
- SparseBlock a = m1.sparse ? m1.sparseBlock : m2.sparseBlock;
- if( a == null )
- return;
-
- //prepare second input and allocate output
- MatrixBlock b = m1.sparse ? m2 : m1;
- ret.allocateBlock();
-
- for( int i=0; i<a.numRows(); i++ ) {
- if( a.isEmpty(i) ) continue;
- int apos = a.pos(i);
- int alen = a.size(i);
- int[] aix = a.indexes(i);
- double[] avals = a.values(i);
- if( ret.sparse && !b.sparse )
- ret.sparseBlock.allocate(i, alen);
- for(int k = apos; k < apos+alen; k++) {
- double in2 = b.quickGetValue(i, aix[k]);
- if( in2==0 ) continue;
- double val = op.fn.execute(avals[k], in2);
- ret.appendValue(i, aix[k], val);
- }
- }
+ else if( skipEmpty && (m1.sparse || m2.sparse) ) {
+ safeBinaryMMSparseDenseSkip(m1, m2, ret, op);
}
- else //generic case
- {
- for(int r=0; r<rlen; r++)
- for(int c=0; c<clen; c++) {
- double in1 = m1.quickGetValue(r, c);
- double in2 = m2.quickGetValue(r, c);
- if( in1==0 && in2==0) continue;
- double val = op.fn.execute(in1, in2);
- ret.appendValue(r, c, val);
- }
+ else { //generic case
+ safeBinaryMMGeneric(m1, m2, ret, op);
}
}
}
@@ -694,7 +502,7 @@ public class LibMatrixBincell
for( int j=0; j<blen; j++ ) {
double v1 = m1.quickGetValue(i, bix[j]);
double v = op.fn.execute( v1, bvals[j] );
- ret.appendValue(i, bix[j], v);
+ ret.appendValue(i, bix[j], v);
}
}
}
@@ -731,19 +539,231 @@ public class LibMatrixBincell
}
else {
for(int r=0; r<rlen; r++) {
- double v1 = m1.quickGetValue(r, 0);
+ double v1 = m1.quickGetValue(r, 0);
for(int c=0; c<clen; c++)
{
double v2 = m2.quickGetValue(0, c);
double v = op.fn.execute( v1, v2 );
- ret.appendValue(r, c, v);
+ ret.appendValue(r, c, v);
}
- }
- }
-
+ }
+ }
+
//no need to recomputeNonZeros since maintained in append value
}
+ private static void safeBinaryMMSparseSparse(MatrixBlock m1, MatrixBlock m2, MatrixBlock ret, BinaryOperator op)
+ throws DMLRuntimeException
+ {
+ final int rlen = m1.rlen;
+ if(ret.sparse)
+ ret.allocateSparseRowsBlock();
+
+ //both sparse blocks existing
+ if(m1.sparseBlock!=null && m2.sparseBlock!=null)
+ {
+ SparseBlock lsblock = m1.sparseBlock;
+ SparseBlock rsblock = m2.sparseBlock;
+
+ if( ret.sparse && lsblock.isAligned(rsblock) )
+ {
+ SparseBlock c = ret.sparseBlock;
+ for(int r=0; r<rlen; r++)
+ if( !lsblock.isEmpty(r) ) {
+ int alen = lsblock.size(r);
+ int apos = lsblock.pos(r);
+ int[] aix = lsblock.indexes(r);
+ double[] avals = lsblock.values(r);
+ double[] bvals = rsblock.values(r);
+ c.allocate(r, alen);
+ for( int j=apos; j<apos+alen; j++ ) {
+ double tmp = op.fn.execute(avals[j], bvals[j]);
+ c.append(r, aix[j], tmp);
+ }
+ ret.nonZeros += c.size(r);
+ }
+ }
+ else //general case
+ {
+ for(int r=0; r<rlen; r++) {
+ if( !lsblock.isEmpty(r) && !rsblock.isEmpty(r) ) {
+ mergeForSparseBinary(op, lsblock.values(r), lsblock.indexes(r), lsblock.pos(r), lsblock.size(r),
+ rsblock.values(r), rsblock.indexes(r), rsblock.pos(r), rsblock.size(r), r, ret);
+ }
+ else if( !rsblock.isEmpty(r) ) {
+ appendRightForSparseBinary(op, rsblock.values(r), rsblock.indexes(r),
+ rsblock.pos(r), rsblock.size(r), 0, r, ret);
+ }
+ else if( !lsblock.isEmpty(r) ){
+ appendLeftForSparseBinary(op, lsblock.values(r), lsblock.indexes(r),
+ lsblock.pos(r), lsblock.size(r), 0, r, ret);
+ }
+ // do nothing if both not existing
+ }
+ }
+ }
+ //right sparse block existing
+ else if( m2.sparseBlock!=null )
+ {
+ SparseBlock rsblock = m2.sparseBlock;
+ for(int r=0; r<Math.min(rlen, rsblock.numRows()); r++) {
+ if( rsblock.isEmpty(r) ) continue;
+ appendRightForSparseBinary(op, rsblock.values(r), rsblock.indexes(r),
+ rsblock.pos(r), rsblock.size(r), 0, r, ret);
+ }
+ }
+ //left sparse block existing
+ else
+ {
+ SparseBlock lsblock = m1.sparseBlock;
+ for(int r=0; r<rlen; r++) {
+ if( lsblock.isEmpty(r) ) continue;
+ appendLeftForSparseBinary(op, lsblock.values(r), lsblock.indexes(r),
+ lsblock.pos(r), lsblock.size(r), 0, r, ret);
+ }
+ }
+ }
+
+ private static void safeBinaryMMSparseDenseDense(MatrixBlock m1, MatrixBlock m2, MatrixBlock ret, BinaryOperator op)
+ throws DMLRuntimeException
+ {
+ //specific case in order to prevent binary search on sparse inputs (see quickget and quickset)
+ ret.allocateDenseBlock();
+ final int m = ret.rlen;
+ final int n = ret.clen;
+ double[] c = ret.denseBlock;
+
+ //1) process left input: assignment
+
+ if( m1.sparse ) //SPARSE left
+ {
+ if( m1.sparseBlock != null )
+ {
+ SparseBlock a = m1.sparseBlock;
+
+ for( int i=0, ix=0; i<m; i++, ix+=n ) {
+ if( !a.isEmpty(i) )
+ {
+ int apos = a.pos(i);
+ int alen = a.size(i);
+ int[] aix = a.indexes(i);
+ double[] avals = a.values(i);
+ for(int k = apos; k < apos+alen; k++)
+ c[ix+aix[k]] = avals[k];
+ }
+ }
+ }
+ }
+ else //DENSE left
+ {
+ if( !m1.isEmptyBlock(false) )
+ System.arraycopy(m1.denseBlock, 0, c, 0, m*n);
+ else
+ Arrays.fill(ret.denseBlock, 0, m*n, 0);
+ }
+
+ //2) process right input: op.fn (+,-,*), * only if dense
+ long lnnz = 0;
+ if( m2.sparse ) //SPARSE right
+ {
+ if(m2.sparseBlock!=null)
+ {
+ SparseBlock a = m2.sparseBlock;
+
+ for( int i=0, ix=0; i<m; i++, ix+=n ) {
+ if( !a.isEmpty(i) ) {
+ int apos = a.pos(i);
+ int alen = a.size(i);
+ int[] aix = a.indexes(i);
+ double[] avals = a.values(i);
+ for(int k = apos; k < apos+alen; k++)
+ c[ix+aix[k]] = op.fn.execute(c[ix+aix[k]], avals[k]);
+ }
+ //exploit temporal locality of rows
+ lnnz += ret.recomputeNonZeros(i, i, 0, n-1);
+ }
+ }
+ }
+ else //DENSE right
+ {
+ if( !m2.isEmptyBlock(false) ) {
+ double[] a = m2.denseBlock;
+ for( int i=0; i<m*n; i++ ) {
+ c[i] = op.fn.execute(c[i], a[i]);
+ lnnz += (c[i]!=0) ? 1 : 0;
+ }
+ }
+ else if(op.fn instanceof Multiply)
+ Arrays.fill(ret.denseBlock, 0, m*n, 0);
+ }
+
+ //3) recompute nnz
+ ret.setNonZeros(lnnz);
+ }
+
+ private static void safeBinaryMMDenseDenseDense(MatrixBlock m1, MatrixBlock m2, MatrixBlock ret, BinaryOperator op)
+ throws DMLRuntimeException
+ {
+ ret.allocateDenseBlock();
+ final int m = ret.rlen;
+ final int n = ret.clen;
+ double[] a = m1.denseBlock;
+ double[] b = m2.denseBlock;
+ double[] c = ret.denseBlock;
+ ValueFunction fn = op.fn;
+
+ //compute dense-dense binary, maintain nnz on-the-fly
+ int lnnz = 0;
+ for( int i=0; i<m*n; i++ ) {
+ c[i] = fn.execute(a[i], b[i]);
+ lnnz += (c[i]!=0)? 1 : 0;
+ }
+ ret.setNonZeros(lnnz);
+ }
+
+ private static void safeBinaryMMSparseDenseSkip(MatrixBlock m1, MatrixBlock m2, MatrixBlock ret, BinaryOperator op)
+ throws DMLRuntimeException
+ {
+ SparseBlock a = m1.sparse ? m1.sparseBlock : m2.sparseBlock;
+ if( a == null )
+ return;
+
+ //prepare second input and allocate output
+ MatrixBlock b = m1.sparse ? m2 : m1;
+ ret.allocateBlock();
+
+ for( int i=0; i<a.numRows(); i++ ) {
+ if( a.isEmpty(i) ) continue;
+ int apos = a.pos(i);
+ int alen = a.size(i);
+ int[] aix = a.indexes(i);
+ double[] avals = a.values(i);
+ if( ret.sparse && !b.sparse )
+ ret.sparseBlock.allocate(i, alen);
+ for(int k = apos; k < apos+alen; k++) {
+ double in2 = b.quickGetValue(i, aix[k]);
+ if( in2==0 ) continue;
+ double val = op.fn.execute(avals[k], in2);
+ ret.appendValue(i, aix[k], val);
+ }
+ }
+ }
+
+ private static void safeBinaryMMGeneric(MatrixBlock m1, MatrixBlock m2, MatrixBlock ret, BinaryOperator op)
+ throws DMLRuntimeException
+ {
+ int rlen = m1.rlen;
+ int clen = m2.clen;
+ for(int r=0; r<rlen; r++)
+ for(int c=0; c<clen; c++) {
+ double in1 = m1.quickGetValue(r, c);
+ double in2 = m2.quickGetValue(r, c);
+ if( in1==0 && in2==0) continue;
+ double val = op.fn.execute(in1, in2);
+ ret.appendValue(r, c, val);
+ }
+ }
+
/**
*
* This will do cell wise operation for <, <=, >, >=, == and != operators.
@@ -1254,6 +1274,4 @@ public class LibMatrixBincell
result.appendValue(resultRow, cols2[j], v);
}
}
-
}
-
http://git-wip-us.apache.org/repos/asf/systemml/blob/a347af3b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixMult.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixMult.java b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixMult.java
index fee73c5..eca26f6 100644
--- a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixMult.java
+++ b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixMult.java
@@ -632,7 +632,7 @@ public class LibMatrixMult
ret.allocateBlock();
try
- {
+ {
ExecutorService pool = Executors.newFixedThreadPool(k);
ArrayList<MatrixMultWSigmoidTask> tasks = new ArrayList<>();
int blklen = (int)(Math.ceil((double)mW.rlen/k));
@@ -927,169 +927,189 @@ public class LibMatrixMult
private static void matrixMultDenseDense(MatrixBlock m1, MatrixBlock m2, MatrixBlock ret, boolean tm2, boolean pm2, int rl, int ru, int cl, int cu)
throws DMLRuntimeException
- {
+ {
double[] a = m1.denseBlock;
double[] b = m2.denseBlock;
double[] c = ret.denseBlock;
final int m = m1.rlen;
final int n = m2.clen;
final int cd = m1.clen;
-
- if( LOW_LEVEL_OPTIMIZATION )
- {
- if( m==1 && n==1 ) //DOT PRODUCT
- {
+
+ if( LOW_LEVEL_OPTIMIZATION ) {
+ if( m==1 && n==1 ) { //DOT PRODUCT
c[0] = dotProduct(a, b, cd);
}
- else if( n>1 && cd == 1 ) //OUTER PRODUCT
- {
+ else if( n>1 && cd == 1 ) { //OUTER PRODUCT
for( int i=rl, cix=rl*n; i < ru; i++, cix+=n) {
if( a[i] == 1 )
System.arraycopy(b, 0, c, cix, n);
- else if( a[i] != 0 )
+ else if( a[i] != 0 )
vectMultiplyWrite(a[i], b, c, 0, cix, n);
else
Arrays.fill(c, cix, cix+n, 0);
}
}
- else if( n==1 && cd == 1 ) //VECTOR-SCALAR
- {
+ else if( n==1 && cd == 1 ) { //VECTOR-SCALAR
vectMultiplyWrite(b[0], a, c, rl, rl, ru-rl);
}
- else if( n==1 && cd<=2*1024 ) //MATRIX-VECTOR (short rhs)
- {
- for( int i=rl, aix=rl*cd; i < ru; i++, aix+=cd)
- c[i] = dotProduct(a, b, aix, 0, cd);
+ else if( n==1 && cd<=2*1024 ) { //MATRIX-VECTOR (short rhs)
+ matrixMultDenseDenseMVShortRHS(a, b, c, cd, rl, ru);
}
- else if( n==1 ) //MATRIX-VECTOR (tall rhs)
- {
- final int blocksizeI = 32;
- final int blocksizeK = 2*1024; //16KB vector blocks (L1)
- for( int bi=rl; bi<ru; bi+=blocksizeI ) {
- int bimin = Math.min(bi+blocksizeI, ru);
- for( int bk=0; bk<cd; bk+=blocksizeK ) {
- int bkmin = Math.min(bk+blocksizeK, cd);
- for( int i=bi, aix=bi*cd+bk; i<bimin; i++, aix+=cd)
- c[i] += dotProduct(a, b, aix, bk, bkmin-bk);
- }
- }
+ else if( n==1 ) { //MATRIX-VECTOR (tall rhs)
+ matrixMultDenseDenseMVTallRHS(a, b, c, cd, rl, ru);
}
- else if( pm2 && m==1 ) //VECTOR-MATRIX
- {
- //parallelization over rows in rhs matrix
- //rest not aligned to blocks of 2 rows
- final int kn = (ru-rl)%2;
- if( kn == 1 && a[rl] != 0 )
- vectMultiplyAdd(a[rl], b, c, rl*n, 0, n);
-
- //compute blocks of 2 rows (2 instead of 4 for small n<64)
- for( int k=rl+kn, bix=(rl+kn)*n; k<ru; k+=2, bix+=2*n ){
- if( a[k] != 0 && a[k+1] != 0 )
- vectMultiplyAdd2(a[k], a[k+1], b, c, bix, bix+n, 0, n);
- else if( a[k] != 0 )
- vectMultiplyAdd(a[k], b, c, bix, 0, n);
- else if( a[k+1] != 0 )
- vectMultiplyAdd(a[k+1], b, c, bix+n, 0, n);
- }
+ else if( pm2 && m==1 ) { //VECTOR-MATRIX
+ matrixMultDenseDenseVM(a, b, c, n, cd, rl, ru);
}
- else if( pm2 && m<=16 ) //MATRIX-MATRIX (short lhs)
- {
- //cache-conscious parallelization over rows in rhs matrix
- final int kn = (ru-rl)%4;
-
- //rest not aligned to blocks of 2 rows
- for( int i=0, aix=0, cix=0; i<m; i++, aix+=cd, cix+=n )
- for( int k=rl, bix=rl*n; k<rl+kn; k++, bix+=n )
- if( a[aix+k] != 0 )
- vectMultiplyAdd(a[aix+k], b, c, bix, cix, n);
-
- final int blocksizeK = 48;
- final int blocksizeJ = 1024;
-
- //blocked execution
- for( int bk = rl+kn; bk < ru; bk+=blocksizeK )
- for( int bj = 0, bkmin = Math.min(ru, bk+blocksizeK); bj < n; bj+=blocksizeJ )
- {
- //compute blocks of 4 rows in rhs w/ IKJ
- int bjlen = Math.min(n, bj+blocksizeJ)-bj;
- for( int i=0, aix=0, cix=bj; i<m; i++, aix+=cd, cix+=n )
- for( int k=bk, bix=bk*n+bj; k<bkmin; k+=4, bix+=4*n ) {
- vectMultiplyAdd4(a[aix+k], a[aix+k+1], a[aix+k+2], a[aix+k+3],
- b, c, bix, bix+n, bix+2*n, bix+3*n, cix, bjlen);
- }
- }
+ else if( pm2 && m<=16 ) { //MATRIX-MATRIX (short lhs)
+ matrixMultDenseDenseMMShortLHS(a, b, c, m, n, cd, rl, ru);
}
- else if( tm2 ) //MATRIX-MATRIX (skinny rhs)
- {
- //note: prepared rhs input via transpose for: m > n && cd > 64 && n < 64
- //however, explicit flag required since dimension change m2
- final int n2 = m2.rlen;
- for( int i=rl, aix=rl*cd, cix=rl*n2; i < ru; i++, aix+=cd, cix+=n2 )
- for( int j=0, bix=0; j<n2; j++, bix+=cd )
- c[cix+j] = dotProduct(a, b, aix, bix, cd);
- }
- else //MATRIX-MATRIX
- {
- //1) Unrolled inner loop (for better instruction-level parallelism)
- //2) Blocked execution (for less cache trashing in parallel exec)
- //3) Asymmetric block sizes (for less misses in inner loop, yet blocks in L1/L2)
-
- final int blocksizeI = 32; //64//256KB c block (typical L2 size per core), 32KB a block
- final int blocksizeK = 24; //64//256KB b block (typical L2 size per core), used while read 512B of a / read/write 4KB of c
- final int blocksizeJ = 1024; //512//4KB (typical main-memory page size), for scan
-
- //temporary arrays (nnz a, b index)
- double[] ta = new double[ blocksizeK ];
- int[] tbi = new int[ blocksizeK ];
-
- //blocked execution
- for( int bi = rl; bi < ru; bi+=blocksizeI )
- for( int bk = 0, bimin = Math.min(ru, bi+blocksizeI); bk < cd; bk+=blocksizeK )
- for( int bj = cl, bkmin = Math.min(cd, bk+blocksizeK); bj < cu; bj+=blocksizeJ )
- {
- int bklen = bkmin-bk;
- int bjlen = Math.min(cu, bj+blocksizeJ)-bj;
-
- //core sub block matrix multiplication
- for( int i = bi; i < bimin; i++)
- {
- int aixi = i * cd + bk; //start index on a
- int cixj = i * n + bj; //scan index on c
-
- //determine nnz of a (for sparsity-aware skipping of rows)
- int knnz = copyNonZeroElements(a, aixi, bk, bj, n, ta, tbi, bklen);
- //if( knnz > 0 ) //for skipping empty rows
-
- //rest not aligned to blocks of 4 rows
- final int bn = knnz % 4;
- switch( bn ){
- case 1: vectMultiplyAdd(ta[0], b, c, tbi[0], cixj, bjlen); break;
- case 2: vectMultiplyAdd2(ta[0],ta[1], b, c, tbi[0], tbi[1], cixj, bjlen); break;
- case 3: vectMultiplyAdd3(ta[0],ta[1],ta[2], b, c, tbi[0], tbi[1],tbi[2], cixj, bjlen); break;
- }
-
- //compute blocks of 4 rows (core inner loop)
- for( int k = bn; k<knnz; k+=4 ){
- vectMultiplyAdd4( ta[k], ta[k+1], ta[k+2], ta[k+3], b, c,
- tbi[k], tbi[k+1], tbi[k+2], tbi[k+3], cixj, bjlen );
- }
- }
- }
+ else if( tm2 ) { //MATRIX-MATRIX (skinny rhs)
+ matrixMultDenseDenseMMSkinnyRHS(a, b, c, m2.rlen, cd, rl, ru);
+ }
+ else { //MATRIX-MATRIX
+ matrixMultDenseDenseMM(a, b, c, n, cd, rl, ru, cl, cu);
}
}
- else
- {
- double val;
+ else {
for( int i = rl, aix=rl*cd, cix=rl*n; i < ru; i++, cix+=n)
- for( int k = 0, bix=0; k < cd; k++, aix++, bix+=n)
- {
- val = a[ aix ];
+ for( int k = 0, bix=0; k < cd; k++, aix++, bix+=n) {
+ double val = a[ aix ];
if( val != 0 )
for( int j = 0; j < n; j++)
c[ cix+j ] += val * b[ bix+j ];
- }
+ }
}
+ }
+
+ private static void matrixMultDenseDenseMVShortRHS(double[] a, double[] b, double[] c, int cd, int rl, int ru)
+ throws DMLRuntimeException
+ {
+ for( int i=rl, aix=rl*cd; i < ru; i++, aix+=cd)
+ c[i] = dotProduct(a, b, aix, 0, cd);
+ }
+
+ private static void matrixMultDenseDenseMVTallRHS(double[] a, double[] b, double[] c, int cd, int rl, int ru)
+ throws DMLRuntimeException
+ {
+ final int blocksizeI = 32;
+ final int blocksizeK = 2*1024; //16KB vector blocks (L1)
+ for( int bi=rl; bi<ru; bi+=blocksizeI ) {
+ int bimin = Math.min(bi+blocksizeI, ru);
+ for( int bk=0; bk<cd; bk+=blocksizeK ) {
+ int bkmin = Math.min(bk+blocksizeK, cd);
+ for( int i=bi, aix=bi*cd+bk; i<bimin; i++, aix+=cd)
+ c[i] += dotProduct(a, b, aix, bk, bkmin-bk);
+ }
+ }
+ }
+
+ private static void matrixMultDenseDenseVM(double[] a, double[] b, double[] c, int n, int cd, int rl, int ru)
+ throws DMLRuntimeException
+ {
+ //parallelization over rows in rhs matrix
+ //rest not aligned to blocks of 2 rows
+ final int kn = (ru-rl)%2;
+ if( kn == 1 && a[rl] != 0 )
+ vectMultiplyAdd(a[rl], b, c, rl*n, 0, n);
+ //compute blocks of 2 rows (2 instead of 4 for small n<64)
+ for( int k=rl+kn, bix=(rl+kn)*n; k<ru; k+=2, bix+=2*n ){
+ if( a[k] != 0 && a[k+1] != 0 )
+ vectMultiplyAdd2(a[k], a[k+1], b, c, bix, bix+n, 0, n);
+ else if( a[k] != 0 )
+ vectMultiplyAdd(a[k], b, c, bix, 0, n);
+ else if( a[k+1] != 0 )
+ vectMultiplyAdd(a[k+1], b, c, bix+n, 0, n);
+ }
+ }
+
+ private static void matrixMultDenseDenseMMShortLHS(double[] a, double[] b, double[] c, int m, int n, int cd, int rl, int ru)
+ throws DMLRuntimeException
+ {
+ //cache-conscious parallelization over rows in rhs matrix
+ final int kn = (ru-rl)%4;
+
+ //rest not aligned to blocks of 2 rows
+ for( int i=0, aix=0, cix=0; i<m; i++, aix+=cd, cix+=n )
+ for( int k=rl, bix=rl*n; k<rl+kn; k++, bix+=n )
+ if( a[aix+k] != 0 )
+ vectMultiplyAdd(a[aix+k], b, c, bix, cix, n);
+
+ final int blocksizeK = 48;
+ final int blocksizeJ = 1024;
+
+ //blocked execution
+ for( int bk = rl+kn; bk < ru; bk+=blocksizeK )
+ for( int bj = 0, bkmin = Math.min(ru, bk+blocksizeK); bj < n; bj+=blocksizeJ ) {
+ //compute blocks of 4 rows in rhs w/ IKJ
+ int bjlen = Math.min(n, bj+blocksizeJ)-bj;
+ for( int i=0, aix=0, cix=bj; i<m; i++, aix+=cd, cix+=n )
+ for( int k=bk, bix=bk*n+bj; k<bkmin; k+=4, bix+=4*n ) {
+ vectMultiplyAdd4(a[aix+k], a[aix+k+1], a[aix+k+2], a[aix+k+3],
+ b, c, bix, bix+n, bix+2*n, bix+3*n, cix, bjlen);
+ }
+ }
+ }
+
+ private static void matrixMultDenseDenseMMSkinnyRHS(double[] a, double[] b, double[] c, int n2, int cd, int rl, int ru)
+ throws DMLRuntimeException
+ {
+ //note: prepared rhs input via transpose for: m > n && cd > 64 && n < 64
+ //however, explicit flag required since dimension change m2
+ for( int i=rl, aix=rl*cd, cix=rl*n2; i < ru; i++, aix+=cd, cix+=n2 )
+ for( int j=0, bix=0; j<n2; j++, bix+=cd )
+ c[cix+j] = dotProduct(a, b, aix, bix, cd);
+ }
+
+ private static void matrixMultDenseDenseMM(double[] a, double[] b, double[] c, int n, int cd, int rl, int ru, int cl, int cu)
+ throws DMLRuntimeException
+ {
+ //1) Unrolled inner loop (for better instruction-level parallelism)
+ //2) Blocked execution (for less cache trashing in parallel exec)
+ //3) Asymmetric block sizes (for less misses in inner loop, yet blocks in L1/L2)
+
+ final int blocksizeI = 32; //64//256KB c block (typical L2 size per core), 32KB a block
+ final int blocksizeK = 24; //64//256KB b block (typical L2 size per core), used while read 512B of a / read/write 4KB of c
+ final int blocksizeJ = 1024; //512//4KB (typical main-memory page size), for scan
+
+ //temporary arrays (nnz a, b index)
+ double[] ta = new double[ blocksizeK ];
+ int[] tbi = new int[ blocksizeK ];
+
+ //blocked execution
+ for( int bi = rl; bi < ru; bi+=blocksizeI )
+ for( int bk = 0, bimin = Math.min(ru, bi+blocksizeI); bk < cd; bk+=blocksizeK )
+ for( int bj = cl, bkmin = Math.min(cd, bk+blocksizeK); bj < cu; bj+=blocksizeJ )
+ {
+ int bklen = bkmin-bk;
+ int bjlen = Math.min(cu, bj+blocksizeJ)-bj;
+
+ //core sub block matrix multiplication
+ for( int i = bi; i < bimin; i++)
+ {
+ int aixi = i * cd + bk; //start index on a
+ int cixj = i * n + bj; //scan index on c
+
+ //determine nnz of a (for sparsity-aware skipping of rows)
+ int knnz = copyNonZeroElements(a, aixi, bk, bj, n, ta, tbi, bklen);
+ //if( knnz > 0 ) //for skipping empty rows
+
+ //rest not aligned to blocks of 4 rows
+ final int bn = knnz % 4;
+ switch( bn ){
+ case 1: vectMultiplyAdd(ta[0], b, c, tbi[0], cixj, bjlen); break;
+ case 2: vectMultiplyAdd2(ta[0],ta[1], b, c, tbi[0], tbi[1], cixj, bjlen); break;
+ case 3: vectMultiplyAdd3(ta[0],ta[1],ta[2], b, c, tbi[0], tbi[1],tbi[2], cixj, bjlen); break;
+ }
+
+ //compute blocks of 4 rows (core inner loop)
+ for( int k = bn; k<knnz; k+=4 ){
+ vectMultiplyAdd4( ta[k], ta[k+1], ta[k+2], ta[k+3], b, c,
+ tbi[k], tbi[k+1], tbi[k+2], tbi[k+3], cixj, bjlen );
+ }
+ }
+ }
}
private static void matrixMultDenseSparse(MatrixBlock m1, MatrixBlock m2, MatrixBlock ret, boolean pm2, int rl, int ru)
@@ -1163,7 +1183,8 @@ public class LibMatrixMult
private static void matrixMultSparseDense(MatrixBlock m1, MatrixBlock m2, MatrixBlock ret, boolean pm2, int rl, int ru)
throws DMLRuntimeException
- {
+ {
+ SparseBlock a = m1.sparseBlock;
double[] b = m2.denseBlock;
double[] c = ret.denseBlock;
final int m = m1.rlen;
@@ -1171,179 +1192,195 @@ public class LibMatrixMult
final int cd = m2.rlen;
final long xsp = (long)m*cd/m1.nonZeros;
- if( LOW_LEVEL_OPTIMIZATION )
- {
- SparseBlock a = m1.sparseBlock;
-
- if( m==1 && n==1 ) //DOT PRODUCT
- {
- if( !a.isEmpty(0) ) {
+ if( LOW_LEVEL_OPTIMIZATION ) {
+ if( m==1 && n==1 ) { //DOT PRODUCT
+ if( !a.isEmpty(0) )
c[0] = dotProduct(a.values(0), b, a.indexes(0), a.pos(0), 0, a.size(0));
- }
}
- else if( n==1 && cd<=2*1024 ) //MATRIX-VECTOR (short rhs)
- {
- for( int i=rl; i<ru; i++ )
- if( !a.isEmpty(i) )
- c[i] = dotProduct(a.values(i), b, a.indexes(i), a.pos(i), 0, a.size(i));
+ else if( n==1 && cd<=2*1024 ) { //MATRIX-VECTOR (short rhs)
+ matrixMultSparseDenseMVShortRHS(a, b, c, rl, ru);
}
- else if( n==1 ) //MATRIX-VECTOR (tall rhs)
- {
- final int blocksizeI = 32;
- final int blocksizeK = (int)Math.max(2*1024,2*1024*xsp/32); //~ 16KB L1
- int[] curk = new int[blocksizeI];
-
- for( int bi = rl; bi < ru; bi+=blocksizeI ) {
- Arrays.fill(curk, 0); //reset positions
- for( int bk=0, bimin = Math.min(ru, bi+blocksizeI); bk<cd; bk+=blocksizeK ) {
- for( int i=bi, bkmin = Math.min(bk+blocksizeK, cd); i<bimin; i++) {
- if( a.isEmpty(i) ) continue;
- int apos = a.pos(i);
- int alen = a.size(i);
- int[] aix = a.indexes(i);
- double[] avals = a.values(i);
- int k = curk[i-bi] + apos;
- for( ; k<apos+alen && aix[k]<bkmin; k++ )
- c[i] += avals[k] * b[aix[k]];
- curk[i-bi] = k - apos;
- }
- }
- }
+ else if( n==1 ) { //MATRIX-VECTOR (tall rhs)
+ matrixMultSparseDenseMVTallRHS(a, b, c, cd, xsp, rl, ru);
}
- else if( pm2 && m==1 ) //VECTOR-MATRIX
- {
- //parallelization over rows in rhs matrix
- if( !a.isEmpty(0) )
- {
- int alen = a.size(0);
- int[] aix = a.indexes(0);
- double[] avals = a.values(0);
- int rlix = (rl==0) ? 0 : a.posFIndexGTE(0,rl);
- rlix = (rlix>=0) ? rlix : alen;
-
- for( int k=rlix; k<alen && aix[k]<ru; k++ ) {
- if( k+1<alen && aix[k+1]<ru )
- vectMultiplyAdd2(avals[k], avals[k+1], b, c, aix[k]*n, aix[++k]*n, 0, n);
- else
- vectMultiplyAdd(avals[k], b, c, aix[k]*n, 0, n);
- }
- }
+ else if( pm2 && m==1 ) { //VECTOR-MATRIX
+ matrixMultSparseDenseVM(a, b, c, n, rl, ru);
}
- else if( pm2 && m<=16 ) //MATRIX-MATRIX (short lhs)
- {
- int arlen = a.numRows();
- for( int i=0, cix=0; i<arlen; i++, cix+=n )
- if( !a.isEmpty(i) )
- {
- int apos = a.pos(i);
- int alen = a.size(i);
- int[] aix = a.indexes(i);
- double[] avals = a.values(i);
-
- int k1 = (rl==0) ? 0 : a.posFIndexGTE(i, rl);
- k1 = (k1>=0) ? apos+k1 : apos+alen;
- int k2 = (ru==cd) ? alen : a.posFIndexGTE(i, ru);
- k2 = (k2>=0) ? apos+k2 : apos+alen;
-
- //rest not aligned to blocks of 4 rows
- final int bn = (k2-k1) % 4;
- switch( bn ){
- case 1: vectMultiplyAdd(avals[k1], b, c, aix[k1]*n, cix, n); break;
- case 2: vectMultiplyAdd2(avals[k1],avals[k1+1], b, c, aix[k1]*n, aix[k1+1]*n, cix, n); break;
- case 3: vectMultiplyAdd3(avals[k1],avals[k1+1],avals[k1+2], b, c, aix[k1]*n, aix[k1+1]*n, aix[k1+2]*n, cix, n); break;
- }
-
- //compute blocks of 4 rows (core inner loop)
- for( int k = k1+bn; k<k2; k+=4 ) {
- vectMultiplyAdd4( avals[k], avals[k+1], avals[k+2], avals[k+3], b, c,
- aix[k]*n, aix[k+1]*n, aix[k+2]*n, aix[k+3]*n, cix, n );
- }
- }
+ else if( pm2 && m<=16 ) { //MATRIX-MATRIX (short lhs)
+ matrixMultSparseDenseMMShortLHS(a, b, c, n, cd, rl, ru);
}
- else if( n<=64 ) //MATRIX-MATRIX (skinny rhs)
- {
- //no blocking since b and c fit into cache anyway
- for( int i=rl, cix=rl*n; i<ru; i++, cix+=n ) {
- if( a.isEmpty(i) )
- continue;
- int apos = a.pos(i);
- int alen = a.size(i);
- int[] aix = a.indexes(i);
- double[] avals = a.values(i);
- //rest not aligned to blocks of 4 rows
- int bn = alen%4;
- for( int k=apos; k<apos+bn; k++ )
- vectMultiplyAdd(avals[k], b, c, aix[k]*n, cix, n);
- //compute blocks of 4 rows (core inner loop)
- for( int k=apos+bn; k<apos+alen; k+=4 )
- vectMultiplyAdd4( avals[k], avals[k+1], avals[k+2], avals[k+3], b, c,
- aix[k]*n, aix[k+1]*n, aix[k+2]*n, aix[k+3]*n, cix, n );
- }
+ else if( n<=64 ) { //MATRIX-MATRIX (skinny rhs)
+ matrixMultSparseDenseMMSkinnyRHS(a, b, c, n, rl, ru);
}
- else //MATRIX-MATRIX
- {
- //blocksizes to fit blocks of B (dense) and several rows of A/C in common L2 cache size,
- //while blocking A/C for L1/L2 yet allowing long scans (2 pages) in the inner loop over j
- //in case of almost ultra-sparse matrices, we cannot ensure the blocking for the rhs and
- //output - however, in this case it's unlikely that we consume every cache line in the rhs
-
- final int blocksizeI = (int) (8L*m*cd/m1.nonZeros);
- final int blocksizeK = (int) (8L*m*cd/m1.nonZeros);
- final int blocksizeJ = 1024;
-
- //temporary array of current sparse positions
- int[] curk = new int[blocksizeI];
-
- //blocked execution over IKJ
- for( int bi = rl; bi < ru; bi+=blocksizeI ) {
- Arrays.fill(curk, 0); //reset positions
- for( int bk = 0, bimin = Math.min(ru, bi+blocksizeI); bk < cd; bk+=blocksizeK ) {
- for( int bj = 0, bkmin = Math.min(cd, bk+blocksizeK); bj < n; bj+=blocksizeJ ) {
- int bjlen = Math.min(n, bj+blocksizeJ)-bj;
-
- //core sub block matrix multiplication
- for( int i=bi, cix=bi*n+bj; i<bimin; i++, cix+=n ) {
- if( !a.isEmpty(i) ) {
- int apos = a.pos(i);
- int alen = a.size(i);
- int[] aix = a.indexes(i);
- double[] avals = a.values(i);
-
- int k = curk[i-bi] + apos;
- //rest not aligned to blocks of 4 rows
- int bn = alen%4;
- for( ; k<apos+bn && aix[k]<bkmin; k++ )
- vectMultiplyAdd(avals[k], b, c, aix[k]*n+bj, cix, bjlen);
- //compute blocks of 4 rows (core inner loop), allowed to exceed bkmin
- for( ; k<apos+alen && aix[k]<bkmin; k+=4 )
- vectMultiplyAdd4( avals[k], avals[k+1], avals[k+2], avals[k+3], b, c,
- aix[k]*n+bj, aix[k+1]*n+bj, aix[k+2]*n+bj, aix[k+3]*n+bj, cix, bjlen );
- //update positions on last bj block
- if( bj+bjlen==n )
- curk[i-bi] = k - apos;
- }
- }
- }
- }
+ else { //MATRIX-MATRIX
+ matrixMultSparseDenseMM(a, b, c, n, cd, xsp, rl, ru);
+ }
+ }
+ else {
+ for( int i=rl, cix=rl*n; i<ru; i++, cix+=n ) {
+ if( a.isEmpty(i) ) continue;
+ int apos = a.pos(i);
+ int alen = a.size(i);
+ int[] aix = a.indexes(i);
+ double[] avals = a.values(i);
+ for(int k = apos; k < apos+alen; k++) {
+ double val = avals[k];
+ for(int j = 0, bix=aix[k]*n; j < n; j++)
+ c[cix+j] += val * b[bix+j];
}
}
}
- else
- {
- SparseBlock a = m1.sparseBlock;
- for( int i=rl, cix=rl*n; i<ru; i++, cix+=n )
- {
- if( !a.isEmpty(i) )
- {
+ }
+
+ private static void matrixMultSparseDenseMVShortRHS(SparseBlock a, double[] b, double[] c, int rl, int ru)
+ throws DMLRuntimeException
+ {
+ for( int i=rl; i<ru; i++ )
+ if( !a.isEmpty(i) )
+ c[i] = dotProduct(a.values(i), b, a.indexes(i), a.pos(i), 0, a.size(i));
+ }
+
+ private static void matrixMultSparseDenseMVTallRHS(SparseBlock a, double[] b, double[] c, int cd, long xsp, int rl, int ru)
+ throws DMLRuntimeException
+ {
+ final int blocksizeI = 32;
+ final int blocksizeK = (int)Math.max(2*1024,2*1024*xsp/32); //~ 16KB L1
+ int[] curk = new int[blocksizeI];
+
+ for( int bi = rl; bi < ru; bi+=blocksizeI ) {
+ Arrays.fill(curk, 0); //reset positions
+ for( int bk=0, bimin = Math.min(ru, bi+blocksizeI); bk<cd; bk+=blocksizeK ) {
+ for( int i=bi, bkmin = Math.min(bk+blocksizeK, cd); i<bimin; i++) {
+ if( a.isEmpty(i) ) continue;
int apos = a.pos(i);
int alen = a.size(i);
int[] aix = a.indexes(i);
double[] avals = a.values(i);
+ int k = curk[i-bi] + apos;
+ for( ; k<apos+alen && aix[k]<bkmin; k++ )
+ c[i] += avals[k] * b[aix[k]];
+ curk[i-bi] = k - apos;
+ }
+ }
+ }
+ }
+
+ private static void matrixMultSparseDenseVM(SparseBlock a, double[] b, double[] c, int n, int rl, int ru)
+ throws DMLRuntimeException
+ {
+ if( a.isEmpty(0) )
+ return;
+
+ //parallelization over rows in rhs matrix
+ int alen = a.size(0);
+ int[] aix = a.indexes(0);
+ double[] avals = a.values(0);
+ int rlix = (rl==0) ? 0 : a.posFIndexGTE(0,rl);
+ rlix = (rlix>=0) ? rlix : alen;
+
+ for( int k=rlix; k<alen && aix[k]<ru; k++ ) {
+ if( k+1<alen && aix[k+1]<ru )
+ vectMultiplyAdd2(avals[k], avals[k+1], b, c, aix[k]*n, aix[++k]*n, 0, n);
+ else
+ vectMultiplyAdd(avals[k], b, c, aix[k]*n, 0, n);
+ }
+ }
+
+ private static void matrixMultSparseDenseMMShortLHS(SparseBlock a, double[] b, double[] c, int n, int cd, int rl, int ru)
+ throws DMLRuntimeException
+ {
+ int arlen = a.numRows();
+ for( int i=0, cix=0; i<arlen; i++, cix+=n ) {
+ if( a.isEmpty(i) ) continue;
+ int apos = a.pos(i);
+ int alen = a.size(i);
+ int[] aix = a.indexes(i);
+ double[] avals = a.values(i);
+
+ int k1 = (rl==0) ? 0 : a.posFIndexGTE(i, rl);
+ k1 = (k1>=0) ? apos+k1 : apos+alen;
+ int k2 = (ru==cd) ? alen : a.posFIndexGTE(i, ru);
+ k2 = (k2>=0) ? apos+k2 : apos+alen;
+
+ //rest not aligned to blocks of 4 rows
+ final int bn = (k2-k1) % 4;
+ switch( bn ){
+ case 1: vectMultiplyAdd(avals[k1], b, c, aix[k1]*n, cix, n); break;
+ case 2: vectMultiplyAdd2(avals[k1],avals[k1+1], b, c, aix[k1]*n, aix[k1+1]*n, cix, n); break;
+ case 3: vectMultiplyAdd3(avals[k1],avals[k1+1],avals[k1+2], b, c, aix[k1]*n, aix[k1+1]*n, aix[k1+2]*n, cix, n); break;
+ }
+
+ //compute blocks of 4 rows (core inner loop)
+ for( int k = k1+bn; k<k2; k+=4 ) {
+ vectMultiplyAdd4( avals[k], avals[k+1], avals[k+2], avals[k+3], b, c,
+ aix[k]*n, aix[k+1]*n, aix[k+2]*n, aix[k+3]*n, cix, n );
+ }
+ }
+ }
+
+ private static void matrixMultSparseDenseMMSkinnyRHS(SparseBlock a, double[] b, double[] c, int n, int rl, int ru)
+ throws DMLRuntimeException
+ {
+ //no blocking since b and c fit into cache anyway
+ for( int i=rl, cix=rl*n; i<ru; i++, cix+=n ) {
+ if( a.isEmpty(i) ) continue;
+ int apos = a.pos(i);
+ int alen = a.size(i);
+ int[] aix = a.indexes(i);
+ double[] avals = a.values(i);
+ //rest not aligned to blocks of 4 rows
+ int bn = alen%4;
+ for( int k=apos; k<apos+bn; k++ )
+ vectMultiplyAdd(avals[k], b, c, aix[k]*n, cix, n);
+ //compute blocks of 4 rows (core inner loop)
+ for( int k=apos+bn; k<apos+alen; k+=4 )
+ vectMultiplyAdd4( avals[k], avals[k+1], avals[k+2], avals[k+3], b, c,
+ aix[k]*n, aix[k+1]*n, aix[k+2]*n, aix[k+3]*n, cix, n );
+ }
+ }
+
+ private static void matrixMultSparseDenseMM(SparseBlock a, double[] b, double[] c, int n, int cd, long xsp, int rl, int ru)
+ throws DMLRuntimeException
+ {
+ //blocksizes to fit blocks of B (dense) and several rows of A/C in common L2 cache size,
+ //while blocking A/C for L1/L2 yet allowing long scans (2 pages) in the inner loop over j
+ //in case of almost ultra-sparse matrices, we cannot ensure the blocking for the rhs and
+ //output - however, in this case it's unlikely that we consume every cache line in the rhs
+ final int blocksizeI = (int) (8L*xsp);
+ final int blocksizeK = (int) (8L*xsp);
+ final int blocksizeJ = 1024;
+
+ //temporary array of current sparse positions
+ int[] curk = new int[blocksizeI];
+
+ //blocked execution over IKJ
+ for( int bi = rl; bi < ru; bi+=blocksizeI ) {
+ Arrays.fill(curk, 0); //reset positions
+ for( int bk = 0, bimin = Math.min(ru, bi+blocksizeI); bk < cd; bk+=blocksizeK ) {
+ for( int bj = 0, bkmin = Math.min(cd, bk+blocksizeK); bj < n; bj+=blocksizeJ ) {
+ int bjlen = Math.min(n, bj+blocksizeJ)-bj;
- for(int k = apos; k < apos+alen; k++) {
- double val = avals[k];
- for(int j = 0, bix=aix[k]*n; j < n; j++)
- c[cix+j] += val * b[bix+j];
+ //core sub block matrix multiplication
+ for( int i=bi, cix=bi*n+bj; i<bimin; i++, cix+=n ) {
+ if( !a.isEmpty(i) ) {
+ int apos = a.pos(i);
+ int alen = a.size(i);
+ int[] aix = a.indexes(i);
+ double[] avals = a.values(i);
+
+ int k = curk[i-bi] + apos;
+ //rest not aligned to blocks of 4 rows
+ int bn = alen%4;
+ for( ; k<apos+bn && aix[k]<bkmin; k++ )
+ vectMultiplyAdd(avals[k], b, c, aix[k]*n+bj, cix, bjlen);
+ //compute blocks of 4 rows (core inner loop), allowed to exceed bkmin
+ for( ; k<apos+alen && aix[k]<bkmin; k+=4 )
+ vectMultiplyAdd4( avals[k], avals[k+1], avals[k+2], avals[k+3], b, c,
+ aix[k]*n+bj, aix[k+1]*n+bj, aix[k+2]*n+bj, aix[k+3]*n+bj, cix, bjlen );
+ //update positions on last bj block
+ if( bj+bjlen==n )
+ curk[i-bi] = k - apos;
+ }
}
}
}
@@ -1408,8 +1445,8 @@ public class LibMatrixMult
int[] aix = a.indexes(i);
double[] avals = a.values(i);
- int k = curk[i-bi] + apos;
- for(; k < apos+alen && aix[k]<bkmin; k++) {
+ int k = curk[i-bi] + apos;
+ for(; k < apos+alen && aix[k]<bkmin; k++) {
if( !b.isEmpty(aix[k]) )
vectMultiplyAdd(avals[k], b.values(aix[k]), c,
b.indexes(aix[k]), b.pos(aix[k]), cix, b.size(aix[k]));
@@ -1423,28 +1460,22 @@ public class LibMatrixMult
}
else
{
- for( int i=rl, cix=rl*n; i<Math.min(ru, a.numRows()); i++, cix+=n )
- {
- if( !a.isEmpty(i) )
- {
- int apos = a.pos(i);
- int alen = a.size(i);
- int[] aix = a.indexes(i);
- double[] avals = a.values(i);
-
- for(int k = apos; k < apos+alen; k++)
- {
- double val = avals[k];
- if( !b.isEmpty(aix[k]) )
- {
- int bpos = b.pos(aix[k]);
- int blen = b.size(aix[k]);
- int[] bix = b.indexes(aix[k]);
- double[] bvals = b.values(aix[k]);
- for(int j = bpos; j < bpos+blen; j++)
- c[cix+bix[j]] += val * bvals[j];
- }
- }
+ for( int i=rl, cix=rl*n; i<Math.min(ru, a.numRows()); i++, cix+=n ) {
+ if( a.isEmpty(i) ) continue;
+ int apos = a.pos(i);
+ int alen = a.size(i);
+ int[] aix = a.indexes(i);
+ double[] avals = a.values(i);
+
+ for(int k = apos; k < apos+alen; k++) {
+ if( b.isEmpty(aix[k]) ) continue;
+ double val = avals[k];
+ int bpos = b.pos(aix[k]);
+ int blen = b.size(aix[k]);
+ int[] bix = b.indexes(aix[k]);
+ double[] bvals = b.values(aix[k]);
+ for(int j = bpos; j < bpos+blen; j++)
+ c[cix+bix[j]] += val * bvals[j];
}
}
}