You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@systemml.apache.org by mb...@apache.org on 2016/09/13 17:27:04 UTC
incubator-systemml git commit: [SYSTEMML-913] Cache-conscious dense matrix-vector multiplication
Repository: incubator-systemml
Updated Branches:
refs/heads/master 194a80f26 -> ca95d1363
[SYSTEMML-913] Cache-conscious dense matrix-vector multiplication
Project: http://git-wip-us.apache.org/repos/asf/incubator-systemml/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-systemml/commit/ca95d136
Tree: http://git-wip-us.apache.org/repos/asf/incubator-systemml/tree/ca95d136
Diff: http://git-wip-us.apache.org/repos/asf/incubator-systemml/diff/ca95d136
Branch: refs/heads/master
Commit: ca95d13631402a24ba000406471c58375a211b7b
Parents: 194a80f
Author: Matthias Boehm <mb...@us.ibm.com>
Authored: Tue Sep 13 19:24:51 2016 +0200
Committer: Matthias Boehm <mb...@us.ibm.com>
Committed: Tue Sep 13 19:26:10 2016 +0200
----------------------------------------------------------------------
.../runtime/matrix/data/LibMatrixMult.java | 31 ++++++++++++++------
1 file changed, 22 insertions(+), 9 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/ca95d136/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixMult.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixMult.java b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixMult.java
index 6902d40..9d0dd9b 100644
--- a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixMult.java
+++ b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixMult.java
@@ -1070,11 +1070,11 @@ public class LibMatrixMult
if( LOW_LEVEL_OPTIMIZATION )
{
- if( m==1 && n==1 ) //DOT PRODUCT
+ if( m==1 && n==1 ) //DOT PRODUCT
{
c[0] = dotProduct(a, b, cd);
}
- else if( n>1 && cd == 1 ) //OUTER PRODUCT
+ else if( n>1 && cd == 1 ) //OUTER PRODUCT
{
for( int i=rl, cix=rl*n; i < ru; i++, cix+=n) {
if( a[i] == 1 )
@@ -1085,16 +1085,29 @@ public class LibMatrixMult
Arrays.fill(c, cix, cix+n, 0);
}
}
- else if( n==1 && cd == 1 ) //VECTOR-SCALAR
+ else if( n==1 && cd == 1 ) //VECTOR-SCALAR
{
vectMultiplyWrite(b[0], a, c, rl, rl, ru-rl);
}
- else if( n==1 ) //MATRIX-VECTOR
+ else if( n==1 && cd<=2*1024 ) //MATRIX-VECTOR (short rhs)
{
for( int i=rl, aix=rl*cd; i < ru; i++, aix+=cd)
- c[ i ] = dotProduct(a, b, aix, 0, cd);
+ c[i] = dotProduct(a, b, aix, 0, cd);
}
- else if( pm2 && m==1 ) //VECTOR-MATRIX
+ else if( n==1 ) //MATRIX-VECTOR (tall rhs)
+ {
+ final int blocksizeI = 32;
+ final int blocksizeK = 2*1024; //16KB vector blocks (L1)
+ for( int bi=rl; bi<ru; bi+=blocksizeI ) {
+ int bimin = Math.min(bi+blocksizeI, ru);
+ for( int bk=0; bk<cd; bk+=blocksizeK ) {
+ int bkmin = Math.min(bk+blocksizeK, cd);
+ for( int i=bi, aix=bi*cd+bk; i<bimin; i++, aix+=cd)
+ c[i] += dotProduct(a, b, aix, bk, bkmin-bk);
+ }
+ }
+ }
+ else if( pm2 && m==1 ) //VECTOR-MATRIX
{
//parallelization over rows in rhs matrix
//rest not aligned to blocks of 2 rows
@@ -1112,7 +1125,7 @@ public class LibMatrixMult
vectMultiplyAdd(a[k+1], b, c, bix+n, 0, n);
}
}
- else if( pm2 && m<=16 ) //MATRIX-MATRIX (short lhs)
+ else if( pm2 && m<=16 ) //MATRIX-MATRIX (short lhs)
{
//cache-conscious parallelization over rows in rhs matrix
final int kn = (ru-rl)%4;
@@ -1139,7 +1152,7 @@ public class LibMatrixMult
}
}
}
- else if( tm2 ) //MATRIX-MATRIX (skinny rhs)
+ else if( tm2 ) //MATRIX-MATRIX (skinny rhs)
{
//note: prepared rhs input via transpose for: m > n && cd > 64 && n < 64
//however, explicit flag required since dimension change m2
@@ -1148,7 +1161,7 @@ public class LibMatrixMult
for( int j=0, bix=0; j<n2; j++, bix+=cd )
c[cix+j] = dotProduct(a, b, aix, bix, cd);
}
- else //MATRIX-MATRIX
+ else //MATRIX-MATRIX
{
//1) Unrolled inner loop (for better instruction-level parallelism)
//2) Blocked execution (for less cache thrashing in parallel exec)