You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@systemml.apache.org by mb...@apache.org on 2018/04/21 09:14:06 UTC

[1/4] systemml git commit: [SYSTEMML-2266] Fix native BLAS integration for large dense blocks >16GB

Repository: systemml
Updated Branches:
  refs/heads/master 54c52ab3c -> a22502583


[SYSTEMML-2266] Fix native BLAS integration for large dense blocks >16GB

This patch fixes native BLAS matrix multiply calls for large dense
blocks (inputs or outputs), which currently access only the first
block and thus fail with an exception indicating incorrect results.
The native BLAS call is now conditioned on contiguous blocks;
otherwise, we fall back to our default operations.


Project: http://git-wip-us.apache.org/repos/asf/systemml/repo
Commit: http://git-wip-us.apache.org/repos/asf/systemml/commit/c1a7f855
Tree: http://git-wip-us.apache.org/repos/asf/systemml/tree/c1a7f855
Diff: http://git-wip-us.apache.org/repos/asf/systemml/diff/c1a7f855

Branch: refs/heads/master
Commit: c1a7f855b605d80eabc9de833d202b55b4319639
Parents: 54c52ab
Author: Matthias Boehm <mb...@gmail.com>
Authored: Fri Apr 20 19:39:27 2018 -0700
Committer: Matthias Boehm <mb...@gmail.com>
Committed: Fri Apr 20 19:39:27 2018 -0700

----------------------------------------------------------------------
 .../org/apache/sysml/runtime/matrix/data/LibMatrixNative.java  | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/systemml/blob/c1a7f855/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixNative.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixNative.java b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixNative.java
index cf4501f..e122e7f 100644
--- a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixNative.java
+++ b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixNative.java
@@ -79,9 +79,11 @@ public class LibMatrixNative
 			return;
 		}
 		
-		if (NativeHelper.isNativeLibraryLoaded()
+		if( NativeHelper.isNativeLibraryLoaded()
 			&& !isMatMultMemoryBound(m1.rlen, m1.clen, m2.clen) 
-			&& !m1.isInSparseFormat() && !m2.isInSparseFormat()) 
+			&& !m1.isInSparseFormat() && !m2.isInSparseFormat()
+			&& m1.getDenseBlock().isContiguous() && m2.getDenseBlock().isContiguous()
+			&& 8L * ret.getLength() < Integer.MAX_VALUE ) //contiguous but not allocated
 		{
 			ret.sparse = false;
 			ret.allocateDenseBlock();


[2/4] systemml git commit: [SYSTEMML-2267] Generalized multi-threaded unary ops dense blocks >16GB

Posted by mb...@apache.org.
[SYSTEMML-2267] Generalized multi-threaded unary ops dense blocks >16GB

This patch generalizes the newly introduced multi-threaded unary
operations to large dense blocks >16GB by processing one physical block
at a time via parallelSetAll.


Project: http://git-wip-us.apache.org/repos/asf/systemml/repo
Commit: http://git-wip-us.apache.org/repos/asf/systemml/commit/2f278bc2
Tree: http://git-wip-us.apache.org/repos/asf/systemml/tree/2f278bc2
Diff: http://git-wip-us.apache.org/repos/asf/systemml/diff/2f278bc2

Branch: refs/heads/master
Commit: 2f278bc2ac85d391b9353124ce85b7db884cba5b
Parents: c1a7f85
Author: Matthias Boehm <mb...@gmail.com>
Authored: Fri Apr 20 19:44:37 2018 -0700
Committer: Matthias Boehm <mb...@gmail.com>
Committed: Fri Apr 20 19:44:37 2018 -0700

----------------------------------------------------------------------
 .../apache/sysml/runtime/matrix/data/MatrixBlock.java    | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/systemml/blob/2f278bc2/src/main/java/org/apache/sysml/runtime/matrix/data/MatrixBlock.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/matrix/data/MatrixBlock.java b/src/main/java/org/apache/sysml/runtime/matrix/data/MatrixBlock.java
index bb5e79b..9e032b6 100644
--- a/src/main/java/org/apache/sysml/runtime/matrix/data/MatrixBlock.java
+++ b/src/main/java/org/apache/sysml/runtime/matrix/data/MatrixBlock.java
@@ -2584,15 +2584,18 @@ public class MatrixBlock extends MatrixValue implements CacheBlock, Externalizab
 			else
 				LibMatrixAgg.cumaggregateUnaryMatrix(this, ret, op);
 		}
-		else if(!sparse && !isEmptyBlock(false) && getDenseBlock().isContiguous()
+		else if(!sparse && !isEmptyBlock(false)
 			&& OptimizerUtils.isMaxLocalParallelism(op.getNumThreads())) {
 			//note: we apply multi-threading in a best-effort manner here
 			//only for expensive operators such as exp, log, sigmoid, because
 			//otherwise allocation, read and write anyway dominates
 			ret.allocateDenseBlock(false);
-			double[] a = getDenseBlockValues();
-			double[] c = ret.getDenseBlockValues();
-			Arrays.parallelSetAll(c, i -> op.fn.execute(a[i]));
+			DenseBlock a = getDenseBlock();
+			DenseBlock c = ret.getDenseBlock();
+			for(int bi=0; bi<a.numBlocks(); bi++) {
+				double[] avals = a.valuesAt(bi), cvals = c.valuesAt(bi);
+				Arrays.parallelSetAll(cvals, i -> op.fn.execute(avals[i]));
+			}
 			ret.recomputeNonZeros();
 		}
 		else {


[4/4] systemml git commit: [MINOR] Cleanup dense matrix-vector wsigmoid via native matrixMult

Posted by mb...@apache.org.
[MINOR] Cleanup dense matrix-vector wsigmoid via native matrixMult

Project: http://git-wip-us.apache.org/repos/asf/systemml/repo
Commit: http://git-wip-us.apache.org/repos/asf/systemml/commit/a2250258
Tree: http://git-wip-us.apache.org/repos/asf/systemml/tree/a2250258
Diff: http://git-wip-us.apache.org/repos/asf/systemml/diff/a2250258

Branch: refs/heads/master
Commit: a22502583437efcf7c68619eefdde7c0c4bff74f
Parents: aff0009
Author: Matthias Boehm <mb...@gmail.com>
Authored: Sat Apr 21 02:14:47 2018 -0700
Committer: Matthias Boehm <mb...@gmail.com>
Committed: Sat Apr 21 02:14:47 2018 -0700

----------------------------------------------------------------------
 .../runtime/matrix/data/LibMatrixMult.java      | 26 ++++++++------------
 1 file changed, 10 insertions(+), 16 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/systemml/blob/a2250258/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixMult.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixMult.java b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixMult.java
index 622f45c..ddf063f 100644
--- a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixMult.java
+++ b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixMult.java
@@ -2406,6 +2406,9 @@ public class LibMatrixMult
 		boolean flagminus = (wt==WSigmoidType.MINUS || wt==WSigmoidType.LOG_MINUS); 
 		boolean flaglog = (wt==WSigmoidType.LOG || wt==WSigmoidType.LOG_MINUS);
 		
+		//note: experiments with a fully native implementation of this method (even with #pragma omp simd)
+		//showed performance regressions compared to this version because we benefit from FastMath.exp 
+		
 		//call native matrix multiplication (only called for single-threaded and matrix-vector
 		//because this ensures that we can deal with the transpose mV without additional transpose)
 		if(!NativeHelper.dmmdd(((m==1)?mV:mU).getDenseBlockValues(),
@@ -2413,12 +2416,13 @@ public class LibMatrixMult
 			throw new DMLRuntimeException("Error executing native matrix mult.");
 		
 		//compute remaining wsigmoid for all relevant outputs
-		for(int i=0, ix=0; i<m; i++, ix+=n) {
-			for(int j=0; j<n; j++) {
-				double wij = w[ix +j];
-				//if( wij != 0 )
-					c[ix+j] = wsigmoid(wij, c[ix+j], flagminus, flaglog);
-			}
+		for( int i=0; i<m*n; i++ ) {
+			//compute core sigmoid function
+			double cval = flagminus ?
+				1 / (1 + FastMath.exp(c[i])) :
+				1 / (1 + FastMath.exp(-c[i]));
+			//compute weighted output
+			c[i] = w[i] * ((flaglog) ? Math.log(cval) : cval);
 		}
 	}
 	
@@ -3501,16 +3505,6 @@ public class LibMatrixMult
 		return wij * ((flaglog) ? Math.log(cval) : cval);
 	}
 	
-	private static double wsigmoid(final double wij, final double uvij, final boolean flagminus, final boolean flaglog) {
-		//compute core sigmoid function
-		double cval = flagminus ?
-				1 / (1 + FastMath.exp(uvij)) :
-				1 / (1 + FastMath.exp(-uvij));
-		
-		//compute weighted output
-		return wij * ((flaglog) ? Math.log(cval) : cval);
-	}
-
 	private static void wdivmm( final double wij, double[] u, double[] v, double[] c, final int uix, final int vix, final boolean left, final boolean mult, final boolean minus, final int len )
 	{
 		//compute dot product over ui vj


[3/4] systemml git commit: [SYSTEMML-2268] Performance native BLAS (dispatch DOT, GEMV, GEMM)

Posted by mb...@apache.org.
[SYSTEMML-2268] Performance native BLAS (dispatch DOT, GEMV, GEMM)

This patch improves the performance of native BLAS matrix multiply
operations for special (but common) cases of vector-vector dot products
and matrix-vector products by dispatching BLAS calls to DOT, GEMV, and
GEMM according to input dimensions instead of always calling GEMM.
Specifically, this applies to mm and tsmm operations.


Project: http://git-wip-us.apache.org/repos/asf/systemml/repo
Commit: http://git-wip-us.apache.org/repos/asf/systemml/commit/aff00094
Tree: http://git-wip-us.apache.org/repos/asf/systemml/tree/aff00094
Diff: http://git-wip-us.apache.org/repos/asf/systemml/diff/aff00094

Branch: refs/heads/master
Commit: aff000942ec8bdcbec4e219a2e86cc5c85e7b2ea
Parents: 2f278bc
Author: Matthias Boehm <mb...@gmail.com>
Authored: Fri Apr 20 23:58:51 2018 -0700
Committer: Matthias Boehm <mb...@gmail.com>
Committed: Fri Apr 20 23:58:51 2018 -0700

----------------------------------------------------------------------
 .../cpp/lib/libsystemml_mkl-Linux-x86_64.so     | Bin 32208 -> 32376 bytes
 .../lib/libsystemml_openblas-Linux-x86_64.so    | Bin 31288 -> 31464 bytes
 src/main/cpp/libmatrixmult.cpp                  |  31 +++++++++++++++----
 .../runtime/matrix/data/LibMatrixNative.java    |   8 ++---
 4 files changed, 28 insertions(+), 11 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/systemml/blob/aff00094/src/main/cpp/lib/libsystemml_mkl-Linux-x86_64.so
----------------------------------------------------------------------
diff --git a/src/main/cpp/lib/libsystemml_mkl-Linux-x86_64.so b/src/main/cpp/lib/libsystemml_mkl-Linux-x86_64.so
index adc3bbe..fb1d33e 100755
Binary files a/src/main/cpp/lib/libsystemml_mkl-Linux-x86_64.so and b/src/main/cpp/lib/libsystemml_mkl-Linux-x86_64.so differ

http://git-wip-us.apache.org/repos/asf/systemml/blob/aff00094/src/main/cpp/lib/libsystemml_openblas-Linux-x86_64.so
----------------------------------------------------------------------
diff --git a/src/main/cpp/lib/libsystemml_openblas-Linux-x86_64.so b/src/main/cpp/lib/libsystemml_openblas-Linux-x86_64.so
index 0b39eaa..8905252 100755
Binary files a/src/main/cpp/lib/libsystemml_openblas-Linux-x86_64.so and b/src/main/cpp/lib/libsystemml_openblas-Linux-x86_64.so differ

http://git-wip-us.apache.org/repos/asf/systemml/blob/aff00094/src/main/cpp/libmatrixmult.cpp
----------------------------------------------------------------------
diff --git a/src/main/cpp/libmatrixmult.cpp b/src/main/cpp/libmatrixmult.cpp
index 773a85a..868fd24 100644
--- a/src/main/cpp/libmatrixmult.cpp
+++ b/src/main/cpp/libmatrixmult.cpp
@@ -42,18 +42,37 @@ void setNumThreadsForBLAS(int numThreads) {
 }
  
 void dmatmult(double* m1Ptr, double* m2Ptr, double* retPtr, int m, int k, int n, int numThreads) {
+  //BLAS routine dispatch according to input dimension sizes (we don't use cblas_dgemv 
+  //with CblasColMajor for matrix-vector because it was generally slower than dgemm)
   setNumThreadsForBLAS(numThreads);
-  cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, m, n, k, 1, m1Ptr, k, m2Ptr, n, 0, retPtr, n);
+  if( m == 1 && n == 1 ) //VV
+    retPtr[0] = cblas_ddot(k, m1Ptr, 1, m2Ptr, 1);
+  else if( n == 1 ) //MV
+    cblas_dgemv(CblasRowMajor, CblasNoTrans, m, k, 1, m1Ptr, k, m2Ptr, 1, 0, retPtr, 1);  
+  else //MM
+    cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, m, n, k, 1, m1Ptr, k, m2Ptr, n, 0, retPtr, n);
 }
 
 void smatmult(float* m1Ptr, float* m2Ptr, float* retPtr, int m, int k, int n, int numThreads) {  
+  //BLAS routine dispatch according to input dimension sizes (we don't use cblas_sgemv 
+  //with CblasColMajor for matrix-vector because it was generally slower than sgemm)
   setNumThreadsForBLAS(numThreads);
-  cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, m, n, k, 1, m1Ptr, k, m2Ptr, n, 0, retPtr, n);
+  if( m == 1 && n == 1 ) //VV
+    retPtr[0] = cblas_sdot(k, m1Ptr, 1, m2Ptr, 1);
+  else if( n == 1 ) //MV
+    cblas_sgemv(CblasRowMajor, CblasNoTrans, m, k, 1, m1Ptr, k, m2Ptr, 1, 0, retPtr, 1);  
+  else //MM
+    cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, m, n, k, 1, m1Ptr, k, m2Ptr, n, 0, retPtr, n);
 }
 
-void tsmm(double* m1Ptr, double* retPtr, int m1rlen, int m1clen, bool isLeftTrans, int numThreads) {
-  int n = isLeftTrans ? m1clen : m1rlen;
-  int k = isLeftTrans ? m1rlen : m1clen;
+void tsmm(double* m1Ptr, double* retPtr, int m1rlen, int m1clen, bool leftTrans, int numThreads) {
   setNumThreadsForBLAS(numThreads);
-  cblas_dsyrk(CblasRowMajor, CblasUpper, isLeftTrans ? CblasTrans : CblasNoTrans, n, k, 1, m1Ptr, n, 0, retPtr, n);
+  if( (leftTrans && m1clen == 1) || (!leftTrans && m1rlen == 1) ) {
+    retPtr[0] = cblas_ddot(leftTrans ? m1rlen : m1clen, m1Ptr, 1, m1Ptr, 1);
+  }
+  else { //general case
+    int n = leftTrans ? m1clen : m1rlen;
+    int k = leftTrans ? m1rlen : m1clen;
+    cblas_dsyrk(CblasRowMajor, CblasUpper, leftTrans ? CblasTrans : CblasNoTrans, n, k, 1, m1Ptr, n, 0, retPtr, n);
+  }
 }

http://git-wip-us.apache.org/repos/asf/systemml/blob/aff00094/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixNative.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixNative.java b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixNative.java
index e122e7f..9fec026 100644
--- a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixNative.java
+++ b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixNative.java
@@ -123,7 +123,7 @@ public class LibMatrixNative
 	public static void tsmm(MatrixBlock m1, MatrixBlock ret, boolean leftTrans, int k) {
 		if( m1.isEmptyBlock(false) )
 			return;
-		if( NativeHelper.isNativeLibraryLoaded() && ret.clen > 1 
+		if( NativeHelper.isNativeLibraryLoaded() && (ret.clen > 1 || ret.getLength()==1)
 			&& (!m1.sparse && m1.getDenseBlock().isContiguous() ) ) {
 			ret.sparse = false;
 			ret.allocateDenseBlock();
@@ -136,10 +136,8 @@ public class LibMatrixNative
 				ret.examSparsity();
 				return;
 			}
-			else {
-				Statistics.incrementNativeFailuresCounter();
-				//fallback to default java implementation
-			}
+			//fallback to default java implementation
+			Statistics.incrementNativeFailuresCounter();
 		}
 		if( k > 1 )
 			LibMatrixMult.matrixMultTransposeSelf(m1, ret, leftTrans, k);