You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@systemds.apache.org by mb...@apache.org on 2021/03/28 01:01:50 UTC
[systemds] branch master updated: [MINOR] Improved selection of multi-threaded matrix multiplications

This is an automated email from the ASF dual-hosted git repository.

mboehm7 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/systemds.git


The following commit(s) were added to refs/heads/master by this push:
     new 13dd8cb  [MINOR] Improved selection of multi-threaded matrix multiplications
13dd8cb is described below

commit 13dd8cb68260cec6692b2006271f651150519e42
Author: Matthias Boehm <mb...@gmail.com>
AuthorDate: Sun Mar 28 03:00:40 2021 +0200

    [MINOR] Improved selection of multi-threaded matrix multiplications
    
    This patch makes the validity checks for multi-threaded matrix
    multiplications (which check for a maximum memory overhead, scaled by
    the number of threads) more flexible, so that on scale-up nodes (large
    memory, many cores) multi-threading also applies to matrices with
    millions of features.
    
    When training lm models on KDD98, this change improved end-to-end
    performance by 37x on a server with 56 physical / 112 virtual cores. On
    KDD98, lmCG takes ~27,000 batch iterations to converge (1e-7 target
    norm) and now completes in 310s.
---
 .../java/org/apache/sysds/runtime/matrix/data/LibMatrixMult.java    | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/src/main/java/org/apache/sysds/runtime/matrix/data/LibMatrixMult.java b/src/main/java/org/apache/sysds/runtime/matrix/data/LibMatrixMult.java
index 9db9754..a402134 100644
--- a/src/main/java/org/apache/sysds/runtime/matrix/data/LibMatrixMult.java
+++ b/src/main/java/org/apache/sysds/runtime/matrix/data/LibMatrixMult.java
@@ -3893,16 +3893,18 @@ public class LibMatrixMult
 	
 	public static boolean satisfiesMultiThreadingConstraints(MatrixBlock m1, boolean checkMem, boolean checkFLOPs, long FPfactor, int k) {
 		boolean sharedTP = (InfrastructureAnalyzer.getLocalParallelism() == k);
+		double jvmMem = InfrastructureAnalyzer.getLocalMaxMemory();
 		return k > 1 && LOW_LEVEL_OPTIMIZATION
-			&& (!checkMem || 8L * m1.clen * k < MEM_OVERHEAD_THRESHOLD)
+			&& (!checkMem || 8L * m1.clen * k < Math.max(MEM_OVERHEAD_THRESHOLD,0.01*jvmMem))
 			&& (!checkFLOPs || FPfactor * m1.rlen * m1.clen >
 			(sharedTP ? PAR_MINFLOP_THRESHOLD2 : PAR_MINFLOP_THRESHOLD1));
 	}
 	
 	public static boolean satisfiesMultiThreadingConstraints(MatrixBlock m1, MatrixBlock m2, boolean checkMem, boolean checkFLOPs, long FPfactor, int k) {
 		boolean sharedTP = (InfrastructureAnalyzer.getLocalParallelism() == k);
+		double jvmMem = InfrastructureAnalyzer.getLocalMaxMemory();
 		return k > 1 && LOW_LEVEL_OPTIMIZATION
-			&& (!checkMem || 8L * m2.clen * k < MEM_OVERHEAD_THRESHOLD)
+			&& (!checkMem || 8L * m2.clen * k < Math.max(MEM_OVERHEAD_THRESHOLD,0.01*jvmMem))
 			//note: cast to double to avoid long overflows on ultra-sparse matrices
 			//due to FLOP computation based on number of cells not non-zeros
 			&& (!checkFLOPs || (double)FPfactor * m1.rlen * m1.clen * m2.clen >