Posted to commits@systemml.apache.org by mb...@apache.org on 2018/04/20 08:46:00 UTC

[1/7] systemml git commit: [SYSTEMML-1349, 2251] Extended parfor optimizer for shared reads

Repository: systemml
Updated Branches:
  refs/heads/master 09b1533de -> 4c7640b87


[SYSTEMML-1349,2251] Extended parfor optimizer for shared reads

This patch resolves a long-standing shortcoming of the parfor optimizer:
the degree of parallelism was set too low because shared reads were not
handled properly and thus double-counted for each worker. Especially on
modern processors with increasing core counts, this became important in
order to reduce unnecessary barriers (a too low parfor degree of
parallelism distributes the remaining parallelism to contained parfor
loops and operations, which require a barrier per operation).
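
To make the effect concrete, here is a minimal, self-contained sketch of
the shared-read-aware parallelism computation (all numbers and the class
name are hypothetical; the actual logic lives in
OptimizerRuleBased.computeMaxK below):

  public class SharedReadParallelismSketch {
    public static void main(String[] args) {
      double memBudget  = 16e9; // hypothetical 16 GB local memory budget
      double sharedM    = 4e9;  // read-only inputs shared across workers (count once)
      double nonSharedM = 1e9;  // per-worker estimate excluding shared reads
      double M = sharedM + nonSharedM; // naive estimate double-counts the shared read
      int kNaive  = (int) Math.floor(memBudget / M);                      // 16/5 -> 3
      int kShared = (int) Math.floor((memBudget - sharedM) / nonSharedM); // 12/1 -> 12
      System.out.println("degree of parallelism: " + Math.max(kNaive, kShared)); // 12
    }
  }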
 

Project: http://git-wip-us.apache.org/repos/asf/systemml/repo
Commit: http://git-wip-us.apache.org/repos/asf/systemml/commit/9a089151
Tree: http://git-wip-us.apache.org/repos/asf/systemml/tree/9a089151
Diff: http://git-wip-us.apache.org/repos/asf/systemml/diff/9a089151

Branch: refs/heads/master
Commit: 9a089151bf2bc518fe9eaf5b3fe9505f237dc399
Parents: 09b1533
Author: Matthias Boehm <mb...@gmail.com>
Authored: Thu Apr 19 18:57:51 2018 -0700
Committer: Matthias Boehm <mb...@gmail.com>
Committed: Thu Apr 19 18:57:51 2018 -0700

----------------------------------------------------------------------
 src/main/java/org/apache/sysml/hops/Hop.java    | 48 ++++++++--------
 .../parfor/opt/CostEstimator.java               | 22 ++++---
 .../parfor/opt/CostEstimatorHops.java           |  9 ++-
 .../parfor/opt/OptimizerConstrained.java        |  8 +--
 .../parfor/opt/OptimizerRuleBased.java          | 60 +++++++++++++-------
 5 files changed, 86 insertions(+), 61 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/systemml/blob/9a089151/src/main/java/org/apache/sysml/hops/Hop.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/hops/Hop.java b/src/main/java/org/apache/sysml/hops/Hop.java
index 9445220..059132d 100644
--- a/src/main/java/org/apache/sysml/hops/Hop.java
+++ b/src/main/java/org/apache/sysml/hops/Hop.java
@@ -20,6 +20,7 @@
 package org.apache.sysml.hops;
 
 import java.util.ArrayList;
+import java.util.Collection;
 import java.util.HashMap;
 import java.util.HashSet;
 
@@ -459,6 +460,19 @@ public abstract class Hop implements ParseInfo
 		return _outputEmptyBlocks;
 	}
 	
+
+	protected double getInputOutputSize() {
+		return _outputMemEstimate
+			+ _processingMemEstimate
+			+ getInputSize();
+	}
+	
+	public double getInputOutputSize(Collection<String> exclVars) {
+		return _outputMemEstimate
+			+ _processingMemEstimate
+			+ getInputSize(exclVars);
+	}
+	
 	/**
 	 * Returns the memory estimate for the output produced from this Hop.
 	 * It must be invoked only within Hops. From outside Hops, one must 
@@ -467,21 +481,22 @@ public abstract class Hop implements ParseInfo
 	 * 
 	 * @return output size memory estimate
 	 */
-	protected double getOutputSize() 
-	{
+	protected double getOutputSize() {
 		return _outputMemEstimate;
 	}
+	
+	protected double getInputSize() {
+		return getInputSize(null);
+	}
 
-	protected double getInputSize() 
-	{
-		double sum = 0;		
+	protected double getInputSize(Collection<String> exclVars) {
+		double sum = 0;
 		int len = _input.size();
-		
-		for( int i=0; i<len; i++ ) //for all inputs
-		{
+		for( int i=0; i<len; i++ ) { //for all inputs
 			Hop hi = _input.get(i);
+			if( exclVars != null && exclVars.contains(hi.getName()) )
+				continue;
 			double hmout = hi.getOutputMemEstimate();
-			
 			if( hmout > 1024*1024 ) {//for relevant sizes
 				//check if already included in estimate (if an input is used
 				//multiple times it is still only required once in memory)
@@ -491,24 +506,9 @@ public abstract class Hop implements ParseInfo
 					flag |= (hi == _input.get(j));
 				hmout = flag ? 0 : hmout;
 			}
-			
 			sum += hmout;
 		}
 		
-		//for(Hop h : _input ) {
-		//	sum += h._outputMemEstimate;
-		//}
-		
-		return sum;
-	}
-
-	protected double getInputOutputSize() 
-	{
-		double sum = 0;
-		sum += _outputMemEstimate;
-		sum += _processingMemEstimate;
-		sum += getInputSize();
-		
 		return sum;
 	}
 

http://git-wip-us.apache.org/repos/asf/systemml/blob/9a089151/src/main/java/org/apache/sysml/runtime/controlprogram/parfor/opt/CostEstimator.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/controlprogram/parfor/opt/CostEstimator.java b/src/main/java/org/apache/sysml/runtime/controlprogram/parfor/opt/CostEstimator.java
index 167120f..c726308 100644
--- a/src/main/java/org/apache/sysml/runtime/controlprogram/parfor/opt/CostEstimator.java
+++ b/src/main/java/org/apache/sysml/runtime/controlprogram/parfor/opt/CostEstimator.java
@@ -26,7 +26,6 @@ import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;
 
 import org.apache.sysml.lops.LopProperties.ExecType;
-import org.apache.sysml.parser.ParForStatementBlock.ResultVar;
 import org.apache.sysml.runtime.controlprogram.parfor.opt.OptNode.ParamType;
 
 /**
@@ -47,17 +46,20 @@ public abstract class CostEstimator
 	public static final double DEFAULT_MEM_ESTIMATE_MR = 20*1024*1024; //default memory consumption: 20MB 
 
 	public enum TestMeasure {
-		EXEC_TIME,
-		MEMORY_USAGE
+		EXEC_TIME, MEMORY_USAGE
 	}
 	
 	public enum DataFormat {
-		DENSE,
-		SPARSE
+		DENSE, SPARSE
+	}
+	
+	public enum ExcludeType {
+		NONE, SHARED_READ, RESULT_LIX
 	}
 	
 	protected boolean _inclCondPart = false;
-	protected Collection<ResultVar> _exclRetVars = null;
+	protected Collection<String> _exclVars = null;
+	protected ExcludeType _exclType = ExcludeType.NONE;
 	
 	/**
 	 * Main leaf node estimation method - to be overwritten by specific cost estimators
@@ -102,12 +104,14 @@ public abstract class CostEstimator
 		return val;
 	}
 	
-	public double getEstimate(TestMeasure measure, OptNode node, boolean inclCondPart, Collection<ResultVar> retVars) {
+	public double getEstimate(TestMeasure measure, OptNode node, boolean inclCondPart, Collection<String> vars, ExcludeType extype) {
 		_inclCondPart = inclCondPart; //temporary
-		_exclRetVars = retVars;
+		_exclVars = vars;
+		_exclType = extype;
 		double val = getEstimate(measure, node, null);
 		_inclCondPart = false; 
-		_exclRetVars = null;
+		_exclVars = null;
+		_exclType = ExcludeType.NONE;
 		return val;
 	}
 	

http://git-wip-us.apache.org/repos/asf/systemml/blob/9a089151/src/main/java/org/apache/sysml/runtime/controlprogram/parfor/opt/CostEstimatorHops.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/controlprogram/parfor/opt/CostEstimatorHops.java b/src/main/java/org/apache/sysml/runtime/controlprogram/parfor/opt/CostEstimatorHops.java
index 3cb0c6f..4c3e7f0 100644
--- a/src/main/java/org/apache/sysml/runtime/controlprogram/parfor/opt/CostEstimatorHops.java
+++ b/src/main/java/org/apache/sysml/runtime/controlprogram/parfor/opt/CostEstimatorHops.java
@@ -24,7 +24,6 @@ import org.apache.sysml.hops.Hop;
 import org.apache.sysml.hops.LeftIndexingOp;
 import org.apache.sysml.hops.OptimizerUtils;
 import org.apache.sysml.lops.LopProperties.ExecType;
-import org.apache.sysml.parser.ParForStatementBlock.ResultVar;
 import org.apache.sysml.runtime.DMLRuntimeException;
 import org.apache.sysml.runtime.controlprogram.parfor.opt.OptNode.NodeType;
 import org.apache.sysml.runtime.controlprogram.parfor.opt.Optimizer.CostModelType;
@@ -61,6 +60,10 @@ public class CostEstimatorHops extends CostEstimator
 		Hop h = _map.getMappedHop( node.getID() );
 		double value = h.getMemEstimate();
 		
+		//correction for disabled shared read accounting
+		value = (_exclVars!=null && _exclType==ExcludeType.SHARED_READ) ?
+			h.getInputOutputSize(_exclVars) : value;
+		
 		//handle specific cases 
 		double DEFAULT_MEM_REMOTE = OptimizerUtils.isSparkExecutionMode() ? 
 			DEFAULT_MEM_SP : DEFAULT_MEM_MR;
@@ -100,8 +103,8 @@ public class CostEstimatorHops extends CostEstimator
 		}
 		
 		//correction for disabled result indexing
-		value = (_exclRetVars!=null && h instanceof LeftIndexingOp
-			&& ResultVar.contains(_exclRetVars, h.getName())) ? 0 : value;
+		value = (_exclVars!=null && _exclType==ExcludeType.RESULT_LIX 
+			&& h instanceof LeftIndexingOp && _exclVars.contains(h.getName())) ? 0 : value;
 		
 		if( LOG.isTraceEnabled() ) {
 			LOG.trace("Memory estimate "+h.getName()+", "+h.getOpString()

http://git-wip-us.apache.org/repos/asf/systemml/blob/9a089151/src/main/java/org/apache/sysml/runtime/controlprogram/parfor/opt/OptimizerConstrained.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/controlprogram/parfor/opt/OptimizerConstrained.java b/src/main/java/org/apache/sysml/runtime/controlprogram/parfor/opt/OptimizerConstrained.java
index 4088e63..deedf29 100644
--- a/src/main/java/org/apache/sysml/runtime/controlprogram/parfor/opt/OptimizerConstrained.java
+++ b/src/main/java/org/apache/sysml/runtime/controlprogram/parfor/opt/OptimizerConstrained.java
@@ -152,7 +152,7 @@ public class OptimizerConstrained extends OptimizerRuleBased
 			super.rewriteSetExportReplicationFactor( pn, ec.getVariables() );
 
 			// rewrite 10: determine parallelism
-			rewriteSetDegreeOfParallelism( pn, M1, false );
+			rewriteSetDegreeOfParallelism( pn, _cost, ec.getVariables(), M1, false );
 
 			// rewrite 11: task partitioning 
 			rewriteSetTaskPartitioner( pn, false, flagLIX );
@@ -174,7 +174,7 @@ public class OptimizerConstrained extends OptimizerRuleBased
 		else //if( pn.getExecType() == ExecType.CP )
 		{
 			// rewrite 10: determine parallelism
-			rewriteSetDegreeOfParallelism( pn, M1, false );
+			rewriteSetDegreeOfParallelism( pn, _cost, ec.getVariables(), M1, false );
 
 			// rewrite 11: task partitioning
 			rewriteSetTaskPartitioner( pn, false, false ); //flagLIX always false 
@@ -282,7 +282,7 @@ public class OptimizerConstrained extends OptimizerRuleBased
 	///
 
 	@Override
-	protected void rewriteSetDegreeOfParallelism(OptNode n, double M, boolean flagNested) {
+	protected void rewriteSetDegreeOfParallelism(OptNode n, CostEstimator cost, LocalVariableMap vars, double M, boolean flagNested) {
 		// constraint awareness
 		if( n.getK() > 0 && ConfigurationManager.isParallelParFor() )
 		{
@@ -299,7 +299,7 @@ public class OptimizerConstrained extends OptimizerRuleBased
 			LOG.debug(getOptMode()+" OPT: forced 'set degree of parallelism' - result=(see EXPLAIN)" );
 		}
 		else
-			super.rewriteSetDegreeOfParallelism(n, M, flagNested);
+			super.rewriteSetDegreeOfParallelism(n, cost, vars, M, flagNested);
 	}
 
 

http://git-wip-us.apache.org/repos/asf/systemml/blob/9a089151/src/main/java/org/apache/sysml/runtime/controlprogram/parfor/opt/OptimizerRuleBased.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/controlprogram/parfor/opt/OptimizerRuleBased.java b/src/main/java/org/apache/sysml/runtime/controlprogram/parfor/opt/OptimizerRuleBased.java
index 2bf8fe5..91124a0 100644
--- a/src/main/java/org/apache/sysml/runtime/controlprogram/parfor/opt/OptimizerRuleBased.java
+++ b/src/main/java/org/apache/sysml/runtime/controlprogram/parfor/opt/OptimizerRuleBased.java
@@ -26,6 +26,7 @@ import java.util.HashMap;
 import java.util.HashSet;
 import java.util.Map.Entry;
 import java.util.Set;
+import java.util.stream.Collectors;
 
 import org.apache.hadoop.fs.FileSystem;
 import org.apache.hadoop.fs.Path;
@@ -81,6 +82,7 @@ import org.apache.sysml.runtime.controlprogram.context.ExecutionContext;
 import org.apache.sysml.runtime.controlprogram.context.SparkExecutionContext;
 import org.apache.sysml.runtime.controlprogram.parfor.ProgramConverter;
 import org.apache.sysml.runtime.controlprogram.parfor.ResultMergeLocalFile;
+import org.apache.sysml.runtime.controlprogram.parfor.opt.CostEstimator.ExcludeType;
 import org.apache.sysml.runtime.controlprogram.parfor.opt.CostEstimator.TestMeasure;
 import org.apache.sysml.runtime.controlprogram.parfor.opt.OptNode.ExecType;
 import org.apache.sysml.runtime.controlprogram.parfor.opt.OptNode.NodeType;
@@ -271,7 +273,7 @@ public class OptimizerRuleBased extends Optimizer
 			rewriteSetExportReplicationFactor( pn, ec.getVariables() );
 			
 			// rewrite 10: determine parallelism
-			rewriteSetDegreeOfParallelism( pn, M1, false );
+			rewriteSetDegreeOfParallelism( pn, _cost, ec.getVariables(), M1, false );
 			
 			// rewrite 11: task partitioning 
 			rewriteSetTaskPartitioner( pn, false, flagLIX );
@@ -292,7 +294,7 @@ public class OptimizerRuleBased extends Optimizer
 		else //if( pn.getExecType() == ExecType.CP )
 		{
 			// rewrite 10: determine parallelism
-			rewriteSetDegreeOfParallelism( pn, M1, false );
+			rewriteSetDegreeOfParallelism( pn, _cost, ec.getVariables(), M1, false );
 			
 			// rewrite 11: task partitioning
 			rewriteSetTaskPartitioner( pn, false, false ); //flagLIX always false 
@@ -1173,25 +1175,36 @@ public class OptimizerRuleBased extends Optimizer
 	//REWRITE set degree of parallelism
 	///
 
-	protected void rewriteSetDegreeOfParallelism(OptNode n, double M, boolean flagNested) 
+	protected void rewriteSetDegreeOfParallelism(OptNode n, CostEstimator cost, LocalVariableMap vars, double M, boolean flagNested) 
 	{
 		ExecType type = n.getExecType();
 		long id = n.getID();
-				
+		
 		//special handling for different exec models (CP, MR, MR nested)
-		ParForProgramBlock pfpb = (ParForProgramBlock) OptTreeConverter
-										.getAbstractPlanMapping().getMappedProg(id)[1];
+		Object[] map = OptTreeConverter.getAbstractPlanMapping().getMappedProg(id);
+		ParForStatementBlock pfsb = (ParForStatementBlock)map[0];
+		ParForProgramBlock pfpb = (ParForProgramBlock)map[1];
 		
 		if( type == ExecType.CP ) 
 		{
 			//determine local max parallelism constraint
 			int kMax = ConfigurationManager.isParallelParFor() ?
-					(n.isCPOnly() ? _lkmaxCP : _lkmaxMR) : 1;
+				(n.isCPOnly() ? _lkmaxCP : _lkmaxMR) : 1;
+			
+			//compute memory budgets and partial estimates for handling shared reads
+			double mem = (OptimizerUtils.isSparkExecutionMode() && !n.isCPOnly()) ? _lm/2 : _lm;
+			double sharedM = 0, nonSharedM = M;
+			if( computeMaxK(M, M, 0, mem) < kMax ) { //account for shared read if necessary
+				sharedM = pfsb.getReadOnlyParentVars().stream().map(s -> vars.get(s))
+					.filter(d -> d instanceof MatrixObject).mapToDouble(mo -> OptimizerUtils
+					.estimateSize(((MatrixObject)mo).getMatrixCharacteristics())).sum();
+				nonSharedM = cost.getEstimate(TestMeasure.MEMORY_USAGE, n, true,
+					pfsb.getReadOnlyParentVars(), ExcludeType.SHARED_READ);
+			}
 			
 			//ensure local memory constraint (for spark more conservative in order to 
 			//prevent unnecessary guarded collect)
-			double mem = (OptimizerUtils.isSparkExecutionMode() && !n.isCPOnly()) ? _lm/2 : _lm;
-			kMax = Math.min( kMax, (int)Math.floor( mem / M ) );
+			kMax = Math.min( kMax, computeMaxK(M, nonSharedM, sharedM, mem) );
 			kMax = Math.max( kMax, 1);
 			
 			//constrain max parfor parallelism by problem size
@@ -1226,21 +1239,17 @@ public class OptimizerRuleBased extends Optimizer
 		else // ExecType.MR/ExecType.SPARK
 		{
 			int kMax = -1;
-			if( flagNested )
-			{
+			if( flagNested ) {
 				//determine remote max parallelism constraint
 				pfpb.setDegreeOfParallelism( _rnk ); //guaranteed <= _N (see nested)
-				n.setK( _rnk );	
-			
+				n.setK( _rnk );
 				kMax = _rkmax / _rnk; //per node (CP only inside)
 			}
-			else //not nested (default)
-			{
+			else { //not nested (default)
 				//determine remote max parallelism constraint
 				int tmpK = (int)((_N<_rk)? _N : _rk);
 				pfpb.setDegreeOfParallelism(tmpK);
-				n.setK(tmpK);	
-				
+				n.setK(tmpK);
 				kMax = _rkmax / tmpK; //per node (CP only inside)
 			}
 			
@@ -1252,14 +1261,22 @@ public class OptimizerRuleBased extends Optimizer
 			//disable nested parallelism, if required
 			if( !ALLOW_REMOTE_NESTED_PARALLELISM )
 				kMax = 1;
-					
+			
 			//distribute remaining parallelism and recompile parallel instructions
-			rAssignRemainingParallelism( n, kMax, 1 ); 
-		}		
+			rAssignRemainingParallelism( n, kMax, 1 );
+		}
 		
 		_numEvaluatedPlans++;
 		LOG.debug(getOptMode()+" OPT: rewrite 'set degree of parallelism' - result=(see EXPLAIN)" );
 	}
+	
+	private int computeMaxK(double M, double memNonShared, double memShared, double memBudget) {
+	//note: we compute max K both without and with shared reads and take the max,
+	//because the latter might otherwise reduce parallelism if shared reads don't dominate
+	int k1 = (int)Math.floor(memBudget / M);
+	int k2 = (int)Math.floor((memBudget-memShared) / memNonShared);
+		return Math.max(k1, k2);
+	}
 
 	protected void rAssignRemainingParallelism(OptNode n, int parforK, int opsK) 
 	{
@@ -1644,7 +1661,8 @@ public class OptimizerRuleBased extends Optimizer
 			double sum = computeTotalSizeResultVariables(retVars, vars, pfpb.getDegreeOfParallelism());
 		
 			//compute memory estimate without result indexing, and total sum per worker
-			double M = cost.getEstimate(TestMeasure.MEMORY_USAGE, pn, true, retVars);
+			double M = cost.getEstimate(TestMeasure.MEMORY_USAGE, pn, true, retVars.stream()
+				.map(var -> var._name).collect(Collectors.toList()), ExcludeType.RESULT_LIX);
 			totalMem = M + sum;
 			
 			//result update in-place for MR/Spark (w/ remote memory constraint)


[4/7] systemml git commit: [SYSTEMML-2260] New native tsmm matrix mult and its integration

Posted by mb...@apache.org.
[SYSTEMML-2260] New native tsmm matrix mult and its integration

This patch fixes the existing native tsmm implementation to properly use
a BLAS dsyrk instead of a BLAS dgemm, performing the computation only
for the upper triangular output matrix. Subsequently, we use the
existing cache-conscious primitive to flip the upper to the lower
triangle.

Furthermore, this patch also integrates the native tsmm implementation,
which had not been used at all so far.
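
For intuition, a dsyrk-style tsmm computes only the upper triangle of
the symmetric output and then mirrors it. The following is a minimal
pure-Java sketch of that idea (illustration only; the commit itself
calls cblas_dsyrk natively and mirrors via
LibMatrixMult.copyUpperToLowerTriangle):

  public class TsmmUpperTriangleSketch {
    public static void main(String[] args) {
      double[][] X = {{1, 2, 3}, {4, 5, 6}}; // 2 x 3 input
      int m = X.length, n = X[0].length;
      double[][] R = new double[n][n];       // R = t(X) %*% X, 3 x 3 symmetric
      for (int i = 0; i < n; i++)            // compute upper triangle only
        for (int j = i; j < n; j++)
          for (int k = 0; k < m; k++)
            R[i][j] += X[k][i] * X[k][j];
      for (int i = 0; i < n; i++)            // flip upper to lower triangle
        for (int j = i + 1; j < n; j++)
          R[j][i] = R[i][j];
      System.out.println(java.util.Arrays.deepToString(R));
    }
  }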


Project: http://git-wip-us.apache.org/repos/asf/systemml/repo
Commit: http://git-wip-us.apache.org/repos/asf/systemml/commit/2b8161db
Tree: http://git-wip-us.apache.org/repos/asf/systemml/tree/2b8161db
Diff: http://git-wip-us.apache.org/repos/asf/systemml/diff/2b8161db

Branch: refs/heads/master
Commit: 2b8161db97b55b612e70280d45fcec53421dc813
Parents: fde708f
Author: Matthias Boehm <mb...@gmail.com>
Authored: Thu Apr 19 22:22:07 2018 -0700
Committer: Matthias Boehm <mb...@gmail.com>
Committed: Thu Apr 19 22:22:07 2018 -0700

----------------------------------------------------------------------
 .../cpp/lib/libsystemml_mkl-Linux-x86_64.so     | Bin 32168 -> 32208 bytes
 .../lib/libsystemml_openblas-Linux-x86_64.so    | Bin 31240 -> 31288 bytes
 src/main/cpp/libmatrixmult.cpp                  |  10 +++----
 src/main/cpp/libmatrixmult.h                    |   2 +-
 src/main/cpp/systemml.cpp                       |   4 +--
 .../runtime/matrix/data/LibMatrixNative.java    |  27 +++++++++++++++++++
 .../sysml/runtime/matrix/data/MatrixBlock.java  |   4 ++-
 .../org/apache/sysml/utils/NativeHelper.java    |   2 +-
 8 files changed, 38 insertions(+), 11 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/systemml/blob/2b8161db/src/main/cpp/lib/libsystemml_mkl-Linux-x86_64.so
----------------------------------------------------------------------
diff --git a/src/main/cpp/lib/libsystemml_mkl-Linux-x86_64.so b/src/main/cpp/lib/libsystemml_mkl-Linux-x86_64.so
index faaf5f4..adc3bbe 100755
Binary files a/src/main/cpp/lib/libsystemml_mkl-Linux-x86_64.so and b/src/main/cpp/lib/libsystemml_mkl-Linux-x86_64.so differ

http://git-wip-us.apache.org/repos/asf/systemml/blob/2b8161db/src/main/cpp/lib/libsystemml_openblas-Linux-x86_64.so
----------------------------------------------------------------------
diff --git a/src/main/cpp/lib/libsystemml_openblas-Linux-x86_64.so b/src/main/cpp/lib/libsystemml_openblas-Linux-x86_64.so
index 16b0b5d..0b39eaa 100755
Binary files a/src/main/cpp/lib/libsystemml_openblas-Linux-x86_64.so and b/src/main/cpp/lib/libsystemml_openblas-Linux-x86_64.so differ

http://git-wip-us.apache.org/repos/asf/systemml/blob/2b8161db/src/main/cpp/libmatrixmult.cpp
----------------------------------------------------------------------
diff --git a/src/main/cpp/libmatrixmult.cpp b/src/main/cpp/libmatrixmult.cpp
index 3c669b6..773a85a 100644
--- a/src/main/cpp/libmatrixmult.cpp
+++ b/src/main/cpp/libmatrixmult.cpp
@@ -51,11 +51,9 @@ void smatmult(float* m1Ptr, float* m2Ptr, float* retPtr, int m, int k, int n, in
   cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, m, n, k, 1, m1Ptr, k, m2Ptr, n, 0, retPtr, n);
 }
 
-void tsmm(double* m1Ptr, double* retPtr, int m1rlen, int m1clen, bool isLeftTranspose, int numThreads) {
-  int m = isLeftTranspose ? m1clen : m1rlen;
-  int n = isLeftTranspose ? m1clen : m1rlen;
-  int k = isLeftTranspose ? m1rlen : m1clen;
-  
+void tsmm(double* m1Ptr, double* retPtr, int m1rlen, int m1clen, bool isLeftTrans, int numThreads) {
+  int n = isLeftTrans ? m1clen : m1rlen;
+  int k = isLeftTrans ? m1rlen : m1clen;
   setNumThreadsForBLAS(numThreads);
-  cblas_dgemm(CblasRowMajor, isLeftTranspose ? CblasTrans : CblasNoTrans, isLeftTranspose ? CblasNoTrans : CblasTrans, m, n, k, 1, m1Ptr, k, m1Ptr, n, 0, retPtr, n);
+  cblas_dsyrk(CblasRowMajor, CblasUpper, isLeftTrans ? CblasTrans : CblasNoTrans, n, k, 1, m1Ptr, n, 0, retPtr, n);
 }

http://git-wip-us.apache.org/repos/asf/systemml/blob/2b8161db/src/main/cpp/libmatrixmult.h
----------------------------------------------------------------------
diff --git a/src/main/cpp/libmatrixmult.h b/src/main/cpp/libmatrixmult.h
index b6ea1c4..1c7fcd9 100644
--- a/src/main/cpp/libmatrixmult.h
+++ b/src/main/cpp/libmatrixmult.h
@@ -56,6 +56,6 @@ void setNumThreadsForBLAS(int numThreads);
 void dmatmult(double* m1Ptr, double* m2Ptr, double* retPtr, int m, int k, int n, int numThreads);
 void smatmult(float* m1Ptr, float* m2Ptr, float* retPtr, int m, int k, int n, int numThreads);
 
-void tsmm(double* m1Ptr, double* retPtr, int m1rlen, int m1clen, bool isLeftTranspose,  int numThreads);
+void tsmm(double* m1Ptr, double* retPtr, int m1rlen, int m1clen, bool isLeftTrans, int numThreads);
 
 #endif

http://git-wip-us.apache.org/repos/asf/systemml/blob/2b8161db/src/main/cpp/systemml.cpp
----------------------------------------------------------------------
diff --git a/src/main/cpp/systemml.cpp b/src/main/cpp/systemml.cpp
index b404cc9..fae0c1e 100644
--- a/src/main/cpp/systemml.cpp
+++ b/src/main/cpp/systemml.cpp
@@ -109,13 +109,13 @@ JNIEXPORT jboolean JNICALL Java_org_apache_sysml_utils_NativeHelper_smmdd(
 }
 
 JNIEXPORT jboolean JNICALL Java_org_apache_sysml_utils_NativeHelper_tsmm
-  (JNIEnv * env, jclass cls, jdoubleArray m1, jdoubleArray ret, jint m1rlen, jint m1clen, jboolean isLeftTranspose, jint numThreads) {
+  (JNIEnv * env, jclass cls, jdoubleArray m1, jdoubleArray ret, jint m1rlen, jint m1clen, jboolean leftTrans, jint numThreads) {
   double* m1Ptr = GET_DOUBLE_ARRAY(env, m1, numThreads);
   double* retPtr = GET_DOUBLE_ARRAY(env, ret, numThreads);
   if(m1Ptr == NULL || retPtr == NULL)
   	return (jboolean) false;
 
-  tsmm(m1Ptr, retPtr, (int) m1rlen, (int) m1clen, (bool) isLeftTranspose, (int) numThreads);
+  tsmm(m1Ptr, retPtr, (int)m1rlen, (int)m1clen, (bool)leftTrans, (int)numThreads);
   
   RELEASE_INPUT_ARRAY(env, m1, m1Ptr, numThreads);
   RELEASE_ARRAY(env, ret, retPtr, numThreads);

http://git-wip-us.apache.org/repos/asf/systemml/blob/2b8161db/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixNative.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixNative.java b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixNative.java
index eade43f..cf4501f 100644
--- a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixNative.java
+++ b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixNative.java
@@ -118,6 +118,33 @@ public class LibMatrixNative
 			LibMatrixMult.matrixMult(m1, m2, ret, k);
 	}
 	
+	public static void tsmm(MatrixBlock m1, MatrixBlock ret, boolean leftTrans, int k) {
+		if( m1.isEmptyBlock(false) )
+			return;
+		if( NativeHelper.isNativeLibraryLoaded() && ret.clen > 1 
+			&& (!m1.sparse && m1.getDenseBlock().isContiguous() ) ) {
+			ret.sparse = false;
+			ret.allocateDenseBlock();
+			if( NativeHelper.tsmm(m1.getDenseBlockValues(), 
+				ret.getDenseBlockValues(), m1.rlen, m1.clen, leftTrans, k) ) 
+			{
+				long nnz = (ret.clen==1) ? ret.recomputeNonZeros() :
+					LibMatrixMult.copyUpperToLowerTriangle(ret);
+				ret.setNonZeros(nnz);
+				ret.examSparsity();
+				return;
+			}
+			else {
+				Statistics.incrementNativeFailuresCounter();
+				//fallback to default java implementation
+			}
+		}
+		if( k > 1 )
+			LibMatrixMult.matrixMultTransposeSelf(m1, ret, leftTrans, k);
+		else
+			LibMatrixMult.matrixMultTransposeSelf(m1, ret, leftTrans);
+	}
+	
 	/**
 	 * This method performs convolution (i.e. cross-correlation) operation on input
 	 * 

http://git-wip-us.apache.org/repos/asf/systemml/blob/2b8161db/src/main/java/org/apache/sysml/runtime/matrix/data/MatrixBlock.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/matrix/data/MatrixBlock.java b/src/main/java/org/apache/sysml/runtime/matrix/data/MatrixBlock.java
index f306a7a..97f883b 100644
--- a/src/main/java/org/apache/sysml/runtime/matrix/data/MatrixBlock.java
+++ b/src/main/java/org/apache/sysml/runtime/matrix/data/MatrixBlock.java
@@ -3434,7 +3434,9 @@ public class MatrixBlock extends MatrixValue implements CacheBlock, Externalizab
 			out.reset(dim, dim, false);
 		
 		//compute matrix mult
-		if( k > 1 )
+		if( NativeHelper.isNativeLibraryLoaded() )
+			LibMatrixNative.tsmm(this, out, leftTranspose, k);
+		else if( k > 1 )
 			LibMatrixMult.matrixMultTransposeSelf(this, out, leftTranspose, k);
 		else
 			LibMatrixMult.matrixMultTransposeSelf(this, out, leftTranspose);

http://git-wip-us.apache.org/repos/asf/systemml/blob/2b8161db/src/main/java/org/apache/sysml/utils/NativeHelper.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/utils/NativeHelper.java b/src/main/java/org/apache/sysml/utils/NativeHelper.java
index 86d849f..1a02e00 100644
--- a/src/main/java/org/apache/sysml/utils/NativeHelper.java
+++ b/src/main/java/org/apache/sysml/utils/NativeHelper.java
@@ -322,7 +322,7 @@ public class NativeHelper {
 	//single-precision matrix multiply dense-dense
 	public static native boolean smmdd(FloatBuffer m1, FloatBuffer m2, FloatBuffer ret, int m1rlen, int m1clen, int m2clen, int numThreads);
 	//transpose-self matrix multiply
-	private static native boolean tsmm(double [] m1, double [] ret, int m1rlen, int m1clen, boolean isLeftTranspose, int numThreads);
+	public static native boolean tsmm(double[] m1, double[] ret, int m1rlen, int m1clen, boolean leftTrans, int numThreads);
 
 	// ----------------------------------------------------------------------------------------------------------------
 	// LibMatrixDNN operations:


[7/7] systemml git commit: [SYSTEMML-2262] New multi-threaded dense unary ops (exp, log, sigmoid)

Posted by mb...@apache.org.
[SYSTEMML-2262] New multi-threaded dense unary ops (exp, log, sigmoid)

This patch introduces best-effort multi-threading for expensive dense
unary operations such as exp, log, and sigmoid. While for other unary
operations the output allocation, read, and write dominate anyway, these
operations experience a substantial speedup with multi-threading, which
we simply apply via a parallelSetAll.
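
The core mechanism is java.util.Arrays.parallelSetAll over the dense
value array. A minimal standalone sketch (the array size and the exp
operation are just examples):

  import java.util.Arrays;

  public class ParallelDenseUnarySketch {
    public static void main(String[] args) {
      double[] a = new double[16_000_000];
      Arrays.fill(a, 0.5);
      double[] c = new double[a.length];
      // best-effort multi-threading via the common fork-join pool
      Arrays.parallelSetAll(c, i -> Math.exp(a[i]));
      System.out.println(c[0]); // 1.6487... = exp(0.5)
    }
  }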


Project: http://git-wip-us.apache.org/repos/asf/systemml/repo
Commit: http://git-wip-us.apache.org/repos/asf/systemml/commit/4c7640b8
Tree: http://git-wip-us.apache.org/repos/asf/systemml/tree/4c7640b8
Diff: http://git-wip-us.apache.org/repos/asf/systemml/diff/4c7640b8

Branch: refs/heads/master
Commit: 4c7640b873dfa921409cbef375c856aa48940932
Parents: 45d86bd
Author: Matthias Boehm <mb...@gmail.com>
Authored: Fri Apr 20 01:39:15 2018 -0700
Committer: Matthias Boehm <mb...@gmail.com>
Committed: Fri Apr 20 01:39:15 2018 -0700

----------------------------------------------------------------------
 .../org/apache/sysml/hops/OptimizerUtils.java   | 16 ++++-----
 .../java/org/apache/sysml/hops/UnaryOp.java     | 36 ++++++++++++--------
 src/main/java/org/apache/sysml/lops/Unary.java  |  9 +++--
 .../parfor/opt/OptimizerRuleBased.java          |  3 +-
 .../instructions/CPInstructionParser.java       |  4 ++-
 .../instructions/cp/UnaryCPInstruction.java     |  2 +-
 .../sysml/runtime/matrix/data/MatrixBlock.java  | 17 ++++++---
 7 files changed, 54 insertions(+), 33 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/systemml/blob/4c7640b8/src/main/java/org/apache/sysml/hops/OptimizerUtils.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/hops/OptimizerUtils.java b/src/main/java/org/apache/sysml/hops/OptimizerUtils.java
index 5ae20eb..2d76759 100644
--- a/src/main/java/org/apache/sysml/hops/OptimizerUtils.java
+++ b/src/main/java/org/apache/sysml/hops/OptimizerUtils.java
@@ -414,30 +414,30 @@ public class OptimizerUtils
 	 * 
 	 * @return local memory budget
 	 */
-	public static double getLocalMemBudget()
-	{
+	public static double getLocalMemBudget() {
 		double ret = InfrastructureAnalyzer.getLocalMaxMemory();
 		return ret * OptimizerUtils.MEM_UTIL_FACTOR;
 	}
 	
-	public static double getRemoteMemBudgetMap()
-	{
+	public static double getRemoteMemBudgetMap() {
 		return getRemoteMemBudgetMap(false);
 	}
 	
-	public static double getRemoteMemBudgetMap(boolean substractSortBuffer)
-	{
+	public static double getRemoteMemBudgetMap(boolean substractSortBuffer) {
 		double ret = InfrastructureAnalyzer.getRemoteMaxMemoryMap();
 		if( substractSortBuffer )
 			ret -= InfrastructureAnalyzer.getRemoteMaxMemorySortBuffer();
 		return ret * OptimizerUtils.MEM_UTIL_FACTOR;
 	}
 
-	public static double getRemoteMemBudgetReduce()
-	{
+	public static double getRemoteMemBudgetReduce() {
 		double ret = InfrastructureAnalyzer.getRemoteMaxMemoryReduce();
 		return ret * OptimizerUtils.MEM_UTIL_FACTOR;
 	}
+	
+	public static boolean isMaxLocalParallelism(int k) {
+		return InfrastructureAnalyzer.getLocalParallelism() == k;
+	}
 
 	public static boolean checkSparkBroadcastMemoryBudget( double size )
 	{

http://git-wip-us.apache.org/repos/asf/systemml/blob/4c7640b8/src/main/java/org/apache/sysml/hops/UnaryOp.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/hops/UnaryOp.java b/src/main/java/org/apache/sysml/hops/UnaryOp.java
index a6a9d66..7b7c685 100644
--- a/src/main/java/org/apache/sysml/hops/UnaryOp.java
+++ b/src/main/java/org/apache/sysml/hops/UnaryOp.java
@@ -177,9 +177,10 @@ public class UnaryOp extends Hop implements MultiThreadedHop
 				}
 				else //default unary 
 				{
-					int k = isCumulativeUnaryOperation() ? OptimizerUtils.getConstrainedNumThreads( _maxNumThreads ) : 1;
-					Unary unary1 = new Unary(input.constructLops(), HopsOpOp1LopsU.get(_op), 
-							                 getDataType(), getValueType(), et, k);
+					int k = isCumulativeUnaryOperation() || isExpensiveUnaryOperation() ?
+						OptimizerUtils.getConstrainedNumThreads( _maxNumThreads ) : 1;
+					Unary unary1 = new Unary(input.constructLops(),
+						HopsOpOp1LopsU.get(_op), getDataType(), getValueType(), et, k);
 					setOutputDimensions(unary1);
 					setLineNumbers(unary1);
 					setLops(unary1);
@@ -612,21 +613,26 @@ public class UnaryOp extends Hop implements MultiThreadedHop
 		return ( _op == OpOp1.INVERSE );
 	}
 
-	public boolean isCumulativeUnaryOperation() 
-	{
-		return (   _op == OpOp1.CUMSUM 
-				|| _op == OpOp1.CUMPROD
-				|| _op == OpOp1.CUMMIN
-				|| _op == OpOp1.CUMMAX  );
+	public boolean isCumulativeUnaryOperation()  {
+		return (_op == OpOp1.CUMSUM 
+			|| _op == OpOp1.CUMPROD
+			|| _op == OpOp1.CUMMIN
+			|| _op == OpOp1.CUMMAX);
 	}
 
 	public boolean isCastUnaryOperation() {
-		return (   _op == OpOp1.CAST_AS_MATRIX
-				|| _op == OpOp1.CAST_AS_SCALAR
-				|| _op == OpOp1.CAST_AS_FRAME
-				|| _op == OpOp1.CAST_AS_BOOLEAN
-				|| _op == OpOp1.CAST_AS_DOUBLE
-				|| _op == OpOp1.CAST_AS_INT    );
+		return (_op == OpOp1.CAST_AS_MATRIX
+			|| _op == OpOp1.CAST_AS_SCALAR
+			|| _op == OpOp1.CAST_AS_FRAME
+			|| _op == OpOp1.CAST_AS_BOOLEAN
+			|| _op == OpOp1.CAST_AS_DOUBLE
+			|| _op == OpOp1.CAST_AS_INT);
+	}
+	
+	public boolean isExpensiveUnaryOperation()  {
+		return (_op == OpOp1.EXP 
+			|| _op == OpOp1.LOG
+			|| _op == OpOp1.SIGMOID);
 	}
 	
 	@Override

http://git-wip-us.apache.org/repos/asf/systemml/blob/4c7640b8/src/main/java/org/apache/sysml/lops/Unary.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/lops/Unary.java b/src/main/java/org/apache/sysml/lops/Unary.java
index 7c1ceeb..a5403b2 100644
--- a/src/main/java/org/apache/sysml/lops/Unary.java
+++ b/src/main/java/org/apache/sysml/lops/Unary.java
@@ -325,11 +325,14 @@ public class Unary extends Lop
 		}
 	}
 	
-	public static boolean isCumulativeOp(OperationTypes op) {
+	public static boolean isMultiThreadedOp(OperationTypes op) {
 		return op==OperationTypes.CUMSUM
 			|| op==OperationTypes.CUMPROD
 			|| op==OperationTypes.CUMMIN
-			|| op==OperationTypes.CUMMAX;
+			|| op==OperationTypes.CUMMAX
+			|| op==OperationTypes.EXP
+			|| op==OperationTypes.LOG
+			|| op==OperationTypes.SIGMOID;
 	}
 	
 	@Override
@@ -351,7 +354,7 @@ public class Unary extends Lop
 		sb.append( prepOutputOperand(output) );
 		
 		//num threads for cumulative cp ops
-		if( getExecType() == ExecType.CP && isCumulativeOp(operation) ) {
+		if( getExecType() == ExecType.CP && isMultiThreadedOp(operation) ) {
 			sb.append( OPERAND_DELIMITOR );
 			sb.append( _numThreads );
 		}

http://git-wip-us.apache.org/repos/asf/systemml/blob/4c7640b8/src/main/java/org/apache/sysml/runtime/controlprogram/parfor/opt/OptimizerRuleBased.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/controlprogram/parfor/opt/OptimizerRuleBased.java b/src/main/java/org/apache/sysml/runtime/controlprogram/parfor/opt/OptimizerRuleBased.java
index e13f2a7..a7f5834 100644
--- a/src/main/java/org/apache/sysml/runtime/controlprogram/parfor/opt/OptimizerRuleBased.java
+++ b/src/main/java/org/apache/sysml/runtime/controlprogram/parfor/opt/OptimizerRuleBased.java
@@ -1317,7 +1317,8 @@ public class OptimizerRuleBased extends Optimizer
 							 && !HopRewriteUtils.isValidOp(((ParameterizedBuiltinOp)h).getOp(), 
 								ParamBuiltinOp.GROUPEDAGG, ParamBuiltinOp.REXPAND))
 						&& !( h instanceof UnaryOp //only unaryop-cumulativeagg
-							 && !((UnaryOp)h).isCumulativeUnaryOperation() )
+							 && !((UnaryOp)h).isCumulativeUnaryOperation()
+							 && !((UnaryOp)h).isExpensiveUnaryOperation())
 						&& !( h instanceof ReorgOp //only reorgop-transpose
 							 && ((ReorgOp)h).getOp() != ReOrgOp.TRANS ))
 					{

http://git-wip-us.apache.org/repos/asf/systemml/blob/4c7640b8/src/main/java/org/apache/sysml/runtime/instructions/CPInstructionParser.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/instructions/CPInstructionParser.java b/src/main/java/org/apache/sysml/runtime/instructions/CPInstructionParser.java
index 9dbb24e..395a4ec 100644
--- a/src/main/java/org/apache/sysml/runtime/instructions/CPInstructionParser.java
+++ b/src/main/java/org/apache/sysml/runtime/instructions/CPInstructionParser.java
@@ -64,6 +64,7 @@ import org.apache.sysml.runtime.instructions.cp.UaggOuterChainCPInstruction;
 import org.apache.sysml.runtime.instructions.cp.UnaryCPInstruction;
 import org.apache.sysml.runtime.instructions.cp.VariableCPInstruction;
 import org.apache.sysml.runtime.instructions.cpfile.MatrixIndexingCPFileInstruction;
+import org.apache.sysml.runtime.util.UtilFunctions;
 
 public class CPInstructionParser extends InstructionParser 
 {
@@ -385,7 +386,8 @@ public class CPInstructionParser extends InstructionParser
 			case Builtin: 
 				String []parts = InstructionUtils.getInstructionPartsWithValueType(str);
 				if ( parts[0].equals("log") || parts[0].equals("log_nz") ) {
-					if ( parts.length == 3 ) {
+					if ( parts.length == 3 || (parts.length == 4 &&
+						UtilFunctions.isIntegerNumber(parts[3])) ) {
 						// B=log(A), y=log(x)
 						return UnaryCPInstruction.parseInstruction(str);
 					} else if ( parts.length == 4 ) {

http://git-wip-us.apache.org/repos/asf/systemml/blob/4c7640b8/src/main/java/org/apache/sysml/runtime/instructions/cp/UnaryCPInstruction.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/instructions/cp/UnaryCPInstruction.java b/src/main/java/org/apache/sysml/runtime/instructions/cp/UnaryCPInstruction.java
index 07be2a7..8e023ea 100644
--- a/src/main/java/org/apache/sysml/runtime/instructions/cp/UnaryCPInstruction.java
+++ b/src/main/java/org/apache/sysml/runtime/instructions/cp/UnaryCPInstruction.java
@@ -62,7 +62,7 @@ public abstract class UnaryCPInstruction extends ComputationCPInstruction {
 			out.split(parts[2]);
 			func = Builtin.getBuiltinFnObject(opcode);
 			
-			if( Arrays.asList(new String[]{"ucumk+","ucum*","ucummin","ucummax"}).contains(opcode) )
+			if( Arrays.asList(new String[]{"ucumk+","ucum*","ucummin","ucummax","exp","log","sigmoid"}).contains(opcode) )
 				return new UnaryMatrixCPInstruction(new UnaryOperator(func,Integer.parseInt(parts[3])), in, out, opcode, str); 
 			else
 				return new UnaryScalarCPInstruction(null, in, out, opcode, str);

http://git-wip-us.apache.org/repos/asf/systemml/blob/4c7640b8/src/main/java/org/apache/sysml/runtime/matrix/data/MatrixBlock.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/matrix/data/MatrixBlock.java b/src/main/java/org/apache/sysml/runtime/matrix/data/MatrixBlock.java
index f738227..bb5e79b 100644
--- a/src/main/java/org/apache/sysml/runtime/matrix/data/MatrixBlock.java
+++ b/src/main/java/org/apache/sysml/runtime/matrix/data/MatrixBlock.java
@@ -2577,16 +2577,25 @@ public class MatrixBlock extends MatrixValue implements CacheBlock, Externalizab
 			ret.reset(rlen, clen, sp);
 		
 		//core execute
-		if( LibMatrixAgg.isSupportedUnaryOperator(op) ) 
-		{
+		if( LibMatrixAgg.isSupportedUnaryOperator(op) ) {
 			//e.g., cumsum/cumprod/cummin/cumax
 			if( op.getNumThreads() > 1 )
 				LibMatrixAgg.cumaggregateUnaryMatrix(this, ret, op, op.getNumThreads());
 			else
 				LibMatrixAgg.cumaggregateUnaryMatrix(this, ret, op);
 		}
-		else
-		{
+		else if(!sparse && !isEmptyBlock(false) && getDenseBlock().isContiguous()
+			&& OptimizerUtils.isMaxLocalParallelism(op.getNumThreads())) {
+			//note: we apply multi-threading in a best-effort manner here
+			//only for expensive operators such as exp, log, sigmoid, because
+			//otherwise allocation, read and write anyway dominates
+			ret.allocateDenseBlock(false);
+			double[] a = getDenseBlockValues();
+			double[] c = ret.getDenseBlockValues();
+			Arrays.parallelSetAll(c, i -> op.fn.execute(a[i]));
+			ret.recomputeNonZeros();
+		}
+		else {
 			//default execute unary operations
 			if(op.sparseSafe)
 				sparseUnaryOperations(op, ret);


[5/7] systemml git commit: [SYSTEMML-2261] Performance sparse tsmm (CSR with empty/dense rows)

Posted by mb...@apache.org.
[SYSTEMML-2261] Performance sparse tsmm (CSR with empty/dense rows)

This patch significantly improves performance for sparse tsmm over
special sparse matrices in CSR format, where all rows are either empty
or completely dense. The trick is that we can simply take the CSR values
array, count the number of dense rows, and use it as a "dense" block for
the existing dense tsmm operations. This greatly improves performance
because the existing dense operations usually run much closer to peak
performance, due to better cache-conscious implementations and the
avoidance of unnecessary gather and scatter operations.
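
A minimal sketch of the underlying observation, using raw CSR arrays
with hypothetical toy data: if every non-empty row holds exactly clen
values, the CSR values array is already a row-major dense block of
(nnz/clen) x clen.

  import java.util.stream.IntStream;

  public class CsrDenseViewSketch {
    public static void main(String[] args) {
      int clen = 3;
      int[] rowPtr = {0, 3, 3, 6};        // rows 0 and 2 dense, row 1 empty
      double[] vals = {1, 2, 3, 4, 5, 6}; // CSR values, row-major for dense rows
      boolean emptyOrDense = IntStream.range(0, rowPtr.length - 1)
        .allMatch(i -> rowPtr[i + 1] - rowPtr[i] == 0
                    || rowPtr[i + 1] - rowPtr[i] == clen);
      if (emptyOrDense) {
        int rows = vals.length / clen;    // number of non-empty (dense) rows
        // 'vals' can be used directly as a rows x clen dense block (shallow copy)
        System.out.println("dense view: " + rows + " x " + clen);
      }
    }
  }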


Project: http://git-wip-us.apache.org/repos/asf/systemml/repo
Commit: http://git-wip-us.apache.org/repos/asf/systemml/commit/ba06d053
Tree: http://git-wip-us.apache.org/repos/asf/systemml/tree/ba06d053
Diff: http://git-wip-us.apache.org/repos/asf/systemml/diff/ba06d053

Branch: refs/heads/master
Commit: ba06d0534ba07a03ee1a9ed15c2034043b28c928
Parents: 2b8161d
Author: Matthias Boehm <mb...@gmail.com>
Authored: Fri Apr 20 00:02:12 2018 -0700
Committer: Matthias Boehm <mb...@gmail.com>
Committed: Fri Apr 20 00:02:12 2018 -0700

----------------------------------------------------------------------
 .../runtime/matrix/data/LibMatrixMult.java      | 65 +++++++++++-----
 .../sysml/runtime/matrix/data/MatrixBlock.java  | 11 ++-
 ...llMatrixMultiplicationTransposeSelfTest.java | 80 ++++++--------------
 3 files changed, 76 insertions(+), 80 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/systemml/blob/ba06d053/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixMult.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixMult.java b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixMult.java
index 3a2c58e..622f45c 100644
--- a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixMult.java
+++ b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixMult.java
@@ -26,6 +26,7 @@ import java.util.concurrent.Callable;
 import java.util.concurrent.ExecutionException;
 import java.util.concurrent.ExecutorService;
 import java.util.concurrent.Future;
+import java.util.stream.IntStream;
 
 import org.apache.commons.math3.util.FastMath;
 import org.apache.sysml.hops.OptimizerUtils;
@@ -337,7 +338,6 @@ public class LibMatrixMult
 		//Timing time = new Timing(true);
 		
 		//pre-processing
-		m1 = prepMatrixMultTransposeSelfInput(m1, leftTranspose);
 		ret.sparse = false;
 		ret.allocateDenseBlock();
 
@@ -370,7 +370,6 @@ public class LibMatrixMult
 		//Timing time = new Timing(true);
 		
 		//pre-processing (no need to check isThreadSafe)
-		m1 = prepMatrixMultTransposeSelfInput(m1, leftTranspose);
 		ret.sparse = false;
 		ret.allocateDenseBlock();
 	
@@ -1839,28 +1838,36 @@ public class LibMatrixMult
 		SparseBlock a = m1.sparseBlock;
 		DenseBlock c = ret.getDenseBlock();
 		int m = m1.rlen;
-
+		
 		if( leftTranspose ) // t(X)%*%X 
 		{
 			//only general case (because vectors always dense)
 			//algorithm: scan rows, foreach row self join (KIJ)
-			if( LOW_LEVEL_OPTIMIZATION )
-			{
-				int arlen = a.numRows();
+			if( LOW_LEVEL_OPTIMIZATION ) {
+				final int n = m1.clen;
+				final int arlen = a.numRows();
 				for( int r=0; r<arlen; r++ ) {
 					if( a.isEmpty(r) ) continue;
-					int apos = a.pos(r);
 					int alen = a.size(r);
-					int[] aix = a.indexes(r);
 					double[] avals = a.values(r);
-					int rlix = (rl==0) ? 0 : a.posFIndexGTE(r, rl);
-					rlix = (rlix>=0) ? apos+rlix : apos+alen;
-					int len = apos + alen;
-					for(int i = rlix; i < len && aix[i]<ru; i++) {
-						double val = avals[i];
-						if( val != 0 )
-							vectMultiplyAdd(val, avals, c.values(aix[i]),
-								aix, i, c.pos(aix[i]), len-i);
+					if( alen == n ) { //dense row
+						for( int i=rl; i<ru; i++ ) {
+							vectMultiplyAdd(avals[i], avals,
+								c.values(i), i, c.pos(i), n-i);
+						}
+					}
+					else { //non-full sparse row
+						int apos = a.pos(r);
+						int[] aix = a.indexes(r);
+						int rlix = (rl==0) ? 0 : a.posFIndexGTE(r, rl);
+						rlix = (rlix>=0) ? apos+rlix : apos+alen;
+						int len = apos + alen;
+						for(int i = rlix; i < len && aix[i]<ru; i++) {
+							double val = avals[i];
+							if( val != 0 )
+								vectMultiplyAdd(val, avals, c.values(aix[i]),
+									aix, i, c.pos(aix[i]), len-i);
+						}
 					}
 				}
 			}
@@ -3666,16 +3673,34 @@ public class LibMatrixMult
 		return nnz;
 	}
 
-	private static MatrixBlock prepMatrixMultTransposeSelfInput( MatrixBlock m1, boolean leftTranspose ) {
+	public static MatrixBlock prepMatrixMultTransposeSelfInput( MatrixBlock m1, boolean leftTranspose, boolean par ) {
 		MatrixBlock ret = m1;
+		final int rlen = m1.rlen;
+		final int clen = m1.clen;
 		
-		if( !leftTranspose && m1.sparse && m1.rlen > 1) //X%*%t(X) SPARSE MATRIX
-		{	
+		if( !leftTranspose && m1.sparse && rlen > 1) { //X%*%t(X) SPARSE MATRIX
 			//directly via LibMatrixReorg in order to prevent sparsity change
-			MatrixBlock tmpBlock = new MatrixBlock(m1.clen, m1.rlen, m1.sparse);
+			MatrixBlock tmpBlock = new MatrixBlock(clen, rlen, m1.sparse);
 			LibMatrixReorg.reorg(m1, tmpBlock, new ReorgOperator(SwapIndex.getSwapIndexFnObject()));
 			ret = tmpBlock;
 		}
+		else if( leftTranspose && m1.sparse && m1.sparseBlock instanceof SparseBlockCSR ) {
+			//for a special case of CSR inputs where all non-empty rows are dense, we can
+			//create a shallow copy of the values arrays to a "dense" block and perform
+			//tsmm with the existing dense block operations w/o unnecessary gather/scatter
+			SparseBlockCSR sblock = (SparseBlockCSR)m1.sparseBlock;
+			boolean convertDense = (par ?
+				IntStream.range(0, rlen).parallel() : IntStream.range(0, rlen))
+				.allMatch(i -> sblock.isEmpty(i) || sblock.size(i)==clen );
+			if( convertDense ) {
+				int rows = (int) sblock.size() / clen;
+				MatrixBlock tmpBlock = new MatrixBlock(rows, clen, false);
+				tmpBlock.denseBlock = DenseBlockFactory
+					.createDenseBlock(sblock.values(), rows, clen);
+				tmpBlock.setNonZeros(m1.nonZeros);
+				ret = tmpBlock;
+			}
+		}
 		
 		return ret;
 	}

http://git-wip-us.apache.org/repos/asf/systemml/blob/ba06d053/src/main/java/org/apache/sysml/runtime/matrix/data/MatrixBlock.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/matrix/data/MatrixBlock.java b/src/main/java/org/apache/sysml/runtime/matrix/data/MatrixBlock.java
index 97f883b..f738227 100644
--- a/src/main/java/org/apache/sysml/runtime/matrix/data/MatrixBlock.java
+++ b/src/main/java/org/apache/sysml/runtime/matrix/data/MatrixBlock.java
@@ -3433,13 +3433,18 @@ public class MatrixBlock extends MatrixValue implements CacheBlock, Externalizab
 		else
 			out.reset(dim, dim, false);
 		
+		//pre-processing (outside LibMatrixMult for seamless integration
+		//with native BLAS library, e.g., for sparse-dense conversion)
+		MatrixBlock m1 = LibMatrixMult
+			.prepMatrixMultTransposeSelfInput(this, leftTranspose, k > 1);
+		
 		//compute matrix mult
 		if( NativeHelper.isNativeLibraryLoaded() )
-			LibMatrixNative.tsmm(this, out, leftTranspose, k);
+			LibMatrixNative.tsmm(m1, out, leftTranspose, k);
 		else if( k > 1 )
-			LibMatrixMult.matrixMultTransposeSelf(this, out, leftTranspose, k);
+			LibMatrixMult.matrixMultTransposeSelf(m1, out, leftTranspose, k);
 		else
-			LibMatrixMult.matrixMultTransposeSelf(this, out, leftTranspose);
+			LibMatrixMult.matrixMultTransposeSelf(m1, out, leftTranspose);
 		
 		return out;
 	}

http://git-wip-us.apache.org/repos/asf/systemml/blob/ba06d053/src/test/java/org/apache/sysml/test/integration/functions/binary/matrix_full_other/FullMatrixMultiplicationTransposeSelfTest.java
----------------------------------------------------------------------
diff --git a/src/test/java/org/apache/sysml/test/integration/functions/binary/matrix_full_other/FullMatrixMultiplicationTransposeSelfTest.java b/src/test/java/org/apache/sysml/test/integration/functions/binary/matrix_full_other/FullMatrixMultiplicationTransposeSelfTest.java
index db0d0e0..5ae480e 100644
--- a/src/test/java/org/apache/sysml/test/integration/functions/binary/matrix_full_other/FullMatrixMultiplicationTransposeSelfTest.java
+++ b/src/test/java/org/apache/sysml/test/integration/functions/binary/matrix_full_other/FullMatrixMultiplicationTransposeSelfTest.java
@@ -35,7 +35,6 @@ import org.apache.sysml.test.utils.TestUtils;
 
 public class FullMatrixMultiplicationTransposeSelfTest extends AutomatedTestBase 
 {
-
 	private final static String TEST_NAME1 = "TransposeSelfMatrixMultiplication1";
 	private final static String TEST_NAME2 = "TransposeSelfMatrixMultiplication2";
 	private final static String TEST_DIR = "functions/binary/matrix_full_other/";
@@ -46,7 +45,7 @@ public class FullMatrixMultiplicationTransposeSelfTest extends AutomatedTestBase
 	private final static int rows1 = 3500;
 	private final static int cols1 = 1500;
 	//for MR
-	private final static int rows2 = 7000;//7000;  
+	private final static int rows2 = 7000;//7000;
 	private final static int cols2 = 750;//750; 
 	
 	private final static double sparsity1 = 0.7;
@@ -70,121 +69,97 @@ public class FullMatrixMultiplicationTransposeSelfTest extends AutomatedTestBase
 	}
 
 	@BeforeClass
-	public static void init()
-	{
+	public static void init() {
 		TestUtils.clearDirectory(TEST_DATA_DIR + TEST_CLASS_DIR);
 	}
 
 	@AfterClass
-	public static void cleanUp()
-	{
+	public static void cleanUp() {
 		if (TEST_CACHE_ENABLED) {
 			TestUtils.clearDirectory(TEST_DATA_DIR + TEST_CLASS_DIR);
 		}
 	}
 
 	@Test
-	public void testMMLeftDenseCP() 
-	{
+	public void testMMLeftDenseCP() {
 		runTransposeSelfMatrixMultiplicationTest(MMTSJType.LEFT, ExecType.CP, false);
 	}
 	
 	@Test
-	public void testMMRightDenseCP() 
-	{
+	public void testMMRightDenseCP() {
 		runTransposeSelfMatrixMultiplicationTest(MMTSJType.RIGHT, ExecType.CP, false);
 	}
 
 	@Test
-	public void testMMLeftSparseCP() 
-	{
+	public void testMMLeftSparseCP() {
 		runTransposeSelfMatrixMultiplicationTest(MMTSJType.LEFT, ExecType.CP, true);
 	}
 	
 	@Test
-	public void testMMRightSparseCP() 
-	{
+	public void testMMRightSparseCP() {
 		runTransposeSelfMatrixMultiplicationTest(MMTSJType.RIGHT, ExecType.CP, true);
 	}
 	
 	@Test
-	public void testMMLeftDenseMR() 
-	{
+	public void testMMLeftDenseMR() {
 		runTransposeSelfMatrixMultiplicationTest(MMTSJType.LEFT, ExecType.MR, false);
 	}
 	
 	@Test
-	public void testMMRightDenseMR() 
-	{
+	public void testMMRightDenseMR() {
 		runTransposeSelfMatrixMultiplicationTest(MMTSJType.RIGHT, ExecType.MR, false);
 	}
 
 	@Test
-	public void testMMLeftSparseMR() 
-	{
+	public void testMMLeftSparseMR() {
 		runTransposeSelfMatrixMultiplicationTest(MMTSJType.LEFT, ExecType.MR, true);
 	}
 	
 	@Test
-	public void testMMRightSparseMR() 
-	{
+	public void testMMRightSparseMR() {
 		runTransposeSelfMatrixMultiplicationTest(MMTSJType.RIGHT, ExecType.MR, true);
 	}	
 	
 	@Test
-	public void testVVLeftDenseCP() 
-	{
+	public void testVVLeftDenseCP() {
 		runTransposeSelfVectorMultiplicationTest(MMTSJType.LEFT, ExecType.CP, false);
 	}
 	
 	@Test
-	public void testVVRightDenseCP() 
-	{
+	public void testVVRightDenseCP() {
 		runTransposeSelfVectorMultiplicationTest(MMTSJType.RIGHT, ExecType.CP, false);
 	}
 
 	@Test
-	public void testVVLeftSparseCP() 
-	{
+	public void testVVLeftSparseCP() {
 		runTransposeSelfVectorMultiplicationTest(MMTSJType.LEFT, ExecType.CP, true);
 	}
 	
 	@Test
-	public void testVVRightSparseCP() 
-	{
+	public void testVVRightSparseCP() {
 		runTransposeSelfVectorMultiplicationTest(MMTSJType.RIGHT, ExecType.CP, true);
 	}
 	
 	@Test
-	public void testVVLeftDenseMR() 
-	{
+	public void testVVLeftDenseMR() {
 		runTransposeSelfVectorMultiplicationTest(MMTSJType.LEFT, ExecType.MR, false);
 	}
 	
 	@Test
-	public void testVVRightDenseMR() 
-	{
+	public void testVVRightDenseMR() {
 		runTransposeSelfVectorMultiplicationTest(MMTSJType.RIGHT, ExecType.MR, false);
 	}
 
 	@Test
-	public void testVVLeftSparseMR() 
-	{
+	public void testVVLeftSparseMR() {
 		runTransposeSelfVectorMultiplicationTest(MMTSJType.LEFT, ExecType.MR, true);
 	}
 	
 	@Test
-	public void testVVRightSparseMR() 
-	{
+	public void testVVRightSparseMR() {
 		runTransposeSelfVectorMultiplicationTest(MMTSJType.RIGHT, ExecType.MR, true);
 	}
 
-	/**
-	 * 
-	 * @param type
-	 * @param instType
-	 * @param sparse
-	 */
 	private void runTransposeSelfMatrixMultiplicationTest( MMTSJType type, ExecType instType, boolean sparse )
 	{
 		//setup exec type, rows, cols
@@ -251,18 +226,11 @@ public class FullMatrixMultiplicationTransposeSelfTest extends AutomatedTestBase
 			HashMap<CellIndex, Double> rfile  = readRMatrixFromFS("B");
 			TestUtils.compareMatrices(dmlfile, rfile, eps, "Stat-DML", "Stat-R");
 		}
-		finally
-		{
+		finally {
 			rtplatform = platformOld;
 		}
 	}
 	
-	/**
-	 * 
-	 * @param type
-	 * @param instType
-	 * @param sparse
-	 */
 	private void runTransposeSelfVectorMultiplicationTest( MMTSJType type, ExecType instType, boolean sparse )
 	{
 		//setup exec type, rows, cols
@@ -329,10 +297,8 @@ public class FullMatrixMultiplicationTransposeSelfTest extends AutomatedTestBase
 			HashMap<CellIndex, Double> rfile  = readRMatrixFromFS("B");
 			TestUtils.compareMatrices(dmlfile, rfile, eps, "Stat-DML", "Stat-R");
 		}
-		finally
-		{
+		finally {
 			rtplatform = platformOld;
 		}
-	}	
-	
-}
\ No newline at end of file
+	}
+}


[6/7] systemml git commit: [SYSTEMML-2263] Fix too conservative sparse block size estimates

Posted by mb...@apache.org.
[SYSTEMML-2263] Fix too conservative sparse block size estimates

This patch fixes outdated and too conservative size estimates of all
sparse block formats. In contrast to previous assumptions, we now assume
an array to require a header of 24 bytes and every object 16 bytes
(which is still sufficiently conservative w.r.t. potential padding).
Overall, this improves performance in two dimensions: (1) it avoids
unnecessary distributed operations when the sparse matrix would fit in
CP, and (2) it avoids unnecessary buffer pool serialization due to
exceeded in-memory/on-disk size overhead thresholds.
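
As a concrete instance of the revised constants, the CSR estimate from
this patch can be reproduced as follows (the row and non-zero counts are
hypothetical):

  public class SparseBlockCsrSizeSketch {
    public static void main(String[] args) {
      long nrows = 1000, lnnz = 10000;
      double size = 16 + 4;          // object header + int field
      size += 24 + (nrows + 1) * 4d; // row pointer array (int)
      size += 24 + lnnz * 4d;        // column index array (int)
      size += 24 + lnnz * 8d;        // values array (double)
      System.out.println((long) size + " bytes"); // 124096 bytes
    }
  }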


Project: http://git-wip-us.apache.org/repos/asf/systemml/repo
Commit: http://git-wip-us.apache.org/repos/asf/systemml/commit/45d86bd2
Tree: http://git-wip-us.apache.org/repos/asf/systemml/tree/45d86bd2
Diff: http://git-wip-us.apache.org/repos/asf/systemml/diff/45d86bd2

Branch: refs/heads/master
Commit: 45d86bd20bac85bd1129813219925a2a8cbdf45a
Parents: ba06d05
Author: Matthias Boehm <mb...@gmail.com>
Authored: Fri Apr 20 00:37:39 2018 -0700
Committer: Matthias Boehm <mb...@gmail.com>
Committed: Fri Apr 20 00:37:39 2018 -0700

----------------------------------------------------------------------
 .../sysml/runtime/matrix/data/SparseBlockCOO.java       |  6 +++---
 .../sysml/runtime/matrix/data/SparseBlockCSR.java       |  6 +++---
 .../sysml/runtime/matrix/data/SparseBlockMCSR.java      | 12 ++++++------
 3 files changed, 12 insertions(+), 12 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/systemml/blob/45d86bd2/src/main/java/org/apache/sysml/runtime/matrix/data/SparseBlockCOO.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/matrix/data/SparseBlockCOO.java b/src/main/java/org/apache/sysml/runtime/matrix/data/SparseBlockCOO.java
index b2a234a..1c8e3fe 100644
--- a/src/main/java/org/apache/sysml/runtime/matrix/data/SparseBlockCOO.java
+++ b/src/main/java/org/apache/sysml/runtime/matrix/data/SparseBlockCOO.java
@@ -146,9 +146,9 @@ public class SparseBlockCOO extends SparseBlock
 		
 		//32B overhead per array, int/int/double arr in nnz 
 		double size = 16 + 8;   //object + 2 int fields
-		size += 32 + lnnz * 4d; //rindexes array (row indexes)
-		size += 32 + lnnz * 4d; //cindexes array (column indexes)
-		size += 32 + lnnz * 8d; //values array (non-zero values)
+		size += 24 + lnnz * 4d; //rindexes array (row indexes)
+		size += 24 + lnnz * 4d; //cindexes array (column indexes)
+		size += 24 + lnnz * 8d; //values array (non-zero values)
 		
 		//robustness for long overflows
 		return (long) Math.min(size, Long.MAX_VALUE);

http://git-wip-us.apache.org/repos/asf/systemml/blob/45d86bd2/src/main/java/org/apache/sysml/runtime/matrix/data/SparseBlockCSR.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/matrix/data/SparseBlockCSR.java b/src/main/java/org/apache/sysml/runtime/matrix/data/SparseBlockCSR.java
index 6bbc81d..1365f95 100644
--- a/src/main/java/org/apache/sysml/runtime/matrix/data/SparseBlockCSR.java
+++ b/src/main/java/org/apache/sysml/runtime/matrix/data/SparseBlockCSR.java
@@ -269,9 +269,9 @@ public class SparseBlockCSR extends SparseBlock
 		
 		//32B overhead per array, int arr in nrows, int/double arr in nnz 
 		double size = 16 + 4;        //object + int field
-		size += 32 + (nrows+1) * 4d; //ptr array (row pointers)
-		size += 32 + lnnz * 4d;      //indexes array (column indexes)
-		size += 32 + lnnz * 8d;      //values array (non-zero values)
+		size += 24 + (nrows+1) * 4d; //ptr array (row pointers)
+		size += 24 + lnnz * 4d;      //indexes array (column indexes)
+		size += 24 + lnnz * 8d;      //values array (non-zero values)
 		
 		//robustness for long overflows
 		return (long) Math.min(size, Long.MAX_VALUE);

http://git-wip-us.apache.org/repos/asf/systemml/blob/45d86bd2/src/main/java/org/apache/sysml/runtime/matrix/data/SparseBlockMCSR.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/matrix/data/SparseBlockMCSR.java b/src/main/java/org/apache/sysml/runtime/matrix/data/SparseBlockMCSR.java
index fe63f2b..4cbf49a 100644
--- a/src/main/java/org/apache/sysml/runtime/matrix/data/SparseBlockMCSR.java
+++ b/src/main/java/org/apache/sysml/runtime/matrix/data/SparseBlockMCSR.java
@@ -102,14 +102,14 @@ public class SparseBlockMCSR extends SparseBlock
 		double cnnz = Math.max(SparseRowVector.initialCapacity, Math.ceil(sparsity*ncols));
 		double rlen = Math.min(nrows, Math.ceil(sparsity*nrows*ncols));
 		
-		//Each sparse row has a fixed overhead of 8B (reference) + 32B (object) +
-		//12B (3 int members), 32B (overhead int array), 32B (overhead double array),
+		//Each sparse row has a fixed overhead of 16B (object) + 12B (3 ints),
+		//24B (int array), 24B (double array), i.e., in total 76B
 		//Each non-zero value requires 12B for the column-index/value pair.
 		//Overheads for arrays, objects, and references refer to 64bit JVMs
-		//If nnz < than rows we have only also empty rows.
-		double size = 16;                 //object
-		size += rlen * (116 + cnnz * 12); //sparse rows
-		size += 32 + nrows * 8d;          //references
+		//If nnz < rows we have guaranteed also empty rows.
+		double size = 16;                //object
+		size += 24 + nrows * 8d;         //references
+		size += rlen * (76 + cnnz * 12); //sparse rows
 		
 		// robustness for long overflows
 		return (long) Math.min(size, Long.MAX_VALUE);


[2/7] systemml git commit: [SYSTEMML-2258] Fine tuning and cleanup bufferpool (thread contention)

Posted by mb...@apache.org.
[SYSTEMML-2258] Fine tuning and cleanup bufferpool (thread contention)

This patch makes some additional improvements to the buffer pool
primitives acquireRead, acquireModify, and release, which further reduce
unnecessary thread contention on shared objects. As a by-product, it
also improves the scope of the related stats reporting and cleans up
these primitives to simplify their maintenance.
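
The underlying pattern is to keep only the per-object state transition
inside the synchronized section and to move thread-local bookkeeping
(e.g., pin-status updates) outside of it. A minimal sketch of this
pattern, with illustrative class and method names rather than the exact
SystemML signatures:

	public class CachedObjectSketch<T> {
		private T _data;

		//short critical section: only the shared state transition is synchronized
		public T acquireModify(T newData) {
			T ret = acquireModifyIntern(newData);
			//thread-local bookkeeping outside the critical section
			updateThreadLocalStatus(true);
			return ret;
		}

		private synchronized T acquireModifyIntern(T newData) {
			if( newData == null )
				throw new IllegalArgumentException("acquireModify with empty cache block.");
			return _data = newData;
		}

		protected void updateThreadLocalStatus(boolean pinned) {
			//placeholder: per-thread status update, no shared lock required
		}
	}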


Project: http://git-wip-us.apache.org/repos/asf/systemml/repo
Commit: http://git-wip-us.apache.org/repos/asf/systemml/commit/5590513d
Tree: http://git-wip-us.apache.org/repos/asf/systemml/tree/5590513d
Diff: http://git-wip-us.apache.org/repos/asf/systemml/diff/5590513d

Branch: refs/heads/master
Commit: 5590513d7d4ec8acad218bee7f2c3239c13042eb
Parents: 9a08915
Author: Matthias Boehm <mb...@gmail.com>
Authored: Thu Apr 19 19:27:26 2018 -0700
Committer: Matthias Boehm <mb...@gmail.com>
Committed: Thu Apr 19 19:27:26 2018 -0700

----------------------------------------------------------------------
 .../sysml/debug/DMLDebuggerFunctions.java       |  15 +-
 .../apache/sysml/hops/cost/CostEstimator.java   |   3 +-
 .../controlprogram/caching/CacheableData.java   | 192 +++++--------------
 .../context/ExecutionContext.java               |  34 ++--
 .../parfor/opt/OptimizerRuleBased.java          |   2 +-
 5 files changed, 80 insertions(+), 166 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/systemml/blob/5590513d/src/main/java/org/apache/sysml/debug/DMLDebuggerFunctions.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/debug/DMLDebuggerFunctions.java b/src/main/java/org/apache/sysml/debug/DMLDebuggerFunctions.java
index d32d0e5..c34272f 100644
--- a/src/main/java/org/apache/sysml/debug/DMLDebuggerFunctions.java
+++ b/src/main/java/org/apache/sysml/debug/DMLDebuggerFunctions.java
@@ -31,6 +31,7 @@ import org.apache.sysml.lops.Lop;
 import org.apache.sysml.parser.Expression.DataType;
 import org.apache.sysml.runtime.controlprogram.LocalVariableMap;
 import org.apache.sysml.runtime.controlprogram.caching.MatrixObject;
+import org.apache.sysml.runtime.controlprogram.caching.CacheableData.CacheStatus;
 import org.apache.sysml.runtime.instructions.Instruction;
 import org.apache.sysml.runtime.instructions.MRJobInstruction;
 import org.apache.sysml.runtime.instructions.cp.BreakPointInstruction;
@@ -291,7 +292,7 @@ public class DMLDebuggerFunctions {
 						
 						try {
 							mo = (MatrixObject) variables.get(varname);
-							if (mo.getStatusAsString().equals("EMPTY") && (OptimizerUtils.estimateSizeExactSparsity(mo.getNumRows(), mo.getNumColumns(), mo.getSparsity()) > OptimizerUtils.getLocalMemBudget())) {
+							if (mo.getStatus()==CacheStatus.EMPTY && (OptimizerUtils.estimateSizeExactSparsity(mo.getNumRows(), mo.getNumColumns(), mo.getSparsity()) > OptimizerUtils.getLocalMemBudget())) {
 								//TODO @jlugoma Need to add functionality to bring and display a block. 
 								System.err.println("ERROR: Matrix dimensions are too large to fit in main memory.");
 								return;
@@ -367,7 +368,7 @@ public class DMLDebuggerFunctions {
 				if (variables.get(varname).getDataType() == DataType.MATRIX) {
 					try {
 						MatrixObject mo = (MatrixObject) variables.get(varname);
-						if (mo.getStatusAsString().equals("EMPTY") && (OptimizerUtils.estimateSizeExactSparsity(mo.getNumRows(), mo.getNumColumns(), mo.getSparsity()) > OptimizerUtils.getLocalMemBudget())) {
+						if (mo.getStatus()==CacheStatus.EMPTY && (OptimizerUtils.estimateSizeExactSparsity(mo.getNumRows(), mo.getNumColumns(), mo.getSparsity()) > OptimizerUtils.getLocalMemBudget())) {
 							//TODO @jlugoma Need to add functionality to bring and display a block. 
 							System.err.println("ERROR: DML matrix/vector dimensions are too large to fit in main memory.");
 							return;
@@ -417,7 +418,7 @@ public class DMLDebuggerFunctions {
 					double cellValue;
 					try {
 						MatrixObject mo = (MatrixObject) variables.get(varname);
-						if (mo.getStatusAsString().equals("EMPTY") && (OptimizerUtils.estimateSizeExactSparsity(mo.getNumRows(), mo.getNumColumns(), mo.getSparsity()) > OptimizerUtils.getLocalMemBudget())) {
+						if (mo.getStatus()==CacheStatus.EMPTY && (OptimizerUtils.estimateSizeExactSparsity(mo.getNumRows(), mo.getNumColumns(), mo.getSparsity()) > OptimizerUtils.getLocalMemBudget())) {
 							//TODO @jlugoma Need to add functionality to bring and display a block. 
 							System.err.println("ERROR: DML matrix/vector dimensions are too large to fit in main memory.");
 							return;
@@ -464,14 +465,16 @@ public class DMLDebuggerFunctions {
 					double updatedCellValue;
 					try {
 						MatrixObject mo = (MatrixObject) variables.get(varname);
-						if (mo.getStatusAsString().equals("EMPTY") && (OptimizerUtils.estimateSizeExactSparsity(mo.getNumRows(), mo.getNumColumns(), mo.getSparsity()) > OptimizerUtils.getLocalMemBudget())) {
+						if (mo.getStatus()==CacheStatus.EMPTY && (OptimizerUtils.estimateSizeExactSparsity(mo.getNumRows(), mo.getNumColumns(), mo.getSparsity()) > OptimizerUtils.getLocalMemBudget())) {
 							//TODO @jlugoma Need to add functionality to bring and display a block. 
 							System.err.println("ERROR: DML matrix/vector dimensions are too large to fit in main memory.");
 							return;
-						}						
-						MatrixBlock mb = mo.acquireModify();
+						}
+						MatrixBlock mb = mo.acquireRead();
+						mo.release();
 						mb.setValue(rowIndex, columnIndex, value);
 						updatedCellValue = mb.getValue(rowIndex, columnIndex);
+						mo.acquireModify(mb);
 						mo.release();
 					} catch (Exception e) {
 						System.err.println("Error processing DML matrix variable "+varname+". Certain matrix operations are disabled due to memory constraints or read-only restrictions.");

http://git-wip-us.apache.org/repos/asf/systemml/blob/5590513d/src/main/java/org/apache/sysml/hops/cost/CostEstimator.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/hops/cost/CostEstimator.java b/src/main/java/org/apache/sysml/hops/cost/CostEstimator.java
index 37e588f..05c6906 100644
--- a/src/main/java/org/apache/sysml/hops/cost/CostEstimator.java
+++ b/src/main/java/org/apache/sysml/hops/cost/CostEstimator.java
@@ -39,6 +39,7 @@ import org.apache.sysml.runtime.controlprogram.LocalVariableMap;
 import org.apache.sysml.runtime.controlprogram.Program;
 import org.apache.sysml.runtime.controlprogram.ProgramBlock;
 import org.apache.sysml.runtime.controlprogram.WhileProgramBlock;
+import org.apache.sysml.runtime.controlprogram.caching.CacheableData.CacheStatus;
 import org.apache.sysml.runtime.controlprogram.caching.MatrixObject;
 import org.apache.sysml.runtime.instructions.Instruction;
 import org.apache.sysml.runtime.instructions.InstructionUtils;
@@ -223,7 +224,7 @@ public abstract class CostEstimator
 				int brlen = mc.getRowsPerBlock();
 				int bclen = mc.getColsPerBlock();
 				long nnz = mc.getNonZeros();
-				boolean inmem = mo.getStatusAsString().equals("CACHED");
+				boolean inmem = mo.getStatus()==CacheStatus.CACHED;
 				vs = new VarStats(rlen, clen, brlen, bclen, nnz, inmem);
 			}
 			else //scalar

http://git-wip-us.apache.org/repos/asf/systemml/blob/5590513d/src/main/java/org/apache/sysml/runtime/controlprogram/caching/CacheableData.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/controlprogram/caching/CacheableData.java b/src/main/java/org/apache/sysml/runtime/controlprogram/caching/CacheableData.java
index 6e63284..3682fe1 100644
--- a/src/main/java/org/apache/sysml/runtime/controlprogram/caching/CacheableData.java
+++ b/src/main/java/org/apache/sysml/runtime/controlprogram/caching/CacheableData.java
@@ -40,7 +40,6 @@ import org.apache.sysml.runtime.DMLRuntimeException;
 import org.apache.sysml.runtime.controlprogram.caching.LazyWriteBuffer.RPolicy;
 import org.apache.sysml.runtime.controlprogram.parfor.stat.InfrastructureAnalyzer;
 import org.apache.sysml.runtime.controlprogram.parfor.util.IDSequence;
-import org.apache.sysml.runtime.instructions.cp.CPInstruction;
 import org.apache.sysml.runtime.instructions.cp.Data;
 import org.apache.sysml.runtime.instructions.gpu.context.GPUContext;
 import org.apache.sysml.runtime.instructions.gpu.context.GPUObject;
@@ -53,11 +52,9 @@ import org.apache.sysml.runtime.matrix.MetaDataNumItemsByEachReducer;
 import org.apache.sysml.runtime.matrix.MetaData;
 import org.apache.sysml.runtime.matrix.data.FileFormatProperties;
 import org.apache.sysml.runtime.matrix.data.InputInfo;
-import org.apache.sysml.runtime.matrix.data.MatrixBlock;
 import org.apache.sysml.runtime.matrix.data.OutputInfo;
 import org.apache.sysml.runtime.util.LocalFileUtils;
 import org.apache.sysml.runtime.util.MapReduceTool;
-import org.apache.sysml.utils.GPUStatistics;
 
 
 /**
@@ -103,7 +100,7 @@ public abstract class CacheableData<T extends CacheBlock> extends Data
 	 * <code>CACHED</code>:   The data blob is in main memory, and nobody is using nor referencing it. 
 	 * There is always an persistent recovery object for it
 	 **/
-	protected enum CacheStatus {
+	public enum CacheStatus {
 		EMPTY, 
 		READ, 
 		MODIFY, 
@@ -115,7 +112,7 @@ public abstract class CacheableData<T extends CacheBlock> extends Data
 	private static volatile boolean _activeFlag = false;
 	
 	/** Global sequence for generating unique ids. */
-	private static IDSequence _seq = null;   
+	private static IDSequence _seq = null;
 
 	// Global eviction path and prefix (prefix used for isolation purposes)
 	public static String cacheEvictionLocalFilePath = null; //set during init
@@ -246,6 +243,10 @@ public abstract class CacheableData<T extends CacheBlock> extends Data
 	public boolean isCleanupEnabled() {
 		return _cleanupFlag;
 	}
+	
+	public CacheStatus getStatus() {
+		return _cacheStatus;
+	}
 
 	public boolean isHDFSFileExists() {
 		return _hdfsFileExists;
@@ -411,18 +412,15 @@ public abstract class CacheableData<T extends CacheBlock> extends Data
 				}
 			}
 		}
-
+		
 		//read data from HDFS/RDD if required
 		//(probe data for cache_nowrite / jvm_reuse)
-		if( isEmpty(true) && _data==null )
-		{
-			try
-			{
+		if( _data==null && isEmpty(true) ) {
+			try {
 				if( DMLScript.STATISTICS )
 					CacheStatistics.incrementHDFSHits();
 				
-				if( getRDDHandle()==null || getRDDHandle().allowsShortCircuitRead() )
-				{
+				if( getRDDHandle()==null || getRDDHandle().allowsShortCircuitRead() ) {
 					//check filename
 					if( _hdfsFileName == null )
 						throw new DMLRuntimeException("Cannot read matrix for empty filename.");
@@ -433,8 +431,7 @@ public abstract class CacheableData<T extends CacheBlock> extends Data
 					//mark for initial local write despite read operation
 					_requiresLocalWrite = CACHING_WRITE_CACHE_ON_READ;
 				}
-				else
-				{
+				else {
 					//read matrix from rdd (incl execute pending rdd operations)
 					MutableBoolean writeStatus = new MutableBoolean();
 					_data = readBlobFromRDD( getRDDHandle(), writeStatus );
@@ -449,92 +446,48 @@ public abstract class CacheableData<T extends CacheBlock> extends Data
 			catch (IOException e) {
 				throw new DMLRuntimeException("Reading of " + _hdfsFileName + " ("+hashCode()+") failed.", e);
 			}
-			
 			_isAcquireFromEmpty = true;
 		}
-		else if( DMLScript.STATISTICS ) {
-			if( _data!=null )
-				CacheStatistics.incrementMemHits();
+		else if( _data!=null && DMLScript.STATISTICS ) {
+			CacheStatistics.incrementMemHits();
 		}
 		
 		//cache status maintenance
 		acquire( false, _data==null );
 		return _data;
 	}
-
+	
 	/**
-	 * Acquires the exclusive "write" lock for a thread that wants to change cache block
-	 * cell values.  Produces the reference to the cache block, restores the cache block
-	 * to main memory, reads from HDFS if needed.
-	 * 
+	 * Acquires the exclusive "write" lock for a thread that wants to throw away the
+	 * old cache block data and link up with new cache block data. Abandons the old data
+	 * without reading it and sets the new data reference.
+
 	 * In-Status:  EMPTY, EVICTABLE, EVICTED;
 	 * Out-Status: MODIFY.
 	 * 
+	 * @param newData new data
 	 * @return cacheable data
 	 */
-	public synchronized T acquireModify() 
-	{
-		//TODO remove after debugger (as only consumer) has been removed, because
-		//recent features such as gpu data transfers are not yet integrated
+	public T acquireModify(T newData) {
 		long t0 = DMLScript.STATISTICS ? System.nanoTime() : 0;
 		
-		if ( !isAvailableToModify() )
-			throw new DMLRuntimeException("MatrixObject not available to modify.");
+		//core internal acquire (synchronized per object)
+		T ret = acquireModifyIntern(newData);
 		
-		//get object from cache
-		if( _data == null )
-			getCache();
-
-		//read data from HDFS if required
-		if( isEmpty(true) && _data == null )
-		{
-			//check filename
-			if( _hdfsFileName == null )
-				throw new DMLRuntimeException("Cannot read matrix for empty filename.");
-			
-			//load data
-			try {
-				_data = readBlobFromHDFS( _hdfsFileName );
-			}
-			catch (IOException e) {
-				throw new DMLRuntimeException("Reading of " + _hdfsFileName + " ("+hashCode()+") failed.", e);
-			}
-		}
-
-		//cache status maintenance
-		acquire( true, _data==null );
-		updateStatusPinned(true);
-		setDirty(true);
-		_isAcquireFromEmpty = false;
+		//update thread-local status (after pin but outside the
+		//critical section of accessing a shared object)
+		if( !isBelowCachingThreshold() )
+			updateStatusPinned(true);
 		
 		if( DMLScript.STATISTICS ){
 			long t1 = System.nanoTime();
 			CacheStatistics.incrementAcquireMTime(t1-t0);
 		}
 		
-		return _data;
-	}
-	
-	public T acquireModify(T newData) {
-		return acquireModify(newData, null);
+		return ret;
 	}
 	
-	/**
-	 * Acquires the exclusive "write" lock for a thread that wants to throw away the
-	 * old cache block data and link up with new cache block data. Abandons the old data
-	 * without reading it and sets the new data reference.
-
-	 * In-Status:  EMPTY, EVICTABLE, EVICTED;
-	 * Out-Status: MODIFY.
-	 * 
-	 * @param newData new data
-	 * @param opcode extended instruction opcode
-	 * @return cacheable data
-	 */
-	public synchronized T acquireModify(T newData, String opcode)
-	{
-		long t0 = DMLScript.STATISTICS ? System.nanoTime() : 0;
-		
+	private synchronized T acquireModifyIntern(T newData) {
 		if (! isAvailableToModify ())
 			throw new DMLRuntimeException("CacheableData not available to modify.");
 		
@@ -550,29 +503,7 @@ public abstract class CacheableData<T extends CacheBlock> extends Data
 		//set references to new data
 		if (newData == null)
 			throw new DMLRuntimeException("acquireModify with empty cache block.");
-		_data = newData;
-		if( !isBelowCachingThreshold() )
-			updateStatusPinned(true);
-		
-		if( DMLScript.STATISTICS ){
-			long t1 = System.nanoTime();
-			CacheStatistics.incrementAcquireMTime(t1-t0);
-			if(DMLScript.FINEGRAINED_STATISTICS && opcode != null) {
-				if(_data instanceof MatrixBlock) {
-					MatrixBlock currObject = (MatrixBlock)_data;
-					if(currObject.isInSparseFormat())
-						GPUStatistics.maintainCPMiscTimes(opcode, CPInstruction.MISC_TIMER_ACQ_MODIFY_SPARSE_MB, t1-t0);
-					else
-						GPUStatistics.maintainCPMiscTimes(opcode, CPInstruction.MISC_TIMER_ACQ_MODIFY_DENSE_MB, t1-t0);
-				}
-			}
-		}
-		
-		return _data;
-	}
-	
-	public void release() {
-		release(null);
+		return _data = newData;
 	}
 	
 	/**
@@ -586,7 +517,7 @@ public abstract class CacheableData<T extends CacheBlock> extends Data
 	 * Out-Status: READ(-1), EVICTABLE, EMPTY.
 	 * 
 	 */
-	public void release(String opcode) {
+	public void release() {
 		long t0 = DMLScript.STATISTICS ? System.nanoTime() : 0;
 		
 		//update thread-local status (before unpin but outside
@@ -595,7 +526,7 @@ public abstract class CacheableData<T extends CacheBlock> extends Data
 			updateStatusPinned(false);
 		
 		//core internal release (synchronized per object)
-		releaseIntern(opcode);
+		releaseIntern();
 		
 		if( DMLScript.STATISTICS ){
 			long t1 = System.nanoTime();
@@ -603,8 +534,7 @@ public abstract class CacheableData<T extends CacheBlock> extends Data
 		}
 	}
 	
-	private synchronized void releaseIntern(String opcode)
-	{
+	private synchronized void releaseIntern() {
 		boolean write = false;
 		if ( isModify() ) {
 			//set flags for write
@@ -613,11 +543,11 @@ public abstract class CacheableData<T extends CacheBlock> extends Data
 			
 			//update meta data
 			refreshMetaData();
+			
+			//compact empty in-memory block 
+			_data.compactEmptyBlock();
 		}
 		
-		//compact empty in-memory block 
-		_data.compactEmptyBlock();
-		
 		//cache status maintenance (pass cacheNoWrite flag)
 		release(_isAcquireFromEmpty && !_requiresLocalWrite);
 		
@@ -626,17 +556,9 @@ public abstract class CacheableData<T extends CacheBlock> extends Data
 			&& !isBelowCachingThreshold() ) //min size for caching
 		{
 			if( write || _requiresLocalWrite ) {
-				//evict blob
 				String filePath = getCacheFilePathAndName();
 				try {
-					long t1 = DMLScript.STATISTICS && DMLScript.FINEGRAINED_STATISTICS ? System.nanoTime() : 0;
-					
-					int numEvicted = LazyWriteBuffer.writeBlock(filePath, _data);
-					
-					if(DMLScript.STATISTICS && DMLScript.FINEGRAINED_STATISTICS && opcode != null) {
-						long t2 = DMLScript.STATISTICS && DMLScript.FINEGRAINED_STATISTICS ? System.nanoTime() : 0;
-						GPUStatistics.maintainCPMiscTimes(opcode, CPInstruction.MISC_TIMER_RELEASE_BUFF_WRITE, t2-t1, numEvicted);
-					}
+					LazyWriteBuffer.writeBlock(filePath, _data);
 				}
 				catch (Exception e) {
 					throw new DMLRuntimeException("Eviction to local path " + filePath + " ("+hashCode()+") failed.", e);
@@ -666,7 +588,7 @@ public abstract class CacheableData<T extends CacheBlock> extends Data
 			return; // do nothing
 		if( !isAvailableToModify() )
 			throw new DMLRuntimeException("CacheableData (" + getDebugName() + ") not available to "
-					+ "modify. Status = " + getStatusAsString() + ".");
+					+ "modify. Status = " + _cacheStatus.name() + ".");
 		
 		// clear existing WB / FS representation (but prevent unnecessary probes)
 		if( !(isEmpty(true)||(_data!=null && isBelowCachingThreshold()) 
@@ -810,7 +732,7 @@ public abstract class CacheableData<T extends CacheBlock> extends Data
 				throw new DMLRuntimeException("Export to " + fName + " failed.", e);
 			}
 			finally {
-				release(opcode);
+				release();
 			}
 		}
 		else if( pWrite ) // pwrite with same output format
@@ -1078,7 +1000,7 @@ public abstract class CacheableData<T extends CacheBlock> extends Data
 		}
 
 		if( LOG.isTraceEnabled() )
-			LOG.trace("Acquired lock on " + this.getDebugName() + ", status: " + this.getStatusAsString() );		
+			LOG.trace("Acquired lock on " + getDebugName() + ", status: " + _cacheStatus.name() );
 	}
 
 	
@@ -1114,7 +1036,7 @@ public abstract class CacheableData<T extends CacheBlock> extends Data
 		}
 		
 		if( LOG.isTraceEnabled() )
-			LOG.trace("Released lock on " + this.getDebugName() + ", status: " + this.getStatusAsString());
+			LOG.trace("Released lock on " + getDebugName() + ", status: " + _cacheStatus.name());
 		
 	}
 
@@ -1125,16 +1047,9 @@ public abstract class CacheableData<T extends CacheBlock> extends Data
 	//  ***                                            ***
 	//  **************************************************
 	
-	
-	public String getStatusAsString() {
-		return _cacheStatus.toString();
-	}
-
 	public boolean isCached(boolean inclCachedNoWrite) {
-		if( inclCachedNoWrite )
-			return (_cacheStatus == CacheStatus.CACHED || _cacheStatus == CacheStatus.CACHED_NOWRITE);
-		else
-			return (_cacheStatus == CacheStatus.CACHED);
+		return _cacheStatus == CacheStatus.CACHED
+			|| (inclCachedNoWrite && _cacheStatus == CacheStatus.CACHED_NOWRITE);
 	}
 	
 	public void setEmptyStatus() {
@@ -1142,10 +1057,8 @@ public abstract class CacheableData<T extends CacheBlock> extends Data
 	}
 	
 	protected boolean isEmpty(boolean inclCachedNoWrite) {
-		if( inclCachedNoWrite )
-			return (_cacheStatus == CacheStatus.EMPTY || _cacheStatus == CacheStatus.CACHED_NOWRITE);
-		else
-			return (_cacheStatus == CacheStatus.EMPTY);
+		return _cacheStatus == CacheStatus.EMPTY
+			|| (inclCachedNoWrite && _cacheStatus == CacheStatus.CACHED_NOWRITE);
 	}
 	
 	protected boolean isModify() {
@@ -1170,22 +1083,19 @@ public abstract class CacheableData<T extends CacheBlock> extends Data
 	}
 	
 	protected void removeOneRead(boolean doesBlobExist, boolean cacheNoWrite) {
-		_numReadThreads --;					
+		_numReadThreads --;
 		if (_numReadThreads == 0) {
 			if( cacheNoWrite )
 				_cacheStatus = (doesBlobExist ? 
-						CacheStatus.CACHED_NOWRITE : CacheStatus.EMPTY);
+					CacheStatus.CACHED_NOWRITE : CacheStatus.EMPTY);
 			else
 				_cacheStatus = (doesBlobExist ? 
-						CacheStatus.CACHED : CacheStatus.EMPTY);
+					CacheStatus.CACHED : CacheStatus.EMPTY);
 		}
 	}
 	
 	protected boolean isAvailableToRead() {
-		return (   _cacheStatus == CacheStatus.EMPTY 
-				|| _cacheStatus == CacheStatus.CACHED
-				|| _cacheStatus == CacheStatus.CACHED_NOWRITE
-				|| _cacheStatus == CacheStatus.READ);
+		return (_cacheStatus != CacheStatus.MODIFY);
 	}
 	
 	protected boolean isAvailableToModify() {
@@ -1206,7 +1116,8 @@ public abstract class CacheableData<T extends CacheBlock> extends Data
 	 * referenced cache block.  
 	 */
 	protected void createCache( ) {
-		_cache = new SoftReference<>( _data );
+		if( _cache == null || _cache.get() == null )
+			_cache = new SoftReference<>( _data );
 	}
 
 	/**
@@ -1214,9 +1125,8 @@ public abstract class CacheableData<T extends CacheBlock> extends Data
 	 * and subsequently clears the cache soft reference if existing.
 	 */
 	protected void getCache() {
-		if( _cache !=null ) {
+		if( _cache != null ) {
 			_data = _cache.get();
-			clearCache();
 		}
 	}
 	

http://git-wip-us.apache.org/repos/asf/systemml/blob/5590513d/src/main/java/org/apache/sysml/runtime/controlprogram/context/ExecutionContext.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/controlprogram/context/ExecutionContext.java b/src/main/java/org/apache/sysml/runtime/controlprogram/context/ExecutionContext.java
index 9c6dc5b..dfc3dc5 100644
--- a/src/main/java/org/apache/sysml/runtime/controlprogram/context/ExecutionContext.java
+++ b/src/main/java/org/apache/sysml/runtime/controlprogram/context/ExecutionContext.java
@@ -380,21 +380,19 @@ public class ExecutionContext {
 	 * 
 	 * @param varName variable name
 	 */
+	public void releaseMatrixInput(String varName) {
+		getMatrixObject(varName).release();
+	}
+	
 	public void releaseMatrixInput(String varName, String opcode) {
 		long t1 = opcode != null && DMLScript.STATISTICS && DMLScript.FINEGRAINED_STATISTICS ? System.nanoTime() : 0;
-		MatrixObject mo = getMatrixObject(varName);
-		mo.release(opcode);
+		releaseMatrixInput(varName);
 		if(opcode != null && DMLScript.STATISTICS && DMLScript.FINEGRAINED_STATISTICS) {
 			long t2 = System.nanoTime();
 			GPUStatistics.maintainCPMiscTimes(opcode, CPInstruction.MISC_TIMER_RELEASE_INPUT_MB, t2-t1);
 		}
 	}
 	
-	public void releaseMatrixInput(String varName) {
-		MatrixObject mo = getMatrixObject(varName);
-		mo.release(null);
-	}
-	
 	public void releaseMatrixInputForGPUInstruction(String varName) {
 		MatrixObject mo = getMatrixObject(varName);
 		mo.getGPUObject(getGPUContext(0)).releaseInput();
@@ -450,25 +448,27 @@ public class ExecutionContext {
 	}
 	
 	public void setMatrixOutput(String varName, MatrixBlock outputData) {
-		setMatrixOutput(varName, outputData, null);
-	}
-
-	public void setMatrixOutput(String varName, MatrixBlock outputData, String opcode) {
 		MatrixObject mo = getMatrixObject(varName);
-		mo.acquireModify(outputData, opcode);
-		mo.release(opcode);
+		mo.acquireModify(outputData);
+		mo.release();
 		setVariable(varName, mo);
 	}
+	
+	public void setMatrixOutput(String varName, MatrixBlock outputData, String opcode) {
+		setMatrixOutput(varName, outputData);
+	}
 
-	public void setMatrixOutput(String varName, MatrixBlock outputData, UpdateType flag, String opcode) {
+	public void setMatrixOutput(String varName, MatrixBlock outputData, UpdateType flag) {
 		if( flag.isInPlace() ) {
 			//modify metadata to carry update status
 			MatrixObject mo = getMatrixObject(varName);
 			mo.setUpdateType( flag );
 		}
-		
-		//default case
-		setMatrixOutput(varName, outputData, opcode);
+		setMatrixOutput(varName, outputData);
+	}
+	
+	public void setMatrixOutput(String varName, MatrixBlock outputData, UpdateType flag, String opcode) {
+		setMatrixOutput(varName, outputData, flag);
 	}
 
 	public void setFrameOutput(String varName, FrameBlock outputData) {

http://git-wip-us.apache.org/repos/asf/systemml/blob/5590513d/src/main/java/org/apache/sysml/runtime/controlprogram/parfor/opt/OptimizerRuleBased.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/controlprogram/parfor/opt/OptimizerRuleBased.java b/src/main/java/org/apache/sysml/runtime/controlprogram/parfor/opt/OptimizerRuleBased.java
index 91124a0..e13f2a7 100644
--- a/src/main/java/org/apache/sysml/runtime/controlprogram/parfor/opt/OptimizerRuleBased.java
+++ b/src/main/java/org/apache/sysml/runtime/controlprogram/parfor/opt/OptimizerRuleBased.java
@@ -2039,7 +2039,7 @@ public class OptimizerRuleBased extends Optimizer
 				//replace existing matrix object with empty matrix
 				MatrixObject mo = (MatrixObject)dat;
 				ec.cleanupCacheableData(mo);
-				ec.setMatrixOutput(rvar._name, new MatrixBlock((int)mo.getNumRows(), (int)mo.getNumColumns(),false), null);
+				ec.setMatrixOutput(rvar._name, new MatrixBlock((int)mo.getNumRows(), (int)mo.getNumColumns(),false));
 				
 				//keep track of cleaned result variables
 				cleanedVars.add(rvar);


[3/7] systemml git commit: [SYSTEMML-2259] Performance dense-sparse MV binary multiply operations

Posted by mb...@apache.org.
[SYSTEMML-2259] Performance dense-sparse MV binary multiply operations

This patch improves the performance of dense-dense and dense-sparse
binary multiply operations with sparse output, which previously fell
back to a generic MV case with binary search and reallocation overheads.
First, for MV mult with column vectors, we now use a new special case
that allocates the output directly into the memory-efficient CSR format.
Second, for MV mult with row vectors, we preallocate the sparse rows.
On a scenario of 100x MV binary operations of the following shapes, the
cumulative runtime improved as follows:
a) 37,500x200 (dense) * 37,500x1 (dense, sp=0.05): 10.7s -> 1.3s
b) 200x37,500 (dense) * 1x37,500 (sparse, sp=0.05): 3.8s -> 2.1s
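
The column-vector special case boils down to a two-pass scheme: count
the output non-zeros per row, then fill preallocated CSR arrays without
binary search or reallocation. A simplified standalone sketch over raw
arrays (an assumed helper for illustration, not the exact
LibMatrixBincell.safeBinaryMVDenseSparseMult signature in the diff
below):

	public class MVMultSketch {
		//dense row-major matrix a (rlen x clen) times dense column vector b,
		//written directly into preallocated CSR arrays (illustrative form)
		public static void denseTimesColVectorToCSR(double[] a, double[] b, int rlen, int clen) {
			//pass 1: count output non-zeros for exact CSR preallocation
			int nnz = 0;
			for( int i=0, aix=0; i<rlen; i++, aix+=clen )
				if( b[i] != 0 )
					for( int j=0; j<clen; j++ )
						nnz += (a[aix+j] != 0) ? 1 : 0;
			//pass 2: fill row pointers, column indexes, and values w/o reallocation
			int[] rptr = new int[rlen+1];
			int[] indexes = new int[nnz];
			double[] vals = new double[nnz];
			for( int i=0, aix=0, pos=0; i<rlen; i++, aix+=clen ) {
				if( b[i] != 0 )
					for( int j=0; j<clen; j++ ) {
						if( a[aix+j] == 0 ) continue; //keep counts consistent with pass 1
						indexes[pos] = j;
						vals[pos] = a[aix+j] * b[i];
						pos++;
					}
				rptr[i+1] = pos;
			}
			//rptr/indexes/vals now form a valid CSR representation of the product
		}
	}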


Project: http://git-wip-us.apache.org/repos/asf/systemml/repo
Commit: http://git-wip-us.apache.org/repos/asf/systemml/commit/fde708f5
Tree: http://git-wip-us.apache.org/repos/asf/systemml/tree/fde708f5
Diff: http://git-wip-us.apache.org/repos/asf/systemml/diff/fde708f5

Branch: refs/heads/master
Commit: fde708f5df93bd7dd1b2c523b43ee128a53d7809
Parents: 5590513
Author: Matthias Boehm <mb...@gmail.com>
Authored: Thu Apr 19 21:15:33 2018 -0700
Committer: Matthias Boehm <mb...@gmail.com>
Committed: Thu Apr 19 21:15:33 2018 -0700

----------------------------------------------------------------------
 .../runtime/matrix/data/LibMatrixBincell.java   | 56 +++++++++++++++++---
 1 file changed, 49 insertions(+), 7 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/systemml/blob/fde708f5/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixBincell.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixBincell.java b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixBincell.java
index 5a93508..79b4c36 100644
--- a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixBincell.java
+++ b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixBincell.java
@@ -44,6 +44,7 @@ import org.apache.sysml.runtime.matrix.operators.BinaryOperator;
 import org.apache.sysml.runtime.matrix.operators.ScalarOperator;
 import org.apache.sysml.runtime.util.DataConverter;
 import org.apache.sysml.runtime.util.SortUtils;
+import org.apache.sysml.runtime.util.UtilFunctions;
 
 /**
  * MB:
@@ -211,6 +212,10 @@ public class LibMatrixBincell
 				safeBinaryMVDense(m1, m2, ret, op);
 			else if( m1.sparse ) //SPARSE m1
 				safeBinaryMVSparse(m1, m2, ret, op);
+			else if( !m1.sparse && !m2.sparse && ret.sparse && op.fn instanceof Multiply
+				&& atype == BinaryAccessType.MATRIX_COL_VECTOR
+				&& (long)m1.rlen * m2.clen < Integer.MAX_VALUE )
+				safeBinaryMVDenseSparseMult(m1, m2, ret, op);
 			else //generic combinations
 				safeBinaryMVGeneric(m1, m2, ret, op);
 		}	
@@ -440,6 +445,44 @@ public class LibMatrixBincell
 		//no need to recomputeNonZeros since maintained in append value
 	}
 
+	private static void safeBinaryMVDenseSparseMult(MatrixBlock m1, MatrixBlock m2, MatrixBlock ret, BinaryOperator op) {
+		if( m1.isEmptyBlock(false) || m2.isEmptyBlock(false) )
+			return;
+		int rlen = m1.rlen;
+		int clen = m1.clen;
+		BinaryAccessType atype = getBinaryAccessType(m1, m2);
+		double[] a = m1.getDenseBlockValues();
+		double[] b = m2.getDenseBlockValues();
+		
+		//note: invocation condition ensures max int nnz
+		if( atype == BinaryAccessType.MATRIX_COL_VECTOR ) {
+			//count output nnz (for CSR preallocation)
+			int nnz = 0;
+			for(int i=0, aix=0; i<rlen; i++, aix+=clen)
+				nnz += (b[i] != 0) ? UtilFunctions
+					.countNonZeros(a, aix, clen) : 0;
+			//allocate and compute output in CSR format
+			int[] rptr = new int[rlen+1];
+			int[] indexes = new int[nnz];
+			double[] vals = new double[nnz];
+			rptr[0] = 0;
+			for( int i=0, aix=0, pos=0; i<rlen; i++, aix+=clen ) {
+				double bval = b[i];
+				if( bval != 0 ) {
+					for( int j=0; j<clen; j++ ) {
+						indexes[pos] = j;
+						vals[pos] = a[aix+j] * bval;
+						pos++;
+					}
+				}
+				rptr[i+1] = pos;
+			}
+			ret.sparseBlock = new SparseBlockCSR(
+				rptr, indexes, vals, nnz);
+			ret.setNonZeros(nnz);
+		}
+	}
+	
 	private static void safeBinaryMVGeneric(MatrixBlock m1, MatrixBlock m2, MatrixBlock ret, BinaryOperator op) {
 		boolean isMultiply = (op.fn instanceof Multiply);
 		boolean skipEmpty = (isMultiply);
@@ -486,22 +529,21 @@ public class LibMatrixBincell
 			//if the right hand side row vector is sparse we have to exploit that;
 			//otherwise, both sparse access (binary search) and asymtotic behavior
 			//in the number of cells become major bottlenecks
-			if( m2.sparse && isMultiply ) //SPARSE *
+			if( m2.sparse && ret.sparse && isMultiply ) //SPARSE *
 			{
 				//note: sparse block guaranteed to be allocated (otherwise early about)
 				SparseBlock b = m2.sparseBlock;
+				SparseBlock c = ret.sparseBlock;
 				if( b.isEmpty(0) ) return; 
 				int blen = b.size(0); //always pos 0
 				int[] bix = b.indexes(0);
 				double[] bvals = b.values(0);
 				for( int i=0; i<rlen; i++ ) {
-					//for each row iterate only over non-zeros elements in rhs
-					for( int j=0; j<blen; j++ ) {
-						double v1 = m1.quickGetValue(i, bix[j]);
-						double v = op.fn.execute( v1, bvals[j] );
-						ret.appendValue(i, bix[j], v);
-					}
+					c.allocate(i, blen);
+					for( int j=0; j<blen; j++ )
+						c.append(i, bix[j], m1.quickGetValue(i, bix[j]) * bvals[j]);
 				}
+				ret.setNonZeros(c.size());
 			}
 			else //GENERAL CASE
 			{