You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@systemml.apache.org by mb...@apache.org on 2017/05/26 04:02:28 UTC

incubator-systemml git commit: [SYSTEMML-1592] Improved codegen cellwise operations w/ sparse outputs

Repository: incubator-systemml
Updated Branches:
  refs/heads/master c697c30eb -> ab1c70508


[SYSTEMML-1592] Improved codegen cellwise operations w/ sparse outputs

This patch improves codegen cellwise operations with the ability to
directly create sparse outputs instead of dense intermediates and a
subsequent conversion to sparse. We apply this for sparse inputs with
sparse-safe cellwise operations. Furthermore, this also includes a
refactoring of the cellwise operator skeleton for better instruction
locality.


Project: http://git-wip-us.apache.org/repos/asf/incubator-systemml/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-systemml/commit/ab1c7050
Tree: http://git-wip-us.apache.org/repos/asf/incubator-systemml/tree/ab1c7050
Diff: http://git-wip-us.apache.org/repos/asf/incubator-systemml/diff/ab1c7050

Branch: refs/heads/master
Commit: ab1c7050890bcfc7032c33df3c3d8244a906c8a3
Parents: c697c30
Author: Matthias Boehm <mb...@gmail.com>
Authored: Thu May 25 20:40:52 2017 -0700
Committer: Matthias Boehm <mb...@gmail.com>
Committed: Thu May 25 20:40:52 2017 -0700

----------------------------------------------------------------------
 .../sysml/runtime/codegen/SpoofCellwise.java    | 607 ++++++++++---------
 1 file changed, 335 insertions(+), 272 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/ab1c7050/src/main/java/org/apache/sysml/runtime/codegen/SpoofCellwise.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/codegen/SpoofCellwise.java b/src/main/java/org/apache/sysml/runtime/codegen/SpoofCellwise.java
index b01914a..fe23a04 100644
--- a/src/main/java/org/apache/sysml/runtime/codegen/SpoofCellwise.java
+++ b/src/main/java/org/apache/sysml/runtime/codegen/SpoofCellwise.java
@@ -188,28 +188,28 @@ public abstract class SpoofCellwise extends SpoofOperator implements Serializabl
 			k = 1; //serial execution
 		}
 		
-		//result allocation and preparations
-		out.reset(inputs.get(0).getNumRows(), _type == CellType.NO_AGG ? 
-				inputs.get(0).getNumColumns() : 1, false);
-		out.allocateDenseBlock();
-		double[] c = out.getDenseBlock();
-		
 		//input preparation
 		double[][] b = prepInputMatrices(inputs);
 		double[] scalars = prepInputScalars(scalarObjects);
 		final int m = inputs.get(0).getNumRows();
-		final int n = inputs.get(0).getNumColumns();		
+		final int n = inputs.get(0).getNumColumns();
 		
 		//sparse safe check 
-		boolean sparseSafe = isSparseSafe() || (b.length == 0 
+		boolean sparseSafe = isSparseSafe() || (b.length == 0
 				&& genexec( 0, b, scalars, m, n, 0, 0 ) == 0);
 		
+		//result allocation and preparations
+		boolean sparseOut = sparseSafe && inputs.get(0).isInSparseFormat() && _type == CellType.NO_AGG;
+		out.reset(inputs.get(0).getNumRows(), _type == CellType.NO_AGG ?
+				inputs.get(0).getNumColumns() : 1, sparseOut);
+		out.allocateDenseOrSparseBlock();
+		
 		long lnnz = 0;
 		if( k <= 1 ) //SINGLE-THREADED
 		{
 			lnnz = (!inputs.get(0).isInSparseFormat()) ?
-				executeDense(inputs.get(0).getDenseBlock(), b, scalars, c, m, n, sparseSafe, 0, m) :
-				executeSparse(inputs.get(0).getSparseBlock(), b, scalars, c, m, n, sparseSafe, 0, m);
+				executeDense(inputs.get(0).getDenseBlock(), b, scalars, out, m, n, sparseSafe, 0, m) :
+				executeSparse(inputs.get(0).getSparseBlock(), b, scalars, out, m, n, sparseSafe, 0, m);
 		}
 		else  //MULTI-THREADED
 		{
@@ -219,7 +219,7 @@ public abstract class SpoofCellwise extends SpoofOperator implements Serializabl
 				int nk = UtilFunctions.roundToNext(Math.min(8*k,m/32), k);
 				int blklen = (int)(Math.ceil((double)m/nk));
 				for( int i=0; i<nk & i*blklen<m; i++ )
-					tasks.add(new ParExecTask(inputs.get(0), b, scalars, c, 
+					tasks.add(new ParExecTask(inputs.get(0), b, scalars, out, 
 						m, n, sparseSafe, i*blklen, Math.min((i+1)*blklen, m))); 
 				//execute tasks
 				List<Future<Long>> taskret = pool.invokeAll(tasks);	
@@ -235,129 +235,56 @@ public abstract class SpoofCellwise extends SpoofOperator implements Serializabl
 		}
 		
 		//post-processing
-		out.setNonZeros(lnnz);	
-		out.examSparsity();	
+		out.setNonZeros(lnnz);
+		out.examSparsity();
 	}
 	
-	private double executeDenseAndAgg(double[] a, double[][] b, double[] scalars, int m, int n, boolean sparseSafe, int rl, int ru) throws DMLRuntimeException 
+	private long executeDense(double[] a, double[][] b, double[] scalars, MatrixBlock out, int m, int n, boolean sparseSafe, int rl, int ru) 
+		throws DMLRuntimeException 
 	{
-		ValueFunction vfun = getAggFunction();
-		double ret = 0;
+		double[] c = out.getDenseBlock();
 		
-		//numerically stable aggregation for sum/sum_sq
-		if( vfun instanceof KahanFunction ) {
-			KahanObject kbuff = new KahanObject(0, 0);
-			KahanFunction kplus = (KahanFunction) vfun;
-			
-			if( a == null && !sparseSafe ) { //empty
-				for( int i=rl; i<ru; i++ ) 
-					for( int j=0; j<n; j++ )
-						kplus.execute2(kbuff, genexec( 0, b, scalars, m, n, i, j ));
-			}
-			else if( a != null ) { //general case
-				for( int i=rl, ix=rl*n; i<ru; i++ ) 
-					for( int j=0; j<n; j++, ix++ )
-						if( a[ix] != 0 || !sparseSafe)
-							kplus.execute2(kbuff, genexec( a[ix], b, scalars, m, n, i, j ));
-			}
-			ret = kbuff._sum;
+		if( _type == CellType.NO_AGG ) {
+			return executeDenseNoAgg(a, b, scalars, c, m, n, sparseSafe, rl, ru);
 		}
-		//safe aggregation for min/max w/ handling of zero entries
-		//note: sparse safe with zero value as min/max handled outside
-		else {
-			ret = (_aggOp==AggOp.MIN) ? Double.MAX_VALUE : -Double.MAX_VALUE; 
-			if( a == null && !sparseSafe ) { //empty
-				for( int i=rl; i<ru; i++ ) 
-					for( int j=0; j<n; j++ )
-						ret = vfun.execute(ret, genexec( 0, b, scalars, m, n, i, j ));
-			}
-			else if( a != null ) { //general case
-				for( int i=rl, ix=rl*n; i<ru; i++ ) 
-					for( int j=0; j<n; j++, ix++ )
-						if( a[ix] != 0 || !sparseSafe)
-							ret = vfun.execute(ret, genexec( a[ix], b, scalars, m, n, i, j ));
-			}
+		else if( _type == CellType.ROW_AGG ) {
+			if( _aggOp == AggOp.SUM || _aggOp == AggOp.SUM_SQ )
+				return executeDenseRowAggSum(a, b, scalars, c, m, n, sparseSafe, rl, ru);
+			else
+				return executeDenseRowAggMxx(a, b, scalars, c, m, n, sparseSafe, rl, ru);
 		}
-		
-		return ret;
+		return -1;
 	}
 	
-	private long executeDense(double[] a, double[][] b, double[] scalars, double[] c, int m, int n, boolean sparseSafe, int rl, int ru) 
+	private double executeDenseAndAgg(double[] a, double[][] b, double[] scalars, int m, int n, boolean sparseSafe, int rl, int ru) throws DMLRuntimeException 
+	{
+		//numerically stable aggregation for sum/sum_sq
+		if( _aggOp == AggOp.SUM || _aggOp == AggOp.SUM_SQ )
+			return executeDenseAggSum(a, b, scalars, m, n, sparseSafe, rl, ru);
+		else
+			return executeDenseAggMxx(a, b, scalars, m, n, sparseSafe, rl, ru);
+	}
+	
+	private long executeSparse(SparseBlock sblock, double[][] b, double[] scalars, MatrixBlock out, int m, int n, boolean sparseSafe, int rl, int ru) 
 		throws DMLRuntimeException 
 	{
-		long lnnz = 0;
+		if( sparseSafe && sblock == null )
+			return 0;
 		
-		if( _type == CellType.NO_AGG )
-		{
-			if( a == null && !sparseSafe ) { //empty
-				//note: we can't determine sparse-safeness by executing the operator once 
-				//as the output might change with different row indices
-				for( int i=rl, ix=rl*n; i<ru; i++ ) 
-					for( int j=0; j<n; j++, ix++ ) {
-						c[ix] = genexec( 0, b, scalars, m, n, i, j ); 
-						lnnz += (c[ix]!=0) ? 1 : 0;
-					}
-			}
-			else if( a != null ) { //general case
-				for( int i=rl, ix=rl*n; i<ru; i++ ) 
-					for( int j=0; j<n; j++, ix++ ) 
-						if( a[ix] != 0 || !sparseSafe) {
-							c[ix] = genexec( a[ix], b, scalars, m, n, i, j); 
-							lnnz += (c[ix]!=0) ? 1 : 0;
-						}
-			}
+		if( _type == CellType.NO_AGG ) {
+			if( out.isInSparseFormat() ) 
+				return executeSparseNoAggSparse(sblock, b, scalars, out, m, n, sparseSafe, rl, ru);
+			else
+				return executeSparseNoAggDense(sblock, b, scalars, out, m, n, sparseSafe, rl, ru);
 		}
-		else if( _type == CellType.ROW_AGG )
-		{
-			ValueFunction vfun = getAggFunction();
-			
-			if( vfun instanceof KahanFunction ) {
-				KahanObject kbuff = new KahanObject(0, 0);
-				KahanFunction kplus = (KahanFunction) vfun;
-	
-				if( a == null && !sparseSafe ) { //empty
-					for( int i=rl; i<ru; i++ ) { 
-						kbuff.set(0, 0);
-						for( int j=0; j<n; j++ )
-							kplus.execute2(kbuff, genexec( 0, b, scalars, m, n, i, j ));
-						lnnz += ((c[i] = kbuff._sum)!=0) ? 1 : 0;
-					}
-				}
-				else if( a != null ) { //general case
-					for( int i=rl, ix=rl*n; i<ru; i++ ) {
-						kbuff.set(0, 0);
-						for( int j=0; j<n; j++, ix++ )
-							if( a[ix] != 0 || !sparseSafe)
-								kplus.execute2(kbuff, genexec( a[ix], b, scalars, m, n, i, j ));
-						lnnz += ((c[i] = kbuff._sum)!=0) ? 1 : 0;
-					}
-				}
-			}
-			else {
-				double initialVal = (_aggOp==AggOp.MIN) ? Double.MAX_VALUE : -Double.MAX_VALUE;
-				if( a == null && !sparseSafe ) { //empty
-					for( int i=rl; i<ru; i++ ) { 
-						double tmp = initialVal;
-						for( int j=0; j<n; j++ )
-							tmp = vfun.execute(tmp, genexec( 0, b, scalars, m, n, i, j ));
-						lnnz += ((c[i] = tmp)!=0) ? 1 : 0;
-					}
-				}
-				else if( a != null ) { //general case
-					for( int i=rl, ix=rl*n; i<ru; i++ ) {
-						double tmp = initialVal;
-						for( int j=0; j<n; j++, ix++ )
-							if( a[ix] != 0 || !sparseSafe)
-								tmp = vfun.execute(tmp, genexec( a[ix], b, scalars, m, n, i, j ));
-						if( sparseSafe && UtilFunctions.containsZero(a, ix-n, n) )
-							tmp = vfun.execute(tmp, 0);
-						lnnz += ((c[i] = tmp)!=0) ? 1 : 0;
-					}
-				}
-			}
+		else if( _type == CellType.ROW_AGG ) {
+			if( _aggOp == AggOp.SUM || _aggOp == AggOp.SUM_SQ )
+				return executeSparseRowAggSum(sblock, b, scalars, out, m, n, sparseSafe, rl, ru);
+			else
+				return executeSparseRowAggMxx(sblock, b, scalars, out, m, n, sparseSafe, rl, ru);
 		}
 		
-		return lnnz;
+		return -1;
 	}
 	
 	private double executeSparseAndAgg(SparseBlock sblock, double[][] b, double[] scalars, int m, int n, boolean sparseSafe, int rl, int ru) 
@@ -366,184 +293,320 @@ public abstract class SpoofCellwise extends SpoofOperator implements Serializabl
 		if( sparseSafe && sblock == null )
 			return 0;
 		
-		ValueFunction vfun = getAggFunction();
-		double ret = 0;
+		if( _aggOp == AggOp.SUM || _aggOp == AggOp.SUM_SQ )
+			return executeSparseAggSum(sblock, b, scalars, m, n, sparseSafe, rl, ru);
+		else
+			return executeSparseAggMxx(sblock, b, scalars, m, n, sparseSafe, rl, ru);
+	}
+	
+	private long executeSparseNoAggSparse(SparseBlock sblock, double[][] b, double[] scalars, MatrixBlock out, int m, int n, boolean sparseSafe, int rl, int ru) 
+		throws DMLRuntimeException 
+	{
+		//note: sequential scan algorithm for both sparse-safe and -unsafe 
+		//in order to avoid binary search for sparse-unsafe 
+		SparseBlock c = out.getSparseBlock();
+		long lnnz = 0;
+		for(int i=rl; i<ru; i++) {
+			int lastj = -1;
+			//handle non-empty rows
+			if( sblock != null && !sblock.isEmpty(i) ) {
+				int apos = sblock.pos(i);
+				int alen = sblock.size(i);
+				int[] aix = sblock.indexes(i);
+				double[] avals = sblock.values(i);
+				c.allocate(i, sparseSafe ? alen : n);
+				for(int k=apos; k<apos+alen; k++) {
+					//process zeros before current non-zero
+					if( !sparseSafe )
+						for(int j=lastj+1; j<aix[k]; j++)
+							c.append(i, j, genexec(0, b, scalars, m, n, i, j));
+					//process current non-zero
+					lastj = aix[k];
+					c.append(i, lastj, genexec(avals[k], b, scalars, m, n, i, lastj));
+				}
+			}
+			//process empty rows or remaining zeros
+			if( !sparseSafe )
+				for(int j=lastj+1; j<n; j++)
+					c.append(i, j, genexec(0, b, scalars, m, n, i, j));
+			lnnz += c.size(i);
+		}
 		
-		//numerically stable aggregation for sum/sum_sq
-		if( vfun instanceof KahanFunction ) {
-			KahanObject kbuff = new KahanObject(0, 0);
-			KahanFunction kplus = (KahanFunction) vfun;
+		return lnnz;
+	}
 	
-			//note: sequential scan algorithm for both sparse-safe and -unsafe 
-			//in order to avoid binary search for sparse-unsafe 
-			for(int i=rl; i<ru; i++) {
-				int lastj = -1;
-				//handle non-empty rows
-				if( sblock != null && !sblock.isEmpty(i) ) {
-					int apos = sblock.pos(i);
-					int alen = sblock.size(i);
-					int[] aix = sblock.indexes(i);
-					double[] avals = sblock.values(i);
-					for(int k=apos; k<apos+alen; k++) {
-						//process zeros before current non-zero
-						if( !sparseSafe )
-							for(int j=lastj+1; j<aix[k]; j++)
-								kplus.execute2(kbuff, genexec(0, b, scalars, m, n, i, j));
-						//process current non-zero
-						lastj = aix[k];
-						kplus.execute2(kbuff, genexec(avals[k], b, scalars, m, n, i, lastj));
-					}
+	private long executeSparseNoAggDense(SparseBlock sblock, double[][] b, double[] scalars, MatrixBlock out, int m, int n, boolean sparseSafe, int rl, int ru) 
+		throws DMLRuntimeException 
+	{
+		//note: sequential scan algorithm for both sparse-safe and -unsafe 
+		//in order to avoid binary search for sparse-unsafe 
+		double[] c = out.getDenseBlock();
+		long lnnz = 0;
+		for(int i=rl, cix=rl*n; i<ru; i++, cix+=n) {
+			int lastj = -1;
+			//handle non-empty rows
+			if( sblock != null && !sblock.isEmpty(i) ) {
+				int apos = sblock.pos(i);
+				int alen = sblock.size(i);
+				int[] aix = sblock.indexes(i);
+				double[] avals = sblock.values(i);
+				for(int k=apos; k<apos+alen; k++) {
+					//process zeros before current non-zero
+					if( !sparseSafe )
+						for(int j=lastj+1; j<aix[k]; j++)
+							lnnz += ((c[cix+j]=genexec(0, b, scalars, m, n, i, j))!=0)?1:0;
+					//process current non-zero
+					lastj = aix[k];
+					lnnz += ((c[cix+lastj]=genexec(avals[k], b, scalars, m, n, i, lastj))!=0)?1:0;
 				}
-				//process empty rows or remaining zeros
-				if( !sparseSafe )
-					for(int j=lastj+1; j<n; j++)
-						kplus.execute2(kbuff, genexec(0, b, scalars, m, n, i, j));
 			}
-			ret = kbuff._sum;
+			//process empty rows or remaining zeros
+			if( !sparseSafe )
+				for(int j=lastj+1; j<n; j++)
+					lnnz += ((c[cix+j]=genexec(0, b, scalars, m, n, i, j))!=0)?1:0;
 		}
-		//safe aggregation for min/max w/ handling of zero entries
-		//note: sparse safe with zero value as min/max handled outside
-		else {
-			ret = (_aggOp==AggOp.MIN) ? Double.MAX_VALUE : -Double.MAX_VALUE; 
-			ret = (sparseSafe && sblock.size() < (long)m*n) ? 0 : ret;
-			
-			//note: sequential scan algorithm for both sparse-safe and -unsafe 
-			//in order to avoid binary search for sparse-unsafe 
-			for(int i=rl; i<ru; i++) {
-				int lastj = -1;
-				//handle non-empty rows
-				if( sblock != null && !sblock.isEmpty(i) ) {
-					int apos = sblock.pos(i);
-					int alen = sblock.size(i);
-					int[] aix = sblock.indexes(i);
-					double[] avals = sblock.values(i);
-					for(int k=apos; k<apos+alen; k++) {
-						//process zeros before current non-zero
-						if( !sparseSafe )
-							for(int j=lastj+1; j<aix[k]; j++)
-								ret = vfun.execute(ret, genexec(0, b, scalars, m, n, i, j));
-						//process current non-zero
-						lastj = aix[k];
-						ret = vfun.execute(ret, genexec(avals[k], b, scalars, m, n, i, lastj));
-					}
+		return lnnz;
+	}
+	
+	private long executeSparseRowAggSum(SparseBlock sblock, double[][] b, double[] scalars, MatrixBlock out, int m, int n, boolean sparseSafe, int rl, int ru) 
+		throws DMLRuntimeException 
+	{
+		KahanFunction kplus = (KahanFunction) getAggFunction();
+		KahanObject kbuff = new KahanObject(0, 0);
+
+		//note: sequential scan algorithm for both sparse-safe and -unsafe 
+		//in order to avoid binary search for sparse-unsafe 
+		double[] c = out.getDenseBlock();
+		long lnnz = 0;
+		for(int i=rl; i<ru; i++) {
+			kbuff.set(0, 0);
+			int lastj = -1;
+			//handle non-empty rows
+			if( sblock != null && !sblock.isEmpty(i) ) {
+				int apos = sblock.pos(i);
+				int alen = sblock.size(i);
+				int[] aix = sblock.indexes(i);
+				double[] avals = sblock.values(i);
+				for(int k=apos; k<apos+alen; k++) {
+					//process zeros before current non-zero
+					if( !sparseSafe )
+						for(int j=lastj+1; j<aix[k]; j++)
+							kplus.execute2(kbuff, genexec(0, b, scalars, m, n, i, j));
+					//process current non-zero
+					lastj = aix[k];
+					kplus.execute2(kbuff, genexec(avals[k], b, scalars, m, n, i, lastj));
 				}
-				//process empty rows or remaining zeros
-				if( !sparseSafe )
-					for(int j=lastj+1; j<n; j++)
-						ret = vfun.execute(ret, genexec(0, b, scalars, m, n, i, j));
 			}
+			//process empty rows or remaining zeros
+			if( !sparseSafe )
+				for(int j=lastj+1; j<n; j++)
+					kplus.execute2(kbuff, genexec(0, b, scalars, m, n, i, j));
+			lnnz += ((c[i] = kbuff._sum)!=0) ? 1 : 0;
 		}
-		
-		return ret;
+		return lnnz;
 	}
 	
-	private long executeSparse(SparseBlock sblock, double[][] b, double[] scalars, double[] c, int m, int n, boolean sparseSafe, int rl, int ru) 
+	private long executeSparseRowAggMxx(SparseBlock sblock, double[][] b, double[] scalars, MatrixBlock out, int m, int n, boolean sparseSafe, int rl, int ru) 
 		throws DMLRuntimeException 
 	{
-		if( sparseSafe && sblock == null )
-			return 0;
+		double initialVal = (_aggOp==AggOp.MIN) ? Double.MAX_VALUE : -Double.MAX_VALUE;
+		ValueFunction vfun = getAggFunction();
 		
+		//note: sequential scan algorithm for both sparse-safe and -unsafe 
+		//in order to avoid binary search for sparse-unsafe 
+		double[] c = out.getDenseBlock();
 		long lnnz = 0;
-		if( _type == CellType.NO_AGG )
-		{
-			//note: sequential scan algorithm for both sparse-safe and -unsafe 
-			//in order to avoid binary search for sparse-unsafe 
-			for(int i=rl, cix=rl*n; i<ru; i++, cix+=n) {
-				int lastj = -1;
-				//handle non-empty rows
-				if( sblock != null && !sblock.isEmpty(i) ) {
-					int apos = sblock.pos(i);
-					int alen = sblock.size(i);
-					int[] aix = sblock.indexes(i);
-					double[] avals = sblock.values(i);
-					for(int k=apos; k<apos+alen; k++) {
-						//process zeros before current non-zero
-						if( !sparseSafe )
-							for(int j=lastj+1; j<aix[k]; j++)
-								lnnz += ((c[cix+j]=genexec(0, b, scalars, m, n, i, j))!=0)?1:0;
-						//process current non-zero
-						lastj = aix[k];
-						lnnz += ((c[cix+lastj]=genexec(avals[k], b, scalars, m, n, i, lastj))!=0)?1:0;
-					}
+		for(int i=rl; i<ru; i++) {
+			double tmp = (sparseSafe && sblock.size(i) < n) ? 0 : initialVal;
+			int lastj = -1;
+			//handle non-empty rows
+			if( sblock != null && !sblock.isEmpty(i) ) {
+				int apos = sblock.pos(i);
+				int alen = sblock.size(i);
+				int[] aix = sblock.indexes(i);
+				double[] avals = sblock.values(i);
+				for(int k=apos; k<apos+alen; k++) {
+					//process zeros before current non-zero
+					if( !sparseSafe )
+						for(int j=lastj+1; j<aix[k]; j++)
+							tmp = vfun.execute(tmp, genexec(0, b, scalars, m, n, i, j));
+					//process current non-zero
+					lastj = aix[k];
+					tmp = vfun.execute( tmp, genexec(avals[k], b, scalars, m, n, i, lastj));
 				}
-				//process empty rows or remaining zeros
-				if( !sparseSafe )
-					for(int j=lastj+1; j<n; j++)
-						lnnz += ((c[cix+j]=genexec(0, b, scalars, m, n, i, j))!=0)?1:0;
 			}
+			//process empty rows or remaining zeros
+			if( !sparseSafe )
+				for(int j=lastj+1; j<n; j++)
+					tmp = vfun.execute(tmp, genexec(0, b, scalars, m, n, i, j));
+			lnnz += ((c[i] = tmp)!=0) ? 1 : 0;
 		}
-		else if( _type == CellType.ROW_AGG ) 
-		{
-			ValueFunction vfun = getAggFunction();
-
-			if( vfun instanceof KahanFunction ) {
-				KahanObject kbuff = new KahanObject(0, 0);
-				KahanFunction kplus = (KahanFunction) vfun;
+		return lnnz;
+	}
+	
+	private double executeSparseAggSum(SparseBlock sblock, double[][] b, double[] scalars, int m, int n, boolean sparseSafe, int rl, int ru) 
+		throws DMLRuntimeException 
+	{
+		KahanFunction kplus = (KahanFunction) getAggFunction();
+		KahanObject kbuff = new KahanObject(0, 0);
 
-				//note: sequential scan algorithm for both sparse-safe and -unsafe 
-				//in order to avoid binary search for sparse-unsafe 
-				for(int i=rl; i<ru; i++) {
-					kbuff.set(0, 0);
-					int lastj = -1;
-					//handle non-empty rows
-					if( sblock != null && !sblock.isEmpty(i) ) {
-						int apos = sblock.pos(i);
-						int alen = sblock.size(i);
-						int[] aix = sblock.indexes(i);
-						double[] avals = sblock.values(i);
-						for(int k=apos; k<apos+alen; k++) {
-							//process zeros before current non-zero
-							if( !sparseSafe )
-								for(int j=lastj+1; j<aix[k]; j++)
-									kplus.execute2(kbuff, genexec(0, b, scalars, m, n, i, j));
-							//process current non-zero
-							lastj = aix[k];
-							kplus.execute2(kbuff, genexec(avals[k], b, scalars, m, n, i, lastj));
-						}
-					}
-					//process empty rows or remaining zeros
+		//note: sequential scan algorithm for both sparse-safe and -unsafe 
+		//in order to avoid binary search for sparse-unsafe 
+		for(int i=rl; i<ru; i++) {
+			int lastj = -1;
+			//handle non-empty rows
+			if( sblock != null && !sblock.isEmpty(i) ) {
+				int apos = sblock.pos(i);
+				int alen = sblock.size(i);
+				int[] aix = sblock.indexes(i);
+				double[] avals = sblock.values(i);
+				for(int k=apos; k<apos+alen; k++) {
+					//process zeros before current non-zero
 					if( !sparseSafe )
-						for(int j=lastj+1; j<n; j++)
+						for(int j=lastj+1; j<aix[k]; j++)
 							kplus.execute2(kbuff, genexec(0, b, scalars, m, n, i, j));
-					lnnz += ((c[i] = kbuff._sum)!=0) ? 1 : 0;
+					//process current non-zero
+					lastj = aix[k];
+					kplus.execute2(kbuff, genexec(avals[k], b, scalars, m, n, i, lastj));
 				}
 			}
-			else {
-				double initialVal = (_aggOp==AggOp.MIN) ? Double.MAX_VALUE : -Double.MAX_VALUE;
-				
-				//note: sequential scan algorithm for both sparse-safe and -unsafe 
-				//in order to avoid binary search for sparse-unsafe 
-				for(int i=rl; i<ru; i++) {
-					double tmp = (sparseSafe && sblock.size(i) < n) ? 0 : initialVal;
-					int lastj = -1;
-					//handle non-empty rows
-					if( sblock != null && !sblock.isEmpty(i) ) {
-						int apos = sblock.pos(i);
-						int alen = sblock.size(i);
-						int[] aix = sblock.indexes(i);
-						double[] avals = sblock.values(i);
-						for(int k=apos; k<apos+alen; k++) {
-							//process zeros before current non-zero
-							if( !sparseSafe )
-								for(int j=lastj+1; j<aix[k]; j++)
-									tmp = vfun.execute(tmp, genexec(0, b, scalars, m, n, i, j));
-							//process current non-zero
-							lastj = aix[k];
-							tmp = vfun.execute( tmp, genexec(avals[k], b, scalars, m, n, i, lastj));
-						}
-					}
-					//process empty rows or remaining zeros
+			//process empty rows or remaining zeros
+			if( !sparseSafe )
+				for(int j=lastj+1; j<n; j++)
+					kplus.execute2(kbuff, genexec(0, b, scalars, m, n, i, j));
+		}
+		return kbuff._sum;
+	}
+	
+	private double executeSparseAggMxx(SparseBlock sblock, double[][] b, double[] scalars, int m, int n, boolean sparseSafe, int rl, int ru) 
+		throws DMLRuntimeException 
+	{
+		double ret = (_aggOp==AggOp.MIN) ? Double.MAX_VALUE : -Double.MAX_VALUE;
+		ret = (sparseSafe && sblock.size() < (long)m*n) ? 0 : ret;
+		ValueFunction vfun = getAggFunction();
+		
+		//note: sequential scan algorithm for both sparse-safe and -unsafe 
+		//in order to avoid binary search for sparse-unsafe 
+		for(int i=rl; i<ru; i++) {
+			int lastj = -1;
+			//handle non-empty rows
+			if( sblock != null && !sblock.isEmpty(i) ) {
+				int apos = sblock.pos(i);
+				int alen = sblock.size(i);
+				int[] aix = sblock.indexes(i);
+				double[] avals = sblock.values(i);
+				for(int k=apos; k<apos+alen; k++) {
+					//process zeros before current non-zero
 					if( !sparseSafe )
-						for(int j=lastj+1; j<n; j++)
-							tmp = vfun.execute(tmp, genexec(0, b, scalars, m, n, i, j));
-					lnnz += ((c[i] = tmp)!=0) ? 1 : 0;
+						for(int j=lastj+1; j<aix[k]; j++)
+							ret = vfun.execute(ret, genexec(0, b, scalars, m, n, i, j));
+					//process current non-zero
+					lastj = aix[k];
+					ret = vfun.execute(ret, genexec(avals[k], b, scalars, m, n, i, lastj));
 				}
 			}
+			//process empty rows or remaining zeros
+			if( !sparseSafe )
+				for(int j=lastj+1; j<n; j++)
+					ret = vfun.execute(ret, genexec(0, b, scalars, m, n, i, j));
 		}
-		
+		return ret;
+	}
+	
+	private long executeDenseNoAgg(double[] a, double[][] b, double[] scalars, double[] c, int m, int n, boolean sparseSafe, int rl, int ru) 
+		throws DMLRuntimeException 
+	{
+		long lnnz = 0;
+		for( int i=rl, ix=rl*n; i<ru; i++ ) 
+			for( int j=0; j<n; j++, ix++ ) {
+				double aval = (a != null) ? a[ix] : 0;
+				if( aval != 0 || !sparseSafe) {
+					c[ix] = genexec( aval, b, scalars, m, n, i, j); 
+					lnnz += (c[ix]!=0) ? 1 : 0;
+				}
+			}
 		return lnnz;
 	}
-
+	
+	private long executeDenseRowAggSum(double[] a, double[][] b, double[] scalars, double[] c, int m, int n, boolean sparseSafe, int rl, int ru) 
+		throws DMLRuntimeException 
+	{
+		KahanFunction kplus = (KahanFunction) getAggFunction();
+		KahanObject kbuff = new KahanObject(0, 0);
+		long lnnz = 0;
+		for( int i=rl, ix=rl*n; i<ru; i++ ) {
+			kbuff.set(0, 0);
+			for( int j=0; j<n; j++, ix++ ) {
+				double aval = (a != null) ? a[ix] : 0;
+				if( aval != 0 || !sparseSafe)
+					kplus.execute2(kbuff, genexec(aval, b, scalars, m, n, i, j));
+			}
+			lnnz += ((c[i] = kbuff._sum)!=0) ? 1 : 0;
+		}
+		return lnnz;
+	}
+	
+	private long executeDenseRowAggMxx(double[] a, double[][] b, double[] scalars, double[] c, int m, int n, boolean sparseSafe, int rl, int ru) 
+		throws DMLRuntimeException 
+	{
+		double initialVal = (_aggOp==AggOp.MIN) ? Double.MAX_VALUE : -Double.MAX_VALUE;
+		ValueFunction vfun = getAggFunction();
+		long lnnz = 0;
+		if( a == null && !sparseSafe ) { //empty
+			for( int i=rl; i<ru; i++ ) { 
+				double tmp = initialVal;
+				for( int j=0; j<n; j++ )
+					tmp = vfun.execute(tmp, genexec( 0, b, scalars, m, n, i, j ));
+				lnnz += ((c[i] = tmp)!=0) ? 1 : 0;
+			}
+		}
+		else if( a != null ) { //general case
+			for( int i=rl, ix=rl*n; i<ru; i++ ) {
+				double tmp = initialVal;
+				for( int j=0; j<n; j++, ix++ )
+					if( a[ix] != 0 || !sparseSafe)
+						tmp = vfun.execute(tmp, genexec( a[ix], b, scalars, m, n, i, j ));
+				if( sparseSafe && UtilFunctions.containsZero(a, ix-n, n) )
+					tmp = vfun.execute(tmp, 0);
+				lnnz += ((c[i] = tmp)!=0) ? 1 : 0;
+			}
+		}
+		return lnnz;
+	}
+	
+	private double executeDenseAggSum(double[] a, double[][] b, double[] scalars, int m, int n, boolean sparseSafe, int rl, int ru) 
+		throws DMLRuntimeException 
+	{
+		KahanFunction kplus = (KahanFunction) getAggFunction();
+		KahanObject kbuff = new KahanObject(0, 0);
+		
+		for( int i=rl, ix=rl*n; i<ru; i++ ) 
+			for( int j=0; j<n; j++, ix++ ) {
+				double aval = (a != null) ? a[ix] : 0;
+				if( aval != 0 || !sparseSafe)
+					kplus.execute2(kbuff, genexec(aval, b, scalars, m, n, i, j));
+			}
+		return kbuff._sum;
+	}
+	
+	private double executeDenseAggMxx(double[] a, double[][] b, double[] scalars, int m, int n, boolean sparseSafe, int rl, int ru) 
+		throws DMLRuntimeException 
+	{
+		//safe aggregation for min/max w/ handling of zero entries
+		//note: sparse safe with zero value as min/max handled outside
+		double ret = (_aggOp==AggOp.MIN) ? Double.MAX_VALUE : -Double.MAX_VALUE; 
+		ValueFunction vfun = getAggFunction();
+		
+		for( int i=rl, ix=rl*n; i<ru; i++ ) 
+			for( int j=0; j<n; j++, ix++ ) {
+				double aval = (a != null) ? a[ix] : 0;
+				if( aval != 0 || !sparseSafe)
+					ret = vfun.execute(ret, genexec(aval, b, scalars, m, n, i, j));
+			}
+		return ret;
+	}
+	
+	
 	protected abstract double genexec( double a, double[][] b, double[] scalars, int m, int n, int rowIndex, int colIndex);
 	
 	private class ParAggTask implements Callable<Double> 
@@ -582,14 +645,14 @@ public abstract class SpoofCellwise extends SpoofOperator implements Serializabl
 		private final MatrixBlock _a;
 		private final double[][] _b;
 		private final double[] _scalars;
-		private final double[] _c;
+		private final MatrixBlock _c;
 		private final int _rlen;
 		private final int _clen;
 		private final boolean _safe;
 		private final int _rl;
 		private final int _ru;
 
-		protected ParExecTask( MatrixBlock a, double[][] b, double[] scalars, double[] c, 
+		protected ParExecTask( MatrixBlock a, double[][] b, double[] scalars, MatrixBlock c, 
 				int rlen, int clen, boolean sparseSafe, int rl, int ru ) {
 			_a = a;
 			_b = b;