Posted to commits@systemml.apache.org by re...@apache.org on 2017/11/03 18:02:03 UTC

[01/50] [abbrv] systemml git commit: [MINOR] Refactoring lib matrixmult/bincell (instruction footprint) [Forced Update!]

Repository: systemml
Updated Branches:
  refs/heads/master eb15c5198 -> 0d4672207 (forced update)


[MINOR] Refactoring lib matrixmult/bincell (instruction footprint)

This patch makes a minor refactoring of the libraries for matrix
multiplications and binary cell-wise operations in order to reduce the
instruction footprint and simplify JIT compilation. Specifically, the
methods for dense-dense mm, sparse-dense mm, and safe binary operations
have been split into methods for the major individual cases.
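
As an illustration of the pattern (a hypothetical sketch, not the
SystemML code itself): a single large, branchy method is replaced by a
thin dispatcher that calls small per-case helpers, so each JIT-compiled
method body stays small.

  // hypothetical dispatcher + per-case helpers (names invented for
  // illustration); the hot loops live in small, separately compiled methods
  class DispatchSketch {
    static void safeBinary(double[] a, double[] b, double[] c,
      boolean sparseA, boolean sparseB)
    {
      if( sparseA && sparseB )
        binarySparseSparse(a, b, c);
      else if( !sparseA && !sparseB )
        binaryDenseDense(a, b, c);
      else
        binaryGeneric(a, b, c);
    }
    private static void binaryDenseDense(double[] a, double[] b, double[] c) {
      for( int i=0; i<c.length; i++ )
        c[i] = a[i] + b[i]; //example op: cell-wise plus
    }
    private static void binarySparseSparse(double[] a, double[] b, double[] c) { /*...*/ }
    private static void binaryGeneric(double[] a, double[] b, double[] c) { /*...*/ }
  }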

On an end-to-end CNN application, this patch reduced the number of
L1-icache misses from 6,055,257,066 to 5,663,272,573 and the number of
iTLB misses from 289,601,812 to 161,268,707.


Project: http://git-wip-us.apache.org/repos/asf/systemml/repo
Commit: http://git-wip-us.apache.org/repos/asf/systemml/commit/a347af3b
Tree: http://git-wip-us.apache.org/repos/asf/systemml/tree/a347af3b
Diff: http://git-wip-us.apache.org/repos/asf/systemml/diff/a347af3b

Branch: refs/heads/master
Commit: a347af3b7b488ac3b296b9b9692f7172d60ac6f5
Parents: 17c5d5a
Author: Matthias Boehm <mb...@gmail.com>
Authored: Sun Oct 15 02:22:55 2017 -0700
Committer: Matthias Boehm <mb...@gmail.com>
Committed: Sun Oct 15 02:22:55 2017 -0700

----------------------------------------------------------------------
 .../runtime/matrix/data/LibMatrixBincell.java   | 446 +++++++------
 .../runtime/matrix/data/LibMatrixMult.java      | 665 ++++++++++---------
 2 files changed, 580 insertions(+), 531 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/systemml/blob/a347af3b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixBincell.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixBincell.java b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixBincell.java
index 7622137..2878b3b 100644
--- a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixBincell.java
+++ b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixBincell.java
@@ -210,12 +210,9 @@ public class LibMatrixBincell
 		{
 			return;
 		}
-	
-		int rlen = m1.rlen;
-		int clen = m1.clen;
-		BinaryAccessType atype = getBinaryAccessType(m1, m2);
 		
-		if(    atype == BinaryAccessType.MATRIX_COL_VECTOR //MATRIX - VECTOR
+		BinaryAccessType atype = getBinaryAccessType(m1, m2);
+		if( atype == BinaryAccessType.MATRIX_COL_VECTOR //MATRIX - VECTOR
 			|| atype == BinaryAccessType.MATRIX_ROW_VECTOR)  
 		{
 			//note: m2 vector and hence always dense
@@ -232,213 +229,24 @@ public class LibMatrixBincell
 		}
 		else //MATRIX - MATRIX
 		{
-			if(m1.sparse && m2.sparse)
-			{
-				if(ret.sparse)
-					ret.allocateSparseRowsBlock();	
-				
-				//both sparse blocks existing
-				if(m1.sparseBlock!=null && m2.sparseBlock!=null)
-				{
-					SparseBlock lsblock = m1.sparseBlock;
-					SparseBlock rsblock = m2.sparseBlock;
-					
-					if( ret.sparse && lsblock.isAligned(rsblock) )
-					{
-						SparseBlock c = ret.sparseBlock;
-						for(int r=0; r<rlen; r++) 
-							if( !lsblock.isEmpty(r) ) {
-								int alen = lsblock.size(r);
-								int apos = lsblock.pos(r);
-								int[] aix = lsblock.indexes(r);
-								double[] avals = lsblock.values(r);
-								double[] bvals = rsblock.values(r);
-								c.allocate(r, alen);
-								for( int j=apos; j<apos+alen; j++ ) {
-									double tmp = op.fn.execute(avals[j], bvals[j]);
-									c.append(r, aix[j], tmp);
-								}
-								ret.nonZeros += c.size(r);
-							}
-					}
-					else //general case
-					{	
-						for(int r=0; r<rlen; r++)
-						{
-							if( !lsblock.isEmpty(r) && !rsblock.isEmpty(r) ) {
-								mergeForSparseBinary(op, lsblock.values(r), lsblock.indexes(r), lsblock.pos(r), lsblock.size(r),
-										rsblock.values(r), rsblock.indexes(r), rsblock.pos(r), rsblock.size(r), r, ret);	
-							}
-							else if( !rsblock.isEmpty(r) ) {
-								appendRightForSparseBinary(op, rsblock.values(r), rsblock.indexes(r), 
-										rsblock.pos(r), rsblock.size(r), 0, r, ret);
-							}
-							else if( !lsblock.isEmpty(r) ){
-								appendLeftForSparseBinary(op, lsblock.values(r), lsblock.indexes(r), 
-										lsblock.pos(r), lsblock.size(r), 0, r, ret);
-							}
-							// do nothing if both not existing
-						}
-					}
-				}
-				//right sparse block existing
-				else if( m2.sparseBlock!=null )
-				{
-					SparseBlock rsblock = m2.sparseBlock;
-					
-					for(int r=0; r<Math.min(rlen, rsblock.numRows()); r++)
-						if( !rsblock.isEmpty(r) )
-						{
-							appendRightForSparseBinary(op, rsblock.values(r), rsblock.indexes(r), 
-									rsblock.pos(r), rsblock.size(r), 0, r, ret);
-						}
-				}
-				//left sparse block existing
-				else
-				{
-					SparseBlock lsblock = m1.sparseBlock;
-					
-					for(int r=0; r<rlen; r++)
-						if( !lsblock.isEmpty(r) )
-						{
-							appendLeftForSparseBinary(op, lsblock.values(r), lsblock.indexes(r), 
-									lsblock.pos(r), lsblock.size(r), 0, r, ret);
-						}
-				}
+			if(m1.sparse && m2.sparse) {
+				safeBinaryMMSparseSparse(m1, m2, ret, op);
 			}
 			else if( !ret.sparse && (m1.sparse || m2.sparse) &&
-					(op.fn instanceof Plus || op.fn instanceof Minus ||
-					op.fn instanceof PlusMultiply || op.fn instanceof MinusMultiply ||
-					(op.fn instanceof Multiply && !m2.sparse )))
-			{
-				//specific case in order to prevent binary search on sparse inputs (see quickget and quickset)
-				ret.allocateDenseBlock();
-				final int m = ret.rlen;
-				final int n = ret.clen;
-				double[] c = ret.denseBlock;
-				
-				//1) process left input: assignment
-				
-				if( m1.sparse ) //SPARSE left
-				{
-					if( m1.sparseBlock != null )
-					{
-						SparseBlock a = m1.sparseBlock;
-						
-						for( int i=0, ix=0; i<m; i++, ix+=n ) {
-							if( !a.isEmpty(i) )
-							{
-								int apos = a.pos(i);
-								int alen = a.size(i);
-								int[] aix = a.indexes(i);
-								double[] avals = a.values(i);
-								for(int k = apos; k < apos+alen; k++) 
-									c[ix+aix[k]] = avals[k];
-							}
-						}
-					}
-				}
-				else //DENSE left
-				{
-					if( !m1.isEmptyBlock(false) ) 
-						System.arraycopy(m1.denseBlock, 0, c, 0, m*n);
-					else
-						Arrays.fill(ret.denseBlock, 0, m*n, 0); 
-				}
-				
-				//2) process right input: op.fn (+,-,*), * only if dense
-				long lnnz = 0;
-				if( m2.sparse ) //SPARSE right
-				{				
-					if(m2.sparseBlock!=null)
-					{
-						SparseBlock a = m2.sparseBlock;
-						
-						for( int i=0, ix=0; i<m; i++, ix+=n ) {
-							if( !a.isEmpty(i) ) {
-								int apos = a.pos(i);
-								int alen = a.size(i);
-								int[] aix = a.indexes(i);
-								double[] avals = a.values(i);
-								for(int k = apos; k < apos+alen; k++) 
-									c[ix+aix[k]] = op.fn.execute(c[ix+aix[k]], avals[k]);
-							}
-							//exploit temporal locality of rows
-							lnnz += ret.recomputeNonZeros(i, i, 0, clen-1);
-						}
-					}
-				}
-				else //DENSE right
-				{
-					if( !m2.isEmptyBlock(false) ) {
-						double[] a = m2.denseBlock;
-						for( int i=0; i<m*n; i++ ) {
-							c[i] = op.fn.execute(c[i], a[i]);
-							lnnz += (c[i]!=0) ? 1 : 0;
-						}
-					}
-					else if(op.fn instanceof Multiply)
-						Arrays.fill(ret.denseBlock, 0, m*n, 0); 
-				}
-				
-				//3) recompute nnz
-				ret.setNonZeros(lnnz);
+				(op.fn instanceof Plus || op.fn instanceof Minus ||
+				op.fn instanceof PlusMultiply || op.fn instanceof MinusMultiply ||
+				(op.fn instanceof Multiply && !m2.sparse ))) {
+				safeBinaryMMSparseDenseDense(m1, m2, ret, op);
 			}
 			else if( !ret.sparse && !m1.sparse && !m2.sparse 
-					&& m1.denseBlock!=null && m2.denseBlock!=null )
-			{
-				ret.allocateDenseBlock();
-				final int m = ret.rlen;
-				final int n = ret.clen;
-				double[] a = m1.denseBlock;
-				double[] b = m2.denseBlock;
-				double[] c = ret.denseBlock;
-				ValueFunction fn = op.fn;
-				
-				//compute dense-dense binary, maintain nnz on-the-fly
-				int lnnz = 0;
-				for( int i=0; i<m*n; i++ ) {
-					c[i] = fn.execute(a[i], b[i]);
-					lnnz += (c[i]!=0)? 1 : 0;
-				}
-				ret.setNonZeros(lnnz);
+				&& m1.denseBlock!=null && m2.denseBlock!=null ) {
+				safeBinaryMMDenseDenseDense(m1, m2, ret, op);
 			}
-			else if( skipEmpty && (m1.sparse || m2.sparse) ) 
-			{
-				SparseBlock a = m1.sparse ? m1.sparseBlock : m2.sparseBlock;
-				if( a == null )
-					return;
-				
-				//prepare second input and allocate output
-				MatrixBlock b = m1.sparse ? m2 : m1;
-				ret.allocateBlock();
-				
-				for( int i=0; i<a.numRows(); i++ ) {
-					if( a.isEmpty(i) ) continue;
-					int apos = a.pos(i);
-					int alen = a.size(i);
-					int[] aix = a.indexes(i);
-					double[] avals = a.values(i);
-					if( ret.sparse && !b.sparse )
-						ret.sparseBlock.allocate(i, alen);
-					for(int k = apos; k < apos+alen; k++) {
-						double in2 = b.quickGetValue(i, aix[k]);
-						if( in2==0 ) continue;
-						double val = op.fn.execute(avals[k], in2);
-						ret.appendValue(i, aix[k], val);
-					}
-				}
+			else if( skipEmpty && (m1.sparse || m2.sparse) ) {
+				safeBinaryMMSparseDenseSkip(m1, m2, ret, op);
 			}
-			else //generic case
-			{
-				for(int r=0; r<rlen; r++)
-					for(int c=0; c<clen; c++) {
-						double in1 = m1.quickGetValue(r, c);
-						double in2 = m2.quickGetValue(r, c);
-						if( in1==0 && in2==0) continue;
-						double val = op.fn.execute(in1, in2);
-						ret.appendValue(r, c, val);
-					}
+			else { //generic case
+				safeBinaryMMGeneric(m1, m2, ret, op);
 			}
 		}
 	}
@@ -694,7 +502,7 @@ public class LibMatrixBincell
 						for( int j=0; j<blen; j++ ) {
 							double v1 = m1.quickGetValue(i, bix[j]);
 							double v = op.fn.execute( v1, bvals[j] );
-							ret.appendValue(i, bix[j], v);					
+							ret.appendValue(i, bix[j], v);
 						}
 					}
 				}
@@ -731,19 +539,231 @@ public class LibMatrixBincell
 		}
 		else {
 			for(int r=0; r<rlen; r++) {
-				double v1 = m1.quickGetValue(r, 0);		
+				double v1 = m1.quickGetValue(r, 0);
 				for(int c=0; c<clen; c++)
 				{
 					double v2 = m2.quickGetValue(0, c);
 					double v = op.fn.execute( v1, v2 );
-					ret.appendValue(r, c, v);	
+					ret.appendValue(r, c, v);
 				}
-			}	
-		}	
-			
+			}
+		}
+		
 		//no need to recomputeNonZeros since maintained in append value
 	}
 	
+	private static void safeBinaryMMSparseSparse(MatrixBlock m1, MatrixBlock m2, MatrixBlock ret, BinaryOperator op) 
+		throws DMLRuntimeException 
+	{
+		final int rlen = m1.rlen;
+		if(ret.sparse)
+			ret.allocateSparseRowsBlock();
+		
+		//both sparse blocks existing
+		if(m1.sparseBlock!=null && m2.sparseBlock!=null)
+		{
+			SparseBlock lsblock = m1.sparseBlock;
+			SparseBlock rsblock = m2.sparseBlock;
+			
+			if( ret.sparse && lsblock.isAligned(rsblock) )
+			{
+				SparseBlock c = ret.sparseBlock;
+				for(int r=0; r<rlen; r++) 
+					if( !lsblock.isEmpty(r) ) {
+						int alen = lsblock.size(r);
+						int apos = lsblock.pos(r);
+						int[] aix = lsblock.indexes(r);
+						double[] avals = lsblock.values(r);
+						double[] bvals = rsblock.values(r);
+						c.allocate(r, alen);
+						for( int j=apos; j<apos+alen; j++ ) {
+							double tmp = op.fn.execute(avals[j], bvals[j]);
+							c.append(r, aix[j], tmp);
+						}
+						ret.nonZeros += c.size(r);
+					}
+			}
+			else //general case
+			{
+				for(int r=0; r<rlen; r++) {
+					if( !lsblock.isEmpty(r) && !rsblock.isEmpty(r) ) {
+						mergeForSparseBinary(op, lsblock.values(r), lsblock.indexes(r), lsblock.pos(r), lsblock.size(r),
+							rsblock.values(r), rsblock.indexes(r), rsblock.pos(r), rsblock.size(r), r, ret);
+					}
+					else if( !rsblock.isEmpty(r) ) {
+						appendRightForSparseBinary(op, rsblock.values(r), rsblock.indexes(r), 
+							rsblock.pos(r), rsblock.size(r), 0, r, ret);
+					}
+					else if( !lsblock.isEmpty(r) ){
+						appendLeftForSparseBinary(op, lsblock.values(r), lsblock.indexes(r), 
+							lsblock.pos(r), lsblock.size(r), 0, r, ret);
+					}
+					// do nothing if both not existing
+				}
+			}
+		}
+		//right sparse block existing
+		else if( m2.sparseBlock!=null )
+		{
+			SparseBlock rsblock = m2.sparseBlock;
+			for(int r=0; r<Math.min(rlen, rsblock.numRows()); r++) {
+				if( rsblock.isEmpty(r) ) continue;
+				appendRightForSparseBinary(op, rsblock.values(r), rsblock.indexes(r), 
+					rsblock.pos(r), rsblock.size(r), 0, r, ret);
+			}
+		}
+		//left sparse block existing
+		else
+		{
+			SparseBlock lsblock = m1.sparseBlock;
+			for(int r=0; r<rlen; r++) {
+				if( lsblock.isEmpty(r) ) continue;
+				appendLeftForSparseBinary(op, lsblock.values(r), lsblock.indexes(r), 
+					lsblock.pos(r), lsblock.size(r), 0, r, ret);
+			}
+		}
+	}
+	
+	private static void safeBinaryMMSparseDenseDense(MatrixBlock m1, MatrixBlock m2, MatrixBlock ret, BinaryOperator op) 
+		throws DMLRuntimeException 
+	{
+		//specific case in order to prevent binary search on sparse inputs (see quickget and quickset)
+		ret.allocateDenseBlock();
+		final int m = ret.rlen;
+		final int n = ret.clen;
+		double[] c = ret.denseBlock;
+		
+		//1) process left input: assignment
+		
+		if( m1.sparse ) //SPARSE left
+		{
+			if( m1.sparseBlock != null )
+			{
+				SparseBlock a = m1.sparseBlock;
+				
+				for( int i=0, ix=0; i<m; i++, ix+=n ) {
+					if( !a.isEmpty(i) )
+					{
+						int apos = a.pos(i);
+						int alen = a.size(i);
+						int[] aix = a.indexes(i);
+						double[] avals = a.values(i);
+						for(int k = apos; k < apos+alen; k++) 
+							c[ix+aix[k]] = avals[k];
+					}
+				}
+			}
+		}
+		else //DENSE left
+		{
+			if( !m1.isEmptyBlock(false) ) 
+				System.arraycopy(m1.denseBlock, 0, c, 0, m*n);
+			else
+				Arrays.fill(ret.denseBlock, 0, m*n, 0); 
+		}
+		
+		//2) process right input: op.fn (+,-,*), * only if dense
+		long lnnz = 0;
+		if( m2.sparse ) //SPARSE right
+		{
+			if(m2.sparseBlock!=null)
+			{
+				SparseBlock a = m2.sparseBlock;
+				
+				for( int i=0, ix=0; i<m; i++, ix+=n ) {
+					if( !a.isEmpty(i) ) {
+						int apos = a.pos(i);
+						int alen = a.size(i);
+						int[] aix = a.indexes(i);
+						double[] avals = a.values(i);
+						for(int k = apos; k < apos+alen; k++) 
+							c[ix+aix[k]] = op.fn.execute(c[ix+aix[k]], avals[k]);
+					}
+					//exploit temporal locality of rows
+					lnnz += ret.recomputeNonZeros(i, i, 0, n-1);
+				}
+			}
+		}
+		else //DENSE right
+		{
+			if( !m2.isEmptyBlock(false) ) {
+				double[] a = m2.denseBlock;
+				for( int i=0; i<m*n; i++ ) {
+					c[i] = op.fn.execute(c[i], a[i]);
+					lnnz += (c[i]!=0) ? 1 : 0;
+				}
+			}
+			else if(op.fn instanceof Multiply)
+				Arrays.fill(ret.denseBlock, 0, m*n, 0); 
+		}
+		
+		//3) recompute nnz
+		ret.setNonZeros(lnnz);
+	}
+	
+	private static void safeBinaryMMDenseDenseDense(MatrixBlock m1, MatrixBlock m2, MatrixBlock ret, BinaryOperator op) 
+		throws DMLRuntimeException 
+	{
+		ret.allocateDenseBlock();
+		final int m = ret.rlen;
+		final int n = ret.clen;
+		double[] a = m1.denseBlock;
+		double[] b = m2.denseBlock;
+		double[] c = ret.denseBlock;
+		ValueFunction fn = op.fn;
+		
+		//compute dense-dense binary, maintain nnz on-the-fly
+		int lnnz = 0;
+		for( int i=0; i<m*n; i++ ) {
+			c[i] = fn.execute(a[i], b[i]);
+			lnnz += (c[i]!=0)? 1 : 0;
+		}
+		ret.setNonZeros(lnnz);
+	}
+	
+	private static void safeBinaryMMSparseDenseSkip(MatrixBlock m1, MatrixBlock m2, MatrixBlock ret, BinaryOperator op) 
+		throws DMLRuntimeException 
+	{
+		SparseBlock a = m1.sparse ? m1.sparseBlock : m2.sparseBlock;
+		if( a == null )
+			return;
+		
+		//prepare second input and allocate output
+		MatrixBlock b = m1.sparse ? m2 : m1;
+		ret.allocateBlock();
+		
+		for( int i=0; i<a.numRows(); i++ ) {
+			if( a.isEmpty(i) ) continue;
+			int apos = a.pos(i);
+			int alen = a.size(i);
+			int[] aix = a.indexes(i);
+			double[] avals = a.values(i);
+			if( ret.sparse && !b.sparse )
+				ret.sparseBlock.allocate(i, alen);
+			for(int k = apos; k < apos+alen; k++) {
+				double in2 = b.quickGetValue(i, aix[k]);
+				if( in2==0 ) continue;
+				double val = op.fn.execute(avals[k], in2);
+				ret.appendValue(i, aix[k], val);
+			}
+		}
+	}
+	
+	private static void safeBinaryMMGeneric(MatrixBlock m1, MatrixBlock m2, MatrixBlock ret, BinaryOperator op) 
+		throws DMLRuntimeException 
+	{
+		int rlen = m1.rlen;
+		int clen = m2.clen;
+		for(int r=0; r<rlen; r++)
+			for(int c=0; c<clen; c++) {
+				double in1 = m1.quickGetValue(r, c);
+				double in2 = m2.quickGetValue(r, c);
+				if( in1==0 && in2==0) continue;
+				double val = op.fn.execute(in1, in2);
+				ret.appendValue(r, c, val);
+			}
+	}
+	
 	/**
 	 * 
 	 * This will do cell wise operation for &lt;, &lt;=, &gt;, &gt;=, == and != operators.
@@ -1254,6 +1274,4 @@ public class LibMatrixBincell
 			result.appendValue(resultRow, cols2[j], v);
 		}
 	}
-	
 }
-

http://git-wip-us.apache.org/repos/asf/systemml/blob/a347af3b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixMult.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixMult.java b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixMult.java
index fee73c5..eca26f6 100644
--- a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixMult.java
+++ b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixMult.java
@@ -632,7 +632,7 @@ public class LibMatrixMult
 		ret.allocateBlock();
 		
 		try 
-		{			
+		{
 			ExecutorService pool = Executors.newFixedThreadPool(k);
 			ArrayList<MatrixMultWSigmoidTask> tasks = new ArrayList<>();
 			int blklen = (int)(Math.ceil((double)mW.rlen/k));
@@ -927,169 +927,189 @@ public class LibMatrixMult
 
 	private static void matrixMultDenseDense(MatrixBlock m1, MatrixBlock m2, MatrixBlock ret, boolean tm2, boolean pm2, int rl, int ru, int cl, int cu) 
 		throws DMLRuntimeException
-	{			
+	{
 		double[] a = m1.denseBlock;
 		double[] b = m2.denseBlock;
 		double[] c = ret.denseBlock;
 		final int m = m1.rlen;
 		final int n = m2.clen;
 		final int cd = m1.clen;
-
-		if( LOW_LEVEL_OPTIMIZATION )
-		{
-			if( m==1 && n==1 ) 		      //DOT PRODUCT
-			{
+		
+		if( LOW_LEVEL_OPTIMIZATION ) {
+			if( m==1 && n==1 ) {            //DOT PRODUCT
 				c[0] = dotProduct(a, b, cd);
 			}
-			else if( n>1 && cd == 1 )     //OUTER PRODUCT
-			{
+			else if( n>1 && cd == 1 ) {     //OUTER PRODUCT
 				for( int i=rl, cix=rl*n; i < ru; i++, cix+=n) {
 					if( a[i] == 1 )
 						System.arraycopy(b, 0, c, cix, n);
-				    else if( a[i] != 0 )
+					else if( a[i] != 0 )
 						vectMultiplyWrite(a[i], b, c, 0, cix, n);
 					else
 						Arrays.fill(c, cix, cix+n, 0);
 				}
 			}
-			else if( n==1 && cd == 1 )    //VECTOR-SCALAR
-			{
+			else if( n==1 && cd == 1 ) {    //VECTOR-SCALAR
 				vectMultiplyWrite(b[0], a, c, rl, rl, ru-rl);
 			}
-			else if( n==1 && cd<=2*1024 ) //MATRIX-VECTOR (short rhs)
-			{
-				for( int i=rl, aix=rl*cd; i < ru; i++, aix+=cd) 
-					c[i] = dotProduct(a, b, aix, 0, cd);	
+			else if( n==1 && cd<=2*1024 ) { //MATRIX-VECTOR (short rhs)
+				matrixMultDenseDenseMVShortRHS(a, b, c, cd, rl, ru);
 			}
-			else if( n==1 )               //MATRIX-VECTOR (tall rhs)
-			{
-				final int blocksizeI = 32;
-				final int blocksizeK = 2*1024; //16KB vector blocks (L1) 
-				for( int bi=rl; bi<ru; bi+=blocksizeI ) {
-					int bimin = Math.min(bi+blocksizeI, ru);
-					for( int bk=0; bk<cd; bk+=blocksizeK ) {
-						int bkmin = Math.min(bk+blocksizeK, cd);
-						for( int i=bi, aix=bi*cd+bk; i<bimin; i++, aix+=cd) 
-							c[i] += dotProduct(a, b, aix, bk, bkmin-bk);	
-					}
-				}
+			else if( n==1 ) {               //MATRIX-VECTOR (tall rhs)
+				matrixMultDenseDenseMVTallRHS(a, b, c, cd, rl, ru);
 			}
-			else if( pm2 && m==1 )        //VECTOR-MATRIX
-			{
-				//parallelization over rows in rhs matrix
-				//rest not aligned to blocks of 2 rows
-				final int kn = (ru-rl)%2;
-				if( kn == 1 && a[rl] != 0 )
-					vectMultiplyAdd(a[rl], b, c, rl*n, 0, n);
-				
-				//compute blocks of 2 rows (2 instead of 4 for small n<64) 
-				for( int k=rl+kn, bix=(rl+kn)*n; k<ru; k+=2, bix+=2*n ){
-					if( a[k] != 0 && a[k+1] != 0  )
-						vectMultiplyAdd2(a[k], a[k+1], b, c, bix, bix+n, 0, n);
-					else if( a[k] != 0 )
-						vectMultiplyAdd(a[k], b, c, bix, 0, n);
-					else if( a[k+1] != 0 )	
-						vectMultiplyAdd(a[k+1], b, c, bix+n, 0, n);
-				}
+			else if( pm2 && m==1 ) {        //VECTOR-MATRIX
+				matrixMultDenseDenseVM(a, b, c, n, cd, rl, ru);
 			}
-			else if( pm2 && m<=16 )       //MATRIX-MATRIX (short lhs) 
-			{
-				//cache-conscious parallelization over rows in rhs matrix
-				final int kn = (ru-rl)%4;				
-				
-				//rest not aligned to blocks of 2 rows
-				for( int i=0, aix=0, cix=0; i<m; i++, aix+=cd, cix+=n )
-					for( int k=rl, bix=rl*n; k<rl+kn; k++, bix+=n )
-						if( a[aix+k] != 0 )
-							vectMultiplyAdd(a[aix+k], b, c, bix, cix, n);
-				
-				final int blocksizeK = 48;
-				final int blocksizeJ = 1024;
-				
-				//blocked execution
-				for( int bk = rl+kn; bk < ru; bk+=blocksizeK ) 
-					for( int bj = 0, bkmin = Math.min(ru, bk+blocksizeK); bj < n; bj+=blocksizeJ ) 
-					{
-						//compute blocks of 4 rows in rhs w/ IKJ
-						int bjlen = Math.min(n, bj+blocksizeJ)-bj;
-						for( int i=0, aix=0, cix=bj; i<m; i++, aix+=cd, cix+=n )
-							for( int k=bk, bix=bk*n+bj; k<bkmin; k+=4, bix+=4*n ) {
-								vectMultiplyAdd4(a[aix+k], a[aix+k+1], a[aix+k+2], a[aix+k+3],
-										b, c, bix, bix+n, bix+2*n, bix+3*n, cix, bjlen);
-							}
-					}
+			else if( pm2 && m<=16 ) {       //MATRIX-MATRIX (short lhs) 
+				matrixMultDenseDenseMMShortLHS(a, b, c, m, n, cd, rl, ru);
 			}
-			else if( tm2 )                //MATRIX-MATRIX (skinny rhs)
-			{
-				//note: prepared rhs input via transpose for: m > n && cd > 64 && n < 64
-				//however, explicit flag required since dimension change m2
-				final int n2 = m2.rlen;
-				for( int i=rl, aix=rl*cd, cix=rl*n2; i < ru; i++, aix+=cd, cix+=n2 ) 
-					for( int j=0, bix=0; j<n2; j++, bix+=cd )
-						c[cix+j] = dotProduct(a, b, aix, bix, cd);
-			}
-			else                          //MATRIX-MATRIX
-			{	
-				//1) Unrolled inner loop (for better instruction-level parallelism)
-				//2) Blocked execution (for less cache trashing in parallel exec) 	
-				//3) Asymmetric block sizes (for less misses in inner loop, yet blocks in L1/L2)
-				
-				final int blocksizeI = 32; //64//256KB c block (typical L2 size per core), 32KB a block 
-				final int blocksizeK = 24; //64//256KB b block (typical L2 size per core), used while read 512B of a / read/write 4KB of c 
-				final int blocksizeJ = 1024; //512//4KB (typical main-memory page size), for scan 
-
-				//temporary arrays (nnz a, b index)
-				double[] ta = new double[ blocksizeK ];
-				int[]  tbi  = new int[ blocksizeK ];
-				
-				//blocked execution
-				for( int bi = rl; bi < ru; bi+=blocksizeI )
-					for( int bk = 0, bimin = Math.min(ru, bi+blocksizeI); bk < cd; bk+=blocksizeK ) 
-						for( int bj = cl, bkmin = Math.min(cd, bk+blocksizeK); bj < cu; bj+=blocksizeJ ) 
-						{
-							int bklen = bkmin-bk;
-							int bjlen = Math.min(cu, bj+blocksizeJ)-bj;
-							
-							//core sub block matrix multiplication
-				    		for( int i = bi; i < bimin; i++) 
-				    		{
-				    			int aixi = i * cd + bk; //start index on a
-				    			int cixj = i * n + bj; //scan index on c
-				    			
-				    			//determine nnz of a (for sparsity-aware skipping of rows)
-				    			int knnz = copyNonZeroElements(a, aixi, bk, bj, n, ta, tbi, bklen);
-				    			//if( knnz > 0 ) //for skipping empty rows
-				    			
-			    				//rest not aligned to blocks of 4 rows
-				    			final int bn = knnz % 4;
-				    			switch( bn ){
-					    			case 1: vectMultiplyAdd(ta[0], b, c, tbi[0], cixj, bjlen); break;
-					    	    	case 2: vectMultiplyAdd2(ta[0],ta[1], b, c, tbi[0], tbi[1], cixj, bjlen); break;
-					    			case 3: vectMultiplyAdd3(ta[0],ta[1],ta[2], b, c, tbi[0], tbi[1],tbi[2], cixj, bjlen); break;
-				    			}
-				    			
-				    			//compute blocks of 4 rows (core inner loop)
-				    			for( int k = bn; k<knnz; k+=4 ){
-				    				vectMultiplyAdd4( ta[k], ta[k+1], ta[k+2], ta[k+3], b, c, 
-				    						          tbi[k], tbi[k+1], tbi[k+2], tbi[k+3], cixj, bjlen );
-				    			}
-				    		}
-						}
+			else if( tm2 ) {                //MATRIX-MATRIX (skinny rhs)
+				matrixMultDenseDenseMMSkinnyRHS(a, b, c, m2.rlen, cd, rl, ru);
+			}
+			else {                          //MATRIX-MATRIX
+				matrixMultDenseDenseMM(a, b, c, n, cd, rl, ru, cl, cu);
 			}
 		}
-		else
-		{
-			double val;
+		else {
 			for( int i = rl, aix=rl*cd, cix=rl*n; i < ru; i++, cix+=n) 
-				for( int k = 0, bix=0; k < cd; k++, aix++, bix+=n)
-				{			
-					val = a[ aix ];
+				for( int k = 0, bix=0; k < cd; k++, aix++, bix+=n) {
+					double val = a[ aix ];
 					if( val != 0 )
 						for( int j = 0; j < n; j++) 
 							c[ cix+j ] += val * b[ bix+j ];
-				}	
+				}
 		}
+	}
+	
+	private static void matrixMultDenseDenseMVShortRHS(double[] a, double[] b, double[] c, int cd, int rl, int ru) 
+		throws DMLRuntimeException
+	{
+		for( int i=rl, aix=rl*cd; i < ru; i++, aix+=cd) 
+			c[i] = dotProduct(a, b, aix, 0, cd);
+	}
+	
+	private static void matrixMultDenseDenseMVTallRHS(double[] a, double[] b, double[] c, int cd, int rl, int ru) 
+		throws DMLRuntimeException
+	{
+		final int blocksizeI = 32;
+		final int blocksizeK = 2*1024; //16KB vector blocks (L1)
+		for( int bi=rl; bi<ru; bi+=blocksizeI ) {
+			int bimin = Math.min(bi+blocksizeI, ru);
+			for( int bk=0; bk<cd; bk+=blocksizeK ) {
+				int bkmin = Math.min(bk+blocksizeK, cd);
+				for( int i=bi, aix=bi*cd+bk; i<bimin; i++, aix+=cd) 
+					c[i] += dotProduct(a, b, aix, bk, bkmin-bk);
+			}
+		}
+	}
+	
+	private static void matrixMultDenseDenseVM(double[] a, double[] b, double[] c, int n, int cd, int rl, int ru) 
+		throws DMLRuntimeException
+	{
+		//parallelization over rows in rhs matrix
+		//rest not aligned to blocks of 2 rows
+		final int kn = (ru-rl)%2;
+		if( kn == 1 && a[rl] != 0 )
+			vectMultiplyAdd(a[rl], b, c, rl*n, 0, n);
 		
+		//compute blocks of 2 rows (2 instead of 4 for small n<64) 
+		for( int k=rl+kn, bix=(rl+kn)*n; k<ru; k+=2, bix+=2*n ){
+			if( a[k] != 0 && a[k+1] != 0  )
+				vectMultiplyAdd2(a[k], a[k+1], b, c, bix, bix+n, 0, n);
+			else if( a[k] != 0 )
+				vectMultiplyAdd(a[k], b, c, bix, 0, n);
+			else if( a[k+1] != 0 )	
+				vectMultiplyAdd(a[k+1], b, c, bix+n, 0, n);
+		}
+	}
+	
+	private static void matrixMultDenseDenseMMShortLHS(double[] a, double[] b, double[] c, int m, int n, int cd, int rl, int ru)
+		throws DMLRuntimeException
+	{
+		//cache-conscious parallelization over rows in rhs matrix
+		final int kn = (ru-rl)%4;
+		
+		//rest not aligned to blocks of 2 rows
+		for( int i=0, aix=0, cix=0; i<m; i++, aix+=cd, cix+=n )
+			for( int k=rl, bix=rl*n; k<rl+kn; k++, bix+=n )
+				if( a[aix+k] != 0 )
+					vectMultiplyAdd(a[aix+k], b, c, bix, cix, n);
+		
+		final int blocksizeK = 48;
+		final int blocksizeJ = 1024;
+		
+		//blocked execution
+		for( int bk = rl+kn; bk < ru; bk+=blocksizeK ) 
+			for( int bj = 0, bkmin = Math.min(ru, bk+blocksizeK); bj < n; bj+=blocksizeJ ) {
+				//compute blocks of 4 rows in rhs w/ IKJ
+				int bjlen = Math.min(n, bj+blocksizeJ)-bj;
+				for( int i=0, aix=0, cix=bj; i<m; i++, aix+=cd, cix+=n )
+					for( int k=bk, bix=bk*n+bj; k<bkmin; k+=4, bix+=4*n ) {
+						vectMultiplyAdd4(a[aix+k], a[aix+k+1], a[aix+k+2], a[aix+k+3],
+							b, c, bix, bix+n, bix+2*n, bix+3*n, cix, bjlen);
+					}
+			}
+	}
+	
+	private static void matrixMultDenseDenseMMSkinnyRHS(double[] a, double[] b, double[] c, int n2, int cd, int rl, int ru) 
+		throws DMLRuntimeException
+	{
+		//note: prepared rhs input via transpose for: m > n && cd > 64 && n < 64
+		//however, explicit flag required since dimension change m2
+		for( int i=rl, aix=rl*cd, cix=rl*n2; i < ru; i++, aix+=cd, cix+=n2 ) 
+			for( int j=0, bix=0; j<n2; j++, bix+=cd )
+				c[cix+j] = dotProduct(a, b, aix, bix, cd);
+	}
+	
+	private static void matrixMultDenseDenseMM(double[] a, double[] b, double[] c, int n, int cd, int rl, int ru, int cl, int cu) 
+		throws DMLRuntimeException
+	{
+		//1) Unrolled inner loop (for better instruction-level parallelism)
+		//2) Blocked execution (for less cache trashing in parallel exec) 	
+		//3) Asymmetric block sizes (for less misses in inner loop, yet blocks in L1/L2)
+		
+		final int blocksizeI = 32; //64//256KB c block (typical L2 size per core), 32KB a block 
+		final int blocksizeK = 24; //64//256KB b block (typical L2 size per core), used while read 512B of a / read/write 4KB of c 
+		final int blocksizeJ = 1024; //512//4KB (typical main-memory page size), for scan 
+
+		//temporary arrays (nnz a, b index)
+		double[] ta = new double[ blocksizeK ];
+		int[]  tbi  = new int[ blocksizeK ];
+		
+		//blocked execution
+		for( int bi = rl; bi < ru; bi+=blocksizeI )
+			for( int bk = 0, bimin = Math.min(ru, bi+blocksizeI); bk < cd; bk+=blocksizeK ) 
+				for( int bj = cl, bkmin = Math.min(cd, bk+blocksizeK); bj < cu; bj+=blocksizeJ ) 
+				{
+					int bklen = bkmin-bk;
+					int bjlen = Math.min(cu, bj+blocksizeJ)-bj;
+					
+					//core sub block matrix multiplication
+					for( int i = bi; i < bimin; i++) 
+					{
+						int aixi = i * cd + bk; //start index on a
+						int cixj = i * n + bj; //scan index on c
+						
+						//determine nnz of a (for sparsity-aware skipping of rows)
+						int knnz = copyNonZeroElements(a, aixi, bk, bj, n, ta, tbi, bklen);
+						//if( knnz > 0 ) //for skipping empty rows
+						
+						//rest not aligned to blocks of 4 rows
+						final int bn = knnz % 4;
+						switch( bn ){
+							case 1: vectMultiplyAdd(ta[0], b, c, tbi[0], cixj, bjlen); break;
+							case 2: vectMultiplyAdd2(ta[0],ta[1], b, c, tbi[0], tbi[1], cixj, bjlen); break;
+							case 3: vectMultiplyAdd3(ta[0],ta[1],ta[2], b, c, tbi[0], tbi[1],tbi[2], cixj, bjlen); break;
+						}
+						
+						//compute blocks of 4 rows (core inner loop)
+						for( int k = bn; k<knnz; k+=4 ){
+							vectMultiplyAdd4( ta[k], ta[k+1], ta[k+2], ta[k+3], b, c, 
+									tbi[k], tbi[k+1], tbi[k+2], tbi[k+3], cixj, bjlen );
+						}
+					}
+				}
 	}
 
 	private static void matrixMultDenseSparse(MatrixBlock m1, MatrixBlock m2, MatrixBlock ret, boolean pm2, int rl, int ru) 
@@ -1163,7 +1183,8 @@ public class LibMatrixMult
 
 	private static void matrixMultSparseDense(MatrixBlock m1, MatrixBlock m2, MatrixBlock ret, boolean pm2, int rl, int ru) 
 		throws DMLRuntimeException
-	{	
+	{
+		SparseBlock a = m1.sparseBlock;
 		double[] b = m2.denseBlock;
 		double[] c = ret.denseBlock;
 		final int m = m1.rlen;
@@ -1171,179 +1192,195 @@ public class LibMatrixMult
 		final int cd = m2.rlen;
 		final long xsp = (long)m*cd/m1.nonZeros;
 
-		if( LOW_LEVEL_OPTIMIZATION )
-		{
-			SparseBlock a = m1.sparseBlock;
-			
-			if( m==1 && n==1 )            //DOT PRODUCT
-			{
-				if( !a.isEmpty(0) ) {
+		if( LOW_LEVEL_OPTIMIZATION ) {
+			if( m==1 && n==1 ) {            //DOT PRODUCT
+				if( !a.isEmpty(0) )
 					c[0] = dotProduct(a.values(0), b, a.indexes(0), a.pos(0), 0, a.size(0));
-				}
 			}
-			else if( n==1 && cd<=2*1024 ) //MATRIX-VECTOR (short rhs)
-			{
-				for( int i=rl; i<ru; i++ )
-					if( !a.isEmpty(i) )
-						c[i] = dotProduct(a.values(i), b, a.indexes(i), a.pos(i), 0, a.size(i));
+			else if( n==1 && cd<=2*1024 ) { //MATRIX-VECTOR (short rhs)
+				matrixMultSparseDenseMVShortRHS(a, b, c, rl, ru);
 			}
-			else if( n==1 )               //MATRIX-VECTOR (tall rhs)
-			{
-				final int blocksizeI = 32;
-				final int blocksizeK = (int)Math.max(2*1024,2*1024*xsp/32); //~ 16KB L1  
-				int[] curk = new int[blocksizeI];
-				
-				for( int bi = rl; bi < ru; bi+=blocksizeI ) {
-					Arrays.fill(curk, 0); //reset positions
-					for( int bk=0, bimin = Math.min(ru, bi+blocksizeI); bk<cd; bk+=blocksizeK ) {
-						for( int i=bi, bkmin = Math.min(bk+blocksizeK, cd); i<bimin; i++) {
-							if( a.isEmpty(i) ) continue;
-							int apos = a.pos(i);
-							int alen = a.size(i);
-							int[] aix = a.indexes(i);
-							double[] avals = a.values(i);
-							int k = curk[i-bi] + apos;
-							for( ; k<apos+alen && aix[k]<bkmin; k++ )
-								c[i] += avals[k] * b[aix[k]];
-							curk[i-bi] = k - apos;
-						}
-					}
-				}
+			else if( n==1 ) {               //MATRIX-VECTOR (tall rhs)
+				matrixMultSparseDenseMVTallRHS(a, b, c, cd, xsp, rl, ru);
 			}
-			else if( pm2 && m==1 )        //VECTOR-MATRIX
-			{
-				//parallelization over rows in rhs matrix
-				if( !a.isEmpty(0) ) 
-				{
-					int alen = a.size(0);
-					int[] aix = a.indexes(0);
-					double[] avals = a.values(0);
-					int rlix = (rl==0) ? 0 : a.posFIndexGTE(0,rl);
-					rlix = (rlix>=0) ? rlix : alen;
-					
-					for( int k=rlix; k<alen && aix[k]<ru; k++ ) {
-						if( k+1<alen && aix[k+1]<ru )
-							vectMultiplyAdd2(avals[k], avals[k+1], b, c, aix[k]*n, aix[++k]*n, 0, n);
-						else
-							vectMultiplyAdd(avals[k], b, c, aix[k]*n, 0, n);
-					}
-				}
+			else if( pm2 && m==1 ) {        //VECTOR-MATRIX
+				matrixMultSparseDenseVM(a, b, c, n, rl, ru);
 			}
-			else if( pm2 && m<=16 )       //MATRIX-MATRIX (short lhs) 
-			{
-				int arlen = a.numRows();
-				for( int i=0, cix=0; i<arlen; i++, cix+=n )
-					if( !a.isEmpty(i) ) 
-					{
-						int apos = a.pos(i);
-						int alen = a.size(i);
-						int[] aix = a.indexes(i);
-						double[] avals = a.values(i);
-						
-						int k1 = (rl==0) ? 0 : a.posFIndexGTE(i, rl);
-						k1 = (k1>=0) ? apos+k1 : apos+alen;
-						int k2 = (ru==cd) ? alen : a.posFIndexGTE(i, ru);
-						k2 = (k2>=0) ? apos+k2 : apos+alen;
-						
-						//rest not aligned to blocks of 4 rows
-		    			final int bn = (k2-k1) % 4;
-		    			switch( bn ){
-			    			case 1: vectMultiplyAdd(avals[k1], b, c, aix[k1]*n, cix, n); break;
-			    	    	case 2: vectMultiplyAdd2(avals[k1],avals[k1+1], b, c, aix[k1]*n, aix[k1+1]*n, cix, n); break;
-			    			case 3: vectMultiplyAdd3(avals[k1],avals[k1+1],avals[k1+2], b, c, aix[k1]*n, aix[k1+1]*n, aix[k1+2]*n, cix, n); break;
-		    			}
-		    			
-		    			//compute blocks of 4 rows (core inner loop)
-		    			for( int k = k1+bn; k<k2; k+=4 ) {
-		    				vectMultiplyAdd4( avals[k], avals[k+1], avals[k+2], avals[k+3], b, c, 
-		    						          aix[k]*n, aix[k+1]*n, aix[k+2]*n, aix[k+3]*n, cix, n );
-		    			}
-					}
+			else if( pm2 && m<=16 ) {       //MATRIX-MATRIX (short lhs) 
+				matrixMultSparseDenseMMShortLHS(a, b, c, n, cd, rl, ru);
 			}
-			else if( n<=64 )              //MATRIX-MATRIX (skinny rhs)
-			{
-				//no blocking since b and c fit into cache anyway
-				for( int i=rl, cix=rl*n; i<ru; i++, cix+=n ) {
-					if( a.isEmpty(i) ) 
-						continue;
-					int apos = a.pos(i);
-					int alen = a.size(i);
-					int[] aix = a.indexes(i);
-					double[] avals = a.values(i);
-					//rest not aligned to blocks of 4 rows
-					int bn = alen%4;
-					for( int k=apos; k<apos+bn; k++ )
-	    				vectMultiplyAdd(avals[k], b, c, aix[k]*n, cix, n); 
-	    			//compute blocks of 4 rows (core inner loop)
-	    			for( int k=apos+bn; k<apos+alen; k+=4 )
-	    				vectMultiplyAdd4( avals[k], avals[k+1], avals[k+2], avals[k+3], b, c, 
-	    					aix[k]*n, aix[k+1]*n, aix[k+2]*n, aix[k+3]*n, cix, n );
-				}	
+			else if( n<=64 ) {              //MATRIX-MATRIX (skinny rhs)
+				matrixMultSparseDenseMMSkinnyRHS(a, b, c, n, rl, ru);
 			}
-			else                          //MATRIX-MATRIX
-			{
-				//blocksizes to fit blocks of B (dense) and several rows of A/C in common L2 cache size, 
-				//while blocking A/C for L1/L2 yet allowing long scans (2 pages) in the inner loop over j
-				//in case of almost ultra-sparse matrices, we cannot ensure the blocking for the rhs and
-				//output - however, in this case it's unlikely that we consume every cache line in the rhs
-				
-				final int blocksizeI = (int) (8L*m*cd/m1.nonZeros);
-				final int blocksizeK = (int) (8L*m*cd/m1.nonZeros);
-				final int blocksizeJ = 1024; 
-				
-				//temporary array of current sparse positions
-				int[] curk = new int[blocksizeI];
-				
-				//blocked execution over IKJ 
-				for( int bi = rl; bi < ru; bi+=blocksizeI ) {
-					Arrays.fill(curk, 0); //reset positions
-					for( int bk = 0, bimin = Math.min(ru, bi+blocksizeI); bk < cd; bk+=blocksizeK ) {
-						for( int bj = 0, bkmin = Math.min(cd, bk+blocksizeK); bj < n; bj+=blocksizeJ ) {
-							int bjlen = Math.min(n, bj+blocksizeJ)-bj;
-							
-							//core sub block matrix multiplication
-							for( int i=bi, cix=bi*n+bj; i<bimin; i++, cix+=n ) {
-								if( !a.isEmpty(i) ) {
-									int apos = a.pos(i);
-									int alen = a.size(i);
-									int[] aix = a.indexes(i);
-									double[] avals = a.values(i);
-									
-									int k = curk[i-bi] + apos;
-					    			//rest not aligned to blocks of 4 rows
-									int bn = alen%4;
-									for( ; k<apos+bn && aix[k]<bkmin; k++ )
-					    				vectMultiplyAdd(avals[k], b, c, aix[k]*n+bj, cix, bjlen); 
-					    			//compute blocks of 4 rows (core inner loop), allowed to exceed bkmin
-					    			for( ; k<apos+alen && aix[k]<bkmin; k+=4 )
-					    				vectMultiplyAdd4( avals[k], avals[k+1], avals[k+2], avals[k+3], b, c, 
-					    					aix[k]*n+bj, aix[k+1]*n+bj, aix[k+2]*n+bj, aix[k+3]*n+bj, cix, bjlen );
-					    			//update positions on last bj block
-					    			if( bj+bjlen==n )
-					    				curk[i-bi] = k - apos;
-								}
-							}
-						}
-					}
+			else {                          //MATRIX-MATRIX
+				matrixMultSparseDenseMM(a, b, c, n, cd, xsp, rl, ru);
+			}
+		}
+		else {
+			for( int i=rl, cix=rl*n; i<ru; i++, cix+=n ) {
+				if( a.isEmpty(i) ) continue; 
+				int apos = a.pos(i);
+				int alen = a.size(i);
+				int[] aix = a.indexes(i);
+				double[] avals = a.values(i);
+				for(int k = apos; k < apos+alen; k++) {
+					double val = avals[k];
+					for(int j = 0, bix=aix[k]*n; j < n; j++)
+						c[cix+j] += val * b[bix+j];
 				}
 			}
 		}
-		else
-		{
-			SparseBlock a = m1.sparseBlock;
-			for( int i=rl, cix=rl*n; i<ru; i++, cix+=n )
-			{
-				if( !a.isEmpty(i) ) 
-				{
+	}
+	
+	private static void matrixMultSparseDenseMVShortRHS(SparseBlock a, double[] b, double[] c, int rl, int ru) 
+		throws DMLRuntimeException
+	{
+		for( int i=rl; i<ru; i++ )
+			if( !a.isEmpty(i) )
+				c[i] = dotProduct(a.values(i), b, a.indexes(i), a.pos(i), 0, a.size(i));
+	}
+	
+	private static void matrixMultSparseDenseMVTallRHS(SparseBlock a, double[] b, double[] c, int cd, long xsp, int rl, int ru) 
+		throws DMLRuntimeException
+	{	
+		final int blocksizeI = 32;
+		final int blocksizeK = (int)Math.max(2*1024,2*1024*xsp/32); //~ 16KB L1
+		int[] curk = new int[blocksizeI];
+		
+		for( int bi = rl; bi < ru; bi+=blocksizeI ) {
+			Arrays.fill(curk, 0); //reset positions
+			for( int bk=0, bimin = Math.min(ru, bi+blocksizeI); bk<cd; bk+=blocksizeK ) {
+				for( int i=bi, bkmin = Math.min(bk+blocksizeK, cd); i<bimin; i++) {
+					if( a.isEmpty(i) ) continue;
 					int apos = a.pos(i);
 					int alen = a.size(i);
 					int[] aix = a.indexes(i);
 					double[] avals = a.values(i);
+					int k = curk[i-bi] + apos;
+					for( ; k<apos+alen && aix[k]<bkmin; k++ )
+						c[i] += avals[k] * b[aix[k]];
+					curk[i-bi] = k - apos;
+				}
+			}
+		}
+	}
+	
+	private static void matrixMultSparseDenseVM(SparseBlock a, double[] b, double[] c, int n, int rl, int ru) 
+		throws DMLRuntimeException
+	{
+		if( a.isEmpty(0) )
+			return;
+		
+		//parallelization over rows in rhs matrix
+		int alen = a.size(0);
+		int[] aix = a.indexes(0);
+		double[] avals = a.values(0);
+		int rlix = (rl==0) ? 0 : a.posFIndexGTE(0,rl);
+		rlix = (rlix>=0) ? rlix : alen;
+		
+		for( int k=rlix; k<alen && aix[k]<ru; k++ ) {
+			if( k+1<alen && aix[k+1]<ru )
+				vectMultiplyAdd2(avals[k], avals[k+1], b, c, aix[k]*n, aix[++k]*n, 0, n);
+			else
+				vectMultiplyAdd(avals[k], b, c, aix[k]*n, 0, n);
+		}
+	}
+	
+	private static void matrixMultSparseDenseMMShortLHS(SparseBlock a, double[] b, double[] c, int n, int cd, int rl, int ru) 
+		throws DMLRuntimeException
+	{	
+		int arlen = a.numRows();
+		for( int i=0, cix=0; i<arlen; i++, cix+=n ) {
+			if( a.isEmpty(i) ) continue;
+			int apos = a.pos(i);
+			int alen = a.size(i);
+			int[] aix = a.indexes(i);
+			double[] avals = a.values(i);
+			
+			int k1 = (rl==0) ? 0 : a.posFIndexGTE(i, rl);
+			k1 = (k1>=0) ? apos+k1 : apos+alen;
+			int k2 = (ru==cd) ? alen : a.posFIndexGTE(i, ru);
+			k2 = (k2>=0) ? apos+k2 : apos+alen;
+			
+			//rest not aligned to blocks of 4 rows
+			final int bn = (k2-k1) % 4;
+			switch( bn ){
+				case 1: vectMultiplyAdd(avals[k1], b, c, aix[k1]*n, cix, n); break;
+				case 2: vectMultiplyAdd2(avals[k1],avals[k1+1], b, c, aix[k1]*n, aix[k1+1]*n, cix, n); break;
+				case 3: vectMultiplyAdd3(avals[k1],avals[k1+1],avals[k1+2], b, c, aix[k1]*n, aix[k1+1]*n, aix[k1+2]*n, cix, n); break;
+			}
+			
+			//compute blocks of 4 rows (core inner loop)
+			for( int k = k1+bn; k<k2; k+=4 ) {
+				vectMultiplyAdd4( avals[k], avals[k+1], avals[k+2], avals[k+3], b, c, 
+					aix[k]*n, aix[k+1]*n, aix[k+2]*n, aix[k+3]*n, cix, n );
+			}
+		}
+	}
+	
+	private static void matrixMultSparseDenseMMSkinnyRHS(SparseBlock a, double[] b, double[] c, int n, int rl, int ru) 
+		throws DMLRuntimeException
+	{	
+		//no blocking since b and c fit into cache anyway
+		for( int i=rl, cix=rl*n; i<ru; i++, cix+=n ) {
+			if( a.isEmpty(i) ) continue;
+			int apos = a.pos(i);
+			int alen = a.size(i);
+			int[] aix = a.indexes(i);
+			double[] avals = a.values(i);
+			//rest not aligned to blocks of 4 rows
+			int bn = alen%4;
+			for( int k=apos; k<apos+bn; k++ )
+				vectMultiplyAdd(avals[k], b, c, aix[k]*n, cix, n);
+			//compute blocks of 4 rows (core inner loop)
+			for( int k=apos+bn; k<apos+alen; k+=4 )
+				vectMultiplyAdd4( avals[k], avals[k+1], avals[k+2], avals[k+3], b, c,
+					aix[k]*n, aix[k+1]*n, aix[k+2]*n, aix[k+3]*n, cix, n );
+		}
+	}
+	
+	private static void matrixMultSparseDenseMM(SparseBlock a, double[] b, double[] c, int n, int cd, long xsp, int rl, int ru) 
+		throws DMLRuntimeException
+	{	
+		//blocksizes to fit blocks of B (dense) and several rows of A/C in common L2 cache size, 
+		//while blocking A/C for L1/L2 yet allowing long scans (2 pages) in the inner loop over j
+		//in case of almost ultra-sparse matrices, we cannot ensure the blocking for the rhs and
+		//output - however, in this case it's unlikely that we consume every cache line in the rhs
+		final int blocksizeI = (int) (8L*xsp);
+		final int blocksizeK = (int) (8L*xsp);
+		final int blocksizeJ = 1024; 
+		
+		//temporary array of current sparse positions
+		int[] curk = new int[blocksizeI];
+		
+		//blocked execution over IKJ 
+		for( int bi = rl; bi < ru; bi+=blocksizeI ) {
+			Arrays.fill(curk, 0); //reset positions
+			for( int bk = 0, bimin = Math.min(ru, bi+blocksizeI); bk < cd; bk+=blocksizeK ) {
+				for( int bj = 0, bkmin = Math.min(cd, bk+blocksizeK); bj < n; bj+=blocksizeJ ) {
+					int bjlen = Math.min(n, bj+blocksizeJ)-bj;
 					
-					for(int k = apos; k < apos+alen; k++) {
-						double val = avals[k];
-						for(int j = 0, bix=aix[k]*n; j < n; j++)
-							c[cix+j] += val * b[bix+j];
+					//core sub block matrix multiplication
+					for( int i=bi, cix=bi*n+bj; i<bimin; i++, cix+=n ) {
+						if( !a.isEmpty(i) ) {
+							int apos = a.pos(i);
+							int alen = a.size(i);
+							int[] aix = a.indexes(i);
+							double[] avals = a.values(i);
+							
+							int k = curk[i-bi] + apos;
+							//rest not aligned to blocks of 4 rows
+							int bn = alen%4;
+							for( ; k<apos+bn && aix[k]<bkmin; k++ )
+								vectMultiplyAdd(avals[k], b, c, aix[k]*n+bj, cix, bjlen); 
+							//compute blocks of 4 rows (core inner loop), allowed to exceed bkmin
+							for( ; k<apos+alen && aix[k]<bkmin; k+=4 )
+								vectMultiplyAdd4( avals[k], avals[k+1], avals[k+2], avals[k+3], b, c, 
+									aix[k]*n+bj, aix[k+1]*n+bj, aix[k+2]*n+bj, aix[k+3]*n+bj, cix, bjlen );
+							//update positions on last bj block
+							if( bj+bjlen==n )
+								curk[i-bi] = k - apos;
+						}
 					}
 				}
 			}
@@ -1408,8 +1445,8 @@ public class LibMatrixMult
 								int[] aix = a.indexes(i);
 								double[] avals = a.values(i);	
 								
-								int k = curk[i-bi] + apos;									
-				    			for(; k < apos+alen && aix[k]<bkmin; k++) {
+								int k = curk[i-bi] + apos;
+								for(; k < apos+alen && aix[k]<bkmin; k++) {
 									if( !b.isEmpty(aix[k]) )
 										vectMultiplyAdd(avals[k], b.values(aix[k]), c, 
 											b.indexes(aix[k]), b.pos(aix[k]), cix, b.size(aix[k]));
@@ -1423,28 +1460,22 @@ public class LibMatrixMult
 		}
 		else
 		{
-			for( int i=rl, cix=rl*n; i<Math.min(ru, a.numRows()); i++, cix+=n )
-			{
-				if( !a.isEmpty(i) ) 
-				{
-					int apos = a.pos(i);
-					int alen = a.size(i);
-					int[] aix = a.indexes(i);
-					double[] avals = a.values(i);					
-					
-					for(int k = apos; k < apos+alen; k++) 
-					{
-						double val = avals[k];
-						if( !b.isEmpty(aix[k]) ) 
-						{
-							int bpos = b.pos(aix[k]);
-							int blen = b.size(aix[k]);
-							int[] bix = b.indexes(aix[k]);
-							double[] bvals = b.values(aix[k]);	
-							for(int j = bpos; j < bpos+blen; j++)
-								c[cix+bix[j]] += val * bvals[j];								
-						}
-					}						
+			for( int i=rl, cix=rl*n; i<Math.min(ru, a.numRows()); i++, cix+=n ) {
+				if( a.isEmpty(i) ) continue;
+				int apos = a.pos(i);
+				int alen = a.size(i);
+				int[] aix = a.indexes(i);
+				double[] avals = a.values(i);
+				
+				for(int k = apos; k < apos+alen; k++) {
+					if( b.isEmpty(aix[k]) ) continue;
+					double val = avals[k];
+					int bpos = b.pos(aix[k]);
+					int blen = b.size(aix[k]);
+					int[] bix = b.indexes(aix[k]);
+					double[] bvals = b.values(aix[k]);
+					for(int j = bpos; j < bpos+blen; j++)
+						c[cix+bix[j]] += val * bvals[j];
 				}
 			}
 		}


[34/50] [abbrv] systemml git commit: [SYSTEMML-1970] Performance conv2d backward filter (dense/sparse-sparse)

Posted by re...@apache.org.
[SYSTEMML-1970] Performance conv2d backward filter (dense/sparse-sparse)

This patch makes a number of smaller performance improvements to the
existing conv2d backward filter function. 

1) Dense/sparse-sparse: Whenever the rhs is sparse and the lhs has a
higher nnz/cells ratio, we now flip the computation to
t(t(rhs)%*%t(lhs)), where t(rhs) and t(lhs) are piggybacked into the
im2col and rotate calls, and the final transpose collapses with the
existing transpose-add into a simple add (see the note on the
underlying identity after this list).

2) Avoid unnecessary allocations in sparse-dense and sparse-sparse
matrix multiplications (limit temporary arrays to min(blocksize,ru-rl)).

3) Avoid class locking: So far, the transAdd used class-wide
synchronization, which caused unnecessary contention in scenarios where
multiple builtin functions run concurrently (e.g., parfor, JMLC). We
now simply synchronize on the monitor of the allocated output (a short
sketch follows below).
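
For reference, the flip in 1) relies on the standard transpose
identity, written here in the same notation as above:

  lhs %*% rhs == t( t(rhs) %*% t(lhs) )

so the sparse operand ends up on the left of the inner multiply
(a sparse-dense mm), and the outer t(...) can be folded into the
subsequent accumulation instead of materializing a transposed
intermediate.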

On an end-to-end CNN application w/ native BLAS enabled, this patch
improved performance from 349s to 335s.
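
A minimal sketch of the locking change in 3), with a hypothetical
helper (the inplaceAdd/inplaceTransAdd hunk in the diff below shows the
actual change): a static synchronized method locks the class object and
serializes all callers, whereas synchronizing on the output buffer
itself only serializes writers that share that particular output.

  // accumulate a per-task partial result into the shared output,
  // holding only the monitor of that output array
  static void accumulate(double[] part, double[] out) {
    synchronized( out ) { //per-output lock instead of class-wide lock
      for( int i=0; i<part.length; i++ )
        out[i] += part[i];
    }
  }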


Project: http://git-wip-us.apache.org/repos/asf/systemml/repo
Commit: http://git-wip-us.apache.org/repos/asf/systemml/commit/dd513ffe
Tree: http://git-wip-us.apache.org/repos/asf/systemml/tree/dd513ffe
Diff: http://git-wip-us.apache.org/repos/asf/systemml/diff/dd513ffe

Branch: refs/heads/master
Commit: dd513ffee87a4efaf1d5f771a8d0ee4ccccbae67
Parents: 591a0f7
Author: Matthias Boehm <mb...@gmail.com>
Authored: Thu Oct 26 20:26:14 2017 -0700
Committer: Matthias Boehm <mb...@gmail.com>
Committed: Thu Oct 26 23:38:19 2017 -0700

----------------------------------------------------------------------
 .../LibMatrixDNNConv2dBackwardFilterHelper.java | 90 +++++++++++++++++---
 .../runtime/matrix/data/LibMatrixDNNHelper.java |  2 +
 .../matrix/data/LibMatrixDNNIm2ColHelper.java   | 27 +++---
 .../runtime/matrix/data/LibMatrixMult.java      |  4 +-
 4 files changed, 95 insertions(+), 28 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/systemml/blob/dd513ffe/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNNConv2dBackwardFilterHelper.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNNConv2dBackwardFilterHelper.java b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNNConv2dBackwardFilterHelper.java
index f0fd002..9698725 100644
--- a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNNConv2dBackwardFilterHelper.java
+++ b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNNConv2dBackwardFilterHelper.java
@@ -124,21 +124,83 @@ public class LibMatrixDNNConv2dBackwardFilterHelper {
 		}
 	}
 	
-	private static synchronized void inplaceTransAdd(double[] a, ConvolutionParameters params) {
-		// Perform transposed addition: output of size [K, CRS] += input of size [CRS,K]
-		double [] c = params.output.denseBlock;
-		final int CRS = params.C*params.R*params.S, K = params.K;
-		final int blocksizeIJ = 128; //L2 cache
+	public static class Conv2dBackwardFilterTrans implements Callable<Long> {
+		private final int _rl, _ru; 
+		private final ConvolutionParameters _params; 
+		
+		public Conv2dBackwardFilterTrans(int rl, int ru, ConvolutionParameters params) {
+			_rl = rl; _ru = ru;
+			_params = params;
+		}
 		
-		//cache-conscious blocked execution
-		for( int bi=0; bi<CRS; bi+=blocksizeIJ )
-			for( int bj=0; bj<K; bj+=blocksizeIJ ) {
-				int bimin = Math.min(bi+blocksizeIJ, CRS);
-				int bjmin = Math.min(bj+blocksizeIJ, K);
-				//core transpose add operation
-				for(int i=bi, aix=bi*K; i<bimin; i++, aix+=K)
-					for(int j=bj, cix=i+bj*CRS; j<bjmin; j++, cix+=CRS)
-						c[cix] += a[aix+j];
+		@Override
+		public Long call() throws Exception {
+			int PQ = _params.P*_params.Q, K = _params.K, CRS = _params.C*_params.R*_params.S;
+			MatrixBlock dout = _params.input2;
+			MatrixBlock im2ColOutBlock = new MatrixBlock(PQ, CRS, false).allocateBlock();
+			MatrixBlock outRotate = new MatrixBlock(K, PQ, dout.sparse).allocateBlock();
+			MatrixBlock outMM = new MatrixBlock(K, CRS, false).allocateBlock();
+			
+			Im2colWorker im2ColWorker = Im2colWorker.getWorker( _params.input1, im2ColOutBlock, _params, true, true);
+			Rotate180Worker rotate180Worker = Rotate180Worker.getWorker( dout, outRotate, _params, true, true);
+			double [] partRet = new double[CRS*_params.K];
+			long time1 = 0; long time2 = 0;
+			for(int n = _rl; n < _ru; n++) {
+				// rotate180(dout[n,]) => dout_reshaped
+				rotate180Worker.execute(n, 0);
+				
+				// im2col(input) => _im2ColOutBlock
+				long t1 = DMLScript.STATISTICS && LibMatrixDNN.DISPLAY_STATISTICS ? System.nanoTime() : 0;
+				im2ColWorker.execute(n);
+				long t2 = DMLScript.STATISTICS && LibMatrixDNN.DISPLAY_STATISTICS ? System.nanoTime() : 0;
+				
+				outMM.reset(K, CRS, false);
+				//Timing time = new Timing(true);
+				LibMatrixDNNHelper.singleThreadedMatMult(outRotate, im2ColOutBlock, 
+					outMM, !outRotate.sparse, !im2ColOutBlock.sparse, _params);
+				long t3 = DMLScript.STATISTICS && LibMatrixDNN.DISPLAY_STATISTICS ? System.nanoTime() : 0;
+				
+				if( !outMM.isEmptyBlock() ) //accumulate row results
+					LibMatrixMult.vectAdd(outMM.getDenseBlock(), partRet, 0, 0, K*CRS);
+				
+				if(DMLScript.STATISTICS && LibMatrixDNN.DISPLAY_STATISTICS) {
+					time1 += t2 - t1;
+					time2 += t3 - t2;
+				}
+			}
+			//no need to transpose because t(t(out)) cancel out
+			inplaceAdd(partRet, _params);
+			if(DMLScript.STATISTICS && LibMatrixDNN.DISPLAY_STATISTICS) {
+				LibMatrixDNN.loopedConvBwdFilterIm2ColTime.addAndGet(time1);
+				LibMatrixDNN.loopedConvBwdFilterMatMultTime.addAndGet(time2);
 			}
+			return 0L;
+		}
+	}
+	
+	private static void inplaceAdd(double[] a, ConvolutionParameters params) {
+		synchronized (params.output.denseBlock) {
+			LibMatrixMult.vectAdd(a, params.output.denseBlock, 0, 0, a.length);
+		}
+	}
+	
+	private static void inplaceTransAdd(double[] a, ConvolutionParameters params) {
+		synchronized (params.output.denseBlock) {
+			// Perform transposed addition: output of size [K, CRS] += input of size [CRS,K]
+			double [] c = params.output.denseBlock;
+			final int CRS = params.C*params.R*params.S, K = params.K;
+			final int blocksizeIJ = 128; //L2 cache
+			
+			//cache-conscious blocked execution
+			for( int bi=0; bi<CRS; bi+=blocksizeIJ )
+				for( int bj=0; bj<K; bj+=blocksizeIJ ) {
+					int bimin = Math.min(bi+blocksizeIJ, CRS);
+					int bjmin = Math.min(bj+blocksizeIJ, K);
+					//core transpose add operation
+					for(int i=bi, aix=bi*K; i<bimin; i++, aix+=K)
+						for(int j=bj, cix=i+bj*CRS; j<bjmin; j++, cix+=CRS)
+							c[cix] += a[aix+j];
+				}
+		}
 	}
 }

http://git-wip-us.apache.org/repos/asf/systemml/blob/dd513ffe/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNNHelper.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNNHelper.java b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNNHelper.java
index 55f6e4c..dfd0778 100644
--- a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNNHelper.java
+++ b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNNHelper.java
@@ -190,6 +190,8 @@ public class LibMatrixDNNHelper {
 			//implementation simply rotates the sparse filters into dense rows
 			if( applyNative ) 
 				ret.add(new SparseNativeConv2dBackwardFilterDense(i*taskSize, Math.min((i+1)*taskSize, params.N), params));
+			else if( params.input2.sparse && params.input1.getSparsity() > params.input2.getSparsity() )
+				ret.add(new Conv2dBackwardFilterTrans(i*taskSize, Math.min((i+1)*taskSize, params.N), params));
 			else if(!isEmptyDenseInput)
 				ret.add(new Conv2dBackwardFilter(i*taskSize, Math.min((i+1)*taskSize, params.N), params));
 			else

http://git-wip-us.apache.org/repos/asf/systemml/blob/dd513ffe/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNNIm2ColHelper.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNNIm2ColHelper.java b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNNIm2ColHelper.java
index a4a6d3d..a4b1877 100644
--- a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNNIm2ColHelper.java
+++ b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNNIm2ColHelper.java
@@ -41,10 +41,10 @@ public class LibMatrixDNNIm2ColHelper {
 				if( LOG.isTraceEnabled() ) 
 					LOG.trace("Using DenseIm2colWorkerAllChannels operator to perform "
 						+ "im2col (stride1pad0="+stride1Pad0+", allChannels="+allChannels+").");
-				if(allChannels && stride1Pad0 )
+				if(allChannels && stride1Pad0 && !trans )
 					return new DenseIm2colWorkerStride1Pad0AllChannels(input.getDenseBlock(), out.getDenseBlock(), params);
 				else if( allChannels )
-					return new DenseIm2colWorkerAllChannels(input.getDenseBlock(), out.getDenseBlock(), params);
+					return new DenseIm2colWorkerAllChannels(input.getDenseBlock(), out.getDenseBlock(), params, trans);
 				else if( stride1Pad0 )
 					return new DenseIm2colWorkerStride1Pad0(input.getDenseBlock(), out.getDenseBlock(), params);
 				else
@@ -200,7 +200,8 @@ public class LibMatrixDNNIm2ColHelper {
 		private final double[] inputArray, outputArray; 
 		private final int CRS, S, R, P, Q, CHW, H, W; 
 		private final int stride_h, stride_w, pad_h, pad_w;
-		public DenseIm2colWorkerAllChannels(double [] inputArray, double [] outputArray, ConvolutionParameters params) {
+		private final boolean trans;
+		public DenseIm2colWorkerAllChannels(double [] inputArray, double [] outputArray, ConvolutionParameters params, boolean trans) {
 			this.inputArray = inputArray;
 			this.outputArray = outputArray;
 			this.CRS = params.C * params.R * params.S;
@@ -208,6 +209,7 @@ public class LibMatrixDNNIm2ColHelper {
 			this.CHW = params.C*params.H*params.W;
 			this.stride_h = params.stride_h; this.stride_w = params.stride_w;
 			this.pad_h = params.pad_h; this.pad_w = params.pad_w;
+			this.trans = trans;
 		}
 		
 		@Override
@@ -217,23 +219,24 @@ public class LibMatrixDNNIm2ColHelper {
 
 		@Override
 		public void execute(int n) {
+			//reset for selective copy
+			Arrays.fill(outputArray, 0);
+			
 			int nOffset = n * CHW;
 			for (int c = 0; c < CRS; ++c) {
 				int wOffset = c % S;
 				int hOffset = (c / S) % R;
 				int cInput = c / R / S;
 				for (int h = 0; h < P; ++h) {
-					int outOffset = (c * P + h) * Q;
+					int outOffset = trans ? c+(h*Q*CRS) : (c*P+h)*Q;
 					int hPadded = h * stride_h - pad_h + hOffset;
 					int inputOffset = nOffset + (cInput * H + hPadded) * W;
-					if (hPadded < 0 || hPadded >= H) {
-						Arrays.fill(outputArray, outOffset, outOffset+Q, 0);
-					} else {
-						for (int w = 0; w < Q; ++w) {
-							int wPadded = w * stride_w - pad_w + wOffset;
-							boolean assign = (wPadded >= 0 && wPadded < W);
-							outputArray[outOffset + w] = assign ? inputArray[inputOffset + wPadded] : 0;
-						}
+					if (hPadded < 0 || hPadded >= H ) continue;
+					for (int w = 0; w < Q; ++w) {
+						int wPadded = w * stride_w - pad_w + wOffset;
+						if( wPadded >= 0 && wPadded < W )
+							outputArray[outOffset + (trans?w*CRS:w)] 
+								= inputArray[inputOffset + wPadded];
 					}
 				}
 			}

http://git-wip-us.apache.org/repos/asf/systemml/blob/dd513ffe/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixMult.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixMult.java b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixMult.java
index fa4d667..684f327 100644
--- a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixMult.java
+++ b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixMult.java
@@ -1351,7 +1351,7 @@ public class LibMatrixMult
 		final int blocksizeJ = 1024; 
 		
 		//temporary array of current sparse positions
-		int[] curk = new int[blocksizeI];
+		int[] curk = new int[Math.min(blocksizeI, ru-rl)];
 		
 		//blocked execution over IKJ 
 		for( int bi = rl; bi < ru; bi+=blocksizeI ) {
@@ -1429,7 +1429,7 @@ public class LibMatrixMult
 						(int)Math.pow((double)m*cd/m1.nonZeros,2)));
 				
 				//temporary array of current sparse positions
-				int[] curk = new int[blocksizeI];
+				int[] curk = new int[Math.min(blocksizeI, ru-rl)];
 				
 				//blocked execution over IK 
 				for( int bi = rl; bi < ru; bi+=blocksizeI ) {


[16/50] [abbrv] systemml git commit: [SYSTEMML-1970] Performance conv2d-backward-filter (for sparse filter)

Posted by re...@apache.org.
[SYSTEMML-1970] Performance conv2d-backward-filter (for sparse filter)

This patch makes a number of performance improvements to sparse and
dense conv2d backward filter:

1) Conv2d backward w/ sparse filter: So far, the rotate180 per input
row converted any dense or sparse input into dense intermediates, which
were then fed into the matrix multiplication. We now rotate sparse
filters into sparse intermediates, which gives very good mm improvements
due to sparse-dense matrix multiplication. This patch also fixes the
sparsity used in related tests, which so far never created sparse inputs.

2) Minor dense conv2d improvements: Additional improvements include the
removal of unnecessary allocations of matrix multiplication outputs,
more efficient output accumulation, and cache-conscious transposed
addition operations (sketched below).

On an end-to-end cnn application, this patch improved the runtime from
610s to 498s per epoch.
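
For clarity, a minimal, self-contained sketch of the cache-conscious
transposed addition follows. It accumulates a partial result of size
[CRS x K] into an output of size [K x CRS] using blocked index ranges;
the class name, array names, and block size are illustrative and not
the exact SystemML code.

  // Sketch: c (K x CRS, row-major) += t(a) with a (CRS x K, row-major),
  // processed in tiles so both arrays are accessed cache-consciously.
  public class TransAddSketch {
    public static void inplaceTransAdd(double[] a, double[] c, int CRS, int K) {
      final int blocksize = 128; // tile size, roughly sized for the L2 cache
      for (int bi = 0; bi < CRS; bi += blocksize)
        for (int bj = 0; bj < K; bj += blocksize) {
          int bimin = Math.min(bi + blocksize, CRS);
          int bjmin = Math.min(bj + blocksize, K);
          // core transposed add: c[j, i] += a[i, j]
          for (int i = bi, aix = bi * K; i < bimin; i++, aix += K)
            for (int j = bj, cix = i + bj * CRS; j < bjmin; j++, cix += CRS)
              c[cix] += a[aix + j];
        }
    }
  }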


Project: http://git-wip-us.apache.org/repos/asf/systemml/repo
Commit: http://git-wip-us.apache.org/repos/asf/systemml/commit/b261661a
Tree: http://git-wip-us.apache.org/repos/asf/systemml/tree/b261661a
Diff: http://git-wip-us.apache.org/repos/asf/systemml/diff/b261661a

Branch: refs/heads/master
Commit: b261661a834bfaef1eacc7fa0a14e885811082a1
Parents: 311e4aa
Author: Matthias Boehm <mb...@gmail.com>
Authored: Sun Oct 22 02:04:49 2017 -0700
Committer: Matthias Boehm <mb...@gmail.com>
Committed: Sun Oct 22 02:04:49 2017 -0700

----------------------------------------------------------------------
 .../LibMatrixDNNConv2dBackwardDataHelper.java   |   2 +-
 .../LibMatrixDNNConv2dBackwardFilterHelper.java |  71 +++++----
 .../runtime/matrix/data/LibMatrixDNNHelper.java |   8 +-
 .../matrix/data/LibMatrixDNNIm2ColHelper.java   |  48 +++---
 .../data/LibMatrixDNNRotate180Helper.java       |  67 ++++-----
 .../functions/tensor/Conv2DBackwardTest.java    | 147 ++++++++-----------
 .../functions/tensor/Conv2DBackwardTest.R       |   4 +-
 .../functions/tensor/Conv2DBackwardTest.dml     |   4 +-
 8 files changed, 162 insertions(+), 189 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/systemml/blob/b261661a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNNConv2dBackwardDataHelper.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNNConv2dBackwardDataHelper.java b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNNConv2dBackwardDataHelper.java
index 609af11..04c13e6 100644
--- a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNNConv2dBackwardDataHelper.java
+++ b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNNConv2dBackwardDataHelper.java
@@ -81,7 +81,7 @@ public class LibMatrixDNNConv2dBackwardDataHelper {
 			MatrixBlock dout_reshaped = new MatrixBlock(PQ, K, false);
 			dout_reshaped.allocateDenseBlock();
 			LibMatrixDNNRotate180Helper.Rotate180Worker rotate180Worker = 
-					LibMatrixDNNRotate180Helper.Rotate180Worker.getWorker( dout, dout_reshaped.getDenseBlock(), _params, true);
+					LibMatrixDNNRotate180Helper.Rotate180Worker.getWorker( dout, dout_reshaped, _params, true, false);
 			long time1 = 0; long time2 = 0;
 			for(int n = _rl; n < _ru; n++)  {
 				// rotate180(dout[n,]) => dout_reshaped

http://git-wip-us.apache.org/repos/asf/systemml/blob/b261661a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNNConv2dBackwardFilterHelper.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNNConv2dBackwardFilterHelper.java b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNNConv2dBackwardFilterHelper.java
index b89be82..de45b81 100644
--- a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNNConv2dBackwardFilterHelper.java
+++ b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNNConv2dBackwardFilterHelper.java
@@ -22,7 +22,7 @@ import java.util.concurrent.Callable;
 
 import org.apache.sysml.api.DMLScript;
 import org.apache.sysml.runtime.matrix.data.LibMatrixDNNIm2ColHelper.Im2colWorker;
-import org.apache.sysml.runtime.util.ConvolutionUtils;
+import org.apache.sysml.runtime.matrix.data.LibMatrixDNNRotate180Helper.Rotate180Worker;
 import org.apache.sysml.utils.NativeHelper;
 
 public class LibMatrixDNNConv2dBackwardFilterHelper {
@@ -43,12 +43,13 @@ public class LibMatrixDNNConv2dBackwardFilterHelper {
 		
 		@Override
 		public Long call() throws Exception {
-			int CRS = _params.C*_params.R*_params.S; 
-			double [] dout_n = new double[_params.P*_params.Q*_params.K];
+			int CRS = _params.C*_params.R*_params.S, PQ = _params.P*_params.Q, K = _params.K;
+			MatrixBlock dout_n = new MatrixBlock(PQ, K, false);
+			dout_n.allocateBlock();
 			LibMatrixDNNRotate180Helper.Rotate180Worker rotate180Worker = 
-					LibMatrixDNNRotate180Helper.Rotate180Worker.getWorker( _params.input2, dout_n, _params, true);
-			// partialRetBlock is size: [params.C*params.R*params.S, params.K]
-			double [] partialRetBlock = new double[CRS*_params.K];
+					LibMatrixDNNRotate180Helper.Rotate180Worker.getWorker( _params.input2, dout_n, _params, true, false);
+			double [] ldout_n = dout_n.getDenseBlock();
+			double [] partRet = new double[CRS*_params.K]; //CRS x K
 			for(int n = _rl; n < _ru; n++) {
 				if( !_params.input1.getSparseBlock().isEmpty(n) ) {
 					// rotate180(dout[n,]) => dout_n
@@ -59,11 +60,11 @@ public class LibMatrixDNNConv2dBackwardFilterHelper {
 					int[] aix = _params.input1.getSparseBlock().indexes(n);
 					double[] avals = _params.input1.getSparseBlock().values(n);
 					NativeHelper.conv2dBackwardFilterSparseDense(apos, alen, aix, avals, 
-							dout_n, partialRetBlock, 1, _params.C, _params.H, _params.W, _params.K, 
+							ldout_n, partRet, 1, _params.C, _params.H, _params.W, _params.K, 
 							_params.R, _params.S, _params.stride_h, _params.stride_w, _params.pad_h, _params.pad_w, _params.P, _params.Q, 1);
 				}
 			}
-			inplaceTransposedAddition(partialRetBlock, _params);
+			inplaceTransAdd(partRet, _params);
 			return 0L;
 		}
 	}
@@ -72,9 +73,9 @@ public class LibMatrixDNNConv2dBackwardFilterHelper {
 	 * General conv2d backward data operator
 	 */
 	public static class Conv2dBackwardFilter implements Callable<Long> {
-
-		public int _rl; public int _ru; 
+		private final int _rl, _ru; 
 		private final ConvolutionParameters _params; 
+		
 		public Conv2dBackwardFilter(int rl, int ru, ConvolutionParameters params) {
 			_rl = rl; _ru = ru;
 			_params = params;
@@ -82,15 +83,17 @@ public class LibMatrixDNNConv2dBackwardFilterHelper {
 		
 		@Override
 		public Long call() throws Exception {
-			int PQ = _params.P*_params.Q; int K = _params.K; int CRS = _params.C*_params.R*_params.S;
+			int PQ = _params.P*_params.Q, K = _params.K, CRS = _params.C*_params.R*_params.S;
 			MatrixBlock dout = _params.input2;
 			MatrixBlock im2ColOutBlock = new MatrixBlock(CRS, PQ, false);
-			MatrixBlock dout_reshaped = new MatrixBlock(PQ, K, false);
-			dout_reshaped.allocateDenseBlock();
+			MatrixBlock dout_reshaped = new MatrixBlock(PQ, K, dout.sparse);
+			MatrixBlock temp = new MatrixBlock(CRS, K, false);
+			dout_reshaped.allocateBlock();
+			temp.allocateBlock();
+			
 			Im2colWorker im2ColWorker = Im2colWorker.getWorker( _params.input1, im2ColOutBlock, _params, true, false);
-			LibMatrixDNNRotate180Helper.Rotate180Worker rotate180Worker = 
-					LibMatrixDNNRotate180Helper.Rotate180Worker.getWorker( dout, dout_reshaped.getDenseBlock(), _params, true);
-			double [] partialRetBlock = new double[CRS*_params.K];
+			Rotate180Worker rotate180Worker = Rotate180Worker.getWorker( dout, dout_reshaped, _params, true, false);
+			double [] partRet = new double[CRS*_params.K];
 			long time1 = 0; long time2 = 0;
 			for(int n = _rl; n < _ru; n++) {
 				// rotate180(dout[n,]) => dout_reshaped
@@ -101,22 +104,19 @@ public class LibMatrixDNNConv2dBackwardFilterHelper {
 				im2ColWorker.execute(n);
 				long t2 = DMLScript.STATISTICS && LibMatrixDNN.DISPLAY_STATISTICS ? System.nanoTime() : 0;
 				
-				MatrixBlock temp = new MatrixBlock(CRS, K, false);
+				temp.reset(CRS, K, false);
 				LibMatrixDNNHelper.singleThreadedMatMult(im2ColOutBlock, dout_reshaped, temp, true, true, _params);
 				long t3 = DMLScript.STATISTICS && LibMatrixDNN.DISPLAY_STATISTICS ? System.nanoTime() : 0;
 				
-				if(!temp.isEmptyBlock()) {
-					// partialRetBlock is size: [params.C*params.R*params.S, params.K]
-					ConvolutionUtils.binaryOperationInPlace(temp, partialRetBlock, 0, K, 0, CRS, 
-							LibMatrixDNN._binaryElementWiseAddition);
-				}
+				if( !temp.isEmptyBlock() ) //accumulate row results
+					LibMatrixMult.vectAdd(temp.getDenseBlock(), partRet, 0, 0, K*CRS);
 				
 				if(DMLScript.STATISTICS && LibMatrixDNN.DISPLAY_STATISTICS) {
 					time1 += t2 - t1;
 					time2 += t3 - t2;
 				}
 			}
-			inplaceTransposedAddition(partialRetBlock, _params);
+			inplaceTransAdd(partRet, _params);
 			if(DMLScript.STATISTICS && LibMatrixDNN.DISPLAY_STATISTICS) {
 				LibMatrixDNN.loopedConvBwdFilterIm2ColTime.addAndGet(time1);
 				LibMatrixDNN.loopedConvBwdFilterMatMultTime.addAndGet(time2);
@@ -124,15 +124,22 @@ public class LibMatrixDNNConv2dBackwardFilterHelper {
 			return 0L;
 		}
 	}
-	private static synchronized void inplaceTransposedAddition(double [] partialRetBlock, ConvolutionParameters params) {
-		// Perform transposed addition: output of size [K, CRS] += partialRetBlock of size [CRS,K]
-		int iter = 0; int CRS = params.C*params.R*params.S; int K = params.K;
-		double [] outputArr = params.output.denseBlock;
-		for(int i = 0; i < CRS; i++) {
-			for(int j = 0; j < K; j++, iter++) {
-				int index = j*CRS+i;
-				outputArr[index] += partialRetBlock[iter];
+	
+	private static synchronized void inplaceTransAdd(double[] a, ConvolutionParameters params) {
+		// Perform transposed addition: output of size [K, CRS] += input of size [CRS,K]
+		double [] c = params.output.denseBlock;
+		final int CRS = params.C*params.R*params.S, K = params.K;
+		final int blocksizeIJ = 128; //L2 cache
+		
+		//cache-conscious blocked execution
+		for( int bi=0; bi<CRS; bi+=blocksizeIJ )
+			for( int bj=0; bj<K; bj+=blocksizeIJ ) {
+				int bimin = Math.min(bi+blocksizeIJ, CRS);
+				int bjmin = Math.min(bj+blocksizeIJ, K);
+				//core transpose add operation
+				for(int i=bi, aix=bi*K; i<bimin; i++, aix+=K)
+					for(int j=bj, cix=i+bj*CRS; j<bjmin; j++, cix+=CRS)
+						c[cix] += a[aix+j];
 			}
-		}
 	}
 }

http://git-wip-us.apache.org/repos/asf/systemml/blob/b261661a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNNHelper.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNNHelper.java b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNNHelper.java
index b80a786..6117b90 100644
--- a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNNHelper.java
+++ b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNNHelper.java
@@ -24,6 +24,8 @@ import java.util.concurrent.Callable;
 
 import org.apache.sysml.hops.OptimizerUtils;
 import org.apache.sysml.runtime.DMLRuntimeException;
+import org.apache.sysml.runtime.matrix.data.LibMatrixDNNConv2dBackwardFilterHelper.Conv2dBackwardFilter;
+import org.apache.sysml.runtime.matrix.data.LibMatrixDNNConv2dBackwardFilterHelper.SparseNativeConv2dBackwardFilterDense;
 import org.apache.sysml.runtime.instructions.InstructionUtils;
 import org.apache.sysml.runtime.util.ConvolutionUtils;
 import org.apache.sysml.utils.NativeHelper;
@@ -169,13 +171,13 @@ public class LibMatrixDNNHelper {
 		int taskSize = (int)(Math.ceil((double)params.N / k));
 		
 		boolean isEmptyDenseInput = (!params.input1.isInSparseFormat() && params.input1.denseBlock == null) || 
-																(!params.input2.isInSparseFormat() && params.input2.denseBlock == null);
+			(!params.input2.isInSparseFormat() && params.input2.denseBlock == null);
 		
 		for(int i = 0; i*taskSize < params.N; i++) {
 			if(LibMatrixDNN.isEligibleForConv2dBackwardFilterSparseDense(params)) 
-				ret.add(new LibMatrixDNNConv2dBackwardFilterHelper.SparseNativeConv2dBackwardFilterDense(i*taskSize, Math.min((i+1)*taskSize, params.N), params));
+				ret.add(new SparseNativeConv2dBackwardFilterDense(i*taskSize, Math.min((i+1)*taskSize, params.N), params));
 			else if(!isEmptyDenseInput)
-				ret.add(new LibMatrixDNNConv2dBackwardFilterHelper.Conv2dBackwardFilter(i*taskSize, Math.min((i+1)*taskSize, params.N), params));
+				ret.add(new Conv2dBackwardFilter(i*taskSize, Math.min((i+1)*taskSize, params.N), params));
 			else
 				throw new DMLRuntimeException("Unsupported operator");
 		}

http://git-wip-us.apache.org/repos/asf/systemml/blob/b261661a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNNIm2ColHelper.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNNIm2ColHelper.java b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNNIm2ColHelper.java
index 7b43257..a4a6d3d 100644
--- a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNNIm2ColHelper.java
+++ b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNNIm2ColHelper.java
@@ -71,12 +71,11 @@ public class LibMatrixDNNIm2ColHelper {
 	 * Special case operator for performing dense im2col when stride = [1, 1] and pad = [0, 0] by using System.arraycopy
 	 */
 	static class DenseIm2colWorkerStride1Pad0 implements Im2colWorker {
-		double [] inputArray; double [] outputArray; 
-		int CRS; int S; int R; int P; int Q; int CHW; int H; int W;
+		private final double [] inputArray, outputArray; 
+		private final int S, R, P, Q, CHW, H, W;
 		public DenseIm2colWorkerStride1Pad0(double [] inputArray, double [] outputArray, ConvolutionParameters params) {
 			this.inputArray = inputArray;
 			this.outputArray = outputArray;
-			this.CRS = params.C * params.R * params.S;
 			this.H = params.H; this.W = params.W; this.R = params.R; this.S = params.S; this.P = params.P; this.Q = params.Q;
 			this.CHW = params.C*params.H*params.W;
 		}
@@ -100,10 +99,8 @@ public class LibMatrixDNNIm2ColHelper {
 					System.arraycopy(inputArray, inputOffset + wOffset, outputArray, outOffset, Q);
 					int w = Q - 1;
 					int wPadded = w + wOffset;
-					if (hPadded < H && wPadded < W)
-						outputArray[outOffset + w] = inputArray[inputOffset + wPadded];
-					else
-						outputArray[outOffset + w] = 0;
+					boolean assign = (hPadded < H && wPadded < W);
+					outputArray[outOffset + w] = assign ? inputArray[inputOffset + wPadded] : 0;
 				}
 			}
 		}
@@ -115,8 +112,8 @@ public class LibMatrixDNNIm2ColHelper {
 	 * Special case operator for performing dense im2col when stride = [1, 1] and pad = [0, 0] by using System.arraycopy
 	 */
 	static class DenseIm2colWorkerStride1Pad0AllChannels implements Im2colWorker {
-		double [] inputArray; double [] outputArray; 
-		int CRS; int S; int R; int P; int Q; int CHW; int H; int W;
+		private final double [] inputArray, outputArray; 
+		private final int CRS, S, R, P, Q, CHW, H, W;
 		public DenseIm2colWorkerStride1Pad0AllChannels(double [] inputArray, double [] outputArray, ConvolutionParameters params) {
 			this.inputArray = inputArray;
 			this.outputArray = outputArray;
@@ -144,10 +141,8 @@ public class LibMatrixDNNIm2ColHelper {
 					System.arraycopy(inputArray, inputOffset + wOffset, outputArray, outOffset, Q);
 					int w = Q - 1;
 					int wPadded = w + wOffset;
-					if (hPadded < H && wPadded < W)
-						outputArray[outOffset + w] = inputArray[inputOffset + wPadded];
-					else
-						outputArray[outOffset + w] = 0;
+					boolean assign = (hPadded < H && wPadded < W);
+					outputArray[outOffset + w] = assign ? inputArray[inputOffset + wPadded] : 0;
 				}
 			}
 		}
@@ -157,13 +152,12 @@ public class LibMatrixDNNIm2ColHelper {
 	 * Performing dense im2col (general case)
 	 */
 	static class DenseIm2colWorker implements Im2colWorker {
-		double [] inputArray; double [] outputArray; 
-		int CRS; int S; int R; int P; int Q; int CHW; int H; int W; 
-		int stride_h; int stride_w; int pad_h; int pad_w;
+		private final double [] inputArray, outputArray; 
+		private final int S, R, P, Q, CHW, H, W; 
+		private final int stride_h, stride_w, pad_h, pad_w;
 		public DenseIm2colWorker(double [] inputArray, double [] outputArray, ConvolutionParameters params) {
 			this.inputArray = inputArray;
 			this.outputArray = outputArray;
-			this.CRS = params.C * params.R * params.S;
 			this.H = params.H; this.W = params.W; this.R = params.R; this.S = params.S; this.P = params.P; this.Q = params.Q;
 			this.CHW = params.C*params.H*params.W;
 			this.stride_h = params.stride_h; this.stride_w = params.stride_w;
@@ -190,10 +184,8 @@ public class LibMatrixDNNIm2ColHelper {
 					} else {
 						for (int w = 0; w < Q; ++w) {
 							int wPadded = w * stride_w - pad_w + wOffset;
-							if (wPadded >= 0 && wPadded < W)
-								outputArray[outOffset + w] = inputArray[inputOffset + wPadded];
-							else
-								outputArray[outOffset + w] = 0;
+							boolean assign = (wPadded >= 0 && wPadded < W);
+							outputArray[outOffset + w] = assign ? inputArray[inputOffset + wPadded] : 0;
 						}
 					}
 				}
@@ -205,9 +197,9 @@ public class LibMatrixDNNIm2ColHelper {
 	 * Performing dense im2col (general case)
 	 */
 	private static class DenseIm2colWorkerAllChannels implements Im2colWorker {
-		double [] inputArray; double [] outputArray; 
-		int CRS; int S; int R; int P; int Q; int CHW; int H; int W; 
-		int stride_h; int stride_w; int pad_h; int pad_w;
+		private final double[] inputArray, outputArray; 
+		private final int CRS, S, R, P, Q, CHW, H, W; 
+		private final int stride_h, stride_w, pad_h, pad_w;
 		public DenseIm2colWorkerAllChannels(double [] inputArray, double [] outputArray, ConvolutionParameters params) {
 			this.inputArray = inputArray;
 			this.outputArray = outputArray;
@@ -239,10 +231,8 @@ public class LibMatrixDNNIm2ColHelper {
 					} else {
 						for (int w = 0; w < Q; ++w) {
 							int wPadded = w * stride_w - pad_w + wOffset;
-							if (wPadded >= 0 && wPadded < W)
-								outputArray[outOffset + w] = inputArray[inputOffset + wPadded];
-							else
-								outputArray[outOffset + w] = 0;
+							boolean assign = (wPadded >= 0 && wPadded < W);
+							outputArray[outOffset + w] = assign ? inputArray[inputOffset + wPadded] : 0;
 						}
 					}
 				}
@@ -314,7 +304,7 @@ public class LibMatrixDNNIm2ColHelper {
 		private final MatrixBlock input, output;
 		private final int S, R, P, Q, W, HW;
 		private final int stride_h, stride_w, pad_h, pad_w; 
-		final boolean trans;
+		private final boolean trans;
 		
 		public SparseSparseIm2colWorker(MatrixBlock input, MatrixBlock im2ColOutBlock, ConvolutionParameters params, boolean trans) {
 			this.input = input;

http://git-wip-us.apache.org/repos/asf/systemml/blob/b261661a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNNRotate180Helper.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNNRotate180Helper.java b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNNRotate180Helper.java
index 6bc7caf..7a71ced 100644
--- a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNNRotate180Helper.java
+++ b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNNRotate180Helper.java
@@ -18,8 +18,6 @@
  */
 package org.apache.sysml.runtime.matrix.data;
 
-import java.util.Arrays;
-
 /**
  * This class contains the different implementation of rotate180 operation
  */
@@ -27,11 +25,12 @@ public class LibMatrixDNNRotate180Helper {
 
 	static interface Rotate180Worker {
 		public void execute(int inputN, int outputN);
-		public static Rotate180Worker getWorker(MatrixBlock input, double [] outputArray, ConvolutionParameters params, boolean zeroOutSparseOutput) {
-			if(!input.isInSparseFormat()) 
-				return new DenseRotate180Worker(input, outputArray, params);
+		public static Rotate180Worker getWorker(MatrixBlock in, MatrixBlock out, 
+			ConvolutionParameters params, boolean zeroOutSparseOutput, boolean trans) {
+			if(!in.isInSparseFormat()) 
+				return new DenseRotate180Worker(in, out.getDenseBlock(), params);
 			else
-				return new SparseRotate180Worker(input, outputArray, params, zeroOutSparseOutput);
+				return new SparseRotate180Worker(in, out, params, trans);
 		}
 	}
 	
@@ -71,39 +70,41 @@ public class LibMatrixDNNRotate180Helper {
 	 * Because the number of rows of output (i.e. NPQ) is much larger than number of columns (i.e. K) 
 	 */
 	static class SparseRotate180Worker implements Rotate180Worker {
-
-		double [] outputArray;  MatrixBlock input;
-		ConvolutionParameters params; boolean zeroOutSparseOutput;
-		public SparseRotate180Worker(MatrixBlock input, double [] outputArray,  ConvolutionParameters params, boolean zeroOutSparseOutput) {
-			this.outputArray = outputArray;
+		private final MatrixBlock in, out;
+		private final ConvolutionParameters params;
+		private final boolean trans;
+		
+		public SparseRotate180Worker(MatrixBlock input, MatrixBlock output, 
+			ConvolutionParameters params, boolean trans) {
+			this.in = input;
+			this.out = output;
 			this.params = params;
-			this.zeroOutSparseOutput = zeroOutSparseOutput;
-			this.input = input;
-			if(outputArray == null)
-				throw new RuntimeException("Incorrect usage: empty inputs");
+			this.trans = trans;
 		}
 		
 		@Override
 		public void execute(int inputN, int outputN) {
-			if(zeroOutSparseOutput)
-				Arrays.fill(outputArray, 0);
+			out.reset();
 			
-			int outputOffset = outputN*params.K*params.P*params.Q;
-			if(!input.isEmptyBlock()) {
-				if( !input.sparseBlock.isEmpty(inputN) ) {
-					int [] tensorIndexes = new int[3];
-					int apos = input.sparseBlock.pos(inputN);
-					int alen = input.sparseBlock.size(inputN);
-					int[] aix = input.sparseBlock.indexes(inputN);
-					double[] avals = input.sparseBlock.values(inputN);
-					for(int j = apos; j < apos+alen; j++) {
-						LibMatrixDNNHelper.computeTensorIndexes(aix[j], tensorIndexes, params.P, params.Q);
-						int k = tensorIndexes[0];
-						int p = tensorIndexes[1];
-						int q = tensorIndexes[2];
-						outputArray[outputOffset + p*params.Q*params.K + q*params.K + k] = avals[j];
-					}
-				}
+			SparseBlock sblock = in.sparseBlock;
+			if( sblock==null || sblock.isEmpty(inputN) )
+				return;
+			
+			int outputOffset = outputN*params.P*params.Q;
+			int [] tensorIndexes = new int[3];
+			int apos = sblock.pos(inputN);
+			int alen = sblock.size(inputN);
+			int[] aix = sblock.indexes(inputN);
+			double[] avals = sblock.values(inputN);
+			for(int j = apos; j < apos+alen; j++) {
+				LibMatrixDNNHelper.computeTensorIndexes(aix[j], tensorIndexes, params.P, params.Q);
+				int k = tensorIndexes[0];
+				int p = tensorIndexes[1];
+				int q = tensorIndexes[2];
+				if( trans )
+					out.appendValue(k, outputOffset + p*params.Q + q, avals[j]);
+				else
+					out.appendValue(outputOffset + p*params.Q + q, k, avals[j]);
 			}
 		}
 	}

http://git-wip-us.apache.org/repos/asf/systemml/blob/b261661a/src/test/java/org/apache/sysml/test/integration/functions/tensor/Conv2DBackwardTest.java
----------------------------------------------------------------------
diff --git a/src/test/java/org/apache/sysml/test/integration/functions/tensor/Conv2DBackwardTest.java b/src/test/java/org/apache/sysml/test/integration/functions/tensor/Conv2DBackwardTest.java
index decca59..6a9528b 100644
--- a/src/test/java/org/apache/sysml/test/integration/functions/tensor/Conv2DBackwardTest.java
+++ b/src/test/java/org/apache/sysml/test/integration/functions/tensor/Conv2DBackwardTest.java
@@ -32,7 +32,6 @@ import org.junit.Test;
 
 public class Conv2DBackwardTest extends AutomatedTestBase
 {
-	
 	private final static String TEST_NAME = "Conv2DBackwardTest";
 	private final static String TEST_DIR = "functions/tensor/";
 	private final static String TEST_CLASS_DIR = TEST_DIR + Conv2DBackwardTest.class.getSimpleName() + "/";
@@ -40,214 +39,188 @@ public class Conv2DBackwardTest extends AutomatedTestBase
 	
 	@Override
 	public void setUp() {
-		addTestConfiguration(TEST_NAME, new TestConfiguration(TEST_CLASS_DIR, TEST_NAME, 
-				new String[] {"B"}));
+		TestUtils.clearAssertionInformation();
+		addTestConfiguration(TEST_NAME, new TestConfiguration(TEST_CLASS_DIR, TEST_NAME, new String[] {"B"}));
 	}
 	
 	
 	@Test
-	public void testConv2DBackwardFilterDense1() 
-	{
+	public void testConv2DBackwardFilterDense1() {
 		int numImg = 3; int imgSize = 3; int numChannels = 3; int numFilters = 1; int filterSize = 2; int stride = 1; int pad = 0;
 		runConv2DBackwardFilterTest(ExecType.CP, imgSize, numImg, numChannels, numFilters, filterSize, stride, pad, false, false);
 	}
 	
 	@Test
-	public void testConv2DBackwardFilterDense2() 
-	{
+	public void testConv2DBackwardFilterDense2() {
 		int numImg = 3; int imgSize = 3; int numChannels = 3; int numFilters = 4; int filterSize = 2; int stride = 1; int pad = 0;
 		runConv2DBackwardFilterTest(ExecType.CP, imgSize, numImg, numChannels, numFilters, filterSize, stride, pad, false, false);
 	}
 	
 	@Test
-	public void testConv2DBackwardFilterDense3() 
-	{
+	public void testConv2DBackwardFilterDense3() {
 		int numImg = 3; int imgSize = 10; int numChannels = 4; int numFilters = 3; int filterSize = 2; int stride = 2; int pad = 1;
 		runConv2DBackwardFilterTest(ExecType.CP, imgSize, numImg, numChannels, numFilters, filterSize, stride, pad, false, false);
 	}
 	
 	@Test
-	public void testConv2DBackwardFilterDense4() 
-	{
+	public void testConv2DBackwardFilterDense4() {
 		int numImg = 3; int imgSize = 10; int numChannels = 4; int numFilters = 3; int filterSize = 5; int stride = 1; int pad = 1;
 		runConv2DBackwardFilterTest(ExecType.CP, imgSize, numImg, numChannels, numFilters, filterSize, stride, pad, false, false);
 	}
 	
 	@Test
-	public void testConv2DBackwardFilterDense5() 
-	{
+	public void testConv2DBackwardFilterDense5() {
 		int numImg = 3; int imgSize = 10; int numChannels = 2; int numFilters = 3; int filterSize = 5; int stride = 3; int pad = 2;
 		runConv2DBackwardFilterTest(ExecType.CP, imgSize, numImg, numChannels, numFilters, filterSize, stride, pad, false, false);
 	}
 	
 	@Test
-	public void testConv2DBackwardFilterSparse1() 
-	{
+	public void testConv2DBackwardFilterSparse1() {
 		int numImg = 3; int imgSize = 3; int numChannels = 3; int numFilters = 1; int filterSize = 2; int stride = 1; int pad = 0;
 		runConv2DBackwardFilterTest(ExecType.CP, imgSize, numImg, numChannels, numFilters, filterSize, stride, pad, false, true);
 	}
 	
 	@Test
-	public void testConv2DBackwardFilterSparse2() 
-	{
+	public void testConv2DBackwardFilterSparse2() {
 		int numImg = 3; int imgSize = 3; int numChannels = 3; int numFilters = 4; int filterSize = 2; int stride = 1; int pad = 0;
 		runConv2DBackwardFilterTest(ExecType.CP, imgSize, numImg, numChannels, numFilters, filterSize, stride, pad, false, true);
 	}
 	
 	@Test
-	public void testConv2DBackwardFilterSparse3() 
-	{
+	public void testConv2DBackwardFilterSparse3() {
 		int numImg = 3; int imgSize = 10; int numChannels = 4; int numFilters = 3; int filterSize = 2; int stride = 2; int pad = 1;
 		runConv2DBackwardFilterTest(ExecType.CP, imgSize, numImg, numChannels, numFilters, filterSize, stride, pad, false, true);
 	}
 	
 	@Test
-	public void testConv2DBackwardFilterSparse4() 
-	{
+	public void testConv2DBackwardFilterSparse4() {
 		int numImg = 3; int imgSize = 10; int numChannels = 4; int numFilters = 3; int filterSize = 5; int stride = 1; int pad = 1;
 		runConv2DBackwardFilterTest(ExecType.CP, imgSize, numImg, numChannels, numFilters, filterSize, stride, pad, false, true);
 	}
 	
 	@Test
-	public void testConv2DBackwardFilterSparse5() 
-	{
+	public void testConv2DBackwardFilterSparse5() {
 		int numImg = 3; int imgSize = 10; int numChannels = 2; int numFilters = 3; int filterSize = 5; int stride = 3; int pad = 2;
 		runConv2DBackwardFilterTest(ExecType.CP, imgSize, numImg, numChannels, numFilters, filterSize, stride, pad, false, true);
 	}
 	
 	@Test
-	public void testConv2DBackwardFilterSparse6() 
-	{
+	public void testConv2DBackwardFilterSparse6() {
 		int numImg = 3; int imgSize = 3; int numChannels = 3; int numFilters = 1; int filterSize = 2; int stride = 1; int pad = 0;
 		runConv2DBackwardFilterTest(ExecType.CP, imgSize, numImg, numChannels, numFilters, filterSize, stride, pad, true, false);
 	}
 	
 	@Test
-	public void testConv2DBackwardFilterSparse7() 
-	{
+	public void testConv2DBackwardFilterSparse7() {
 		int numImg = 3; int imgSize = 3; int numChannels = 3; int numFilters = 4; int filterSize = 2; int stride = 1; int pad = 0;
 		runConv2DBackwardFilterTest(ExecType.CP, imgSize, numImg, numChannels, numFilters, filterSize, stride, pad, true, false);
 	}
 	
 	@Test
-	public void testConv2DBackwardFilterSparse8() 
-	{
+	public void testConv2DBackwardFilterSparse8() {
 		int numImg = 3; int imgSize = 10; int numChannels = 4; int numFilters = 3; int filterSize = 2; int stride = 2; int pad = 1;
 		runConv2DBackwardFilterTest(ExecType.CP, imgSize, numImg, numChannels, numFilters, filterSize, stride, pad, true, false);
 	}
 	
 	@Test
-	public void testConv2DBackwardFilterSparse9() 
-	{
+	public void testConv2DBackwardFilterSparse9() {
 		int numImg = 3; int imgSize = 10; int numChannels = 4; int numFilters = 3; int filterSize = 5; int stride = 1; int pad = 1;
 		runConv2DBackwardFilterTest(ExecType.CP, imgSize, numImg, numChannels, numFilters, filterSize, stride, pad, true, false);
 	}
 	
 	@Test
-	public void testConv2DBackwardFilterSparse10() 
-	{
+	public void testConv2DBackwardFilterSparse10() {
 		int numImg = 3; int imgSize = 10; int numChannels = 2; int numFilters = 3; int filterSize = 5; int stride = 3; int pad = 2;
 		runConv2DBackwardFilterTest(ExecType.CP, imgSize, numImg, numChannels, numFilters, filterSize, stride, pad, true, false);
 	}
 	
 	@Test
-	public void testConv2DBackwardFilterSparse11() 
-	{
+	public void testConv2DBackwardFilterSparse11() {
 		int numImg = 3; int imgSize = 3; int numChannels = 3; int numFilters = 1; int filterSize = 2; int stride = 1; int pad = 0;
 		runConv2DBackwardFilterTest(ExecType.CP, imgSize, numImg, numChannels, numFilters, filterSize, stride, pad, true, true);
 	}
 	
 	@Test
-	public void testConv2DBackwardFilterSparse12() 
-	{
+	public void testConv2DBackwardFilterSparse12() {
 		int numImg = 3; int imgSize = 3; int numChannels = 3; int numFilters = 4; int filterSize = 2; int stride = 1; int pad = 0;
 		runConv2DBackwardFilterTest(ExecType.CP, imgSize, numImg, numChannels, numFilters, filterSize, stride, pad, true, true);
 	}
 	
 	@Test
-	public void testConv2DBackwardFilterSparse13() 
-	{
+	public void testConv2DBackwardFilterSparse13() {
 		int numImg = 3; int imgSize = 10; int numChannels = 4; int numFilters = 3; int filterSize = 2; int stride = 2; int pad = 1;
 		runConv2DBackwardFilterTest(ExecType.CP, imgSize, numImg, numChannels, numFilters, filterSize, stride, pad, true, true);
 	}
 	
 	@Test
-	public void testConv2DBackwardFilterSparse14() 
-	{
+	public void testConv2DBackwardFilterSparse14() {
 		int numImg = 3; int imgSize = 10; int numChannels = 4; int numFilters = 3; int filterSize = 5; int stride = 1; int pad = 1;
 		runConv2DBackwardFilterTest(ExecType.CP, imgSize, numImg, numChannels, numFilters, filterSize, stride, pad, true, true);
 	}
 	
 	@Test
-	public void testConv2DBackwardFilterSparse15() 
-	{
+	public void testConv2DBackwardFilterSparse15() {
 		int numImg = 3; int imgSize = 10; int numChannels = 2; int numFilters = 3; int filterSize = 5; int stride = 3; int pad = 2;
 		runConv2DBackwardFilterTest(ExecType.CP, imgSize, numImg, numChannels, numFilters, filterSize, stride, pad, true, false);
 	}
 	
-	/**
-	 * 
-	 * @param et
-	 * @param sparse
-	 */
+	@Test
+	public void testConv2DBackwardFilterSparse16() {
+		int numImg = 10; int imgSize = 40; int numChannels = 4; int numFilters = 30; int filterSize = 25; int stride = 1; int pad = 0;
+		runConv2DBackwardFilterTest(ExecType.CP, imgSize, numImg, numChannels, numFilters, filterSize, stride, pad, true, false);
+	}
+	
+	@Test
+	public void testConv2DBackwardFilterSparse17() {
+		int numImg = 10, imgSize = 40, numChannels = 4, numFilters = 30, filterSize = 25, stride = 1, pad = 0;
+		runConv2DBackwardFilterTest(ExecType.CP, imgSize, numImg, numChannels, numFilters, filterSize, stride, pad, false, true);
+	}
+	
 	public void runConv2DBackwardFilterTest( ExecType et, int imgSize, int numImg, int numChannels, int numFilters, 
-			int filterSize, int stride, int pad, boolean sparse1, boolean sparse2) 
+		int filterSize, int stride, int pad, boolean sparse1, boolean sparse2) 
 	{
-		RUNTIME_PLATFORM oldRTP = rtplatform;
-			
+		RUNTIME_PLATFORM platformOld = rtplatform;
+		switch( et ){
+			case MR: rtplatform = RUNTIME_PLATFORM.HADOOP; break;
+			case SPARK: rtplatform = RUNTIME_PLATFORM.SPARK; break;
+			default: rtplatform = RUNTIME_PLATFORM.HYBRID; break;
+		}
 		boolean sparkConfigOld = DMLScript.USE_LOCAL_SPARK_CONFIG;
+		if( rtplatform == RUNTIME_PLATFORM.SPARK )
+			DMLScript.USE_LOCAL_SPARK_CONFIG = true;
+		
 		try
 		{
-			String sparseVal1 = (""+sparse1).toUpperCase();
-			String sparseVal2 = (""+sparse2).toUpperCase();
-			
-	    TestConfiguration config = getTestConfiguration(TEST_NAME);
-	    if(et == ExecType.SPARK) {
-	    	rtplatform = RUNTIME_PLATFORM.SPARK;
-	    }
-	    else {
-	    	rtplatform = (et==ExecType.MR)? RUNTIME_PLATFORM.HADOOP : RUNTIME_PLATFORM.SINGLE_NODE;
-	    }
-			if( rtplatform == RUNTIME_PLATFORM.SPARK )
-				DMLScript.USE_LOCAL_SPARK_CONFIG = true;
-			
+			String sparseVal1 = String.valueOf(sparse1).toUpperCase();
+			String sparseVal2 = String.valueOf(sparse2).toUpperCase();
+			long P = ConvolutionUtils.getP(imgSize, filterSize, stride, pad);
+			TestConfiguration config = getTestConfiguration(TEST_NAME);
 			loadTestConfiguration(config);
-	        
-			/* This is for running the junit test the new way, i.e., construct the arguments directly */
+			
 			String RI_HOME = SCRIPT_DIR + TEST_DIR;
 			fullDMLScriptName = RI_HOME + TEST_NAME + ".dml";
-			
-			
-			long P = ConvolutionUtils.getP(imgSize, filterSize, stride, pad);
-			
-			programArgs = new String[]{"-explain", "-args",  "" + imgSize, "" + numImg, 
-				"" + numChannels, "" + numFilters, 
-				"" + filterSize, "" + stride, "" + pad,
-				"" + P, "" + P, 
-				output("B"), sparseVal1, sparseVal2};
-			        
-			boolean exceptionExpected = false;
-			int expectedNumberOfJobs = -1;
-			runTest(true, exceptionExpected, null, expectedNumberOfJobs);
-			
+			programArgs = new String[]{"-explain", "-args", 
+				String.valueOf(imgSize), String.valueOf(numImg), 
+				String.valueOf(numChannels), String.valueOf(numFilters), 
+				String.valueOf(filterSize), String.valueOf(stride), String.valueOf(pad), 
+				String.valueOf(P), String.valueOf(P), output("B"), sparseVal1, sparseVal2};
 			fullRScriptName = RI_HOME + TEST_NAME + ".R";
 			rCmd = "Rscript" + " " + fullRScriptName + " " + imgSize + " " + numImg + 
 					" " + numChannels + " " + numFilters + 
 					" " + filterSize + " " + stride + " " + pad + " " + P + " " + P + " " + expectedDir() +
 					" " + sparseVal1 + " " + sparseVal2;
-			// Run comparison R script
+			
+			// Run DML and R scripts
+			runTest(true, false, null, -1);
 			runRScript(true);
-			HashMap<CellIndex, Double> bHM = readRMatrixFromFS("B");
 			
+			HashMap<CellIndex, Double> bHM = readRMatrixFromFS("B");
 			HashMap<CellIndex, Double> dmlfile = readDMLMatrixFromHDFS("B");
 			TestUtils.compareMatrices(dmlfile, bHM, epsilon, "B-DML", "NumPy");
-			
 		}
-		finally
-		{
-			rtplatform = oldRTP;
+		finally {
+			rtplatform = platformOld;
 			DMLScript.USE_LOCAL_SPARK_CONFIG = sparkConfigOld;
 		}
 	}
-	
 }

http://git-wip-us.apache.org/repos/asf/systemml/blob/b261661a/src/test/scripts/functions/tensor/Conv2DBackwardTest.R
----------------------------------------------------------------------
diff --git a/src/test/scripts/functions/tensor/Conv2DBackwardTest.R b/src/test/scripts/functions/tensor/Conv2DBackwardTest.R
index 7319da7..3964e17 100644
--- a/src/test/scripts/functions/tensor/Conv2DBackwardTest.R
+++ b/src/test/scripts/functions/tensor/Conv2DBackwardTest.R
@@ -35,13 +35,13 @@ x=matrix(seq(1, numImg*numChannels*imgSize*imgSize), numImg, numChannels*imgSize
 dout=matrix(seq(1, numImg*numFilters*P*Q), numImg, numFilters*P*Q, byrow=TRUE)
 
 if(as.logical(args[11])) {
-	zero_mask = (x - mean(x)) > 0 
+	zero_mask = (x - mean(x)*1.5) > 0 
 	x = x * zero_mask
 } else {
 	x = x - mean(x)
 }
 if(as.logical(args[12])) {
-	zero_mask = (dout - mean(dout)) > 0 
+	zero_mask = (dout - mean(dout)*1.5) > 0 
 	dout = dout * zero_mask
 } else {
 	dout = dout - mean(dout)

http://git-wip-us.apache.org/repos/asf/systemml/blob/b261661a/src/test/scripts/functions/tensor/Conv2DBackwardTest.dml
----------------------------------------------------------------------
diff --git a/src/test/scripts/functions/tensor/Conv2DBackwardTest.dml b/src/test/scripts/functions/tensor/Conv2DBackwardTest.dml
index fb14c1c..b17281e 100644
--- a/src/test/scripts/functions/tensor/Conv2DBackwardTest.dml
+++ b/src/test/scripts/functions/tensor/Conv2DBackwardTest.dml
@@ -33,14 +33,14 @@ Q = $9
 x=matrix(seq(1, numImg*numChannels*imgSize*imgSize), rows=numImg, cols=numChannels*imgSize*imgSize)
 dout=matrix(seq(1, numImg*numFilters*P*Q), rows=numImg, cols=numFilters*P*Q)
 if($11) {
-	zero_mask = (x - mean(x)) > 0 
+	zero_mask = (x - mean(x)*1.5) > 0 
 	x = x * zero_mask
 }
 else {
 	x = x - mean(x)
 }
 if($12) {
-	zero_mask = (dout - mean(dout)) > 0 
+	zero_mask = (dout - mean(dout)*1.5) > 0 
 	dout = dout * zero_mask
 }
 else {


[50/50] [abbrv] systemml git commit: [SYSTEMML-1984] Robustness JMLC prepared scripts (outputs and configs)

Posted by re...@apache.org.
[SYSTEMML-1984] Robustness JMLC prepared scripts (outputs and configs)

This patch improves the robustness of JMLC prepared scripts for
deployments where prepared scripts are not necessarily created and
executed by the same thread. Accordingly, we no longer maintain output
variables (for dynamic recompilation) and compiler configurations in a
thread-local manner, but instead attach the output variables to the
symbol table, and set and clean up thread-local compiler configurations
individually at connection level and on prepared-script execute (see
the sketch below).
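
A minimal, hypothetical sketch of the resulting pattern follows: the
prepared script keeps the DML and compiler configuration from prepare
time and installs/clears the thread-local state inside execute, so the
executing thread need not be the creating thread. Class and method
names below are illustrative only, not the actual SystemML API.

  // Illustrative pattern only (not the real SystemML classes).
  class PreparedScriptSketch {
    private final Object dmlConf;      // kept from prepare time
    private final Object compilerConf; // kept from prepare time

    PreparedScriptSketch(Object dmlConf, Object compilerConf) {
      this.dmlConf = dmlConf;
      this.compilerConf = compilerConf;
    }

    void execute(Runnable compiledProgram) {
      // install thread-local configs for the *executing* thread
      ThreadLocalConfig.set(dmlConf, compilerConf);
      try {
        compiledProgram.run(); // run the compiled runtime program
      } finally {
        ThreadLocalConfig.clear(); // always remove thread-local state
      }
    }
  }

  class ThreadLocalConfig {
    private static final ThreadLocal<Object[]> CONF = new ThreadLocal<>();
    static void set(Object dmlConf, Object cconf) { CONF.set(new Object[]{dmlConf, cconf}); }
    static void clear() { CONF.remove(); }
  }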


Project: http://git-wip-us.apache.org/repos/asf/systemml/repo
Commit: http://git-wip-us.apache.org/repos/asf/systemml/commit/0d467220
Tree: http://git-wip-us.apache.org/repos/asf/systemml/tree/0d467220
Diff: http://git-wip-us.apache.org/repos/asf/systemml/diff/0d467220

Branch: refs/heads/master
Commit: 0d4672207185ac183d22e157171a58c5f38a5bb0
Parents: e1f5866
Author: Matthias Boehm <mb...@gmail.com>
Authored: Thu Nov 2 20:02:22 2017 -0700
Committer: Matthias Boehm <mb...@gmail.com>
Committed: Thu Nov 2 20:02:22 2017 -0700

----------------------------------------------------------------------
 .../org/apache/sysml/api/jmlc/Connection.java   | 36 ++++++-------
 .../org/apache/sysml/api/jmlc/JMLCProxy.java    | 55 --------------------
 .../apache/sysml/api/jmlc/PreparedScript.java   | 39 ++++++++++----
 .../apache/sysml/api/mlcontext/MLContext.java   | 11 ----
 .../sysml/api/mlcontext/ScriptExecutor.java     |  8 +--
 .../apache/sysml/hops/recompile/Recompiler.java | 13 ++---
 .../controlprogram/LocalVariableMap.java        | 25 ++++++---
 .../org/apache/sysml/utils/MLContextProxy.java  | 11 +---
 .../functions/jmlc/MulticlassSVMScoreTest.java  | 41 +++++++++------
 9 files changed, 102 insertions(+), 137 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/systemml/blob/0d467220/src/main/java/org/apache/sysml/api/jmlc/Connection.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/api/jmlc/Connection.java b/src/main/java/org/apache/sysml/api/jmlc/Connection.java
index f933396..4caa86c 100644
--- a/src/main/java/org/apache/sysml/api/jmlc/Connection.java
+++ b/src/main/java/org/apache/sysml/api/jmlc/Connection.java
@@ -95,9 +95,10 @@ import org.apache.wink.json4j.JSONObject;
  * </ul>
  */
 public class Connection implements Closeable
-{		
-	private DMLConfig _dmlconf = null;
-
+{
+	private final DMLConfig _dmlconf;
+	private final CompilerConfig _cconf;
+	
 	/**
 	 * Connection constructor, the starting point for any other JMLC API calls.
 	 * 
@@ -122,14 +123,17 @@ public class Connection implements Closeable
 		cconf.set(ConfigType.ALLOW_INDIVIDUAL_SB_SPECIFIC_OPS, false);
 		cconf.set(ConfigType.ALLOW_CSE_PERSISTENT_READS, false);
 		cconf.set(ConfigType.CODEGEN_ENABLED, false);
-		ConfigurationManager.setLocalConfig(cconf);
+		_cconf = cconf;
 		
 		//disable caching globally 
 		CacheableData.disableCaching();
 		
-		//create thread-local default configuration
+		//create default configuration
 		_dmlconf = new DMLConfig();
+		
+		//set thread-local configurations for compilation
 		ConfigurationManager.setLocalConfig(_dmlconf);
+		ConfigurationManager.setLocalConfig(_cconf);
 	}
 	
 	/**
@@ -143,10 +147,12 @@ public class Connection implements Closeable
 		this();
 		
 		//set optional compiler configurations in current config
-		CompilerConfig cconf = ConfigurationManager.getCompilerConfig();
 		for( ConfigType configType : configs )
-			cconf.set(configType, true);
-		ConfigurationManager.setLocalConfig(cconf);
+			_cconf.set(configType, true);
+		
+		//set thread-local configurations for compilation
+		ConfigurationManager.setLocalConfig(_dmlconf);
+		ConfigurationManager.setLocalConfig(_cconf);
 	}
 	
 	/**
@@ -202,7 +208,7 @@ public class Connection implements Closeable
 			
 			//language validate
 			DMLTranslator dmlt = new DMLTranslator(prog);
-			dmlt.liveVariableAnalysis(prog);			
+			dmlt.liveVariableAnalysis(prog);
 			dmlt.validateParseTree(prog);
 			
 			//hop construct/rewrite
@@ -220,10 +226,6 @@ public class Connection implements Closeable
 			
 			//final cleanup runtime prog
 			JMLCUtils.cleanupRuntimeProgram(rtprog, outputs);
-			
-			//activate thread-local proxy for dynamic recompilation
-			if( ConfigurationManager.isDynamicRecompilation() )
-				JMLCProxy.setActive(outputs);
 		}
 		catch(ParseException pe) {
 			// don't chain ParseException (for cleaner error output)
@@ -232,9 +234,9 @@ public class Connection implements Closeable
 		catch(Exception ex) {
 			throw new DMLException(ex);
 		}
-			
+		
 		//return newly create precompiled script 
-		return new PreparedScript(rtprog, inputs, outputs);
+		return new PreparedScript(rtprog, inputs, outputs, _dmlconf, _cconf);
 	}
 	
 	/**
@@ -243,10 +245,8 @@ public class Connection implements Closeable
 	 */
 	@Override
 	public void close() {
-		//clear thread-local dml / compiler configs
+		//clear thread-local configurations
 		ConfigurationManager.clearLocalConfigs();
-		if( ConfigurationManager.isDynamicRecompilation() )
-			JMLCProxy.setActive(null);
 		if( ConfigurationManager.isCodegenEnabled() )
 			SpoofCompiler.cleanupCodeGenerator();
 	}

http://git-wip-us.apache.org/repos/asf/systemml/blob/0d467220/src/main/java/org/apache/sysml/api/jmlc/JMLCProxy.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/api/jmlc/JMLCProxy.java b/src/main/java/org/apache/sysml/api/jmlc/JMLCProxy.java
deleted file mode 100644
index 400380c..0000000
--- a/src/main/java/org/apache/sysml/api/jmlc/JMLCProxy.java
+++ /dev/null
@@ -1,55 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-package org.apache.sysml.api.jmlc;
-
-import java.util.ArrayList;
-import java.util.Arrays;
-import java.util.HashSet;
-
-import org.apache.sysml.runtime.instructions.Instruction;
-
-/**
- * This proxy provides thread-local access to output variables per connection
- * in order to enable dynamic recompilation in JMLC.
- */
-public class JMLCProxy
-{
-	private static ThreadLocal<HashSet<String>> _outputs = new ThreadLocal<HashSet<String>>() {
-		@Override 
-		protected HashSet<String> initialValue() { 
-			return null;
-		}
-	};
-	
-	public static void setActive(String[] output) {
-		if( output != null )
-			_outputs.set(new HashSet<>(Arrays.asList(output)));
-		else
-			_outputs.remove();
-	}
-
-	public static boolean isActive() {
-		return (_outputs.get() != null);
-	}
-
-	public static ArrayList<Instruction> performCleanupAfterRecompilation(ArrayList<Instruction> tmp) {
-		return JMLCUtils.cleanupRuntimeInstructions(tmp, _outputs.get());
-	}
-}

http://git-wip-us.apache.org/repos/asf/systemml/blob/0d467220/src/main/java/org/apache/sysml/api/jmlc/PreparedScript.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/api/jmlc/PreparedScript.java b/src/main/java/org/apache/sysml/api/jmlc/PreparedScript.java
index 6cd041c..f14d3a1 100644
--- a/src/main/java/org/apache/sysml/api/jmlc/PreparedScript.java
+++ b/src/main/java/org/apache/sysml/api/jmlc/PreparedScript.java
@@ -29,6 +29,7 @@ import org.apache.commons.logging.LogFactory;
 import org.apache.sysml.api.DMLException;
 import org.apache.sysml.conf.CompilerConfig;
 import org.apache.sysml.conf.ConfigurationManager;
+import org.apache.sysml.conf.DMLConfig;
 import org.apache.sysml.conf.CompilerConfig.ConfigType;
 import org.apache.sysml.hops.OptimizerUtils;
 import org.apache.sysml.hops.ipa.FunctionCallGraph;
@@ -64,13 +65,15 @@ public class PreparedScript
 	private static final Log LOG = LogFactory.getLog(PreparedScript.class.getName());
 	
 	//input/output specification
-	private HashSet<String> _inVarnames = null;
-	private HashSet<String> _outVarnames = null;
-	private HashMap<String,Data> _inVarReuse = null;
+	private final HashSet<String> _inVarnames;
+	private final HashSet<String> _outVarnames;
+	private final HashMap<String,Data> _inVarReuse;
 	
 	//internal state (reused)
-	private Program _prog = null;
-	private LocalVariableMap _vars = null; 
+	private final Program _prog;
+	private final LocalVariableMap _vars;
+	private final DMLConfig _dmlconf;
+	private final CompilerConfig _cconf;
 	
 	/**
 	 * Meant to be invoked only from Connection.
@@ -78,8 +81,10 @@ public class PreparedScript
 	 * @param prog the DML/PyDML program
 	 * @param inputs input variables to register
 	 * @param outputs output variables to register
+	 * @param dmlconf dml configuration 
+	 * @param cconf compiler configuration
 	 */
-	protected PreparedScript( Program prog, String[] inputs, String[] outputs ) 
+	protected PreparedScript( Program prog, String[] inputs, String[] outputs, DMLConfig dmlconf, CompilerConfig cconf ) 
 	{
 		_prog = prog;
 		_vars = new LocalVariableMap();
@@ -90,6 +95,14 @@ public class PreparedScript
 		_outVarnames = new HashSet<>();
 		Collections.addAll(_outVarnames, outputs);
 		_inVarReuse = new HashMap<>();
+		
+		//attach registered outputs (for dynamic recompile)
+		_vars.setRegisteredOutputs(_outVarnames);
+		
+		//keep dml and compiler configuration to be set as thread-local config
+		//on execute, which allows different threads creating/executing the script
+		_dmlconf = dmlconf;
+		_cconf = cconf;
 	}
 	
 	/**
@@ -386,11 +399,15 @@ public class PreparedScript
 		//add reused variables
 		_vars.putAll(_inVarReuse);
 		
+		//set thread-local configurations
+		ConfigurationManager.setLocalConfig(_dmlconf);
+		ConfigurationManager.setLocalConfig(_cconf);
+		
 		//create and populate execution context
-		ExecutionContext ec = ExecutionContextFactory.createContext(_vars, _prog);	
+		ExecutionContext ec = ExecutionContextFactory.createContext(_vars, _prog);
 		
-		//core execute runtime program	
-		_prog.execute(ec);  
+		//core execute runtime program
+		_prog.execute(ec);
 		
 		//cleanup unnecessary outputs
 		_vars.removeAllNotIn(_outVarnames);
@@ -402,6 +419,10 @@ public class PreparedScript
 			if( tmpVar != null )
 				rvars.addResult(ovar, tmpVar);
 		}
+		
+		//clear thread-local configurations
+		ConfigurationManager.clearLocalConfigs();
+		
 		return rvars;
 	}
 	

http://git-wip-us.apache.org/repos/asf/systemml/blob/0d467220/src/main/java/org/apache/sysml/api/mlcontext/MLContext.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/api/mlcontext/MLContext.java b/src/main/java/org/apache/sysml/api/mlcontext/MLContext.java
index 762db8f..6fe66ff 100644
--- a/src/main/java/org/apache/sysml/api/mlcontext/MLContext.java
+++ b/src/main/java/org/apache/sysml/api/mlcontext/MLContext.java
@@ -19,7 +19,6 @@
 
 package org.apache.sysml.api.mlcontext;
 
-import java.util.ArrayList;
 import java.util.Date;
 import java.util.Set;
 
@@ -28,7 +27,6 @@ import org.apache.spark.SparkContext;
 import org.apache.spark.api.java.JavaSparkContext;
 import org.apache.spark.sql.SparkSession;
 import org.apache.sysml.api.DMLScript;
-import org.apache.sysml.api.jmlc.JMLCUtils;
 import org.apache.sysml.conf.ConfigurationManager;
 import org.apache.sysml.conf.DMLConfig;
 import org.apache.sysml.parser.DataExpression;
@@ -39,7 +37,6 @@ import org.apache.sysml.runtime.DMLRuntimeException;
 import org.apache.sysml.runtime.controlprogram.LocalVariableMap;
 import org.apache.sysml.runtime.controlprogram.caching.MatrixObject;
 import org.apache.sysml.runtime.controlprogram.context.SparkExecutionContext;
-import org.apache.sysml.runtime.instructions.Instruction;
 import org.apache.sysml.runtime.instructions.cp.Data;
 import org.apache.sysml.runtime.instructions.cp.ScalarObject;
 import org.apache.sysml.runtime.matrix.MatrixFormatMetaData;
@@ -590,14 +587,6 @@ public class MLContext {
 			}
 			throw new MLContextException("getMatrixObject not set for parameter: " + parameterName);
 		}
-
-		public ArrayList<Instruction> performCleanupAfterRecompilation(ArrayList<Instruction> instructions) {
-			if (executionScript == null || executionScript.getOutputVariables() == null)
-				return instructions;
-
-			Set<String> outputVariableNames = executionScript.getOutputVariables();
-			return JMLCUtils.cleanupRuntimeInstructions(instructions, outputVariableNames.toArray(new String[0]));
-		}
 	}
 
 	/**

http://git-wip-us.apache.org/repos/asf/systemml/blob/0d467220/src/main/java/org/apache/sysml/api/mlcontext/ScriptExecutor.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/api/mlcontext/ScriptExecutor.java b/src/main/java/org/apache/sysml/api/mlcontext/ScriptExecutor.java
index f4f8803..aa2364d 100644
--- a/src/main/java/org/apache/sysml/api/mlcontext/ScriptExecutor.java
+++ b/src/main/java/org/apache/sysml/api/mlcontext/ScriptExecutor.java
@@ -20,6 +20,7 @@
 package org.apache.sysml.api.mlcontext;
 
 import java.io.IOException;
+import java.util.HashSet;
 import java.util.Map;
 import java.util.Set;
 
@@ -230,10 +231,11 @@ public class ScriptExecutor {
 	protected void createAndInitializeExecutionContext() {
 		executionContext = ExecutionContextFactory.createContext(runtimeProgram);
 		LocalVariableMap symbolTable = script.getSymbolTable();
-		if (symbolTable != null) {
+		if (symbolTable != null)
 			executionContext.setVariables(symbolTable);
-		}
-
+		//attach registered outputs (for dynamic recompile)
+		executionContext.getVariables().setRegisteredOutputs(
+			new HashSet<String>(script.getOutputVariables()));
 	}
 
 	/**

http://git-wip-us.apache.org/repos/asf/systemml/blob/0d467220/src/main/java/org/apache/sysml/hops/recompile/Recompiler.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/hops/recompile/Recompiler.java b/src/main/java/org/apache/sysml/hops/recompile/Recompiler.java
index 463e41b..3c1f6c9 100644
--- a/src/main/java/org/apache/sysml/hops/recompile/Recompiler.java
+++ b/src/main/java/org/apache/sysml/hops/recompile/Recompiler.java
@@ -32,7 +32,7 @@ import org.apache.hadoop.fs.FileSystem;
 import org.apache.hadoop.fs.Path;
 import org.apache.wink.json4j.JSONObject;
 import org.apache.sysml.api.DMLScript;
-import org.apache.sysml.api.jmlc.JMLCProxy;
+import org.apache.sysml.api.jmlc.JMLCUtils;
 import org.apache.sysml.conf.ConfigurationManager;
 import org.apache.sysml.conf.CompilerConfig.ConfigType;
 import org.apache.sysml.hops.DataGenOp;
@@ -105,7 +105,6 @@ import org.apache.sysml.runtime.util.UtilFunctions;
 import org.apache.sysml.utils.Explain;
 import org.apache.sysml.utils.Explain.ExplainType;
 import org.apache.sysml.utils.JSONHelper;
-import org.apache.sysml.utils.MLContextProxy;
 
 /**
  * Dynamic recompilation of hop dags to runtime instructions, which includes the 
@@ -252,19 +251,17 @@ public class Recompiler
 			newInst = ProgramConverter.createDeepCopyInstructionSet(newInst, tid, -1, null, null, null, false, false);
 		
 		// remove writes if called through mlcontext or jmlc 
-		if( MLContextProxy.isActive() )
-			newInst = MLContextProxy.performCleanupAfterRecompilation(newInst);
-		else if( JMLCProxy.isActive() )
-			newInst = JMLCProxy.performCleanupAfterRecompilation(newInst);
+		if( vars.getRegisteredOutputs() != null )
+			newInst = JMLCUtils.cleanupRuntimeInstructions(newInst, vars.getRegisteredOutputs());
 		
 		// explain recompiled hops / instructions
 		if( DMLScript.EXPLAIN == ExplainType.RECOMPILE_HOPS ){
 			LOG.info("EXPLAIN RECOMPILE \nGENERIC (lines "+sb.getBeginLine()+"-"+sb.getEndLine()+"):\n" + 
-		    Explain.explainHops(hops, 1));
+			Explain.explainHops(hops, 1));
 		}
 		if( DMLScript.EXPLAIN == ExplainType.RECOMPILE_RUNTIME ){
 			LOG.info("EXPLAIN RECOMPILE \nGENERIC (lines "+sb.getBeginLine()+"-"+sb.getEndLine()+"):\n" + 
-		    Explain.explain(newInst, 1));
+			Explain.explain(newInst, 1));
 		}
 	
 		return newInst;

http://git-wip-us.apache.org/repos/asf/systemml/blob/0d467220/src/main/java/org/apache/sysml/runtime/controlprogram/LocalVariableMap.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/controlprogram/LocalVariableMap.java b/src/main/java/org/apache/sysml/runtime/controlprogram/LocalVariableMap.java
index 0743d39..7ebe1a0 100644
--- a/src/main/java/org/apache/sysml/runtime/controlprogram/LocalVariableMap.java
+++ b/src/main/java/org/apache/sysml/runtime/controlprogram/LocalVariableMap.java
@@ -20,6 +20,7 @@
 package org.apache.sysml.runtime.controlprogram;
 
 import java.util.HashMap;
+import java.util.HashSet;
 import java.util.Map;
 import java.util.Map.Entry;
 import java.util.Set;
@@ -37,13 +38,17 @@ import org.apache.sysml.runtime.instructions.cp.Data;
  */
 public class LocalVariableMap implements Cloneable
 {
-	private static String eol = System.getProperty ("line.separator");
-	private static String ELEMENT_DELIM = org.apache.sysml.runtime.controlprogram.parfor.ProgramConverter.ELEMENT_DELIM;
-	private static IDSequence _seq = new IDSequence();
+	private static final String eol = System.getProperty ("line.separator");
+	private static final String ELEMENT_DELIM = ProgramConverter.ELEMENT_DELIM;
+	private static final IDSequence _seq = new IDSequence();
 	
-	private HashMap <String, Data> localMap = null;
+	//variable map data and id
+	private final HashMap<String, Data> localMap;
 	private final long localID;
 	
+	//optional set of registered outputs
+	private HashSet<String> outputs = null;
+	
 	public LocalVariableMap() {
 		localMap = new HashMap<>();
 		localID = _seq.getNextID();
@@ -104,6 +109,14 @@ public class LocalVariableMap implements Cloneable
 	public boolean hasReferences( Data d ) {
 		return localMap.containsValue(d);
 	}
+	
+	public void setRegisteredOutputs(HashSet<String> outputs) {
+		this.outputs = outputs;
+	}
+	
+	public HashSet<String> getRegisteredOutputs() {
+		return outputs;
+	}
 
 	public String serialize() throws DMLRuntimeException {
 		StringBuilder sb = new StringBuilder();
@@ -115,7 +128,7 @@ public class LocalVariableMap implements Cloneable
 				.serializeDataObject(e.getKey(), e.getValue()));
 			count++;
 		}
-		return sb.toString();		
+		return sb.toString();
 	}
 
 	public static LocalVariableMap deserialize(String varStr) 
@@ -128,7 +141,7 @@ public class LocalVariableMap implements Cloneable
 			Object[] tmp2 = ProgramConverter.parseDataObject (tmp);
 			vars.put((String) tmp2 [0], (Data) tmp2 [1]);
 		}
-		return vars;		
+		return vars;
 	}
 
 	@Override

http://git-wip-us.apache.org/repos/asf/systemml/blob/0d467220/src/main/java/org/apache/sysml/utils/MLContextProxy.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/utils/MLContextProxy.java b/src/main/java/org/apache/sysml/utils/MLContextProxy.java
index 825d42a..d67abac 100644
--- a/src/main/java/org/apache/sysml/utils/MLContextProxy.java
+++ b/src/main/java/org/apache/sysml/utils/MLContextProxy.java
@@ -19,13 +19,10 @@
 
 package org.apache.sysml.utils;
 
-import java.util.ArrayList;
-
 import org.apache.sysml.api.mlcontext.MLContext;
 import org.apache.sysml.api.mlcontext.MLContextException;
 import org.apache.sysml.parser.Expression;
 import org.apache.sysml.parser.LanguageException;
-import org.apache.sysml.runtime.instructions.Instruction;
 
 /**
  * The purpose of this proxy is to shield systemml internals from direct access to MLContext
@@ -46,12 +43,7 @@ public class MLContextProxy
 	public static boolean isActive() {
 		return _active;
 	}
-
-	public static ArrayList<Instruction> performCleanupAfterRecompilation(ArrayList<Instruction> tmp)
-	{
-		return MLContext.getActiveMLContext().getInternalProxy().performCleanupAfterRecompilation(tmp);
-	}
-
+	
 	public static void setAppropriateVarsForRead(Expression source, String targetname)
 		throws LanguageException
 	{
@@ -69,5 +61,4 @@ public class MLContextProxy
 		throw new MLContextException("No MLContext object is currently active. Have you created one? "
 				+ "Hint: in Scala, 'val ml = new MLContext(sc)'", true);
 	}
-
 }

http://git-wip-us.apache.org/repos/asf/systemml/blob/0d467220/src/test/java/org/apache/sysml/test/integration/functions/jmlc/MulticlassSVMScoreTest.java
----------------------------------------------------------------------
diff --git a/src/test/java/org/apache/sysml/test/integration/functions/jmlc/MulticlassSVMScoreTest.java b/src/test/java/org/apache/sysml/test/integration/functions/jmlc/MulticlassSVMScoreTest.java
index 5304688..9a0602a 100644
--- a/src/test/java/org/apache/sysml/test/integration/functions/jmlc/MulticlassSVMScoreTest.java
+++ b/src/test/java/org/apache/sysml/test/integration/functions/jmlc/MulticlassSVMScoreTest.java
@@ -26,6 +26,7 @@ import java.util.HashMap;
 import org.apache.sysml.api.jmlc.Connection;
 import org.apache.sysml.api.jmlc.PreparedScript;
 import org.apache.sysml.api.jmlc.ResultVariables;
+import org.apache.sysml.conf.CompilerConfig.ConfigType;
 import org.apache.sysml.runtime.controlprogram.parfor.stat.Timing;
 import org.apache.sysml.runtime.matrix.data.InputInfo;
 import org.apache.sysml.runtime.matrix.data.MatrixBlock;
@@ -48,7 +49,7 @@ public class MulticlassSVMScoreTest extends AutomatedTestBase
 	private final static int rows = 107;
 	private final static int cols = 46; //fixed
 	
-	private final static int nRuns = 10;
+	private final static int nRuns = 5;
 	
 	private final static double sparsity1 = 0.7;
 	private final static double sparsity2 = 0.1;
@@ -59,25 +60,28 @@ public class MulticlassSVMScoreTest extends AutomatedTestBase
 	{
 		addTestConfiguration(TEST_NAME, new TestConfiguration(TEST_CLASS_DIR, TEST_NAME, new String[] { "predicted_y" }) ); 
 	}
-
 	
 	@Test
-	public void testJMLCMulticlassScoreDense() 
-		throws IOException
-	{
-		//should apply diag_mm rewrite
-		runJMLCMulticlassTest(false);
+	public void testJMLCMulticlassScoreDense() throws IOException {
+		runJMLCMulticlassTest(false, false);
 	}
 	
 	@Test
-	public void testJMLCMulticlassScoreSparse() 
-		throws IOException
-	{
-		//should apply diag_mm rewrite
-		runJMLCMulticlassTest(true);
+	public void testJMLCMulticlassScoreSparse() throws IOException {
+		runJMLCMulticlassTest(true, false);
+	}
+	
+	@Test
+	public void testJMLCMulticlassScoreDenseFlags() throws IOException {
+		runJMLCMulticlassTest(false, true);
+	}
+	
+	@Test
+	public void testJMLCMulticlassScoreSparseFlags() throws IOException {
+		runJMLCMulticlassTest(true, true);
 	}
 
-	private void runJMLCMulticlassTest( boolean sparse ) 
+	private void runJMLCMulticlassTest( boolean sparse, boolean flags ) 
 		throws IOException
 	{	
 		TestConfiguration config = getTestConfiguration(TEST_NAME);
@@ -87,7 +91,7 @@ public class MulticlassSVMScoreTest extends AutomatedTestBase
 		ArrayList<double[][]> Xset = generateInputs(nRuns, rows, cols, sparse?sparsity2:sparsity1); 
 		
 		//run DML via JMLC
-		ArrayList<double[][]> Yset = execDMLScriptviaJMLC( Xset );
+		ArrayList<double[][]> Yset = execDMLScriptviaJMLC( Xset, flags );
 		
 		//run R and compare results to DML result
 		String HOME = SCRIPT_DIR + TEST_DIR;
@@ -117,7 +121,7 @@ public class MulticlassSVMScoreTest extends AutomatedTestBase
 		}
 	}
 
-	private static ArrayList<double[][]> execDMLScriptviaJMLC( ArrayList<double[][]> X) 
+	private static ArrayList<double[][]> execDMLScriptviaJMLC(ArrayList<double[][]> X, boolean flags) 
 		throws IOException
 	{
 		Timing time = new Timing(true);
@@ -125,8 +129,11 @@ public class MulticlassSVMScoreTest extends AutomatedTestBase
 		ArrayList<double[][]> ret = new ArrayList<double[][]>();
 		
 		//establish connection to SystemML
-		Connection conn = new Connection();
-				
+		Connection conn = !flags ? new Connection():
+			new Connection(ConfigType.PARALLEL_CP_MATRIX_OPERATIONS,
+				ConfigType.PARALLEL_LOCAL_OR_REMOTE_PARFOR,
+				ConfigType.ALLOW_DYN_RECOMPILATION);
+		
 		try
 		{
 			// For now, JMLC pipeline only allows dml


[03/50] [abbrv] systemml git commit: [MINOR] Fix consistency ALS datagen script (factor names, cleanup)

Posted by re...@apache.org.
[MINOR] Fix consistency ALS datagen script (factor names, cleanup)

This patch cleans up the ALS data generation script to use the same
factor names as ALS-CG and remove unnecessary operations.


Project: http://git-wip-us.apache.org/repos/asf/systemml/repo
Commit: http://git-wip-us.apache.org/repos/asf/systemml/commit/06b4b9d5
Tree: http://git-wip-us.apache.org/repos/asf/systemml/tree/06b4b9d5
Diff: http://git-wip-us.apache.org/repos/asf/systemml/diff/06b4b9d5

Branch: refs/heads/master
Commit: 06b4b9d5ff04f09c61d44864d36a65b31527bbb3
Parents: 3355914
Author: Matthias Boehm <mb...@gmail.com>
Authored: Sun Oct 15 17:04:31 2017 -0700
Committer: Matthias Boehm <mb...@gmail.com>
Committed: Sun Oct 15 17:04:31 2017 -0700

----------------------------------------------------------------------
 scripts/datagen/genRandData4ALS.dml | 19 +++++++++++--------
 1 file changed, 11 insertions(+), 8 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/systemml/blob/06b4b9d5/scripts/datagen/genRandData4ALS.dml
----------------------------------------------------------------------
diff --git a/scripts/datagen/genRandData4ALS.dml b/scripts/datagen/genRandData4ALS.dml
index a5838d7..eb9a4a9 100644
--- a/scripts/datagen/genRandData4ALS.dml
+++ b/scripts/datagen/genRandData4ALS.dml
@@ -20,8 +20,8 @@
 #-------------------------------------------------------------
 
 Xfile = $X; # input matrix X of size m x n
-Wfile = $W; # original row factor of size m x r
-Hfile = $H; # original col factor of size r x n
+Ufile = $U; # original row factor of size m x r
+Vfile = $V; # original col factor of size r x n
 m = $rows; # no. of rows of X
 n = $cols; # no. of cols of X
 r = $rank; # rank of factorization
@@ -30,15 +30,18 @@ sigma = ifdef ($sigma, 0.01); # variance of Gaussian noise
 fmt = ifdef ($fmt, "binary"); # output format
 
 # generate original factors by sampling from a normal(0,1.0) distribution
-W = rand(rows = m, cols = r, pdf = "normal", seed = 123);
-H = rand(rows = r, cols = n, pdf = "normal", seed = 456);
+U = rand(rows = m, cols = r, pdf = "normal", seed = 123);
+V = rand(rows = n, cols = r, pdf = "normal", seed = 456);
 
 I = floor(rand(rows = nnz, cols = 1, min = 1, max = m + 0.999999999));
 J = floor(rand(rows = nnz, cols = 1, min = 1, max = n + 0.999999999));
 X = rand(rows = nnz, cols = 1, pdf = "normal") * sqrt(sigma);
 N = table(I, J, X);
-T = (N != 0);
-X = T * (W %*% H) + T * N;
+X = (N != 0) * (U %*% t(V)) + N;
 write(X, Xfile, format = fmt);
-write(W, Wfile, format = fmt);
-write(H, Hfile, format = fmt);
+if( Ufile != " " )
+  write(U, Ufile, format = fmt);
+if( Vfile != " " ) {
+  V = t(V);
+  write(V, Vfile, format = fmt);
+}


[17/50] [abbrv] systemml git commit: [SYSTEMML-1970] Performance conv2d-backward-data (for sparse filter)

Posted by re...@apache.org.
[SYSTEMML-1970] Performance conv2d-backward-data (for sparse filter)

This patch follows up on the recent modification of conv2d backward
filter by similarly applying a sparse rotate for conv2d backward data.
Furthermore, it removes unnecessary allocations per input row and adds
thread-local nnz maintenance.
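
For context, the thread-local nnz maintenance follows a simple
scatter/gather pattern: each worker returns the number of non-zeros of
its own row range (see recomputeNonZeros(_rl, _ru-1) in the diff below)
and the caller aggregates the partial counts before setting them on the
output block, which avoids a second full pass over the output. A
minimal sketch of that pattern in plain Java (hypothetical helper
class, not the actual SystemML worker classes):

  import java.util.List;
  import java.util.concurrent.Callable;
  import java.util.concurrent.ExecutionException;
  import java.util.concurrent.ExecutorService;
  import java.util.concurrent.Future;

  public class PartialNnzAggregation {
    // each task counts non-zeros over its disjoint row range [rl, ru)
    public static long aggregateNnz(ExecutorService pool, List<Callable<Long>> tasks)
      throws InterruptedException, ExecutionException {
      long nnz = 0;
      for( Future<Long> f : pool.invokeAll(tasks) )
        nnz += f.get(); //sum partial nnz counts
      return nnz; //caller: outputBlock.setNonZeros(nnz)
    }
  }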


Project: http://git-wip-us.apache.org/repos/asf/systemml/repo
Commit: http://git-wip-us.apache.org/repos/asf/systemml/commit/78a3808e
Tree: http://git-wip-us.apache.org/repos/asf/systemml/tree/78a3808e
Diff: http://git-wip-us.apache.org/repos/asf/systemml/diff/78a3808e

Branch: refs/heads/master
Commit: 78a3808e0aaefb0c6f6959611ef119695d4d1d3e
Parents: b261661
Author: Matthias Boehm <mb...@gmail.com>
Authored: Sun Oct 22 17:57:29 2017 -0700
Committer: Matthias Boehm <mb...@gmail.com>
Committed: Sun Oct 22 17:57:29 2017 -0700

----------------------------------------------------------------------
 .../sysml/runtime/matrix/data/LibMatrixDNN.java  |  4 ++--
 .../LibMatrixDNNConv2dBackwardDataHelper.java    | 19 ++++++++++---------
 .../LibMatrixDNNConv2dBackwardFilterHelper.java  | 17 ++++++++---------
 .../matrix/data/LibMatrixDNNConv2dHelper.java    |  2 +-
 .../runtime/matrix/data/LibMatrixDNNHelper.java  |  2 +-
 5 files changed, 22 insertions(+), 22 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/systemml/blob/78a3808e/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNN.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNN.java b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNN.java
index b967780..ac66e51 100644
--- a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNN.java
+++ b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNN.java
@@ -186,10 +186,10 @@ public class LibMatrixDNN {
 		if(isEligibleForConv2dBackwardDataDense(params))
 			Statistics.numNativeSparseConv2dBwdDataCalls.increment();
 		
-		execute(LibMatrixDNNHelper.getConv2dBackwardDataWorkers(params), params);
+		long nnz = execute(LibMatrixDNNHelper.getConv2dBackwardDataWorkers(params), params);
 		
 		//post-processing: maintain nnz
-		outputBlock.recomputeNonZeros(); 
+		outputBlock.setNonZeros(nnz);
 		outputBlock.examSparsity();
 	}
 	

http://git-wip-us.apache.org/repos/asf/systemml/blob/78a3808e/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNNConv2dBackwardDataHelper.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNNConv2dBackwardDataHelper.java b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNNConv2dBackwardDataHelper.java
index 04c13e6..cd50000 100644
--- a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNNConv2dBackwardDataHelper.java
+++ b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNNConv2dBackwardDataHelper.java
@@ -78,22 +78,22 @@ public class LibMatrixDNNConv2dBackwardDataHelper {
 			int PQ = _params.P*_params.Q; int K = _params.K; int CRS = _params.C*_params.R*_params.S;
 			MatrixBlock filter = _params.input1;
 			MatrixBlock dout = _params.input2;
-			MatrixBlock dout_reshaped = new MatrixBlock(PQ, K, false);
-			dout_reshaped.allocateDenseBlock();
+			MatrixBlock outRotate = new MatrixBlock(PQ, K, dout.sparse);
+			MatrixBlock outMM = new MatrixBlock(PQ, CRS, false);
+			outRotate.allocateBlock();
 			LibMatrixDNNRotate180Helper.Rotate180Worker rotate180Worker = 
-					LibMatrixDNNRotate180Helper.Rotate180Worker.getWorker( dout, dout_reshaped, _params, true, false);
+				LibMatrixDNNRotate180Helper.Rotate180Worker.getWorker( dout, outRotate, _params, true, false);
 			long time1 = 0; long time2 = 0;
 			for(int n = _rl; n < _ru; n++)  {
 				// rotate180(dout[n,]) => dout_reshaped
 				rotate180Worker.execute(n, 0);
-				
 				// dout_reshaped %*% filter => temp
-				MatrixBlock temp = new MatrixBlock(PQ, CRS, false);
 				long t1 = DMLScript.STATISTICS && LibMatrixDNN.DISPLAY_STATISTICS ? System.nanoTime() : 0;
-				LibMatrixDNNHelper.singleThreadedMatMult(dout_reshaped, filter, temp, true, false, _params);
+				outMM.reset(PQ, CRS, false);
+				LibMatrixDNNHelper.singleThreadedMatMult(outRotate, filter, outMM, !outRotate.sparse, false, _params);
 				long t2 = DMLScript.STATISTICS && LibMatrixDNN.DISPLAY_STATISTICS ? System.nanoTime() : 0;
 				// col2im(temp) => output[n,] 
-				LibMatrixDNNHelper.doCol2imOverSingleImage(n, temp, _params);
+				LibMatrixDNNHelper.doCol2imOverSingleImage(n, outMM, _params);
 				long t3 = DMLScript.STATISTICS && LibMatrixDNN.DISPLAY_STATISTICS ? System.nanoTime() : 0;
 				
 				if(DMLScript.STATISTICS && LibMatrixDNN.DISPLAY_STATISTICS) {
@@ -105,8 +105,9 @@ public class LibMatrixDNNConv2dBackwardDataHelper {
 				LibMatrixDNN.loopedConvBwdDataMatMultTime.addAndGet(time1);
 				LibMatrixDNN.loopedConvBwdDataCol2ImTime.addAndGet(time2);
 			}
-			return 0L;
+			
+			//multi-threaded nnz maintenance of current working set
+			return _params.output.recomputeNonZeros(_rl, _ru-1);
 		}
-		
 	}
 }

http://git-wip-us.apache.org/repos/asf/systemml/blob/78a3808e/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNNConv2dBackwardFilterHelper.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNNConv2dBackwardFilterHelper.java b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNNConv2dBackwardFilterHelper.java
index de45b81..f0fd002 100644
--- a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNNConv2dBackwardFilterHelper.java
+++ b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNNConv2dBackwardFilterHelper.java
@@ -86,13 +86,12 @@ public class LibMatrixDNNConv2dBackwardFilterHelper {
 			int PQ = _params.P*_params.Q, K = _params.K, CRS = _params.C*_params.R*_params.S;
 			MatrixBlock dout = _params.input2;
 			MatrixBlock im2ColOutBlock = new MatrixBlock(CRS, PQ, false);
-			MatrixBlock dout_reshaped = new MatrixBlock(PQ, K, dout.sparse);
-			MatrixBlock temp = new MatrixBlock(CRS, K, false);
-			dout_reshaped.allocateBlock();
-			temp.allocateBlock();
+			MatrixBlock outRotate = new MatrixBlock(PQ, K, dout.sparse);
+			MatrixBlock outMM = new MatrixBlock(CRS, K, false);
+			outRotate.allocateBlock();
 			
 			Im2colWorker im2ColWorker = Im2colWorker.getWorker( _params.input1, im2ColOutBlock, _params, true, false);
-			Rotate180Worker rotate180Worker = Rotate180Worker.getWorker( dout, dout_reshaped, _params, true, false);
+			Rotate180Worker rotate180Worker = Rotate180Worker.getWorker( dout, outRotate, _params, true, false);
 			double [] partRet = new double[CRS*_params.K];
 			long time1 = 0; long time2 = 0;
 			for(int n = _rl; n < _ru; n++) {
@@ -104,12 +103,12 @@ public class LibMatrixDNNConv2dBackwardFilterHelper {
 				im2ColWorker.execute(n);
 				long t2 = DMLScript.STATISTICS && LibMatrixDNN.DISPLAY_STATISTICS ? System.nanoTime() : 0;
 				
-				temp.reset(CRS, K, false);
-				LibMatrixDNNHelper.singleThreadedMatMult(im2ColOutBlock, dout_reshaped, temp, true, true, _params);
+				outMM.reset(CRS, K, false);
+				LibMatrixDNNHelper.singleThreadedMatMult(im2ColOutBlock, outRotate, outMM, !im2ColOutBlock.sparse, !outRotate.sparse, _params);
 				long t3 = DMLScript.STATISTICS && LibMatrixDNN.DISPLAY_STATISTICS ? System.nanoTime() : 0;
 				
-				if( !temp.isEmptyBlock() ) //accumulate row results
-					LibMatrixMult.vectAdd(temp.getDenseBlock(), partRet, 0, 0, K*CRS);
+				if( !outMM.isEmptyBlock() ) //accumulate row results
+					LibMatrixMult.vectAdd(outMM.getDenseBlock(), partRet, 0, 0, K*CRS);
 				
 				if(DMLScript.STATISTICS && LibMatrixDNN.DISPLAY_STATISTICS) {
 					time1 += t2 - t1;

http://git-wip-us.apache.org/repos/asf/systemml/blob/78a3808e/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNNConv2dHelper.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNNConv2dHelper.java b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNNConv2dHelper.java
index dd44de2..6a0205e 100644
--- a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNNConv2dHelper.java
+++ b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNNConv2dHelper.java
@@ -219,7 +219,7 @@ public class LibMatrixDNNConv2dHelper {
 				
 				// t(_im2ColOutBlock) %*% t(filter) => t(matMultOutBlock)
 				outMM.reset(outMM.rlen, outMM.clen, false);
-				LibMatrixDNNHelper.singleThreadedMatMult(outIm2col, _params.input2, outMM, false, true, _params);
+				LibMatrixDNNHelper.singleThreadedMatMult(outIm2col, _params.input2, outMM, false, false, _params);
 				
 				// Copy the matrix matMultOutBlock of shape [K X PQ] to params.output.denseBlock + destPos
 				partialCopyTrans(outMM, _params.output, n*K*PQ, K, PQ);

http://git-wip-us.apache.org/repos/asf/systemml/blob/78a3808e/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNNHelper.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNNHelper.java b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNNHelper.java
index 6117b90..92eb79b 100644
--- a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNNHelper.java
+++ b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNNHelper.java
@@ -201,7 +201,7 @@ public class LibMatrixDNNHelper {
 		int taskSize = (int)(Math.ceil((double)params.N / k));
 		
 		boolean isEmptyDenseInput = (!params.input1.isInSparseFormat() && params.input1.denseBlock == null) || 
-																(!params.input2.isInSparseFormat() && params.input2.denseBlock == null);
+			(!params.input2.isInSparseFormat() && params.input2.denseBlock == null);
 		
 		for(int i = 0; i*taskSize < params.N; i++) {
 			if(LibMatrixDNN.isEligibleForConv2dBackwardDataDense(params)) 


[05/50] [abbrv] systemml git commit: [SYSTEMML-1836] Fix unnecessary GC overhead in codegen row operations

Posted by re...@apache.org.
[SYSTEMML-1836] Fix unnecessary GC overhead in codegen row operations

This patch fixes issues of huge garbage collection overhead in special
cases of codegen row operations with relatively large row intermediates
and matrix side inputs, especially for multi-threaded operations. The
major problem was that, for aggregations, the temporary output per
thread was allocated as the product of the two temporary vector sizes
(as necessary for matrix multiplication outputs), which, however, is
often completely unnecessary.

In a scenario of row operations for sum( (X != 0) * (UV - X) ^ 2) over
small inputs of 10K x 3K, this patch improved performance for 10
iterations from 80s (74s GC) to 1.4s (0.074s GC).
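
In essence, the per-thread partial output is now sized by the actual
output of the fused operator (outLen = out.getNumRows() *
out.getNumColumns() in the diff below) instead of by the product of the
two temporary vector sizes; for a full aggregate such as the sum(...)
expression above, this is effectively a single cell per task instead of
a clen * clen2 buffer that the garbage collector has to reclaim after
every task.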


Project: http://git-wip-us.apache.org/repos/asf/systemml/repo
Commit: http://git-wip-us.apache.org/repos/asf/systemml/commit/586f8229
Tree: http://git-wip-us.apache.org/repos/asf/systemml/tree/586f8229
Diff: http://git-wip-us.apache.org/repos/asf/systemml/diff/586f8229

Branch: refs/heads/master
Commit: 586f82299292019583b68dd98ccb8fbb71a2fcd3
Parents: b6b6772
Author: Matthias Boehm <mb...@gmail.com>
Authored: Sun Oct 15 21:50:17 2017 -0700
Committer: Matthias Boehm <mb...@gmail.com>
Committed: Sun Oct 15 21:50:17 2017 -0700

----------------------------------------------------------------------
 .../sysml/runtime/codegen/LibSpoofPrimitives.java     | 11 +++++++++--
 .../apache/sysml/runtime/codegen/SpoofRowwise.java    | 14 +++++++-------
 2 files changed, 16 insertions(+), 9 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/systemml/blob/586f8229/src/main/java/org/apache/sysml/runtime/codegen/LibSpoofPrimitives.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/codegen/LibSpoofPrimitives.java b/src/main/java/org/apache/sysml/runtime/codegen/LibSpoofPrimitives.java
index 8444b5f..7624d96 100644
--- a/src/main/java/org/apache/sysml/runtime/codegen/LibSpoofPrimitives.java
+++ b/src/main/java/org/apache/sysml/runtime/codegen/LibSpoofPrimitives.java
@@ -1829,9 +1829,9 @@ public class LibSpoofPrimitives
 		private int _len2;
 		
 		public VectorBuffer(int num, int len1, int len2) {
-			int lnum = (len2 > 0) ? 2*num : num;
+			int lnum = (len2>0 && len1!=len2) ? 2*num : num;
 			_data = new double[lnum][];
-			for( int i=0; i<num; i++ )
+			for( int i=0; i<num; i++ ) {
 				if( lnum > num ) {
 					_data[2*i] = new double[len1];
 					_data[2*i+1] = new double[len2];
@@ -1839,6 +1839,7 @@ public class LibSpoofPrimitives
 				else {
 					_data[i] = new double[len1];
 				}
+			}
 			_pos = -1;
 			_len1 = len1;
 			_len2 = len2;
@@ -1851,5 +1852,11 @@ public class LibSpoofPrimitives
 			} while( _data[_pos].length!=len );
 			return _data[_pos];
 		}
+		@SuppressWarnings("unused")
+		public boolean isReusable(int num, int len1, int len2) {
+			int lnum = (len2>0 && len1!=len2) ? 2*num : num;
+			return (_len1 == len1 && _len2 == len2
+				&& _data.length == lnum);
+		}
 	}
 }

http://git-wip-us.apache.org/repos/asf/systemml/blob/586f8229/src/main/java/org/apache/sysml/runtime/codegen/SpoofRowwise.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/codegen/SpoofRowwise.java b/src/main/java/org/apache/sysml/runtime/codegen/SpoofRowwise.java
index d8a747b..9d5675b 100644
--- a/src/main/java/org/apache/sysml/runtime/codegen/SpoofRowwise.java
+++ b/src/main/java/org/apache/sysml/runtime/codegen/SpoofRowwise.java
@@ -209,8 +209,9 @@ public abstract class SpoofRowwise extends SpoofOperator
 			if( _type.isColumnAgg() || _type == RowType.FULL_AGG ) {
 				//execute tasks
 				ArrayList<ParColAggTask> tasks = new ArrayList<>();
+				int outLen = out.getNumRows() * out.getNumColumns();
 				for( int i=0; i<nk & i*blklen<m; i++ )
-					tasks.add(new ParColAggTask(a, b, scalars, n, n2, i*blklen, Math.min((i+1)*blklen, m)));
+					tasks.add(new ParColAggTask(a, b, scalars, n, n2, outLen, i*blklen, Math.min((i+1)*blklen, m)));
 				List<Future<double[]>> taskret = pool.invokeAll(tasks);	
 				//aggregate partial results
 				int len = _type.isColumnAgg() ? out.getNumRows()*out.getNumColumns() : 1;
@@ -343,17 +344,16 @@ public abstract class SpoofRowwise extends SpoofOperator
 		private final MatrixBlock _a;
 		private final SideInput[] _b;
 		private final double[] _scalars;
-		private final int _clen;
-		private final int _clen2;
-		private final int _rl;
-		private final int _ru;
+		private final int _clen, _clen2, _outLen;
+		private final int _rl, _ru;
 
-		protected ParColAggTask( MatrixBlock a, SideInput[] b, double[] scalars, int clen, int clen2, int rl, int ru ) {
+		protected ParColAggTask( MatrixBlock a, SideInput[] b, double[] scalars, int clen, int clen2, int outLen, int rl, int ru ) {
 			_a = a;
 			_b = b;
 			_scalars = scalars;
 			_clen = clen;
 			_clen2 = clen2;
+			_outLen = outLen;
 			_rl = rl;
 			_ru = ru;
 		}
@@ -364,7 +364,7 @@ public abstract class SpoofRowwise extends SpoofOperator
 			//allocate vector intermediates and partial output
 			if( _reqVectMem > 0 )
 				LibSpoofPrimitives.setupThreadLocalMemory(_reqVectMem, _clen, _clen2);
-			double[] c = new double[(_clen2>0)?_clen*_clen2 : _clen];
+			double[] c = new double[_outLen];
 			
 			if( _a instanceof CompressedMatrixBlock )
 				executeCompressed((CompressedMatrixBlock)_a, _b, _scalars, c, _clen, _rl, _ru);


[33/50] [abbrv] systemml git commit: [SYSTEMML-446] Minimize the chances of eviction for right indexing operation

Posted by re...@apache.org.
[SYSTEMML-446] Minimize the chances of eviction for right indexing
operation

Indexing is only supported on GPU if:
1. the input is of type matrix AND
2. the input is less than 2GB. 

The second condition is added for the following reasons:
1. Indexing is a purely memory-bound operation and does not benefit
drastically from pushing down to GPU.
2. By forcing larger matrices to the GPU (for example, the training
dataset), we run the risk of unnecessary evictions of the parameters
and gradients. For single precision, there is the additional overhead
of converting the training dataset to single precision every time it is
evicted.
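
For reference, the 2e+9 byte threshold (getInputMemEstimate() < 2e+9 in
the diff below) admits dense double-precision inputs of up to 250
million cells, i.e., just under a 16,000 x 16,000 matrix, so moderately
sized parameters and gradients still qualify while large training
datasets stay out of GPU right indexing.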

Project: http://git-wip-us.apache.org/repos/asf/systemml/repo
Commit: http://git-wip-us.apache.org/repos/asf/systemml/commit/591a0f77
Tree: http://git-wip-us.apache.org/repos/asf/systemml/tree/591a0f77
Diff: http://git-wip-us.apache.org/repos/asf/systemml/diff/591a0f77

Branch: refs/heads/master
Commit: 591a0f7754d85e0b9a170cb4385bc84d52e641e8
Parents: f040674
Author: Niketan Pansare <np...@us.ibm.com>
Authored: Wed Oct 25 21:10:21 2017 -0700
Committer: Niketan Pansare <np...@us.ibm.com>
Committed: Wed Oct 25 21:10:21 2017 -0700

----------------------------------------------------------------------
 src/main/java/org/apache/sysml/hops/IndexingOp.java | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/systemml/blob/591a0f77/src/main/java/org/apache/sysml/hops/IndexingOp.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/hops/IndexingOp.java b/src/main/java/org/apache/sysml/hops/IndexingOp.java
index 5989c66..0b8509a 100644
--- a/src/main/java/org/apache/sysml/hops/IndexingOp.java
+++ b/src/main/java/org/apache/sysml/hops/IndexingOp.java
@@ -102,8 +102,15 @@ public class IndexingOp extends Hop
 			return false;
 		}
 		else {
-			// only matrix indexing is supported on GPU
-			return (getDataType() == DataType.MATRIX);
+			// Indexing is only supported on GPU if:
+			// 1. the input is of type matrix AND
+			// 2. the input is less than 2GB. 
+			// The second condition is added for following reason:
+			// 1. Indexing is a purely memory-bound operation and doesnot benefit drastically from pushing down to GPU.
+			// 2. By forcing larger matrices to GPU (for example: training dataset), we run into risk of unnecessary evictions of 
+			// parameters and the gradients. For single precision, there is additional overhead of converting training dataset 
+			// to single precision every single time it is evicted.
+			return (getDataType() == DataType.MATRIX) && getInputMemEstimate() < 2e+9;
 		}
 	}
 


[45/50] [abbrv] systemml git commit: [SYSTEMML-1979] Improved codegen cost model (sparsity, minor fixes)

Posted by re...@apache.org.
[SYSTEMML-1979] Improved codegen cost model (sparsity, minor fixes)

This patch improves the codegen cost model to correctly account for the
compute workload of sparse matrix multiplications as well as sparse and
dense input sizes. Furthermore, this patch also includes minor fixes of
eviction- and broadcast-aware cost corrections. Overall, these changes
address special cases of sparse large-scale (i.e., distributed)
scenarios, where the codegen optimizer picked suboptimal plans.
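
As a worked example of the sparsity correction (see the compute-cost
change at the end of the diff below): for a matrix multiplication whose
first input has 10,000 columns at a known sparsity of 0.01, the compute
cost estimate of 2 * ncol drops from 2 * 10,000 = 20,000 to
2 * 10,000 * 0.01 = 200, which better reflects the actual sparse
workload.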


Project: http://git-wip-us.apache.org/repos/asf/systemml/repo
Commit: http://git-wip-us.apache.org/repos/asf/systemml/commit/cb1d7928
Tree: http://git-wip-us.apache.org/repos/asf/systemml/tree/cb1d7928
Diff: http://git-wip-us.apache.org/repos/asf/systemml/diff/cb1d7928

Branch: refs/heads/master
Commit: cb1d792826411b144909b2168929c4f33620b02a
Parents: 381d1d6
Author: Matthias Boehm <mb...@gmail.com>
Authored: Tue Oct 31 22:25:38 2017 -0700
Committer: Matthias Boehm <mb...@gmail.com>
Committed: Wed Nov 1 02:25:34 2017 -0700

----------------------------------------------------------------------
 .../opt/PlanSelectionFuseCostBasedV2.java       | 29 ++++++++++++++------
 1 file changed, 21 insertions(+), 8 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/systemml/blob/cb1d7928/src/main/java/org/apache/sysml/hops/codegen/opt/PlanSelectionFuseCostBasedV2.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/hops/codegen/opt/PlanSelectionFuseCostBasedV2.java b/src/main/java/org/apache/sysml/hops/codegen/opt/PlanSelectionFuseCostBasedV2.java
index 1f670b3..9302573 100644
--- a/src/main/java/org/apache/sysml/hops/codegen/opt/PlanSelectionFuseCostBasedV2.java
+++ b/src/main/java/org/apache/sysml/hops/codegen/opt/PlanSelectionFuseCostBasedV2.java
@@ -89,7 +89,7 @@ public class PlanSelectionFuseCostBasedV2 extends PlanSelection
 	private static final double WRITE_BANDWIDTH_IO  =      512*1024*1024;  //512MB/s
 	private static final double WRITE_BANDWIDTH_MEM =  2d*1024*1024*1024;  //2GB/s
 	private static final double READ_BANDWIDTH_MEM  = 32d*1024*1024*1024;  //32GB/s
-	private static final double READ_BANDWIDTH_BROADCAST = WRITE_BANDWIDTH_MEM/4;
+	private static final double READ_BANDWIDTH_BROADCAST = WRITE_BANDWIDTH_IO/4;
 	private static final double COMPUTE_BANDWIDTH  =   2d*1024*1024*1024   //1GFLOPs/core
 								* InfrastructureAnalyzer.getLocalParallelism();
 	
@@ -329,7 +329,7 @@ public class PlanSelectionFuseCostBasedV2 extends PlanSelection
 		//get partition input reads (at least read once)
 		for( Long hopID : part.getInputs() ) {
 			Hop hop = memo.getHopRefs().get(hopID);
-			costs += getSize(hop) * 8 / READ_BANDWIDTH_MEM;
+			costs += getSafeMemEst(hop) / READ_BANDWIDTH_MEM;
 		}
 		return costs;
 	}
@@ -355,6 +355,16 @@ public class PlanSelectionFuseCostBasedV2 extends PlanSelection
 			.mapToDouble(e -> e.getValue()).sum();
 	}
 	
+	private static double sumInputMemoryEstimates(CPlanMemoTable memo, CostVector vect) {
+		return vect.inSizes.keySet().stream()
+			.mapToDouble(e -> getSafeMemEst(memo.getHopRefs().get(e))).sum();
+	}
+	
+	private static double getSafeMemEst(Hop hop) {
+		return !hop.dimsKnown() ? getSize(hop) * 8
+			: hop.getMemEstimate();
+	}
+	
 	private static long getSize(Hop hop) {
 		return Math.max(hop.getDim1(),1) 
 			* Math.max(hop.getDim2(),1);
@@ -603,7 +613,6 @@ public class PlanSelectionFuseCostBasedV2 extends PlanSelection
 	
 	private static boolean isRowAggOp(Hop hop){
 		return (hop instanceof AggUnaryOp || hop instanceof AggBinaryOp
-			|| (hop instanceof IndexingOp && HopRewriteUtils.isColumnRangeIndexing((IndexingOp)hop))
 			|| HopRewriteUtils.isBinary(hop, OpOp2.CBIND));
 	}
 	
@@ -840,19 +849,20 @@ public class PlanSelectionFuseCostBasedV2 extends PlanSelection
 		
 		//add costs for opened fused operator
 		if( opened ) {
+			double memInputs = sumInputMemoryEstimates(memo, costVect);
 			double tmpCosts = costVect.outSize * 8 / WRITE_BANDWIDTH_MEM
-				+ Math.max(costVect.getInputSize() * 8 / READ_BANDWIDTH_MEM,
+				+ Math.max(memInputs / READ_BANDWIDTH_MEM,
 				costVect.computeCosts/ COMPUTE_BANDWIDTH);
 			//read correction for distributed computation
-			Hop driver = memo.getHopRefs().get(costVect.getMaxInputSizeHopID());
-			if( driver.getMemEstimate() > OptimizerUtils.getLocalMemBudget() )
+			if( memInputs > OptimizerUtils.getLocalMemBudget() )
 				tmpCosts += costVect.getSideInputSize() * 8 / READ_BANDWIDTH_BROADCAST;
 			//sparsity correction for outer-product template (and sparse-safe cell)
+			Hop driver = memo.getHopRefs().get(costVect.getMaxInputSizeHopID());
 			if( best != null && best.type == TemplateType.OUTER )
 				tmpCosts *= driver.dimsKnown(true) ? driver.getSparsity() : SPARSE_SAFE_SPARSITY_EST;
 			//write correction for known evictions in CP
-			else if( driver.getMemEstimate() < OptimizerUtils.getLocalMemBudget()
-				&& sumTmpInputOutputSize(memo, costVect) > LazyWriteBuffer.getWriteBufferSize() )
+			else if( memInputs <= OptimizerUtils.getLocalMemBudget()
+				&& sumTmpInputOutputSize(memo, costVect)*8 > LazyWriteBuffer.getWriteBufferSize() )
 				tmpCosts += costVect.outSize * 8 / WRITE_BANDWIDTH_IO;
 			costs += tmpCosts;
 			if( LOG.isTraceEnabled() ) {
@@ -997,6 +1007,8 @@ public class PlanSelectionFuseCostBasedV2 extends PlanSelection
 			//outer product template w/ matrix-matrix 
 			//or row template w/ matrix-vector or matrix-matrix
 			costs = 2 * current.getInput().get(0).getDim2();
+			if( current.getInput().get(0).dimsKnown(true) )
+				costs *= current.getInput().get(0).getSparsity();
 		}
 		else if( current instanceof AggUnaryOp) {
 			switch(((AggUnaryOp)current).getOp()) {
@@ -1048,6 +1060,7 @@ public class PlanSelectionFuseCostBasedV2 extends PlanSelection
 			//ensures that input sizes are not double counted
 			inSizes.put(hopID, inputSize);
 		}
+		@SuppressWarnings("unused")
 		public double getInputSize() {
 			return inSizes.values().stream()
 				.mapToDouble(d -> d.doubleValue()).sum();


[09/50] [abbrv] systemml git commit: [MINOR] [SYSTEMML-540] Reset the _cachedParams to avoid incorrect sizes

Posted by re...@apache.org.
[MINOR] [SYSTEMML-540] Reset the _cachedParams to avoid incorrect sizes

Project: http://git-wip-us.apache.org/repos/asf/systemml/repo
Commit: http://git-wip-us.apache.org/repos/asf/systemml/commit/259814e6
Tree: http://git-wip-us.apache.org/repos/asf/systemml/tree/259814e6
Diff: http://git-wip-us.apache.org/repos/asf/systemml/diff/259814e6

Branch: refs/heads/master
Commit: 259814e6c00021c643c33867906d0c5d8dc4bc5e
Parents: 3a7f38e
Author: Niketan Pansare <np...@us.ibm.com>
Authored: Tue Oct 17 16:52:19 2017 -0700
Committer: Niketan Pansare <np...@us.ibm.com>
Committed: Tue Oct 17 16:52:19 2017 -0700

----------------------------------------------------------------------
 src/main/java/org/apache/sysml/hops/ConvolutionOp.java | 3 +++
 1 file changed, 3 insertions(+)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/systemml/blob/259814e6/src/main/java/org/apache/sysml/hops/ConvolutionOp.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/hops/ConvolutionOp.java b/src/main/java/org/apache/sysml/hops/ConvolutionOp.java
index e4ed32b..50a7ca3 100644
--- a/src/main/java/org/apache/sysml/hops/ConvolutionOp.java
+++ b/src/main/java/org/apache/sysml/hops/ConvolutionOp.java
@@ -639,6 +639,9 @@ public class ConvolutionOp extends Hop  implements MultiThreadedHop
 			return;
 		}
 		
+		// Reset the _cachedParams to avoid incorrect sizes
+		_cachedParams = new ConvolutionParameters(-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, _maxNumThreads);
+		
 		switch(op) 
 		{
 			case MAX_POOLING:


[38/50] [abbrv] systemml git commit: [SYSTEMML-1977] Fix codegen spark row ops w/ multiple rdd inputs

Posted by re...@apache.org.
[SYSTEMML-1977] Fix codegen spark row ops w/ multiple rdd inputs

This patch fixes special cases of distributed codegen spark row
operations with multiple rdd inputs (i.e., in case of large side inputs
that cannot be broadcast). We now handle the metadata management at the
driver, which removes the implicit assumption that relevant inputs for
B1 row types are available as broadcasts.


Project: http://git-wip-us.apache.org/repos/asf/systemml/repo
Commit: http://git-wip-us.apache.org/repos/asf/systemml/commit/118e3c0f
Tree: http://git-wip-us.apache.org/repos/asf/systemml/tree/118e3c0f
Diff: http://git-wip-us.apache.org/repos/asf/systemml/diff/118e3c0f

Branch: refs/heads/master
Commit: 118e3c0f630d3a3b30755ecb712672d79f8b8d7c
Parents: ede870d
Author: Matthias Boehm <mb...@gmail.com>
Authored: Fri Oct 27 23:13:54 2017 -0700
Committer: Matthias Boehm <mb...@gmail.com>
Committed: Fri Oct 27 23:13:54 2017 -0700

----------------------------------------------------------------------
 .../instructions/spark/SpoofSPInstruction.java    | 18 ++++++++++--------
 1 file changed, 10 insertions(+), 8 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/systemml/blob/118e3c0f/src/main/java/org/apache/sysml/runtime/instructions/spark/SpoofSPInstruction.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/instructions/spark/SpoofSPInstruction.java b/src/main/java/org/apache/sysml/runtime/instructions/spark/SpoofSPInstruction.java
index b34afad..eb74fed 100644
--- a/src/main/java/org/apache/sysml/runtime/instructions/spark/SpoofSPInstruction.java
+++ b/src/main/java/org/apache/sysml/runtime/instructions/spark/SpoofSPInstruction.java
@@ -166,7 +166,7 @@ public class SpoofSPInstruction extends SPInstruction {
 		}
 		else if(_class.getSuperclass() == SpoofMultiAggregate.class) //MAGG
 		{
-			SpoofMultiAggregate op = (SpoofMultiAggregate) CodegenUtils.createInstance(_class); 	
+			SpoofMultiAggregate op = (SpoofMultiAggregate) CodegenUtils.createInstance(_class);
 			AggOp[] aggOps = op.getAggOps();
 			
 			MatrixBlock tmpMB = in.mapToPair(new MultiAggregateFunction(
@@ -178,7 +178,7 @@ public class SpoofSPInstruction extends SPInstruction {
 		else if(_class.getSuperclass() == SpoofOuterProduct.class) //OUTER
 		{
 			if( _out.getDataType()==DataType.MATRIX ) {
-				SpoofOperator op = (SpoofOperator) CodegenUtils.createInstance(_class); 	
+				SpoofOperator op = (SpoofOperator) CodegenUtils.createInstance(_class);
 				OutProdType type = ((SpoofOuterProduct)op).getOuterProdType();
 
 				//update matrix characteristics
@@ -211,9 +211,11 @@ public class SpoofSPInstruction extends SPInstruction {
 				throw new DMLRuntimeException("Invalid spark rowwise operator w/ ncol=" + 
 					mcIn.getCols()+", ncolpb="+mcIn.getColsPerBlock()+".");
 			}
-			SpoofRowwise op = (SpoofRowwise) CodegenUtils.createInstance(_class); 	
+			SpoofRowwise op = (SpoofRowwise) CodegenUtils.createInstance(_class);
+			long clen2 = (op.getRowType()==RowType.NO_AGG_CONST) ? op.getConstDim2() :
+				op.getRowType().isRowTypeB1() ? sec.getMatrixCharacteristics(_in[1].getName()).getCols() : -1;
 			RowwiseFunction fmmc = new RowwiseFunction(_class.getName(),
-				_classBytes, bcVect2, bcMatrices, scalars, (int)mcIn.getCols());
+				_classBytes, bcVect2, bcMatrices, scalars, (int)mcIn.getCols(), (int)clen2);
 			out = in.mapPartitionsToPair(fmmc, op.getRowType()==RowType.ROW_AGG
 					|| op.getRowType() == RowType.NO_AGG);
 			
@@ -434,13 +436,15 @@ public class SpoofSPInstruction extends SPInstruction {
 		private static final long serialVersionUID = -7926980450209760212L;
 
 		private final int _clen;
+		private final int _clen2;
 		private SpoofRowwise _op = null;
 		
-		public RowwiseFunction(String className, byte[] classBytes, boolean[] bcInd, ArrayList<PartitionedBroadcast<MatrixBlock>> bcMatrices, ArrayList<ScalarObject> scalars, int clen) 
+		public RowwiseFunction(String className, byte[] classBytes, boolean[] bcInd, ArrayList<PartitionedBroadcast<MatrixBlock>> bcMatrices, ArrayList<ScalarObject> scalars, int clen, int clen2) 
 			throws DMLRuntimeException
 		{			
 			super(className, classBytes, bcInd, bcMatrices, scalars);
 			_clen = clen;
+			_clen2 = clen2;
 		}
 		
 		@Override
@@ -454,9 +458,7 @@ public class SpoofSPInstruction extends SPInstruction {
 			}
 			
 			//setup local memory for reuse
-			int clen2 = (int) ((_op.getRowType()==RowType.NO_AGG_CONST) ? _op.getConstDim2() :
-				_op.getRowType().isRowTypeB1() ? _inputs.get(0).getNumCols() : -1);
-			LibSpoofPrimitives.setupThreadLocalMemory(_op.getNumIntermediates(), _clen, clen2);
+			LibSpoofPrimitives.setupThreadLocalMemory(_op.getNumIntermediates(), _clen, _clen2);
 			
 			ArrayList<Tuple2<MatrixIndexes,MatrixBlock>> ret = new ArrayList<>();
 			boolean aggIncr = (_op.getRowType().isColumnAgg() //aggregate entire partition


[49/50] [abbrv] systemml git commit: [SYSTEMML-1983] New codegen cplan rewrite framework (micro optims)

Posted by re...@apache.org.
[SYSTEMML-1983] New codegen cplan rewrite framework (micro optims)

This patch refactors the code generator by moving smaller micro
optimizations from the templates into a new cplan rewrite framework to
avoid redundancy and inconsistency across templates and to improve
debuggability. The goal is NOT to provide a fusion-aware rewrite
framework, but simply to apply smaller rewrites for better code quality.
An example rewrite is rowSums(X!=0) -> rowNnzs(X), which avoids an
unnecessary row intermediate and is realized as a metadata operation
for sparse input rows.


Project: http://git-wip-us.apache.org/repos/asf/systemml/repo
Commit: http://git-wip-us.apache.org/repos/asf/systemml/commit/e1f5866a
Tree: http://git-wip-us.apache.org/repos/asf/systemml/tree/e1f5866a
Diff: http://git-wip-us.apache.org/repos/asf/systemml/diff/e1f5866a

Branch: refs/heads/master
Commit: e1f5866a5c2e2100d68124b74a33a9022e89dd09
Parents: ee6060b
Author: Matthias Boehm <mb...@gmail.com>
Authored: Thu Nov 2 00:15:11 2017 -0700
Committer: Matthias Boehm <mb...@gmail.com>
Committed: Thu Nov 2 00:39:18 2017 -0700

----------------------------------------------------------------------
 .../sysml/hops/codegen/SpoofCompiler.java       |  43 +-----
 .../hops/codegen/template/CPlanOpRewriter.java  | 130 +++++++++++++++++++
 .../hops/codegen/template/TemplateCell.java     |  11 +-
 .../codegen/template/TemplateOuterProduct.java  |   9 +-
 .../hops/codegen/template/TemplateRow.java      |  10 --
 .../functions/codegen/RowAggTmplTest.java       |  20 ++-
 .../scripts/functions/codegen/rowAggPattern34.R |  32 +++++
 .../functions/codegen/rowAggPattern34.dml       |  29 +++++
 8 files changed, 219 insertions(+), 65 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/systemml/blob/e1f5866a/src/main/java/org/apache/sysml/hops/codegen/SpoofCompiler.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/hops/codegen/SpoofCompiler.java b/src/main/java/org/apache/sysml/hops/codegen/SpoofCompiler.java
index 4af8540..51cd0a2 100644
--- a/src/main/java/org/apache/sysml/hops/codegen/SpoofCompiler.java
+++ b/src/main/java/org/apache/sysml/hops/codegen/SpoofCompiler.java
@@ -39,7 +39,6 @@ import org.apache.sysml.api.DMLScript.RUNTIME_PLATFORM;
 import org.apache.sysml.conf.ConfigurationManager;
 import org.apache.sysml.conf.DMLConfig;
 import org.apache.sysml.hops.codegen.cplan.CNode;
-import org.apache.sysml.hops.codegen.cplan.CNodeBinary.BinType;
 import org.apache.sysml.hops.codegen.cplan.CNodeCell;
 import org.apache.sysml.hops.codegen.cplan.CNodeData;
 import org.apache.sysml.hops.codegen.cplan.CNodeMultiAgg;
@@ -53,7 +52,6 @@ import org.apache.sysml.hops.codegen.opt.PlanSelectionFuseCostBased;
 import org.apache.sysml.hops.codegen.opt.PlanSelectionFuseCostBasedV2;
 import org.apache.sysml.hops.codegen.opt.PlanSelectionFuseNoRedundancy;
 import org.apache.sysml.hops.codegen.cplan.CNodeTpl;
-import org.apache.sysml.hops.codegen.cplan.CNodeUnary.UnaryType;
 import org.apache.sysml.hops.codegen.template.TemplateBase;
 import org.apache.sysml.hops.codegen.template.TemplateBase.CloseType;
 import org.apache.sysml.hops.codegen.template.TemplateBase.TemplateType;
@@ -61,6 +59,7 @@ import org.apache.sysml.hops.codegen.template.CPlanCSERewriter;
 import org.apache.sysml.hops.codegen.template.CPlanMemoTable;
 import org.apache.sysml.hops.codegen.template.CPlanMemoTable.MemoTableEntry;
 import org.apache.sysml.hops.codegen.template.CPlanMemoTable.MemoTableEntrySet;
+import org.apache.sysml.hops.codegen.template.CPlanOpRewriter;
 import org.apache.sysml.hops.codegen.template.TemplateUtils;
 import org.apache.sysml.hops.recompile.RecompileStatus;
 import org.apache.sysml.hops.recompile.Recompiler;
@@ -68,7 +67,6 @@ import org.apache.sysml.hops.AggUnaryOp;
 import org.apache.sysml.hops.Hop;
 import org.apache.sysml.hops.Hop.OpOp1;
 import org.apache.sysml.hops.HopsException;
-import org.apache.sysml.hops.LiteralOp;
 import org.apache.sysml.hops.OptimizerUtils;
 import org.apache.sysml.hops.rewrite.HopRewriteUtils;
 import org.apache.sysml.hops.rewrite.ProgramRewriteStatus;
@@ -684,13 +682,15 @@ public class SpoofCompiler
 	private static HashMap<Long, Pair<Hop[],CNodeTpl>> cleanupCPlans(CPlanMemoTable memo, HashMap<Long, Pair<Hop[],CNodeTpl>> cplans) 
 	{
 		HashMap<Long, Pair<Hop[],CNodeTpl>> cplans2 = new HashMap<>();
+		CPlanOpRewriter rewriter = new CPlanOpRewriter();
 		CPlanCSERewriter cse = new CPlanCSERewriter();
 		
 		for( Entry<Long, Pair<Hop[],CNodeTpl>> e : cplans.entrySet() ) {
 			CNodeTpl tpl = e.getValue().getValue();
 			Hop[] inHops = e.getValue().getKey();
 			
-			//perform common subexpression elimination
+			//perform simplifications and cse rewrites
+			tpl = rewriter.simplifyCPlan(tpl);
 			tpl = cse.eliminateCommonSubexpressions(tpl);
 			
 			//update input hops (order-preserving)
@@ -727,10 +727,6 @@ public class SpoofCompiler
 			else
 				rFindAndRemoveLookup(tpl.getOutput(), in1, !(tpl instanceof CNodeRow));
 			
-			//remove unnecessary neq 0 on main input of outer template
-			if( tpl instanceof CNodeOuterProduct )
-				rFindAndRemoveBinaryMS(tpl.getOutput(), in1, BinType.NOTEQUAL, "0", "1");
-			
 			//remove invalid row templates (e.g., unsatisfied blocksize constraint)
 			if( tpl instanceof CNodeRow ) {
 				//check for invalid row cplan over column vector
@@ -810,37 +806,6 @@ public class SpoofCompiler
 		}
 	}
 	
-	@SuppressWarnings("unused")
-	private static void rFindAndRemoveUnary(CNode node, CNodeData mainInput, UnaryType type) {
-		for( int i=0; i<node.getInput().size(); i++ ) {
-			CNode tmp = node.getInput().get(i);
-			if( TemplateUtils.isUnary(tmp, type) && tmp.getInput().get(0) instanceof CNodeData
-				&& ((CNodeData)tmp.getInput().get(0)).getHopID()==mainInput.getHopID() )
-			{
-				node.getInput().set(i, tmp.getInput().get(0));
-			}
-			else
-				rFindAndRemoveUnary(tmp, mainInput, type);
-		}
-	}
-	
-	private static void rFindAndRemoveBinaryMS(CNode node, CNodeData mainInput, BinType type, String lit, String replace) {
-		for( int i=0; i<node.getInput().size(); i++ ) {
-			CNode tmp = node.getInput().get(i);
-			if( TemplateUtils.isBinary(tmp, type) && tmp.getInput().get(1).isLiteral()
-				&& tmp.getInput().get(1).getVarname().equals(lit)
-				&& tmp.getInput().get(0) instanceof CNodeData
-				&& ((CNodeData)tmp.getInput().get(0)).getHopID()==mainInput.getHopID() )
-			{
-				CNodeData cnode = new CNodeData(new LiteralOp(replace));
-				cnode.setLiteral(true);
-				node.getInput().set(i, cnode);
-			}
-			else
-				rFindAndRemoveBinaryMS(tmp, mainInput, type, lit, replace);
-		}
-	}
-	
 	private static boolean rHasLookupRC1(CNode node, CNodeData mainInput, boolean includeRC1) {
 		boolean ret = false;
 		for( int i=0; i<node.getInput().size() && !ret; i++ ) {

http://git-wip-us.apache.org/repos/asf/systemml/blob/e1f5866a/src/main/java/org/apache/sysml/hops/codegen/template/CPlanOpRewriter.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/hops/codegen/template/CPlanOpRewriter.java b/src/main/java/org/apache/sysml/hops/codegen/template/CPlanOpRewriter.java
new file mode 100644
index 0000000..8ec750c
--- /dev/null
+++ b/src/main/java/org/apache/sysml/hops/codegen/template/CPlanOpRewriter.java
@@ -0,0 +1,130 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ * 
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.sysml.hops.codegen.template;
+
+import java.util.ArrayList;
+
+import org.apache.sysml.hops.LiteralOp;
+import org.apache.sysml.hops.codegen.cplan.CNode;
+import org.apache.sysml.hops.codegen.cplan.CNodeData;
+import org.apache.sysml.hops.codegen.cplan.CNodeMultiAgg;
+import org.apache.sysml.hops.codegen.cplan.CNodeOuterProduct;
+import org.apache.sysml.hops.codegen.cplan.CNodeTpl;
+import org.apache.sysml.hops.codegen.cplan.CNodeUnary;
+import org.apache.sysml.hops.codegen.cplan.CNodeBinary.BinType;
+import org.apache.sysml.hops.codegen.cplan.CNodeUnary.UnaryType;
+
+/**
+ * This cplan rewriter is meant to be the central place for any cplan 
+ * enhancements before code generation. These rewrites do not aim to
+ * handle reorderings or other algebraic simplifications but rather
+ * focus on low-level simplifications to produce better code, while
+ * keeping the cplan construction of the individual templates clean
+ * and without unnecessary redundancy.
+ * 
+ * Assumption: This rewriter should be called before CSE as these
+ * rewrites potentially destroy common subexpressions.
+ */
+public class CPlanOpRewriter 
+{
+	public CNodeTpl simplifyCPlan(CNodeTpl tpl) {
+		//apply template specific rewrites
+		tpl = rewriteRemoveOuterNeq0(tpl); // Outer(a!=0) -> Outer(1)
+		
+		//apply operation specific rewrites
+		if( tpl instanceof CNodeMultiAgg ) {
+			ArrayList<CNode> outputs = ((CNodeMultiAgg)tpl).getOutputs();
+			for( int i=0; i< outputs.size(); i++ )
+				outputs.set(i, rSimplifyCNode(outputs.get(i)));
+		}
+		else {
+			tpl.setOutput(rSimplifyCNode(tpl.getOutput()));
+		}
+		
+		return tpl;
+	}
+	
+	private static CNode rSimplifyCNode(CNode node) {
+		//process children recursively
+		for(int i=0; i<node.getInput().size(); i++)
+			node.getInput().set(i, rSimplifyCNode(node.getInput().get(i)));
+		
+		//apply all node-local simplification rewrites
+		node = rewriteRowCountNnz(node); //rowSums(X!=0) -> rowNnz(X)
+		node = rewriteRowSumSq(node);    //rowSums(X^2) -> rowSumSqs(X)
+		node = rewriteBinaryPow2(node);  //x^2 -> x*x
+		node = rewriteBinaryMult2(node); //x*2 -> x+x;
+		return node;
+	}
+	
+	private static CNode rewriteRowCountNnz(CNode node) {
+		return (TemplateUtils.isUnary(node, UnaryType.ROW_SUMS)
+			&& TemplateUtils.isBinary(node.getInput().get(0), BinType.VECT_NOTEQUAL_SCALAR)
+			&& node.getInput().get(0).getInput().get(1).isLiteral()
+			&& node.getInput().get(0).getInput().get(1).getVarname().equals("0")) ?
+			new CNodeUnary(node.getInput().get(0).getInput().get(0), UnaryType.ROW_COUNTNNZS) : node;
+	}
+	
+	private static CNode rewriteRowSumSq(CNode node) {
+		return (TemplateUtils.isUnary(node, UnaryType.ROW_SUMS)
+			&& TemplateUtils.isBinary(node.getInput().get(0), BinType.VECT_POW_SCALAR)
+			&& node.getInput().get(0).getInput().get(1).isLiteral()
+			&& node.getInput().get(0).getInput().get(1).getVarname().equals("2")) ?
+			new CNodeUnary(node.getInput().get(0).getInput().get(0), UnaryType.ROW_SUMSQS) : node;
+	}
+
+	private static CNode rewriteBinaryPow2(CNode node) {
+		return (TemplateUtils.isBinary(node, BinType.POW) 
+			&& node.getInput().get(1).isLiteral()
+			&& node.getInput().get(1).getVarname().equals("2")) ?
+			new CNodeUnary(node.getInput().get(0), UnaryType.POW2) : node;
+	}
+	
+	private static CNode rewriteBinaryMult2(CNode node) {
+		return (TemplateUtils.isBinary(node, BinType.MULT) 
+			&& node.getInput().get(1).isLiteral()
+			&& node.getInput().get(1).getVarname().equals("2")) ?
+			new CNodeUnary(node.getInput().get(0), UnaryType.MULT2) : node;
+	}
+	
+	private static CNodeTpl rewriteRemoveOuterNeq0(CNodeTpl tpl) {
+		if( tpl instanceof CNodeOuterProduct )
+			rFindAndRemoveBinaryMS(tpl.getOutput(), (CNodeData)
+				tpl.getInput().get(0), BinType.NOTEQUAL, "0", "1");
+		return tpl;
+	}
+	
+	private static void rFindAndRemoveBinaryMS(CNode node, CNodeData mainInput, BinType type, String lit, String replace) {
+		for( int i=0; i<node.getInput().size(); i++ ) {
+			CNode tmp = node.getInput().get(i);
+			if( TemplateUtils.isBinary(tmp, type) && tmp.getInput().get(1).isLiteral()
+				&& tmp.getInput().get(1).getVarname().equals(lit)
+				&& tmp.getInput().get(0) instanceof CNodeData
+				&& ((CNodeData)tmp.getInput().get(0)).getHopID()==mainInput.getHopID() )
+			{
+				CNodeData cnode = new CNodeData(new LiteralOp(replace));
+				cnode.setLiteral(true);
+				node.getInput().set(i, cnode);
+			}
+			else
+				rFindAndRemoveBinaryMS(tmp, mainInput, type, lit, replace);
+		}
+	}
+}

http://git-wip-us.apache.org/repos/asf/systemml/blob/e1f5866a/src/main/java/org/apache/sysml/hops/codegen/template/TemplateCell.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/hops/codegen/template/TemplateCell.java b/src/main/java/org/apache/sysml/hops/codegen/template/TemplateCell.java
index 4f3d4f4..fe5a1e7 100644
--- a/src/main/java/org/apache/sysml/hops/codegen/template/TemplateCell.java
+++ b/src/main/java/org/apache/sysml/hops/codegen/template/TemplateCell.java
@@ -195,12 +195,9 @@ public class TemplateCell extends TemplateBase
 			cdata1 = TemplateUtils.wrapLookupIfNecessary(cdata1, hop.getInput().get(0));
 			cdata2 = TemplateUtils.wrapLookupIfNecessary(cdata2, hop.getInput().get(1));
 			
-			if( bop.getOp()==OpOp2.POW && cdata2.isLiteral() && cdata2.getVarname().equals("2") )
-				out = new CNodeUnary(cdata1, UnaryType.POW2);
-			else if( bop.getOp()==OpOp2.MULT && cdata2.isLiteral() && cdata2.getVarname().equals("2") )
-				out = new CNodeUnary(cdata1, UnaryType.MULT2);
-			else //default binary	
-				out = new CNodeBinary(cdata1, cdata2, BinType.valueOf(primitiveOpName));
+			//construct binary cnode
+			out = new CNodeBinary(cdata1, cdata2, 
+				BinType.valueOf(primitiveOpName));
 		}
 		else if(hop instanceof TernaryOp) 
 		{
@@ -215,7 +212,7 @@ public class TemplateCell extends TemplateBase
 			
 			//construct ternary cnode, primitive operation derived from OpOp3
 			out = new CNodeTernary(cdata1, cdata2, cdata3, 
-					TernaryType.valueOf(top.getOp().name()));
+				TernaryType.valueOf(top.getOp().name()));
 		}
 		else if( hop instanceof ParameterizedBuiltinOp ) 
 		{

http://git-wip-us.apache.org/repos/asf/systemml/blob/e1f5866a/src/main/java/org/apache/sysml/hops/codegen/template/TemplateOuterProduct.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/hops/codegen/template/TemplateOuterProduct.java b/src/main/java/org/apache/sysml/hops/codegen/template/TemplateOuterProduct.java
index 256f540..188bac2 100644
--- a/src/main/java/org/apache/sysml/hops/codegen/template/TemplateOuterProduct.java
+++ b/src/main/java/org/apache/sysml/hops/codegen/template/TemplateOuterProduct.java
@@ -174,7 +174,6 @@ public class TemplateOuterProduct extends TemplateBase {
 		}
 		else if(hop instanceof BinaryOp)
 		{
-			BinaryOp bop = (BinaryOp) hop;
 			CNode cdata1 = tmp.get(hop.getInput().get(0).getHopID());
 			CNode cdata2 = tmp.get(hop.getInput().get(1).getHopID());
 			String primitiveOpName = ((BinaryOp)hop).getOp().toString();
@@ -187,12 +186,8 @@ public class TemplateOuterProduct extends TemplateBase {
 			//add lookups if required
 			cdata1 = TemplateUtils.wrapLookupIfNecessary(cdata1, hop.getInput().get(0));
 			cdata2 = TemplateUtils.wrapLookupIfNecessary(cdata2, hop.getInput().get(1));
-			if( bop.getOp()==OpOp2.POW && cdata2.isLiteral() && cdata2.getVarname().equals("2") )
-				out = new CNodeUnary(cdata1, UnaryType.POW2);
-			else if( bop.getOp()==OpOp2.MULT && cdata2.isLiteral() && cdata2.getVarname().equals("2") )
-				out = new CNodeUnary(cdata1, UnaryType.MULT2);
-			else
-				out = new CNodeBinary(cdata1, cdata2, BinType.valueOf(primitiveOpName));
+			
+			out = new CNodeBinary(cdata1, cdata2, BinType.valueOf(primitiveOpName));
 		}
 		else if(hop instanceof AggBinaryOp)
 		{

http://git-wip-us.apache.org/repos/asf/systemml/blob/e1f5866a/src/main/java/org/apache/sysml/hops/codegen/template/TemplateRow.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/hops/codegen/template/TemplateRow.java b/src/main/java/org/apache/sysml/hops/codegen/template/TemplateRow.java
index b862abf..dc08dbf 100644
--- a/src/main/java/org/apache/sysml/hops/codegen/template/TemplateRow.java
+++ b/src/main/java/org/apache/sysml/hops/codegen/template/TemplateRow.java
@@ -257,16 +257,6 @@ public class TemplateRow extends TemplateBase
 			if( ((AggUnaryOp)hop).getDirection() == Direction.Row && HopRewriteUtils.isAggUnaryOp(hop, SUPPORTED_ROW_AGG) ) {
 				if(hop.getInput().get(0).getDim2()==1)
 					out = (cdata1.getDataType()==DataType.SCALAR) ? cdata1 : new CNodeUnary(cdata1,UnaryType.LOOKUP_R);
-				else if( HopRewriteUtils.isAggUnaryOp(hop, AggOp.SUM)
-					&& HopRewriteUtils.isBinaryMatrixScalar(hop.getInput().get(0), OpOp2.NOTEQUAL, 0)
-					&& cdata1 instanceof CNodeBinary ) {
-					out = new CNodeUnary(cdata1.getInput().get(0), UnaryType.ROW_COUNTNNZS);
-				}
-				else if( HopRewriteUtils.isAggUnaryOp(hop, AggOp.SUM)
-					&& HopRewriteUtils.isBinaryMatrixScalar(hop.getInput().get(0), OpOp2.POW, 2)
-					&& cdata1 instanceof CNodeBinary ) {
-					out = new CNodeUnary(cdata1.getInput().get(0), UnaryType.ROW_SUMSQS);
-				}
 				else {
 					String opcode = "ROW_"+((AggUnaryOp)hop).getOp().name().toUpperCase()+"S";
 					out = new CNodeUnary(cdata1, UnaryType.valueOf(opcode));

http://git-wip-us.apache.org/repos/asf/systemml/blob/e1f5866a/src/test/java/org/apache/sysml/test/integration/functions/codegen/RowAggTmplTest.java
----------------------------------------------------------------------
diff --git a/src/test/java/org/apache/sysml/test/integration/functions/codegen/RowAggTmplTest.java b/src/test/java/org/apache/sysml/test/integration/functions/codegen/RowAggTmplTest.java
index 5d2015f..b5426ae 100644
--- a/src/test/java/org/apache/sysml/test/integration/functions/codegen/RowAggTmplTest.java
+++ b/src/test/java/org/apache/sysml/test/integration/functions/codegen/RowAggTmplTest.java
@@ -70,6 +70,7 @@ public class RowAggTmplTest extends AutomatedTestBase
 	private static final String TEST_NAME31 = TEST_NAME+"31"; //MLogreg - matrix-vector cbind 0s generalized
 	private static final String TEST_NAME32 = TEST_NAME+"32"; //X[, 1] - rowSums(X)
 	private static final String TEST_NAME33 = TEST_NAME+"33"; //Kmeans, inner loop
+	private static final String TEST_NAME34 = TEST_NAME+"34"; //X / rowSums(X!=0)
 	
 	private static final String TEST_DIR = "functions/codegen/";
 	private static final String TEST_CLASS_DIR = TEST_DIR + RowAggTmplTest.class.getSimpleName() + "/";
@@ -81,7 +82,7 @@ public class RowAggTmplTest extends AutomatedTestBase
 	@Override
 	public void setUp() {
 		TestUtils.clearAssertionInformation();
-		for(int i=1; i<=33; i++)
+		for(int i=1; i<=34; i++)
 			addTestConfiguration( TEST_NAME+i, new TestConfiguration(TEST_CLASS_DIR, TEST_NAME+i, new String[] { String.valueOf(i) }) );
 	}
 	
@@ -580,6 +581,21 @@ public class RowAggTmplTest extends AutomatedTestBase
 		testCodegenIntegration( TEST_NAME33, false, ExecType.SPARK );
 	}
 	
+	@Test
+	public void testCodegenRowAggRewrite34CP() {
+		testCodegenIntegration( TEST_NAME34, true, ExecType.CP );
+	}
+	
+	@Test
+	public void testCodegenRowAgg34CP() {
+		testCodegenIntegration( TEST_NAME34, false, ExecType.CP );
+	}
+	
+	@Test
+	public void testCodegenRowAgg34SP() {
+		testCodegenIntegration( TEST_NAME34, false, ExecType.SPARK );
+	}
+	
 	private void testCodegenIntegration( String testname, boolean rewrites, ExecType instType )
 	{	
 		boolean oldFlag = OptimizerUtils.ALLOW_ALGEBRAIC_SIMPLIFICATION;
@@ -601,7 +617,7 @@ public class RowAggTmplTest extends AutomatedTestBase
 			
 			String HOME = SCRIPT_DIR + TEST_DIR;
 			fullDMLScriptName = HOME + testname + ".dml";
-			programArgs = new String[]{"-explain", "recompile_hops", "-stats", "-args", output("S") };
+			programArgs = new String[]{"-explain", "recompile_runtime", "-stats", "-args", output("S") };
 			
 			fullRScriptName = HOME + testname + ".R";
 			rCmd = getRCmd(inputDir(), expectedDir());			

http://git-wip-us.apache.org/repos/asf/systemml/blob/e1f5866a/src/test/scripts/functions/codegen/rowAggPattern34.R
----------------------------------------------------------------------
diff --git a/src/test/scripts/functions/codegen/rowAggPattern34.R b/src/test/scripts/functions/codegen/rowAggPattern34.R
new file mode 100644
index 0000000..2deea5d
--- /dev/null
+++ b/src/test/scripts/functions/codegen/rowAggPattern34.R
@@ -0,0 +1,32 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+# 
+#   http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+args<-commandArgs(TRUE)
+options(digits=22)
+library("Matrix")
+library("matrixStats")
+
+X = matrix(seq(1,6000)/6000, 300, 20, byrow=TRUE);
+X[,6:20] = matrix(0, 300, 15);
+
+R = X / rowSums(X!=0);
+
+writeMM(as(R, "CsparseMatrix"), paste(args[2], "S", sep="")); 

http://git-wip-us.apache.org/repos/asf/systemml/blob/e1f5866a/src/test/scripts/functions/codegen/rowAggPattern34.dml
----------------------------------------------------------------------
diff --git a/src/test/scripts/functions/codegen/rowAggPattern34.dml b/src/test/scripts/functions/codegen/rowAggPattern34.dml
new file mode 100644
index 0000000..12d9b7f
--- /dev/null
+++ b/src/test/scripts/functions/codegen/rowAggPattern34.dml
@@ -0,0 +1,29 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+# 
+#   http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+
+X = matrix(seq(1,6000)/6000, 300, 20);
+X[,6:20] = matrix(0, 300, 15);
+while(FALSE){}
+
+R = X / rowSums(X!=0);
+
+write(R, $1)


[07/50] [abbrv] systemml git commit: [SYSTEMML-540] Reduce the number of unknowns in ConvolutionOp

Posted by re...@apache.org.
[SYSTEMML-540] Reduce the number of unknowns in ConvolutionOp

- This commit reduces the number of unknowns during dynamic recompilation by
inferring a ConvolutionOp's input height/width from its parent's output
height/width.
- Additionally, for developer debugging, I have guarded the functionality
with the flag INFER_TENSOR_SHAPE_FROM_PARENT_CONV_OP and have added
sufficient documentation to explain how these dimensions are inferred.

Closes #685.
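
For context, the inference ultimately hinges on the standard convolution/pooling
output-size formula quoted in the diff below (P = floor((H + 2*pad_h - R)/stride_h + 1),
and analogously for Q). A minimal standalone sketch of that arithmetic with made-up
example sizes follows; it is an illustration only, not SystemML's ConvolutionUtils API:

    // Output-size arithmetic for conv2d/max_pool (illustrative helper, not SystemML code).
    public class ConvOutputDims {
        // P = floor((H + 2*pad_h - R)/stride_h + 1); integer division floors for non-negative inputs
        static int outDim(int in, int filter, int stride, int pad) {
            return (in + 2 * pad - filter) / stride + 1;
        }
        public static void main(String[] args) {
            int P = outDim(28, 5, 1, 0); // 28x28 image, 5x5 filter, stride 1, no padding
            int Q = outDim(28, 5, 1, 0);
            System.out.println("conv output: " + P + "x" + Q);         // 24x24
            // a following 2x2 max_pool with stride 2 then sees H=W=24 inferred from the conv output
            System.out.println("pool output: " + outDim(24, 2, 2, 0)); // 12
        }
    }

In a conv -> maxpool chain this is exactly the [C, P, Q] -> [C, H, W] hand-off that
inferCHWPQFromParentOp() in the diff below performs when the pooling op's own shape
inputs are unknown at compile time.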


Project: http://git-wip-us.apache.org/repos/asf/systemml/repo
Commit: http://git-wip-us.apache.org/repos/asf/systemml/commit/5adb330d
Tree: http://git-wip-us.apache.org/repos/asf/systemml/tree/5adb330d
Diff: http://git-wip-us.apache.org/repos/asf/systemml/diff/5adb330d

Branch: refs/heads/master
Commit: 5adb330deffa5479475338316bf47193d0c31da4
Parents: 2ca2d8a
Author: Niketan Pansare <np...@us.ibm.com>
Authored: Mon Oct 16 15:44:37 2017 -0700
Committer: Niketan Pansare <np...@us.ibm.com>
Committed: Mon Oct 16 15:45:39 2017 -0700

----------------------------------------------------------------------
 .../org/apache/sysml/hops/ConvolutionOp.java    | 170 ++++++++++++++++---
 1 file changed, 144 insertions(+), 26 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/systemml/blob/5adb330d/src/main/java/org/apache/sysml/hops/ConvolutionOp.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/hops/ConvolutionOp.java b/src/main/java/org/apache/sysml/hops/ConvolutionOp.java
index e732fb8..e4ed32b 100644
--- a/src/main/java/org/apache/sysml/hops/ConvolutionOp.java
+++ b/src/main/java/org/apache/sysml/hops/ConvolutionOp.java
@@ -32,11 +32,21 @@ import org.apache.sysml.runtime.DMLRuntimeException;
 import org.apache.sysml.runtime.instructions.gpu.context.GPUContextPool;
 import org.apache.sysml.runtime.matrix.MatrixCharacteristics;
 import org.apache.sysml.runtime.matrix.data.ConvolutionParameters;
-
 import java.util.ArrayList;
 
 public class ConvolutionOp extends Hop  implements MultiThreadedHop
 {	
+	// -------------------------------------------------------------------------
+	// This flag allows us to compile plans with fewer unknowns and also serves as a step toward future tensorblock integration.
+	// By default, these flags are turned on.
+	
+	// When this flag is turned on, we attempt to check the parent convolution hop for unknown dimensions.
+	// For example: in case of conv -> maxpool, the input channel/height/width of maxpool will match output channel/height/width of conv.
+	private static final boolean INFER_TENSOR_SHAPE_FROM_PARENT_CONV_OP = true;
+	// This guards us from cases where the user provides incorrect C,H,W parameters.
+	private static final boolean THROW_ERROR_IF_INFERRED_SHAPE_MISMATCH = true;
+	// -------------------------------------------------------------------------
+	
 	private Hop.ConvOp op;
 
 	private int _maxNumThreads = -1; //-1 for unlimited
@@ -475,17 +485,21 @@ public class ConvolutionOp extends Hop  implements MultiThreadedHop
 	// input_shape1, input_shape2, input_shape3, input_shape4, 
 	// filter_shape1, filter_shape2, filter_shape3, filter_shape4
 	ConvolutionParameters parseInput() throws DMLRuntimeException {
+		
+		Hop imageHeightHop = null; Hop filterHeightHop = null;
 		if(op == ConvOp.MAX_POOLING_BACKWARD 
 				|| op == ConvOp.DIRECT_CONV2D 
 				|| op == ConvOp.DIRECT_CONV2D_BACKWARD_FILTER
 				|| op == ConvOp.DIRECT_CONV2D_BACKWARD_DATA) {
+			imageHeightHop = getInput().get(8);
+			filterHeightHop = getInput().get(12);
 			_cachedParams.setIfUnknown(
 					getInput().get(6),
 					getInput().get(7), 
-					getInput().get(8), 
+					imageHeightHop, 
 					getInput().get(9), 
 					getInput().get(10), 
-					getInput().get(12), 
+					filterHeightHop, 
 					getInput().get(13), 
 					getInput().get(2), 
 					getInput().get(3), 
@@ -493,22 +507,127 @@ public class ConvolutionOp extends Hop  implements MultiThreadedHop
 					getInput().get(5), _maxNumThreads);
 		}
 		else {
+			imageHeightHop = getInput().get(7);
+			filterHeightHop = getInput().get(11);
 			_cachedParams.setIfUnknown(
 					getInput().get(5),
 					getInput().get(6), 
-					getInput().get(7), 
+					imageHeightHop, 
 					getInput().get(8), 
 					getInput().get(9), 
-					getInput().get(11), 
+					filterHeightHop, 
 					getInput().get(12), 
 					getInput().get(1), 
 					getInput().get(2), 
 					getInput().get(3), 
 					getInput().get(4), _maxNumThreads);
 		}
+		
+		if(INFER_TENSOR_SHAPE_FROM_PARENT_CONV_OP) {
+			boolean isMaxPool = getOp() == ConvOp.MAX_POOLING;
+			boolean isConv = getOp() == ConvOp.DIRECT_CONV2D;
+			boolean unknownCHWPQ = _cachedParams.C < 0 || _cachedParams.H < 0 || _cachedParams.W < 0 || _cachedParams.P < 0 || _cachedParams.Q < 0;
+			if((isMaxPool || isConv) && unknownCHWPQ) {
+				// Only infer input shape for convolution and maxpool
+				inferCHWPQFromParentOp();
+			}
+		}
+		
+		if(imageHeightHop == filterHeightHop && _cachedParams.R < 0 && _cachedParams.H > 0) {
+			// Unknown R, but known H and both are equal
+			// This happens for one-dimensional conv2d where H=R and H can be inferred from the parent hop
+			_cachedParams.R = _cachedParams.H;
+		}
+		
+		// Compute P and Q if unknown. At script level, they are computed using following script:
+		// P = as.integer(floor((H + 2*pad_h - R)/stride_h + 1))
+		// Q = as.integer(floor((W + 2*pad_w - S)/stride_w + 1))
+		if(_cachedParams.P < 0 && _cachedParams.H >= 0 && _cachedParams.R >= 0 && _cachedParams.stride_h >= 0 && _cachedParams.pad_h >= 0) {
+			_cachedParams.P = (int) org.apache.sysml.runtime.util.ConvolutionUtils.getP(_cachedParams.H, _cachedParams.R, _cachedParams.stride_h, _cachedParams.pad_h);
+		}
+		if(_cachedParams.Q < 0 && _cachedParams.W >= 0 && _cachedParams.S >= 0 && _cachedParams.stride_w >= 0 && _cachedParams.pad_w >= 0) {
+			_cachedParams.Q = (int) org.apache.sysml.runtime.util.ConvolutionUtils.getQ(_cachedParams.W, _cachedParams.S, _cachedParams.stride_w, _cachedParams.pad_w);
+		}
+		
 		return _cachedParams;
 	}
 	
+	/**
+	 * Utility method to check if the given hop is a BIAS_ADD hop
+	 * 
+	 * @param hop the given hop
+	 * @return true if the given hop is BIAS_ADD
+	 */
+	private static boolean isInputBiasAdd(Hop hop) {
+		if(hop instanceof ConvolutionOp && ((ConvolutionOp) hop).getOp() == ConvOp.BIAS_ADD) {
+			return true;
+		}
+		return false;
+	}
+	
+	/**
+	 * Utility method to check if the inferred shapes are equal to the given shape with a guard for unknown
+	 * 
+	 * @param dim1 inferred shape
+	 * @param dim2 given shape
+	 * @param paramType string denoting the parameter for pretty printing of the error message
+	 * @throws DMLRuntimeException if dim1 != dim2
+	 */
+	private void throwExceptionIfNotEqual(int dim1, int dim2, String paramType) throws DMLRuntimeException {
+		if(dim1 >= 0 && dim2 >= 0 && dim1 != dim2) {
+			throw new DMLRuntimeException("Inferred " + paramType + " from parent doesn't match with given " + paramType + ":" + dim1 + " != " + dim2);
+		}
+	}
+	
+	/**
+	 * Gets the values for the parameters C, H, W, P, Q from parent hops
+	 * 
+	 * @throws DMLRuntimeException if error occurs
+	 */
+	private void inferCHWPQFromParentOp() throws DMLRuntimeException {
+		Hop tmp = getInput().get(0);
+		while(isInputReLU(tmp) || isInputBiasAdd(tmp)) {
+			// Skip ReLU and bias_add and go to its parent
+			tmp = tmp.getInput().get(0);
+		}
+		// Cast tmp as parent
+		ConvolutionOp parentOp = (tmp instanceof ConvolutionOp) ? ((ConvolutionOp) tmp) : null; 
+		
+		if(parentOp == null)
+			return;
+		else if(parentOp.getOp() == ConvOp.MAX_POOLING) {
+			ConvolutionParameters parentParam = parentOp.parseInput();
+			int prevC = _cachedParams.C; int prevH = _cachedParams.H; int prevW = _cachedParams.W;
+			// [C, P, Q] from maxpool becomes [C, H, W] of next op
+			_cachedParams.C = (_cachedParams.C < 0) ? parentParam.C : _cachedParams.C;
+			_cachedParams.H = (_cachedParams.H < 0) ? parentParam.P : _cachedParams.H;
+			_cachedParams.W = (_cachedParams.W < 0) ? parentParam.Q : _cachedParams.W;
+			if(LOG.isDebugEnabled()) {
+				LOG.debug("Inferring [C,H,W] from maxpool parent: [" + prevC + "," + prevH + "," + prevW + "]-> [" + _cachedParams.C + "," + _cachedParams.H + "," + _cachedParams.W + "]");
+			}
+			if(THROW_ERROR_IF_INFERRED_SHAPE_MISMATCH) {
+				throwExceptionIfNotEqual(prevC, _cachedParams.C, "C");
+				throwExceptionIfNotEqual(prevH, _cachedParams.H, "H");
+				throwExceptionIfNotEqual(prevW, _cachedParams.W, "W");
+			}
+		}
+		else if(parentOp.getOp() == ConvOp.DIRECT_CONV2D) {
+			ConvolutionParameters parentParam = parentOp.parseInput();
+			int prevC = _cachedParams.C; int prevH = _cachedParams.H; int prevW = _cachedParams.W;
+			// [K, P, Q] from convolution becomes [C, H, W] of next op
+			_cachedParams.C = (_cachedParams.C < 0) ? parentParam.K : _cachedParams.C;
+			_cachedParams.H = (_cachedParams.H < 0) ? parentParam.P : _cachedParams.H;
+			_cachedParams.W = (_cachedParams.W < 0) ? parentParam.Q : _cachedParams.W;
+			if(LOG.isDebugEnabled()) {
+				LOG.debug("Inferring [C,H,W] from conv2d parent: [" + prevC + "," + prevH + "," + prevW + "]-> [" + _cachedParams.C + "," + _cachedParams.H + "," + _cachedParams.W + "]");
+			}
+			if(THROW_ERROR_IF_INFERRED_SHAPE_MISMATCH) {
+				throwExceptionIfNotEqual(prevC, _cachedParams.C, "C");
+				throwExceptionIfNotEqual(prevH, _cachedParams.H, "H");
+				throwExceptionIfNotEqual(prevW, _cachedParams.W, "W");
+			}
+		}
+	}
+	
 	@Override
 	public void refreshSizeInformation()
 	{
@@ -620,9 +739,8 @@ public class ConvolutionOp extends Hop  implements MultiThreadedHop
 		if(op == ConvOp.BIAS_ADD || op == ConvOp.BIAS_MULTIPLY) {
 			throw new RuntimeException("getDim method should not be invoked for bias_add and bias_multiply");
 		}
-		ConvolutionParameters params;
 		try {
-			params = parseInput();
+			parseInput();
 		} catch (DMLRuntimeException e) {
 			throw new RuntimeException(e);
 		}
@@ -653,49 +771,49 @@ public class ConvolutionOp extends Hop  implements MultiThreadedHop
 		
 		long ret = -1;
 		if(dimString.equals("K") && filter != null) {
-			ret = getNonNegative(ret, getNonNegative(params.K, filter._dim1));
+			ret = getNonNegative(ret, getNonNegative(_cachedParams.K, filter._dim1));
 		}
 		else if(dimString.equals("CRS") && filter != null) {
-			ret = getNonNegative(ret, getNonNegative(nonNegativeMultiply(params.C, params.R, params.S), filter._dim2));
+			ret = getNonNegative(ret, getNonNegative(nonNegativeMultiply(_cachedParams.C, _cachedParams.R, _cachedParams.S), filter._dim2));
 		}
 		else if(dimString.equals("N") && input != null) {
-			ret = getNonNegative(ret, getNonNegative(params.N, input._dim1));
+			ret = getNonNegative(ret, getNonNegative(_cachedParams.N, input._dim1));
 		}
 		else if(dimString.equals("CHW") && input != null) {
-			ret = getNonNegative(ret, getNonNegative(nonNegativeMultiply(params.C, params.H, params.W), input._dim2));
+			ret = getNonNegative(ret, getNonNegative(nonNegativeMultiply(_cachedParams.C, _cachedParams.H, _cachedParams.W), input._dim2));
 		}
 		else if(dimString.equals("N") && dout != null) {
-			ret = getNonNegative(ret, getNonNegative(params.N, dout._dim1));
+			ret = getNonNegative(ret, getNonNegative(_cachedParams.N, dout._dim1));
 		}
 		else if(dimString.equals("KPQ") && dout != null) {
-			ret = getNonNegative(ret, getNonNegative(nonNegativeMultiply(params.K, params.P, params.Q), dout._dim2));
+			ret = getNonNegative(ret, getNonNegative(nonNegativeMultiply(_cachedParams.K, _cachedParams.P, _cachedParams.Q), dout._dim2));
 		}
 		else if(dimString.equals("N") && dout1 != null) {
-			ret = getNonNegative(ret, getNonNegative(params.N, dout1._dim1));
+			ret = getNonNegative(ret, getNonNegative(_cachedParams.N, dout1._dim1));
 		}
 		else if(dimString.equals("CPQ") && dout1 != null) {
-			ret = getNonNegative(ret, getNonNegative(nonNegativeMultiply(params.C, params.P, params.Q), dout1._dim2));
+			ret = getNonNegative(ret, getNonNegative(nonNegativeMultiply(_cachedParams.C, _cachedParams.P, _cachedParams.Q), dout1._dim2));
 		}
 		else if(dimString.equals("K")) {
-			ret = getNonNegative(ret, params.K >= 0 ? params.K : -1);
+			ret = getNonNegative(ret, _cachedParams.K >= 0 ? _cachedParams.K : -1);
 		}
 		else if(dimString.equals("CRS")) {
-			ret = getNonNegative(ret, nonNegativeMultiply(params.C, params.R, params.S));
+			ret = getNonNegative(ret, nonNegativeMultiply(_cachedParams.C, _cachedParams.R, _cachedParams.S));
 		}
 		else if(dimString.equals("N")) {
-			ret = getNonNegative(ret, params.N >= 0 ? params.N : -1);
+			ret = getNonNegative(ret, _cachedParams.N >= 0 ? _cachedParams.N : -1);
 		}
 		else if(dimString.equals("CHW")) {
-			ret = getNonNegative(ret, nonNegativeMultiply(params.C, params.H, params.W));
+			ret = getNonNegative(ret, nonNegativeMultiply(_cachedParams.C, _cachedParams.H, _cachedParams.W));
 		}
 		else if(dimString.equals("KPQ")) {
-			ret = getNonNegative(ret, nonNegativeMultiply(params.K, params.P, params.Q));
+			ret = getNonNegative(ret, nonNegativeMultiply(_cachedParams.K, _cachedParams.P, _cachedParams.Q));
 		}
 		else if(dimString.equals("PQ")) {
-			ret = getNonNegative(ret, nonNegativeMultiply(params.P, params.Q));
+			ret = getNonNegative(ret, nonNegativeMultiply(_cachedParams.P, _cachedParams.Q));
 		}
 		else if(dimString.equals("CPQ")) {
-			ret = getNonNegative(ret, nonNegativeMultiply(params.C, params.P, params.Q));
+			ret = getNonNegative(ret, nonNegativeMultiply(_cachedParams.C, _cachedParams.P, _cachedParams.Q));
 		}
 		else {
 			throw new RuntimeException("Unsupported dimension:" + dimString + " for operator " + getOp().name());
@@ -703,10 +821,10 @@ public class ConvolutionOp extends Hop  implements MultiThreadedHop
 		
 		if(LOG.isDebugEnabled() && ret < 0) {
 			LOG.debug("Unknown dimension " + dimString + " for ConvolutionOp:" + op.name() + 
-					" img_dim=[" + params.N + " " + params.C + " " + params.H + " " + params.W + "]" +
-					" filter_dim=[" + params.K + " " + params.C + " " + params.H + " " + params.W + "]" + 
-					" output_feature_map=[" + params.P + " " + params.Q + "] stride=[" + params.stride_h + " " + params.stride_w + "]" +
-					" pad=[" + params.pad_h + " " + params.pad_w + "]");
+					" img_dim=[" + _cachedParams.N + " " + _cachedParams.C + " " + _cachedParams.H + " " + _cachedParams.W + "]" +
+					" filter_dim=[" + _cachedParams.K + " " + _cachedParams.C + " " + _cachedParams.R + " " + _cachedParams.S + "]" + 
+					" output_feature_map=[" + _cachedParams.P + " " + _cachedParams.Q + "] stride=[" + _cachedParams.stride_h + " " + _cachedParams.stride_w + "]" +
+					" pad=[" + _cachedParams.pad_h + " " + _cachedParams.pad_w + "]");
 		}
 		return ret;
 	}


[28/50] [abbrv] systemml git commit: [SYSTEMML-1969] Support single-precision operations on GPU backend

Posted by re...@apache.org.
http://git-wip-us.apache.org/repos/asf/systemml/blob/abbffc55/src/main/java/org/apache/sysml/api/DMLScript.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/api/DMLScript.java b/src/main/java/org/apache/sysml/api/DMLScript.java
index ba447cf..4da874e 100644
--- a/src/main/java/org/apache/sysml/api/DMLScript.java
+++ b/src/main/java/org/apache/sysml/api/DMLScript.java
@@ -163,6 +163,7 @@ public class DMLScript
 	public static boolean           ENABLE_DEBUG_MODE   = DMLOptions.defaultOptions.debug;       // debug mode
 	public static ExplainType       EXPLAIN             = DMLOptions.defaultOptions.explainType; // explain type
 	public static String            DML_FILE_PATH_ANTLR_PARSER = DMLOptions.defaultOptions.filePath; // filename of dml/pydml script
+	public static String            FLOATING_POINT_PRECISION = "double"; 							// data type to use internally
 
 	/**
 	 * Global variable indicating the script type (DML or PYDML). Can be used

http://git-wip-us.apache.org/repos/asf/systemml/blob/abbffc55/src/main/java/org/apache/sysml/api/ScriptExecutorUtils.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/api/ScriptExecutorUtils.java b/src/main/java/org/apache/sysml/api/ScriptExecutorUtils.java
index a49ffda..51ab6a1 100644
--- a/src/main/java/org/apache/sysml/api/ScriptExecutorUtils.java
+++ b/src/main/java/org/apache/sysml/api/ScriptExecutorUtils.java
@@ -81,6 +81,10 @@ public class ScriptExecutorUtils {
 		DMLScript.SYNCHRONIZE_GPU = dmlconf.getBooleanValue(DMLConfig.SYNCHRONIZE_GPU);
 		DMLScript.EAGER_CUDA_FREE = dmlconf.getBooleanValue(DMLConfig.EAGER_CUDA_FREE);
 		DMLScript.STATISTICS_MAX_WRAP_LEN = dmlconf.getIntValue(DMLConfig.STATS_MAX_WRAP_LEN);
+		if(DMLScript.USE_ACCELERATOR) {
+			DMLScript.FLOATING_POINT_PRECISION = dmlconf.getTextValue(DMLConfig.FLOATING_POINT_PRECISION);
+			org.apache.sysml.runtime.matrix.data.LibMatrixCUDA.resetFloatingPointPrecision();
+		}
 
 		boolean exceptionThrown = false;
 

http://git-wip-us.apache.org/repos/asf/systemml/blob/abbffc55/src/main/java/org/apache/sysml/conf/DMLConfig.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/conf/DMLConfig.java b/src/main/java/org/apache/sysml/conf/DMLConfig.java
index 0b73ab0..e8bde56 100644
--- a/src/main/java/org/apache/sysml/conf/DMLConfig.java
+++ b/src/main/java/org/apache/sysml/conf/DMLConfig.java
@@ -92,6 +92,7 @@ public class DMLConfig
 	// Fraction of available memory to use. The available memory is computed when the GPUContext is created
 	// to handle the tradeoff on calling cudaMemGetInfo too often.
 	public static final String GPU_MEMORY_UTILIZATION_FACTOR = "sysml.gpu.memory.util.factor";
+	public static final String FLOATING_POINT_PRECISION = "sysml.floating.point.precision"; // String to specify the datatype to use internally: supported values are double, single
 
 	// supported prefixes for custom map/reduce configurations
 	public static final String PREFIX_MAPRED = "mapred";
@@ -139,6 +140,7 @@ public class DMLConfig
 		_defaultVals.put(AVAILABLE_GPUS,         "-1");
 		_defaultVals.put(SYNCHRONIZE_GPU,        "true" );
 		_defaultVals.put(EAGER_CUDA_FREE,        "false" );
+		_defaultVals.put(FLOATING_POINT_PRECISION,        	 "double" );
 	}
 	
 	public DMLConfig()
@@ -421,7 +423,7 @@ public class DMLConfig
 				COMPRESSED_LINALG, 
 				CODEGEN, CODEGEN_COMPILER, CODEGEN_OPTIMIZER, CODEGEN_PLANCACHE, CODEGEN_LITERALS,
 				EXTRA_GPU_STATS, EXTRA_DNN_STATS, EXTRA_FINEGRAINED_STATS, STATS_MAX_WRAP_LEN,
-				AVAILABLE_GPUS, SYNCHRONIZE_GPU, EAGER_CUDA_FREE
+				AVAILABLE_GPUS, SYNCHRONIZE_GPU, EAGER_CUDA_FREE, FLOATING_POINT_PRECISION
 		}; 
 		
 		StringBuilder sb = new StringBuilder();
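
As a usage note for the new option: the supported values are "double" (the default) and
"single", and per ScriptExecutorUtils above the value only takes effect when the GPU
backend is enabled. A small hedged sketch of reading it programmatically, assuming the
SystemML jar is on the classpath and using only the constructor and accessor shown in
the diffs above:

    import org.apache.sysml.conf.DMLConfig;

    // Sketch: inspect the configured floating-point precision (illustration only).
    public class PrecisionConfigExample {
        public static void main(String[] args) throws Exception {
            DMLConfig conf = new DMLConfig(); // default configuration values
            String prec = conf.getTextValue(DMLConfig.FLOATING_POINT_PRECISION);
            System.out.println("sysml.floating.point.precision = " + prec); // "double" unless overridden
        }
    }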

http://git-wip-us.apache.org/repos/asf/systemml/blob/abbffc55/src/main/java/org/apache/sysml/runtime/controlprogram/caching/CacheableData.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/controlprogram/caching/CacheableData.java b/src/main/java/org/apache/sysml/runtime/controlprogram/caching/CacheableData.java
index 5297e61..c7ffdb1 100644
--- a/src/main/java/org/apache/sysml/runtime/controlprogram/caching/CacheableData.java
+++ b/src/main/java/org/apache/sysml/runtime/controlprogram/caching/CacheableData.java
@@ -404,7 +404,7 @@ public abstract class CacheableData<T extends CacheBlock> extends Data
                 LOG.error("Inconsistent internal state - A copy of this CacheableData was dirty on more than 1 GPU");
                 throw new CacheException("Internal Error : Inconsistent internal state, A copy of this CacheableData was dirty on more than 1 GPU");
             } else if (gObj != null){
-                copiedFromGPU = gObj.acquireHostRead();
+                copiedFromGPU = gObj.acquireHostRead(null);
                 if( _data == null )
                     getCache();
             }
@@ -793,7 +793,7 @@ public abstract class CacheableData<T extends CacheBlock> extends Data
                 LOG.error("Inconsistent internal state - A copy of this CacheableData was dirty on more than 1 GPU");
                 throw new CacheException("Internal Error : Inconsistent internal state, A copy of this CacheableData was dirty on more than 1 GPU");
             } else if (gObj != null){
-                copiedFromGPU = gObj.acquireHostRead();
+                copiedFromGPU = gObj.acquireHostRead(null);
                 if( _data == null )
                     getCache();
             }

http://git-wip-us.apache.org/repos/asf/systemml/blob/abbffc55/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/CSRPointer.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/CSRPointer.java b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/CSRPointer.java
index 7176a9c..53f1a19 100644
--- a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/CSRPointer.java
+++ b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/CSRPointer.java
@@ -20,7 +20,6 @@
 package org.apache.sysml.runtime.instructions.gpu.context;
 
 import static jcuda.jcusparse.JCusparse.cusparseCreateMatDescr;
-import static jcuda.jcusparse.JCusparse.cusparseDcsr2dense;
 import static jcuda.jcusparse.JCusparse.cusparseSetMatIndexBase;
 import static jcuda.jcusparse.JCusparse.cusparseSetMatType;
 import static jcuda.jcusparse.JCusparse.cusparseSetPointerMode;
@@ -38,6 +37,7 @@ import org.apache.commons.logging.LogFactory;
 import org.apache.sysml.api.DMLScript;
 import org.apache.sysml.runtime.DMLRuntimeException;
 import org.apache.sysml.runtime.instructions.gpu.GPUInstruction;
+import org.apache.sysml.runtime.matrix.data.LibMatrixCUDA;
 import org.apache.sysml.utils.GPUStatistics;
 import org.apache.sysml.utils.Statistics;
 
@@ -112,8 +112,8 @@ public class CSRPointer {
 		allocateMatDescrPointer();
 	}
 
-	private static long getDoubleSizeOf(long numElems) {
-		return numElems * ((long) jcuda.Sizeof.DOUBLE);
+	private static long getDataTypeSizeOf(long numElems) {
+		return numElems * ((long) LibMatrixCUDA.sizeOfDataType);
 	}
 
 	//  private Pointer allocate(String instName, long size) throws DMLRuntimeException {
@@ -121,7 +121,7 @@ public class CSRPointer {
 	//  }
 
 	private static long getIntSizeOf(long numElems) {
-		return numElems * ((long) jcuda.Sizeof.INT);
+		return numElems * ((long) Sizeof.INT);
 	}
 
 	//  private void cudaFreeHelper(Pointer toFree) throws DMLRuntimeException {
@@ -163,7 +163,7 @@ public class CSRPointer {
 	 * @return size estimate
 	 */
 	public static long estimateSize(long nnz2, long rows) {
-		long sizeofValArray = getDoubleSizeOf(nnz2);
+		long sizeofValArray = getDataTypeSizeOf(nnz2);
 		long sizeofRowPtrArray = getIntSizeOf(rows + 1);
 		long sizeofColIndArray = getIntSizeOf(nnz2);
 		long sizeofDescr = getIntSizeOf(4);
@@ -181,6 +181,7 @@ public class CSRPointer {
 	/**
 	 * Static method to copy a CSR sparse matrix from Host to Device
 	 *
+	 * @param gCtx GPUContext
 	 * @param dest   [input] destination location (on GPU)
 	 * @param rows   number of rows
 	 * @param nnz    number of non-zeroes
@@ -189,7 +190,7 @@ public class CSRPointer {
 	 * @param values double array of non zero values
 	 * @throws DMLRuntimeException if error occurs
 	 */
-	public static void copyToDevice(CSRPointer dest, int rows, long nnz, int[] rowPtr, int[] colInd, double[] values) throws DMLRuntimeException {
+	public static void copyToDevice(GPUContext gCtx, CSRPointer dest, int rows, long nnz, int[] rowPtr, int[] colInd, double[] values) throws DMLRuntimeException {
 		CSRPointer r = dest;
 		long t0 = 0;
 		if (DMLScript.STATISTICS)
@@ -200,15 +201,15 @@ public class CSRPointer {
 		if(rowPtr.length < rows + 1) throw new DMLRuntimeException("The length of rowPtr needs to be greater than or equal to " + (rows + 1));
 		if(colInd.length < nnz) throw new DMLRuntimeException("The length of colInd needs to be greater than or equal to " + nnz);
 		if(values.length < nnz) throw new DMLRuntimeException("The length of values needs to be greater than or equal to " + nnz);
+		LibMatrixCUDA.cudaSupportFunctions.hostToDevice(gCtx, values, r.val, null);
 		cudaMemcpy(r.rowPtr, Pointer.to(rowPtr), getIntSizeOf(rows + 1), cudaMemcpyHostToDevice);
 		cudaMemcpy(r.colInd, Pointer.to(colInd), getIntSizeOf(nnz), cudaMemcpyHostToDevice);
-		cudaMemcpy(r.val, Pointer.to(values), getDoubleSizeOf(nnz), cudaMemcpyHostToDevice);
 		if (DMLScript.STATISTICS)
 			GPUStatistics.cudaToDevTime.add(System.nanoTime() - t0);
 		if (DMLScript.STATISTICS)
 			GPUStatistics.cudaToDevCount.add(3);
 	}
-
+	
 	/**
 	 * Static method to copy a CSR sparse matrix from Device to host
 	 *
@@ -217,20 +218,12 @@ public class CSRPointer {
 	 * @param nnz    [input] number of non-zeroes
 	 * @param rowPtr [output] pre-allocated integer array of row pointers of size (rows+1)
 	 * @param colInd [output] pre-allocated integer array of column indices of size nnz
-	 * @param values [output] pre-allocated double array of values of size nnz
+	 * @throws DMLRuntimeException if error
 	 */
-	public static void copyToHost(CSRPointer src, int rows, long nnz, int[] rowPtr, int[] colInd, double[] values) {
+	public static void copyPtrToHost(CSRPointer src, int rows, long nnz, int[] rowPtr, int[] colInd) throws DMLRuntimeException {
 		CSRPointer r = src;
-		long t0 = 0;
-		if (DMLScript.STATISTICS)
-			t0 = System.nanoTime();
 		cudaMemcpy(Pointer.to(rowPtr), r.rowPtr, getIntSizeOf(rows + 1), cudaMemcpyDeviceToHost);
 		cudaMemcpy(Pointer.to(colInd), r.colInd, getIntSizeOf(nnz), cudaMemcpyDeviceToHost);
-		cudaMemcpy(Pointer.to(values), r.val, getDoubleSizeOf(nnz), cudaMemcpyDeviceToHost);
-		if (DMLScript.STATISTICS)
-			GPUStatistics.cudaFromDevTime.add(System.nanoTime() - t0);
-		if (DMLScript.STATISTICS)
-			GPUStatistics.cudaFromDevCount.add(3);
 	}
 
 	/**
@@ -305,9 +298,9 @@ public class CSRPointer {
 			// with no memory allocated on the GPU.
 			return r;
 		}
-		gCtx.ensureFreeSpace(getDoubleSizeOf(nnz2) + getIntSizeOf(rows + 1) + getIntSizeOf(nnz2));
+		gCtx.ensureFreeSpace(getDataTypeSizeOf(nnz2) + getIntSizeOf(rows + 1) + getIntSizeOf(nnz2));
 		// increment the cudaCount by 1 for the allocation of all 3 arrays
-		r.val = gCtx.allocate(null, getDoubleSizeOf(nnz2));
+		r.val = gCtx.allocate(null, getDataTypeSizeOf(nnz2));
 		r.rowPtr = gCtx.allocate(null, getIntSizeOf(rows + 1));
 		r.colInd = gCtx.allocate(null, getIntSizeOf(nnz2));
 		return r;
@@ -410,7 +403,7 @@ public class CSRPointer {
 			throws DMLRuntimeException {
 		LOG.trace("GPU : step3AllocateValNInd" + ", GPUContext=" + gCtx);
 		// Increment cudaCount by one when all three arrays of CSR sparse array are allocated
-		C.val = gCtx.allocate(null, getDoubleSizeOf(C.nnz));
+		C.val = gCtx.allocate(null, getDataTypeSizeOf(C.nnz));
 		C.colInd = gCtx.allocate(null, getIntSizeOf(C.nnz));
 	}
 
@@ -441,13 +434,14 @@ public class CSRPointer {
 		that.gpuContext.ensureFreeSpace(totalSize);
 
 		that.nnz = me.nnz;
-		that.val = allocate(that.nnz * Sizeof.DOUBLE);
-		that.rowPtr = allocate(rows * Sizeof.DOUBLE);
-		that.colInd = allocate(that.nnz * Sizeof.DOUBLE);
+		that.val = allocate(that.nnz * LibMatrixCUDA.sizeOfDataType);
+		// TODO: Nakul ... can you please double-check whether the below was a bug or intentional ?
+		that.rowPtr = allocate(rows * Sizeof.INT);
+		that.colInd = allocate(that.nnz * Sizeof.INT);
 
-		cudaMemcpy(that.val, me.val, that.nnz * Sizeof.DOUBLE, cudaMemcpyDeviceToDevice);
-		cudaMemcpy(that.rowPtr, me.rowPtr, rows * Sizeof.DOUBLE, cudaMemcpyDeviceToDevice);
-		cudaMemcpy(that.colInd, me.colInd, that.nnz * Sizeof.DOUBLE, cudaMemcpyDeviceToDevice);
+		cudaMemcpy(that.val, me.val, that.nnz * LibMatrixCUDA.sizeOfDataType, cudaMemcpyDeviceToDevice);
+		cudaMemcpy(that.rowPtr, me.rowPtr, rows * Sizeof.INT, cudaMemcpyDeviceToDevice);
+		cudaMemcpy(that.colInd, me.colInd, that.nnz * Sizeof.INT, cudaMemcpyDeviceToDevice);
 
 		return that;
 	}
@@ -506,12 +500,12 @@ public class CSRPointer {
 		long t0 = GPUStatistics.DISPLAY_STATISTICS && instName != null ? System.nanoTime() : 0;
 		LOG.trace("GPU : sparse -> column major dense (inside CSRPointer) on " + this + ", GPUContext="
 				+ getGPUContext());
-		long size = ((long) rows) * getDoubleSizeOf((long) cols);
+		long size = ((long) rows) * getDataTypeSizeOf((long) cols);
 		Pointer A = allocate(size);
 		// If this sparse block is empty, the allocated dense matrix, initialized to zeroes, will be returned.
 		if (val != null && rowPtr != null && colInd != null && nnz > 0) {
 			// Note: cusparseDcsr2dense method cannot handle empty blocks
-			cusparseDcsr2dense(cusparseHandle, rows, cols, descr, val, rowPtr, colInd, A, rows);
+			LibMatrixCUDA.cudaSupportFunctions.cusparsecsr2dense(cusparseHandle, rows, cols, descr, val, rowPtr, colInd, A, rows);
 			//cudaDeviceSynchronize;
 		} else {
 			LOG.debug("in CSRPointer, the values array, row pointers array or column indices array was null");

http://git-wip-us.apache.org/repos/asf/systemml/blob/abbffc55/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUContext.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUContext.java b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUContext.java
index 55cb95f..dd776bc 100644
--- a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUContext.java
+++ b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUContext.java
@@ -24,8 +24,6 @@ import static jcuda.jcudnn.JCudnn.cudnnCreate;
 import static jcuda.jcudnn.JCudnn.cudnnDestroy;
 import static jcuda.jcusolver.JCusolverDn.cusolverDnCreate;
 import static jcuda.jcusolver.JCusolverDn.cusolverDnDestroy;
-import static jcuda.jcusolver.JCusolverSp.cusolverSpCreate;
-import static jcuda.jcusolver.JCusolverSp.cusolverSpDestroy;
 import static jcuda.jcusparse.JCusparse.cusparseCreate;
 import static jcuda.jcusparse.JCusparse.cusparseDestroy;
 import static jcuda.runtime.JCuda.cudaDeviceScheduleBlockingSync;
@@ -63,7 +61,6 @@ import jcuda.Pointer;
 import jcuda.jcublas.cublasHandle;
 import jcuda.jcudnn.cudnnHandle;
 import jcuda.jcusolver.cusolverDnHandle;
-import jcuda.jcusolver.cusolverSpHandle;
 import jcuda.jcusparse.cusparseHandle;
 import jcuda.runtime.JCuda;
 import jcuda.runtime.cudaDeviceProp;
@@ -107,10 +104,6 @@ public class GPUContext {
 	 */
 	private cusolverDnHandle cusolverDnHandle;
 	/**
-	 * cusolverSpHandle for invoking solve() function on sparse matrices on the GPU
-	 */
-	private cusolverSpHandle cusolverSpHandle;
-	/**
 	 * to launch custom CUDA kernel, specific to the active GPU for this GPUContext
 	 */
 	private JCudaKernels kernels;
@@ -233,12 +226,7 @@ public class GPUContext {
 			cusolverDnHandle = new cusolverDnHandle();
 			cusolverDnCreate(cusolverDnHandle);
 		}
-
-		if (cusolverSpHandle == null) {
-			cusolverSpHandle = new cusolverSpHandle();
-			cusolverSpCreate(cusolverSpHandle);
-		}
-
+		
 		if (kernels == null) {
 			kernels = new JCudaKernels();
 		}
@@ -578,7 +566,7 @@ public class GPUContext {
 								+ "). Allocated GPU objects:" + allocatedGPUObjects.toString());
 			}
 			if (toBeRemoved.dirty) {
-				toBeRemoved.copyFromDeviceToHost();
+				toBeRemoved.copyFromDeviceToHost(instructionName);
 			}
 			toBeRemoved.clearData(true);
 		}
@@ -754,15 +742,6 @@ public class GPUContext {
 	}
 
 	/**
-	 * Returns cusolverSpHandle for invoking solve() function on sparse matrices on the GPU.
-	 *
-	 * @return cusolverSpHandle for current thread
-	 */
-	public cusolverSpHandle getCusolverSpHandle() {
-		return cusolverSpHandle;
-	}
-
-	/**
 	 * Returns utility class used to launch custom CUDA kernel, specific to the active GPU for this GPUContext.
 	 *
 	 * @return {@link JCudaKernels} for current thread
@@ -801,14 +780,10 @@ public class GPUContext {
 		if (cusolverDnHandle != null)
 			cusolverDnDestroy(cusolverDnHandle);
 
-		if (cusolverSpHandle != null)
-			cusolverSpDestroy(cusolverSpHandle);
-
 		cudnnHandle = null;
 		cublasHandle = null;
 		cusparseHandle = null;
 		cusolverDnHandle = null;
-		cusolverSpHandle = null;
 	}
 
 	/**
@@ -827,7 +802,7 @@ public class GPUContext {
 			if (o.isDirty()) {
 				LOG.warn("Attempted to free GPU Memory when a block[" + o
 						+ "] is still on GPU memory, copying it back to host.");
-				o.acquireHostRead();
+				o.acquireHostRead(null);
 			}
 			o.clearData(true);
 		}

http://git-wip-us.apache.org/repos/asf/systemml/blob/abbffc55/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUObject.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUObject.java b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUObject.java
index 06327db..35dfd58 100644
--- a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUObject.java
+++ b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUObject.java
@@ -19,14 +19,10 @@
 package org.apache.sysml.runtime.instructions.gpu.context;
 
 import static jcuda.jcublas.cublasOperation.CUBLAS_OP_T;
-import static jcuda.jcusparse.JCusparse.cusparseDdense2csr;
-import static jcuda.jcusparse.JCusparse.cusparseDnnz;
 import static jcuda.runtime.JCuda.cudaMemcpy;
 import static jcuda.runtime.JCuda.cudaMemset;
 import static jcuda.runtime.cudaMemcpyKind.cudaMemcpyDeviceToDevice;
 import static jcuda.runtime.cudaMemcpyKind.cudaMemcpyDeviceToHost;
-import static jcuda.runtime.cudaMemcpyKind.cudaMemcpyHostToDevice;
-
 import java.util.concurrent.atomic.AtomicLong;
 import java.util.concurrent.atomic.LongAdder;
 
@@ -47,9 +43,6 @@ import org.apache.sysml.runtime.matrix.data.SparseBlockMCSR;
 import org.apache.sysml.utils.GPUStatistics;
 
 import jcuda.Pointer;
-import jcuda.Sizeof;
-import jcuda.jcublas.JCublas2;
-import jcuda.jcusparse.JCusparse;
 import jcuda.jcusparse.cusparseDirection;
 import jcuda.jcusparse.cusparseHandle;
 import jcuda.jcusparse.cusparseMatDescr;
@@ -126,7 +119,7 @@ public class GPUObject {
 			if (me.jcudaDenseMatrixPtr != null) {
 				long rows = me.mat.getNumRows();
 				long cols = me.mat.getNumColumns();
-				long size = rows * cols * Sizeof.DOUBLE;
+				long size = rows * cols * LibMatrixCUDA.sizeOfDataType;
 				me.gpuContext.ensureFreeSpace((int) size);
 				that.jcudaDenseMatrixPtr = allocate(size);
 				cudaMemcpy(that.jcudaDenseMatrixPtr, me.jcudaDenseMatrixPtr, size, cudaMemcpyDeviceToDevice);
@@ -181,13 +174,13 @@ public class GPUObject {
 		if(LOG.isTraceEnabled()) {
 			LOG.trace("GPU : transpose of block of size [" + m + "," + n + "]" + ", GPUContext=" + gCtx);
 		}
-		Pointer alpha = Pointer.to(new double[] { 1.0 });
-		Pointer beta = Pointer.to(new double[] { 0.0 });
+		Pointer alpha = LibMatrixCUDA.one();
+		Pointer beta = LibMatrixCUDA.zero();
 		Pointer A = densePtr;
-		Pointer C = gCtx.allocate(((long) m) * getDoubleSizeOf(n));
+		Pointer C = gCtx.allocate(((long) m) * getDatatypeSizeOf(n));
 
 		// Transpose the matrix to get a dense matrix
-		JCublas2.cublasDgeam(gCtx.getCublasHandle(), CUBLAS_OP_T, CUBLAS_OP_T, m, n, alpha, A, lda, beta, new Pointer(),
+		LibMatrixCUDA.cudaSupportFunctions.cublasgeam(gCtx.getCublasHandle(), CUBLAS_OP_T, CUBLAS_OP_T, m, n, alpha, A, lda, beta, new Pointer(),
 				lda, C, ldc);
 		return C;
 	}
@@ -217,7 +210,7 @@ public class GPUObject {
 		nnzTotalDevHostPtr = gCtx.allocate(getIntSizeOf(1));
 
 		// Output is in dense vector format, convert it to CSR
-		cusparseDnnz(cusparseHandle, cusparseDirection.CUSPARSE_DIRECTION_ROW, rows, cols, matDescr, densePtr, rows,
+		LibMatrixCUDA.cudaSupportFunctions.cusparsennz(cusparseHandle, cusparseDirection.CUSPARSE_DIRECTION_ROW, rows, cols, matDescr, densePtr, rows,
 				nnzPerRowPtr, nnzTotalDevHostPtr);
 		//cudaDeviceSynchronize();
 		int[] nnzC = { -1 };
@@ -241,7 +234,7 @@ public class GPUObject {
 		}
 
 		CSRPointer C = CSRPointer.allocateEmpty(gCtx, nnzC[0], rows);
-		cusparseDdense2csr(cusparseHandle, rows, cols, matDescr, densePtr, rows, nnzPerRowPtr, C.val, C.rowPtr,
+		LibMatrixCUDA.cudaSupportFunctions.cusparsedense2csr(cusparseHandle, rows, cols, matDescr, densePtr, rows, nnzPerRowPtr, C.val, C.rowPtr,
 				C.colInd);
 		//cudaDeviceSynchronize();
 
@@ -252,31 +245,6 @@ public class GPUObject {
 	}
 
 	/**
-	 * Gets the double array from GPU memory onto host memory and returns string.
-	 *
-	 * @param A    Pointer to memory on device (GPU), assumed to point to a double array
-	 * @param rows rows in matrix A
-	 * @param cols columns in matrix A
-	 * @return the debug string
-	 * @throws DMLRuntimeException if DMLRuntimeException occurs
-	 */
-	public static String debugString(Pointer A, long rows, long cols) throws DMLRuntimeException {
-		StringBuffer sb = new StringBuffer();
-		int len = toIntExact(rows * cols);
-		double[] tmp = new double[len];
-		cudaMemcpy(Pointer.to(tmp), A, getDoubleSizeOf(len), cudaMemcpyDeviceToHost);
-		int k = 0;
-		for (int i = 0; i < rows; i++) {
-			for (int j = 0; j < cols; j++) {
-				sb.append(tmp[k]).append(' ');
-				k++;
-			}
-			sb.append('\n');
-		}
-		return sb.toString();
-	}
-
-	/**
 	 * Convenience method to directly examine the Sparse matrix on GPU
 	 *
 	 * @return CSR (compressed sparse row) pointer
@@ -287,7 +255,7 @@ public class GPUObject {
 
 	/**
 	 * Convenience method to directly set the sparse matrix on GPU
-	 * Needed for operations like {@link JCusparse#cusparseDcsrgemm(cusparseHandle, int, int, int, int, int, cusparseMatDescr, int, Pointer, Pointer, Pointer, cusparseMatDescr, int, Pointer, Pointer, Pointer, cusparseMatDescr, Pointer, Pointer, Pointer)}
+	 * Needed for operations like cusparseDcsrgemm(cusparseHandle, int, int, int, int, int, cusparseMatDescr, int, Pointer, Pointer, Pointer, cusparseMatDescr, int, Pointer, Pointer, Pointer, cusparseMatDescr, Pointer, Pointer, Pointer)
 	 *
 	 * @param sparseMatrixPtr CSR (compressed sparse row) pointer
 	 * @throws DMLRuntimeException ?
@@ -475,8 +443,8 @@ public class GPUObject {
 		return isSparse;
 	}
 	
-	private static long getDoubleSizeOf(long numElems) {
-		return numElems * ((long) jcuda.Sizeof.DOUBLE);
+	private static long getDatatypeSizeOf(long numElems) {
+		return numElems * LibMatrixCUDA.sizeOfDataType;
 	}
 
 	private static long getIntSizeOf(long numElems) {
@@ -524,7 +492,7 @@ public class GPUObject {
 		long rows = mat.getNumRows();
 		long cols = mat.getNumColumns();
 		int numElems = toIntExact(rows * cols);
-		long size = getDoubleSizeOf(numElems);
+		long size = getDatatypeSizeOf(numElems);
 		setDenseMatrixCudaPointer(allocate(size));
 		// The "fill" kernel is called which treats the matrix "jcudaDensePtr" like a vector and fills it with value "v"
 		// If the fill value is 0, no need to call the special kernel, the allocate memsets the allocated region to 0
@@ -609,10 +577,11 @@ public class GPUObject {
 	/**
 	 * if the data is allocated on the GPU and is dirty, it is copied back to the host memory
 	 *
+	 * @param instName name of the instruction
 	 * @return true if a copy to host happened, false otherwise
 	 * @throws CacheException ?
 	 */
-	public boolean acquireHostRead() throws CacheException {
+	public boolean acquireHostRead(String instName) throws CacheException {
 		boolean copied = false;
 		try {
 			if(LOG.isTraceEnabled()) {
@@ -623,7 +592,7 @@ public class GPUObject {
 					LOG.trace("GPU : data is dirty on device, copying to host, on " + this + ", GPUContext="
 						+ getGPUContext());
 				}
-				copyFromDeviceToHost();
+				copyFromDeviceToHost(instName);
 				copied = true;
 			}
 		} catch (DMLRuntimeException e) {
@@ -728,7 +697,7 @@ public class GPUObject {
 			throw new DMLRuntimeException("Internal error - invalid number of rows when allocating dense matrix");
 		if(cols <= 0)
 			throw new DMLRuntimeException("Internal error - invalid number of columns when allocating dense matrix;");
-		long size = getDoubleSizeOf(rows * cols);
+		long size = getDatatypeSizeOf(rows * cols);
 		Pointer tmp = allocate(size);
 		setDenseMatrixCudaPointer(tmp);
 	}
@@ -774,7 +743,7 @@ public class GPUObject {
 		if (LibMatrixCUDA.isInSparseFormat(getGPUContext(), mat)) {
 			GPUSize = CSRPointer.estimateSize(nnz, rlen);
 		} else {
-			GPUSize = getDoubleSizeOf(rlen * clen);
+			GPUSize = getDatatypeSizeOf(rlen * clen);
 		}
 		return GPUSize;
 	}
@@ -858,7 +827,7 @@ public class GPUObject {
 
 			if (copyToDevice) {
 				long t1 = GPUStatistics.DISPLAY_STATISTICS ? System.nanoTime() : 0;
-				CSRPointer.copyToDevice(getJcudaSparseMatrixPtr(), tmp.getNumRows(), tmp.getNonZeros(), rowPtr, colInd,
+				CSRPointer.copyToDevice(getGPUContext(), getJcudaSparseMatrixPtr(), tmp.getNumRows(), tmp.getNonZeros(), rowPtr, colInd,
 						values);
 				if(GPUStatistics.DISPLAY_STATISTICS) 
 					GPUStatistics.maintainCPMiscTimes(opcode, GPUInstruction.MISC_TIMER_HOST_TO_DEVICE, System.nanoTime() - t1);
@@ -877,18 +846,14 @@ public class GPUObject {
 				// Minor optimization: No need to allocate empty error for CPU 
 				// data = new double[tmp.getNumRows() * tmp.getNumColumns()];
 				long t1 = GPUStatistics.DISPLAY_STATISTICS ? System.nanoTime() : 0;
-				cudaMemset(getJcudaDenseMatrixPtr(), 0, getDoubleSizeOf(mat.getNumRows() * mat.getNumColumns()));
+				cudaMemset(getJcudaDenseMatrixPtr(), 0, getDatatypeSizeOf(mat.getNumRows() * mat.getNumColumns()));
 				if(GPUStatistics.DISPLAY_STATISTICS) 
 					GPUStatistics.maintainCPMiscTimes(opcode, GPUInstruction.MISC_TIMER_SET_ZERO, System.nanoTime() - t1);
 			}
 			else {
 				// Copy dense block
 				// H2D now only measures the time taken to do 
-				long t1 = GPUStatistics.DISPLAY_STATISTICS ? System.nanoTime() : 0;
-				cudaMemcpy(getJcudaDenseMatrixPtr(), Pointer.to(data),
-						getDoubleSizeOf(mat.getNumRows() * mat.getNumColumns()), cudaMemcpyHostToDevice);
-				if(GPUStatistics.DISPLAY_STATISTICS) 
-					GPUStatistics.maintainCPMiscTimes(opcode, GPUInstruction.MISC_TIMER_HOST_TO_DEVICE, System.nanoTime() - t1);
+				LibMatrixCUDA.cudaSupportFunctions.hostToDevice(getGPUContext(), data, getJcudaDenseMatrixPtr(), opcode);
 			}
 		}
 
@@ -907,7 +872,7 @@ public class GPUObject {
 		return (int) l;
 	}
 
-	protected void copyFromDeviceToHost() throws DMLRuntimeException {
+	protected void copyFromDeviceToHost(String instName) throws DMLRuntimeException {
 		if(LOG.isTraceEnabled()) {
 			LOG.trace("GPU : copyFromDeviceToHost, on " + this + ", GPUContext=" + getGPUContext());
 		}
@@ -921,11 +886,7 @@ public class GPUObject {
 				start = System.nanoTime();
 			MatrixBlock tmp = new MatrixBlock(toIntExact(mat.getNumRows()), toIntExact(mat.getNumColumns()), false);
 			tmp.allocateDenseBlock();
-			double[] data = tmp.getDenseBlock();
-
-			cudaMemcpy(Pointer.to(data), getJcudaDenseMatrixPtr(), getDoubleSizeOf(data.length),
-					cudaMemcpyDeviceToHost);
-
+			LibMatrixCUDA.cudaSupportFunctions.deviceToHost(getGPUContext(), getJcudaDenseMatrixPtr(), tmp.getDenseBlock(), instName);
 			tmp.recomputeNonZeros();
 			mat.acquireModify(tmp);
 			mat.release();
@@ -951,10 +912,16 @@ public class GPUObject {
 				int rows = toIntExact(mat.getNumRows());
 				int cols = toIntExact(mat.getNumColumns());
 				int nnz = toIntExact(getJcudaSparseMatrixPtr().nnz);
+				double[] values = new double[nnz];
+				LibMatrixCUDA.cudaSupportFunctions.deviceToHost(getGPUContext(), getJcudaSparseMatrixPtr().val, values, instName);
 				int[] rowPtr = new int[rows + 1];
 				int[] colInd = new int[nnz];
-				double[] values = new double[nnz];
-				CSRPointer.copyToHost(getJcudaSparseMatrixPtr(), rows, nnz, rowPtr, colInd, values);
+				long t0 = DMLScript.STATISTICS ? System.nanoTime() : 0;
+				CSRPointer.copyPtrToHost(getJcudaSparseMatrixPtr(), rows, nnz, rowPtr, colInd);
+				if (DMLScript.STATISTICS)
+					GPUStatistics.cudaFromDevTime.add(System.nanoTime() - t0);
+				if (DMLScript.STATISTICS)
+					GPUStatistics.cudaFromDevCount.add(3);
 
 				SparseBlockCSR sparseBlock = new SparseBlockCSR(rowPtr, colInd, values, nnz);
 				MatrixBlock tmp = new MatrixBlock(rows, cols, nnz, sparseBlock);

http://git-wip-us.apache.org/repos/asf/systemml/blob/abbffc55/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/JCudaKernels.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/JCudaKernels.java b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/JCudaKernels.java
index e1894ae..d22110d 100644
--- a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/JCudaKernels.java
+++ b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/JCudaKernels.java
@@ -29,6 +29,7 @@ import java.util.HashMap;
 
 import org.apache.sysml.runtime.DMLRuntimeException;
 import org.apache.sysml.runtime.io.IOUtilFunctions;
+import org.apache.sysml.runtime.matrix.data.LibMatrixCUDA;
 
 import jcuda.Pointer;
 import jcuda.driver.CUfunction;
@@ -72,11 +73,17 @@ public class JCudaKernels {
 	 * @throws DMLRuntimeException if DMLRuntimeException occurs
 	 */
 	public void launchKernel(String name, ExecutionConfig config, Object... arguments) throws DMLRuntimeException {
+		name = name + LibMatrixCUDA.customKernelSuffix;
 		CUfunction function = kernels.get(name);
+		
 		if (function == null) {
 			// caching functions into hashmap reduces the lookup overhead
 			function = new CUfunction();
-			checkResult(cuModuleGetFunction(function, module, name));
+			try {
+				checkResult(cuModuleGetFunction(function, module, name));
+			} catch(jcuda.CudaException e) {
+				throw new DMLRuntimeException("Error finding the custom kernel:" + name, e);
+			}
 		}
 
 		// Setup parameters
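
To make the kernel-name suffixing above concrete: per the design notes in
CudaSupportFunctions further below, each templatized kernel is compiled once per
datatype and exposed with a 'd' (double) or 'f' (float) suffix, which launchKernel
appends for the active precision. A minimal illustrative sketch (hypothetical helper,
not the SystemML API):

    // Illustration of the d/f kernel-name suffix convention used by the GPU backend.
    public class KernelNameSuffixExample {
        static String suffixFor(String precision) {
            return ("single".equalsIgnoreCase(precision) || "float".equalsIgnoreCase(precision)) ? "f" : "d";
        }
        public static void main(String[] args) {
            System.out.println("matrix_atan" + suffixFor("double")); // matrix_atand
            System.out.println("matrix_atan" + suffixFor("single")); // matrix_atanf
        }
    }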

http://git-wip-us.apache.org/repos/asf/systemml/blob/abbffc55/src/main/java/org/apache/sysml/runtime/matrix/data/CudaSupportFunctions.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/matrix/data/CudaSupportFunctions.java b/src/main/java/org/apache/sysml/runtime/matrix/data/CudaSupportFunctions.java
new file mode 100644
index 0000000..2b6c039
--- /dev/null
+++ b/src/main/java/org/apache/sysml/runtime/matrix/data/CudaSupportFunctions.java
@@ -0,0 +1,87 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ * 
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.sysml.runtime.matrix.data;
+
+import org.apache.sysml.runtime.DMLRuntimeException;
+import org.apache.sysml.runtime.instructions.gpu.context.GPUContext;
+
+import jcuda.jcublas.cublasHandle;
+import jcuda.jcusolver.cusolverDnHandle;
+import jcuda.jcusparse.cusparseHandle;
+import jcuda.jcusparse.cusparseMatDescr;
+import jcuda.Pointer;
+
+/**
+ * DESIGN DOCUMENTATION FOR SUPPORTING LOWER PRECISION:
+ * 1. SystemML.cu has been templatized in the following way to support different datatypes:
+ * - Similar to CuBLAS and CuSPARSE, the global kernels have the datatype specification in their name (for example: f for float
+ * and d for double). But unlike CuBLAS and CuSPARSE, these are suffixes so as to simplify the engine.
+ * - The global kernels with datatype specification invoke a corresponding templatized kernel (without suffix) which contains the core logic.
+ * - The suffixes are added in JCudaKernels's launchKernel method before invocation.
+ * For example:
+ * <code>
+ * template &lt; typename T &gt;
+ * __device__ void matrix_atan(T *A, T *C, unsigned int size) {
+ *     int index = blockIdx.x * blockDim.x + threadIdx.x;
+ *     if (index &lt; size){
+ *         C[index] = atan(A[index]);
+ *     }
+ * }
+ * extern "C" __global__ void matrix_atand(double *A, double *C, unsigned int size) {
+ * 	matrix_atan(A, C, size);
+ * }
+ * extern "C" __global__ void matrix_atanf(float *A, float *C, unsigned int size) {
+ * 	matrix_atan(A, C, size);
+ * } 
+ * </code>
+ * 
+ * 2. The CUDA library calls (such as CuBLAS, CuSPARSE, etc) go through this interface.
+ * The naming and parameters of the methods in this interface are consistent with those of the CUDA libraries to simplify development.
+ * 
+ * 3. During SystemML initialization, the appropriate class implementing the CudaSupportFunctions interface is selected based on the configuration property sysml.floating.point.precision.
+ */
+public interface CudaSupportFunctions {
+	public static boolean PERFORM_CONVERSION_ON_DEVICE = true;
+	public int cusparsecsrgemm(cusparseHandle handle, int transA, int transB, int m, int n, int k, 
+			cusparseMatDescr descrA, int nnzA, Pointer csrValA, Pointer csrRowPtrA, Pointer csrColIndA, 
+			cusparseMatDescr descrB, int nnzB, Pointer csrValB, Pointer csrRowPtrB, Pointer csrColIndB, 
+			cusparseMatDescr descrC, Pointer csrValC, Pointer csrRowPtrC, Pointer csrColIndC);
+	public int	cublasgeam(cublasHandle handle, int transa, int transb, int m, int n, jcuda.Pointer alpha, jcuda.Pointer A, 
+			int lda, jcuda.Pointer beta, jcuda.Pointer B, int ldb, jcuda.Pointer C, int ldc);
+	public int	cusparsecsrmv(cusparseHandle handle, int transA, int m, int n, int nnz, jcuda.Pointer alpha, cusparseMatDescr descrA, jcuda.Pointer csrValA, jcuda.Pointer csrRowPtrA, jcuda.Pointer csrColIndA, 
+			jcuda.Pointer x, jcuda.Pointer beta, jcuda.Pointer y);
+	public int	cusparsecsrmm2(cusparseHandle handle, int transa, int transb, int m, int n, int k, int nnz, jcuda.Pointer alpha, cusparseMatDescr descrA, jcuda.Pointer csrValA, jcuda.Pointer csrRowPtrA, jcuda.Pointer csrColIndA, 
+			jcuda.Pointer B, int ldb, jcuda.Pointer beta, jcuda.Pointer C, int ldc);
+	public int cublasdot(cublasHandle handle, int n, jcuda.Pointer x, int incx, jcuda.Pointer y, int incy, jcuda.Pointer result);
+	public int cublasgemv(cublasHandle handle, int trans, int m, int n, jcuda.Pointer alpha, jcuda.Pointer A, int lda, jcuda.Pointer x, int incx, jcuda.Pointer beta, jcuda.Pointer y, int incy);
+	public int cublasgemm(cublasHandle handle, int transa, int transb, int m, int n, int k, jcuda.Pointer alpha, jcuda.Pointer A, int lda, jcuda.Pointer B, int ldb, jcuda.Pointer beta, jcuda.Pointer C, int ldc);
+	public int cusparsecsr2csc(cusparseHandle handle, int m, int n, int nnz, jcuda.Pointer csrVal, jcuda.Pointer csrRowPtr, jcuda.Pointer csrColInd, jcuda.Pointer cscVal, jcuda.Pointer cscRowInd, jcuda.Pointer cscColPtr, int copyValues, int idxBase);
+	public int cublassyrk(cublasHandle handle, int uplo, int trans, int n, int k, jcuda.Pointer alpha, jcuda.Pointer A, int lda, jcuda.Pointer beta, jcuda.Pointer C, int ldc);
+	public int cublasaxpy(cublasHandle handle, int n, jcuda.Pointer alpha, jcuda.Pointer x, int incx, jcuda.Pointer y, int incy);
+	public int cublastrsm(cublasHandle handle, int side, int uplo, int trans, int diag, int m, int n, jcuda.Pointer alpha, jcuda.Pointer A, int lda, jcuda.Pointer B, int ldb);
+	public int cusolverDngeqrf_bufferSize(cusolverDnHandle handle, int m, int n, Pointer A, int lda, int[] Lwork);
+	public int cusolverDngeqrf(cusolverDnHandle handle, int m, int n, Pointer A, int lda, Pointer TAU, Pointer Workspace, int Lwork, Pointer devInfo);
+	public int cusolverDnormqr(cusolverDnHandle handle, int side, int trans, int m, int n, int k, Pointer A, int lda, Pointer tau, Pointer C, int ldc, Pointer work, int lwork, Pointer devInfo);
+	public int cusparsecsrgeam(cusparseHandle handle, int m, int n, jcuda.Pointer alpha, cusparseMatDescr descrA, int nnzA, jcuda.Pointer csrValA, jcuda.Pointer csrRowPtrA, jcuda.Pointer csrColIndA, jcuda.Pointer beta, cusparseMatDescr descrB, int nnzB, jcuda.Pointer csrValB, jcuda.Pointer csrRowPtrB, jcuda.Pointer csrColIndB, cusparseMatDescr descrC, jcuda.Pointer csrValC, jcuda.Pointer csrRowPtrC, jcuda.Pointer csrColIndC);
+	public int cusparsecsr2dense(cusparseHandle handle, int m, int n, cusparseMatDescr descrA, jcuda.Pointer csrValA, jcuda.Pointer csrRowPtrA, jcuda.Pointer csrColIndA, jcuda.Pointer A, int lda) ;
+	public int cusparsedense2csr(cusparseHandle handle, int m, int n, cusparseMatDescr descrA, jcuda.Pointer A, int lda, jcuda.Pointer nnzPerRow, jcuda.Pointer csrValA, jcuda.Pointer csrRowPtrA, jcuda.Pointer csrColIndA);
+	public int cusparsennz(cusparseHandle handle, int dirA, int m, int n, cusparseMatDescr descrA, jcuda.Pointer A, int lda, jcuda.Pointer nnzPerRowCol, jcuda.Pointer nnzTotalDevHostPtr);
+	public void deviceToHost(GPUContext gCtx, Pointer src, double [] dest, String instName) throws DMLRuntimeException;
+	public void hostToDevice(GPUContext gCtx, double [] src,  Pointer dest, String instName) throws DMLRuntimeException;
+	
+}
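
As a minimal, hypothetical illustration of point 2 above: a GPU operation calls the precision-agnostic wrappers on LibMatrixCUDA.cudaSupportFunctions instead of, say, JCublas2.cublasDgemm directly, so the same call site works for both double and single precision. The class name, variables, and dimensions below are placeholders for illustration, not actual SystemML code:

import static jcuda.jcublas.cublasOperation.CUBLAS_OP_N;

import org.apache.sysml.runtime.matrix.data.LibMatrixCUDA;

import jcuda.Pointer;
import jcuda.jcublas.cublasHandle;

public class PrecisionAgnosticGemmSketch {
	// C = A %*% B for a column-major m x k matrix A and k x n matrix B, both already on the device.
	// The concrete CudaSupportFunctions implementation dispatches to cublasDgemm or cublasSgemm.
	public static void matmult(cublasHandle handle, Pointer A, Pointer B, Pointer C, int m, int n, int k) {
		// one()/zero() are pointers to the constants 1.0/0.0 in the currently active datatype
		LibMatrixCUDA.cudaSupportFunctions.cublasgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N,
			m, n, k, LibMatrixCUDA.one(), A, m, B, k, LibMatrixCUDA.zero(), C, m);
	}
}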

http://git-wip-us.apache.org/repos/asf/systemml/blob/abbffc55/src/main/java/org/apache/sysml/runtime/matrix/data/DoublePrecisionCudaSupportFunctions.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/matrix/data/DoublePrecisionCudaSupportFunctions.java b/src/main/java/org/apache/sysml/runtime/matrix/data/DoublePrecisionCudaSupportFunctions.java
new file mode 100644
index 0000000..78b4de0
--- /dev/null
+++ b/src/main/java/org/apache/sysml/runtime/matrix/data/DoublePrecisionCudaSupportFunctions.java
@@ -0,0 +1,175 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ * 
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.sysml.runtime.matrix.data;
+
+import static jcuda.runtime.JCuda.cudaMemcpy;
+import static jcuda.runtime.cudaMemcpyKind.cudaMemcpyDeviceToHost;
+import static jcuda.runtime.cudaMemcpyKind.cudaMemcpyHostToDevice;
+
+import org.apache.sysml.runtime.DMLRuntimeException;
+import org.apache.sysml.runtime.instructions.gpu.GPUInstruction;
+import org.apache.sysml.runtime.instructions.gpu.context.GPUContext;
+import org.apache.sysml.utils.GPUStatistics;
+
+import jcuda.Pointer;
+import jcuda.Sizeof;
+import jcuda.jcublas.JCublas2;
+import jcuda.jcublas.cublasHandle;
+import jcuda.jcusolver.JCusolverDn;
+import jcuda.jcusolver.cusolverDnHandle;
+import jcuda.jcusparse.JCusparse;
+import jcuda.jcusparse.cusparseHandle;
+import jcuda.jcusparse.cusparseMatDescr;
+
+public class DoublePrecisionCudaSupportFunctions implements CudaSupportFunctions {
+
+	@Override
+	public int cusparsecsrgemm(cusparseHandle handle, int transA, int transB, int m, int n, int k,
+			cusparseMatDescr descrA, int nnzA, Pointer csrValA, Pointer csrRowPtrA, Pointer csrColIndA,
+			cusparseMatDescr descrB, int nnzB, Pointer csrValB, Pointer csrRowPtrB, Pointer csrColIndB,
+			cusparseMatDescr descrC, Pointer csrValC, Pointer csrRowPtrC, Pointer csrColIndC) {
+		return JCusparse.cusparseDcsrgemm(handle, transA,  transB,  m,  n,  k,
+				 descrA,  nnzA,  csrValA,  csrRowPtrA,  csrColIndA,
+				 descrB,  nnzB,  csrValB,  csrRowPtrB,  csrColIndB,
+				 descrC,  csrValC,  csrRowPtrC,  csrColIndC);
+	}
+	
+	@Override
+	public int cublasgeam(cublasHandle handle, int transa, int transb, int m, int n, Pointer alpha, Pointer A, int lda,
+			Pointer beta, Pointer B, int ldb, Pointer C, int ldc) {
+		return JCublas2.cublasDgeam(handle, transa, transb, m, n, alpha, A, lda, beta, B, ldb, C, ldc);
+	}
+	
+	@Override
+	public int cusparsecsrmv(cusparseHandle handle, int transA, int m, int n, int nnz, Pointer alpha,
+			cusparseMatDescr descrA, Pointer csrValA, Pointer csrRowPtrA, Pointer csrColIndA, Pointer x, Pointer beta,
+			Pointer y) {
+		return JCusparse.cusparseDcsrmv(handle, transA, m, n, nnz, alpha, 
+				descrA, csrValA, csrRowPtrA, csrColIndA, x, beta, y);
+	}
+	
+	@Override
+	public int	cusparsecsrmm2(cusparseHandle handle, int transa, int transb, int m, int n, int k, int nnz, jcuda.Pointer alpha, cusparseMatDescr descrA, 
+			jcuda.Pointer csrValA, jcuda.Pointer csrRowPtrA, jcuda.Pointer csrColIndA, 
+			jcuda.Pointer B, int ldb, jcuda.Pointer beta, jcuda.Pointer C, int ldc) {
+		return JCusparse.cusparseDcsrmm2(handle, transa, transb, m, n, k, nnz, alpha, descrA, csrValA, 
+				csrRowPtrA, csrColIndA, B, ldb, beta, C, ldc);
+	}
+	
+	@Override
+	public int cublasdot(cublasHandle handle, int n, Pointer x, int incx, Pointer y, int incy, Pointer result) {
+		return JCublas2.cublasDdot(handle, n, x, incx, y, incy, result);
+	}
+	
+	@Override
+	public int cublasgemv(cublasHandle handle, int trans, int m, int n, Pointer alpha, Pointer A, int lda, Pointer x,
+			int incx, Pointer beta, Pointer y, int incy) {
+		return JCublas2.cublasDgemv(handle, trans, m, n, alpha, A, lda, x, incx, beta, y, incy);
+	}
+	
+	@Override
+	public int cublasgemm(cublasHandle handle, int transa, int transb, int m, int n, int k, Pointer alpha, Pointer A,
+			int lda, Pointer B, int ldb, Pointer beta, Pointer C, int ldc) {
+		return JCublas2.cublasDgemm(handle, transa, transb, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc);
+	}
+	
+	@Override
+	public int cusparsecsr2csc(cusparseHandle handle, int m, int n, int nnz, Pointer csrVal, Pointer csrRowPtr,
+			Pointer csrColInd, Pointer cscVal, Pointer cscRowInd, Pointer cscColPtr, int copyValues, int idxBase) {
+		return JCusparse.cusparseDcsr2csc(handle, m, n, nnz, csrVal, csrRowPtr, csrColInd, cscVal, cscRowInd, cscColPtr, copyValues, idxBase);
+	}
+	
+	@Override
+	public int cublassyrk(cublasHandle handle, int uplo, int trans, int n, int k, Pointer alpha, Pointer A, int lda,
+			Pointer beta, Pointer C, int ldc) {
+		return JCublas2.cublasDsyrk(handle, uplo, trans, n, k, alpha, A, lda, beta, C, ldc);
+	}
+	
+	@Override
+	public int cublasaxpy(cublasHandle handle, int n, Pointer alpha, Pointer x, int incx, Pointer y, int incy) {
+		return JCublas2.cublasDaxpy(handle, n, alpha, x, incx, y, incy);
+	}
+	
+	@Override
+	public int cublastrsm(cublasHandle handle, int side, int uplo, int trans, int diag, int m, int n, Pointer alpha,
+			Pointer A, int lda, Pointer B, int ldb) {
+		return JCublas2.cublasDtrsm(handle, side, uplo, trans, diag, m, n, alpha, A, lda, B, ldb);
+	}
+
+	@Override
+	public int cusolverDngeqrf_bufferSize(cusolverDnHandle handle, int m, int n, Pointer A, int lda, int[] Lwork) {
+		return JCusolverDn.cusolverDnDgeqrf_bufferSize(handle, m, n, A, lda, Lwork);
+	}
+	
+	@Override
+	public int cusolverDngeqrf(cusolverDnHandle handle, int m, int n, Pointer A, int lda, Pointer TAU,
+			Pointer Workspace, int Lwork, Pointer devInfo) {
+		return JCusolverDn.cusolverDnDgeqrf(handle, m, n, A, lda, TAU, Workspace, Lwork, devInfo);
+	}
+
+	@Override
+	public int cusolverDnormqr(cusolverDnHandle handle, int side, int trans, int m, int n, int k, Pointer A, int lda,
+			Pointer tau, Pointer C, int ldc, Pointer work, int lwork, Pointer devInfo) {
+		return JCusolverDn.cusolverDnDormqr(handle, side, trans, m, n, k, A, lda, tau, C, ldc, work, lwork, devInfo);
+	}
+	
+	@Override
+	public int cusparsecsrgeam(cusparseHandle handle, int m, int n, Pointer alpha, cusparseMatDescr descrA, int nnzA,
+			Pointer csrValA, Pointer csrRowPtrA, Pointer csrColIndA, Pointer beta, cusparseMatDescr descrB, int nnzB,
+			Pointer csrValB, Pointer csrRowPtrB, Pointer csrColIndB, cusparseMatDescr descrC, Pointer csrValC,
+			Pointer csrRowPtrC, Pointer csrColIndC) {
+		return JCusparse.cusparseDcsrgeam(handle, m, n, alpha, descrA, nnzA, 
+				csrValA, csrRowPtrA, csrColIndA, beta, descrB, nnzB, 
+				csrValB, csrRowPtrB, csrColIndB, descrC, csrValC, csrRowPtrC, csrColIndC);
+	}
+	
+	@Override
+	public int cusparsecsr2dense(cusparseHandle handle, int m, int n, cusparseMatDescr descrA, Pointer csrValA,
+			Pointer csrRowPtrA, Pointer csrColIndA, Pointer A, int lda) {
+		return JCusparse.cusparseDcsr2dense(handle, m, n, descrA, csrValA, csrRowPtrA, csrColIndA, A, lda);
+	}
+
+	@Override
+	public int cusparsedense2csr(cusparseHandle handle, int m, int n, cusparseMatDescr descrA, Pointer A, int lda,
+			Pointer nnzPerRow, Pointer csrValA, Pointer csrRowPtrA, Pointer csrColIndA) {
+		return JCusparse.cusparseDdense2csr(handle, m, n, descrA, A, lda, nnzPerRow, csrValA, csrRowPtrA, csrColIndA);
+	}
+
+	@Override
+	public int cusparsennz(cusparseHandle handle, int dirA, int m, int n, cusparseMatDescr descrA, Pointer A, int lda,
+			Pointer nnzPerRowCol, Pointer nnzTotalDevHostPtr) {
+		return JCusparse.cusparseDnnz(handle, dirA, m, n, descrA, A, lda, nnzPerRowCol, nnzTotalDevHostPtr);
+	}
+
+	@Override
+	public void deviceToHost(GPUContext gCtx, Pointer src, double[] dest, String instName) throws DMLRuntimeException {
+		long t1 = GPUStatistics.DISPLAY_STATISTICS  && instName != null? System.nanoTime() : 0;
+		cudaMemcpy(Pointer.to(dest), src, ((long)dest.length)*Sizeof.DOUBLE, cudaMemcpyDeviceToHost);
+		if(GPUStatistics.DISPLAY_STATISTICS && instName != null) 
+			GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_DEVICE_TO_HOST, System.nanoTime() - t1);
+	}
+
+	@Override
+	public void hostToDevice(GPUContext gCtx, double[] src, Pointer dest, String instName) throws DMLRuntimeException {
+		long t1 = GPUStatistics.DISPLAY_STATISTICS  && instName != null? System.nanoTime() : 0;
+		cudaMemcpy(dest, Pointer.to(src), ((long)src.length)*Sizeof.DOUBLE, cudaMemcpyHostToDevice);
+		if(GPUStatistics.DISPLAY_STATISTICS && instName != null) 
+			GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_HOST_TO_DEVICE, System.nanoTime() - t1);
+	}
+}

http://git-wip-us.apache.org/repos/asf/systemml/blob/abbffc55/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCUDA.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCUDA.java b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCUDA.java
index 7e25299..eb17e69 100644
--- a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCUDA.java
+++ b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCUDA.java
@@ -21,12 +21,13 @@ package org.apache.sysml.runtime.matrix.data;
 
 import static jcuda.jcublas.cublasOperation.CUBLAS_OP_N;
 import static jcuda.jcublas.cublasOperation.CUBLAS_OP_T;
-import static jcuda.jcusparse.JCusparse.cusparseDcsr2csc;
 import static jcuda.runtime.JCuda.cudaMemcpy;
 import static jcuda.runtime.cudaMemcpyKind.cudaMemcpyDeviceToDevice;
 import static jcuda.runtime.cudaMemcpyKind.cudaMemcpyDeviceToHost;
+
 import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;
+import org.apache.sysml.api.DMLScript;
 import org.apache.sysml.runtime.DMLRuntimeException;
 import org.apache.sysml.runtime.controlprogram.caching.MatrixObject;
 import org.apache.sysml.runtime.controlprogram.context.ExecutionContext;
@@ -80,14 +81,11 @@ import org.apache.sysml.utils.Statistics;
 
 import jcuda.Pointer;
 import jcuda.Sizeof;
-import jcuda.jcublas.JCublas2;
 import jcuda.jcublas.cublasDiagType;
 import jcuda.jcublas.cublasFillMode;
 import jcuda.jcublas.cublasHandle;
 import jcuda.jcublas.cublasOperation;
 import jcuda.jcublas.cublasSideMode;
-import jcuda.jcusolver.JCusolverDn;
-import jcuda.jcusparse.JCusparse;
 import jcuda.jcusparse.cusparseAction;
 import jcuda.jcusparse.cusparseHandle;
 import jcuda.jcusparse.cusparseIndexBase;
@@ -100,6 +98,34 @@ import jcuda.jcusparse.cusparseIndexBase;
 public class LibMatrixCUDA {
 
 	private static final Log LOG = LogFactory.getLog(LibMatrixCUDA.class.getName());
+	
+	protected static int CUDNN_DATA_TYPE = jcuda.jcudnn.cudnnDataType.CUDNN_DATA_DOUBLE;
+	// The below variables are used in CSRPointer, GPUObjects, etc.
+	public static CudaSupportFunctions cudaSupportFunctions = new DoublePrecisionCudaSupportFunctions();
+	public static int sizeOfDataType = jcuda.Sizeof.DOUBLE;
+	public static String customKernelSuffix = "_d";
+	
+	/**
+	 * Sets the internal state based on DMLScript.FLOATING_POINT_PRECISION
+	 * @throws DMLRuntimeException if error
+	 */
+	public static void resetFloatingPointPrecision() throws DMLRuntimeException {
+		if(DMLScript.FLOATING_POINT_PRECISION.equalsIgnoreCase("double")) {
+			LibMatrixCUDA.CUDNN_DATA_TYPE = jcuda.jcudnn.cudnnDataType.CUDNN_DATA_DOUBLE;
+			LibMatrixCUDA.cudaSupportFunctions = new DoublePrecisionCudaSupportFunctions();
+			LibMatrixCUDA.sizeOfDataType = jcuda.Sizeof.DOUBLE;
+			LibMatrixCUDA.customKernelSuffix = "_d";
+		}
+		else if(DMLScript.FLOATING_POINT_PRECISION.equalsIgnoreCase("single")) {
+			LibMatrixCUDA.CUDNN_DATA_TYPE = jcuda.jcudnn.cudnnDataType.CUDNN_DATA_FLOAT;
+			LibMatrixCUDA.cudaSupportFunctions = new SinglePrecisionCudaSupportFunctions();
+			LibMatrixCUDA.sizeOfDataType = jcuda.Sizeof.FLOAT;
+			LibMatrixCUDA.customKernelSuffix = "_f";
+		}
+		else {
+			throw new DMLRuntimeException("Unsupported floating point precision: " + DMLScript.FLOATING_POINT_PRECISION);
+		}
+	}
 
 	// Assume Compute Capability 3.0
 	// MAX BLOCKS is 2^31 - 1 For compute capability > 3.0
@@ -110,7 +136,7 @@ public class LibMatrixCUDA {
 	
 	// From CuDNN 5.1 documentation:
 	// The total size of a tensor including the potential padding between dimensions is limited to 2 Giga-elements of type datatype.
-	protected static long maxNumDoublesOfCuDNNTensor = 2000000000;
+	protected static long maxNumElementsOfCuDNNTensor = 2000000000;
 
 	//********************************************************************/
 	//***************************** UTILS ********************************/
@@ -179,7 +205,18 @@ public class LibMatrixCUDA {
 	protected static JCudaKernels getCudaKernels(GPUContext gCtx) throws DMLRuntimeException {
 		return gCtx.getKernels();
 	}
-
+	
+	public static Pointer double2float(GPUContext gCtx, Pointer A, Pointer ret, int numElems) throws DMLRuntimeException {
+		getCudaKernels(gCtx).launchKernel("double2float", ExecutionConfig.getConfigForSimpleVectorOperations(numElems),
+				A, ret, numElems);
+		return ret;
+	}
+	
+	public static Pointer float2double(GPUContext gCtx, Pointer A, Pointer ret, int numElems) throws DMLRuntimeException {
+		getCudaKernels(gCtx).launchKernel("float2double", ExecutionConfig.getConfigForSimpleVectorOperations(numElems),
+				A, ret, numElems);
+		return ret;
+	}
 
 	//********************************************************************/
 	//************************ End of UTILS ******************************/
@@ -191,13 +228,15 @@ public class LibMatrixCUDA {
 
 	private static Pointer _one;
 	private static Pointer _zero;
+	private static int oldDataTypeSize;
 	/**
 	 * Convenience method to get a pointer to value '1.0' on device. Instead of allocating and deallocating it for every kernel invocation.
 	 * @return jcuda pointer
 	 */
-	protected static Pointer one() {
-		if(_one == null) {
-			_one = pointerTo(1.0);
+	public static Pointer one() {
+		if(_one == null || oldDataTypeSize != sizeOfDataType) {
+			_one = dataTypePointerTo(1.0);
+			oldDataTypeSize = sizeOfDataType;
 		}
 		return _one;
 	}
@@ -205,9 +244,10 @@ public class LibMatrixCUDA {
 	 * Convenience method to get a pointer to value '0.0f' on device. Instead of allocating and deallocating it for every kernel invocation.
 	 * @return jcuda pointer
 	 */
-	protected static Pointer zero() {
-		if(_zero == null) {
-			_zero = pointerTo(0.0f);
+	public static Pointer zero() {
+		if(_zero == null  || oldDataTypeSize != sizeOfDataType) {
+			_zero = dataTypePointerTo(0.0);
+			oldDataTypeSize = sizeOfDataType;
 		}
 		return _zero;
 	}
@@ -242,8 +282,16 @@ public class LibMatrixCUDA {
 		return input.getGPUObject(gCtx).getJcudaSparseMatrixPtr();
 	}
 	
-	protected static Pointer pointerTo(double value) {
-		return Pointer.to(new double[] { value });
+	protected static Pointer dataTypePointerTo(double value) {
+		if(sizeOfDataType == Sizeof.DOUBLE) {
+			return Pointer.to(new double[] { value });
+		}
+		else if(sizeOfDataType == Sizeof.FLOAT) {
+			return Pointer.to(new float[] { (float) value });
+		}
+		else {
+			throw new RuntimeException("Unsupported datatype with size " + sizeOfDataType);
+		}
 	}
 	
 
@@ -434,7 +482,7 @@ public class LibMatrixCUDA {
 		long t0=0, t1=0;
 
 		if (GPUStatistics.DISPLAY_STATISTICS) t0 = System.nanoTime();
-		JCublas2.cublasDsyrk(getCublasHandle(gCtx), cublasFillMode.CUBLAS_FILL_MODE_LOWER,transa, m, k, one(), A, lda, zero(), C, ldc);
+		cudaSupportFunctions.cublassyrk(getCublasHandle(gCtx), cublasFillMode.CUBLAS_FILL_MODE_LOWER,transa, m, k, one(), A, lda, zero(), C, ldc);
 		if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_SYRK_LIB, System.nanoTime() - t0);
 
 		if (GPUStatistics.DISPLAY_STATISTICS) t1 = System.nanoTime();
@@ -630,7 +678,7 @@ public class LibMatrixCUDA {
 		}
 		case OP_PLUS_SQ : {
 			// Calculate the squares in a temporary object tmp
-			Pointer tmp = gCtx.allocate(instName, size * Sizeof.DOUBLE);
+			Pointer tmp = gCtx.allocate(instName, size * sizeOfDataType);
 
 			squareMatrix(gCtx, instName, in, tmp, rlen, clen);
 			// Then do the sum on the temporary object and free it
@@ -729,8 +777,8 @@ public class LibMatrixCUDA {
 		}
 		case OP_VARIANCE : {
 			// Temporary GPU array for
-			Pointer tmp = gCtx.allocate(instName, size * Sizeof.DOUBLE);
-			Pointer tmp2 = gCtx.allocate(instName, size * Sizeof.DOUBLE);
+			Pointer tmp = gCtx.allocate(instName, size * sizeOfDataType);
+			Pointer tmp2 = gCtx.allocate(instName, size * sizeOfDataType);
 
 			switch(reductionDirection) {
 
@@ -758,7 +806,7 @@ public class LibMatrixCUDA {
 
 				squareMatrix(gCtx, instName, tmp, tmp2, rlen, clen);
 
-				Pointer tmpRow = gCtx.allocate(instName, rlen * Sizeof.DOUBLE);
+				Pointer tmpRow = gCtx.allocate(instName, rlen * sizeOfDataType);
 				reduceRow(gCtx, instName, "reduce_row_sum", tmp2, tmpRow, rlen, clen);
 
 				ScalarOperator divideOp = new RightScalarOperator(Divide.getDivideFnObject(), clen - 1);
@@ -776,7 +824,7 @@ public class LibMatrixCUDA {
 
 				squareMatrix(gCtx, instName, tmp, tmp2, rlen, clen);
 
-				Pointer tmpCol = gCtx.allocate(instName, clen * Sizeof.DOUBLE);
+				Pointer tmpCol = gCtx.allocate(instName, clen * sizeOfDataType);
 				reduceCol(gCtx, instName, "reduce_col_sum", tmp2, tmpCol, rlen, clen);
 
 				ScalarOperator divideOp = new RightScalarOperator(Divide.getDivideFnObject(), rlen - 1);
@@ -847,9 +895,9 @@ public class LibMatrixCUDA {
 		int[] tmp = getKernelParamsForReduceAll(gCtx, n);
 		int blocks = tmp[0], threads = tmp[1], sharedMem = tmp[2];
 
-		Pointer tempOut = gCtx.allocate(instName, n * Sizeof.DOUBLE);
+		Pointer tempOut = gCtx.allocate(instName, n * sizeOfDataType);
 
-		long t1=0,t2=0,t3=0;
+		long t1=0,t2=0;
 
 		if (GPUStatistics.DISPLAY_STATISTICS) t1 = System.nanoTime();
 		getCudaKernels(gCtx).launchKernel(kernelFunction, new ExecutionConfig(blocks, threads, sharedMem), in, tempOut, n);
@@ -867,11 +915,7 @@ public class LibMatrixCUDA {
 			s = (s + (threads*2-1)) / (threads*2);
 		}
 		double[] result = {-1f};
-
-		if (GPUStatistics.DISPLAY_STATISTICS) t3 = System.nanoTime();
-		cudaMemcpy(Pointer.to(result), tempOut, Sizeof.DOUBLE, cudaMemcpyDeviceToHost);
-		if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_DEVICE_TO_HOST, System.nanoTime() - t3);
-
+		cudaSupportFunctions.deviceToHost(gCtx, tempOut, result, instName);
 		gCtx.cudaFreeHelper(instName, tempOut);
 		return result[0];
 	}
@@ -946,7 +990,7 @@ public class LibMatrixCUDA {
 		int blocks = (n + (threads * 2 - 1)) / (threads * 2);
 		blocks = Math.min(MAX_BLOCKS, blocks);
 
-		int sharedMemSize = threads * Sizeof.DOUBLE;
+		int sharedMemSize = threads * sizeOfDataType;
 		if (threads <= WARP_SIZE){
 			sharedMemSize *= 2;
 		}
@@ -965,7 +1009,7 @@ public class LibMatrixCUDA {
 		final int MAX_THREADS = getMaxThreads(gCtx);
 		int threads = (cols < MAX_THREADS *2) ? nextPow2((cols + 1)/ 2) : MAX_THREADS;
 		int blocks = rows;
-		int sharedMemSize = threads * Sizeof.DOUBLE;
+		int sharedMemSize = threads * sizeOfDataType;
 		if (threads <= WARP_SIZE){
 			sharedMemSize *=2;
 		}
@@ -979,7 +1023,7 @@ public class LibMatrixCUDA {
 		int threads = Math.min(cols, MAX_THREADS);
 		int blocks = Math.min(cols/MAX_THREADS, MAX_BLOCKS);
 		if (cols % MAX_THREADS != 0) blocks++;
-		int sharedMemSize = threads * Sizeof.DOUBLE;
+		int sharedMemSize = threads * sizeOfDataType;
 		if (threads <= WARP_SIZE){
 			sharedMemSize *=2;
 		}
@@ -1475,7 +1519,7 @@ public class LibMatrixCUDA {
 	private static void deviceCopy(String instName, Pointer src, Pointer dest, int rlen, int clen) throws DMLRuntimeException {
 		long t0=0;
 		if (GPUStatistics.DISPLAY_STATISTICS) t0 = System.nanoTime();
-		int size = rlen * clen * Sizeof.DOUBLE;
+		int size = rlen * clen * sizeOfDataType;
 		cudaMemcpy(dest, src, size, cudaMemcpyDeviceToDevice);
 		if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_DEVICE_TO_DEVICE, System.nanoTime() - t0);
 	}
@@ -1538,8 +1582,8 @@ public class LibMatrixCUDA {
 			LOG.trace("GPU : dgeam" + ", GPUContext=" + gCtx);
 		}
 
-		Pointer alphaPtr = pointerTo(alpha);
-		Pointer betaPtr = pointerTo(beta);
+		Pointer alphaPtr = dataTypePointerTo(alpha);
+		Pointer betaPtr = dataTypePointerTo(beta);
 		int transa = isLeftTransposed ? CUBLAS_OP_T : CUBLAS_OP_N;
 		int transb = isRightTransposed ? CUBLAS_OP_T : CUBLAS_OP_N;
 
@@ -1584,7 +1628,7 @@ public class LibMatrixCUDA {
 				int nnz = (int)A.nnz;
 				CSRPointer C = CSRPointer.allocateEmpty(gCtx, nnz, n);
 				out.getGPUObject(gCtx).setSparseMatrixCudaPointer(C);
-				cusparseDcsr2csc(getCusparseHandle(gCtx), m, n, nnz, A.val, A.rowPtr, A.colInd, C.val, C.colInd, C.rowPtr, cusparseAction.CUSPARSE_ACTION_NUMERIC, cusparseIndexBase.CUSPARSE_INDEX_BASE_ZERO);
+				cudaSupportFunctions.cusparsecsr2csc(getCusparseHandle(gCtx), m, n, nnz, A.val, A.rowPtr, A.colInd, C.val, C.colInd, C.rowPtr, cusparseAction.CUSPARSE_ACTION_NUMERIC, cusparseIndexBase.CUSPARSE_INDEX_BASE_ZERO);
 			} else {
 				// General case (cusparse does not support accept the transpose operator for dgeam)
 				// TODO: to implement the transposed + dgeam for sparse matrices, they need to be converted to csc, which is effectively a tranpose
@@ -1604,7 +1648,7 @@ public class LibMatrixCUDA {
 				//long sizeOfC = CSRPointer.estimateSize(C.nnz, out.getNumRows());
 				if (GPUStatistics.DISPLAY_STATISTICS)
 					t0 = System.nanoTime();
-				JCusparse.cusparseDcsrgeam(getCusparseHandle(gCtx), m, n, alphaPtr, A.descr, toInt(A.nnz), A.val, A.rowPtr, A.colInd, betaPtr,
+				cudaSupportFunctions.cusparsecsrgeam(getCusparseHandle(gCtx), m, n, alphaPtr, A.descr, toInt(A.nnz), A.val, A.rowPtr, A.colInd, betaPtr,
 						B.descr, toInt(B.nnz), B.val, B.rowPtr, B.colInd, C.descr, C.val, C.rowPtr, C.colInd);
 				//cudaDeviceSynchronize;
 				if (GPUStatistics.DISPLAY_STATISTICS)
@@ -1635,7 +1679,7 @@ public class LibMatrixCUDA {
 			Pointer C = getDensePointer(gCtx, out, instName);
 
 			if (GPUStatistics.DISPLAY_STATISTICS) t0 = System.nanoTime();
-			JCublas2.cublasDgeam(getCublasHandle(gCtx), transa, transb, m, n, alphaPtr, A, lda, betaPtr, B, ldb, C, ldc);
+			cudaSupportFunctions.cublasgeam(getCublasHandle(gCtx), transa, transb, m, n, alphaPtr, A, lda, betaPtr, B, ldb, C, ldc);
 			if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_DENSE_DGEAM_LIB, System.nanoTime() - t0);
 		}
 	}
@@ -1673,7 +1717,7 @@ public class LibMatrixCUDA {
 	//******************* End of Re-org Functions ************************/
 	//********************************************************************/
 
-	static int toInt(long num) throws DMLRuntimeException {
+	public static int toInt(long num) throws DMLRuntimeException {
 		if(num >= Integer.MAX_VALUE || num <= Integer.MIN_VALUE) {
 			throw new DMLRuntimeException("GPU : Exceeded supported size " + num);
 		}
@@ -1751,8 +1795,8 @@ public class LibMatrixCUDA {
 		long t0 = GPUStatistics.DISPLAY_STATISTICS ? System.nanoTime() : 0;
 		long retClen = cu - cl + 1;
 		if (inClen == retClen) {
-			cudaMemcpy(outPointer, inPointer.withByteOffset(rl * inClen * Sizeof.DOUBLE), (ru - rl + 1) * inClen
-					* Sizeof.DOUBLE, cudaMemcpyDeviceToDevice);
+			cudaMemcpy(outPointer, inPointer.withByteOffset(rl * inClen * sizeOfDataType), (ru - rl + 1) * inClen
+					* sizeOfDataType, cudaMemcpyDeviceToDevice);
 		} else {
 			long retRlen = ru - rl + 1;
 			getCudaKernels(gCtx).launchKernel("slice_dense_dense", ExecutionConfig.getConfigForSimpleVectorOperations(toInt(retRlen*retClen)),
@@ -2255,17 +2299,17 @@ public class LibMatrixCUDA {
 
 			// Matrix-Matrix daxpy
 			long n = in1.getNumRows()*in2.getNumColumns(); // Since A is always a matrix
-			Pointer alphaPtr = pointerTo(constant);
+			Pointer alphaPtr = dataTypePointerTo(constant);
 			// C <- A + alpha*B
 			// becomes
 			// C <- A
 			// C <- alpha*B + C
 			if (GPUStatistics.DISPLAY_STATISTICS) t1 = System.nanoTime();
-			cudaMemcpy(C, A, n*((long)jcuda.Sizeof.DOUBLE), cudaMemcpyDeviceToDevice);
+			cudaMemcpy(C, A, n*((long)sizeOfDataType), cudaMemcpyDeviceToDevice);
 			if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_DEVICE_TO_DEVICE, System.nanoTime() - t1);
 
 			if (GPUStatistics.DISPLAY_STATISTICS) t2 = System.nanoTime();
-			JCublas2.cublasDaxpy(getCublasHandle(gCtx), toInt(n), alphaPtr, B, 1, C, 1);
+			cudaSupportFunctions.cublasaxpy(getCublasHandle(gCtx), toInt(n), alphaPtr, B, 1, C, 1);
 			if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_DAXPY_LIB, System.nanoTime() - t2);
 		}
 		else {
@@ -2353,15 +2397,15 @@ public class LibMatrixCUDA {
 		// step 3: query working space of geqrf and ormqr
 		if (GPUStatistics.DISPLAY_STATISTICS) t0 = System.nanoTime();
 		int[] lwork = {0};
-		JCusolverDn.cusolverDnDgeqrf_bufferSize(gCtx.getCusolverDnHandle(), m, n, A, m, lwork);
+		cudaSupportFunctions.cusolverDngeqrf_bufferSize(gCtx.getCusolverDnHandle(), m, n, A, m, lwork);
 		if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_QR_BUFFER, System.nanoTime() - t0);
 
 		// step 4: compute QR factorization
-		Pointer work = gCtx.allocate(instName, lwork[0] * Sizeof.DOUBLE);
-		Pointer tau = gCtx.allocate(instName, m * Sizeof.DOUBLE);
+		Pointer work = gCtx.allocate(instName, lwork[0] * sizeOfDataType);
+		Pointer tau = gCtx.allocate(instName, m * sizeOfDataType);
 		Pointer devInfo = gCtx.allocate(Sizeof.INT);
 		if (GPUStatistics.DISPLAY_STATISTICS) t0 = System.nanoTime();
-		JCusolverDn.cusolverDnDgeqrf(gCtx.getCusolverDnHandle(), m, n, A, m, tau, work, lwork[0], devInfo);
+		cudaSupportFunctions.cusolverDngeqrf(gCtx.getCusolverDnHandle(), m, n, A, m, tau, work, lwork[0], devInfo);
 		if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_QR, System.nanoTime() - t0);
 
 		int[] qrError = {-1};
@@ -2372,7 +2416,7 @@ public class LibMatrixCUDA {
 
 		// step 5: compute Q^T*B
 		if (GPUStatistics.DISPLAY_STATISTICS) t0 = System.nanoTime();
-		JCusolverDn.cusolverDnDormqr(gCtx.getCusolverDnHandle(), cublasSideMode.CUBLAS_SIDE_LEFT, cublasOperation.CUBLAS_OP_T, m, 1, n, A, m, tau, b, m, work, lwork[0], devInfo);
+		cudaSupportFunctions.cusolverDnormqr(gCtx.getCusolverDnHandle(), cublasSideMode.CUBLAS_SIDE_LEFT, cublasOperation.CUBLAS_OP_T, m, 1, n, A, m, tau, b, m, work, lwork[0], devInfo);
 		if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_ORMQR, System.nanoTime() - t0);
 		cudaMemcpy(Pointer.to(qrError), devInfo, Sizeof.INT, cudaMemcpyDeviceToHost);
 		if (qrError[0] != 0) {
@@ -2381,9 +2425,9 @@ public class LibMatrixCUDA {
 
 		// step 6: compute x = R \ Q^T*B
 		if (GPUStatistics.DISPLAY_STATISTICS) t0 = System.nanoTime();
-		JCublas2.cublasDtrsm(gCtx.getCublasHandle(),
+		cudaSupportFunctions.cublastrsm(gCtx.getCublasHandle(),
 			cublasSideMode.CUBLAS_SIDE_LEFT, cublasFillMode.CUBLAS_FILL_MODE_UPPER, cublasOperation.CUBLAS_OP_N, cublasDiagType.CUBLAS_DIAG_NON_UNIT,
-			n, 1, pointerTo(1.0), A, m, b, m);
+			n, 1, dataTypePointerTo(1.0), A, m, b, m);
 		if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_TRSM, System.nanoTime() - t0);
 
 		if (GPUStatistics.DISPLAY_STATISTICS) t0 = System.nanoTime();
@@ -2393,7 +2437,7 @@ public class LibMatrixCUDA {
 		// TODO  : Find a way to assign bTobj directly to the output and set the correct flags so as to not crash
 		// There is an avoidable copy happening here
 		MatrixObject out = getDenseMatrixOutputForGPUInstruction(ec, instName, outputName, in1.getNumColumns(), 1);
-		cudaMemcpy(out.getGPUObject(gCtx).getJcudaDenseMatrixPtr(), bTobj.getJcudaDenseMatrixPtr(), n * 1 * Sizeof.DOUBLE, cudaMemcpyDeviceToDevice);
+		cudaMemcpy(out.getGPUObject(gCtx).getJcudaDenseMatrixPtr(), bTobj.getJcudaDenseMatrixPtr(), n * 1 * sizeOfDataType, cudaMemcpyDeviceToDevice);
 
 		gCtx.cudaFreeHelper(instName, work);
 		gCtx.cudaFreeHelper(instName, tau);
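
Stepping back from the individual hunks above: these LibMatrixCUDA changes switch precision once, globally, via resetFloatingPointPrecision(), and the rest of the backend keys off cudaSupportFunctions, sizeOfDataType, CUDNN_DATA_TYPE, and customKernelSuffix. A minimal, hypothetical sketch of that flow is shown below; it assumes DMLScript.FLOATING_POINT_PRECISION can be assigned directly at this point, which is an assumption made purely for illustration:

import org.apache.sysml.api.DMLScript;
import org.apache.sysml.runtime.DMLRuntimeException;
import org.apache.sysml.runtime.matrix.data.LibMatrixCUDA;

public class PrecisionInitSketch {
	public static void usePrecision(String precision) throws DMLRuntimeException {
		// precision is the value of sysml.floating.point.precision, e.g. "double" or "single"
		DMLScript.FLOATING_POINT_PRECISION = precision; // assumption: the field is assignable here
		LibMatrixCUDA.resetFloatingPointPrecision();
		// From this point on, device buffers are sized with LibMatrixCUDA.sizeOfDataType
		// (4 bytes for single precision), cuDNN descriptors use CUDNN_DATA_FLOAT, and
		// custom kernels are resolved with the "_f" suffix instead of "_d".
	}
}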

http://git-wip-us.apache.org/repos/asf/systemml/blob/abbffc55/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuDNN.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuDNN.java b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuDNN.java
index bb74aa2..7fd766c 100644
--- a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuDNN.java
+++ b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuDNN.java
@@ -30,13 +30,11 @@ import static jcuda.jcudnn.JCudnn.cudnnPoolingForward;
 import static jcuda.jcudnn.JCudnn.cudnnSetActivationDescriptor;
 import static jcuda.jcudnn.JCudnn.cudnnSetTensor4dDescriptor;
 import static jcuda.jcudnn.cudnnActivationMode.CUDNN_ACTIVATION_RELU;
-import static jcuda.jcudnn.cudnnDataType.CUDNN_DATA_DOUBLE;
 import static jcuda.jcudnn.cudnnNanPropagation.CUDNN_PROPAGATE_NAN;
 import static jcuda.jcudnn.cudnnTensorFormat.CUDNN_TENSOR_NCHW;
 import static jcuda.runtime.JCuda.cudaMemset;
 import jcuda.CudaException;
 import jcuda.Pointer;
-import jcuda.Sizeof;
 import jcuda.jcudnn.cudnnActivationDescriptor;
 import jcuda.jcudnn.cudnnConvolutionFwdPreference;
 import jcuda.jcudnn.cudnnHandle;
@@ -131,7 +129,7 @@ public class LibMatrixCuDNN extends LibMatrixCUDA {
 		long CHW = C*H*W; long KPQ = K*P*Q; long CRS = C*R*S; 
 		long NCHW = N*CHW; long NKPQ = N*KPQ; long KCRS = K*CRS;
 		
-		if(NCHW < maxNumDoublesOfCuDNNTensor && NKPQ < maxNumDoublesOfCuDNNTensor && KCRS < maxNumDoublesOfCuDNNTensor) {
+		if(NCHW < maxNumElementsOfCuDNNTensor && NKPQ < maxNumElementsOfCuDNNTensor && KCRS < maxNumElementsOfCuDNNTensor) {
 			// Filter and output are accounted as dense in the memory estimation for conv2d
 			double overhead = isInSparseFormat(gCtx, filter) ? OptimizerUtils.estimateSizeExactSparsity(K, CRS, 1.0) : 0;
 			overhead += isInSparseFormat(gCtx, image) ? OptimizerUtils.estimateSizeExactSparsity(N, CHW, 1.0) : 0;
@@ -155,7 +153,7 @@ public class LibMatrixCuDNN extends LibMatrixCUDA {
 					try(LibMatrixCuDNNInputRowFetcher imgFetcher = new LibMatrixCuDNNInputRowFetcher(gCtx, instName, image)) {
 						for(int n = 0; n < N; n++) {
 							// Perform one-input all-channel conv2d
-							cudnnConv2d(gCtx, instName, imgFetcher.getNthRow(n), filterPointer, dstPointer.withByteOffset(n*KPQ*Sizeof.DOUBLE), algo);
+							cudnnConv2d(gCtx, instName, imgFetcher.getNthRow(n), filterPointer, dstPointer.withByteOffset(n*KPQ*sizeOfDataType), algo);
 						}
 					}
 				}
@@ -180,7 +178,7 @@ public class LibMatrixCuDNN extends LibMatrixCUDA {
 	 */
 	private static void throwCuDNNDimensionError(long dim1, long dim2, long dim3, long dim4) throws DMLRuntimeException {
 		throw new DMLRuntimeException("The dimensions of input/output matrices is too large to execute a CuDNN kernel. "
-				+ "Max CuDNN matrix size:" + maxNumDoublesOfCuDNNTensor + ". "
+				+ "Max CuDNN matrix size:" + maxNumElementsOfCuDNNTensor + ". "
 				+ "Given input matrix dimensions: [" + dim1 + "," + dim2 + "]. Output dimension:  [" + dim3 + "," + dim4 + "].");
 	}
 
@@ -197,7 +195,7 @@ public class LibMatrixCuDNN extends LibMatrixCUDA {
 	 */
 	private static void throwCuDNNDimensionError(long dim1, long dim2, long dim3, long dim4, long dim5, long dim6) throws DMLRuntimeException {
 		throw new DMLRuntimeException("The dimensions of input/output matrices is too large to execute a CuDNN kernel. "
-				+ "Max CuDNN matrix size:" + maxNumDoublesOfCuDNNTensor + ". "
+				+ "Max CuDNN matrix size:" + maxNumElementsOfCuDNNTensor + ". "
 				+ "Given input matrix dimensions: [" + dim1 + "," + dim2 + "], [" + dim3 + "," + dim4 + "]. Output dimension: [" + dim5 + "," + dim6 + "]");
 	}
 
@@ -270,7 +268,7 @@ public class LibMatrixCuDNN extends LibMatrixCUDA {
 		long NCHW = N*CHW; long NKPQ = N*KPQ; long KCRS = K*CRS;
 		
 		
-		if(NCHW < maxNumDoublesOfCuDNNTensor && NKPQ < maxNumDoublesOfCuDNNTensor && KCRS < maxNumDoublesOfCuDNNTensor) {
+		if(NCHW < maxNumElementsOfCuDNNTensor && NKPQ < maxNumElementsOfCuDNNTensor && KCRS < maxNumElementsOfCuDNNTensor) {
 			Pointer dwPointer = getDensePointerForCuDNN(gCtx, outputBlock, instName);
 			double overhead = isInSparseFormat(gCtx, image) ? OptimizerUtils.estimateSizeExactSparsity(N, CHW, 1.0) : 0;
 			overhead += isInSparseFormat(gCtx, dout) ? OptimizerUtils.estimateSizeExactSparsity(N, KPQ, 1.0) : 0;
@@ -292,10 +290,10 @@ public class LibMatrixCuDNN extends LibMatrixCUDA {
 					try(LibMatrixCuDNNInputRowFetcher imgFetcher = new LibMatrixCuDNNInputRowFetcher(gCtx, instName, image);
 						LibMatrixCuDNNInputRowFetcher doutFetcher = new LibMatrixCuDNNInputRowFetcher(gCtx, instName, dout)) {
 						// Perform one-input conv2dBackwardFilter
-						Pointer tempdwPointer = gCtx.allocate(KCRS*Sizeof.DOUBLE);
+						Pointer tempdwPointer = gCtx.allocate(KCRS*sizeOfDataType);
 						for(int n = 0; n < N; n++) {
 							long t0 = GPUStatistics.DISPLAY_STATISTICS ? System.nanoTime() : 0;
-							cudaMemset(tempdwPointer, 0, KCRS*Sizeof.DOUBLE);
+							cudaMemset(tempdwPointer, 0, KCRS*sizeOfDataType);
 							if(GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_SET_ZERO, System.nanoTime() - t0);
 							// Perform one-input conv2dBackwardFilter
 							cudnnConv2dBackwardFilter(gCtx, instName, imgFetcher.getNthRow(n), doutFetcher.getNthRow(n), tempdwPointer, algo);
@@ -376,7 +374,7 @@ public class LibMatrixCuDNN extends LibMatrixCUDA {
 		long CHW = C*H*W; long KPQ = K*P*Q; long CRS = C*R*S; 
 		long NCHW = N*CHW; long NKPQ = N*KPQ; long KCRS = K*CRS;
 
-		if(NCHW < maxNumDoublesOfCuDNNTensor && NKPQ < maxNumDoublesOfCuDNNTensor && KCRS < maxNumDoublesOfCuDNNTensor) {
+		if(NCHW < maxNumElementsOfCuDNNTensor && NKPQ < maxNumElementsOfCuDNNTensor && KCRS < maxNumElementsOfCuDNNTensor) {
 			// Filter and output are accounted as dense in the memory estimation for conv2dBackwardData
 			double overhead = isInSparseFormat(gCtx, filter) ? OptimizerUtils.estimateSizeExactSparsity(K, CRS, 1.0) : 0;
 			overhead += isInSparseFormat(gCtx, dout) ? OptimizerUtils.estimateSizeExactSparsity(N, KPQ, 1.0) : 0;
@@ -398,7 +396,7 @@ public class LibMatrixCuDNN extends LibMatrixCUDA {
 				else {
 					try(LibMatrixCuDNNInputRowFetcher doutFetcher = new LibMatrixCuDNNInputRowFetcher(gCtx, instName, dout)) {
 						for(int n = 0; n < N; n++) {
-							cudnnConv2dBackwardData(gCtx, instName, doutFetcher.getNthRow(n), filterPointer, dstPointer.withByteOffset(n*CHW*Sizeof.DOUBLE), algo);
+							cudnnConv2dBackwardData(gCtx, instName, doutFetcher.getNthRow(n), filterPointer, dstPointer.withByteOffset(n*CHW*sizeOfDataType), algo);
 						}
 					}
 				}
@@ -468,7 +466,7 @@ public class LibMatrixCuDNN extends LibMatrixCUDA {
 		long CHW = C*H*W; long CPQ = C*P*Q;  
 		long NCHW = N*CHW; long NCPQ = N*CPQ; 
 
-		if(NCHW < maxNumDoublesOfCuDNNTensor && NCPQ < maxNumDoublesOfCuDNNTensor) {
+		if(NCHW < maxNumElementsOfCuDNNTensor && NCPQ < maxNumElementsOfCuDNNTensor) {
 			// Filter and output are accounted as dense in the memory estimation for conv2dBackwardData
 			long overhead = isInSparseFormat(gCtx, image) ? OptimizerUtils.estimateSizeExactSparsity(N, CHW, 1.0) : 0;
 			Pointer y = getDensePointerForCuDNN(gCtx, outputBlock, instName);
@@ -479,7 +477,7 @@ public class LibMatrixCuDNN extends LibMatrixCUDA {
 			else {
 				LibMatrixCuDNNInputRowFetcher imgFetcher = new LibMatrixCuDNNInputRowFetcher(gCtx, instName, image);
 				for(int n = 0; n < N; n++) {
-					cudnnMaxpooling(gCtx, instName, imgFetcher.getNthRow(n), y.withByteOffset(n*CPQ*Sizeof.DOUBLE), 1, C, H, W, K, R, S, pad_h, pad_w, stride_h, stride_w, P, Q);
+					cudnnMaxpooling(gCtx, instName, imgFetcher.getNthRow(n), y.withByteOffset(n*CPQ*sizeOfDataType), 1, C, H, W, K, R, S, pad_h, pad_w, stride_h, stride_w, P, Q);
 				}
 				imgFetcher.close();
 			}
@@ -545,7 +543,7 @@ public class LibMatrixCuDNN extends LibMatrixCUDA {
 		long CHW = C*H*W; long CPQ = C*P*Q;  
 		long NCHW = N*CHW; long NCPQ = N*CPQ; 
 
-		if(NCHW < maxNumDoublesOfCuDNNTensor && NCPQ < maxNumDoublesOfCuDNNTensor) {
+		if(NCHW < maxNumElementsOfCuDNNTensor && NCPQ < maxNumElementsOfCuDNNTensor) {
 			// Filter and output are accounted as dense in the memory estimation for conv2dBackwardData
 			long overhead = isInSparseFormat(gCtx, image) ? OptimizerUtils.estimateSizeExactSparsity(N, CHW, 1.0) : 0;
 			overhead += isInSparseFormat(gCtx, dout) ? OptimizerUtils.estimateSizeExactSparsity(N, CPQ, 1.0) : 0;
@@ -560,7 +558,7 @@ public class LibMatrixCuDNN extends LibMatrixCUDA {
 				LibMatrixCuDNNInputRowFetcher doutFetcher = new LibMatrixCuDNNInputRowFetcher(gCtx, instName, dout);
 				for(int n = 0; n < N; n++) {
 					cudnnMaxpoolingBackward(gCtx, instName, imgFetcher.getNthRow(n), doutFetcher.getNthRow(n), 
-							dx.withByteOffset(n*CHW*Sizeof.DOUBLE), 
+							dx.withByteOffset(n*CHW*sizeOfDataType), 
 							1, C, H, W, K, R, S, pad_h, pad_w, stride_h, stride_w, P, Q);
 				}
 				// Deallocate temporary array to hold one element of input
@@ -591,7 +589,7 @@ public class LibMatrixCuDNN extends LibMatrixCUDA {
 			
 			// Calling PoolForward first, y is one of the inputs for poolBackward
 			// TODO: Remove calling poolForward after necessary changes at language level for poolBackward
-			long numBytes = N*C*P*Q*Sizeof.DOUBLE;
+			long numBytes = N*C*P*Q*sizeOfDataType;
 			y = gCtx.allocate(numBytes);
 			
 			if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_CUDNN_INIT, System.nanoTime() - t1);
@@ -668,7 +666,7 @@ public class LibMatrixCuDNN extends LibMatrixCUDA {
 		MatrixObject output = ec.getMatrixObject(outputName);
 		getDenseMatrixOutputForGPUInstruction(ec, instName, outputName, in.getNumRows(), in.getNumColumns()); // Allocated the dense output matrix
 		long t0=0;
-		if(N*CHW >= maxNumDoublesOfCuDNNTensor) {
+		if(N*CHW >= maxNumElementsOfCuDNNTensor) {
 			if(LOG.isTraceEnabled()) {
 				LOG.trace("GPU : relu custom kernel" + ", GPUContext=" + gCtx);
 			}
@@ -684,7 +682,7 @@ public class LibMatrixCuDNN extends LibMatrixCUDA {
 		else {
 			cudnnTensorDescriptor tensorDescriptor = new cudnnTensorDescriptor();
 			cudnnCreateTensorDescriptor(tensorDescriptor);
-			cudnnSetTensor4dDescriptor(tensorDescriptor, CUDNN_TENSOR_NCHW, CUDNN_DATA_DOUBLE, toInt(N), 1, 1, toInt(CHW));
+			cudnnSetTensor4dDescriptor(tensorDescriptor, CUDNN_TENSOR_NCHW, CUDNN_DATA_TYPE, toInt(N), 1, 1, toInt(CHW));
 			cudnnReLU(gCtx, instName, in, getDensePointerForCuDNN(gCtx, output, instName), tensorDescriptor);
 			cudnnDestroyTensorDescriptor(tensorDescriptor);
 		}
@@ -701,7 +699,7 @@ public class LibMatrixCuDNN extends LibMatrixCUDA {
 	 */
 	protected static Pointer getDensePointerForCuDNN(GPUContext gCtx, MatrixObject image, String instName) throws DMLRuntimeException {
 		long numElems = image.getNumRows()*image.getNumColumns();
-		if(numElems > maxNumDoublesOfCuDNNTensor) {
+		if(numElems > maxNumElementsOfCuDNNTensor) {
 			throw new DMLRuntimeException("CuDNN restriction: the size of input tensor cannot have greater than 2 giga-elements, but has " + numElems + " (i.e. [" + image.getNumRows() + " X " + image.getNumColumns() + "]). Hint: try reducing the mini-batch size.");
 		}
 		return getDensePointer(gCtx, image, instName);
@@ -717,4 +715,4 @@ public class LibMatrixCuDNN extends LibMatrixCUDA {
 		if(status != cudnnStatus.CUDNN_STATUS_SUCCESS)
 			throw new DMLRuntimeException("Error status returned by CuDNN:" + jcuda.jcudnn.cudnnStatus.stringFor(status));
 	}
-}
+}
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/systemml/blob/abbffc55/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuDNNConvolutionAlgorithm.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuDNNConvolutionAlgorithm.java b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuDNNConvolutionAlgorithm.java
index f49433d..ee22541 100644
--- a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuDNNConvolutionAlgorithm.java
+++ b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuDNNConvolutionAlgorithm.java
@@ -40,7 +40,6 @@ import static jcuda.jcudnn.JCudnn.cudnnSetConvolution2dDescriptor;
 import static jcuda.jcudnn.JCudnn.cudnnSetFilter4dDescriptor;
 import static jcuda.jcudnn.JCudnn.cudnnSetTensor4dDescriptor;
 import static jcuda.jcudnn.cudnnConvolutionMode.CUDNN_CROSS_CORRELATION;
-import static jcuda.jcudnn.cudnnDataType.CUDNN_DATA_DOUBLE;
 import static jcuda.jcudnn.cudnnTensorFormat.CUDNN_TENSOR_NCHW;
 
 /**
@@ -255,14 +254,14 @@ public class LibMatrixCuDNNConvolutionAlgorithm implements java.lang.AutoCloseab
 	private static cudnnTensorDescriptor allocateTensorDescriptor(int N, int C, int H, int W) throws DMLRuntimeException {
 		cudnnTensorDescriptor tensorDescriptor = new cudnnTensorDescriptor();
 		cudnnCreateTensorDescriptor(tensorDescriptor);
-		cudnnSetTensor4dDescriptor(tensorDescriptor, CUDNN_TENSOR_NCHW, CUDNN_DATA_DOUBLE, N, C, H, W);
+		cudnnSetTensor4dDescriptor(tensorDescriptor, CUDNN_TENSOR_NCHW, LibMatrixCUDA.CUDNN_DATA_TYPE, N, C, H, W);
 		return tensorDescriptor;
 	}
 	
 	private static cudnnFilterDescriptor allocateFilterDescriptor(int K, int C, int R, int S) {
 		cudnnFilterDescriptor filterDesc = new cudnnFilterDescriptor();
 		cudnnCreateFilterDescriptor(filterDesc);
-		cudnnSetFilter4dDescriptor(filterDesc, CUDNN_DATA_DOUBLE, CUDNN_TENSOR_NCHW, K, C, R, S);
+		cudnnSetFilter4dDescriptor(filterDesc, LibMatrixCUDA.CUDNN_DATA_TYPE, CUDNN_TENSOR_NCHW, K, C, R, S);
 		return filterDesc;
 	}
 	

http://git-wip-us.apache.org/repos/asf/systemml/blob/abbffc55/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuDNNInputRowFetcher.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuDNNInputRowFetcher.java b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuDNNInputRowFetcher.java
index 581607e..5121c87 100644
--- a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuDNNInputRowFetcher.java
+++ b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuDNNInputRowFetcher.java
@@ -20,8 +20,6 @@ package org.apache.sysml.runtime.matrix.data;
 
 import static jcuda.runtime.JCuda.cudaMemset;
 import jcuda.Pointer;
-import jcuda.Sizeof;
-
 import org.apache.sysml.runtime.DMLRuntimeException;
 import org.apache.sysml.runtime.controlprogram.caching.MatrixObject;
 import org.apache.sysml.runtime.instructions.gpu.GPUInstruction;
@@ -32,7 +30,7 @@ import org.apache.sysml.utils.GPUStatistics;
 /**
  * Performs a slice operation: out = in[(n+1):(n+1), 1:numColumns]
  */
-public class LibMatrixCuDNNInputRowFetcher implements java.lang.AutoCloseable {
+public class LibMatrixCuDNNInputRowFetcher extends LibMatrixCUDA implements java.lang.AutoCloseable {
 	GPUContext gCtx; String instName; int numColumns; boolean isInputInSparseFormat; 
 	Object inPointer; // can be either CSRPointer or Pointer 
 	Pointer outPointer;
@@ -50,7 +48,7 @@ public class LibMatrixCuDNNInputRowFetcher implements java.lang.AutoCloseable {
 		numColumns = LibMatrixCUDA.toInt(image.getNumColumns());
 		isInputInSparseFormat = LibMatrixCUDA.isInSparseFormat(gCtx, image);
 		inPointer = isInputInSparseFormat ? LibMatrixCUDA.getSparsePointer(gCtx, image, instName) : LibMatrixCuDNN.getDensePointerForCuDNN(gCtx, image, instName);
-		outPointer = gCtx.allocate(numColumns*Sizeof.DOUBLE);
+		outPointer = gCtx.allocate(numColumns*sizeOfDataType);
 	}
 	/**
 	 * Copy the nth row and return the dense pointer
@@ -62,7 +60,7 @@ public class LibMatrixCuDNNInputRowFetcher implements java.lang.AutoCloseable {
 		if(isInputInSparseFormat) {
 			jcuda.runtime.JCuda.cudaDeviceSynchronize();
 			long t0 = GPUStatistics.DISPLAY_STATISTICS ? System.nanoTime() : 0;
-			cudaMemset(outPointer, 0, numColumns*Sizeof.DOUBLE);
+			cudaMemset(outPointer, 0, numColumns*sizeOfDataType);
 			jcuda.runtime.JCuda.cudaDeviceSynchronize();
 			if(GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_SET_ZERO, System.nanoTime() - t0);
 			LibMatrixCUDA.sliceSparseDense(gCtx, instName, (CSRPointer)inPointer, outPointer, n, n, 0, LibMatrixCUDA.toInt(numColumns-1), numColumns);

http://git-wip-us.apache.org/repos/asf/systemml/blob/abbffc55/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuDNNPoolingDescriptors.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuDNNPoolingDescriptors.java b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuDNNPoolingDescriptors.java
index f817bd5..d4b213f 100644
--- a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuDNNPoolingDescriptors.java
+++ b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuDNNPoolingDescriptors.java
@@ -24,7 +24,6 @@ import static jcuda.jcudnn.JCudnn.cudnnCreateTensorDescriptor;
 import static jcuda.jcudnn.JCudnn.cudnnDestroyTensorDescriptor;
 import static jcuda.jcudnn.JCudnn.cudnnSetPooling2dDescriptor;
 import static jcuda.jcudnn.JCudnn.cudnnSetTensor4dDescriptor;
-import static jcuda.jcudnn.cudnnDataType.CUDNN_DATA_DOUBLE;
 import static jcuda.jcudnn.cudnnNanPropagation.CUDNN_PROPAGATE_NAN;
 import static jcuda.jcudnn.cudnnPoolingMode.CUDNN_POOLING_MAX;
 import static jcuda.jcudnn.cudnnTensorFormat.CUDNN_TENSOR_NCHW;
@@ -141,7 +140,7 @@ public class LibMatrixCuDNNPoolingDescriptors implements java.lang.AutoCloseable
 	private static cudnnTensorDescriptor allocateTensorDescriptor(int N, int C, int H, int W) throws DMLRuntimeException {
 		cudnnTensorDescriptor tensorDescriptor = new cudnnTensorDescriptor();
 		cudnnCreateTensorDescriptor(tensorDescriptor);
-		cudnnSetTensor4dDescriptor(tensorDescriptor, CUDNN_TENSOR_NCHW, CUDNN_DATA_DOUBLE, N, C, H, W);
+		cudnnSetTensor4dDescriptor(tensorDescriptor, CUDNN_TENSOR_NCHW, LibMatrixCUDA.CUDNN_DATA_TYPE, N, C, H, W);
 		return tensorDescriptor;
 	}
 	


[30/50] [abbrv] systemml git commit: [SYSTEMML-1969] Support single-precision operations on GPU backend

Posted by re...@apache.org.
[SYSTEMML-1969] Support single-precision operations on GPU backend

- Since single-precision operations are faster on most GPUs, we should allow our users to perform GPU instructions in single precision.
- The GPU backend has been refactored to support arbitrary precision.
- This feature can be enabled via the configuration property sysml.floating.point.precision, as sketched below.
- The valid values for this property are double and single. We can support half/mixed precision in a separate PR.
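
For example, single precision would be requested by setting the new property in SystemML-config.xml (a minimal sketch mirroring the template change included in this patch):

    <!-- the floating point precision. supported values are double, single -->
    <sysml.floating.point.precision>single</sysml.floating.point.precision>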

Closes #688.


Project: http://git-wip-us.apache.org/repos/asf/systemml/repo
Commit: http://git-wip-us.apache.org/repos/asf/systemml/commit/abbffc55
Tree: http://git-wip-us.apache.org/repos/asf/systemml/tree/abbffc55
Diff: http://git-wip-us.apache.org/repos/asf/systemml/diff/abbffc55

Branch: refs/heads/master
Commit: abbffc55ef8f47f10b6e59b0ae5e1f311f4a8f3e
Parents: 881caa9
Author: Niketan Pansare <np...@us.ibm.com>
Authored: Wed Oct 25 19:25:20 2017 -0700
Committer: Niketan Pansare <np...@us.ibm.com>
Committed: Wed Oct 25 19:26:50 2017 -0700

----------------------------------------------------------------------
 conf/SystemML-config.xml.template               |    3 +
 src/main/cpp/kernels/SystemML.cu                | 1874 ++--
 src/main/cpp/kernels/SystemML.ptx               | 9579 ++++++++++++++----
 .../java/org/apache/sysml/api/DMLScript.java    |    1 +
 .../apache/sysml/api/ScriptExecutorUtils.java   |    4 +
 .../java/org/apache/sysml/conf/DMLConfig.java   |    4 +-
 .../controlprogram/caching/CacheableData.java   |    4 +-
 .../instructions/gpu/context/CSRPointer.java    |   52 +-
 .../instructions/gpu/context/GPUContext.java    |   31 +-
 .../instructions/gpu/context/GPUObject.java     |   91 +-
 .../instructions/gpu/context/JCudaKernels.java  |    9 +-
 .../matrix/data/CudaSupportFunctions.java       |   87 +
 .../DoublePrecisionCudaSupportFunctions.java    |  175 +
 .../runtime/matrix/data/LibMatrixCUDA.java      |  144 +-
 .../runtime/matrix/data/LibMatrixCuDNN.java     |   38 +-
 .../LibMatrixCuDNNConvolutionAlgorithm.java     |    5 +-
 .../data/LibMatrixCuDNNInputRowFetcher.java     |    8 +-
 .../data/LibMatrixCuDNNPoolingDescriptors.java  |    3 +-
 .../runtime/matrix/data/LibMatrixCuMatMult.java |   34 +-
 .../sysml/runtime/matrix/data/MatrixBlock.java  |    5 +-
 .../SinglePrecisionCudaSupportFunctions.java    |  208 +
 .../org/apache/sysml/test/gpu/GPUTests.java     |   20 +-
 .../test/gpu/MatrixMultiplicationOpTest.java    |   22 +-
 23 files changed, 9423 insertions(+), 2978 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/systemml/blob/abbffc55/conf/SystemML-config.xml.template
----------------------------------------------------------------------
diff --git a/conf/SystemML-config.xml.template b/conf/SystemML-config.xml.template
index 511e215..8452e75 100644
--- a/conf/SystemML-config.xml.template
+++ b/conf/SystemML-config.xml.template
@@ -93,6 +93,9 @@
     <!-- whether to perform eager CUDA free on rmvar instruction -->
     <sysml.gpu.eager.cudaFree>false</sysml.gpu.eager.cudaFree>
    
+    <!-- the floating point precision. supported values are double, single -->
+    <sysml.floating.point.precision>double</sysml.floating.point.precision>
+    
    <!-- maximum wrap length for instruction and miscellaneous timer column of statistics -->
    <sysml.stats.maxWrapLength>30</sysml.stats.maxWrapLength>
 </root>
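
For example (a usage sketch, assuming the property is placed in SystemML-config.xml exactly as added above, which lists double and single as supported values), single-precision GPU execution would be enabled with:

    <sysml.floating.point.precision>single</sysml.floating.point.precision>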

http://git-wip-us.apache.org/repos/asf/systemml/blob/abbffc55/src/main/cpp/kernels/SystemML.cu
----------------------------------------------------------------------
diff --git a/src/main/cpp/kernels/SystemML.cu b/src/main/cpp/kernels/SystemML.cu
index c243564..d176f8f 100644
--- a/src/main/cpp/kernels/SystemML.cu
+++ b/src/main/cpp/kernels/SystemML.cu
@@ -26,11 +26,28 @@ nvcc -ptx -arch=sm_30 SystemML.cu
 #include <cfloat>
 #include <cmath>
 
+extern "C" __global__ void double2float_f(double *A, float *ret, int N) {
+  int tid = blockIdx.x * blockDim.x + threadIdx.x;
+  if (tid < N) {
+  	// TODO: Use __double2float_rd or __double2float_rn  or __double2float_ru or __double2float_rz after 
+    ret[tid] = (float)A[tid];
+  }
+}
+
+extern "C" __global__ void float2double_f(float *A, double *ret, int N) {
+  int tid = blockIdx.x * blockDim.x + threadIdx.x;
+  if (tid < N) {
+    ret[tid] = (double)A[tid];
+  }
+}
+
 /**
- * Performs a slice operation where the input matrix is sparse and the output matrix is dense.
- * This function avoids unnecessary sparse to dense conversion of the input matrix.
+ * Performs a slice operation where the input matrix is sparse and the output
+ * matrix is dense.
+ * This function avoids unnecessary sparse to dense conversion of the input
+ * matrix.
  * Parallelization: rows of output matrix.
- * 
+ *
  * @params inVal input val pointer
  * @params inRowPtr input row pointer
  * @params colInd input col index pointer
@@ -41,49 +58,73 @@ nvcc -ptx -arch=sm_30 SystemML.cu
  * @param cu column upper
  * @param retClen number of columns of output matrix
  */
-extern "C"
-__global__ void slice_sparse_dense_row(double* inVal, int* inRowPtr, int* colInd, double* ret, 
-    int rl, int ru, int cl, int cu, int retClen) {
-  	int index = blockIdx.x * blockDim.x + threadIdx.x;
-	int rowIndex = index + rl;
-  	if (rowIndex <= ru){
-  		/*
-		 * TODO: Alternative approach: use dynamic parallelism. We are skipping this for now to avoid
-		 * the complexity of two-step separate compilation and linking process.
-		 *  
-		 * extern "C"
-		 * __global__ void slice_sparse_dense_row_helper(double* inVal, int* inRowPtr, int* colInd, double* ret, 
-		 *     int rl, int ru, int cl, int cu, int retClen, int start, int end, int index) {
-		 *  int i = blockIdx.x * blockDim.x + threadIdx.x + start;   
-		 * 	// Only slice if the index falls into the given range
-		 * 	if(i < end && cl <= colInd[i] && colInd[i] <= cu) {
-		 * 		ret[ index*retClen + (colInd[i] - cl) ] = inVal[i];
-		 * 	}
-		 * }
-		 *
-		 * int size = inRowPtr[rowIndex+1] - inRowPtr[rowIndex];
-		 * double numThreads = (double)min(size, MAX_NUM_THREADS_CHILD_KERNEL);
-		 * slice_sparse_dense_row_helper<<< ceil(numThreads/ MAX_NUM_THREADS_CHILD_KERNEL), MAX_NUM_THREADS_CHILD_KERNEL>>>(inVal, inRowPtr, colInd, ret, 
-    	 *			rl, ru, cl, cu, retClen, inRowPtr[rowIndex], inRowPtr[rowIndex+1], index);
-    	 *
-    	 * Two-step compilation and linking process in JCudaKernels's constructor:
-    	 * cuLinkAddFile(linkState, CUjitInputType.CU_JIT_INPUT_LIBRARY, "/usr/local/cuda/lib64/libcudadevrt.a", jitOptions);
-		 */
-    	// Iterate over elements of the row 'rowIndex'.
-    	for(int i = inRowPtr[rowIndex]; i < inRowPtr[rowIndex+1]; i++) {
-    		// Only slice if the index falls into the given range
-    		if(cl <= colInd[i] && colInd[i] <= cu) {
-    			ret[ index*retClen + (colInd[i] - cl) ] = inVal[i];
-    		}
-    	}
+template <typename T>
+__device__ void slice_sparse_dense_row(T *inVal, int *inRowPtr, int *colInd,
+                                       T *ret, int rl, int ru, int cl, int cu,
+                                       int retClen) {
+  int index = blockIdx.x * blockDim.x + threadIdx.x;
+  int rowIndex = index + rl;
+  if (rowIndex <= ru) {
+    /*
+     * TODO: Alternative approach: use dynamic parallelism. We are skipping this
+*for now to avoid
+     * the complexity of two-step separate compilation and linking process.
+     *
+     * extern "C"
+     * __global__ void slice_sparse_dense_row_helper(double* inVal, int*
+*inRowPtr, int* colInd, double* ret,
+     *     int rl, int ru, int cl, int cu, int retClen, int start, int end, int
+*index) {
+     *  int i = blockIdx.x * blockDim.x + threadIdx.x + start;
+     * 	// Only slice if the index falls into the given range
+     * 	if(i < end && cl <= colInd[i] && colInd[i] <= cu) {
+     * 		ret[ index*retClen + (colInd[i] - cl) ] = inVal[i];
+     * 	}
+     * }
+     *
+     * int size = inRowPtr[rowIndex+1] - inRowPtr[rowIndex];
+     * double numThreads = (double)min(size, MAX_NUM_THREADS_CHILD_KERNEL);
+     * slice_sparse_dense_row_helper<<< ceil(numThreads/
+*MAX_NUM_THREADS_CHILD_KERNEL), MAX_NUM_THREADS_CHILD_KERNEL>>>(inVal, inRowPtr,
+*colInd, ret,
+*			rl, ru, cl, cu, retClen, inRowPtr[rowIndex],
+*inRowPtr[rowIndex+1], index);
+*
+* Two-step compilation and linking process in JCudaKernels's constructor:
+* cuLinkAddFile(linkState, CUjitInputType.CU_JIT_INPUT_LIBRARY,
+*"/usr/local/cuda/lib64/libcudadevrt.a", jitOptions);
+     */
+    // Iterate over elements of the row 'rowIndex'.
+    for (int i = inRowPtr[rowIndex]; i < inRowPtr[rowIndex + 1]; i++) {
+      // Only slice if the index falls into the given range
+      if (cl <= colInd[i] && colInd[i] <= cu) {
+        ret[index * retClen + (colInd[i] - cl)] = inVal[i];
+      }
     }
+  }
+}
+
+extern "C" __global__ void slice_sparse_dense_row_d(double *inVal, int *inRowPtr,
+                                                   int *colInd, double *ret,
+                                                   int rl, int ru, int cl,
+                                                   int cu, int retClen) {
+  slice_sparse_dense_row(inVal, inRowPtr, colInd, ret, rl, ru, cl, cu, retClen);
+}
+
+extern "C" __global__ void slice_sparse_dense_row_f(float *inVal, int *inRowPtr,
+                                                   int *colInd, float *ret,
+                                                   int rl, int ru, int cl,
+                                                   int cu, int retClen) {
+  slice_sparse_dense_row(inVal, inRowPtr, colInd, ret, rl, ru, cl, cu, retClen);
 }
 
 /**
- * Performs a slice operation where the input matrix is sparse and the output matrix is dense.
- * This function avoids unnecessary sparse to dense conversion of the input matrix.
+ * Performs a slice operation where the input matrix is sparse and the output
+ * matrix is dense.
+ * This function avoids unnecessary sparse to dense conversion of the input
+ * matrix.
  * Parallelization: subset of number of non-zeroes of input matrix.
- * 
+ *
  * @params inVal input val pointer
  * @params inRowPtr input row pointer
  * @params colInd input col index pointer
@@ -94,26 +135,42 @@ __global__ void slice_sparse_dense_row(double* inVal, int* inRowPtr, int* colInd
  * @param cu column upper
  * @param retClen number of columns of output matrix
  */
-extern "C"
-__global__ void slice_sparse_dense_nnz(double* inVal, int* inRowPtr, int* colInd, double* ret, 
-    int rl, int ru, int cl, int cu, int retClen) {
-    int tid = blockIdx.x * blockDim.x + threadIdx.x;
-    int i = tid + inRowPtr[rl];
-    
-    // Only slice if the index falls into the given range
-    if(i < inRowPtr[ru+1] && cl <= colInd[i] && colInd[i] <= cu) {
-    	// Find the row index for corresponding non-zero value 'i'.
-    	int rowIndex = rl;
-    	while(inRowPtr[rowIndex+1] <= i) {
-    		rowIndex++;
-    	}
-	    ret[ (rowIndex-rl)*retClen + (colInd[i] - cl) ] = inVal[i];
+template <typename T>
+__device__ void slice_sparse_dense_nnz(T *inVal, int *inRowPtr, int *colInd,
+                                       T *ret, int rl, int ru, int cl, int cu,
+                                       int retClen) {
+  int tid = blockIdx.x * blockDim.x + threadIdx.x;
+  int i = tid + inRowPtr[rl];
+
+  // Only slice if the index falls into the given range
+  if (i < inRowPtr[ru + 1] && cl <= colInd[i] && colInd[i] <= cu) {
+    // Find the row index for corresponding non-zero value 'i'.
+    int rowIndex = rl;
+    while (inRowPtr[rowIndex + 1] <= i) {
+      rowIndex++;
     }
+    ret[(rowIndex - rl) * retClen + (colInd[i] - cl)] = inVal[i];
+  }
+}
+
+extern "C" __global__ void slice_sparse_dense_nnz_d(double *inVal, int *inRowPtr,
+                                                   int *colInd, double *ret,
+                                                   int rl, int ru, int cl,
+                                                   int cu, int retClen) {
+  slice_sparse_dense_nnz(inVal, inRowPtr, colInd, ret, rl, ru, cl, cu, retClen);
+}
+
+extern "C" __global__ void slice_sparse_dense_nnz_f(float *inVal, int *inRowPtr,
+                                                   int *colInd, float *ret,
+                                                   int rl, int ru, int cl,
+                                                   int cu, int retClen) {
+  slice_sparse_dense_nnz(inVal, inRowPtr, colInd, ret, rl, ru, cl, cu, retClen);
 }
 
 /**
- * Performs a slice operation where the input matrix is dense and the output matrix is dense.
- * 
+ * Performs a slice operation where the input matrix is dense and the output
+ * matrix is dense.
+ *
  * @params in dense input pointer
  * @params ret dense output pointer
  * @param rl row lower
@@ -124,17 +181,31 @@ __global__ void slice_sparse_dense_nnz(double* inVal, int* inRowPtr, int* colInd
  * @param retRlen number of rows of output matrix
  * @param retClen number of columns of output matrix
  */
-extern "C"
-__global__ void slice_dense_dense(double* in, double* ret, int rl, int ru, int cl, int cu, int inClen, int retRlen, int retClen) {
-    int tid = blockIdx.x * blockDim.x + threadIdx.x;
-	int ix = tid / retClen;
-	int iy = tid % retClen;
-	if(ix < retRlen && iy < retClen) {
-	    int inIndex = (ix + rl)*inClen + cl + iy;
-		ret[tid] = in[inIndex];
-	}
+template <typename T>
+__device__ void slice_dense_dense(T *in, T *ret, int rl, int ru, int cl, int cu,
+                                  int inClen, int retRlen, int retClen) {
+  int tid = blockIdx.x * blockDim.x + threadIdx.x;
+  int ix = tid / retClen;
+  int iy = tid % retClen;
+  if (ix < retRlen && iy < retClen) {
+    int inIndex = (ix + rl) * inClen + cl + iy;
+    ret[tid] = in[inIndex];
+  }
+}
+
+extern "C" __global__ void slice_dense_dense_d(double *in, double *ret, int rl,
+                                              int ru, int cl, int cu,
+                                              int inClen, int retRlen,
+                                              int retClen) {
+  slice_dense_dense(in, ret, rl, ru, cl, cu, inClen, retRlen, retClen);
 }
 
+extern "C" __global__ void slice_dense_dense_f(float *in, float *ret, int rl,
+                                              int ru, int cl, int cu,
+                                              int inClen, int retRlen,
+                                              int retClen) {
+  slice_dense_dense(in, ret, rl, ru, cl, cu, inClen, retRlen, retClen);
+}
 
 /**
  * Does a copy of upper to lower triangle of the given matrix
@@ -142,95 +213,161 @@ __global__ void slice_dense_dense(double* in, double* ret, int rl, int ru, int c
  * @param dim the number of rows of the square matrix ret
  * @param N total number of elements of the matrix
  */
-extern "C"
-__global__ void copy_u2l_dense(double* ret, int dim, int N) {
-	int tid = blockIdx.x * blockDim.x + threadIdx.x;
-	int ix = tid / dim;
-	int iy = tid % dim;
-	int id_dest = iy * dim + ix;
-	if(iy > ix && id_dest < N) {
-		// TODO: Potential to reduce the number of threads by half
-		int id_src = tid;
-		ret[id_dest] = ret[id_src];
-	}
-}
-
-extern "C"
-__forceinline__ __device__ double getBoolean(int val) {
-	if(val == 0)
-		return 0.0;
-	else
-		return 1.0;
+template <typename T>
+__device__ void copy_u2l_dense(T *ret, int dim, int N) {
+  int tid = blockIdx.x * blockDim.x + threadIdx.x;
+  int ix = tid / dim;
+  int iy = tid % dim;
+  int id_dest = iy * dim + ix;
+  if (iy > ix && id_dest < N) {
+    // TODO: Potential to reduce the number of threads by half
+    int id_src = tid;
+    ret[id_dest] = ret[id_src];
+  }
+}
+
+extern "C" __global__ void copy_u2l_dense_d(double *ret, int dim, int N) {
+  copy_u2l_dense(ret, dim, N);
+}
+
+extern "C" __global__ void copy_u2l_dense_f(float *ret, int dim, int N) {
+  copy_u2l_dense(ret, dim, N);
+}
+
+// Use this method in templates to fetch the maximum value for a given datatype
+template <typename T>
+__forceinline__ __device__ T T_MAX(T x) {
+  return (T)DBL_MAX;
+}
+template <>
+__forceinline__ __device__ float T_MAX(float x) {
+  return FLT_MAX;
+}
+template <>
+__forceinline__ __device__ double T_MAX(double x) {
+  return DBL_MAX;
 }
 
 // op = {0=plus, 1=minus, 2=multiply, 3=divide, 4=power,
 // 5=less, 6=lessequal, 7=greater, 8=greaterequal, 9=equal, 10=notequal,
 // 11=min, 12=max, 13=and, 14=or, 15=minus1multiply, 16=minusnz,
 // 17=modulus, 18=integer division}
-extern "C"
-__forceinline__ __device__ double binaryOp(double x, double y, int op) {
-	switch(op) {
-        case 0 : return x + y;
-        case 1 : return x - y;
-        case 2 : return x * y;
-        case 3 : return x / y;
-        case 4 : return pow(x, y);
-        case 5 : return getBoolean(x < y);
-        case 6 : return getBoolean(x <= y);
-        case 7 : return getBoolean(x > y);
-        case 8 : return getBoolean(x >= y);
-        case 9 : return getBoolean(x == y);
-        case 10 : return getBoolean(x != y);
-        case 11 : return min(x, y);
-        case 12 : return max(x, y);
-        case 13 : return getBoolean((int)llrint(x) & (int)llrint(y));
-        case 14 : return getBoolean((int)llrint(x) | (int)llrint(y));
-        case 15 : return 1 - x * y;
-        case 16 : return (x != 0.0 ? x - y : 0.0);
-        case 17 : {
-            if (y == 0.0 || y == -0.0){
-                return nan("");
-            }
-            double v = x / y;
-            // Check for v being NaN (v != v) or if it is infinity
-            if (isnan(v) || isinf(v)){
-                return v;
-            } else {
-                v = floor(v);
-            }
-            return x - v * y;
-        }
-        case 18:{
-            double v = x / y;
-            if (isnan(v) || isinf(v)){
-                return v;
-            } else {
-                return floor(v);
-            }
-        }
-        default : return DBL_MAX;
+template <typename T>
+__forceinline__ __device__ T binaryOp(T x, T y, int op) {
+  switch (op) {
+    case 0:
+      return x + y;
+    case 1:
+      return x - y;
+    case 2:
+      return x * y;
+    case 3:
+      return x / y;
+    case 4:
+      return pow(x, y);
+    case 5:
+      return (x < y) == 0 ? 0.0 : 1.0;
+    case 6:
+      return (x <= y) == 0 ? 0.0 : 1.0;
+    case 7:
+      return (x > y) == 0 ? 0.0 : 1.0;
+    case 8:
+      return (x >= y) == 0 ? 0.0 : 1.0;
+    case 9:
+      return (x == y) == 0 ? 0.0 : 1.0;
+    case 10:
+      return (x != y) == 0 ? 0.0 : 1.0;
+    case 11:
+      return min(x, y);
+    case 12:
+      return max(x, y);
+    case 13:
+      return ((int)llrint(x) & (int)llrint(y)) == 0 ? 0.0 : 1.0;
+    case 14:
+      return ((int)llrint(x) | (int)llrint(y)) == 0 ? 0.0 : 1.0;
+    case 15:
+      return 1 - x * y;
+    case 16:
+      return (x != 0.0 ? x - y : 0.0);
+    case 17: {
+      if (y == 0.0 || y == -0.0) {
+        return nan("");
+      }
+      T v = x / y;
+      // Check for v being NaN (v != v) or if it is infinity
+      if (isnan(v) || isinf(v)) {
+        return v;
+      } else {
+        v = floor(v);
+      }
+      return x - v * y;
     }
+    case 18: {
+      T v = x / y;
+      if (isnan(v) || isinf(v)) {
+        return v;
+      } else {
+        return floor(v);
+      }
+    }
+    default:
+      return T_MAX(x);
+  }
 }
 
-extern "C"
-__global__ void relu(double* A,  double* ret, int rlen, int clen) {
-	int tid = blockIdx.x * blockDim.x + threadIdx.x;
-	int ix = tid / clen;
-	int iy = tid % clen;
-	if(ix < rlen && iy < clen) {
-		ret[tid] = max(0.0, A[tid]);
-	}
+/**
+ * Performs forward pass for relu: ret = max(A, 0)
+ *
+ * @param A input array allocated on the GPU
+ * @param ret output array allocated on the GPU
+ * @param rlen the number of rows
+ * @param clen the number of columns
+ */
+template <typename T>
+__device__ void relu(T *A, T *ret, int rlen, int clen) {
+  int tid = blockIdx.x * blockDim.x + threadIdx.x;
+  int ix = tid / clen;
+  int iy = tid % clen;
+  if (ix < rlen && iy < clen) {
+    ret[tid] = max(0.0, A[tid]);
+  }
+}
+
+extern "C" __global__ void relu_d(double *A, double *ret, int rlen, int clen) {
+  relu(A, ret, rlen, clen);
 }
 
-// This method computes the backpropagation errors for previous layer of relu operation
-extern "C"
-__global__ void relu_backward(double* X,  double* dout, double* ret, int rlen, int clen) {
-	int tid = blockIdx.x * blockDim.x + threadIdx.x;
-	int ix = tid / clen;
-	int iy = tid % clen;
-	if(ix < rlen && iy < clen) {
-		ret[tid] = X[tid] > 0 ?  dout[tid] : 0;
-	}
+extern "C" __global__ void relu_f(float *A, float *ret, int rlen, int clen) {
+  relu(A, ret, rlen, clen);
+}
+
+/**
+ * This method computes the backpropagation errors for previous layer of relu operation
+ *
+ * @param X input activation array allocated on the GPU
+ * @param dout errors from previous layer
+ * @param ret output array allocated on the GPU
+ * @param rlen the number of rows
+ * @param clen the number of columns
+ */
+template <typename T>
+__device__ void relu_backward(T *X, T *dout, T *ret, int rlen, int clen) {
+  int tid = blockIdx.x * blockDim.x + threadIdx.x;
+  int ix = tid / clen;
+  int iy = tid % clen;
+  if (ix < rlen && iy < clen) {
+    ret[tid] = X[tid] > 0 ? dout[tid] : 0;
+  }
+}
+
+extern "C" __global__ void relu_backward_d(double *X, double *dout, double *ret,
+                                          int rlen, int clen) {
+  relu_backward(X, dout, ret, rlen, clen);
+}
+
+extern "C" __global__ void relu_backward_f(float *X, float *dout, float *ret,
+                                          int rlen, int clen) {
+  relu_backward(X, dout, ret, rlen, clen);
 }
 
 /**
@@ -241,81 +378,113 @@ __global__ void relu_backward(double* X,  double* dout, double* ret, int rlen, i
  * @param rlen the number of rows
  * @param clen the number of columns
  */
-extern "C"
-__global__ void inplace_add(double* input,  double* ret, int rlen, int clen) {
-	int tid = blockIdx.x * blockDim.x + threadIdx.x;
-	int ix = tid / clen;
-	int iy = tid % clen;
-	if(ix < rlen && iy < clen) {
-		ret[tid] += input[tid];
-	}
+template <typename T>
+__device__ void inplace_add(T *input, T *ret, int rlen, int clen) {
+  int tid = blockIdx.x * blockDim.x + threadIdx.x;
+  int ix = tid / clen;
+  int iy = tid % clen;
+  if (ix < rlen && iy < clen) {
+    ret[tid] += input[tid];
+  }
+}
+
+extern "C" __global__ void inplace_add_d(double *input, double *ret, int rlen,
+                                        int clen) {
+  inplace_add(input, ret, rlen, clen);
+}
+
+extern "C" __global__ void inplace_add_f(float *input, float *ret, int rlen,
+                                        int clen) {
+  inplace_add(input, ret, rlen, clen);
 }
 
 // Performs the operation corresponding to the DML script:
 // ones = matrix(1, rows=1, cols=Hout*Wout)
 // output = input + matrix(bias %*% ones, rows=1, cols=F*Hout*Wout)
-// This operation is often followed by conv2d and hence we have introduced bias_add(input, bias) built-in function
-extern "C"
-__global__ void bias_add(double* input,  double* bias, double* ret, int rlen, int clen, int PQ) {
-	int tid = blockIdx.x * blockDim.x + threadIdx.x;
-	int ix = tid / clen;
-	int iy = tid % clen;
-	if(ix < rlen && iy < clen) {
-		int biasIndex = iy / PQ;
-		ret[tid] = input[tid] + bias[biasIndex];
-	}
+// This operation is often followed by conv2d and hence we have introduced
+// bias_add(input, bias) built-in function
+template <typename T>
+__device__ void bias_add(T *input, T *bias, T *ret, int rlen, int clen,
+                         int PQ) {
+  int tid = blockIdx.x * blockDim.x + threadIdx.x;
+  int ix = tid / clen;
+  int iy = tid % clen;
+  if (ix < rlen && iy < clen) {
+    int biasIndex = iy / PQ;
+    ret[tid] = input[tid] + bias[biasIndex];
+  }
+}
+
+extern "C" __global__ void bias_add_d(double *input, double *bias, double *ret,
+                                     int rlen, int clen, int PQ) {
+  bias_add(input, bias, ret, rlen, clen, PQ);
+}
+
+extern "C" __global__ void bias_add_f(float *input, float *bias, float *ret,
+                                     int rlen, int clen, int PQ) {
+  bias_add(input, bias, ret, rlen, clen, PQ);
 }
 
 // Performs the operation "ret <- A + alpha*B", where B is a vector
-extern "C"
-__global__ void daxpy_matrix_vector(double* A,  double* B, double alpha, double* ret, int rlenA, int clenA, int rlenB, int clenB) {
-	int tid = blockIdx.x * blockDim.x + threadIdx.x;
-	int ix = tid / clenA;
-	int iy = tid % clenA;
-	if(ix < rlenA && iy < clenA) {
-		int index = ix * clenA + iy;
-		if(rlenB == 1) {
-			ret[index] = A[index] + alpha*B[iy];
-		}
-		else {
-			ret[index] = A[index] + alpha*B[ix];
-		}
-	}
-}
-
-// Performs similar operation as bias_add except elementwise multiplication instead of add
-extern "C"
-__global__ void bias_multiply(double* input,  double* bias, double* ret, int rlen, int clen, int PQ) {
-	int tid = blockIdx.x * blockDim.x + threadIdx.x;
-	int ix = tid / clen;
-	int iy = tid % clen;
-	if(ix < rlen && iy < clen) {
-		int biasIndex = iy / PQ;
-		ret[tid] = input[tid] * bias[biasIndex];
-	}
-}
-
-// Compares the value and set
-extern "C"
-__global__ void compare_and_set(double* A,  double* ret, int rlen, int clen, double compareVal, double tol, double ifEqualsVal, double ifLessThanVal, double ifGreaterThanVal) {
-	int tid = blockIdx.x * blockDim.x + threadIdx.x;
-	int ix = tid / clen;
-	int iy = tid % clen;
-	int index = ix * clen + iy;
-	if(ix < rlen && iy < clen) {
-		if(abs(A[index]-compareVal) < tol)
-			ret[index] = ifEqualsVal;
-		else if(A[index] < compareVal)
-			ret[index] = ifLessThanVal;
-		else
-			ret[index] = ifGreaterThanVal;
-	}
+template <typename T>
+__device__ void daxpy_matrix_vector(T *A, T *B, double alpha, T *ret, int rlenA,
+                                    int clenA, int rlenB, int clenB) {
+  int tid = blockIdx.x * blockDim.x + threadIdx.x;
+  int ix = tid / clenA;
+  int iy = tid % clenA;
+  if (ix < rlenA && iy < clenA) {
+    int index = ix * clenA + iy;
+    if (rlenB == 1) {
+      ret[index] = A[index] + alpha * B[iy];
+    } else {
+      ret[index] = A[index] + alpha * B[ix];
+    }
+  }
 }
 
+extern "C" __global__ void daxpy_matrix_vector_d(double *A, double *B,
+                                                double alpha, double *ret,
+                                                int rlenA, int clenA, int rlenB,
+                                                int clenB) {
+  daxpy_matrix_vector(A, B, alpha, ret, rlenA, clenA, rlenB, clenB);
+}
+
+extern "C" __global__ void daxpy_matrix_vector_f(float *A, float *B,
+                                                double alpha, float *ret,
+                                                int rlenA, int clenA, int rlenB,
+                                                int clenB) {
+  daxpy_matrix_vector(A, B, alpha, ret, rlenA, clenA, rlenB, clenB);
+}
+
+// Performs similar operation as bias_add except elementwise multiplication
+// instead of add
+template <typename T>
+__device__ void bias_multiply(T *input, T *bias, T *ret, int rlen, int clen,
+                              int PQ) {
+  int tid = blockIdx.x * blockDim.x + threadIdx.x;
+  int ix = tid / clen;
+  int iy = tid % clen;
+  if (ix < rlen && iy < clen) {
+    int biasIndex = iy / PQ;
+    ret[tid] = input[tid] * bias[biasIndex];
+  }
+}
+
+extern "C" __global__ void bias_multiply_d(double *input, double *bias,
+                                          double *ret, int rlen, int clen,
+                                          int PQ) {
+  bias_multiply(input, bias, ret, rlen, clen, PQ);
+}
+
+extern "C" __global__ void bias_multiply_f(float *input, float *bias, float *ret,
+                                          int rlen, int clen, int PQ) {
+  bias_multiply(input, bias, ret, rlen, clen, PQ);
+}
 
 /**
  * Performs a binary cellwise arithmetic operation on 2 matrices.
- * Either both matrices are of equal size or one of them is a vector or both are.
+ * Either both matrices are of equal size or one of them is a vector or both
+ * are.
  * @param A                 first input matrix allocated on GPU
  * @param B                 second input matrix allocated on GPU
  * @param C                 output allocated on GPU
@@ -323,37 +492,55 @@ __global__ void compare_and_set(double* A,  double* ret, int rlen, int clen, dou
  * @param maxClen           maximum of the column lengths of A and B
  * @param vectorAStatus     if A is a row vector, column vector or neither
  * @param vectorBStatus     if B is a row vector, column vector or neither
- * @param op                the numeric code of the arithmetic operation to perform
+ * @param op                the numeric code of the arithmetic operation to
+ * perform
  *
  */
-extern "C"
-__global__ void matrix_matrix_cellwise_op(double* A, double* B, double* C,
-	int maxRlen, int maxClen, int vectorAStatus, int vectorBStatus, int op) {
-	int tid = blockIdx.x * blockDim.x + threadIdx.x;
-	int ix = tid / maxClen;
-	int iy = tid % maxClen;
-
-	if(ix < maxRlen && iy < maxClen) {
-		int outIndex = ix * maxClen + iy;
-		int aIndex = outIndex;
-		int bIndex = outIndex;
-		if(vectorAStatus == 1)
-			aIndex = ix; // clen == 1
-		else if(vectorAStatus == 2)
-			aIndex = iy; // rlen == 1
-		if(vectorBStatus == 1)
-			bIndex = ix; // clen == 1
-		else if(vectorBStatus == 2)
-			bIndex = iy; // rlen == 1
-		C[outIndex] = binaryOp(A[aIndex], B[bIndex], op);
-		//printf("C[%d] = A[%d](%f) B[%d](%f) (%d %d)\n", outIndex, aIndex, A[aIndex], bIndex,  B[bIndex], (ix+1), (iy+1));
-	__syncthreads();
-	}
+template <typename T>
+__device__ void matrix_matrix_cellwise_op(T *A, T *B, T *C, int maxRlen,
+                                          int maxClen, int vectorAStatus,
+                                          int vectorBStatus, int op) {
+  int tid = blockIdx.x * blockDim.x + threadIdx.x;
+  int ix = tid / maxClen;
+  int iy = tid % maxClen;
+
+  if (ix < maxRlen && iy < maxClen) {
+    int outIndex = ix * maxClen + iy;
+    int aIndex = outIndex;
+    int bIndex = outIndex;
+    if (vectorAStatus == 1)
+      aIndex = ix;  // clen == 1
+    else if (vectorAStatus == 2)
+      aIndex = iy;  // rlen == 1
+    if (vectorBStatus == 1)
+      bIndex = ix;  // clen == 1
+    else if (vectorBStatus == 2)
+      bIndex = iy;  // rlen == 1
+    C[outIndex] = binaryOp(A[aIndex], B[bIndex], op);
+    // printf("C[%d] = A[%d](%f) B[%d](%f) (%d %d)\n", outIndex, aIndex,
+    // A[aIndex], bIndex,  B[bIndex], (ix+1), (iy+1));
+    __syncthreads();
+  }
+}
+
+extern "C" __global__ void matrix_matrix_cellwise_op_d(
+    double *A, double *B, double *C, int maxRlen, int maxClen,
+    int vectorAStatus, int vectorBStatus, int op) {
+  matrix_matrix_cellwise_op(A, B, C, maxRlen, maxClen, vectorAStatus,
+                            vectorBStatus, op);
+}
+
+extern "C" __global__ void matrix_matrix_cellwise_op_f(
+    float *A, float *B, float *C, int maxRlen, int maxClen, int vectorAStatus,
+    int vectorBStatus, int op) {
+  matrix_matrix_cellwise_op(A, B, C, maxRlen, maxClen, vectorAStatus,
+                            vectorBStatus, op);
 }
 
 /**
  * Performs an arithmetic operation between a matrix and a scalar.
- * C = s op A or C = A op s (where A is the matrix, s is the scalar and op is the operation)
+ * C = s op A or C = A op s (where A is the matrix, s is the scalar and op is
+ * the operation)
  * @param A             input matrix allocated on GPU
  * @param scalar        scalar input
  * @param C             output matrix allocated on GPU
@@ -361,32 +548,53 @@ __global__ void matrix_matrix_cellwise_op(double* A, double* B, double* C,
  * @param op            number code of the arithmetic operation to perform
  * @param isLeftScalar  whether the scalar is on the left side
  */
-extern "C"
-__global__ void matrix_scalar_op(double* A, double scalar, double* C, int size, int op, int isLeftScalar) {
-	int index = blockIdx.x *blockDim.x + threadIdx.x;
-	if(index < size) {
-		if(isLeftScalar) {
-			C[index] = binaryOp(scalar, A[index], op);
-		} else {
-			C[index] = binaryOp(A[index], scalar, op);
-		}
-	}
-	__syncthreads();
+template <typename T>
+__device__ void matrix_scalar_op(T *A, T scalar, T *C, int size, int op,
+                                 int isLeftScalar) {
+  int index = blockIdx.x * blockDim.x + threadIdx.x;
+  if (index < size) {
+    if (isLeftScalar) {
+      C[index] = binaryOp(scalar, A[index], op);
+    } else {
+      C[index] = binaryOp(A[index], scalar, op);
+    }
+  }
+  __syncthreads();
 }
 
+extern "C" __global__ void matrix_scalar_op_d(double *A, double scalar,
+                                             double *C, int size, int op,
+                                             int isLeftScalar) {
+  matrix_scalar_op(A, scalar, C, size, op, isLeftScalar);
+}
+
+extern "C" __global__ void matrix_scalar_op_f(float *A, double scalar, float *C,
+                                             int size, int op,
+                                             int isLeftScalar) {
+  matrix_scalar_op(A, (float)scalar, C, size, op, isLeftScalar);
+}
 
 /**
- * Sets all elements (fills) of a double array of given length with a given scalar value
+ * Sets all elements (fills) of a double array of given length with a given
+ * scalar value
  * @param A         array to be filled
  * @param scalar    value to fill array with
  * @param lenA      length of array A
  */
-extern "C"
-__global__ void fill(double* A, double scalar, int lenA) {
+template <typename T>
+__device__ void fill(T *A, T scalar, int lenA) {
   int index = blockIdx.x * blockDim.x + threadIdx.x;
-	if (index < lenA){
-	    A[index] = scalar;
-	}
+  if (index < lenA) {
+    A[index] = scalar;
+  }
+}
+
+extern "C" __global__ void fill_d(double *A, double scalar, int lenA) {
+  fill(A, scalar, lenA);
+}
+
+extern "C" __global__ void fill_f(float *A, double scalar, int lenA) {
+  fill(A, (float)scalar, lenA);
 }
 
 /**
@@ -402,29 +610,39 @@ __global__ void fill(double* A, double scalar, int lenA) {
  * @param rowsB  rows in B
  * @param colsB  columns in B
  */
-extern "C"
-__global__ void cbind(double *A, double *B, double *C, int rowsA, int colsA, int rowsB, int colsB) {
-	int maxClen = max(colsA, colsB);
-	int tid = blockIdx.x * blockDim.x + threadIdx.x;
-	int ix = tid / maxClen;
-	int iy = tid % maxClen;
-
-	int colsC = colsA + colsB;
-	int rowsC = rowsA;
-
-	// Copy an element of A into C into the appropriate location
-	if (ix < rowsA && iy < colsA) {
-		double elemA = A[ix * colsA + iy];
-		C[ix * colsC + iy] = elemA;
-	}
+template <typename T>
+__device__ void cbind(T *A, T *B, T *C, int rowsA, int colsA, int rowsB,
+                      int colsB) {
+  int maxClen = max(colsA, colsB);
+  int tid = blockIdx.x * blockDim.x + threadIdx.x;
+  int ix = tid / maxClen;
+  int iy = tid % maxClen;
+
+  int colsC = colsA + colsB;
+  int rowsC = rowsA;
+
+  // Copy an element of A into C into the appropriate location
+  if (ix < rowsA && iy < colsA) {
+    T elemA = A[ix * colsA + iy];
+    C[ix * colsC + iy] = elemA;
+  }
+
+  // Copy an element of B into C into the appropriate location
+  if (ix < rowsB && iy < colsB) {
+    T elemB = B[ix * colsB + iy];
+    C[ix * colsC + (iy + colsA)] = elemB;
+  }
+}
 
-	// Copy an element of B into C into the appropriate location
-	if (ix < rowsB && iy < colsB) {
-		double elemB = B[ix * colsB + iy];
-		C[ix * colsC + (iy + colsA)] = elemB;
-	}
+extern "C" __global__ void cbind_d(double *A, double *B, double *C, int rowsA,
+                                  int colsA, int rowsB, int colsB) {
+  cbind(A, B, C, rowsA, colsA, rowsB, colsB);
 }
 
+extern "C" __global__ void cbind_f(float *A, float *B, float *C, int rowsA,
+                                  int colsA, int rowsB, int colsB) {
+  cbind(A, B, C, rowsA, colsA, rowsB, colsB);
+}
 
 /**
  * Appends Matrix B to the bottom of Matrix A into a new matrix C
@@ -441,176 +659,263 @@ __global__ void cbind(double *A, double *B, double *C, int rowsA, int colsA, int
  * @param rowsB  rows in B
  * @param colsB  columns in B
  */
-extern "C"
-__global__ void rbind(double *A, double *B, double *C, int rowsA, int colsA, int rowsB, int colsB) {
-	int maxClen = max(colsA, colsB);
-	int tid = blockIdx.x * blockDim.x + threadIdx.x;
-	int ix = tid / maxClen;
-	int iy = tid % maxClen;
-
-	int rowsC = rowsA + rowsB;
-	int colsC = colsA;
-
-	// Copy an element of A into C into the appropriate location
-	if (ix < rowsA && iy < colsA) {
-		double elemA = A[ix * colsA + iy];
-		C[ix * colsC + iy] = elemA;
-	}
+template <typename T>
+__device__ void rbind(T *A, T *B, T *C, int rowsA, int colsA, int rowsB,
+                      int colsB) {
+  int maxClen = max(colsA, colsB);
+  int tid = blockIdx.x * blockDim.x + threadIdx.x;
+  int ix = tid / maxClen;
+  int iy = tid % maxClen;
+
+  int rowsC = rowsA + rowsB;
+  int colsC = colsA;
+
+  // Copy an element of A into C into the appropriate location
+  if (ix < rowsA && iy < colsA) {
+    T elemA = A[ix * colsA + iy];
+    C[ix * colsC + iy] = elemA;
+  }
+
+  // Copy an element of B into C into the appropriate location
+  if (ix < rowsB && iy < colsB) {
+    T elemB = B[ix * colsB + iy];
+    C[(ix + rowsA) * colsC + iy] = elemB;
+  }
+}
 
-	// Copy an element of B into C into the appropriate location
-	if (ix < rowsB && iy < colsB) {
-		double elemB = B[ix * colsB + iy];
-		C[(ix + rowsA) * colsC + iy] = elemB;
-	}
+extern "C" __global__ void rbind_d(double *A, double *B, double *C, int rowsA,
+                                  int colsA, int rowsB, int colsB) {
+  rbind(A, B, C, rowsA, colsA, rowsB, colsB);
 }
 
+extern "C" __global__ void rbind_f(float *A, float *B, float *C, int rowsA,
+                                  int colsA, int rowsB, int colsB) {
+  rbind(A, B, C, rowsA, colsA, rowsB, colsB);
+}
 
 /**
  * Does a reduce operation over all elements of the array.
- * This method has been adapted from the Reduction sample in the NVIDIA CUDA Samples (v8.0)
+ * This method has been adapted from the Reduction sample in the NVIDIA CUDA
+ * Samples (v8.0)
  * and the Reduction example available through jcuda.org
- * When invoked initially, all blocks partly compute the reduction operation over the entire array
- * and writes it to the output/temporary array. A second invokation needs to happen to get the
+ * When invoked initially, all blocks partly compute the reduction operation
+ * over the entire array
+ * and writes it to the output/temporary array. A second invokation needs to
+ * happen to get the
  * reduced value.
- * The number of threads, blocks and amount of shared memory is calculated in a specific way.
- * Please refer to the NVIDIA CUDA Sample or the SystemML code that invokes this method to see
+ * The number of threads, blocks and amount of shared memory is calculated in a
+ * specific way.
+ * Please refer to the NVIDIA CUDA Sample or the SystemML code that invokes this
+ * method to see
  * how its done.
- * The template-ized version of this function is similar to what is found in NVIDIA CUB
+ * The template-ized version of this function is similar to what is found in
+ * NVIDIA CUB
  *
- * @param ReductionOp       Type of the functor object that implements the reduction operation
+ * @param ReductionOp       Type of the functor object that implements the
+ * reduction operation
  */
-template <typename ReductionOp>
+template <typename ReductionOp, typename T>
 __device__ void reduce(
-    double *g_idata,            ///< input data stored in device memory (of size n)
-    double *g_odata,            ///< output/temporary array stored in device memory (of size n)
-    unsigned int n,             ///< size of the input and temporary/output arrays
-    ReductionOp reduction_op,	///< Reduction operation to perform (functor object)
-	double initialValue)  		///< initial value for the reduction variable
+    T *g_idata,  ///< input data stored in device memory (of size n)
+    T *g_odata,  ///< output/temporary array stored in device memory (of size n)
+    unsigned int n,  ///< size of the input and temporary/output arrays
+    ReductionOp
+        reduction_op,  ///< Reduction operation to perform (functor object)
+    T initialValue)    ///< initial value for the reduction variable
 {
-    extern __shared__ double sdata[];
-
-    // perform first level of reduction,
-    // reading from global memory, writing to shared memory
-    unsigned int tid = threadIdx.x;
-    unsigned int i = blockIdx.x*blockDim.x*2 + threadIdx.x;
-    unsigned int gridSize = blockDim.x*2*gridDim.x;
-
-    double v = initialValue;
-
-    // we reduce multiple elements per thread.  The number is determined by the
-    // number of active thread blocks (via gridDim).  More blocks will result
-    // in a larger gridSize and therefore fewer elements per thread
-    while (i < n)
-    {
-        v = reduction_op(v, g_idata[i]);
-        // ensure we don't read out of bounds
-        if (i + blockDim.x < n)
-            v = reduction_op(v, g_idata[i+blockDim.x]);
-        i += gridSize;
+  // extern __shared__ T sdata[];
+  extern __shared__ __align__(sizeof(T)) unsigned char my_sdata[];
+  T *sdata = reinterpret_cast<T *>(my_sdata);
+
+  // perform first level of reduction,
+  // reading from global memory, writing to shared memory
+  unsigned int tid = threadIdx.x;
+  unsigned int i = blockIdx.x * blockDim.x * 2 + threadIdx.x;
+  unsigned int gridSize = blockDim.x * 2 * gridDim.x;
+
+  T v = initialValue;
+
+  // we reduce multiple elements per thread.  The number is determined by the
+  // number of active thread blocks (via gridDim).  More blocks will result
+  // in a larger gridSize and therefore fewer elements per thread
+  while (i < n) {
+    v = reduction_op(v, g_idata[i]);
+    // ensure we don't read out of bounds
+    if (i + blockDim.x < n) v = reduction_op(v, g_idata[i + blockDim.x]);
+    i += gridSize;
+  }
+
+  // each thread puts its local sum into shared memory
+  sdata[tid] = v;
+  __syncthreads();
+
+  // do reduction in shared mem
+  if (blockDim.x >= 1024) {
+    if (tid < 512) {
+      sdata[tid] = v = reduction_op(v, sdata[tid + 512]);
     }
-
-    // each thread puts its local sum into shared memory
-    sdata[tid] = v;
     __syncthreads();
-
-
-    // do reduction in shared mem
-		if (blockDim.x >= 1024){ if (tid < 512) { sdata[tid] = v = reduction_op(v, sdata[tid + 512]); } __syncthreads(); }
-    if (blockDim.x >= 512) { if (tid < 256) { sdata[tid] = v = reduction_op(v, sdata[tid + 256]); } __syncthreads(); }
-    if (blockDim.x >= 256) { if (tid < 128) { sdata[tid] = v = reduction_op(v, sdata[tid + 128]); } __syncthreads(); }
-    if (blockDim.x >= 128) { if (tid <  64) { sdata[tid] = v = reduction_op(v, sdata[tid +  64]); } __syncthreads(); }
-
-    if (tid < 32)
-    {
-        // now that we are using warp-synchronous programming (below)
-        // we need to declare our shared memory volatile so that the compiler
-        // doesn't reorder stores to it and induce incorrect behavior.
-        volatile double* smem = sdata;
-        if (blockDim.x >=  64) { smem[tid] = v = reduction_op(v, smem[tid + 32]); }
-        if (blockDim.x >=  32) { smem[tid] = v = reduction_op(v, smem[tid + 16]); }
-        if (blockDim.x >=  16) { smem[tid] = v = reduction_op(v, smem[tid +  8]); }
-        if (blockDim.x >=   8) { smem[tid] = v = reduction_op(v, smem[tid +  4]); }
-        if (blockDim.x >=   4) { smem[tid] = v = reduction_op(v, smem[tid +  2]); }
-        if (blockDim.x >=   2) { smem[tid] = v = reduction_op(v, smem[tid +  1]); }
+  }
+  if (blockDim.x >= 512) {
+    if (tid < 256) {
+      sdata[tid] = v = reduction_op(v, sdata[tid + 256]);
     }
+    __syncthreads();
+  }
+  if (blockDim.x >= 256) {
+    if (tid < 128) {
+      sdata[tid] = v = reduction_op(v, sdata[tid + 128]);
+    }
+    __syncthreads();
+  }
+  if (blockDim.x >= 128) {
+    if (tid < 64) {
+      sdata[tid] = v = reduction_op(v, sdata[tid + 64]);
+    }
+    __syncthreads();
+  }
+
+  if (tid < 32) {
+    // now that we are using warp-synchronous programming (below)
+    // we need to declare our shared memory volatile so that the compiler
+    // doesn't reorder stores to it and induce incorrect behavior.
+    volatile T *smem = sdata;
+    if (blockDim.x >= 64) {
+      smem[tid] = v = reduction_op(v, smem[tid + 32]);
+    }
+    if (blockDim.x >= 32) {
+      smem[tid] = v = reduction_op(v, smem[tid + 16]);
+    }
+    if (blockDim.x >= 16) {
+      smem[tid] = v = reduction_op(v, smem[tid + 8]);
+    }
+    if (blockDim.x >= 8) {
+      smem[tid] = v = reduction_op(v, smem[tid + 4]);
+    }
+    if (blockDim.x >= 4) {
+      smem[tid] = v = reduction_op(v, smem[tid + 2]);
+    }
+    if (blockDim.x >= 2) {
+      smem[tid] = v = reduction_op(v, smem[tid + 1]);
+    }
+  }
 
-    // write result for this block to global mem
-    if (tid == 0)
-        g_odata[blockIdx.x] = sdata[0];
+  // write result for this block to global mem
+  if (tid == 0) g_odata[blockIdx.x] = sdata[0];
 }
 
-
-
 /**
  * Does a reduce (sum) over each row of the array.
  * This kernel must be launched with as many blocks as there are rows.
- * The intuition for this kernel is that each block does a reduction over a single row.
- * The maximum number of blocks that can launched (as of compute capability 3.0) is 2^31 - 1
- * This works out fine for SystemML, since the maximum elements in a Java array can be 2^31 - c (some small constant)
- * If the matrix is "fat" and "short", i.e. there are small number of rows and a large number of columns,
+ * The intuition for this kernel is that each block does a reduction over a
+ * single row.
+ * The maximum number of blocks that can launched (as of compute capability 3.0)
+ * is 2^31 - 1
+ * This works out fine for SystemML, since the maximum elements in a Java array
+ * can be 2^31 - c (some small constant)
+ * If the matrix is "fat" and "short", i.e. there are small number of rows and a
+ * large number of columns,
  * there could be under-utilization of the hardware.
- * The template-ized version of this function is similar to what is found in NVIDIA CUB
- * @param ReductionOp       Type of the functor object that implements the reduction operation
- * @param AssignmentOp      Type of the functor object that is used to modify the value before writing it to its final location in global memory for each row
+ * The template-ized version of this function is similar to what is found in
+ * NVIDIA CUB
+ * @param ReductionOp       Type of the functor object that implements the
+ * reduction operation
+ * @param AssignmentOp      Type of the functor object that is used to modify
+ * the value before writing it to its final location in global memory for each
+ * row
  */
-template <typename ReductionOp,
-          typename AssignmentOp>
+template <typename ReductionOp, typename AssignmentOp, typename T>
 __device__ void reduce_row(
-    double *g_idata,            ///< input data stored in device memory (of size rows*cols)
-    double *g_odata,            ///< output/temporary array store in device memory (of size rows*cols)
-    unsigned int rows,          ///< rows in input and temporary/output arrays
-    unsigned int cols,          ///< columns in input and temporary/output arrays
-    ReductionOp reduction_op,		///< Reduction operation to perform (functor object)
-    AssignmentOp assignment_op, ///< Operation to perform before assigning this to its final location in global memory for each row
-    double initialValue){  			///< initial value for the reduction variable
-    extern __shared__ double sdata[];
-
-    // one block per row
-    if (blockIdx.x >= rows) {
-        return;
+    T *g_idata,  ///< input data stored in device memory (of size rows*cols)
+    T *g_odata,  ///< output/temporary array store in device memory (of size
+                 ///rows*cols)
+    unsigned int rows,  ///< rows in input and temporary/output arrays
+    unsigned int cols,  ///< columns in input and temporary/output arrays
+    ReductionOp
+        reduction_op,  ///< Reduction operation to perform (functor object)
+    AssignmentOp assignment_op,  ///< Operation to perform before assigning this
+                                 ///to its final location in global memory for
+                                 ///each row
+    T initialValue) {            ///< initial value for the reduction variable
+  // extern __shared__ T sdata[];
+  extern __shared__ __align__(sizeof(T)) unsigned char my_sdata[];
+  T *sdata = reinterpret_cast<T *>(my_sdata);
+
+  // one block per row
+  if (blockIdx.x >= rows) {
+    return;
+  }
+
+  unsigned int block = blockIdx.x;
+  unsigned int tid = threadIdx.x;
+  unsigned int i = tid;
+  unsigned int block_offset = block * cols;
+
+  T v = initialValue;
+  while (i < cols) {
+    v = reduction_op(v, g_idata[block_offset + i]);
+    i += blockDim.x;
+  }
+
+  // each thread puts its local sum into shared memory
+  sdata[tid] = v;
+  __syncthreads();
+
+  // do reduction in shared mem
+  if (blockDim.x >= 1024) {
+    if (tid < 512) {
+      sdata[tid] = v = reduction_op(v, sdata[tid + 512]);
     }
-
-    unsigned int block = blockIdx.x;
-    unsigned int tid = threadIdx.x;
-    unsigned int i = tid;
-    unsigned int block_offset = block * cols;
-
-    double v = initialValue;
-    while (i < cols){
-        v = reduction_op(v, g_idata[block_offset + i]);
-        i += blockDim.x;
+    __syncthreads();
+  }
+  if (blockDim.x >= 512) {
+    if (tid < 256) {
+      sdata[tid] = v = reduction_op(v, sdata[tid + 256]);
     }
-
-    // each thread puts its local sum into shared memory
-    sdata[tid] = v;
     __syncthreads();
-
- 		// do reduction in shared mem
-  	if (blockDim.x >= 1024){ if (tid < 512) { sdata[tid] = v = reduction_op(v, sdata[tid + 512]); } __syncthreads(); }
-    if (blockDim.x >= 512) { if (tid < 256) { sdata[tid] = v = reduction_op(v, sdata[tid + 256]); } __syncthreads(); }
-    if (blockDim.x >= 256) { if (tid < 128) { sdata[tid] = v = reduction_op(v, sdata[tid + 128]); } __syncthreads(); }
-    if (blockDim.x >= 128) { if (tid <  64) { sdata[tid] = v = reduction_op(v, sdata[tid +  64]); } __syncthreads(); }
-
-    if (tid < 32)
-    {
-        // now that we are using warp-synchronous programming (below)
-        // we need to declare our shared memory volatile so that the compiler
-        // doesn't reorder stores to it and induce incorrect behavior.
-        volatile double* smem = sdata;
-        if (blockDim.x >=  64) { smem[tid] = v = reduction_op(v, smem[tid + 32]); }
-        if (blockDim.x >=  32) { smem[tid] = v = reduction_op(v, smem[tid + 16]); }
-        if (blockDim.x >=  16) { smem[tid] = v = reduction_op(v, smem[tid +  8]); }
-        if (blockDim.x >=   8) { smem[tid] = v = reduction_op(v, smem[tid +  4]); }
-        if (blockDim.x >=   4) { smem[tid] = v = reduction_op(v, smem[tid +  2]); }
-        if (blockDim.x >=   2) { smem[tid] = v = reduction_op(v, smem[tid +  1]); }
+  }
+  if (blockDim.x >= 256) {
+    if (tid < 128) {
+      sdata[tid] = v = reduction_op(v, sdata[tid + 128]);
+    }
+    __syncthreads();
+  }
+  if (blockDim.x >= 128) {
+    if (tid < 64) {
+      sdata[tid] = v = reduction_op(v, sdata[tid + 64]);
+    }
+    __syncthreads();
+  }
+
+  if (tid < 32) {
+    // now that we are using warp-synchronous programming (below)
+    // we need to declare our shared memory volatile so that the compiler
+    // doesn't reorder stores to it and induce incorrect behavior.
+    volatile T *smem = sdata;
+    if (blockDim.x >= 64) {
+      smem[tid] = v = reduction_op(v, smem[tid + 32]);
+    }
+    if (blockDim.x >= 32) {
+      smem[tid] = v = reduction_op(v, smem[tid + 16]);
+    }
+    if (blockDim.x >= 16) {
+      smem[tid] = v = reduction_op(v, smem[tid + 8]);
+    }
+    if (blockDim.x >= 8) {
+      smem[tid] = v = reduction_op(v, smem[tid + 4]);
+    }
+    if (blockDim.x >= 4) {
+      smem[tid] = v = reduction_op(v, smem[tid + 2]);
     }
+    if (blockDim.x >= 2) {
+      smem[tid] = v = reduction_op(v, smem[tid + 1]);
+    }
+  }
 
-    // write result for this block to global mem, modify it with assignment op
-    if (tid == 0)
-        g_odata[block] = assignment_op(sdata[0]);
+  // write result for this block to global mem, modify it with assignment op
+  if (tid == 0) g_odata[block] = assignment_op(sdata[0]);
 }
 
-
 /**
  * Does a column wise reduction.
  * The intuition is that there are as many global threads as there are columns
@@ -618,57 +923,59 @@ __device__ void reduce_row(
  * This of course leads to a under-utilization of the GPU resources.
  * For cases, where the number of columns is small, there can be unused SMs
  *
- * The template-ized version of this function is similar to what is found in NVIDIA CUB
- * @param ReductionOp       Type of the functor object that implements the reduction operation
- * @param AssignmentOp      Type of the functor object that is used to modify the value before writing it to its final location in global memory for each column
+ * The template-ized version of this function is similar to what is found in
+ * NVIDIA CUB
+ * @param ReductionOp       Type of the functor object that implements the
+ * reduction operation
+ * @param AssignmentOp      Type of the functor object that is used to modify
+ * the value before writing it to its final location in global memory for each
+ * column
  */
-template <typename ReductionOp,
-          typename AssignmentOp>
+template <typename ReductionOp, typename AssignmentOp, typename T>
 __device__ void reduce_col(
-    double *g_idata,            ///< input data stored in device memory (of size rows*cols)
-    double *g_odata,            ///< output/temporary array store in device memory (of size rows*cols)
-    unsigned int rows,          ///< rows in input and temporary/output arrays
-    unsigned int cols,          ///< columns in input and temporary/output arrays
-    ReductionOp reduction_op,	///< Reduction operation to perform (functor object)
-    AssignmentOp assignment_op, ///< Operation to perform before assigning this to its final location in global memory for each column
-    double initialValue)  		///< initial value for the reduction variable
+    T *g_idata,  ///< input data stored in device memory (of size rows*cols)
+    T *g_odata,  ///< output/temporary array store in device memory (of size
+                 ///rows*cols)
+    unsigned int rows,  ///< rows in input and temporary/output arrays
+    unsigned int cols,  ///< columns in input and temporary/output arrays
+    ReductionOp
+        reduction_op,  ///< Reduction operation to perform (functor object)
+    AssignmentOp assignment_op,  ///< Operation to perform before assigning this
+                                 ///to its final location in global memory for
+                                 ///each column
+    T initialValue)              ///< initial value for the reduction variable
 {
-    unsigned int global_tid = blockIdx.x * blockDim.x + threadIdx.x;
-    if (global_tid >= cols) {
-        return;
-    }
-
-    unsigned int i = global_tid;
-    unsigned int grid_size = cols;
-    double val = initialValue;
-
-    while (i < rows * cols) {
-      val = reduction_op(val, g_idata[i]);
-      i += grid_size;
-    }
-    g_odata[global_tid] = assignment_op(val);
+  unsigned int global_tid = blockIdx.x * blockDim.x + threadIdx.x;
+  if (global_tid >= cols) {
+    return;
+  }
+
+  unsigned int i = global_tid;
+  unsigned int grid_size = cols;
+  T val = initialValue;
+
+  while (i < rows * cols) {
+    val = reduction_op(val, g_idata[i]);
+    i += grid_size;
+  }
+  g_odata[global_tid] = assignment_op(val);
 }
 
 /**
  * Functor op for assignment op. This is a dummy/identity op.
  */
-typedef struct {
-    __device__ __forceinline__
-    double operator()(double a) const {
-        return a;
-    }
-} IdentityOp;
+template <typename T>
+struct IdentityOp {
+  __device__ __forceinline__ T operator()(T a) const { return a; }
+};
 
 /**
  * Functor op for summation operation
  */
-typedef struct {
-    __device__ __forceinline__
-    double operator()(double a, double b) const {
-        return a + b;
-    }
-} SumOp;
-
+template <typename T>
+struct SumOp {
+  __device__ __forceinline__ T operator()(T a, T b) const { return a + b; }
+};
 
 /**
  * Do a summation over all elements of an array/matrix
@@ -676,10 +983,20 @@ typedef struct {
  * @param g_odata   output/temporary array stored in device memory (of size n)
  * @param n         size of the input and temporary/output arrays
  */
-extern "C"
-__global__ void reduce_sum(double *g_idata, double *g_odata, unsigned int n){
-	SumOp op;
-  reduce<SumOp>(g_idata, g_odata, n, op, 0.0);
+template <typename T>
+__device__ void reduce_sum(T *g_idata, T *g_odata, unsigned int n) {
+  SumOp<T> op;
+  reduce<SumOp<T>, T>(g_idata, g_odata, n, op, (T)0.0);
+}
+
+extern "C" __global__ void reduce_sum_d(double *g_idata, double *g_odata,
+                                       unsigned int n) {
+  reduce_sum(g_idata, g_odata, n);
+}
+
+extern "C" __global__ void reduce_sum_f(float *g_idata, float *g_odata,
+                                       unsigned int n) {
+  reduce_sum(g_idata, g_odata, n);
 }
 
 /**
@@ -689,11 +1006,25 @@ __global__ void reduce_sum(double *g_idata, double *g_odata, unsigned int n){
  * @param rows      number of rows in input matrix
  * @param cols      number of columns in input matrix
  */
-extern "C"
-__global__ void reduce_row_sum(double *g_idata, double *g_odata, unsigned int rows, unsigned int cols){
-    SumOp op;
-    IdentityOp aop;
-    reduce_row<SumOp, IdentityOp>(g_idata, g_odata, rows, cols, op, aop, 0.0);
+template <typename T>
+__device__ void reduce_row_sum(T *g_idata, T *g_odata, unsigned int rows,
+                               unsigned int cols) {
+  SumOp<T> op;
+  IdentityOp<T> aop;
+  reduce_row<SumOp<T>, IdentityOp<T>, T>(g_idata, g_odata, rows, cols, op, aop,
+                                         0.0);
+}
+
+extern "C" __global__ void reduce_row_sum_d(double *g_idata, double *g_odata,
+                                           unsigned int rows,
+                                           unsigned int cols) {
+  reduce_row_sum(g_idata, g_odata, rows, cols);
+}
+
+extern "C" __global__ void reduce_row_sum_f(float *g_idata, float *g_odata,
+                                           unsigned int rows,
+                                           unsigned int cols) {
+  reduce_row_sum(g_idata, g_odata, rows, cols);
 }
 
 /**
@@ -703,23 +1034,39 @@ __global__ void reduce_row_sum(double *g_idata, double *g_odata, unsigned int ro
  * @param rows      number of rows in input matrix
  * @param cols      number of columns in input matrix
  */
-extern "C"
-__global__ void reduce_col_sum(double *g_idata, double *g_odata, unsigned int rows, unsigned int cols){
-    SumOp op;
-    IdentityOp aop;
-    reduce_col<SumOp, IdentityOp>(g_idata, g_odata, rows, cols, op, aop, 0.0);
+template <typename T>
+__device__ void reduce_col_sum(T *g_idata, T *g_odata, unsigned int rows,
+                               unsigned int cols) {
+  SumOp<T> op;
+  IdentityOp<T> aop;
+  reduce_col<SumOp<T>, IdentityOp<T>, T>(g_idata, g_odata, rows, cols, op, aop,
+                                         (T)0.0);
+}
+
+extern "C" __global__ void reduce_col_sum_d(double *g_idata, double *g_odata,
+                                           unsigned int rows,
+                                           unsigned int cols) {
+  reduce_col_sum(g_idata, g_odata, rows, cols);
 }
 
+extern "C" __global__ void reduce_col_sum_f(float *g_idata, float *g_odata,
+                                           unsigned int rows,
+                                           unsigned int cols) {
+  reduce_col_sum(g_idata, g_odata, rows, cols);
+}
 
 /**
  * Functor op for max operation
  */
-typedef struct {
-    __device__ __forceinline__
-    double operator()(double a, double b) const {
-        return fmax(a, b);
-    }
-} MaxOp;
+template <typename T>
+struct MaxOp {
+  __device__ __forceinline__ T operator()(T a, T b) const { return fmax(a, b); }
+};
+
+template<>
+struct MaxOp<float> {
+  __device__ __forceinline__ float operator()(float a, float b) const { return fmaxf(a, b); }
+};
 
 
 /**
@@ -728,10 +1075,20 @@ typedef struct {
  * @param g_odata   output/temporary array stored in device memory (of size n)
  * @param n         size of the input and temporary/output arrays
  */
-extern "C"
-__global__ void reduce_max(double *g_idata, double *g_odata, unsigned int n){
-    MaxOp op;
-    reduce<MaxOp>(g_idata, g_odata, n, op, -DBL_MAX);
+template <typename T>
+__device__ void reduce_max(T *g_idata, T *g_odata, unsigned int n) {
+  MaxOp<T> op;
+  reduce<MaxOp<T>, T>(g_idata, g_odata, n, op, -T_MAX(g_idata[0]));
+}
+
+extern "C" __global__ void reduce_max_d(double *g_idata, double *g_odata,
+                                       unsigned int n) {
+  reduce_max(g_idata, g_odata, n);
+}
+
+extern "C" __global__ void reduce_max_f(float *g_idata, float *g_odata,
+                                       unsigned int n) {
+  reduce_max(g_idata, g_odata, n);
 }
 
 /**
@@ -741,11 +1098,25 @@ __global__ void reduce_max(double *g_idata, double *g_odata, unsigned int n){
  * @param rows      number of rows in input matrix
  * @param cols      number of columns in input matrix
  */
-extern "C"
-__global__ void reduce_row_max(double *g_idata, double *g_odata, unsigned int rows, unsigned int cols){
-    MaxOp op;
-    IdentityOp aop;
-    reduce_row<MaxOp, IdentityOp>(g_idata, g_odata, rows, cols, op, aop, -DBL_MAX);
+template <typename T>
+__device__ void reduce_row_max(T *g_idata, T *g_odata, unsigned int rows,
+                               unsigned int cols) {
+  MaxOp<T> op;
+  IdentityOp<T> aop;
+  reduce_row<MaxOp<T>, IdentityOp<T>, T>(g_idata, g_odata, rows, cols, op, aop,
+                                         -T_MAX(g_idata[0]));
+}
+
+extern "C" __global__ void reduce_row_max_d(double *g_idata, double *g_odata,
+                                           unsigned int rows,
+                                           unsigned int cols) {
+  reduce_row_max(g_idata, g_odata, rows, cols);
+}
+
+extern "C" __global__ void reduce_row_max_f(float *g_idata, float *g_odata,
+                                           unsigned int rows,
+                                           unsigned int cols) {
+  reduce_row_max(g_idata, g_odata, rows, cols);
 }
 
 /**
@@ -755,22 +1126,34 @@ __global__ void reduce_row_max(double *g_idata, double *g_odata, unsigned int ro
  * @param rows      number of rows in input matrix
  * @param cols      number of columns in input matrix
  */
-extern "C"
-__global__ void reduce_col_max(double *g_idata, double *g_odata, unsigned int rows, unsigned int cols){
-    MaxOp op;
-    IdentityOp aop;
-    reduce_col<MaxOp, IdentityOp>(g_idata, g_odata, rows, cols, op, aop, -DBL_MAX);
+template <typename T>
+__device__ void reduce_col_max(T *g_idata, T *g_odata, unsigned int rows,
+                               unsigned int cols) {
+  MaxOp<T> op;
+  IdentityOp<T> aop;
+  reduce_col<MaxOp<T>, IdentityOp<T>, T>(g_idata, g_odata, rows, cols, op, aop,
+                                         (T)-T_MAX(g_idata[0]));
+}
+
+extern "C" __global__ void reduce_col_max_d(double *g_idata, double *g_odata,
+                                           unsigned int rows,
+                                           unsigned int cols) {
+  reduce_col_max(g_idata, g_odata, rows, cols);
+}
+
+extern "C" __global__ void reduce_col_max_f(float *g_idata, float *g_odata,
+                                           unsigned int rows,
+                                           unsigned int cols) {
+  reduce_col_max(g_idata, g_odata, rows, cols);
 }
 
 /**
  * Functor op for min operation
  */
-typedef struct {
-    __device__ __forceinline__
-    double operator()(double a, double b) const {
-        return fmin(a, b);
-    }
-} MinOp;
+template <typename T>
+struct MinOp {
+  __device__ __forceinline__ T operator()(T a, T b) const { return fmin(a, b); }
+};
 
 /**
  * Do a min over all elements of an array/matrix
@@ -778,10 +1161,20 @@ typedef struct {
  * @param g_odata   output/temporary array stored in device memory (of size n)
  * @param n         size of the input and temporary/output arrays
  */
-extern "C"
-__global__ void reduce_min(double *g_idata, double *g_odata, unsigned int n){
-	MinOp op;
-    reduce<MinOp>(g_idata, g_odata, n, op, DBL_MAX);
+template <typename T>
+__device__ void reduce_min(T *g_idata, T *g_odata, unsigned int n) {
+  MinOp<T> op;
+  reduce<MinOp<T>, T>(g_idata, g_odata, n, op, T_MAX(g_idata[0]));
+}
+
+extern "C" __global__ void reduce_min_d(double *g_idata, double *g_odata,
+                                       unsigned int n) {
+  reduce_min(g_idata, g_odata, n);
+}
+
+extern "C" __global__ void reduce_min_f(float *g_idata, float *g_odata,
+                                       unsigned int n) {
+  reduce_min(g_idata, g_odata, n);
 }
 
 /**
@@ -791,11 +1184,25 @@ __global__ void reduce_min(double *g_idata, double *g_odata, unsigned int n){
  * @param rows      number of rows in input matrix
  * @param cols      number of columns in input matrix
  */
-extern "C"
-__global__ void reduce_row_min(double *g_idata, double *g_odata, unsigned int rows, unsigned int cols){
-    MinOp op;
-    IdentityOp aop;
-    reduce_row<MinOp, IdentityOp>(g_idata, g_odata, rows, cols, op, aop, DBL_MAX);
+template <typename T>
+__device__ void reduce_row_min(T *g_idata, T *g_odata, unsigned int rows,
+                               unsigned int cols) {
+  MinOp<T> op;
+  IdentityOp<T> aop;
+  reduce_row<MinOp<T>, IdentityOp<T>, T>(g_idata, g_odata, rows, cols, op, aop,
+                                         T_MAX(g_idata[0]));
+}
+
+extern "C" __global__ void reduce_row_min_d(double *g_idata, double *g_odata,
+                                           unsigned int rows,
+                                           unsigned int cols) {
+  reduce_row_min(g_idata, g_odata, rows, cols);
+}
+
+extern "C" __global__ void reduce_row_min_f(float *g_idata, float *g_odata,
+                                           unsigned int rows,
+                                           unsigned int cols) {
+  reduce_row_min(g_idata, g_odata, rows, cols);
 }
 
 /**
@@ -805,22 +1212,34 @@ __global__ void reduce_row_min(double *g_idata, double *g_odata, unsigned int ro
  * @param rows      number of rows in input matrix
  * @param cols      number of columns in input matrix
  */
-extern "C"
-__global__ void reduce_col_min(double *g_idata, double *g_odata, unsigned int rows, unsigned int cols){
-    MinOp op;
-    IdentityOp aop;
-    reduce_col<MinOp>(g_idata, g_odata, rows, cols, op, aop, DBL_MAX);
+template <typename T>
+__device__ void reduce_col_min(T *g_idata, T *g_odata, unsigned int rows,
+                               unsigned int cols) {
+  MinOp<T> op;
+  IdentityOp<T> aop;
+  reduce_col<MinOp<T>, IdentityOp<T>, T>(g_idata, g_odata, rows, cols, op, aop,
+                                         T_MAX(g_idata[0]));
+}
+
+extern "C" __global__ void reduce_col_min_d(double *g_idata, double *g_odata,
+                                           unsigned int rows,
+                                           unsigned int cols) {
+  reduce_col_min(g_idata, g_odata, rows, cols);
+}
+
+extern "C" __global__ void reduce_col_min_f(float *g_idata, float *g_odata,
+                                           unsigned int rows,
+                                           unsigned int cols) {
+  reduce_col_min(g_idata, g_odata, rows, cols);
 }
 
 /**
  * Functor op for product operation
  */
-typedef struct {
-    __device__ __forceinline__
-    double operator()(double a, double b) const {
-        return a * b;
-    }
-} ProductOp;
+template <typename T>
+struct ProductOp {
+  __device__ __forceinline__ T operator()(T a, T b) const { return a * b; }
+};
 
 /**
  * Do a product over all elements of an array/matrix
@@ -828,26 +1247,35 @@ typedef struct {
  * @param g_odata   output/temporary array stored in device memory (of size n)
  * @param n         size of the input and temporary/output arrays
  */
-extern "C"
-__global__ void reduce_prod(double *g_idata, double *g_odata, unsigned int n){
-	ProductOp op;
-    reduce<ProductOp>(g_idata, g_odata, n, op, 1.0);
+template <typename T>
+__device__ void reduce_prod(T *g_idata, T *g_odata, unsigned int n) {
+  ProductOp<T> op;
+  reduce<ProductOp<T>, T>(g_idata, g_odata, n, op, (T)1.0);
+}
+
+extern "C" __global__ void reduce_prod_d(double *g_idata, double *g_odata,
+                                        unsigned int n) {
+  reduce_prod(g_idata, g_odata, n);
+}
+
+extern "C" __global__ void reduce_prod_f(float *g_idata, float *g_odata,
+                                        unsigned int n) {
+  reduce_prod(g_idata, g_odata, n);
 }
 
 /**
  * Functor op for mean operation
  */
+template <typename T>
 struct MeanOp {
-    const long _size;   ///< Number of elements by which to divide to calculate mean
-		__device__ __forceinline__
-    MeanOp(long size): _size(size) {}
-    __device__ __forceinline__
-    double operator()(double total) const {
-        return total / _size;
-    }
+  const long
+      _size;  ///< Number of elements by which to divide to calculate mean
+  __device__ __forceinline__ MeanOp(long size) : _size(size) {}
+  __device__ __forceinline__ T operator()(T total) const {
+    return total / _size;
+  }
 };
 
-
 /**
  * Do a mean over all rows of a matrix
  * @param g_idata   input matrix stored in device memory (of size rows * cols)
@@ -855,11 +1283,25 @@ struct MeanOp {
  * @param rows      number of rows in input matrix
  * @param cols      number of columns in input matrix
  */
-extern "C"
-__global__ void reduce_row_mean(double *g_idata, double *g_odata, unsigned int rows, unsigned int cols){
-    SumOp op;
-    MeanOp aop(cols);
-    reduce_row<SumOp, MeanOp>(g_idata, g_odata, rows, cols, op, aop, 0.0);
+template <typename T>
+__device__ void reduce_row_mean(T *g_idata, T *g_odata, unsigned int rows,
+                                unsigned int cols) {
+  SumOp<T> op;
+  MeanOp<T> aop(cols);
+  reduce_row<SumOp<T>, MeanOp<T>, T>(g_idata, g_odata, rows, cols, op, aop,
+                                     (T)0.0);
+}
+
+extern "C" __global__ void reduce_row_mean_d(double *g_idata, double *g_odata,
+                                            unsigned int rows,
+                                            unsigned int cols) {
+  reduce_row_mean(g_idata, g_odata, rows, cols);
+}
+
+extern "C" __global__ void reduce_row_mean_f(float *g_idata, float *g_odata,
+                                            unsigned int rows,
+                                            unsigned int cols) {
+  reduce_row_mean(g_idata, g_odata, rows, cols);
 }
 
 /**
@@ -869,13 +1311,26 @@ __global__ void reduce_row_mean(double *g_idata, double *g_odata, unsigned int r
  * @param rows      number of rows in input matrix
  * @param cols      number of columns in input matrix
  */
-extern "C"
-__global__ void reduce_col_mean(double *g_idata, double *g_odata, unsigned int rows, unsigned int cols){
-    SumOp op;
-    MeanOp aop(rows);
-    reduce_col<SumOp, MeanOp>(g_idata, g_odata, rows, cols, op, aop, 0.0);
+template <typename T>
+__device__ void reduce_col_mean(T *g_idata, T *g_odata, unsigned int rows,
+                                unsigned int cols) {
+  SumOp<T> op;
+  MeanOp<T> aop(rows);
+  reduce_col<SumOp<T>, MeanOp<T>, T>(g_idata, g_odata, rows, cols, op, aop,
+                                     0.0);
 }
 
+extern "C" __global__ void reduce_col_mean_d(double *g_idata, double *g_odata,
+                                            unsigned int rows,
+                                            unsigned int cols) {
+  reduce_col_mean(g_idata, g_odata, rows, cols);
+}
+
+extern "C" __global__ void reduce_col_mean_f(float *g_idata, float *g_odata,
+                                            unsigned int rows,
+                                            unsigned int cols) {
+  reduce_col_mean(g_idata, g_odata, rows, cols);
+}
 
 /**
  * Do an exp over all the elements of a matrix
@@ -883,12 +1338,21 @@ __global__ void reduce_col_mean(double *g_idata, double *g_odata, unsigned int r
  * @param C the pre-allocated output matrix (of length = size)
  * @param size the length of the input and output matrices
  */
-extern "C"
-__global__ void matrix_exp(double *A, double *C, unsigned int size) {
-    int index = blockIdx.x * blockDim.x + threadIdx.x;
-    if (index < size){
-        C[index] = exp(A[index]);
-    }
+template <typename T>
+__device__ void matrix_exp(T *A, T *C, unsigned int size) {
+  int index = blockIdx.x * blockDim.x + threadIdx.x;
+  if (index < size) {
+    C[index] = exp(A[index]);
+  }
+}
+
+extern "C" __global__ void matrix_exp_d(double *A, double *C,
+                                       unsigned int size) {
+  matrix_exp(A, C, size);
+}
+
+extern "C" __global__ void matrix_exp_f(float *A, float *C, unsigned int size) {
+  matrix_exp(A, C, size);
 }
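 
The element-wise kernels in the remainder of this file all follow the same launch pattern. A small host-side sketch for matrix_exp_d (the block size of 256 is an arbitrary choice; any 1-D grid that covers size elements is correct because of the index guard above):

#include <cuda_runtime.h>

// Apply exp element-wise to a device array of `size` doubles
// (assumes matrix_exp_d from above is visible in this translation unit).
void host_matrix_exp(double *d_A, double *d_C, unsigned int size) {
  const unsigned int threads = 256;                    // arbitrary block size
  unsigned int blocks = (size + threads - 1) / threads;
  matrix_exp_d<<<blocks, threads>>>(d_A, d_C, size);
  cudaDeviceSynchronize();  // wait for the kernel to finish
}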
 
 /**
@@ -897,12 +1361,21 @@ __global__ void matrix_exp(double *A, double *C, unsigned int size) {
  * @param C the pre-allocated output matrix (of length = size)
  * @param size the length of the input and output matrices
  */
-extern "C"
-__global__ void matrix_sqrt(double *A, double *C, unsigned int size) {
-    int index = blockIdx.x * blockDim.x + threadIdx.x;
-    if (index < size){
-        C[index] = sqrt(A[index]);
-    }
+template <typename T>
+__device__ void matrix_sqrt(T *A, T *C, unsigned int size) {
+  int index = blockIdx.x * blockDim.x + threadIdx.x;
+  if (index < size) {
+    C[index] = sqrt(A[index]);
+  }
+}
+
+extern "C" __global__ void matrix_sqrt_d(double *A, double *C,
+                                        unsigned int size) {
+  matrix_sqrt(A, C, size);
+}
+
+extern "C" __global__ void matrix_sqrt_f(float *A, float *C, unsigned int size) {
+  matrix_sqrt(A, C, size);
 }
 
 /**
@@ -911,12 +1384,22 @@ __global__ void matrix_sqrt(double *A, double *C, unsigned int size) {
  * @param C the pre-allocated output matrix (of length = size)
  * @param size the length of the input and output matrices
  */
-extern "C"
-__global__ void matrix_round(double *A, double *C, unsigned int size) {
-    int index = blockIdx.x * blockDim.x + threadIdx.x;
-    if (index < size){
-        C[index] = (double)llround(A[index]);
-    }
+template <typename T>
+__device__ void matrix_round(T *A, T *C, unsigned int size) {
+  int index = blockIdx.x * blockDim.x + threadIdx.x;
+  if (index < size) {
+    C[index] = (T)llround(A[index]);
+  }
+}
+
+extern "C" __global__ void matrix_round_d(double *A, double *C,
+                                         unsigned int size) {
+  matrix_round(A, C, size);
+}
+
+extern "C" __global__ void matrix_round_f(float *A, float *C,
+                                         unsigned int size) {
+  matrix_round(A, C, size);
 }
 
 /**
@@ -925,12 +1408,21 @@ __global__ void matrix_round(double *A, double *C, unsigned int size) {
  * @param C the pre-allocated output matrix (of length = size)
  * @param size the length of the input and output matrices
  */
-extern "C"
-__global__ void matrix_abs(double *A, double *C, unsigned int size) {
-    int index = blockIdx.x * blockDim.x + threadIdx.x;
-    if (index < size){
-        C[index] = (double)fabs(A[index]);
-    }
+template <typename T>
+__device__ void matrix_abs(T *A, T *C, unsigned int size) {
+  int index = blockIdx.x * blockDim.x + threadIdx.x;
+  if (index < size) {
+    C[index] = (T)fabs(A[index]);
+  }
+}
+
+extern "C" __global__ void matrix_abs_d(double *A, double *C,
+                                       unsigned int size) {
+  matrix_abs(A, C, size);
+}
+
+extern "C" __global__ void matrix_abs_f(float *A, float *C, unsigned int size) {
+  matrix_abs(A, C, size);
 }
 
 /**
@@ -939,12 +1431,21 @@ __global__ void matrix_abs(double *A, double *C, unsigned int size) {
  * @param C the pre-allocated output matrix (of length = size)
  * @param size the length of the input and output matrices
  */
-extern "C"
-__global__ void matrix_log(double *A, double *C, unsigned int size) {
-    int index = blockIdx.x * blockDim.x + threadIdx.x;
-    if (index < size){
-        C[index] = log(A[index]);
-    }
+template <typename T>
+__device__ void matrix_log(T *A, T *C, unsigned int size) {
+  int index = blockIdx.x * blockDim.x + threadIdx.x;
+  if (index < size) {
+    C[index] = log(A[index]);
+  }
+}
+
+extern "C" __global__ void matrix_log_d(double *A, double *C,
+                                       unsigned int size) {
+  matrix_log(A, C, size);
+}
+
+extern "C" __global__ void matrix_log_f(float *A, float *C, unsigned int size) {
+  matrix_log(A, C, size);
 }
 
 /**
@@ -953,12 +1454,22 @@ __global__ void matrix_log(double *A, double *C, unsigned int size) {
  * @param C the pre-allocated output matrix (of length = size)
  * @param size the length of the input and output matrices
  */
-extern "C"
-__global__ void matrix_floor(double *A, double *C, unsigned int size) {
-    int index = blockIdx.x * blockDim.x + threadIdx.x;
-    if (index < size){
-        C[index] = floor(A[index]);
-    }
+template <typename T>
+__device__ void matrix_floor(T *A, T *C, unsigned int size) {
+  int index = blockIdx.x * blockDim.x + threadIdx.x;
+  if (index < size) {
+    C[index] = floor(A[index]);
+  }
+}
+
+extern "C" __global__ void matrix_floor_d(double *A, double *C,
+                                         unsigned int size) {
+  matrix_floor(A, C, size);
+}
+
+extern "C" __global__ void matrix_floor_f(float *A, float *C,
+                                         unsigned int size) {
+  matrix_floor(A, C, size);
 }
 
 /**
@@ -967,12 +1478,21 @@ __global__ void matrix_floor(double *A, double *C, unsigned int size) {
  * @param C the pre-allocated output matrix (of length = size)
  * @param size the length of the input and output matrices
  */
-extern "C"
-__global__ void matrix_ceil(double *A, double *C, unsigned int size) {
-    int index = blockIdx.x * blockDim.x + threadIdx.x;
-    if (index < size){
-        C[index] = ceil(A[index]);
-    }
+template <typename T>
+__device__ void matrix_ceil(T *A, T *C, unsigned int size) {
+  int index = blockIdx.x * blockDim.x + threadIdx.x;
+  if (index < size) {
+    C[index] = ceil(A[index]);
+  }
+}
+
+extern "C" __global__ void matrix_ceil_d(double *A, double *C,
+                                        unsigned int size) {
+  matrix_ceil(A, C, size);
+}
+
+extern "C" __global__ void matrix_ceil_f(float *A, float *C, unsigned int size) {
+  matrix_ceil(A, C, size);
 }
 
 /**
@@ -981,12 +1501,21 @@ __global__ void matrix_ceil(double *A, double *C, unsigned int size) {
  * @param C the pre-allocated output matrix (of length = size)
  * @param size the length of the input and output matrices
  */
-extern "C"
-__global__ void matrix_sin(double *A, double *C, unsigned int size) {
-    int index = blockIdx.x * blockDim.x + threadIdx.x;
-    if (index < size){
-        C[index] = sin(A[index]);
-    }
+template <typename T>
+__device__ void matrix_sin(T *A, T *C, unsigned int size) {
+  int index = blockIdx.x * blockDim.x + threadIdx.x;
+  if (index < size) {
+    C[index] = sin(A[index]);
+  }
+}
+
+extern "C" __global__ void matrix_sin_d(double *A, double *C,
+                                       unsigned int size) {
+  matrix_sin(A, C, size);
+}
+
+extern "C" __global__ void matrix_sin_f(float *A, float *C, unsigned int size) {
+  matrix_sin(A, C, size);
 }
 
 /**
@@ -995,12 +1524,21 @@ __global__ void matrix_sin(double *A, double *C, unsigned int size) {
  * @param C the pre-allocated output matrix (of length = size)
  * @param size the length of the input and output matrices
  */
-extern "C"
-__global__ void matrix_sinh(double *A, double *C, unsigned int size) {
-    int index = blockIdx.x * blockDim.x + threadIdx.x;
-    if (index < size){
-        C[index] = sinh(A[index]);
-    }
+template <typename T>
+__device__ void matrix_sinh(T *A, T *C, unsigned int size) {
+  int index = blockIdx.x * blockDim.x + threadIdx.x;
+  if (index < size) {
+    C[index] = sinh(A[index]);
+  }
+}
+
+extern "C" __global__ void matrix_sinh_d(double *A, double *C,
+                                        unsigned int size) {
+  matrix_sinh(A, C, size);
+}
+
+extern "C" __global__ void matrix_sinh_f(float *A, float *C, unsigned int size) {
+  matrix_sinh(A, C, size);
 }
 
 /**
@@ -1009,12 +1547,21 @@ __global__ void matrix_sinh(double *A, double *C, unsigned int size) {
  * @param C the pre-allocated output matrix (of length = size)
  * @param size the length of the input and output matrices
  */
-extern "C"
-__global__ void matrix_cos(double *A, double *C, unsigned int size) {
-    int index = blockIdx.x * blockDim.x + threadIdx.x;
-    if (index < size){
-        C[index] = cos(A[index]);
-    }
+template <typename T>
+__device__ void matrix_cos(T *A, T *C, unsigned int size) {
+  int index = blockIdx.x * blockDim.x + threadIdx.x;
+  if (index < size) {
+    C[index] = cos(A[index]);
+  }
+}
+
+extern "C" __global__ void matrix_cos_d(double *A, double *C,
+                                       unsigned int size) {
+  matrix_cos(A, C, size);
+}
+
+extern "C" __global__ void matrix_cos_f(float *A, float *C, unsigned int size) {
+  matrix_cos(A, C, size);
 }
 
 /**
@@ -1023,12 +1570,21 @@ __global__ void matrix_cos(double *A, double *C, unsigned int size) {
  * @param C the pre-allocated output matrix (of length = size)
  * @param size the length of the input and output matrices
  */
-extern "C"
-__global__ void matrix_cosh(double *A, double *C, unsigned int size) {
-    int index = blockIdx.x * blockDim.x + threadIdx.x;
-    if (index < size){
-        C[index] = cosh(A[index]);
-    }
+template <typename T>
+__device__ void matrix_cosh(T *A, T *C, unsigned int size) {
+  int index = blockIdx.x * blockDim.x + threadIdx.x;
+  if (index < size) {
+    C[index] = cosh(A[index]);
+  }
+}
+
+extern "C" __global__ void matrix_cosh_d(double *A, double *C,
+                                        unsigned int size) {
+  matrix_cosh(A, C, size);
+}
+
+extern "C" __global__ void matrix_cosh_f(float *A, float *C, unsigned int size) {
+  matrix_cosh(A, C, size);
 }
 
 /**
@@ -1037,12 +1593,21 @@ __global__ void matrix_cosh(double *A, double *C, unsigned int size) {
  * @param C the pre-allocated output matrix (of length = size)
  * @param size the length of the input and output matrices
  */
-extern "C"
-__global__ void matrix_tan(double *A, double *C, unsigned int size) {
-    int index = blockIdx.x * blockDim.x + threadIdx.x;
-    if (index < size){
-        C[index] = tan(A[index]);
-    }
+template <typename T>
+__device__ void matrix_tan(T *A, T *C, unsigned int size) {
+  int index = blockIdx.x * blockDim.x + threadIdx.x;
+  if (index < size) {
+    C[index] = tan(A[index]);
+  }
+}
+
+extern "C" __global__ void matrix_tan_d(double *A, double *C,
+                                       unsigned int size) {
+  matrix_tan(A, C, size);
+}
+
+extern "C" __global__ void matrix_tan_f(float *A, float *C, unsigned int size) {
+  matrix_tan(A, C, size);
 }
 
 /**
@@ -1051,12 +1616,21 @@ __global__ void matrix_tan(double *A, double *C, unsigned int size) {
  * @param C the pre-allocated output matrix (of length = size)
  * @param size the length of the input and output matrices
  */
-extern "C"
-__global__ void matrix_tanh(double *A, double *C, unsigned int size) {
-    int index = blockIdx.x * blockDim.x + threadIdx.x;
-    if (index < size){
-        C[index] = tanh(A[index]);
-    }
+template <typename T>
+__device__ void matrix_tanh(T *A, T *C, unsigned int size) {
+  int index = blockIdx.x * blockDim.x + threadIdx.x;
+  if (index < size) {
+    C[index] = tanh(A[index]);
+  }
+}
+
+extern "C" __global__ void matrix_tanh_d(double *A, double *C,
+                                        unsigned int size) {
+  matrix_tanh(A, C, size);
+}
+
+extern "C" __global__ void matrix_tanh_f(float *A, float *C, unsigned int size) {
+  matrix_tanh(A, C, size);
 }
 
 /**
@@ -1065,12 +1639,21 @@ __global__ void matrix_tanh(double *A, double *C, unsigned int size) {
  * @param C the pre-allocated output matrix (of length = size)
  * @param size the length of the input and output matrices
  */
-extern "C"
-__global__ void matrix_asin(double *A, double *C, unsigned int size) {
-    int index = blockIdx.x * blockDim.x + threadIdx.x;
-    if (index < size){
-        C[index] = asin(A[index]);
-    }
+template <typename T>
+__device__ void matrix_asin(T *A, T *C, unsigned int size) {
+  int index = blockIdx.x * blockDim.x + threadIdx.x;
+  if (index < size) {
+    C[index] = asin(A[index]);
+  }
+}
+
+extern "C" __global__ void matrix_asin_d(double *A, double *C,
+                                        unsigned int size) {
+  matrix_asin(A, C, size);
+}
+
+extern "C" __global__ void matrix_asin_f(float *A, float *C, unsigned int size) {
+  matrix_asin(A, C, size);
 }
 
 /**
@@ -1079,12 +1662,21 @@ __global__ void matrix_asin(double *A, double *C, unsigned int size) {
  * @param C the pre-allocated output matrix (of length = size)
  * @param size the length of the input and output matrices
  */
-extern "C"
-__global__ void matrix_acos(double *A, double *C, unsigned int size) {
-    int index = blockIdx.x * blockDim.x + threadIdx.x;
-    if (index < size){
-        C[index] = acos(A[index]);
-    }
+template <typename T>
+__device__ void matrix_acos(T *A, T *C, unsigned int size) {
+  int index = blockIdx.x * blockDim.x + threadIdx.x;
+  if (index < size) {
+    C[index] = acos(A[index]);
+  }
+}
+
+extern "C" __global__ void matrix_acos_d(double *A, double *C,
+                                        unsigned int size) {
+  matrix_acos(A, C, size);
+}
+
+extern "C" __global__ void matrix_acos_f(float *A, float *C, unsigned int size) {
+  matrix_acos(A, C, size);
 }
 
 /**
@@ -1093,12 +1685,21 @@ __global__ void matrix_acos(double *A, double *C, unsigned int size) {
  * @param C the pre-allocated output matrix (of length = size)
  * @param size the length of the input and output matrices
  */
-extern "C"
-__global__ void matrix_atan(double *A, double *C, unsigned int size) {
-    int index = blockIdx.x * blockDim.x + threadIdx.x;
-    if (index < size){
-        C[index] = atan(A[index]);
-    }
+template <typename T>
+__device__ void matrix_atan(T *A, T *C, unsigned int size) {
+  int index = blockIdx.x * blockDim.x + threadIdx.x;
+  if (index < size) {
+    C[index] = atan(A[index]);
+  }
+}
+
+extern "C" __global__ void matrix_atan_d(double *A, double *C,
+                                        unsigned int size) {
+  matrix_atan(A, C, size);
+}
+
+extern "C" __global__ void matrix_atan_f(float *A, float *C, unsigned int size) {
+  matrix_atan(A, C, size);
 }
 
 /**
@@ -1108,14 +1709,23 @@ __global__ void matrix_atan(double *A, double *C, unsigned int size) {
  * @param C the pre-allocated output matrix (of length = size)
  * @param size the length of the input and output matrices
  */
-extern "C"
-__global__ void matrix_sign(double *A, double *C, unsigned int size) {
-    int index = blockIdx.x * blockDim.x + threadIdx.x;
-    if (index < size){
-        if (A[index] == 0.0) {
-            C[index] = 0.0;
-        } else {
-            C[index] = copysign(1.0, A[index]);
-        }
+template <typename T>
+__device__ void matrix_sign(T *A, T *C, unsigned int size) {
+  int index = blockIdx.x * blockDim.x + threadIdx.x;
+  if (index < size) {
+    if (A[index] == 0.0) {
+      C[index] = 0.0;
+    } else {
+      C[index] = copysign(1.0, A[index]);
     }
+  }
+}
+
+extern "C" __global__ void matrix_sign_d(double *A, double *C,
+                                        unsigned int size) {
+  matrix_sign(A, C, size);
 }
+
+extern "C" __global__ void matrix_sign_f(float *A, float *C, unsigned int size) {
+  matrix_sign(A, C, size);
+}
\ No newline at end of file


[39/50] [abbrv] systemml git commit: [SYSTEMML-540] Avoid redundant computation of cudnnPoolingForward in max_pool_backward

Posted by re...@apache.org.
[SYSTEMML-540] Avoid redundant computation of cudnnPoolingForward in max_pool_backward

- If max_pool is invoked in the forward pass, its output can be reused by
  max_pool_backward rather than calling cudnnPoolingForward again (see the
  cuDNN sketch below). For a sentence CNN with 2 epochs, this reduces the time
  spent in max_pool_backward from 6.361 to 2.966 seconds.

Closes #691.
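 
The gist of the change, sketched against the raw cuDNN C API rather than the JCuda calls used in the patch (descriptor setup is omitted and all names are illustrative):

#include <cudnn.h>

// If the pooling output y from the forward pass is still available, it is fed
// straight into cudnnPoolingBackward; otherwise we fall back to recomputing it,
// which is what the old code always did. one and zero point to the usual
// alpha/beta scaling factors in the tensor's data type.
cudnnStatus_t maxpool_backward(cudnnHandle_t h, cudnnPoolingDescriptor_t pool,
                               cudnnTensorDescriptor_t xDesc, const void *x,
                               cudnnTensorDescriptor_t yDesc, void *y,
                               cudnnTensorDescriptor_t dyDesc, const void *dy,
                               cudnnTensorDescriptor_t dxDesc, void *dx,
                               const void *one, const void *zero,
                               bool yAlreadyComputed) {
  if (!yAlreadyComputed) {
    cudnnStatus_t s = cudnnPoolingForward(h, pool, one, xDesc, x, zero, yDesc, y);
    if (s != CUDNN_STATUS_SUCCESS)
      return s;
  }
  return cudnnPoolingBackward(h, pool, one, yDesc, y, dyDesc, dy,
                              xDesc, x, zero, dxDesc, dx);
}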


Project: http://git-wip-us.apache.org/repos/asf/systemml/repo
Commit: http://git-wip-us.apache.org/repos/asf/systemml/commit/06d5bb07
Tree: http://git-wip-us.apache.org/repos/asf/systemml/tree/06d5bb07
Diff: http://git-wip-us.apache.org/repos/asf/systemml/diff/06d5bb07

Branch: refs/heads/master
Commit: 06d5bb073792345f7c4b7ecd0fb4454a335cc421
Parents: 118e3c0
Author: Niketan Pansare <np...@us.ibm.com>
Authored: Sat Oct 28 13:44:37 2017 -0700
Committer: Niketan Pansare <np...@us.ibm.com>
Committed: Sat Oct 28 13:45:52 2017 -0700

----------------------------------------------------------------------
 .../org/apache/sysml/hops/ConvolutionOp.java    | 163 +++++++++++++------
 .../gpu/ConvolutionGPUInstruction.java          |  43 ++++-
 .../runtime/matrix/data/LibMatrixCuDNN.java     |  51 +++---
 .../sysml/test/gpu/NeuralNetworkOpTests.java    |  82 ++++++++++
 4 files changed, 260 insertions(+), 79 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/systemml/blob/06d5bb07/src/main/java/org/apache/sysml/hops/ConvolutionOp.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/hops/ConvolutionOp.java b/src/main/java/org/apache/sysml/hops/ConvolutionOp.java
index 50a7ca3..16a8b63 100644
--- a/src/main/java/org/apache/sysml/hops/ConvolutionOp.java
+++ b/src/main/java/org/apache/sysml/hops/ConvolutionOp.java
@@ -47,14 +47,23 @@ public class ConvolutionOp extends Hop  implements MultiThreadedHop
 	private static final boolean THROW_ERROR_IF_INFERRED_SHAPE_MISMATCH = true;
 	// -------------------------------------------------------------------------
 	
+	// Specifies the type of this hop
 	private Hop.ConvOp op;
-
 	private int _maxNumThreads = -1; //-1 for unlimited
 
 	private ConvolutionOp() {
 		//default constructor for clone
 	}
 
+	/**
+	 * Create a hop from the builtin expression
+	 * 
+	 * @param l name of the hop
+	 * @param dt datatype (only supports matrix datatype)
+	 * @param vt valuetype  (only supports matrix valuetype) 
+	 * @param o type of this hop
+	 * @param inp input hops
+	 */
 	public ConvolutionOp(String l, DataType dt, ValueType vt, ConvOp o, ArrayList<Hop> inp) 
 	{
 		super(l, dt, vt);
@@ -75,8 +84,7 @@ public class ConvolutionOp extends Hop  implements MultiThreadedHop
 		HopsException.check(_input.size() >= 1, this, "should have at least one input but has %d inputs", _input.size());
 	}
 
-	public ConvOp getOp()
-	{
+	public ConvOp getOp() {
 		return op;
 	}
 	
@@ -163,77 +171,129 @@ public class ConvolutionOp extends Hop  implements MultiThreadedHop
 		return input instanceof ConvolutionOp && ((ConvolutionOp) input).getOp() == ConvOp.DIRECT_CONV2D;
 	}
 	
+	/**
+	 * Compares the input parameters for max_pool/max_pool_backward operations
+	 * 
+	 * @return true if the following parameters match: stride=[stride, stride], padding=[pad, pad], input_shape=[numImg, numChannels, imgSize, imgSize], pool_size=[poolSize1, poolSize2]
+	 */
+	private static boolean isPoolingParametersEqualAndKnown(ConvolutionParameters param1, ConvolutionParameters param2) {
+		return isEqualAndKnown(param1.stride_h, param2.stride_h) && isEqualAndKnown(param1.stride_w, param2.stride_w) && 
+			isEqualAndKnown(param1.pad_h, param2.pad_h) && isEqualAndKnown(param1.pad_w, param2.pad_w) &&
+			isEqualAndKnown(param1.R, param2.R) && isEqualAndKnown(param1.S, param2.S) &&
+			isEqualAndKnown(param1.N, param2.N) && isEqualAndKnown(param1.C, param2.C) &&
+			isEqualAndKnown(param1.H, param2.H) && isEqualAndKnown(param1.W, param2.W);
+	}
+	
+	private static boolean isEqualAndKnown(int val1, int val2) {
+		return val1 >= 0 && val2 >= 0 && val1 == val2;
+	}
+	
+	/**
+	 * Returns the output lop of the maxpool operation with the same parameters as this hop.
+	 * If no such lop is found, or if this is not a max_pool_backward operation, this function returns null.
+	 * 
+	 * @return output lop of maxpool operation with same parameters as this hop
+	 * @throws HopsException if error 
+	 * @throws LopsException if error
+	 */
+	private Lop getMaxPoolOutputLop() throws HopsException, LopsException {
+		if(op != ConvOp.MAX_POOLING_BACKWARD)
+			return null;
+		
+		Hop inputImage = getInput().get(0);
+		for(Hop tmpParent : inputImage.getParent()) {
+			if(!(tmpParent instanceof ConvolutionOp))
+				continue;
+			ConvolutionOp parent = (ConvolutionOp) tmpParent;
+			if(parent.getOp() == ConvOp.MAX_POOLING && isPoolingParametersEqualAndKnown(parent._cachedParams, _cachedParams)) {
+				return parent.constructLops();
+			}
+		}
+		return null;
+	}
+	
 	public Lop constructConvolutionLops(ExecType et, ArrayList<Hop> inputs) throws HopsException, LopsException {
 		if(inputs.size() != getNumExpectedInputs()) 
 			throw new HopsException("Incorrect number of inputs for " + op.name());
 		
-		Lop in = null; Lop in2 = null;
-		ArrayList<Hop> inputs1 = inputs;
-		int k = OptimizerUtils.getConstrainedNumThreads(_maxNumThreads);
+		// ---------------------------------------------------------------
+		// Deal with fused operators and construct lhsInputLop/optionalRhsInputLop
+		Lop lhsInputLop = null; Lop optionalRhsInputLop = null;
+		ArrayList<Hop> inputsOfPotentiallyFusedOp = inputs;
 		OperationTypes lopOp = HopsConv2Lops.get(op);
-
+		
 	// RELU_MAX_POOLING and RELU_MAX_POOLING_BACKWARD are extremely useful for the CP backend
 	// by reducing unnecessary sparse-to-dense-to-sparse conversions.
 	// For other backends, these operators are not necessary as they merely save an additional relu operator.
 		if(OptimizerUtils.ALLOW_OPERATOR_FUSION && et == ExecType.CP && op == ConvOp.MAX_POOLING && isInputReLU(inputs.get(0))) {
-			in = inputs.get(0).getInput().get(0).constructLops();
+			lhsInputLop = inputs.get(0).getInput().get(0).constructLops();
 			lopOp = OperationTypes.RELU_MAX_POOLING;
 		}
 		else if(OptimizerUtils.ALLOW_OPERATOR_FUSION && et == ExecType.CP && op == ConvOp.MAX_POOLING_BACKWARD && isInputReLU(inputs.get(0))) {
-			in = inputs.get(0).getInput().get(0).constructLops();
+			lhsInputLop = inputs.get(0).getInput().get(0).constructLops();
 			lopOp = OperationTypes.RELU_MAX_POOLING_BACKWARD;
 		}
 		else if(OptimizerUtils.ALLOW_OPERATOR_FUSION && op == ConvOp.BIAS_ADD && isInputConv2d(inputs.get(0))) {
 			lopOp = OperationTypes.DIRECT_CONV2D_BIAS_ADD;
 			
 			// the first lop is image 
-			in = inputs.get(0).getInput().get(0).constructLops();
+			lhsInputLop = inputs.get(0).getInput().get(0).constructLops();
 			// the second lop is bias
-			in2 = inputs.get(1).constructLops();
+			optionalRhsInputLop = inputs.get(1).constructLops();
 			
 			// Use the inputs from conv2d rather than bias_add
-			inputs1 = inputs.get(0).getInput();
+			inputsOfPotentiallyFusedOp = inputs.get(0).getInput();
 		}
 		else {
-			in = inputs.get(0).constructLops();
+			lhsInputLop = inputs.get(0).constructLops();
 		}
+		// ---------------------------------------------------------------
 		
-//		// TODO: Inserting reblock requires knowing columns apriori
-//		ConvolutionTransform transform1 = new ConvolutionTransform(addReblockIfNecessary(et, lopOp, in), lopOp, getDataType(), getValueType(), et, k);
-//		setReblockedOutputDimension(et, transform1);
-		double cpIntermediateMemEstimate = computeIntermediateMemEstimate(-1, -1, -1 );
+		// ---------------------------------------------------------------
+		// Compute intermediate memory budget that can be passed to GPU operators 
+		// for better CuDNN operator selection at runtime
+		double intermediateMemEstimate = computeIntermediateMemEstimate(-1, -1, -1 );
 		if(et == ExecType.GPU && _dim1 > 0 && _dim2 > 0) {
 			// This enables us to compile more efficient matrix-matrix CuDNN operation instead of 
 			// row-by-row invocation of multiple vector-matrix CuDNN operations.
 			// This is possible as the operations on GPU are single-threaded
 			double optimisticIntermediateMemEstimate = GPUContextPool.initialGPUMemBudget() - getOutputMemEstimate() - inputs.get(0).getOutputMemEstimate();
-			if(in2 != null) {
+			if(optionalRhsInputLop != null) {
 				optimisticIntermediateMemEstimate -= inputs.get(1).getOutputMemEstimate();
 			}
-			cpIntermediateMemEstimate = Math.max(cpIntermediateMemEstimate, optimisticIntermediateMemEstimate);
+			intermediateMemEstimate = Math.max(intermediateMemEstimate, optimisticIntermediateMemEstimate);
 		}
-		ConvolutionTransform transform1 = new ConvolutionTransform(in, lopOp, getDataType(), getValueType(), et, k, cpIntermediateMemEstimate);
-		setOutputDimensions(transform1);
+		// ---------------------------------------------------------------
 		
-		setLineNumbers(transform1);
-		in.addOutput(transform1);
+		// Construct the lop
+		ConvolutionTransform convolutionLop = new ConvolutionTransform(lhsInputLop, lopOp, 
+				getDataType(), getValueType(), et, OptimizerUtils.getConstrainedNumThreads(_maxNumThreads), intermediateMemEstimate);
 		
-		if(in2 != null) {
-			transform1.addInput(in2);
-			in2.addOutput(transform1);
-		}
+		// Propagate the output dimensions and the line number of ConvolutionOp to ConvolutionTransform
+		setOutputDimensions(convolutionLop);
+		setLineNumbers(convolutionLop);
 		
-		// stride1, stride2, padding1, padding2  
-		// input_shape1, input_shape2, input_shape3, input_shape4, 
-		// filter_shape1, filter_shape2, filter_shape3, filter_shape4
-		for( int i=1; i < inputs1.size(); i++ )
-		{
-			Lop ltmp = inputs1.get(i).constructLops();
-			transform1.addInput(ltmp);
-			ltmp.addOutput(transform1);
+		// ---------------------------------------------------------------
+		// Add input/output for parent lops of convolutionLop
+		lhsInputLop.addOutput(convolutionLop);
+		if(optionalRhsInputLop != null) {
+			convolutionLop.addInput(optionalRhsInputLop);
+			optionalRhsInputLop.addOutput(convolutionLop);
+		}
+		for( int i=1; i < inputsOfPotentiallyFusedOp.size(); i++ ) {
+			Lop ltmp = inputsOfPotentiallyFusedOp.get(i).constructLops();
+			convolutionLop.addInput(ltmp);
+			ltmp.addOutput(convolutionLop);
 		}
-		transform1.setLevel(); //force order of added lops
-		return transform1;
+		// Only valid for MAX_POOLING_BACKWARD on GPU
+		Lop optionalMaxPoolOutput = (et == ExecType.GPU) ? getMaxPoolOutputLop() : null; 
+		if(optionalMaxPoolOutput != null) {
+			convolutionLop.addInput(optionalMaxPoolOutput);
+			optionalMaxPoolOutput.addOutput(convolutionLop);
+		}
+		convolutionLop.setLevel(); //force order of added lops
+		// ---------------------------------------------------------------
+		return convolutionLop;
 	}
 
 			
@@ -453,12 +513,10 @@ public class ConvolutionOp extends Hop  implements MultiThreadedHop
 		
 		ExecType REMOTE = OptimizerUtils.isSparkExecutionMode() ? ExecType.SPARK : ExecType.MR;
 		
-		if( _etypeForced != null ) 			
-		{
+		if( _etypeForced != null ) {
 			_etype = _etypeForced;
 		}
-		else 
-		{	
+		else {	
 			if ( OptimizerUtils.isMemoryBasedOptLevel() ) {
 				_etype = findExecTypeByMemEstimate();
 			}
@@ -479,8 +537,9 @@ public class ConvolutionOp extends Hop  implements MultiThreadedHop
 		return _etype;
 	}
 	
-	// Caching parameters speed-ups dynamic recompilation time by avoiding unnecessary computeSizeInformation
+	// Parameters recomputed in refreshSizeInformation and passed across many calls of getDim
 	private ConvolutionParameters _cachedParams = new ConvolutionParameters(-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, _maxNumThreads);
+	
 	// stride1, stride2, padding1, padding2  
 	// input_shape1, input_shape2, input_shape3, input_shape4, 
 	// filter_shape1, filter_shape2, filter_shape3, filter_shape4
@@ -494,16 +553,16 @@ public class ConvolutionOp extends Hop  implements MultiThreadedHop
 			imageHeightHop = getInput().get(8);
 			filterHeightHop = getInput().get(12);
 			_cachedParams.setIfUnknown(
-					getInput().get(6),
-					getInput().get(7), 
-					imageHeightHop, 
-					getInput().get(9), 
-					getInput().get(10), 
-					filterHeightHop, 
-					getInput().get(13), 
-					getInput().get(2), 
-					getInput().get(3), 
-					getInput().get(4), 
+					getInput().get(6),  // N
+					getInput().get(7),  // C
+					imageHeightHop,     // H
+					getInput().get(9),  // W
+					getInput().get(10), // K
+					filterHeightHop,    // R
+					getInput().get(13), // S
+					getInput().get(2),  // stride_h
+					getInput().get(3),  // stride_w
+					getInput().get(4),  // pad+h
 					getInput().get(5), _maxNumThreads);
 		}
 		else {

http://git-wip-us.apache.org/repos/asf/systemml/blob/06d5bb07/src/main/java/org/apache/sysml/runtime/instructions/gpu/ConvolutionGPUInstruction.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/instructions/gpu/ConvolutionGPUInstruction.java b/src/main/java/org/apache/sysml/runtime/instructions/gpu/ConvolutionGPUInstruction.java
index 354ea63..8565b5a 100644
--- a/src/main/java/org/apache/sysml/runtime/instructions/gpu/ConvolutionGPUInstruction.java
+++ b/src/main/java/org/apache/sysml/runtime/instructions/gpu/ConvolutionGPUInstruction.java
@@ -92,8 +92,7 @@ public class ConvolutionGPUInstruction extends GPUInstruction {
 		
 		if( ( opcode.equalsIgnoreCase("conv2d")
 			 || opcode.equalsIgnoreCase("conv2d_backward_filter")
-			 || opcode.equalsIgnoreCase("conv2d_backward_data")
-			 || opcode.equalsIgnoreCase("maxpooling_backward")) ) {
+			 || opcode.equalsIgnoreCase("conv2d_backward_data")) ) {
 			InstructionUtils.checkNumFields(parts, 16);
 			CPOperand in1 = new CPOperand(parts[1]);
 			CPOperand in2 = new CPOperand(parts[2]);
@@ -119,6 +118,39 @@ public class ConvolutionGPUInstruction extends GPUInstruction {
 			return new ConvolutionGPUInstruction(in1, in2, out, opcode, str, stride,
 					padding, input_shape, filter_shape, Double.parseDouble(parts[16]));
 		}
+		else if( opcode.equalsIgnoreCase("maxpooling_backward") ) {
+			boolean withMaxPoolOut = false;
+			if(parts.length == 18) {
+				withMaxPoolOut = true;
+			}
+			else
+				InstructionUtils.checkNumFields(parts, 16);
+			CPOperand in1 = new CPOperand(parts[1]);
+			CPOperand in2 = new CPOperand(parts[2]);
+			CPOperand in3 = withMaxPoolOut ? new CPOperand(parts[15]) : null;
+			CPOperand out = withMaxPoolOut ? new CPOperand(parts[16]) : new CPOperand(parts[15]);
+			double memBudget = withMaxPoolOut ? Double.parseDouble(parts[17]) : Double.parseDouble(parts[16]);
+		
+			ArrayList<CPOperand> stride = new ArrayList<>();
+			ArrayList<CPOperand> padding = new ArrayList<>();
+			ArrayList<CPOperand> input_shape = new ArrayList<>();
+			ArrayList<CPOperand> filter_shape = new ArrayList<>();
+			stride.add(new CPOperand(parts[3]));
+			stride.add(new CPOperand(parts[4]));
+			padding.add(new CPOperand(parts[5]));
+			padding.add(new CPOperand(parts[6]));
+			input_shape.add(new CPOperand(parts[7]));
+			input_shape.add(new CPOperand(parts[8]));
+			input_shape.add(new CPOperand(parts[9]));
+			input_shape.add(new CPOperand(parts[10]));
+			filter_shape.add(new CPOperand(parts[11]));
+			filter_shape.add(new CPOperand(parts[12]));
+			filter_shape.add(new CPOperand(parts[13]));
+			filter_shape.add(new CPOperand(parts[14]));
+
+			return new ConvolutionGPUInstruction(in1, in2, in3, out, opcode, str, stride,
+					padding, input_shape, filter_shape, memBudget);
+		}
 		else if (opcode.equalsIgnoreCase("conv2d_bias_add")) {
 			InstructionUtils.checkNumFields(parts, 17);
 			CPOperand in1 = new CPOperand(parts[1]);
@@ -324,7 +356,7 @@ public class ConvolutionGPUInstruction extends GPUInstruction {
 		else if (instOpcode.equalsIgnoreCase("maxpooling_backward")) {
 			MatrixObject image = getMatrixInputForGPUInstruction(ec, _input1.getName());
 			MatrixObject dout = getMatrixInputForGPUInstruction(ec, _input2.getName());
-			
+			MatrixObject maxPoolOutput = _input3 != null ? getMatrixInputForGPUInstruction(ec, _input3.getName()) : null;
 			if(dout.getNumRows() != N || dout.getNumColumns() != C*P*Q) 
 				throw new DMLRuntimeException("Incorrect dimensions for dout in maxpooling_backward");
 			if(image.getNumRows() != N || image.getNumColumns() != C*H*W) 
@@ -333,7 +365,7 @@ public class ConvolutionGPUInstruction extends GPUInstruction {
 			
 			MatrixObject out = getDenseMatrixOutputForGPUInstruction(ec, _output.getName(), N, C * H * W);
 			
-			LibMatrixCuDNN.maxpoolingBackward(ec.getGPUContext(0), getExtendedOpcode(), image, dout, out, N, C, H, W,
+			LibMatrixCuDNN.maxpoolingBackward(ec.getGPUContext(0), getExtendedOpcode(), image, dout, maxPoolOutput, out, N, C, H, W,
 					K, R, S, pad_h, pad_w, stride_h, stride_w, P, Q, _intermediateMemoryBudget);
 		}
 		else {
@@ -346,7 +378,8 @@ public class ConvolutionGPUInstruction extends GPUInstruction {
 		if ( !instOpcode.equalsIgnoreCase("maxpooling") )
 			ec.releaseMatrixInputForGPUInstruction(_input2.getName());
 
-		if (instOpcode.equalsIgnoreCase("conv2d_bias_add"))
+		if (instOpcode.equalsIgnoreCase("conv2d_bias_add") || 
+			(instOpcode.equalsIgnoreCase("maxpooling_backward") && _input3 != null))
 			ec.releaseMatrixInputForGPUInstruction(_input3.getName());
 
 		ec.releaseMatrixOutputForGPUInstruction(_output.getName());

http://git-wip-us.apache.org/repos/asf/systemml/blob/06d5bb07/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuDNN.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuDNN.java b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuDNN.java
index 7fd766c..e0a6a57 100644
--- a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuDNN.java
+++ b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuDNN.java
@@ -519,6 +519,7 @@ public class LibMatrixCuDNN extends LibMatrixCUDA {
 	 * @param instName the invoking instruction's name for record {@link Statistics}.
 	 * @param image image as matrix object
 	 * @param dout			delta matrix, output of previous layer
+	 * @param maxpoolOutput (optional and can be null) output of maxpool forward function
 	 * @param outputBlock output matrix
 	 * @param N				batch size
 	 * @param C				number of channels
@@ -537,12 +538,14 @@ public class LibMatrixCuDNN extends LibMatrixCUDA {
 	 * @throws DMLRuntimeException if DMLRuntimeException occurs
 	 */
 	public static void maxpoolingBackward(GPUContext gCtx, String instName, MatrixObject image, MatrixObject dout,
-			MatrixObject outputBlock, int N, int C, int H, int W, int K, int R,
+			MatrixObject maxpoolOutput, MatrixObject outputBlock, int N, int C, int H, int W, int K, int R,
 			int S, int pad_h, int pad_w, int stride_h, int stride_w, int P,
 			int Q, double intermediateMemoryBudget) throws DMLRuntimeException {
 		long CHW = C*H*W; long CPQ = C*P*Q;  
 		long NCHW = N*CHW; long NCPQ = N*CPQ; 
 
+		final boolean isMaxPoolOutputProvided = maxpoolOutput != null;
+		
 		if(NCHW < maxNumElementsOfCuDNNTensor && NCPQ < maxNumElementsOfCuDNNTensor) {
 			// Filter and output are accounted as dense in the memory estimation for conv2dBackwardData
 			long overhead = isInSparseFormat(gCtx, image) ? OptimizerUtils.estimateSizeExactSparsity(N, CHW, 1.0) : 0;
@@ -551,19 +554,26 @@ public class LibMatrixCuDNN extends LibMatrixCUDA {
 			if(overhead <= intermediateMemoryBudget) {
 				Pointer x = getDensePointerForCuDNN(gCtx, image, instName);
 				Pointer dy = getDensePointerForCuDNN(gCtx, dout, instName);
-				cudnnMaxpoolingBackward(gCtx, instName, x, dy, dx, N, C, H, W, K, R, S, pad_h, pad_w, stride_h, stride_w, P, Q);
+				Pointer y = isMaxPoolOutputProvided ? getDensePointerForCuDNN(gCtx, maxpoolOutput, instName) : null;
+				cudnnMaxpoolingBackward(gCtx, instName, x, dy, y, dx, N, C, H, W, K, R, S, pad_h, pad_w, stride_h, stride_w, P, Q);
 			}
 			else {
 				LibMatrixCuDNNInputRowFetcher imgFetcher = new LibMatrixCuDNNInputRowFetcher(gCtx, instName, image);
 				LibMatrixCuDNNInputRowFetcher doutFetcher = new LibMatrixCuDNNInputRowFetcher(gCtx, instName, dout);
+				LibMatrixCuDNNInputRowFetcher maxPoolOutFetcher = isMaxPoolOutputProvided ? new LibMatrixCuDNNInputRowFetcher(gCtx, instName, maxpoolOutput) : null;
 				for(int n = 0; n < N; n++) {
-					cudnnMaxpoolingBackward(gCtx, instName, imgFetcher.getNthRow(n), doutFetcher.getNthRow(n), 
+					Pointer x = imgFetcher.getNthRow(n);
+					Pointer dy = doutFetcher.getNthRow(n);
+					Pointer y = isMaxPoolOutputProvided ? maxPoolOutFetcher.getNthRow(n) : null;
+					cudnnMaxpoolingBackward(gCtx, instName, x, dy, y, 
 							dx.withByteOffset(n*CHW*sizeOfDataType), 
 							1, C, H, W, K, R, S, pad_h, pad_w, stride_h, stride_w, P, Q);
 				}
 				// Deallocate temporary array to hold one element of input
 				imgFetcher.close();
 				doutFetcher.close();
+				if(isMaxPoolOutputProvided)
+					maxPoolOutFetcher.close();
 			}
 		}
 		else {
@@ -572,36 +582,33 @@ public class LibMatrixCuDNN extends LibMatrixCUDA {
 	}
 	
 	private static void cudnnMaxpoolingBackward(GPUContext gCtx, String instName, 
-			Pointer x, Pointer dy, Pointer dx, 
+			Pointer x, Pointer dy, Pointer y, Pointer dx, 
 			int N, int C, int H, int W, int K, int R,
 			int S, int pad_h, int pad_w, int stride_h, int stride_w, int P,
 			int Q) throws DMLRuntimeException {
 		if(LOG.isTraceEnabled()) {
 			LOG.trace("GPU : maxpoolingBackward" + ", GPUContext=" + gCtx);
 		}
-		Pointer y = null;
+		
+		boolean isMaxPoolOutputProvided = (y != null);
 
 		try(LibMatrixCuDNNPoolingDescriptors desc = 
 				LibMatrixCuDNNPoolingDescriptors.cudnnMaxpoolingBackwardDescriptors(gCtx, instName, N, C, H, W, K, R, S, 
 						pad_h, pad_w, stride_h, stride_w, P, Q)) {
 			long t1=0, t2=0, t3=0;
-			if (GPUStatistics.DISPLAY_STATISTICS) t1 = System.nanoTime();
-			
-			// Calling PoolForward first, y is one of the inputs for poolBackward
-			// TODO: Remove calling poolForward after necessary changes at language level for poolBackward
-			long numBytes = N*C*P*Q*sizeOfDataType;
-			y = gCtx.allocate(numBytes);
-			
-			if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_CUDNN_INIT, System.nanoTime() - t1);
-			
-			if (GPUStatistics.DISPLAY_STATISTICS) t2 = System.nanoTime();
-			int status = cudnnPoolingForward(getCudnnHandle(gCtx), desc.poolingDesc, one(), desc.xDesc, x, zero(), desc.yDesc, y);
-			if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_MAXPOOLING_FORWARD_LIB, System.nanoTime() - t2);
-
-			if(status != jcuda.jcudnn.cudnnStatus.CUDNN_STATUS_SUCCESS) {
-				throw new DMLRuntimeException("Could not executed cudnnPoolingForward before cudnnPoolingBackward: " + jcuda.jcudnn.cudnnStatus.stringFor(status));
+			int status;
+			if(!isMaxPoolOutputProvided) {
+				if (GPUStatistics.DISPLAY_STATISTICS) t1 = System.nanoTime();
+				long numBytes = N*C*P*Q*sizeOfDataType;
+				y = gCtx.allocate(numBytes);
+				if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_CUDNN_INIT, System.nanoTime() - t1);
+				if (GPUStatistics.DISPLAY_STATISTICS) t2 = System.nanoTime();
+				status = cudnnPoolingForward(getCudnnHandle(gCtx), desc.poolingDesc, one(), desc.xDesc, x, zero(), desc.yDesc, y);
+				if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_MAXPOOLING_FORWARD_LIB, System.nanoTime() - t2);
+				if(status != jcuda.jcudnn.cudnnStatus.CUDNN_STATUS_SUCCESS) {
+					throw new DMLRuntimeException("Could not execute cudnnPoolingForward before cudnnPoolingBackward: " + jcuda.jcudnn.cudnnStatus.stringFor(status));
+				}
 			}
-
 			if (GPUStatistics.DISPLAY_STATISTICS) t3 = System.nanoTime();
 			status = cudnnPoolingBackward(getCudnnHandle(gCtx), desc.poolingDesc, one(), desc.yDesc, y, desc.dyDesc, dy, desc.xDesc, x, zero(), desc.dxDesc, dx);
 			if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_MAXPOOLING_BACKWARD_LIB, System.nanoTime() - t3);
@@ -615,7 +622,7 @@ public class LibMatrixCuDNN extends LibMatrixCUDA {
 		finally {
 			long t4=0;
 			if (GPUStatistics.DISPLAY_STATISTICS) t4 = System.nanoTime();
-			if(y != null)
+			if(!isMaxPoolOutputProvided)
 				gCtx.cudaFreeHelper(instName, y);
 			if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_CUDNN_CLEANUP, System.nanoTime() - t4);
 		}

http://git-wip-us.apache.org/repos/asf/systemml/blob/06d5bb07/src/test/java/org/apache/sysml/test/gpu/NeuralNetworkOpTests.java
----------------------------------------------------------------------
diff --git a/src/test/java/org/apache/sysml/test/gpu/NeuralNetworkOpTests.java b/src/test/java/org/apache/sysml/test/gpu/NeuralNetworkOpTests.java
index aba0cae..c57e997 100644
--- a/src/test/java/org/apache/sysml/test/gpu/NeuralNetworkOpTests.java
+++ b/src/test/java/org/apache/sysml/test/gpu/NeuralNetworkOpTests.java
@@ -579,5 +579,87 @@ public class NeuralNetworkOpTests extends GPUTests {
 			}
 		}
 	}
+	
+	
+	@Test
+	@Ignore
+	public void testMaxPoolBackwardWithMaxpoolOut() {
+		String scriptStr = "tmp = max_pool(image, padding=[padH, padW], stride=[strideH, strideW], input_shape=[N,C,H,W], pool_size=[R,S]); print(sum(tmp)); O = max_pool_backward(image, dout, padding=[padH, padW], stride=[strideH, strideW], input_shape=[N,C,H,W], pool_size=[R,S])";
+
+		for (long N : Nlst) {
+			for (long C : Clst) {
+				for (long H : Hlst) {
+					long W = H;
+					for (long R : Rlst) {
+						long S = R;
+						for (long strideH : strideLst) {
+							long strideW = strideH;
+							for (long padH : padLst) {
+								long padW = padH;
+								for (double sparsity : sparsitylst) {
+
+									// pool is smaller than image + padding
+									if (R > (H + padH) || S > (W + padW))
+										continue;
+
+									// Make sure ops fit in GPU memory and within constraints of cudnn
+									long imageSize = N * C * H * W * 8l;
+									if (imageSize > MAX_OP_SIZE)  // image size
+										continue;
+									long poolSize = R * S * 8l;
+									if (poolSize > MAX_OP_SIZE)  // filter size
+										continue;
+
+									int P = (int) ConvolutionUtils.getP(H, R, strideH, padH);
+									int Q = (int) ConvolutionUtils.getQ(W, S, strideW, padW);
+
+									long doutSize = N * C * P * Q * 8l;
+									if (doutSize > MAX_OP_SIZE) // dout/output size
+										continue;
+
+									double imageSizeInMB = imageSize / (1024.0 * 1024.0);
+									double poolSizeInMB = poolSize / (1024.0 * 1024.0);
+									double doutSizeInMB = doutSize / (1024.0 * 1024.0);
+									System.out
+									.format("max_pool_backward, image[%d,%d,%d,%d](%.1fMB), pool[%d,%d](%.1f), dout[%d,%d,%d,%d](%.1fMB), stride[%d,%d], padding[%d,%d]",
+											N, C, H, W, imageSizeInMB, R, S, poolSizeInMB, N, C,
+											P, Q, doutSizeInMB, strideH, strideW, padH, padW);
+
+									Matrix image = generateInputMatrix(spark, (int) N,
+											(int) (C * H * W), -127.0, 127, sparsity, seed, true);
+									Matrix dout = generateInputMatrix(spark, (int) N, (int) (C * P * Q),
+											-127.0, 127, sparsity, seed, true);
+									HashMap<String, Object> inputs = new HashMap<>();
+									inputs.put("N", N);
+									inputs.put("C", C);
+									inputs.put("H", H);
+									inputs.put("W", W);
+									inputs.put("R", R);
+									inputs.put("S", S);
+									inputs.put("strideH", strideH);
+									inputs.put("strideW", strideW);
+									inputs.put("padH", padH);
+									inputs.put("padW", padW);
+									inputs.put("image", image);
+									inputs.put("dout", dout);
+									List<Object> outCPU = runOnCPU(spark, scriptStr, inputs,
+											Arrays.asList("O"));
+									List<Object> outGPU = runOnGPU(spark, scriptStr, inputs,
+											Arrays.asList("O"));
+									assertHeavyHitterPresent("gpu_maxpooling_backward");
+									assertEqualObjects(outCPU.get(0), outGPU.get(0));
+									clearGPUMemory();
+								}
+							}
+						}
+					}
+
+
+
+
+				}
+			}
+		}
+	}
 
 }


[42/50] [abbrv] systemml git commit: [SYSTEMML-540] Added a rewrite to support a common tensor operation (sum over channels)

Posted by re...@apache.org.
[SYSTEMML-540] Added a rewrite to support a common tensor operation (sum over channels)

- Added a rewrite to convert out = rowSums(matrix(colSums(A), rows=C, cols=HW)) to out = channel_sums(A) when nrow(A) > 1 and exectype is CP or GPU.
- This avoids unnecessary intermediates and a GPU-CP-GPU transfer (for the
  reshape). It saves roughly 150 seconds on the sentence CNN for 200 epochs.
- When we move to a newer CuDNN version, we can replace the custom channel_sums kernel with the possibly more optimized CuDNN reduce-tensor kernel.
- Added the corresponding CPU and GPU tests.
- Updated T_MAX(val) to MAX(). Interestingly, nvcc was already smart enough
  to drop the unused parameter automatically, so the ptx remained the same
  after the change.

Closes #693.
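
For context, a minimal DML sketch of the pattern this rewrite targets (the sizes below are illustrative and not taken from the patch; the fused channel_sums operator is generated internally at the HOP/LOP level rather than being called from DML):

  # N images, C channels, each channel of spatial size H*W (illustrative sizes)
  N = 64; C = 3; H = 32; W = 32;
  A = rand(rows=N, cols=C*H*W)
  # per-channel sum over all images and spatial positions -> C x 1 vector
  S = rowSums(matrix(colSums(A), rows=C, cols=H*W))
  print(sum(S))

With this patch, when the script runs in CP or GPU mode and C is known at compile time, the colSums/reshape/rowSums chain above compiles to a single channel_sums instruction instead of materializing the colSums intermediate and reshaping it.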


Project: http://git-wip-us.apache.org/repos/asf/systemml/repo
Commit: http://git-wip-us.apache.org/repos/asf/systemml/commit/d916ba5b
Tree: http://git-wip-us.apache.org/repos/asf/systemml/tree/d916ba5b
Diff: http://git-wip-us.apache.org/repos/asf/systemml/diff/d916ba5b

Branch: refs/heads/master
Commit: d916ba5bd8ceec591a04f4d16c6d24f3985e3e4f
Parents: 2896f33
Author: Niketan Pansare <np...@us.ibm.com>
Authored: Mon Oct 30 10:32:53 2017 -0700
Committer: Niketan Pansare <np...@us.ibm.com>
Committed: Mon Oct 30 10:32:53 2017 -0700

----------------------------------------------------------------------
 src/main/cpp/kernels/Makefile                   |   2 +-
 src/main/cpp/kernels/SystemML.cu                | 308 ++++++++++---------
 .../java/org/apache/sysml/hops/AggUnaryOp.java  |  95 ++++--
 .../apache/sysml/lops/ConvolutionTransform.java |  42 ++-
 .../instructions/CPInstructionParser.java       |   1 +
 .../instructions/GPUInstructionParser.java      |   1 +
 .../cp/ConvolutionCPInstruction.java            |  86 ++++++
 .../gpu/ConvolutionGPUInstruction.java          |  47 +++
 .../spark/QuantilePickSPInstruction.java        |   2 +-
 .../runtime/matrix/data/LibMatrixCUDA.java      |  31 ++
 .../runtime/matrix/data/LibMatrixCuDNN.java     |   4 +-
 .../sysml/test/gpu/AggregateUnaryOpTests.java   |  31 ++
 .../apache/sysml/test/gpu/UnaryOpTestsBase.java |   8 +-
 .../functions/tensor/ChannelSumTest.java        | 146 +++++++++
 .../scripts/functions/tensor/ChannelSumTest.R   |  39 +++
 .../scripts/functions/tensor/ChannelSumTest.dml |  35 +++
 16 files changed, 690 insertions(+), 188 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/systemml/blob/d916ba5b/src/main/cpp/kernels/Makefile
----------------------------------------------------------------------
diff --git a/src/main/cpp/kernels/Makefile b/src/main/cpp/kernels/Makefile
index 5feae69..ec10317 100644
--- a/src/main/cpp/kernels/Makefile
+++ b/src/main/cpp/kernels/Makefile
@@ -16,7 +16,7 @@
 # under the License.
 
 NVCC=nvcc
-CUDAFLAGS= -ptx -c -arch=sm_30 
+CUDAFLAGS= -ptx -c -arch=sm_30 --std c++11
 
 # Use these flags for precise math
 #CUDAFLAGS= -ptx -c -arch=sm_30 -ftz=false -prec-div=true -prec-sqrt=true

http://git-wip-us.apache.org/repos/asf/systemml/blob/d916ba5b/src/main/cpp/kernels/SystemML.cu
----------------------------------------------------------------------
diff --git a/src/main/cpp/kernels/SystemML.cu b/src/main/cpp/kernels/SystemML.cu
index d176f8f..ade2dd1 100644
--- a/src/main/cpp/kernels/SystemML.cu
+++ b/src/main/cpp/kernels/SystemML.cu
@@ -20,7 +20,7 @@
 /**********************************
 When updating a kernel or adding a new one,
 please compile the ptx file and commit it:
-nvcc -ptx -arch=sm_30 SystemML.cu
+nvcc -ptx -arch=sm_30 --std c++11 SystemML.cu
 ***********************************/
 
 #include <cfloat>
@@ -29,7 +29,8 @@ nvcc -ptx -arch=sm_30 SystemML.cu
 extern "C" __global__ void double2float_f(double *A, float *ret, int N) {
   int tid = blockIdx.x * blockDim.x + threadIdx.x;
   if (tid < N) {
-  	// TODO: Use __double2float_rd or __double2float_rn  or __double2float_ru or __double2float_rz after 
+    // TODO: Use __double2float_rd or __double2float_rn  or __double2float_ru or
+    // __double2float_rz after
     ret[tid] = (float)A[tid];
   }
 }
@@ -84,15 +85,14 @@ __device__ void slice_sparse_dense_row(T *inVal, int *inRowPtr, int *colInd,
      *
      * int size = inRowPtr[rowIndex+1] - inRowPtr[rowIndex];
      * double numThreads = (double)min(size, MAX_NUM_THREADS_CHILD_KERNEL);
-     * slice_sparse_dense_row_helper<<< ceil(numThreads/
-*MAX_NUM_THREADS_CHILD_KERNEL), MAX_NUM_THREADS_CHILD_KERNEL>>>(inVal, inRowPtr,
-*colInd, ret,
-*			rl, ru, cl, cu, retClen, inRowPtr[rowIndex],
-*inRowPtr[rowIndex+1], index);
-*
-* Two-step compilation and linking process in JCudaKernels's constructor:
-* cuLinkAddFile(linkState, CUjitInputType.CU_JIT_INPUT_LIBRARY,
-*"/usr/local/cuda/lib64/libcudadevrt.a", jitOptions);
+     * slice_sparse_dense_row_helper
+     * <<< ceil(numThreads/MAX_NUM_THREADS_CHILD_KERNEL), MAX_NUM_THREADS_CHILD_KERNEL>>>
+     * (inVal, inRowPtr, colInd, ret, rl, ru, cl, cu, retClen, inRowPtr[rowIndex],
+     *	inRowPtr[rowIndex+1], index);
+     *
+     * Two-step compilation and linking process in JCudaKernels's constructor:
+     * cuLinkAddFile(linkState, CUjitInputType.CU_JIT_INPUT_LIBRARY,
+     * "/usr/local/cuda/lib64/libcudadevrt.a", jitOptions);
      */
     // Iterate over elements of the row 'rowIndex'.
     for (int i = inRowPtr[rowIndex]; i < inRowPtr[rowIndex + 1]; i++) {
@@ -104,17 +104,18 @@ __device__ void slice_sparse_dense_row(T *inVal, int *inRowPtr, int *colInd,
   }
 }
 
-extern "C" __global__ void slice_sparse_dense_row_d(double *inVal, int *inRowPtr,
-                                                   int *colInd, double *ret,
-                                                   int rl, int ru, int cl,
-                                                   int cu, int retClen) {
+extern "C" __global__ void slice_sparse_dense_row_d(double *inVal,
+                                                    int *inRowPtr, int *colInd,
+                                                    double *ret, int rl, int ru,
+                                                    int cl, int cu,
+                                                    int retClen) {
   slice_sparse_dense_row(inVal, inRowPtr, colInd, ret, rl, ru, cl, cu, retClen);
 }
 
 extern "C" __global__ void slice_sparse_dense_row_f(float *inVal, int *inRowPtr,
-                                                   int *colInd, float *ret,
-                                                   int rl, int ru, int cl,
-                                                   int cu, int retClen) {
+                                                    int *colInd, float *ret,
+                                                    int rl, int ru, int cl,
+                                                    int cu, int retClen) {
   slice_sparse_dense_row(inVal, inRowPtr, colInd, ret, rl, ru, cl, cu, retClen);
 }
 
@@ -153,17 +154,18 @@ __device__ void slice_sparse_dense_nnz(T *inVal, int *inRowPtr, int *colInd,
   }
 }
 
-extern "C" __global__ void slice_sparse_dense_nnz_d(double *inVal, int *inRowPtr,
-                                                   int *colInd, double *ret,
-                                                   int rl, int ru, int cl,
-                                                   int cu, int retClen) {
+extern "C" __global__ void slice_sparse_dense_nnz_d(double *inVal,
+                                                    int *inRowPtr, int *colInd,
+                                                    double *ret, int rl, int ru,
+                                                    int cl, int cu,
+                                                    int retClen) {
   slice_sparse_dense_nnz(inVal, inRowPtr, colInd, ret, rl, ru, cl, cu, retClen);
 }
 
 extern "C" __global__ void slice_sparse_dense_nnz_f(float *inVal, int *inRowPtr,
-                                                   int *colInd, float *ret,
-                                                   int rl, int ru, int cl,
-                                                   int cu, int retClen) {
+                                                    int *colInd, float *ret,
+                                                    int rl, int ru, int cl,
+                                                    int cu, int retClen) {
   slice_sparse_dense_nnz(inVal, inRowPtr, colInd, ret, rl, ru, cl, cu, retClen);
 }
 
@@ -194,16 +196,16 @@ __device__ void slice_dense_dense(T *in, T *ret, int rl, int ru, int cl, int cu,
 }
 
 extern "C" __global__ void slice_dense_dense_d(double *in, double *ret, int rl,
-                                              int ru, int cl, int cu,
-                                              int inClen, int retRlen,
-                                              int retClen) {
+                                               int ru, int cl, int cu,
+                                               int inClen, int retRlen,
+                                               int retClen) {
   slice_dense_dense(in, ret, rl, ru, cl, cu, inClen, retRlen, retClen);
 }
 
 extern "C" __global__ void slice_dense_dense_f(float *in, float *ret, int rl,
-                                              int ru, int cl, int cu,
-                                              int inClen, int retRlen,
-                                              int retClen) {
+                                               int ru, int cl, int cu,
+                                               int inClen, int retRlen,
+                                               int retClen) {
   slice_dense_dense(in, ret, rl, ru, cl, cu, inClen, retRlen, retClen);
 }
 
@@ -236,15 +238,15 @@ extern "C" __global__ void copy_u2l_dense_f(float *ret, int dim, int N) {
 
 // Use this method in templates to fetch the maximum value for a given datatype
 template <typename T>
-__forceinline__ __device__ T T_MAX(T x) {
-  return (T)DBL_MAX;
+__forceinline__ __device__ T MAX() {
+  return T();
 }
 template <>
-__forceinline__ __device__ float T_MAX(float x) {
+__forceinline__ __device__ float MAX <float>() {
   return FLT_MAX;
 }
 template <>
-__forceinline__ __device__ double T_MAX(double x) {
+__forceinline__ __device__ double MAX <double>() {
   return DBL_MAX;
 }
 
@@ -311,7 +313,7 @@ __forceinline__ __device__ T binaryOp(T x, T y, int op) {
       }
     }
     default:
-      return T_MAX(x);
+      return MAX<T>();
   }
 }
 
@@ -342,7 +344,8 @@ extern "C" __global__ void relu_f(float *A, float *ret, int rlen, int clen) {
 }
 
 /**
- * This method computes the backpropagation errors for previous layer of relu operation
+ * This method computes the backpropagation errors for previous layer of relu
+ * operation
  *
  * @param X input activation array allocated on the GPU
  * @param dout errors from previous layer
@@ -361,12 +364,12 @@ __device__ void relu_backward(T *X, T *dout, T *ret, int rlen, int clen) {
 }
 
 extern "C" __global__ void relu_backward_d(double *X, double *dout, double *ret,
-                                          int rlen, int clen) {
+                                           int rlen, int clen) {
   relu_backward(X, dout, ret, rlen, clen);
 }
 
 extern "C" __global__ void relu_backward_f(float *X, float *dout, float *ret,
-                                          int rlen, int clen) {
+                                           int rlen, int clen) {
   relu_backward(X, dout, ret, rlen, clen);
 }
 
@@ -389,12 +392,12 @@ __device__ void inplace_add(T *input, T *ret, int rlen, int clen) {
 }
 
 extern "C" __global__ void inplace_add_d(double *input, double *ret, int rlen,
-                                        int clen) {
+                                         int clen) {
   inplace_add(input, ret, rlen, clen);
 }
 
 extern "C" __global__ void inplace_add_f(float *input, float *ret, int rlen,
-                                        int clen) {
+                                         int clen) {
   inplace_add(input, ret, rlen, clen);
 }
 
@@ -416,12 +419,12 @@ __device__ void bias_add(T *input, T *bias, T *ret, int rlen, int clen,
 }
 
 extern "C" __global__ void bias_add_d(double *input, double *bias, double *ret,
-                                     int rlen, int clen, int PQ) {
+                                      int rlen, int clen, int PQ) {
   bias_add(input, bias, ret, rlen, clen, PQ);
 }
 
 extern "C" __global__ void bias_add_f(float *input, float *bias, float *ret,
-                                     int rlen, int clen, int PQ) {
+                                      int rlen, int clen, int PQ) {
   bias_add(input, bias, ret, rlen, clen, PQ);
 }
 
@@ -443,16 +446,16 @@ __device__ void daxpy_matrix_vector(T *A, T *B, double alpha, T *ret, int rlenA,
 }
 
 extern "C" __global__ void daxpy_matrix_vector_d(double *A, double *B,
-                                                double alpha, double *ret,
-                                                int rlenA, int clenA, int rlenB,
-                                                int clenB) {
+                                                 double alpha, double *ret,
+                                                 int rlenA, int clenA,
+                                                 int rlenB, int clenB) {
   daxpy_matrix_vector(A, B, alpha, ret, rlenA, clenA, rlenB, clenB);
 }
 
 extern "C" __global__ void daxpy_matrix_vector_f(float *A, float *B,
-                                                double alpha, float *ret,
-                                                int rlenA, int clenA, int rlenB,
-                                                int clenB) {
+                                                 double alpha, float *ret,
+                                                 int rlenA, int clenA,
+                                                 int rlenB, int clenB) {
   daxpy_matrix_vector(A, B, alpha, ret, rlenA, clenA, rlenB, clenB);
 }
 
@@ -471,13 +474,14 @@ __device__ void bias_multiply(T *input, T *bias, T *ret, int rlen, int clen,
 }
 
 extern "C" __global__ void bias_multiply_d(double *input, double *bias,
-                                          double *ret, int rlen, int clen,
-                                          int PQ) {
+                                           double *ret, int rlen, int clen,
+                                           int PQ) {
   bias_multiply(input, bias, ret, rlen, clen, PQ);
 }
 
-extern "C" __global__ void bias_multiply_f(float *input, float *bias, float *ret,
-                                          int rlen, int clen, int PQ) {
+extern "C" __global__ void bias_multiply_f(float *input, float *bias,
+                                           float *ret, int rlen, int clen,
+                                           int PQ) {
   bias_multiply(input, bias, ret, rlen, clen, PQ);
 }
 
@@ -563,14 +567,14 @@ __device__ void matrix_scalar_op(T *A, T scalar, T *C, int size, int op,
 }
 
 extern "C" __global__ void matrix_scalar_op_d(double *A, double scalar,
-                                             double *C, int size, int op,
-                                             int isLeftScalar) {
+                                              double *C, int size, int op,
+                                              int isLeftScalar) {
   matrix_scalar_op(A, scalar, C, size, op, isLeftScalar);
 }
 
 extern "C" __global__ void matrix_scalar_op_f(float *A, double scalar, float *C,
-                                             int size, int op,
-                                             int isLeftScalar) {
+                                              int size, int op,
+                                              int isLeftScalar) {
   matrix_scalar_op(A, (float)scalar, C, size, op, isLeftScalar);
 }
 
@@ -635,12 +639,12 @@ __device__ void cbind(T *A, T *B, T *C, int rowsA, int colsA, int rowsB,
 }
 
 extern "C" __global__ void cbind_d(double *A, double *B, double *C, int rowsA,
-                                  int colsA, int rowsB, int colsB) {
+                                   int colsA, int rowsB, int colsB) {
   cbind(A, B, C, rowsA, colsA, rowsB, colsB);
 }
 
 extern "C" __global__ void cbind_f(float *A, float *B, float *C, int rowsA,
-                                  int colsA, int rowsB, int colsB) {
+                                   int colsA, int rowsB, int colsB) {
   cbind(A, B, C, rowsA, colsA, rowsB, colsB);
 }
 
@@ -684,12 +688,12 @@ __device__ void rbind(T *A, T *B, T *C, int rowsA, int colsA, int rowsB,
 }
 
 extern "C" __global__ void rbind_d(double *A, double *B, double *C, int rowsA,
-                                  int colsA, int rowsB, int colsB) {
+                                   int colsA, int rowsB, int colsB) {
   rbind(A, B, C, rowsA, colsA, rowsB, colsB);
 }
 
 extern "C" __global__ void rbind_f(float *A, float *B, float *C, int rowsA,
-                                  int colsA, int rowsB, int colsB) {
+                                   int colsA, int rowsB, int colsB) {
   rbind(A, B, C, rowsA, colsA, rowsB, colsB);
 }
 
@@ -828,15 +832,15 @@ template <typename ReductionOp, typename AssignmentOp, typename T>
 __device__ void reduce_row(
     T *g_idata,  ///< input data stored in device memory (of size rows*cols)
     T *g_odata,  ///< output/temporary array store in device memory (of size
-                 ///rows*cols)
+    /// rows*cols)
     unsigned int rows,  ///< rows in input and temporary/output arrays
     unsigned int cols,  ///< columns in input and temporary/output arrays
     ReductionOp
         reduction_op,  ///< Reduction operation to perform (functor object)
     AssignmentOp assignment_op,  ///< Operation to perform before assigning this
-                                 ///to its final location in global memory for
-                                 ///each row
-    T initialValue) {            ///< initial value for the reduction variable
+    /// to its final location in global memory for
+    /// each row
+    T initialValue) {  ///< initial value for the reduction variable
   // extern __shared__ T sdata[];
   extern __shared__ __align__(sizeof(T)) unsigned char my_sdata[];
   T *sdata = reinterpret_cast<T *>(my_sdata);
@@ -935,15 +939,15 @@ template <typename ReductionOp, typename AssignmentOp, typename T>
 __device__ void reduce_col(
     T *g_idata,  ///< input data stored in device memory (of size rows*cols)
     T *g_odata,  ///< output/temporary array store in device memory (of size
-                 ///rows*cols)
+    /// rows*cols)
     unsigned int rows,  ///< rows in input and temporary/output arrays
     unsigned int cols,  ///< columns in input and temporary/output arrays
     ReductionOp
         reduction_op,  ///< Reduction operation to perform (functor object)
     AssignmentOp assignment_op,  ///< Operation to perform before assigning this
-                                 ///to its final location in global memory for
-                                 ///each column
-    T initialValue)              ///< initial value for the reduction variable
+    /// to its final location in global memory for
+    /// each column
+    T initialValue)  ///< initial value for the reduction variable
 {
   unsigned int global_tid = blockIdx.x * blockDim.x + threadIdx.x;
   if (global_tid >= cols) {
@@ -990,12 +994,12 @@ __device__ void reduce_sum(T *g_idata, T *g_odata, unsigned int n) {
 }
 
 extern "C" __global__ void reduce_sum_d(double *g_idata, double *g_odata,
-                                       unsigned int n) {
+                                        unsigned int n) {
   reduce_sum(g_idata, g_odata, n);
 }
 
 extern "C" __global__ void reduce_sum_f(float *g_idata, float *g_odata,
-                                       unsigned int n) {
+                                        unsigned int n) {
   reduce_sum(g_idata, g_odata, n);
 }
 
@@ -1016,14 +1020,14 @@ __device__ void reduce_row_sum(T *g_idata, T *g_odata, unsigned int rows,
 }
 
 extern "C" __global__ void reduce_row_sum_d(double *g_idata, double *g_odata,
-                                           unsigned int rows,
-                                           unsigned int cols) {
+                                            unsigned int rows,
+                                            unsigned int cols) {
   reduce_row_sum(g_idata, g_odata, rows, cols);
 }
 
 extern "C" __global__ void reduce_row_sum_f(float *g_idata, float *g_odata,
-                                           unsigned int rows,
-                                           unsigned int cols) {
+                                            unsigned int rows,
+                                            unsigned int cols) {
   reduce_row_sum(g_idata, g_odata, rows, cols);
 }
 
@@ -1044,14 +1048,14 @@ __device__ void reduce_col_sum(T *g_idata, T *g_odata, unsigned int rows,
 }
 
 extern "C" __global__ void reduce_col_sum_d(double *g_idata, double *g_odata,
-                                           unsigned int rows,
-                                           unsigned int cols) {
+                                            unsigned int rows,
+                                            unsigned int cols) {
   reduce_col_sum(g_idata, g_odata, rows, cols);
 }
 
 extern "C" __global__ void reduce_col_sum_f(float *g_idata, float *g_odata,
-                                           unsigned int rows,
-                                           unsigned int cols) {
+                                            unsigned int rows,
+                                            unsigned int cols) {
   reduce_col_sum(g_idata, g_odata, rows, cols);
 }
 
@@ -1063,12 +1067,13 @@ struct MaxOp {
   __device__ __forceinline__ T operator()(T a, T b) const { return fmax(a, b); }
 };
 
-template<>
+template <>
 struct MaxOp<float> {
-  __device__ __forceinline__ float operator()(float a, float b) const { return fmaxf(a, b); }
+  __device__ __forceinline__ float operator()(float a, float b) const {
+    return fmaxf(a, b);
+  }
 };
 
-
 /**
  * Do a max over all elements of an array/matrix
  * @param g_idata   input data stored in device memory (of size n)
@@ -1078,16 +1083,16 @@ struct MaxOp<float> {
 template <typename T>
 __device__ void reduce_max(T *g_idata, T *g_odata, unsigned int n) {
   MaxOp<T> op;
-  reduce<MaxOp<T>, T>(g_idata, g_odata, n, op, -T_MAX(g_idata[0]));
+  reduce<MaxOp<T>, T>(g_idata, g_odata, n, op, -MAX<T>());
 }
 
 extern "C" __global__ void reduce_max_d(double *g_idata, double *g_odata,
-                                       unsigned int n) {
+                                        unsigned int n) {
   reduce_max(g_idata, g_odata, n);
 }
 
 extern "C" __global__ void reduce_max_f(float *g_idata, float *g_odata,
-                                       unsigned int n) {
+                                        unsigned int n) {
   reduce_max(g_idata, g_odata, n);
 }
 
@@ -1104,18 +1109,18 @@ __device__ void reduce_row_max(T *g_idata, T *g_odata, unsigned int rows,
   MaxOp<T> op;
   IdentityOp<T> aop;
   reduce_row<MaxOp<T>, IdentityOp<T>, T>(g_idata, g_odata, rows, cols, op, aop,
-                                         -T_MAX(g_idata[0]));
+                                         -MAX<T>());
 }
 
 extern "C" __global__ void reduce_row_max_d(double *g_idata, double *g_odata,
-                                           unsigned int rows,
-                                           unsigned int cols) {
+                                            unsigned int rows,
+                                            unsigned int cols) {
   reduce_row_max(g_idata, g_odata, rows, cols);
 }
 
 extern "C" __global__ void reduce_row_max_f(float *g_idata, float *g_odata,
-                                           unsigned int rows,
-                                           unsigned int cols) {
+                                            unsigned int rows,
+                                            unsigned int cols) {
   reduce_row_max(g_idata, g_odata, rows, cols);
 }
 
@@ -1132,18 +1137,18 @@ __device__ void reduce_col_max(T *g_idata, T *g_odata, unsigned int rows,
   MaxOp<T> op;
   IdentityOp<T> aop;
   reduce_col<MaxOp<T>, IdentityOp<T>, T>(g_idata, g_odata, rows, cols, op, aop,
-                                         (T)-T_MAX(g_idata[0]));
+                                         -MAX<T>());
 }
 
 extern "C" __global__ void reduce_col_max_d(double *g_idata, double *g_odata,
-                                           unsigned int rows,
-                                           unsigned int cols) {
+                                            unsigned int rows,
+                                            unsigned int cols) {
   reduce_col_max(g_idata, g_odata, rows, cols);
 }
 
 extern "C" __global__ void reduce_col_max_f(float *g_idata, float *g_odata,
-                                           unsigned int rows,
-                                           unsigned int cols) {
+                                            unsigned int rows,
+                                            unsigned int cols) {
   reduce_col_max(g_idata, g_odata, rows, cols);
 }
 
@@ -1164,16 +1169,16 @@ struct MinOp {
 template <typename T>
 __device__ void reduce_min(T *g_idata, T *g_odata, unsigned int n) {
   MinOp<T> op;
-  reduce<MinOp<T>, T>(g_idata, g_odata, n, op, T_MAX(g_idata[0]));
+  reduce<MinOp<T>, T>(g_idata, g_odata, n, op, MAX<T>());
 }
 
 extern "C" __global__ void reduce_min_d(double *g_idata, double *g_odata,
-                                       unsigned int n) {
+                                        unsigned int n) {
   reduce_min(g_idata, g_odata, n);
 }
 
 extern "C" __global__ void reduce_min_f(float *g_idata, float *g_odata,
-                                       unsigned int n) {
+                                        unsigned int n) {
   reduce_min(g_idata, g_odata, n);
 }
 
@@ -1190,18 +1195,18 @@ __device__ void reduce_row_min(T *g_idata, T *g_odata, unsigned int rows,
   MinOp<T> op;
   IdentityOp<T> aop;
   reduce_row<MinOp<T>, IdentityOp<T>, T>(g_idata, g_odata, rows, cols, op, aop,
-                                         T_MAX(g_idata[0]));
+                                         MAX<T>());
 }
 
 extern "C" __global__ void reduce_row_min_d(double *g_idata, double *g_odata,
-                                           unsigned int rows,
-                                           unsigned int cols) {
+                                            unsigned int rows,
+                                            unsigned int cols) {
   reduce_row_min(g_idata, g_odata, rows, cols);
 }
 
 extern "C" __global__ void reduce_row_min_f(float *g_idata, float *g_odata,
-                                           unsigned int rows,
-                                           unsigned int cols) {
+                                            unsigned int rows,
+                                            unsigned int cols) {
   reduce_row_min(g_idata, g_odata, rows, cols);
 }
 
@@ -1218,18 +1223,18 @@ __device__ void reduce_col_min(T *g_idata, T *g_odata, unsigned int rows,
   MinOp<T> op;
   IdentityOp<T> aop;
   reduce_col<MinOp<T>, IdentityOp<T>, T>(g_idata, g_odata, rows, cols, op, aop,
-                                         T_MAX(g_idata[0]));
+                                         MAX<T>());
 }
 
 extern "C" __global__ void reduce_col_min_d(double *g_idata, double *g_odata,
-                                           unsigned int rows,
-                                           unsigned int cols) {
+                                            unsigned int rows,
+                                            unsigned int cols) {
   reduce_col_min(g_idata, g_odata, rows, cols);
 }
 
 extern "C" __global__ void reduce_col_min_f(float *g_idata, float *g_odata,
-                                           unsigned int rows,
-                                           unsigned int cols) {
+                                            unsigned int rows,
+                                            unsigned int cols) {
   reduce_col_min(g_idata, g_odata, rows, cols);
 }
 
@@ -1254,12 +1259,12 @@ __device__ void reduce_prod(T *g_idata, T *g_odata, unsigned int n) {
 }
 
 extern "C" __global__ void reduce_prod_d(double *g_idata, double *g_odata,
-                                        unsigned int n) {
+                                         unsigned int n) {
   reduce_prod(g_idata, g_odata, n);
 }
 
 extern "C" __global__ void reduce_prod_f(float *g_idata, float *g_odata,
-                                        unsigned int n) {
+                                         unsigned int n) {
   reduce_prod(g_idata, g_odata, n);
 }
 
@@ -1293,14 +1298,14 @@ __device__ void reduce_row_mean(T *g_idata, T *g_odata, unsigned int rows,
 }
 
 extern "C" __global__ void reduce_row_mean_d(double *g_idata, double *g_odata,
-                                            unsigned int rows,
-                                            unsigned int cols) {
+                                             unsigned int rows,
+                                             unsigned int cols) {
   reduce_row_mean(g_idata, g_odata, rows, cols);
 }
 
 extern "C" __global__ void reduce_row_mean_f(float *g_idata, float *g_odata,
-                                            unsigned int rows,
-                                            unsigned int cols) {
+                                             unsigned int rows,
+                                             unsigned int cols) {
   reduce_row_mean(g_idata, g_odata, rows, cols);
 }
 
@@ -1321,14 +1326,14 @@ __device__ void reduce_col_mean(T *g_idata, T *g_odata, unsigned int rows,
 }
 
 extern "C" __global__ void reduce_col_mean_d(double *g_idata, double *g_odata,
-                                            unsigned int rows,
-                                            unsigned int cols) {
+                                             unsigned int rows,
+                                             unsigned int cols) {
   reduce_col_mean(g_idata, g_odata, rows, cols);
 }
 
 extern "C" __global__ void reduce_col_mean_f(float *g_idata, float *g_odata,
-                                            unsigned int rows,
-                                            unsigned int cols) {
+                                             unsigned int rows,
+                                             unsigned int cols) {
   reduce_col_mean(g_idata, g_odata, rows, cols);
 }
 
@@ -1347,7 +1352,7 @@ __device__ void matrix_exp(T *A, T *C, unsigned int size) {
 }
 
 extern "C" __global__ void matrix_exp_d(double *A, double *C,
-                                       unsigned int size) {
+                                        unsigned int size) {
   matrix_exp(A, C, size);
 }
 
@@ -1370,11 +1375,12 @@ __device__ void matrix_sqrt(T *A, T *C, unsigned int size) {
 }
 
 extern "C" __global__ void matrix_sqrt_d(double *A, double *C,
-                                        unsigned int size) {
+                                         unsigned int size) {
   matrix_sqrt(A, C, size);
 }
 
-extern "C" __global__ void matrix_sqrt_f(float *A, float *C, unsigned int size) {
+extern "C" __global__ void matrix_sqrt_f(float *A, float *C,
+                                         unsigned int size) {
   matrix_sqrt(A, C, size);
 }
 
@@ -1393,12 +1399,12 @@ __device__ void matrix_round(T *A, T *C, unsigned int size) {
 }
 
 extern "C" __global__ void matrix_round_d(double *A, double *C,
-                                         unsigned int size) {
+                                          unsigned int size) {
   matrix_round(A, C, size);
 }
 
 extern "C" __global__ void matrix_round_f(float *A, float *C,
-                                         unsigned int size) {
+                                          unsigned int size) {
   matrix_round(A, C, size);
 }
 
@@ -1417,7 +1423,7 @@ __device__ void matrix_abs(T *A, T *C, unsigned int size) {
 }
 
 extern "C" __global__ void matrix_abs_d(double *A, double *C,
-                                       unsigned int size) {
+                                        unsigned int size) {
   matrix_abs(A, C, size);
 }
 
@@ -1440,7 +1446,7 @@ __device__ void matrix_log(T *A, T *C, unsigned int size) {
 }
 
 extern "C" __global__ void matrix_log_d(double *A, double *C,
-                                       unsigned int size) {
+                                        unsigned int size) {
   matrix_log(A, C, size);
 }
 
@@ -1463,12 +1469,12 @@ __device__ void matrix_floor(T *A, T *C, unsigned int size) {
 }
 
 extern "C" __global__ void matrix_floor_d(double *A, double *C,
-                                         unsigned int size) {
+                                          unsigned int size) {
   matrix_floor(A, C, size);
 }
 
 extern "C" __global__ void matrix_floor_f(float *A, float *C,
-                                         unsigned int size) {
+                                          unsigned int size) {
   matrix_floor(A, C, size);
 }
 
@@ -1487,11 +1493,12 @@ __device__ void matrix_ceil(T *A, T *C, unsigned int size) {
 }
 
 extern "C" __global__ void matrix_ceil_d(double *A, double *C,
-                                        unsigned int size) {
+                                         unsigned int size) {
   matrix_ceil(A, C, size);
 }
 
-extern "C" __global__ void matrix_ceil_f(float *A, float *C, unsigned int size) {
+extern "C" __global__ void matrix_ceil_f(float *A, float *C,
+                                         unsigned int size) {
   matrix_ceil(A, C, size);
 }
 
@@ -1510,7 +1517,7 @@ __device__ void matrix_sin(T *A, T *C, unsigned int size) {
 }
 
 extern "C" __global__ void matrix_sin_d(double *A, double *C,
-                                       unsigned int size) {
+                                        unsigned int size) {
   matrix_sin(A, C, size);
 }
 
@@ -1533,11 +1540,12 @@ __device__ void matrix_sinh(T *A, T *C, unsigned int size) {
 }
 
 extern "C" __global__ void matrix_sinh_d(double *A, double *C,
-                                        unsigned int size) {
+                                         unsigned int size) {
   matrix_sinh(A, C, size);
 }
 
-extern "C" __global__ void matrix_sinh_f(float *A, float *C, unsigned int size) {
+extern "C" __global__ void matrix_sinh_f(float *A, float *C,
+                                         unsigned int size) {
   matrix_sinh(A, C, size);
 }
 
@@ -1556,7 +1564,7 @@ __device__ void matrix_cos(T *A, T *C, unsigned int size) {
 }
 
 extern "C" __global__ void matrix_cos_d(double *A, double *C,
-                                       unsigned int size) {
+                                        unsigned int size) {
   matrix_cos(A, C, size);
 }
 
@@ -1579,11 +1587,12 @@ __device__ void matrix_cosh(T *A, T *C, unsigned int size) {
 }
 
 extern "C" __global__ void matrix_cosh_d(double *A, double *C,
-                                        unsigned int size) {
+                                         unsigned int size) {
   matrix_cosh(A, C, size);
 }
 
-extern "C" __global__ void matrix_cosh_f(float *A, float *C, unsigned int size) {
+extern "C" __global__ void matrix_cosh_f(float *A, float *C,
+                                         unsigned int size) {
   matrix_cosh(A, C, size);
 }
 
@@ -1602,7 +1611,7 @@ __device__ void matrix_tan(T *A, T *C, unsigned int size) {
 }
 
 extern "C" __global__ void matrix_tan_d(double *A, double *C,
-                                       unsigned int size) {
+                                        unsigned int size) {
   matrix_tan(A, C, size);
 }
 
@@ -1625,11 +1634,12 @@ __device__ void matrix_tanh(T *A, T *C, unsigned int size) {
 }
 
 extern "C" __global__ void matrix_tanh_d(double *A, double *C,
-                                        unsigned int size) {
+                                         unsigned int size) {
   matrix_tanh(A, C, size);
 }
 
-extern "C" __global__ void matrix_tanh_f(float *A, float *C, unsigned int size) {
+extern "C" __global__ void matrix_tanh_f(float *A, float *C,
+                                         unsigned int size) {
   matrix_tanh(A, C, size);
 }
 
@@ -1648,11 +1658,12 @@ __device__ void matrix_asin(T *A, T *C, unsigned int size) {
 }
 
 extern "C" __global__ void matrix_asin_d(double *A, double *C,
-                                        unsigned int size) {
+                                         unsigned int size) {
   matrix_asin(A, C, size);
 }
 
-extern "C" __global__ void matrix_asin_f(float *A, float *C, unsigned int size) {
+extern "C" __global__ void matrix_asin_f(float *A, float *C,
+                                         unsigned int size) {
   matrix_asin(A, C, size);
 }
 
@@ -1671,11 +1682,12 @@ __device__ void matrix_acos(T *A, T *C, unsigned int size) {
 }
 
 extern "C" __global__ void matrix_acos_d(double *A, double *C,
-                                        unsigned int size) {
+                                         unsigned int size) {
   matrix_acos(A, C, size);
 }
 
-extern "C" __global__ void matrix_acos_f(float *A, float *C, unsigned int size) {
+extern "C" __global__ void matrix_acos_f(float *A, float *C,
+                                         unsigned int size) {
   matrix_acos(A, C, size);
 }
 
@@ -1694,11 +1706,12 @@ __device__ void matrix_atan(T *A, T *C, unsigned int size) {
 }
 
 extern "C" __global__ void matrix_atan_d(double *A, double *C,
-                                        unsigned int size) {
+                                         unsigned int size) {
   matrix_atan(A, C, size);
 }
 
-extern "C" __global__ void matrix_atan_f(float *A, float *C, unsigned int size) {
+extern "C" __global__ void matrix_atan_f(float *A, float *C,
+                                         unsigned int size) {
   matrix_atan(A, C, size);
 }
 
@@ -1722,10 +1735,11 @@ __device__ void matrix_sign(T *A, T *C, unsigned int size) {
 }
 
 extern "C" __global__ void matrix_sign_d(double *A, double *C,
-                                        unsigned int size) {
+                                         unsigned int size) {
   matrix_sign(A, C, size);
 }
 
-extern "C" __global__ void matrix_sign_f(float *A, float *C, unsigned int size) {
+extern "C" __global__ void matrix_sign_f(float *A, float *C,
+                                         unsigned int size) {
   matrix_sign(A, C, size);
-}
\ No newline at end of file
+}

http://git-wip-us.apache.org/repos/asf/systemml/blob/d916ba5b/src/main/java/org/apache/sysml/hops/AggUnaryOp.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/hops/AggUnaryOp.java b/src/main/java/org/apache/sysml/hops/AggUnaryOp.java
index 04a32bd..9b9406a 100644
--- a/src/main/java/org/apache/sysml/hops/AggUnaryOp.java
+++ b/src/main/java/org/apache/sysml/hops/AggUnaryOp.java
@@ -26,6 +26,7 @@ import org.apache.sysml.hops.rewrite.HopRewriteUtils;
 import org.apache.sysml.lops.Aggregate;
 import org.apache.sysml.lops.Aggregate.OperationTypes;
 import org.apache.sysml.lops.Binary;
+import org.apache.sysml.lops.ConvolutionTransform;
 import org.apache.sysml.lops.Group;
 import org.apache.sysml.lops.Lop;
 import org.apache.sysml.lops.LopsException;
@@ -131,6 +132,20 @@ public class AggUnaryOp extends Hop implements MultiThreadedHop
 		return false;
 	}
 	
+	/**
+	 * Checks if channels sum rewrite is applicable
+	 * 
+	 * @return returns true for pattern rowSums(matrix(colSums(X), rows=.., cols=..)) else false
+	 */
+	private boolean isChannelSumRewriteApplicable() {
+		if( OptimizerUtils.ALLOW_OPERATOR_FUSION && _op == AggOp.SUM && _direction == Direction.Row
+			&& getInput().get(0) instanceof ReorgOp && ((ReorgOp)getInput().get(0)).getOp() == ReOrgOp.RESHAPE) {
+			Hop input1 = getInput().get(0).getInput().get(0);
+			return input1 instanceof AggUnaryOp && ((AggUnaryOp)input1)._op == AggOp.SUM && ((AggUnaryOp)input1)._direction == Direction.Col;
+		}
+		return false;
+	}
+	
 	@Override
 	public Lop constructLops()
 		throws HopsException, LopsException 
@@ -147,41 +162,57 @@ public class AggUnaryOp extends Hop implements MultiThreadedHop
 			if ( et == ExecType.CP || et == ExecType.GPU ) 
 			{
 				Lop agg1 = null;
-				if( isTernaryAggregateRewriteApplicable() ) {
-					agg1 = constructLopsTernaryAggregateRewrite(et);
+				long numChannels = isChannelSumRewriteApplicable() ? Hop.computeSizeInformation(getInput().get(0).getInput().get(1)) : -1;
+				if(numChannels > 0 && numChannels < 1000000) {
+					// Apply channel sums only if rewrite is applicable and if the dimension of C is known at compile time
+					// and if numChannels is less than 8 MB.
+					ReorgOp in = ((ReorgOp)getInput().get(0));
+					agg1 = new ConvolutionTransform(
+							in.getInput().get(0).getInput().get(0).constructLops(), 
+							in.getInput().get(1).constructLops(),
+							in.getInput().get(2).constructLops(),
+							ConvolutionTransform.OperationTypes.CHANNEL_SUMS, getDataType(), getValueType(), et, -1);
+					agg1.getOutputParameters().setDimensions(numChannels, 1, getRowsInBlock(), getColsInBlock(), -1);
+					setLineNumbers(agg1);
+					setLops(agg1);
 				}
-				else if( isUnaryAggregateOuterCPRewriteApplicable() )
-				{
-					OperationTypes op = HopsAgg2Lops.get(_op);
-					DirectionTypes dir = HopsDirection2Lops.get(_direction);
-
-					BinaryOp binput = (BinaryOp)getInput().get(0);
-					agg1 = new UAggOuterChain( binput.getInput().get(0).constructLops(), 
-							binput.getInput().get(1).constructLops(), op, dir, 
-							HopsOpOp2LopsB.get(binput.getOp()), DataType.MATRIX, getValueType(), ExecType.CP);
-					PartialAggregate.setDimensionsBasedOnDirection(agg1, getDim1(), getDim2(), input.getRowsInBlock(), input.getColsInBlock(), dir);
-				
+				else { 
+					if( isTernaryAggregateRewriteApplicable() ) {
+						agg1 = constructLopsTernaryAggregateRewrite(et);
+					}
+					else if( isUnaryAggregateOuterCPRewriteApplicable() )
+					{
+						OperationTypes op = HopsAgg2Lops.get(_op);
+						DirectionTypes dir = HopsDirection2Lops.get(_direction);
+	
+						BinaryOp binput = (BinaryOp)getInput().get(0);
+						agg1 = new UAggOuterChain( binput.getInput().get(0).constructLops(), 
+								binput.getInput().get(1).constructLops(), op, dir, 
+								HopsOpOp2LopsB.get(binput.getOp()), DataType.MATRIX, getValueType(), ExecType.CP);
+						PartialAggregate.setDimensionsBasedOnDirection(agg1, getDim1(), getDim2(), input.getRowsInBlock(), input.getColsInBlock(), dir);
+					
+						if (getDataType() == DataType.SCALAR) {
+							UnaryCP unary1 = new UnaryCP(agg1, HopsOpOp1LopsUS.get(OpOp1.CAST_AS_SCALAR),
+									                    getDataType(), getValueType());
+							unary1.getOutputParameters().setDimensions(0, 0, 0, 0, -1);
+							setLineNumbers(unary1);
+							setLops(unary1);
+						}
+					
+					}				
+					else { //general case		
+						int k = OptimizerUtils.getConstrainedNumThreads(_maxNumThreads);
+						agg1 = new PartialAggregate(input.constructLops(), 
+								HopsAgg2Lops.get(_op), HopsDirection2Lops.get(_direction), getDataType(),getValueType(), et, k);
+					}
+					
+					setOutputDimensions(agg1);
+					setLineNumbers(agg1);
+					setLops(agg1);
+					
 					if (getDataType() == DataType.SCALAR) {
-						UnaryCP unary1 = new UnaryCP(agg1, HopsOpOp1LopsUS.get(OpOp1.CAST_AS_SCALAR),
-								                    getDataType(), getValueType());
-						unary1.getOutputParameters().setDimensions(0, 0, 0, 0, -1);
-						setLineNumbers(unary1);
-						setLops(unary1);
+						agg1.getOutputParameters().setDimensions(1, 1, getRowsInBlock(), getColsInBlock(), getNnz());
 					}
-				
-				}				
-				else { //general case		
-					int k = OptimizerUtils.getConstrainedNumThreads(_maxNumThreads);
-					agg1 = new PartialAggregate(input.constructLops(), 
-							HopsAgg2Lops.get(_op), HopsDirection2Lops.get(_direction), getDataType(),getValueType(), et, k);
-				}
-				
-				setOutputDimensions(agg1);
-				setLineNumbers(agg1);
-				setLops(agg1);
-				
-				if (getDataType() == DataType.SCALAR) {
-					agg1.getOutputParameters().setDimensions(1, 1, getRowsInBlock(), getColsInBlock(), getNnz());
 				}
 			}
 			else if( et == ExecType.MR )

http://git-wip-us.apache.org/repos/asf/systemml/blob/d916ba5b/src/main/java/org/apache/sysml/lops/ConvolutionTransform.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/lops/ConvolutionTransform.java b/src/main/java/org/apache/sysml/lops/ConvolutionTransform.java
index dfc187c..94a67f0 100644
--- a/src/main/java/org/apache/sysml/lops/ConvolutionTransform.java
+++ b/src/main/java/org/apache/sysml/lops/ConvolutionTransform.java
@@ -32,7 +32,7 @@ public class ConvolutionTransform extends Lop
 	public enum OperationTypes {
 		MAX_POOLING, MAX_POOLING_BACKWARD, RELU_MAX_POOLING, RELU_BACKWARD, RELU_MAX_POOLING_BACKWARD,
 		DIRECT_CONV2D, DIRECT_CONV2D_BACKWARD_FILTER, DIRECT_CONV2D_BACKWARD_DATA,
-		BIAS_ADD, DIRECT_CONV2D_BIAS_ADD, BIAS_MULTIPLY
+		BIAS_ADD, DIRECT_CONV2D_BIAS_ADD, BIAS_MULTIPLY, CHANNEL_SUMS
 	}
 	
 	private OperationTypes operation = null;
@@ -67,6 +67,18 @@ public class ConvolutionTransform extends Lop
 		input2.addOutput(this);
 		setLevel();
 	}
+	
+	public ConvolutionTransform(Lop input1, Lop input2, Lop input3, ConvolutionTransform.OperationTypes op, DataType dt, ValueType vt, ExecType et, int k) 
+	{
+		super(Lop.Type.Transform, dt, vt);		
+		init(input1, op, dt, vt, et);
+		numThreads = k;
+		this.addInput(input2);
+		input2.addOutput(this);
+		this.addInput(input3);
+		input3.addOutput(this);
+		setLevel();
+	}
 
 	private void init (Lop input, ConvolutionTransform.OperationTypes op, DataType dt, ValueType vt, ExecType et) 
 	{
@@ -142,6 +154,9 @@ public class ConvolutionTransform extends Lop
 		case DIRECT_CONV2D_BACKWARD_DATA:
 			return "conv2d_backward_data";
 			
+		case CHANNEL_SUMS:
+			return "channel_sums";
+			
 		default:
 			throw new UnsupportedOperationException(this.printErrorLocation() + "Instruction is not defined for Transform operation " + operation);
 				
@@ -180,6 +195,31 @@ public class ConvolutionTransform extends Lop
 	}
 	
 	@Override
+	public String getInstructions(String input, String C, String HW, String output) throws LopsException {
+		if(operation == OperationTypes.CHANNEL_SUMS) {
+			StringBuilder sb = new StringBuilder();
+			sb.append( getExecType() );
+			
+			sb.append( OPERAND_DELIMITOR );
+			sb.append( getOpcode() );
+			sb.append( OPERAND_DELIMITOR );
+			sb.append( getInputs().get(0).prepInputOperand(input));
+			sb.append( OPERAND_DELIMITOR );
+			sb.append( getInputs().get(1).prepInputOperand(C));
+			sb.append( OPERAND_DELIMITOR );
+			sb.append( getInputs().get(2).prepInputOperand(HW));
+			//output
+			sb.append( OPERAND_DELIMITOR );
+			sb.append( this.prepOutputOperand(output));
+			
+			return sb.toString();
+		}
+		else {
+			throw new LopsException("The operation is not supported with three operands:" + operation.name());
+		}
+	}
+	
+	@Override
 	public String getInstructions(String[] inputs, String output) throws LopsException {
 		StringBuilder sb = new StringBuilder();
 		appendOpcode(sb);

http://git-wip-us.apache.org/repos/asf/systemml/blob/d916ba5b/src/main/java/org/apache/sysml/runtime/instructions/CPInstructionParser.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/instructions/CPInstructionParser.java b/src/main/java/org/apache/sysml/runtime/instructions/CPInstructionParser.java
index 4e66042..d0bc429 100644
--- a/src/main/java/org/apache/sysml/runtime/instructions/CPInstructionParser.java
+++ b/src/main/java/org/apache/sysml/runtime/instructions/CPInstructionParser.java
@@ -233,6 +233,7 @@ public class CPInstructionParser extends InstructionParser
 		String2CPInstructionType.put( "conv2d_backward_data"      , CPINSTRUCTION_TYPE.Convolution);
 		String2CPInstructionType.put( "bias_add"      , CPINSTRUCTION_TYPE.Convolution);
 		String2CPInstructionType.put( "bias_multiply"      , CPINSTRUCTION_TYPE.Convolution);
+		String2CPInstructionType.put( "channel_sums"      , CPINSTRUCTION_TYPE.Convolution);
 		
 		// Quaternary instruction opcodes
 		String2CPInstructionType.put( "wsloss"  , CPINSTRUCTION_TYPE.Quaternary);

http://git-wip-us.apache.org/repos/asf/systemml/blob/d916ba5b/src/main/java/org/apache/sysml/runtime/instructions/GPUInstructionParser.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/instructions/GPUInstructionParser.java b/src/main/java/org/apache/sysml/runtime/instructions/GPUInstructionParser.java
index 503576f..ae19969 100644
--- a/src/main/java/org/apache/sysml/runtime/instructions/GPUInstructionParser.java
+++ b/src/main/java/org/apache/sysml/runtime/instructions/GPUInstructionParser.java
@@ -53,6 +53,7 @@ public class GPUInstructionParser  extends InstructionParser
 		String2GPUInstructionType.put( "maxpooling_backward",    GPUINSTRUCTION_TYPE.Convolution);
 		String2GPUInstructionType.put( "bias_add",               GPUINSTRUCTION_TYPE.Convolution);
 		String2GPUInstructionType.put( "bias_multiply",          GPUINSTRUCTION_TYPE.Convolution);
+		String2GPUInstructionType.put( "channel_sums",          GPUINSTRUCTION_TYPE.Convolution);
 
 		// Matrix Multiply Operators
 		String2GPUInstructionType.put( "ba+*",  GPUINSTRUCTION_TYPE.AggregateBinary);

http://git-wip-us.apache.org/repos/asf/systemml/blob/d916ba5b/src/main/java/org/apache/sysml/runtime/instructions/cp/ConvolutionCPInstruction.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/instructions/cp/ConvolutionCPInstruction.java b/src/main/java/org/apache/sysml/runtime/instructions/cp/ConvolutionCPInstruction.java
index c6b4698..36422d9 100644
--- a/src/main/java/org/apache/sysml/runtime/instructions/cp/ConvolutionCPInstruction.java
+++ b/src/main/java/org/apache/sysml/runtime/instructions/cp/ConvolutionCPInstruction.java
@@ -27,12 +27,14 @@ import org.apache.sysml.api.DMLScript;
 import org.apache.sysml.hops.OptimizerUtils;
 import org.apache.sysml.runtime.DMLRuntimeException;
 import org.apache.sysml.runtime.controlprogram.context.ExecutionContext;
+import org.apache.sysml.runtime.functionobjects.KahanPlus;
 import org.apache.sysml.runtime.functionobjects.SwapIndex;
 import org.apache.sysml.runtime.instructions.InstructionUtils;
 import org.apache.sysml.runtime.matrix.data.ConvolutionParameters;
 import org.apache.sysml.runtime.matrix.data.LibMatrixDNN;
 import org.apache.sysml.runtime.matrix.data.LibMatrixNative;
 import org.apache.sysml.runtime.matrix.data.MatrixBlock;
+import org.apache.sysml.runtime.matrix.data.SparseBlock;
 import org.apache.sysml.runtime.matrix.operators.ReorgOperator;
 import org.apache.sysml.runtime.util.ConvolutionUtils;
 import org.apache.sysml.utils.NativeHelper;
@@ -59,6 +61,19 @@ public class ConvolutionCPInstruction extends UnaryCPInstruction {
 		_numThreads = numThreads;
 		_intermediateMemoryBudget = intermediateMemoryBudget;
 	}
+	
+	public ConvolutionCPInstruction(CPOperand in, CPOperand in2, CPOperand in3, CPOperand out, String opcode, String istr, int numThreads, double intermediateMemoryBudget) throws DMLRuntimeException {
+		super(new ReorgOperator(SwapIndex.getSwapIndexFnObject()), in, out,
+				opcode, istr);
+		if( !opcode.equals("channel_sums") ) {
+			throw new DMLRuntimeException("Incorrect usage. Expected the opcode to be channel_sums, but found " + opcode);
+		}
+		_in2 = in2;
+		_in3 = in3;
+		_cptype = CPINSTRUCTION_TYPE.Convolution;
+		_numThreads = numThreads;
+		_intermediateMemoryBudget = intermediateMemoryBudget;
+	}
 
 	private ConvolutionCPInstruction(CPOperand in, CPOperand out, String opcode, String istr,
 			ArrayList<CPOperand> stride, ArrayList<CPOperand> padding, ArrayList<CPOperand> input_shape,
@@ -212,6 +227,14 @@ public class ConvolutionCPInstruction extends UnaryCPInstruction {
 			int k = Integer.parseInt(parts[4]);
 			return new ConvolutionCPInstruction(in, in2, out, opcode, str, k, Double.parseDouble(parts[5]));
 		}
+		else if (opcode.equalsIgnoreCase("channel_sums")) {
+			InstructionUtils.checkNumFields(parts, 4);
+			CPOperand in = new CPOperand(parts[1]);
+			CPOperand in2 = new CPOperand(parts[2]);
+			CPOperand in3 = new CPOperand(parts[3]);
+			CPOperand out = new CPOperand(parts[4]);
+			return new ConvolutionCPInstruction(in, in2, in3, out, opcode, str, -1, 0);
+		}
 		else {
 			throw new DMLRuntimeException("Unknown opcode while parsing a ConvolutionCPInstruction: " + str);
 		}
@@ -297,6 +320,65 @@ public class ConvolutionCPInstruction extends UnaryCPInstruction {
 		ec.setMatrixOutput(getOutputVariableName(), outputBlock, getExtendedOpcode());
 	}
 	
+	public void processChannelSumsInstruction(ExecutionContext ec) throws DMLRuntimeException {
+		MatrixBlock input = ec.getMatrixInput(input1.getName(), getExtendedOpcode());
+		int C = (int) ec.getScalarInput(_in2.getName(), _in2.getValueType(), _in2.isLiteral()).getLongValue();
+		int HW = (int) ec.getScalarInput(_in3.getName(), _in3.getValueType(), _in3.isLiteral()).getLongValue();
+		if(C*HW != input.getNumColumns()) {
+			throw new DMLRuntimeException("Expected rows*cols" + C + "*" + HW + " to be equal to number of columns of input " + input.getNumColumns());
+		}
+		MatrixBlock outputBlock = null;
+		if(input.isEmpty()) {
+			outputBlock = new MatrixBlock(C, 1, true);
+		}
+		else {
+			outputBlock = new MatrixBlock(C, 1, false).allocateBlock();
+			double [] output = outputBlock.getDenseBlock();
+			if(input.isInSparseFormat()) {
+				SparseBlock sblock = input.getSparseBlock();
+				for(int n = 0; n < input.getNumRows(); n++) {
+					if( sblock.isEmpty(n) )
+						continue;
+					int apos = sblock.pos(n);
+					int alen = sblock.size(n);
+					int[] aix = sblock.indexes(n);
+					double[] avals = sblock.values(n);
+					
+					// Iterate over the sparse block
+					for(int j=apos; j<apos+alen; j++) {
+						// Note: the input is of shape [N, CHW]
+						int chw = aix[j];
+						
+						// Get individual zero-based c,h,w indexes from zero-based 'chw'
+						int c = chw / HW;
+						output[c] += avals[j];
+					}
+				}
+			}
+			else {
+				double [] inArr = input.getDenseBlock();
+				if(inArr != null) {
+					KahanPlus kplus = KahanPlus.getKahanPlusFnObject();
+					for(int c = 0; c < C; c++) {
+						KahanObject sum = new KahanObject(0.0, 0.0);
+						for(int n = 0; n < input.getNumRows(); n++) {
+							int index =  n*C*HW + c*HW;
+							for(int hw = 0; hw < HW; hw++, index++) {
+								kplus.execute2(sum, inArr[index]);
+							}
+						}
+						output[c] = sum._sum;
+					}
+				}
+			}
+			outputBlock.recomputeNonZeros(getExtendedOpcode());
+		}
+		
+		// release inputs/outputs
+		ec.releaseMatrixInput(input1.getName(), getExtendedOpcode());
+		ec.setMatrixOutput(getOutputVariableName(), outputBlock, getExtendedOpcode());
+	}
+	
 	// Assumption: enableNative && NativeHelper.isNativeLibraryLoaded() is true
 	// This increases the number of native calls. For example:the cases where filter is sparse but input is dense
 	private static boolean isFilterSparse(MatrixBlock filter) throws DMLRuntimeException {
@@ -324,6 +406,10 @@ public class ConvolutionCPInstruction extends UnaryCPInstruction {
 			processReluBackwardInstruction(ec);
 			return;
 		}
+		else if (instOpcode.equalsIgnoreCase("channel_sums")) {
+			processChannelSumsInstruction(ec);
+			return;
+		}
 		
 		// acquire inputs
 		MatrixBlock outputBlock = null;

http://git-wip-us.apache.org/repos/asf/systemml/blob/d916ba5b/src/main/java/org/apache/sysml/runtime/instructions/gpu/ConvolutionGPUInstruction.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/instructions/gpu/ConvolutionGPUInstruction.java b/src/main/java/org/apache/sysml/runtime/instructions/gpu/ConvolutionGPUInstruction.java
index 8565b5a..fdb208e 100644
--- a/src/main/java/org/apache/sysml/runtime/instructions/gpu/ConvolutionGPUInstruction.java
+++ b/src/main/java/org/apache/sysml/runtime/instructions/gpu/ConvolutionGPUInstruction.java
@@ -20,12 +20,17 @@ package org.apache.sysml.runtime.instructions.gpu;
 
 import java.util.ArrayList;
 
+import jcuda.Pointer;
+
 import org.apache.sysml.runtime.DMLRuntimeException;
 import org.apache.sysml.runtime.controlprogram.caching.MatrixObject;
 import org.apache.sysml.runtime.controlprogram.context.ExecutionContext;
 import org.apache.sysml.runtime.functionobjects.SwapIndex;
 import org.apache.sysml.runtime.instructions.InstructionUtils;
 import org.apache.sysml.runtime.instructions.cp.CPOperand;
+import org.apache.sysml.runtime.instructions.cp.ConvolutionCPInstruction;
+import org.apache.sysml.runtime.instructions.gpu.context.ExecutionConfig;
+import org.apache.sysml.runtime.instructions.gpu.context.GPUContext;
 import org.apache.sysml.runtime.matrix.data.LibMatrixCUDA;
 import org.apache.sysml.runtime.matrix.data.LibMatrixCuDNN;
 import org.apache.sysml.runtime.matrix.operators.ReorgOperator;
@@ -57,6 +62,19 @@ public class ConvolutionGPUInstruction extends GPUInstruction {
 		_intermediateMemoryBudget = intermediateMemoryBudget;
 	}
 	
+	public ConvolutionGPUInstruction(CPOperand in1, CPOperand in2, CPOperand in3, CPOperand out, String opcode, String istr, double intermediateMemoryBudget) throws DMLRuntimeException {
+		super(new ReorgOperator(SwapIndex.getSwapIndexFnObject()), opcode, istr);
+		if( !opcode.equals("channel_sums") ) {
+			throw new DMLRuntimeException("Incorrect usage. Expected the opcode to be channel_sums, but found " + opcode);
+		}
+		_input1 = in1;
+		_input2 = in2;
+		_input3 = in3;
+		_gputype = GPUINSTRUCTION_TYPE.Convolution;
+		_output = out;
+		_intermediateMemoryBudget = intermediateMemoryBudget;
+	}
+	
 	public ConvolutionGPUInstruction(CPOperand in1, CPOperand in2, CPOperand in3, CPOperand out, String opcode,
 			String istr, ArrayList<CPOperand> stride,
 			ArrayList<CPOperand> padding, ArrayList<CPOperand> input_shape,
@@ -210,6 +228,14 @@ public class ConvolutionGPUInstruction extends GPUInstruction {
 			CPOperand out = new CPOperand(parts[3]);
 			return new ConvolutionGPUInstruction(in1, in2, out, opcode, str, Double.parseDouble(parts[4]));
 		}
+		else if (opcode.equalsIgnoreCase("channel_sums")) {
+			InstructionUtils.checkNumFields(parts, 4);
+			CPOperand in = new CPOperand(parts[1]);
+			CPOperand in2 = new CPOperand(parts[2]);
+			CPOperand in3 = new CPOperand(parts[3]);
+			CPOperand out = new CPOperand(parts[4]);
+			return new ConvolutionGPUInstruction(in, in2, in3, out, opcode, str, 0);
+		}
 		else {
 			throw new DMLRuntimeException("Unknown opcode while parsing a ConvolutionGPUInstruction: " + str);	
 		}
@@ -246,6 +272,23 @@ public class ConvolutionGPUInstruction extends GPUInstruction {
 		ec.releaseMatrixOutputForGPUInstruction(_output.getName());
 	}
 	
+	public void processChannelSumsInstruction(ExecutionContext ec) throws DMLRuntimeException {
+		GPUStatistics.incrementNoOfExecutedGPUInst();
+		MatrixObject input = getMatrixInputForGPUInstruction(ec, _input1.getName());
+		int C = (int) ec.getScalarInput(_input2.getName(), _input2.getValueType(), _input2.isLiteral()).getLongValue();
+		int HW = (int) ec.getScalarInput(_input3.getName(), _input3.getValueType(), _input3.isLiteral()).getLongValue();
+		if(C*HW != input.getNumColumns()) {
+			throw new DMLRuntimeException("Expected rows*cols = " + C + "*" + HW + " to be equal to the number of columns of the input " + input.getNumColumns());
+		}
+		MatrixObject outputBlock = getDenseMatrixOutputForGPUInstruction(ec, _output.getName(), C, 1);
+		
+		LibMatrixCUDA.channelSums(ec.getGPUContext(0), getExtendedOpcode(), input, outputBlock, C, HW);
+		
+		// release inputs/outputs
+		ec.releaseMatrixInputForGPUInstruction(_input1.getName());
+		ec.releaseMatrixOutputForGPUInstruction(_output.getName());
+	}
+	
 	@Override
 	public void processInstruction(ExecutionContext ec) 
 			throws DMLRuntimeException 
@@ -258,6 +301,10 @@ public class ConvolutionGPUInstruction extends GPUInstruction {
 			processReLUBackwardInstruction(ec);
 			return;
 		}
+		else if (instOpcode.equalsIgnoreCase("channel_sums")) {
+			processChannelSumsInstruction(ec);
+			return;
+		}
 		
 		GPUStatistics.incrementNoOfExecutedGPUInst();
 					

http://git-wip-us.apache.org/repos/asf/systemml/blob/d916ba5b/src/main/java/org/apache/sysml/runtime/instructions/spark/QuantilePickSPInstruction.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/instructions/spark/QuantilePickSPInstruction.java b/src/main/java/org/apache/sysml/runtime/instructions/spark/QuantilePickSPInstruction.java
index e7f515a..caaa9e8 100644
--- a/src/main/java/org/apache/sysml/runtime/instructions/spark/QuantilePickSPInstruction.java
+++ b/src/main/java/org/apache/sysml/runtime/instructions/spark/QuantilePickSPInstruction.java
@@ -316,7 +316,7 @@ public class QuantilePickSPInstruction extends BinarySPInstruction {
 				sum += v2.next()._2().sumWeightForQuantile();
 			
 			//return tuple for partition aggregate
-			return Arrays.asList(new Tuple2<>(v1,sum)).iterator();
+			return Arrays.asList(new Tuple2<Integer, Double>(v1,sum)).iterator();
 		}
 	}
 	

http://git-wip-us.apache.org/repos/asf/systemml/blob/d916ba5b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCUDA.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCUDA.java b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCUDA.java
index 2cccde0..c0091c8 100644
--- a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCUDA.java
+++ b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCUDA.java
@@ -322,6 +322,37 @@ public class LibMatrixCUDA {
 		if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_RELU_BACKWARD_KERNEL, System.nanoTime() - t1);
 
 	}
+	
+	/**
+	 * Performs the channel_sums operation: out = rowSums(matrix(colSums(A), rows=C, cols=HW))
+	 * 
+	 * @param gCtx a valid {@link GPUContext}
+	 * @param instName the invoking instruction's name for record {@link Statistics}.
+	 * @param input input image
+	 * @param outputBlock output
+	 * @param C number of channels
+	 * @param HW height*width
+	 * @throws DMLRuntimeException if DMLRuntimeException occurs
+	 */
+	public static void channelSums(GPUContext gCtx, String instName, MatrixObject input, MatrixObject outputBlock, long C, long HW) throws DMLRuntimeException {
+		if(LOG.isTraceEnabled()) {
+			LOG.trace("GPU : channelSums" + ", GPUContext=" + gCtx);
+		}
+		int N = toInt(input.getNumRows());
+		int cols = toInt(input.getNumColumns());
+		if(cols != C*HW) {
+			throw new DMLRuntimeException("Incorrect parameters, number of columns " + cols + " != " + C + "*" + HW);
+		}
+		Pointer imagePointer = getDensePointer(gCtx, input, instName);
+		Pointer outputPointer = getDensePointer(gCtx, outputBlock, instName);
+		
+		// We can replace this with CuDNN tensor reduce
+		Pointer tmp = gCtx.allocate(instName, cols*sizeOfDataType);
+		reduceCol(gCtx, instName, "reduce_col_sum", imagePointer, tmp, N, cols);
+		reduceRow(gCtx, instName, "reduce_row_sum", tmp, outputPointer, toInt(C), toInt(HW));
+		gCtx.cudaFreeHelper(tmp);
+
+	}
 
 	/**
 	 * Performs the operation corresponding to the DML script:

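The reduce_col_sum/reduce_row_sum kernel chain above computes the same result as
the single-pass CP implementation; a two-step CPU sketch of that formulation,
assuming a dense row-major [N, C*HW] input (illustrative code, not SystemML API):

  class ChannelSumsTwoStepSketch {
    // Step 1: column sums over the [N, C*HW] input; step 2: row sums of the C x HW reshape.
    static double[] channelSumsTwoStep(double[] in, int N, int C, int HW) {
      int cols = C * HW;
      double[] colSums = new double[cols];
      for (int n = 0; n < N; n++)
        for (int j = 0; j < cols; j++)
          colSums[j] += in[n * cols + j];
      double[] out = new double[C];
      for (int c = 0; c < C; c++)
        for (int hw = 0; hw < HW; hw++)
          out[c] += colSums[c * HW + hw];
      return out;
    }
  }
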
http://git-wip-us.apache.org/repos/asf/systemml/blob/d916ba5b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuDNN.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuDNN.java b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuDNN.java
index e0a6a57..5935285 100644
--- a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuDNN.java
+++ b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuDNN.java
@@ -64,7 +64,7 @@ public class LibMatrixCuDNN extends LibMatrixCUDA {
 	protected static cudnnHandle getCudnnHandle(GPUContext gCtx) throws DMLRuntimeException {
 		return gCtx.getCudnnHandle();
 	}
-
+	
 	/**
 	 * Does a 2D convolution followed by a bias_add
 	 *
@@ -722,4 +722,4 @@ public class LibMatrixCuDNN extends LibMatrixCUDA {
 		if(status != cudnnStatus.CUDNN_STATUS_SUCCESS)
 			throw new DMLRuntimeException("Error status returned by CuDNN:" + jcuda.jcudnn.cudnnStatus.stringFor(status));
 	}
-}
\ No newline at end of file
+}

http://git-wip-us.apache.org/repos/asf/systemml/blob/d916ba5b/src/test/java/org/apache/sysml/test/gpu/AggregateUnaryOpTests.java
----------------------------------------------------------------------
diff --git a/src/test/java/org/apache/sysml/test/gpu/AggregateUnaryOpTests.java b/src/test/java/org/apache/sysml/test/gpu/AggregateUnaryOpTests.java
index 0b229f0..59e9cb1 100644
--- a/src/test/java/org/apache/sysml/test/gpu/AggregateUnaryOpTests.java
+++ b/src/test/java/org/apache/sysml/test/gpu/AggregateUnaryOpTests.java
@@ -45,6 +45,37 @@ public class AggregateUnaryOpTests extends UnaryOpTestsBase {
 	public void colSums() {
 		testSimpleUnaryOpMatrixOutput("colSums", "gpu_uack+");
 	}
+	
+	@Test
+	public void channelSums() {
+		int[] rows = rowSizes;
+		int[] C = new int[] { 2, 5, 10, 50 };
+		int[] HW = new int[] { 10, 12, 21, 51 };
+		double[] sparsities = this.sparsities;
+		int seed = this.seed;	
+
+		for (int k = 0; k < sparsities.length; k++) {
+			double sparsity = sparsities[k];
+			if(sparsity == 0)
+				continue; // sparsity == 0 has been independently tested but it fails with non-informative mlcontext error
+			for (int i = 0; i < rows.length; i++) {
+				int row = rows[i];
+				if(row == 1)
+					continue; // Currently channel_sums rewrite is enabled only for row > 1
+				for (int c : C) {
+					if(c == 1)
+						continue; // C == 1 will result in scalar value, but this case has been independently tested
+					for (int hw : HW) {
+						// Skip the case of a scalar unary op
+						// System.out.println("Started channelSum test for " + row + " " + c + " " + hw + " " +  sparsity);
+						String scriptStr = "out = rowSums(matrix(colSums(in1), rows=" + c + ", cols=" + hw + "));";
+						testUnaryOpMatrixOutput(scriptStr, "gpu_channel_sums", "in1", "out", seed, row, c*hw, sparsity);
+						// System.out.println("Ended channelSum test for " + row + " " + c + " " + hw + " " +  sparsity);
+					}
+				}
+			}
+		}
+	}
 
 	@Test
 	public void rowSums() {

http://git-wip-us.apache.org/repos/asf/systemml/blob/d916ba5b/src/test/java/org/apache/sysml/test/gpu/UnaryOpTestsBase.java
----------------------------------------------------------------------
diff --git a/src/test/java/org/apache/sysml/test/gpu/UnaryOpTestsBase.java b/src/test/java/org/apache/sysml/test/gpu/UnaryOpTestsBase.java
index 0051dd4..0f6b59c 100644
--- a/src/test/java/org/apache/sysml/test/gpu/UnaryOpTestsBase.java
+++ b/src/test/java/org/apache/sysml/test/gpu/UnaryOpTestsBase.java
@@ -31,10 +31,10 @@ import org.apache.sysml.api.mlcontext.Matrix;
 public abstract class UnaryOpTestsBase extends GPUTests {
 
 	// Set of rows and column sizes & sparsities to test unary ops
-	private final int[] rowSizes = new int[] { 2049, 1024, 140, 64, 1 };
-	private final int[] columnSizes = new int[] { 2049, 1024, 140, 64, 1 };
-	private final double[] sparsities = new double[] { 0.9, 0.3, 0.03, 0.0 };
-	private final int seed = 42;
+	protected final int[] rowSizes = new int[] { 2049, 1024, 140, 64, 1 };
+	protected final int[] columnSizes = new int[] { 2049, 1024, 150, 64, 1 };
+	protected final double[] sparsities = new double[] { 0.9, 0.3, 0.03, 0.0 };
+	protected final int seed = 42;
 
 	/**
 	 * Tests unary ops with a variety of matrix shapes and sparsities.

http://git-wip-us.apache.org/repos/asf/systemml/blob/d916ba5b/src/test/java/org/apache/sysml/test/integration/functions/tensor/ChannelSumTest.java
----------------------------------------------------------------------
diff --git a/src/test/java/org/apache/sysml/test/integration/functions/tensor/ChannelSumTest.java b/src/test/java/org/apache/sysml/test/integration/functions/tensor/ChannelSumTest.java
new file mode 100644
index 0000000..61ca370
--- /dev/null
+++ b/src/test/java/org/apache/sysml/test/integration/functions/tensor/ChannelSumTest.java
@@ -0,0 +1,146 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ * 
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.sysml.test.integration.functions.tensor;
+
+import java.util.HashMap;
+
+import org.apache.sysml.api.DMLScript;
+import org.apache.sysml.api.DMLScript.RUNTIME_PLATFORM;
+import org.apache.sysml.lops.LopProperties.ExecType;
+import org.apache.sysml.runtime.matrix.data.MatrixValue.CellIndex;
+import org.apache.sysml.test.integration.AutomatedTestBase;
+import org.apache.sysml.test.integration.TestConfiguration;
+import org.apache.sysml.test.utils.TestUtils;
+import org.junit.Test;
+
+public class ChannelSumTest extends AutomatedTestBase
+{
+	
+	private final static String TEST_NAME = "ChannelSumTest";
+	private final static String TEST_DIR = "functions/tensor/";
+	private final static String TEST_CLASS_DIR = TEST_DIR + ChannelSumTest.class.getSimpleName() + "/";
+	private final static double epsilon=0.0000000001;
+	
+	@Override
+	public void setUp() {
+		addTestConfiguration(TEST_NAME, new TestConfiguration(TEST_CLASS_DIR, TEST_NAME, 
+				new String[] {"B"}));
+	}
+	
+	@Test
+	public void testChannelSumDense1() 
+	{
+		int numImg = 10; int imgSize = 9; int numChannels = 5; 
+		runChannelSumTest(ExecType.CP, imgSize, numImg, numChannels, false);
+	}
+	
+	@Test
+	public void testChannelSumDense2() 
+	{
+		int numImg = 2; int imgSize = 5; int numChannels = 3; 
+		runChannelSumTest(ExecType.CP, imgSize, numImg, numChannels, false);
+	}
+	
+	@Test
+	public void testChannelSumDense3() 
+	{
+		int numImg = 9; int imgSize = 4; int numChannels = 11; 
+		runChannelSumTest(ExecType.CP, imgSize, numImg, numChannels, false);
+	}
+	
+	@Test
+	public void testChannelSumDense4() 
+	{
+		int numImg = 7; int imgSize = 8; int numChannels = 12; 
+		runChannelSumTest(ExecType.CP, imgSize, numImg, numChannels, false);
+	}
+	
+	@Test
+	public void testChannelSumSparse1() 
+	{
+		int numImg = 4; int imgSize = 10; int numChannels = 5; 
+		runChannelSumTest(ExecType.CP, imgSize, numImg, numChannels, true);
+	}
+	
+	@Test
+	public void testChannelSumSparse2() 
+	{
+		int numImg = 2; int imgSize = 10; int numChannels = 8; 
+		runChannelSumTest(ExecType.CP, imgSize, numImg, numChannels, true);
+	}
+	
+	@Test
+	public void testChannelSumSparse3() 
+	{
+		int numImg = 4; int imgSize = 10; int numChannels = 11; 
+		runChannelSumTest(ExecType.CP, imgSize, numImg, numChannels, true);
+	}
+	
+	@Test
+	public void testChannelSumSparse4() 
+	{
+		int numImg = 9; int imgSize = 6; int numChannels = 8; 
+		runChannelSumTest(ExecType.CP, imgSize, numImg, numChannels, true);
+	}
+	
+	public void runChannelSumTest( ExecType et, int imgSize, int numImg, int numChannels, boolean sparse) 
+	{
+		RUNTIME_PLATFORM platformOld = rtplatform;
+		switch( et ){
+			case MR: rtplatform = RUNTIME_PLATFORM.HADOOP; break;
+			case SPARK: rtplatform = RUNTIME_PLATFORM.SPARK; break;
+			default: rtplatform = RUNTIME_PLATFORM.HYBRID; break;
+		}
+		boolean sparkConfigOld = DMLScript.USE_LOCAL_SPARK_CONFIG;
+		if( rtplatform == RUNTIME_PLATFORM.SPARK )
+			DMLScript.USE_LOCAL_SPARK_CONFIG = true;
+		
+		try
+		{
+			String sparseVal = String.valueOf(sparse).toUpperCase();
+			
+			TestConfiguration config = getTestConfiguration(TEST_NAME);
+			loadTestConfiguration(config);
+	
+			String RI_HOME = SCRIPT_DIR + TEST_DIR;
+			fullDMLScriptName = RI_HOME + TEST_NAME + ".dml";
+			programArgs = new String[]{"-explain", "hops", "-args", String.valueOf(imgSize), 
+				String.valueOf(numImg), String.valueOf(numChannels),
+				output("B"), sparseVal};
+			
+			fullRScriptName = RI_HOME + TEST_NAME + ".R";
+			rCmd = "Rscript" + " " + fullRScriptName + " " + imgSize + " " + numImg + 
+				" " + numChannels + " " + expectedDir() + " " + sparseVal; 
+			
+			// run scripts
+			runTest(true, false, null, -1);
+			runRScript(true);
+			
+			//compare results
+			HashMap<CellIndex, Double> bHM = readRMatrixFromFS("B");
+			HashMap<CellIndex, Double> dmlfile = readDMLMatrixFromHDFS("B");
+			TestUtils.compareMatrices(dmlfile, bHM, epsilon, "B-DML", "B-R");
+		}
+		finally {
+			rtplatform = platformOld;
+			DMLScript.USE_LOCAL_SPARK_CONFIG = sparkConfigOld;
+		}
+	}
+	
+}

http://git-wip-us.apache.org/repos/asf/systemml/blob/d916ba5b/src/test/scripts/functions/tensor/ChannelSumTest.R
----------------------------------------------------------------------
diff --git a/src/test/scripts/functions/tensor/ChannelSumTest.R b/src/test/scripts/functions/tensor/ChannelSumTest.R
new file mode 100644
index 0000000..c605074
--- /dev/null
+++ b/src/test/scripts/functions/tensor/ChannelSumTest.R
@@ -0,0 +1,39 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+# 
+#   http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+args <- commandArgs(TRUE)
+library("Matrix")
+library("matrixStats") 
+imgSize=as.integer(args[1])
+numImg=as.integer(args[2])
+numChannels=as.integer(args[3])
+
+# Assumption: NCHW image format
+x=matrix(seq(1, numImg*numChannels*imgSize*imgSize), numImg, numChannels*imgSize*imgSize, byrow=TRUE)
+if(as.logical(args[5])) {
+	zero_mask = (x - 1.5*mean(x)) > 0 
+	x = x * zero_mask
+} else {
+	x = x - mean(x)
+}
+
+output = rowSums(matrix(colSums(x), numChannels, imgSize*imgSize, byrow=TRUE));
+
+writeMM(as(output,"CsparseMatrix"), paste(args[4], "B", sep=""))
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/systemml/blob/d916ba5b/src/test/scripts/functions/tensor/ChannelSumTest.dml
----------------------------------------------------------------------
diff --git a/src/test/scripts/functions/tensor/ChannelSumTest.dml b/src/test/scripts/functions/tensor/ChannelSumTest.dml
new file mode 100644
index 0000000..7810a12
--- /dev/null
+++ b/src/test/scripts/functions/tensor/ChannelSumTest.dml
@@ -0,0 +1,35 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+# 
+#   http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+# 
+#-------------------------------------------------------------
+imgSize=$1
+numImg=$2
+numChannels=$3
+
+# Assumption: NCHW image format
+x=matrix(seq(1, numImg*numChannels*imgSize*imgSize), rows=numImg, cols=numChannels*imgSize*imgSize)
+if($5) {
+	zero_mask = (x - 1.5*mean(x)) > 0 
+	x = x * zero_mask
+}
+else {
+	x = x - mean(x)
+}
+output = rowSums(matrix(colSums(x), rows=numChannels, cols=imgSize*imgSize))  # shape (C, 1)
+write(output, $4, format="text")
\ No newline at end of file


[26/50] [abbrv] systemml git commit: [SYSTEMML-540] Include the memory requirement of each layer in the summary table of Caffe2DML

Posted by re...@apache.org.
[SYSTEMML-540] Include the memory requirement of each layer in the summary table of Caffe2DML

- This helps the user to estimate the batch size she should set for
  optimal performance.
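
A minimal sketch of the per-layer accounting behind the new "Memory* (train/test)"
column, mirroring getMemInBytes in the Scala change below (double precision, dense
format; the class, method, and parameter names here are illustrative, not SystemML
API):

  class LayerMemorySketch {
    // Per-layer memory in bytes; outputs = output cells per example,
    // weights/bias = number of parameters. Training counts activations, errors,
    // parameters, and per-batch gradients; test counts activations and parameters.
    static long layerMemInBytes(long outputs, long weights, long bias,
        int batchSize, boolean isTraining) {
      long layerOutput = outputs * batchSize;
      long layerError = layerOutput;
      long gradients = (weights + bias) * batchSize;
      long cells = isTraining
        ? layerOutput + layerError + weights + bias + gradients
        : layerOutput + weights + bias;
      return cells * 8; // Double.BYTES
    }
  }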


Project: http://git-wip-us.apache.org/repos/asf/systemml/repo
Commit: http://git-wip-us.apache.org/repos/asf/systemml/commit/881caa9b
Tree: http://git-wip-us.apache.org/repos/asf/systemml/tree/881caa9b
Diff: http://git-wip-us.apache.org/repos/asf/systemml/diff/881caa9b

Branch: refs/heads/master
Commit: 881caa9ba508b029f72f27d468bb33805704c7cb
Parents: 8f4ecdc
Author: Niketan Pansare <np...@us.ibm.com>
Authored: Wed Oct 25 15:40:21 2017 -0700
Committer: Niketan Pansare <np...@us.ibm.com>
Committed: Wed Oct 25 15:42:04 2017 -0700

----------------------------------------------------------------------
 docs/beginners-guide-caffe2dml.md               | 37 +++++++++-------
 .../org/apache/sysml/api/dl/Caffe2DML.scala     | 46 ++++++++++++++++++--
 2 files changed, 63 insertions(+), 20 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/systemml/blob/881caa9b/docs/beginners-guide-caffe2dml.md
----------------------------------------------------------------------
diff --git a/docs/beginners-guide-caffe2dml.md b/docs/beginners-guide-caffe2dml.md
index 4d6b7fd..8814283 100644
--- a/docs/beginners-guide-caffe2dml.md
+++ b/docs/beginners-guide-caffe2dml.md
@@ -64,22 +64,27 @@ lenet.summary()
 Output:
 
 ```
-+-----+---------------+--------------+------------+---------+-----------+---------+
-| Name|           Type|        Output|      Weight|     Bias|        Top|   Bottom|
-+-----+---------------+--------------+------------+---------+-----------+---------+
-|mnist|           Data| (, 1, 28, 28)|            |         |mnist,mnist|         |
-|conv1|    Convolution|(, 32, 28, 28)|   [32 X 25]| [32 X 1]|      conv1|    mnist|
-|relu1|           ReLU|(, 32, 28, 28)|            |         |      relu1|    conv1|
-|pool1|        Pooling|(, 32, 14, 14)|            |         |      pool1|    relu1|
-|conv2|    Convolution|(, 64, 14, 14)|  [64 X 800]| [64 X 1]|      conv2|    pool1|
-|relu2|           ReLU|(, 64, 14, 14)|            |         |      relu2|    conv2|
-|pool2|        Pooling|  (, 64, 7, 7)|            |         |      pool2|    relu2|
-|  ip1|   InnerProduct| (, 512, 1, 1)|[3136 X 512]|[1 X 512]|        ip1|    pool2|
-|relu3|           ReLU| (, 512, 1, 1)|            |         |      relu3|      ip1|
-|drop1|        Dropout| (, 512, 1, 1)|            |         |      drop1|    relu3|
-|  ip2|   InnerProduct|  (, 10, 1, 1)|  [512 X 10]| [1 X 10]|        ip2|    drop1|
-| loss|SoftmaxWithLoss|  (, 10, 1, 1)|            |         |       loss|ip2,mnist|
-+-----+---------------+--------------+------------+---------+-----------+---------+
++-----+---------------+--------------+------------+---------+-----------+---------+--------------------+
+| Name|           Type|        Output|      Weight|     Bias|        Top|   Bottom|Memory* (train/test)|
++-----+---------------+--------------+------------+---------+-----------+---------+--------------------+
+|mnist|           Data| (, 1, 28, 28)|            |         |mnist,mnist|         |                 1/0|
+|conv1|    Convolution|(, 32, 28, 28)|   [32 X 25]| [32 X 1]|      conv1|    mnist|               25/12|
+|relu1|           ReLU|(, 32, 28, 28)|            |         |      relu1|    conv1|               25/12|
+|pool1|        Pooling|(, 32, 14, 14)|            |         |      pool1|    relu1|                 6/3|
+|conv2|    Convolution|(, 64, 14, 14)|  [64 X 800]| [64 X 1]|      conv2|    pool1|                38/7|
+|relu2|           ReLU|(, 64, 14, 14)|            |         |      relu2|    conv2|                12/6|
+|pool2|        Pooling|  (, 64, 7, 7)|            |         |      pool2|    relu2|                 3/2|
+|  ip1|   InnerProduct| (, 512, 1, 1)|[3136 X 512]|[1 X 512]|        ip1|    pool2|              797/13|
+|relu3|           ReLU| (, 512, 1, 1)|            |         |      relu3|      ip1|                 1/0|
+|drop1|        Dropout| (, 512, 1, 1)|            |         |      drop1|    relu3|                 1/0|
+|  ip2|   InnerProduct|  (, 10, 1, 1)|  [512 X 10]| [1 X 10]|        ip2|    drop1|                 3/0|
+| loss|SoftmaxWithLoss|  (, 10, 1, 1)|            |         |       loss|ip2,mnist|                 0/0|
++-----+---------------+--------------+------------+---------+-----------+---------+--------------------+
+
+Total number of layer outputs/errors/weights/bias/gradients: 5568768/5568768/1662752/618/106455680
+Total memory requirements for parameters* for train/test: 910/55
+[Advanced] Key network statistics to compute intermediate CP overhead batchSize/maxThreads/1-thread im2col*(sum, max)/1-thread reshape_col*(sum, max): 64/48/(1, 1)/(0, 0).
+* => memory in megabytes assuming the parameters are in double precision and in dense format.
 ``` 
 
 To train the above lenet model, we use the MNIST dataset. 

http://git-wip-us.apache.org/repos/asf/systemml/blob/881caa9b/src/main/scala/org/apache/sysml/api/dl/Caffe2DML.scala
----------------------------------------------------------------------
diff --git a/src/main/scala/org/apache/sysml/api/dl/Caffe2DML.scala b/src/main/scala/org/apache/sysml/api/dl/Caffe2DML.scala
index 03b9a3b..56be5d6 100644
--- a/src/main/scala/org/apache/sysml/api/dl/Caffe2DML.scala
+++ b/src/main/scala/org/apache/sysml/api/dl/Caffe2DML.scala
@@ -50,6 +50,8 @@ import java.util.Random
 import org.apache.commons.logging.Log
 import org.apache.commons.logging.LogFactory
 import org.apache.sysml.runtime.controlprogram.parfor.stat.InfrastructureAnalyzer
+import org.apache.sysml.hops.OptimizerUtils
+import java.lang.Double
 
 /***************************************************************************************
 DESIGN OF CAFFE2DML:
@@ -306,10 +308,21 @@ class Caffe2DML(val sc: SparkContext,
   def getTrainAlgo(): String = if (inputs.containsKey("$train_algo")) inputs.get("$train_algo") else "minibatch"
   def getTestAlgo(): String  = if (inputs.containsKey("$test_algo")) inputs.get("$test_algo") else "minibatch"
 
+  private def getMemInBytes(l:CaffeLayer, batchSize:Int, isTraining:Boolean):Long = {
+    val numLayerOutput = l.outputShape._1.toLong * l.outputShape._2.toLong * l.outputShape._3.toLong  * batchSize
+    val numLayerError = numLayerOutput
+    val numLayerWeights = if(l.weightShape != null) l.weightShape()(0).toLong * l.weightShape()(1).toLong else 0
+    val numLayerBias = if(l.biasShape != null)l.biasShape()(0).toLong * l.biasShape()(1).toLong else 0
+    val numLayerGradients = (numLayerWeights + numLayerBias) * batchSize
+    if(isTraining) (numLayerOutput + numLayerError + numLayerWeights + numLayerBias + numLayerGradients)*Double.BYTES
+    else (numLayerOutput + numLayerWeights + numLayerBias)*Double.BYTES
+  }
   def summary(sparkSession: org.apache.spark.sql.SparkSession): Unit = {
-    val header = Seq("Name", "Type", "Output", "Weight", "Bias", "Top", "Bottom")
-    val entries = net.getLayers
-      .map(l => (l, net.getCaffeLayer(l)))
+    val layers = net.getLayers.map(l => (l, net.getCaffeLayer(l)))
+    val numDataLayers = layers.filter(l => l._2.isInstanceOf[Data]).length
+    val batchSize = if(numDataLayers == 1) layers.filter(l => l._2.isInstanceOf[Data]).map(l => l._2.param.getDataParam.getBatchSize).get(0) else -1 
+    val header = Seq("Name", "Type", "Output", "Weight", "Bias", "Top", "Bottom", "Memory* (train/test)")
+    val entries = layers
       .map(l => {
         val layer = l._2
         (l._1,
@@ -318,10 +331,35 @@ class Caffe2DML(val sc: SparkContext,
          if (layer.weightShape != null) "[" + layer.weightShape()(0) + " X " + layer.weightShape()(1) + "]" else "",
          if (layer.biasShape != null) "[" + layer.biasShape()(0) + " X " + layer.biasShape()(1) + "]" else "",
          layer.param.getTopList.mkString(","),
-         layer.param.getBottomList.mkString(","))
+         layer.param.getBottomList.mkString(","), 
+         OptimizerUtils.toMB(getMemInBytes(l._2, batchSize, true)) + "/" + OptimizerUtils.toMB(getMemInBytes(l._2, batchSize, false))
+        )
       })
     import sparkSession.implicits._
     sc.parallelize(entries).toDF(header: _*).show(net.getLayers.size)
+    
+    val numLayerOutput = layers.map(l => l._2.outputShape._1.toLong * l._2.outputShape._2.toLong * l._2.outputShape._3.toLong).sum * batchSize
+    val numLayerError = numLayerOutput
+    val numLayerWeights = layers.map(l => if(l._2.weightShape != null) l._2.weightShape()(0).toLong * l._2.weightShape()(1).toLong else 0).sum
+    val numLayerBias = layers.map(l => if(l._2.biasShape != null) l._2.biasShape()(0).toLong * l._2.biasShape()(1).toLong else 0).sum
+    val numLayerGradients = (numLayerWeights + numLayerBias) * batchSize
+    val convLayers = layers.filter(l => l._2.isInstanceOf[Convolution]).map(l => l._2.asInstanceOf[Convolution])
+    val crspq = convLayers.map(l => l.numChannels.toLong*l.kernel_h.toLong*l.kernel_w.toLong*l.outputShape._2.toLong*l.outputShape._3.toLong) 
+    val kpq = convLayers.map(l => l.outputShape._1.toLong*l.outputShape._2.toLong*l.outputShape._3.toLong)
+    
+    if(getTrainAlgo().equals("minibatch") && getTestAlgo().equals("minibatch")) {
+      System.out.println("Total number of layer outputs/errors/weights/bias/gradients: " + numLayerOutput + "/" + numLayerError +
+        "/" + numLayerWeights + "/" + numLayerBias + "/" + numLayerGradients)
+      System.out.println("Total memory requirements for parameters* for train/test: " +
+        OptimizerUtils.toMB(layers.map(l => getMemInBytes(l._2, batchSize, true)).sum) + "/" + 
+        OptimizerUtils.toMB(layers.map(l => getMemInBytes(l._2, batchSize, false)).sum))
+      System.out.println("[Advanced] Key network statistics to compute intermediate CP overhead " + 
+        "batchSize/maxThreads/1-thread im2col*(sum, max)/1-thread reshape_col*(sum, max): " + 
+        batchSize + "/" + OptimizerUtils.getConstrainedNumThreads(-1) + "/(" +
+        OptimizerUtils.toMB(crspq.sum*Double.BYTES) + ", " + OptimizerUtils.toMB(crspq.max*Double.BYTES) + ")/(" + 
+        OptimizerUtils.toMB(kpq.sum*Double.BYTES) + ", " + OptimizerUtils.toMB(kpq.max*Double.BYTES) + ").")
+    }
+    System.out.println("* => memory in megabytes assuming the parameters are in double precision and in dense format.")
   }
 
   // ================================================================================================


[12/50] [abbrv] systemml git commit: [SYSTEMML-1967] Fix spark rand instruction (#partitions for sparse)

Posted by re...@apache.org.
[SYSTEMML-1967] Fix spark rand instruction (#partitions for sparse)

This patch fixes the spark rand instruction to create the correct number
of partitions in a sparsity-aware manner. So far, this method called a
size-estimation primitive with the number of non-zeros instead of the
sparsity fraction, which led to dense estimates.

Furthermore, this patch also fixes minor configuration issues where the
hand-coded fused operators were applied regardless of the operator-fusion flag.
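
Illustrative only (a simple size model, not SystemML's actual estimator): the
partition count is derived from the estimated on-disk size and the HDFS block
size, so passing the non-zero count where a sparsity fraction is expected makes
every matrix look dense in the estimate and over-partitions sparse outputs. The
RandSPInstruction change below therefore passes the sparsity itself.

  class RandPartitionSketch {
    // For a 1e6 x 1e3 matrix at sparsity 0.01 and a 128 MB HDFS block size, a dense
    // estimate (~8 bytes/cell, ~8 GB) yields ~60 partitions, while a rough sparse
    // estimate (~16 bytes/nnz, ~160 MB) yields 2.
    static int numPartitions(double estSizeInBytes, double hdfsBlockSizeInBytes) {
      return (int) Math.max(Math.ceil(estSizeInBytes / hdfsBlockSizeInBytes), 1);
    }
  }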


Project: http://git-wip-us.apache.org/repos/asf/systemml/repo
Commit: http://git-wip-us.apache.org/repos/asf/systemml/commit/4f29b348
Tree: http://git-wip-us.apache.org/repos/asf/systemml/tree/4f29b348
Diff: http://git-wip-us.apache.org/repos/asf/systemml/diff/4f29b348

Branch: refs/heads/master
Commit: 4f29b3485f4eb8a58aebd41eef22c5d0f92d632f
Parents: 5b8d626
Author: Matthias Boehm <mb...@gmail.com>
Authored: Tue Oct 17 23:09:40 2017 -0700
Committer: Matthias Boehm <mb...@gmail.com>
Committed: Tue Oct 17 23:09:40 2017 -0700

----------------------------------------------------------------------
 src/main/java/org/apache/sysml/hops/BinaryOp.java     | 14 ++++++++------
 .../java/org/apache/sysml/hops/OptimizerUtils.java    |  2 +-
 .../runtime/instructions/spark/RandSPInstruction.java |  2 +-
 3 files changed, 10 insertions(+), 8 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/systemml/blob/4f29b348/src/main/java/org/apache/sysml/hops/BinaryOp.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/hops/BinaryOp.java b/src/main/java/org/apache/sysml/hops/BinaryOp.java
index 58bbc8f..76c1a64 100644
--- a/src/main/java/org/apache/sysml/hops/BinaryOp.java
+++ b/src/main/java/org/apache/sysml/hops/BinaryOp.java
@@ -1373,8 +1373,8 @@ public class BinaryOp extends Hop
 
 	private static boolean requiresPartitioning( Hop rightInput )
 	{
-		return (   rightInput.dimsKnown() //known input size 
-                && rightInput.getDim1()*rightInput.getDim2() > DistributedCacheInput.PARTITION_SIZE);
+		return ( rightInput.dimsKnown() //known input size 
+			&& rightInput.getDim1()*rightInput.getDim2() > DistributedCacheInput.PARTITION_SIZE);
 	}
 	
 	public static boolean requiresReplication( Hop left, Hop right )
@@ -1393,9 +1393,10 @@ public class BinaryOp extends Hop
 		long m1_cpb = left.getColsInBlock();
 		
 		//MR_BINARY_UAGG_CHAIN only applied if result is column/row vector of MV binary operation.
-		if( right instanceof AggUnaryOp && right.getInput().get(0) == left  //e.g., P / rowSums(P)
+		if( OptimizerUtils.ALLOW_OPERATOR_FUSION
+			&& right instanceof AggUnaryOp && right.getInput().get(0) == left  //e.g., P / rowSums(P)
 			&& ((((AggUnaryOp) right).getDirection() == Direction.Row && m1_dim2 > 1 && m1_dim2 <= m1_cpb ) //single column block
-		    ||  (((AggUnaryOp) right).getDirection() == Direction.Col && m1_dim1 > 1 && m1_dim1 <= m1_rpb ))) //single row block
+			|| (((AggUnaryOp) right).getDirection() == Direction.Col && m1_dim1 > 1 && m1_dim1 <= m1_rpb ))) //single row block
 		{
 			return MMBinaryMethod.MR_BINARY_UAGG_CHAIN;
 		}
@@ -1430,9 +1431,10 @@ public class BinaryOp extends Hop
 		}
 		
 		//MR_BINARY_UAGG_CHAIN only applied if result is column/row vector of MV binary operation.
-		if( right instanceof AggUnaryOp && right.getInput().get(0) == left  //e.g., P / rowSums(P)
+		if( OptimizerUtils.ALLOW_OPERATOR_FUSION
+			&& right instanceof AggUnaryOp && right.getInput().get(0) == left  //e.g., P / rowSums(P)
 			&& ((((AggUnaryOp) right).getDirection() == Direction.Row && m1_dim2 > 1 && m1_dim2 <= m1_cpb ) //single column block
-		    ||  (((AggUnaryOp) right).getDirection() == Direction.Col && m1_dim1 > 1 && m1_dim1 <= m1_rpb ))) //single row block
+			|| (((AggUnaryOp) right).getDirection() == Direction.Col && m1_dim1 > 1 && m1_dim1 <= m1_rpb ))) //single row block
 		{
 			return MMBinaryMethod.MR_BINARY_UAGG_CHAIN;
 		}

http://git-wip-us.apache.org/repos/asf/systemml/blob/4f29b348/src/main/java/org/apache/sysml/hops/OptimizerUtils.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/hops/OptimizerUtils.java b/src/main/java/org/apache/sysml/hops/OptimizerUtils.java
index 5d831e5..d67e086 100644
--- a/src/main/java/org/apache/sysml/hops/OptimizerUtils.java
+++ b/src/main/java/org/apache/sysml/hops/OptimizerUtils.java
@@ -709,7 +709,7 @@ public class OptimizerUtils
 		//check for guaranteed existence of empty blocks (less nnz than total number of blocks)
 		long tnrblks = (long)Math.ceil((double)rlen/brlen);
 		long tncblks = (long)Math.ceil((double)clen/bclen);
-		long nnz = (long) Math.ceil(sp * rlen * clen);		
+		long nnz = (long) Math.ceil(sp * rlen * clen);
 		if( nnz < tnrblks * tncblks ) {
 			long lrlen = Math.min(rlen, brlen);
 			long lclen = Math.min(clen, bclen);

http://git-wip-us.apache.org/repos/asf/systemml/blob/4f29b348/src/main/java/org/apache/sysml/runtime/instructions/spark/RandSPInstruction.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/instructions/spark/RandSPInstruction.java b/src/main/java/org/apache/sysml/runtime/instructions/spark/RandSPInstruction.java
index b50bf73..2266eeb 100644
--- a/src/main/java/org/apache/sysml/runtime/instructions/spark/RandSPInstruction.java
+++ b/src/main/java/org/apache/sysml/runtime/instructions/spark/RandSPInstruction.java
@@ -350,7 +350,7 @@ public class RandSPInstruction extends UnarySPInstruction {
 		LongStream nnz = LibMatrixDatagen.computeNNZperBlock(rows, cols, rowsInBlock, colsInBlock, sparsity);
 		PrimitiveIterator.OfLong nnzIter = nnz.iterator();
 		double totalSize = OptimizerUtils.estimatePartitionedSizeExactSparsity( rows, cols, rowsInBlock, 
-			colsInBlock, rows*cols*sparsity); //overestimate for on disk, ensures hdfs block per partition
+			colsInBlock, sparsity); //overestimate for on disk, ensures hdfs block per partition
 		double hdfsBlkSize = InfrastructureAnalyzer.getHDFSBlockSize();
 		long numBlocks = new MatrixCharacteristics(rows, cols, rowsInBlock, colsInBlock).getNumBlocks();
 		long numColBlocks = (long)Math.ceil((double)cols/(double)colsInBlock);


[14/50] [abbrv] systemml git commit: [SYSTEMML-446] [SYSTEMML-702] Updated the sparse matrix multiplication to minimize sparse-to-dense as well as dense-to-sparse conversion

Posted by re...@apache.org.
[SYSTEMML-446] [SYSTEMML-702] Updated the sparse matrix multiplication to minimize sparse-to-dense as well as dense-to-sparse conversion

1. The goal of this PR is not to improve performance (for example: by considering the cost of sparse-to-dense vs FLOPs required given a memory budget) but instead to minimize sparse-to-dense conversion in the GPU matrix multiplication operator.

2. If matmult uses unnecessary sparse-to-dense conversions, then we run the
  risk of one of two situations:
- In the best case, some of the matmults won't be pushed to the GPU under the worst-case memory budget.
- On the other hand, if these conversions are not accounted for, they may cause OOMs.

3. Every operator (except dense-sparse matrix multiplication) uses only the memory allocated to its input and output matrices (a small sketch of the resulting memory budget follows below).

4. Since there is no CuSPARSE kernel for the dense-sparse matrix multiplication operator, we either have to transpose the output after performing a sparse-dense matrix multiplication or perform a dense-dense matrix multiplication after converting the sparse input to dense format.

Closes #686.
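
A minimal sketch of the memory-budget consequence, mirroring the updated check in
AggBinaryOp below (the class and method names and the worst-case 8-bytes-per-cell
model are illustrative, not SystemML API): an additional dense-sized intermediate
is charged only when the left input is sparse and the right input is dense.

  class GpuMatmultBudgetSketch {
    // Extra GPU intermediate budget for C = A %*% B with an output of size m x n.
    static double extraGpuIntermediateBytes(boolean leftSparse, boolean rightSparse,
        long m, long n) {
      if (leftSparse && !rightSparse)
        return 8.0 * m * n;   // worst-case dense output, e.g. m=10000, n=1000 -> ~80 MB
      return 0;               // all other cases: only inputs and output
    }
  }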


Project: http://git-wip-us.apache.org/repos/asf/systemml/repo
Commit: http://git-wip-us.apache.org/repos/asf/systemml/commit/6de8f051
Tree: http://git-wip-us.apache.org/repos/asf/systemml/tree/6de8f051
Diff: http://git-wip-us.apache.org/repos/asf/systemml/diff/6de8f051

Branch: refs/heads/master
Commit: 6de8f051daaefdab403c0edd3c7beb30c9619033
Parents: 323dd72
Author: Niketan Pansare <np...@us.ibm.com>
Authored: Fri Oct 20 12:29:20 2017 -0800
Committer: Niketan Pansare <np...@us.ibm.com>
Committed: Fri Oct 20 13:29:20 2017 -0700

----------------------------------------------------------------------
 .../java/org/apache/sysml/hops/AggBinaryOp.java |  30 +-
 .../gpu/AggregateBinaryGPUInstruction.java      |   3 +-
 .../instructions/gpu/GPUInstruction.java        |   1 +
 .../instructions/gpu/context/CSRPointer.java    |   7 +-
 .../instructions/gpu/context/GPUObject.java     |   2 +-
 .../runtime/matrix/data/LibMatrixCUDA.java      | 541 +------------------
 .../runtime/matrix/data/LibMatrixCuMatMult.java | 480 ++++++++++++++++
 .../test/gpu/MatrixMultiplicationOpTest.java    |  98 +++-
 8 files changed, 581 insertions(+), 581 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/systemml/blob/6de8f051/src/main/java/org/apache/sysml/hops/AggBinaryOp.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/hops/AggBinaryOp.java b/src/main/java/org/apache/sysml/hops/AggBinaryOp.java
index cfa99a4..d733d6a 100644
--- a/src/main/java/org/apache/sysml/hops/AggBinaryOp.java
+++ b/src/main/java/org/apache/sysml/hops/AggBinaryOp.java
@@ -371,40 +371,16 @@ public class AggBinaryOp extends Hop implements MultiThreadedHop
 		double ret = 0;
 
 		if (isGPUEnabled()) {
-			// In GPU Mode, intermediate memory is only needed in case of one of the matrix blocks is sparse
-			// When sparse block is converted to dense and a dense MM takes place, we need (dim1 * dim2)
-			// When dense block is converted to sparse and a sparse MM takes place, we need (dim1 * dim2 * 2)
-
 			Hop in1 = _input.get(0);
 			Hop in2 = _input.get(1);
 			double in1Sparsity = OptimizerUtils.getSparsity(in1.getDim1(), in1.getDim2(), in1.getNnz());
 			double in2Sparsity = OptimizerUtils.getSparsity(in2.getDim1(), in2.getDim2(), in2.getNnz());
-
 			boolean in1Sparse = in1Sparsity < MatrixBlock.SPARSITY_TURN_POINT;
 			boolean in2Sparse = in2Sparsity < MatrixBlock.SPARSITY_TURN_POINT;
-
-			boolean in1UltraSparse = in1Sparsity < MatrixBlock.ULTRA_SPARSITY_TURN_POINT;
-			boolean in2UltraSparse = in2Sparsity < MatrixBlock.ULTRA_SPARSITY_TURN_POINT;
-
-			// For Matmult X * Y, if X is sparse, Y is dense, X is converted to dense
-			// If X is ultrasparse, Y is converted to sparse
-			if (in1Sparse ^ in2Sparse) { // one sparse, one dense
-				if (in1Sparse) {
-					if (in1UltraSparse) {
-						ret += 2 * OptimizerUtils.estimateSizeExactSparsity(in2.getDim1(), in2.getDim2(), in2.getNnz());
-					} else {
-						ret += OptimizerUtils.estimateSizeExactSparsity(in1.getDim1(), in1.getDim2(), in1.getNnz());
-					}
-				} else if (in2Sparse) {
-					if (in2UltraSparse) {
-						ret += 2 * OptimizerUtils.estimateSizeExactSparsity(in1.getDim1(), in1.getDim2(), in1.getNnz());
-					} else {
-						ret += OptimizerUtils.estimateSizeExactSparsity(in2.getDim1(), in2.getDim2(), in2.getNnz());
-					}
-				}
-
+			if(in1Sparse && !in2Sparse) {
+				// Only in sparse-dense cases, we need additional memory budget for GPU
+				ret += OptimizerUtils.estimateSizeExactSparsity(dim1, dim2, 1.0);
 			}
-
 		}
 
 		//account for potential final dense-sparse transformation (worst-case sparse representation)

http://git-wip-us.apache.org/repos/asf/systemml/blob/6de8f051/src/main/java/org/apache/sysml/runtime/instructions/gpu/AggregateBinaryGPUInstruction.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/instructions/gpu/AggregateBinaryGPUInstruction.java b/src/main/java/org/apache/sysml/runtime/instructions/gpu/AggregateBinaryGPUInstruction.java
index dfc000f..29a1ead 100644
--- a/src/main/java/org/apache/sysml/runtime/instructions/gpu/AggregateBinaryGPUInstruction.java
+++ b/src/main/java/org/apache/sysml/runtime/instructions/gpu/AggregateBinaryGPUInstruction.java
@@ -27,6 +27,7 @@ import org.apache.sysml.runtime.functionobjects.SwapIndex;
 import org.apache.sysml.runtime.instructions.InstructionUtils;
 import org.apache.sysml.runtime.instructions.cp.CPOperand;
 import org.apache.sysml.runtime.matrix.data.LibMatrixCUDA;
+import org.apache.sysml.runtime.matrix.data.LibMatrixCuMatMult;
 import org.apache.sysml.runtime.matrix.data.MatrixBlock;
 import org.apache.sysml.runtime.matrix.operators.AggregateBinaryOperator;
 import org.apache.sysml.runtime.matrix.operators.AggregateOperator;
@@ -94,7 +95,7 @@ public class AggregateBinaryGPUInstruction extends GPUInstruction {
 		int clen = (int) (_isRightTransposed ? m2.getNumRows() : m2.getNumColumns());
 
 		ec.setMetaData(_output.getName(), rlen, clen);
-		LibMatrixCUDA.matmult(ec, ec.getGPUContext(0), getExtendedOpcode(), m1, m2, _output.getName(), _isLeftTransposed, _isRightTransposed);
+		LibMatrixCuMatMult.matmult(ec, ec.getGPUContext(0), getExtendedOpcode(), m1, m2, _output.getName(), _isLeftTransposed, _isRightTransposed);
 		
 		//release inputs/outputs
 		ec.releaseMatrixInputForGPUInstruction(_input1.getName());

http://git-wip-us.apache.org/repos/asf/systemml/blob/6de8f051/src/main/java/org/apache/sysml/runtime/instructions/gpu/GPUInstruction.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/instructions/gpu/GPUInstruction.java b/src/main/java/org/apache/sysml/runtime/instructions/gpu/GPUInstruction.java
index ed2a4a2..9dd163d 100644
--- a/src/main/java/org/apache/sysml/runtime/instructions/gpu/GPUInstruction.java
+++ b/src/main/java/org/apache/sysml/runtime/instructions/gpu/GPUInstruction.java
@@ -78,6 +78,7 @@ public abstract class GPUInstruction extends Instruction {
 	public final static String MISC_TIMER_DENSE_MATRIX_DENSE_MATRIX_LIB = 	"Mdmdm";	// time spent in matrix mult of dense matrices
 	public final static String MISC_TIMER_SPARSE_MATRIX_DENSE_VECTOR_LIB = 	"Msmdv";	// time spent in matrix mult of sparse matrix and dense vector
 	public final static String MISC_TIMER_SPARSE_MATRIX_SPARSE_MATRIX_LIB = "Msmsm";  // time spent in matrix mult of sparse matrices
+	public final static String MISC_TIMER_SPARSE_MATRIX_DENSE_MATRIX_LIB = "Msmdm";  // time spent in matrix mult of sparse matrices
 	public final static String MISC_TIMER_SYRK_LIB = 												"Msyrk"; 	// time spent in symmetric rank-k update
 
 	// Other BLAS instructions

http://git-wip-us.apache.org/repos/asf/systemml/blob/6de8f051/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/CSRPointer.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/CSRPointer.java b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/CSRPointer.java
index 5a6e21c..7176a9c 100644
--- a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/CSRPointer.java
+++ b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/CSRPointer.java
@@ -37,7 +37,9 @@ import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;
 import org.apache.sysml.api.DMLScript;
 import org.apache.sysml.runtime.DMLRuntimeException;
+import org.apache.sysml.runtime.instructions.gpu.GPUInstruction;
 import org.apache.sysml.utils.GPUStatistics;
+import org.apache.sysml.utils.Statistics;
 
 import jcuda.Pointer;
 import jcuda.Sizeof;
@@ -495,11 +497,13 @@ public class CSRPointer {
 	 * @param cublasHandle   a valid {@link cublasHandle}
 	 * @param rows           number of rows in this CSR matrix
 	 * @param cols           number of columns in this CSR matrix
+	 * @param instName          name of the invoking instruction to record{@link Statistics}.
 	 * @return A {@link Pointer} to the allocated dense matrix (in column-major format)
 	 * @throws DMLRuntimeException if DMLRuntimeException occurs
 	 */
 	public Pointer toColumnMajorDenseMatrix(cusparseHandle cusparseHandle, cublasHandle cublasHandle, int rows,
-			int cols) throws DMLRuntimeException {
+			int cols, String instName) throws DMLRuntimeException {
+		long t0 = GPUStatistics.DISPLAY_STATISTICS && instName != null ? System.nanoTime() : 0;
 		LOG.trace("GPU : sparse -> column major dense (inside CSRPointer) on " + this + ", GPUContext="
 				+ getGPUContext());
 		long size = ((long) rows) * getDoubleSizeOf((long) cols);
@@ -512,6 +516,7 @@ public class CSRPointer {
 		} else {
 			LOG.debug("in CSRPointer, the values array, row pointers array or column indices array was null");
 		}
+		if (GPUStatistics.DISPLAY_STATISTICS && instName != null) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_SPARSE_TO_DENSE, System.nanoTime() - t0);
 		return A;
 	}
 

http://git-wip-us.apache.org/repos/asf/systemml/blob/6de8f051/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUObject.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUObject.java b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUObject.java
index feb34bc..06327db 100644
--- a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUObject.java
+++ b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUObject.java
@@ -458,7 +458,7 @@ public class GPUObject {
 			throw new DMLRuntimeException("Expected cusparse to be initialized");
 		int rows = toIntExact(mat.getNumRows());
 		int cols = toIntExact(mat.getNumColumns());
-		setDenseMatrixCudaPointer(getJcudaSparseMatrixPtr().toColumnMajorDenseMatrix(cusparseHandle, null, rows, cols));
+		setDenseMatrixCudaPointer(getJcudaSparseMatrixPtr().toColumnMajorDenseMatrix(cusparseHandle, null, rows, cols, null));
 	}
 
 	/**

http://git-wip-us.apache.org/repos/asf/systemml/blob/6de8f051/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCUDA.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCUDA.java b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCUDA.java
index f4a00ab..7e25299 100644
--- a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCUDA.java
+++ b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCUDA.java
@@ -22,18 +22,11 @@ package org.apache.sysml.runtime.matrix.data;
 import static jcuda.jcublas.cublasOperation.CUBLAS_OP_N;
 import static jcuda.jcublas.cublasOperation.CUBLAS_OP_T;
 import static jcuda.jcusparse.JCusparse.cusparseDcsr2csc;
-import static jcuda.jcusparse.JCusparse.cusparseDcsrgemm;
-import static jcuda.jcusparse.JCusparse.cusparseDcsrmv;
-import static jcuda.jcusparse.cusparseOperation.CUSPARSE_OPERATION_NON_TRANSPOSE;
-import static jcuda.jcusparse.cusparseOperation.CUSPARSE_OPERATION_TRANSPOSE;
 import static jcuda.runtime.JCuda.cudaMemcpy;
 import static jcuda.runtime.cudaMemcpyKind.cudaMemcpyDeviceToDevice;
 import static jcuda.runtime.cudaMemcpyKind.cudaMemcpyDeviceToHost;
-import static jcuda.runtime.cudaMemcpyKind.cudaMemcpyHostToDevice;
-
 import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;
-import org.apache.sysml.api.DMLScript;
 import org.apache.sysml.runtime.DMLRuntimeException;
 import org.apache.sysml.runtime.controlprogram.caching.MatrixObject;
 import org.apache.sysml.runtime.controlprogram.context.ExecutionContext;
@@ -175,11 +168,11 @@ public class LibMatrixCUDA {
 	}
 
 
-	private static cusparseHandle getCusparseHandle(GPUContext gCtx) throws DMLRuntimeException{
+	protected static cusparseHandle getCusparseHandle(GPUContext gCtx) throws DMLRuntimeException{
 		return gCtx.getCusparseHandle();
 	}
 
-	private static cublasHandle getCublasHandle(GPUContext gCtx) throws DMLRuntimeException {
+	protected static cublasHandle getCublasHandle(GPUContext gCtx) throws DMLRuntimeException {
 		return gCtx.getCublasHandle();
 	}
 
@@ -410,7 +403,7 @@ public class LibMatrixCUDA {
 			throw new DMLRuntimeException("GPU : Invalid internal state, the GPUContext set with the ExecutionContext is not the same used to run this LibMatrixCUDA function");
 		if(isInSparseFormat(gCtx, left)) {
 			// For sparse TSMM, invoke matmult (TODO: possible performance improvement)
-			matmult(ec, gCtx, instName, left, left, outputName, isLeftTransposed, !isLeftTransposed);
+			LibMatrixCuMatMult.matmult(ec, gCtx, instName, left, left, outputName, isLeftTransposed, !isLeftTransposed);
 			return;
 		}
 
@@ -481,534 +474,6 @@ public class LibMatrixCUDA {
 	//******** End of TRANSPOSE SELF MATRIX MULTIPLY Functions ***********/
 	//********************************************************************/
 
-	//********************************************************************/
-	//***************** MATRIX MULTIPLY Functions ************************/
-	//********************************************************************/
-
-	/**
-	 * Matrix multiply on GPU
-	 * Examines sparsity and shapes and routes call to appropriate method
-	 * from cuBLAS or cuSparse
-	 * C = op(A) x op(B)
-	 * <p>
-	 * Memory Requirements -
-	 * Both dense - inputs, output, no intermediate
-	 * Both sparse - inputs, output, no intermediate
-	 * One sparse, one dense - inputs, output, intermediates - (input_dim1 * input_dim2) OR (input_dim1 * input_dim2 + input in sparse format)
-	 *
-	 * @param ec                Current {@link ExecutionContext} instance
-	 * @param gCtx              a valid {@link GPUContext}
-	 * @param instName          name of the invoking instruction to record{@link Statistics}.
-	 * @param left              Matrix A
-	 * @param right             Matrix B
-	 * @param outputName        Name of the output matrix C (in code generated after LOP layer)
-	 * @param isLeftTransposed  op for A, transposed or not
-	 * @param isRightTransposed op for B, tranposed or not
-	 * @throws DMLRuntimeException if DMLRuntimeException occurs
-	 * @return output of matrix multiply
-	 */
-	public static MatrixObject matmult(ExecutionContext ec, GPUContext gCtx, String instName, MatrixObject left, MatrixObject right, String outputName,
-			boolean isLeftTransposed, boolean isRightTransposed) throws DMLRuntimeException {
-		if (ec.getGPUContext(0) != gCtx)
-			throw new DMLRuntimeException("GPU : Invalid internal state, the GPUContext set with the ExecutionContext is not the same used to run this LibMatrixCUDA function");
-		if(LOG.isTraceEnabled()) {
-			LOG.trace("GPU : matmult" + ", GPUContext=" + gCtx);
-		}
-		if(!left.getGPUObject(gCtx).isAllocated() || !right.getGPUObject(gCtx).isAllocated())
-			throw new DMLRuntimeException("One of input is not allocated:" + left.getGPUObject(gCtx).isAllocated() + " " + right.getGPUObject(gCtx).isAllocated());
-
-		boolean bothDense = !left.getGPUObject(gCtx).isSparse() && !right.getGPUObject(gCtx).isSparse();
-		boolean bothSparse = left.getGPUObject(gCtx).isSparse() && right.getGPUObject(gCtx).isSparse();
-
-		MatrixObject output = ec.getMatrixObject(outputName);
-
-		long outRLen = isLeftTransposed ? left.getNumColumns() : left.getNumRows();
-		long outCLen = isRightTransposed ? right.getNumRows() : right.getNumColumns();
-
-		if (bothDense) {		// Dense C = Dense A * Dense B
-			// For both dense, do cuBLAS
-			getDenseMatrixOutputForGPUInstruction(ec, instName, outputName, outRLen, outCLen); // Allocated the dense output matrix
-			denseDenseMatmult(gCtx, instName, output, left, right, isLeftTransposed, isRightTransposed);
-		}
-		else if (bothSparse){	// Sparse C = Sparse A * Sparse B
-			ec.allocateGPUMatrixObject(outputName, outRLen, outCLen);
-			bothSparseMatmult(gCtx, instName, output, left, right, isLeftTransposed, isRightTransposed);
-		}
-		else {	// Either of A or B is sparse, Sparse C = Sparse/Dense A * Dense/Sparse B
-			// Convert the dense to sparse and use the cusparseDcsrgemm routine
-			ec.allocateGPUMatrixObject(outputName, outRLen, outCLen);
-			eitherSparseMatmult(gCtx, instName, output, left, right, isLeftTransposed, isRightTransposed);
-		}
-
-		return output;
-	}
-
-	/**
-	 * One of the matrices is sparse, the other dense
-	 * C = op(A) x op(B)
-	 *
-	 * @param gCtx              a valid {@link GPUContext}
-	 * @param instName          the invoking instruction's name for record {@link Statistics}.
-	 * @param output            allocated output object for C on host to which GPU output will be attached
-	 * @param left              Matrix A on host
-	 * @param right             Matrix B on host
-	 * @param isLeftTransposed  op for A, tranposed or not
-	 * @param isRightTransposed op for B, transposed or not
-	 * @throws DMLRuntimeException if DMLRuntimeException occurs
-	 */
-	private static void eitherSparseMatmult(GPUContext gCtx, String instName, MatrixObject output, MatrixObject left, MatrixObject right,
-			boolean isLeftTransposed, boolean isRightTransposed) throws DMLRuntimeException {
-
-		int m = toInt(isLeftTransposed ? left.getNumColumns() : left.getNumRows()) ;
-		int n = toInt(isRightTransposed ? right.getNumRows() : right.getNumColumns());
-		int k = toInt(isLeftTransposed ? left.getNumRows() :  left.getNumColumns());
-		int k1 = toInt(isRightTransposed ? right.getNumColumns() : right.getNumRows());
-		if(k != k1)
-			throw new DMLRuntimeException("Dimension mismatch: " + k + " != " + k1);
-
-		if(m == -1 || n == -1 || k == -1)
-			throw new DMLRuntimeException("Incorrect dimensions");
-
-
-		if (left.getGPUObject(gCtx).isSparse()) {
-			// Left sparse, right dense
-			sparseDenseMatmult(gCtx, instName, output, left, right, isLeftTransposed, isRightTransposed, m, n, k);
-		} else {
-			// Left dense, right sparse
-			denseSparseMatmult(gCtx, instName, left, right, output, isLeftTransposed, isRightTransposed, m, n, k);
-		}
-	}
-
-	/**
-	 * C = op(A) * op(B) where A is dense and B is sparse
-	 * If B is ultrasparse, A is converted to a sparse matrix and {@code sparseSparseMatmult(MatrixObject, int, int, int, int, int, CSRPointer, CSRPointer)} is invoked
-	 * otherwise B is converted to a dense matrix and {@code denseDenseMatmult(Pointer, int, int, int, int, boolean, boolean, Pointer, Pointer)} is invoked.
-	 *
-	 * @param gCtx              a valid {@link GPUContext}
-	 * @param instName          the invoking instruction's name for record {@link Statistics}.
-	 * @param left              {@link MatrixObject} of A
-	 * @param right             {@link MatrixObject} of B
-	 * @param output            {@link MatrixObject} of the output matrix C
-	 * @param isLeftTransposed  whether matrix A needs to be transposed
-	 * @param isRightTransposed whether matrix B needs to be transposed
-	 * @param m                 ?
-	 * @param n                 ?
-	 * @param k                 ?
-	 * @throws DMLRuntimeException if DMLRuntimeException occurs
-	 */
-	private static void denseSparseMatmult(GPUContext gCtx, String instName, MatrixObject left, MatrixObject right, MatrixObject output,
-			boolean isLeftTransposed, boolean isRightTransposed, int m, int n, int k)
-					throws DMLRuntimeException {
-		// right sparse, left dense
-		CSRPointer B = right.getGPUObject(gCtx).getJcudaSparseMatrixPtr();
-		Pointer ADense = getDensePointer(gCtx, left, instName);
-		if (B.isUltraSparse(k, n)){
-			if(LOG.isTraceEnabled()) {
-				LOG.trace(" GPU : Convert d M %*% sp M --> sp M %*% sp M)" + ", GPUContext=" + gCtx);
-			}
-
-			// Convert left to CSR and do cuSparse matmul
-			int rowsA = (int)left.getNumRows();
-			int colsA = (int)left.getNumColumns();
-
-			long t0=0,t1=0, t2=0;
-			if (DMLScript.STATISTICS) t0 = System.nanoTime();
-			Pointer AT = GPUObject.transpose(gCtx, ADense, rowsA, colsA, colsA, rowsA);
-			if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_TRANSPOSE_LIB, System.nanoTime() - t0);
-
-			if (GPUStatistics.DISPLAY_STATISTICS) t1 = System.nanoTime();
-			CSRPointer A = GPUObject.columnMajorDenseToRowMajorSparse(gCtx, getCusparseHandle(gCtx), AT, rowsA, colsA);
-			if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_DENSE_TO_SPARSE, System.nanoTime() - t1);
-
-			if (DMLScript.STATISTICS) GPUStatistics.cudaDenseToSparseTime.add(System.nanoTime() - t0);
-			if (DMLScript.STATISTICS) GPUStatistics.cudaDenseToSparseCount.add(1);
-			sparseSparseMatmult(gCtx, instName, A, B, output, isLeftTransposed, isRightTransposed, m, n, k);
-
-			if (GPUStatistics.DISPLAY_STATISTICS) t2 = System.nanoTime();
-			A.deallocate();
-			gCtx.cudaFreeHelper(AT);
-			if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_CUDA_FREE, System.nanoTime() - t2, 2);
-
-		} else {
-			if(LOG.isTraceEnabled()) {
-				LOG.trace(" GPU : Convert d M %*% sp M --> d M %*% d M" + ", GPUContext=" + gCtx);
-			}
-			// Convert right to dense and do a cuBlas matmul
-			// BDenseTransposed is a column major matrix
-			// Note the arguments to denseDenseMatmult to accommodate for this.
-			long t0=0, t1=0;
-			if (DMLScript.STATISTICS) t0 = System.nanoTime();
-			Pointer BDenseTransposed = B.toColumnMajorDenseMatrix(getCusparseHandle(gCtx), getCublasHandle(gCtx), (int)right.getNumRows(), (int)right.getNumColumns());
-			if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_SPARSE_TO_DENSE, System.nanoTime() - t0);
-			if (DMLScript.STATISTICS) GPUStatistics.cudaSparseToDenseTime.add(System.nanoTime() - t0);
-			if (DMLScript.STATISTICS) GPUStatistics.cudaSparseToDenseCount.add(System.nanoTime() - t0);
-
-			if (GPUStatistics.DISPLAY_STATISTICS) t1 = System.nanoTime();
-			boolean allocated = output.getGPUObject(gCtx).acquireDeviceModifyDense();	// To allocate the dense matrix
-			if (GPUStatistics.DISPLAY_STATISTICS && allocated) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_ALLOCATE_DENSE_OUTPUT, System.nanoTime() - t1);
-			Pointer C = getDensePointer(gCtx, output, instName);
-			denseDenseMatmult(gCtx, instName, C,
-					toInt(left.getNumRows()), toInt(left.getNumColumns()),
-					toInt(right.getNumColumns()), toInt(right.getNumRows()),
-					isLeftTransposed, !isRightTransposed,
-					ADense, BDenseTransposed);
-
-			gCtx.cudaFreeHelper(instName, BDenseTransposed);
-		}
-	}
-
-	/**
-	 * * C = op(A) * op(B) where A is sparse and B is dense
-	 * If A is ultrasparse, B is converted to a sparse matrix and {@code sparseSparseMatmult(MatrixObject, int, int, int, int, int, CSRPointer, CSRPointer)} is invoked
-	 * otherwise A is converted to a dense matrix and {@code denseDenseMatmult(Pointer, int, int, int, int, boolean, boolean, Pointer, Pointer)} is invoked.
-	 *
-	 * @param gCtx              a valid {@link GPUContext}
-	 * @param instName          the invoking instruction's name for record {@link Statistics}.
-	 * @param output            the output matrix object
-	 * @param left              matrix A
-	 * @param right             matrix B
-	 * @param isLeftTransposed  if A needs to be transposed
-	 * @param isRightTransposed if B needs to be transposed
-	 * @param m                 ?
-	 * @param n                 ?
-	 * @param k                 ?
-	 * @throws DMLRuntimeException if DMLRuntimeException occurs
-	 */
-	private static void sparseDenseMatmult(GPUContext gCtx, String instName, MatrixObject output, MatrixObject left, MatrixObject right,
-			boolean isLeftTransposed, boolean isRightTransposed, int m, int n, int k)
-					throws DMLRuntimeException {
-		CSRPointer A = left.getGPUObject(gCtx).getJcudaSparseMatrixPtr();
-		Pointer BDense = getDensePointer(gCtx, right, instName);
-
-		if (n == 1){
-			// Sparse Matrix - Dense Vector multiply
-			sparseMatrixDenseVectorMult(gCtx, instName, output, A, BDense, isLeftTransposed, (int)left.getNumRows(), (int)left.getNumColumns());
-
-		} else {
-
-			long t0=0, t1=0, t2=0;
-			// Sparse Matrix Dense Matrix multiply
-			if (A.isUltraSparse(m, k)){
-				if(LOG.isTraceEnabled()) {
-					LOG.trace(" GPU : Convert sp M %*% d M --> sp M %*% sp M" + ", GPUContext=" + gCtx);
-				}
-				// Convert right to CSR and do cuSparse matmul
-				int rowsB = (int)right.getNumRows();
-				int colsB = (int)right.getNumColumns();
-
-				if (DMLScript.STATISTICS) t0 = System.nanoTime();
-				Pointer BT = GPUObject.transpose(gCtx, BDense, rowsB, colsB, colsB, rowsB);
-				if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_TRANSPOSE_LIB, System.nanoTime() - t0);
-
-				if (GPUStatistics.DISPLAY_STATISTICS) t1 = System.nanoTime();
-				CSRPointer B = GPUObject.columnMajorDenseToRowMajorSparse(gCtx, getCusparseHandle(gCtx), BT, rowsB, colsB);
-				if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_DENSE_TO_SPARSE, System.nanoTime() - t1);
-
-				if (DMLScript.STATISTICS) GPUStatistics.cudaDenseToSparseTime.add(System.nanoTime() - t0);
-				if (DMLScript.STATISTICS) GPUStatistics.cudaDenseToSparseCount.add(1);
-
-				sparseSparseMatmult(gCtx, instName, A, B, output, isLeftTransposed, isRightTransposed, m, n, k);
-
-				if (GPUStatistics.DISPLAY_STATISTICS) t2 = System.nanoTime();
-				B.deallocate();
-				gCtx.cudaFreeHelper(BT);
-				if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_CUDA_FREE, System.nanoTime() - t2, 2);
-
-			} else {
-				if(LOG.isTraceEnabled()) {
-					LOG.trace(" GPU : Convert sp M %*% d M --> d M %*% d M" + ", GPUContext=" + gCtx);
-				}
-				// Convert left to dense and do a cuBlas matmul
-				// ADenseTransposed is a column major matrix
-				// Note the arguments to denseDenseMatmult to accommodate for this.
-				if (DMLScript.STATISTICS) t0 = System.nanoTime();
-				Pointer ADenseTransposed = A.toColumnMajorDenseMatrix(getCusparseHandle(gCtx), getCublasHandle(gCtx), (int)left.getNumRows(), (int)left.getNumColumns());
-				if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_SPARSE_TO_DENSE, System.nanoTime() - t0);
-				if (DMLScript.STATISTICS) GPUStatistics.cudaSparseToDenseTime.add(System.nanoTime() - t0);
-				if (DMLScript.STATISTICS) GPUStatistics.cudaSparseToDenseCount.add(System.nanoTime() - t0);
-
-				if (GPUStatistics.DISPLAY_STATISTICS) t1 = System.nanoTime();
-				boolean allocated = output.getGPUObject(gCtx).acquireDeviceModifyDense();	// To allocate the dense matrix
-				if (allocated && GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_ALLOCATE_DENSE_OUTPUT, System.nanoTime() - t1);
-
-				Pointer C = getDensePointer(gCtx, output, instName);
-				denseDenseMatmult(gCtx, instName, C,
-						toInt(left.getNumColumns()), toInt(left.getNumRows()),
-						toInt(right.getNumRows()), toInt(right.getNumColumns()),
-						!isLeftTransposed, isRightTransposed,
-						ADenseTransposed, BDense);
-
-				gCtx.cudaFreeHelper(instName, ADenseTransposed);
-			}
-		}
-	}
-
-	/**
-	 * C = op(A) x B
-	 * A is a sparse matrix, B is a dense vector
-	 *
-	 * @param gCtx         a valid {@link GPUContext}
-	 * @param instName     the invoking instruction's name for record {@link Statistics}.
-	 * @param output       allocated output on the host, to which the GPU output C will be attached
-	 * @param A            sparse matrix A on the GPU
-	 * @param B_dense      dense matrix/vector B on the GPU
-	 * @param isATranposed op for A, tranposed or not
-	 * @param m            number of rows in A (not op(A))
-	 * @param k            number of cols in A or number of rows in B (not op(A) or op(B))
-	 * @throws DMLRuntimeException if DMLRuntimeException occurs
-	 */
-	private static void sparseMatrixDenseVectorMult(GPUContext gCtx, String instName, MatrixObject output, CSRPointer A, Pointer B_dense, boolean isATranposed,
-			int m, int k) throws DMLRuntimeException {
-		if(LOG.isTraceEnabled()) {
-			LOG.trace("GPU : sp M %*% dense V" + ", GPUContext=" + gCtx);
-		}
-		int transA = CUSPARSE_OPERATION_NON_TRANSPOSE;
-		long size = m * Sizeof.DOUBLE;
-		if (isATranposed){
-			size = k * Sizeof.DOUBLE;
-			transA = CUSPARSE_OPERATION_TRANSPOSE;
-		}
-		Pointer C_dense = gCtx.allocate(instName, (int)size);
-		long t1=0;
-		if (GPUStatistics.DISPLAY_STATISTICS) t1 = System.nanoTime();
-		cusparseDcsrmv(getCusparseHandle(gCtx), transA, m, k, (int)A.nnz, one(), A.descr, A.val, A.rowPtr, A.colInd, B_dense, zero(), C_dense);
-		//cudaDeviceSynchronize; 	// Since cusparseDcsrmv is asynchronously executed
-		if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_SPARSE_MATRIX_DENSE_VECTOR_LIB, System.nanoTime() - t1);
-
-		output.getGPUObject(gCtx).setDenseMatrixCudaPointer(C_dense);
-	}
-
-	/**
-	 * Sparse C = Sparse op(A) * Sparse op(B)
-	 * Reroutes call to sparse matrix-vector mult if needed
-	 *
-	 * @param gCtx              a valid {@link GPUContext}
-	 * @param instName          the invoking instruction's name for record {@link Statistics}.
-	 * @param output            ?
-	 * @param instName          name of the invoking instruction to record{@link Statistics}.
-	 * @param left              ?
-	 * @param right             ?
-	 * @param isLeftTransposed  ?
-	 * @param isRightTransposed ?
-	 * @throws DMLRuntimeException if DMLRuntimeException occurs
-	 */
-	private static void bothSparseMatmult(GPUContext gCtx, String instName, MatrixObject output, MatrixObject left, MatrixObject right,
-			boolean isLeftTransposed, boolean isRightTransposed) throws DMLRuntimeException {
-		int m = toInt(isLeftTransposed ? left.getNumColumns() : left.getNumRows()) ;
-		int n = toInt(isRightTransposed ? right.getNumRows() : right.getNumColumns());
-		int k = toInt(isLeftTransposed ? left.getNumRows() :  left.getNumColumns());
-		int k1 = toInt(isRightTransposed ? right.getNumColumns() : right.getNumRows());
-		if(k != k1)
-			throw new DMLRuntimeException("Dimension mismatch: " + k + " != " + k1);
-
-		if(m == -1 || n == -1 || k == -1)
-			throw new DMLRuntimeException("Incorrect dimensions");
-
-		CSRPointer A = left.getGPUObject(gCtx).getJcudaSparseMatrixPtr();
-		CSRPointer B = right.getGPUObject(gCtx).getJcudaSparseMatrixPtr();
-
-		// TODO if (m == 1) {	// Vector-matrix multiplication
-
-		if (!isRightTransposed && right.getNumColumns() == 1){ 	// Matrix-Vector multiplication
-			sparseMatrixVectorMult(gCtx, instName, output, isLeftTransposed, (int)left.getNumRows(), (int)left.getNumColumns(), (int)right.getNumRows(), A, B);
-		} else {												// Matrix-Matrix multiplication
-			sparseSparseMatmult(gCtx, instName, A, B, output, isLeftTransposed, isRightTransposed, m, n, k);
-		}
-	}
-
-	/**
-	 * Does a sparse matrix-vector multiply.
-	 * C = op(A) x B, A is a sparse matrix, B is a sparse vector with numCols = 1.
-	 *
-	 * @param gCtx         a valid {@link GPUContext}
-	 * @param instName     the invoking instruction's name for record {@link Statistics}.
-	 * @param output       allocated output object C to which the GPU output matrix will be attached
-	 * @param isATranposed if A is to be transposed or not (the op in op(A))
-	 * @param m            number of rows in A (not op(A))
-	 * @param n            number of cols in A (not op(A))
-	 * @param k            number of rows in B, (cols in B is assumed to be 1)
-	 * @param A            left sparse matrix on GPU
-	 * @param B            right sparse vector on GPU
-	 * @throws DMLRuntimeException if DMLRuntimeException occurs
-	 */
-	private static void sparseMatrixVectorMult(GPUContext gCtx, String instName, MatrixObject output, boolean isATranposed, int m, int n, int k,
-			CSRPointer A, CSRPointer B) throws DMLRuntimeException {
-		long t0=0;
-		if (GPUStatistics.DISPLAY_STATISTICS) t0 = System.nanoTime();
-		Pointer BDenseVector = B.toColumnMajorDenseMatrix(getCusparseHandle(gCtx), getCublasHandle(gCtx), k, 1);
-		if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_SPARSE_TO_DENSE, System.nanoTime() - t0);
-		sparseMatrixDenseVectorMult(gCtx, instName, output, A, BDenseVector, isATranposed, m, k);
-	}
-
-	/**
-	 * Does a sparse-sparse Matrix multiply
-	 * C = op(A) x op(B), A, B are sparse matrices
-	 *
-	 * @param gCtx              a valid {@link GPUContext}
-	 * @param instName          the invoking instruction's name for record {@link Statistics}.
-	 * @param A                 left sparse matrix on GPU
-	 * @param B                 right sparse matrix on GPU
-	 * @param output            allocated output object on host to which the GPU output matrix will be attached
-	 * @param isLeftTransposed  op for A - to be transposed or not
-	 * @param isRightTransposed op for B
-	 * @param m                 number of rows in op(A)
-	 * @param n                 number of cols in op(B)
-	 * @param k                 number of cols in op(A) or rows in op(B)
-	 * @throws DMLRuntimeException if DMLRuntimeException occurs
-	 */
-	private static void sparseSparseMatmult(GPUContext gCtx, String instName, CSRPointer A, CSRPointer B, MatrixObject output,
-			boolean isLeftTransposed, boolean isRightTransposed, int m, int n, int k) throws DMLRuntimeException {
-		if(LOG.isTraceEnabled()) {
-			LOG.trace("GPU : sp M %*% sp M" + ", GPUContext=" + gCtx);
-		}
-
-		int transA = isLeftTransposed ? CUSPARSE_OPERATION_TRANSPOSE : CUSPARSE_OPERATION_NON_TRANSPOSE;
-		int transB = isRightTransposed ? CUSPARSE_OPERATION_TRANSPOSE : CUSPARSE_OPERATION_NON_TRANSPOSE;
-
-		long t0=0, t1=0;
-		if (GPUStatistics.DISPLAY_STATISTICS) t0 = System.nanoTime();
-		CSRPointer C = CSRPointer.allocateForMatrixMultiply(gCtx, getCusparseHandle(gCtx), A, transA, B, transB, m, n, k);
-		if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_SPARSE_ALLOCATE_LIB, System.nanoTime() - t0);
-
-		output.getGPUObject(gCtx).setSparseMatrixCudaPointer(C);
-
-		if (GPUStatistics.DISPLAY_STATISTICS) t1 = System.nanoTime();
-		cusparseDcsrgemm(getCusparseHandle(gCtx), transA, transB, m, n, k,
-				A.descr, (int)A.nnz, A.val, A.rowPtr, A.colInd,
-				B.descr, (int)B.nnz, B.val, B.rowPtr, B.colInd,
-				C.descr, C.val, C.rowPtr, C.colInd);
-		//cudaDeviceSynchronize;
-		if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_SPARSE_MATRIX_SPARSE_MATRIX_LIB, System.nanoTime() - t1);
-	}
-
-	/**
-	 * Dense dense matrix multiply
-	 * C = op(A) * op(B), A and B are dense matrices
-	 *
-	 * @param gCtx              a valid {@link GPUContext}
-	 * @param instName          name of the invoking instruction to record{@link Statistics}.
-	 * @param output            output object C on host with GPU data allocated
-	 * @param left              left matrix A (in row-major order)
-	 * @param right             right matrix B (in row-major order)
-	 * @param isLeftTransposed  op for A, transposed or not
-	 * @param isRightTransposed op for B, transposed or not
-	 * @throws DMLRuntimeException if DMLRuntimeException occurs
-	 */
-	private static void denseDenseMatmult(GPUContext gCtx, String instName, MatrixObject output, MatrixObject left, MatrixObject right,
-			boolean isLeftTransposed, boolean isRightTransposed) throws DMLRuntimeException {
-
-		Pointer leftPtr = getDensePointer(gCtx, left, instName);
-		Pointer rightPtr = getDensePointer(gCtx, right, instName);
-
-		int leftRows = toInt(left.getNumRows());
-		int leftCols = toInt(left.getNumColumns());
-		int rightRows = toInt(right.getNumRows());
-		int rightCols = toInt(right.getNumColumns());
-		Pointer C = getDensePointer(gCtx, output, instName);
-		denseDenseMatmult(gCtx, instName, C, leftRows, leftCols, rightRows, rightCols, isLeftTransposed, isRightTransposed,
-				leftPtr, rightPtr);
-	}
-
-	/**
-	 * Dense-dense matrix multiply
-	 * C = op(A) * op(B), A and B are dense matrices
-	 * On the host, the matrices are in row-major format; cuBLAS expects them in column-major format.
-	 * What we have as input is t(A) and t(B), t(X) = transpose of X.
-	 * We do t(B) %*% t(A) to get t(C);
-	 * If we were to calculate t(t(C), we would get the resultant matrix C, but this would be in column-major format.
-	 * What we really want is t(C). This we already have as the result of t(B) %*% t(A).
-	 *
-	 * @param gCtx               a valid {@link GPUContext}
-	 * @param instName           name of the invoking instruction to record{@link Statistics}.
-	 * @param output             output allocated on GPU in column major format
-	 * @param leftRows1          number of rows in A
-	 * @param leftCols1          number of cols in A
-	 * @param rightRows1         number of rows in B
-	 * @param rightCols1         number of cols in B
-	 * @param isLeftTransposed1  op for A, transposed or not
-	 * @param isRightTransposed1 op for B, transposed or not
-	 * @param leftPtr            A allocated on the GPU in row-major format
-	 * @param rightPtr           B allocated on the GPU in row-major format
-	 * @throws DMLRuntimeException if DMLRuntimeException occurs
-	 */
-	public static void denseDenseMatmult(GPUContext gCtx, String instName, Pointer output, int leftRows1, int leftCols1, int rightRows1,
-			int rightCols1, boolean isLeftTransposed1, boolean isRightTransposed1, Pointer leftPtr, Pointer rightPtr)
-					throws DMLRuntimeException {
-		if(LOG.isTraceEnabled()) {
-			LOG.trace("GPU : d M %*% d M" + ", GPUContext=" + gCtx);
-		}
-
-		Pointer A = rightPtr;
-		Pointer B = leftPtr;
-
-		// To compensate for the input matrices being in row-major format instead of column-major (the way cublas expects)
-		int leftRows = rightCols1;
-		int leftCols = rightRows1;
-		int rightRows = leftCols1;
-		int rightCols = leftRows1;
-
-		boolean isLeftTransposed = isRightTransposed1;
-		boolean isRightTransposed = isLeftTransposed1;
-
-		// Note: the dimensions are swapped
-		int m = isLeftTransposed ? leftCols : leftRows ;
-		int n = isRightTransposed ? rightRows : rightCols;
-		int k = isLeftTransposed ?  leftRows : leftCols;
-		int k1 = isRightTransposed ?  rightCols : rightRows;
-		if(k != k1)
-			throw new DMLRuntimeException("Dimension mismatch: " + k + " != " + k1);
-
-		if(m == -1 || n == -1 || k == -1)
-			throw new DMLRuntimeException("Incorrect dimensions");
-
-		double[] one = { 1 };
-		double[] zero = { 0 };
-
-		//int lda = leftRows;
-		//int ldb = leftCols;
-		int lda = isLeftTransposed ?  k : m;
-		int ldb = isRightTransposed ? n : k;
-		int ldc = m;
-
-		int transa = isLeftTransposed ? cublasOperation.CUBLAS_OP_T : cublasOperation.CUBLAS_OP_N;
-		int transb = isRightTransposed ? cublasOperation.CUBLAS_OP_T : cublasOperation.CUBLAS_OP_N;
-
-		long t0=0;
-		if (GPUStatistics.DISPLAY_STATISTICS) t0 = System.nanoTime();
-		Pointer C = output;
-		if (m == 1 && n == 1){
-			// Vector product
-			LOG.debug(" GPU Dense-dense Vector Product");
-			double[] result = {0};
-			JCublas2.cublasDdot(getCublasHandle(gCtx), k, A, 1, B, 1, Pointer.to(result));
-			// By default in CuBlas V2, cublas pointer mode is set to CUBLAS_POINTER_MODE_HOST.
-			// This means that scalar values passed are on host (as opposed to on device).
-			// The result is copied from the host back to the device so that the rest of
-			// infrastructure can treat it uniformly.
-			cudaMemcpy(C, Pointer.to(result), 1 * Sizeof.DOUBLE, cudaMemcpyHostToDevice);
-			if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_DENSE_DOT_LIB, System.nanoTime() - t0);
-		} else if (m == 1) {
-			// Vector-matrix multiply
-			LOG.debug(" GPU Dense Vector-Matrix Multiply");
-			transb = isRightTransposed ? cublasOperation.CUBLAS_OP_N : cublasOperation.CUBLAS_OP_T;
-			JCublas2.cublasDgemv(getCublasHandle(gCtx), transb, rightRows, rightCols, Pointer.to(one), B, ldb, A, 1, Pointer.to(zero), C, 1);
-			if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_DENSE_VECTOR_DENSE_MATRIX_LIB, System.nanoTime() - t0);
-		} else if (n == 1){
-			// Matrix-vector multiply
-			LOG.debug(" GPU Dense Matrix-Vector Multiply");
-			JCublas2.cublasDgemv(getCublasHandle(gCtx), transa, leftRows, leftCols, Pointer.to(one), A, lda, B, 1, Pointer.to(zero), C, 1);
-			if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_DENSE_MATRIX_DENSE_VECTOR_LIB, System.nanoTime() - t0);
-		} else {
-			LOG.debug(" GPU Dense-Dense Matrix Multiply ");
-			JCublas2.cublasDgemm(getCublasHandle(gCtx), transa, transb, m, n, k, Pointer.to(one), A, lda, B, ldb, Pointer.to(zero), C, ldc);
-			if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_DENSE_MATRIX_DENSE_MATRIX_LIB, System.nanoTime() - t0);
-		}
-	}
-
-	//********************************************************************/
-	//***************** END OF MATRIX MULTIPLY Functions *****************/
-	//********************************************************************/
-
 
 	//********************************************************************/
 	//****************  UNARY AGGREGATE Functions ************************/

http://git-wip-us.apache.org/repos/asf/systemml/blob/6de8f051/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuMatMult.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuMatMult.java b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuMatMult.java
new file mode 100644
index 0000000..21a2a35
--- /dev/null
+++ b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuMatMult.java
@@ -0,0 +1,480 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ * 
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.sysml.runtime.matrix.data;
+
+import static jcuda.jcusparse.cusparseOperation.CUSPARSE_OPERATION_NON_TRANSPOSE;
+import static jcuda.jcusparse.cusparseOperation.CUSPARSE_OPERATION_TRANSPOSE;
+import static jcuda.runtime.JCuda.cudaMemcpy;
+import static jcuda.runtime.cudaMemcpyKind.cudaMemcpyHostToDevice;
+import jcuda.Pointer;
+import jcuda.Sizeof;
+import jcuda.jcublas.JCublas2;
+import jcuda.jcublas.cublasHandle;
+import jcuda.jcublas.cublasOperation;
+import jcuda.jcusparse.JCusparse;
+import jcuda.jcusparse.cusparseHandle;
+import jcuda.runtime.JCuda;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.sysml.api.DMLScript;
+import org.apache.sysml.runtime.DMLRuntimeException;
+import org.apache.sysml.runtime.controlprogram.caching.MatrixObject;
+import org.apache.sysml.runtime.controlprogram.context.ExecutionContext;
+import org.apache.sysml.runtime.instructions.gpu.GPUInstruction;
+import org.apache.sysml.runtime.instructions.gpu.context.CSRPointer;
+import org.apache.sysml.runtime.instructions.gpu.context.GPUContext;
+import org.apache.sysml.utils.GPUStatistics;
+import org.apache.sysml.utils.Statistics;
+
+public class LibMatrixCuMatMult extends LibMatrixCUDA {
+
+	private static final Log LOG = LogFactory.getLog(LibMatrixCuMatMult.class.getName());
+
+	private static class CuMatMultParameters {
+		/*
+		 * For the operation, C = op(A) %*% op(B), the below parameters are used
+		 * to invoke the corresponding kernels in CuBLAS and CuSPARSE.
+		 * 
+		 * All the below values have to be valid or else this class has to throw
+		 * an exception. No special values like -1 for unknowns allowed.
+		 */
+		public int m; // number of rows of matrix op(A) and C.
+		public int n; // number of columns of matrix op(B) and C.
+		public int k; // number of columns of op(A) and rows of op(B).
+		public int lda; // leading dimension of two-dimensional array used to
+						// store the matrix A.
+		public int ldb; // leading dimension of two-dimensional array used to
+						// store matrix B.
+		public int ldc; // leading dimension of a two-dimensional array used to
+						// store the matrix C.
+		public long leftNumRows; // number of rows of A
+		public long leftNumCols; // number of cols of A
+		public long rightNumRows; // number of rows of B
+		public long rightNumCols; // number of cols of B
+		private boolean isLeftTransposed; // is op(A) = t(A)
+		private boolean isRightTransposed; // is op(B) = t(B)
+
+		public CuMatMultParameters(long leftNumRows1, long leftNumCols1, long rightNumRows1, long rightNumCols1,
+				boolean isLeftTransposed1, boolean isRightTransposed1) throws DMLRuntimeException {
+			leftNumRows = leftNumRows1;
+			leftNumCols = leftNumCols1;
+			rightNumRows = rightNumRows1;
+			rightNumCols = rightNumCols1;
+			isLeftTransposed = isLeftTransposed1;
+			isRightTransposed = isRightTransposed1;
+			setDimensions();
+		}
+
+		public void rowToColumnMajor() throws DMLRuntimeException {
+			// To compensate for the input matrices being in row-major format
+			// instead of column-major (the way cublas expects)
+			isRightTransposed = swap(isLeftTransposed, isLeftTransposed = isRightTransposed);
+			rightNumCols = swap(leftNumRows, leftNumRows = rightNumCols);
+			rightNumRows = swap(leftNumCols, leftNumCols = rightNumRows);
+			setDimensions();
+		}
+
+		private void validate() throws DMLRuntimeException {
+			int k1 = toInt(isRightTransposed ? rightNumCols : rightNumRows);
+			if (k != k1)
+				throw new DMLRuntimeException("Dimension mismatch: " + k + " != " + k1 + " [" + leftNumRows + ","
+						+ leftNumCols + "," + rightNumRows + "," + rightNumCols + "], " + isLeftTransposed + " "
+						+ isRightTransposed);
+		}
+
+		private void setDimensions() throws DMLRuntimeException {
+			// Validate the dimensions
+			m = toInt(isLeftTransposed ? leftNumCols : leftNumRows);
+			n = toInt(isRightTransposed ? rightNumRows : rightNumCols);
+			k = toInt(isLeftTransposed ? leftNumRows : leftNumCols);
+			lda = isLeftTransposed ? k : m;
+			ldb = isRightTransposed ? n : k;
+			ldc = m;
+			if (m == -1 || n == -1 || k == -1)
+				throw new DMLRuntimeException("Incorrect dimensions");
+		}
+	}
+
+	/**
+	 * Matrix multiply on GPU. Examines sparsity and shapes and routes the call
+	 * to the appropriate method from cuBLAS or cuSparse. C = op(A) x op(B)
+	 *
+	 * The user is expected to call
+	 * ec.releaseMatrixOutputForGPUInstruction(outputName);
+	 *
+	 * @param ec
+	 *            Current {@link ExecutionContext} instance
+	 * @param gCtx
+	 *            a valid {@link GPUContext}
+	 * @param instName
+	 *            name of the invoking instruction to record{@link Statistics}.
+	 * @param left
+	 *            Matrix A
+	 * @param right
+	 *            Matrix B
+	 * @param outputName
+	 *            Name of the output matrix C (in code generated after LOP
+	 *            layer)
+	 * @param isLeftTransposed
+	 *            op for A, transposed or not
+	 * @param isRightTransposed
+	 *            op for B, transposed or not
+	 * @throws DMLRuntimeException
+	 *             if DMLRuntimeException occurs
+	 * @return output of matrix multiply
+	 */
+	public static MatrixObject matmult(ExecutionContext ec, GPUContext gCtx, String instName, MatrixObject left,
+			MatrixObject right, String outputName, boolean isLeftTransposed, boolean isRightTransposed)
+			throws DMLRuntimeException {
+		boolean isM1Sparse = isInSparseFormat(gCtx, left);
+		boolean isM2Sparse = isInSparseFormat(gCtx, right);
+		MatrixObject output = ec.getMatrixObject(outputName);
+		long outRLen = isLeftTransposed ? left.getNumColumns() : left.getNumRows();
+		long outCLen = isRightTransposed ? right.getNumRows() : right.getNumColumns();
+
+		CuMatMultParameters params = new CuMatMultParameters(left.getNumRows(), left.getNumColumns(),
+				right.getNumRows(), right.getNumColumns(), isLeftTransposed, isRightTransposed);
+
+		if (isM1Sparse && isM2Sparse) {
+			// -------------------------------------------------------------------------------------
+			// sparse-sparse matrix multiplication
+			params.validate();
+			int transa = cusparseOp(isLeftTransposed);
+			int transb = cusparseOp(isRightTransposed);
+
+			// Step 1: Allocate output => sparse format
+			ec.allocateGPUMatrixObject(outputName, outRLen, outCLen);
+
+			// Step 2: Get the handles to sparse/dense pointers for left, right
+			// and output
+			CSRPointer A = left.getGPUObject(gCtx).getJcudaSparseMatrixPtr();
+			CSRPointer B = right.getGPUObject(gCtx).getJcudaSparseMatrixPtr();
+			long t0 = GPUStatistics.DISPLAY_STATISTICS ? System.nanoTime() : 0;
+			CSRPointer C = CSRPointer.allocateForMatrixMultiply(gCtx, getCusparseHandle(gCtx), A, transa, B, transb,
+					params.m, params.n, params.k);
+			if (GPUStatistics.DISPLAY_STATISTICS)
+				GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_SPARSE_ALLOCATE_LIB,
+						System.nanoTime() - t0);
+
+			// Step 3: Invoke the kernel
+			long t1 = GPUStatistics.DISPLAY_STATISTICS ? System.nanoTime() : 0;
+			JCusparse.cusparseDcsrgemm(getCusparseHandle(gCtx), transa, transb, params.m, params.n, params.k, A.descr,
+					(int) A.nnz, A.val, A.rowPtr, A.colInd, B.descr, (int) B.nnz, B.val, B.rowPtr, B.colInd, C.descr,
+					C.val, C.rowPtr, C.colInd);
+			if (GPUStatistics.DISPLAY_STATISTICS)
+				GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_SPARSE_MATRIX_SPARSE_MATRIX_LIB,
+						System.nanoTime() - t1);
+			output.getGPUObject(gCtx).setSparseMatrixCudaPointer(C);
+			// -------------------------------------------------------------------------------------
+		} else if (!isM1Sparse && isM2Sparse) {
+			// -------------------------------------------------------------------------------------
+			// dense-sparse matrix multiplication
+			// Step 1: Allocate output => dense format
+			getDenseMatrixOutputForGPUInstruction(ec, instName, outputName, outRLen, outCLen);
+
+			// Step 2: Get the handles to sparse/dense pointers for left, right
+			// and output
+			Pointer A = getDensePointer(gCtx, left, instName);
+			CSRPointer B = right.getGPUObject(gCtx).getJcudaSparseMatrixPtr();
+			Pointer C = getDensePointer(gCtx, output, instName);
+
+			// Step 3: Invoke the kernel
+			denseSparseMatMult(getCusparseHandle(gCtx), instName, C, A, B, params);
+			// -------------------------------------------------------------------------------------
+		} else if (isM1Sparse && !isM2Sparse) {
+			// -------------------------------------------------------------------------------------
+			// sparse-dense matrix multiplication
+			// Step 1: Allocate output => dense format
+			getDenseMatrixOutputForGPUInstruction(ec, instName, outputName, outRLen, outCLen);
+
+			// Step 2: Get the handles to sparse/dense pointers for left, right
+			// and output
+			CSRPointer A = left.getGPUObject(gCtx).getJcudaSparseMatrixPtr();
+			Pointer B = getDensePointer(gCtx, right, instName);
+			Pointer C = getDensePointer(gCtx, output, instName);
+
+			// Step 3: Invoke the kernel
+			sparseDenseMatMult(gCtx, instName, C, A, B, left.getNumRows(), left.getNumColumns(), right.getNumRows(),
+					right.getNumColumns(), outRLen, outCLen, isLeftTransposed, isRightTransposed);
+			// -------------------------------------------------------------------------------------
+		} else {
+			// -------------------------------------------------------------------------------------
+			// dense-dense matrix multiplication
+			// Step 1: Allocate output => dense format
+			getDenseMatrixOutputForGPUInstruction(ec, instName, outputName, outRLen, outCLen);
+
+			// Step 2: Get the handles to sparse/dense pointers for left, right
+			// and output
+			Pointer A = getDensePointer(gCtx, left, instName);
+			Pointer B = getDensePointer(gCtx, right, instName);
+			Pointer C = getDensePointer(gCtx, output, instName);
+
+			// Step 3: Invoke the kernel
+			denseDenseMatMult(getCublasHandle(gCtx), instName, C, A, B, params);
+			// -------------------------------------------------------------------------------------
+		}
+		return output;
+	}
+
+	/**
+	 * Internal method to invoke the appropriate CuSPARSE kernel for matrix
+	 * multiplication for operation: C = op(A) * op(B) This assumes B and C are
+	 * allocated in dense row-major format and A is sparse.
+	 * 
+	 * Other than input and output, this method requires additional memory =
+	 * outRLen * outCLen * Sizeof.DOUBLE
+	 * 
+	 * @param gCtx
+	 *            a valid {@link GPUContext}
+	 * @param instName
+	 *            name of the invoking instruction to record{@link Statistics}.
+	 * @param C
+	 *            output matrix pointer
+	 * @param A
+	 *            left matrix pointer
+	 * @param B
+	 *            right matrix pointer
+	 * @param leftNumRows
+	 *            number of rows of A
+	 * @param leftNumColumns
+	 *            number of cols of A
+	 * @param rightNumRows
+	 *            number of rows of B
+	 * @param rightNumColumns
+	 *            number of cols of B
+	 * @param outRLen
+	 *            number of rows of C
+	 * @param outCLen
+	 *            number of cols of C
+	 * @param isLeftTransposed
+	 *            is op(A) = t(A)
+	 * @param isRightTransposed
+	 *            is op(B) = t(B)
+	 * @throws DMLRuntimeException
+	 *             if error
+	 */
+	private static void sparseDenseMatMult(GPUContext gCtx, String instName, Pointer C, CSRPointer A, Pointer B,
+			long leftNumRows, long leftNumColumns, long rightNumRows, long rightNumColumns, long outRLen, long outCLen,
+			boolean isLeftTransposed, boolean isRightTransposed) throws DMLRuntimeException {
+		// t(C) = t(B) %*% t(A)
+		Pointer output = null;
+		if (outRLen != 1 && outCLen != 1) {
+			output = gCtx.allocate(outRLen * outCLen * Sizeof.DOUBLE);
+		} else {
+			// no transpose required for vector output
+			output = C;
+		}
+		CuMatMultParameters params = new CuMatMultParameters(rightNumRows, rightNumColumns, leftNumRows,
+				leftNumColumns, !isRightTransposed, !isLeftTransposed);
+		denseSparseMatMult(getCusparseHandle(gCtx), instName, output, B, A, params);
+		if (outRLen != 1 && outCLen != 1) {
+			// Transpose: C = t(output)
+			long t0 = GPUStatistics.DISPLAY_STATISTICS ? System.nanoTime() : 0;
+			JCublas2.cublasDgeam(gCtx.getCublasHandle(), cublasOperation.CUBLAS_OP_T, cublasOperation.CUBLAS_OP_T,
+					toInt(outCLen), toInt(outRLen), one(), output, toInt(outRLen), zero(), new Pointer(),
+					toInt(outRLen), C, toInt(outCLen));
+			if (!DMLScript.EAGER_CUDA_FREE)
+				JCuda.cudaDeviceSynchronize();
+			gCtx.cudaFreeHelper(output, DMLScript.EAGER_CUDA_FREE);
+			if (GPUStatistics.DISPLAY_STATISTICS)
+				GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_TRANSPOSE_LIB, System.nanoTime()
+						- t0);
+		}
+	}
+
+	/**
+	 * Internal method to invoke the appropriate CuSPARSE kernel for matrix
+	 * multiplication for operation: C = op(A) * op(B) This assumes B and C are
+	 * allocated in dense row-major format and A is sparse.
+	 * 
+	 * @param handle
+	 *            cusparse handle
+	 * @param instName
+	 *            name of the invoking instruction to record{@link Statistics}.
+	 * @param C
+	 *            output matrix pointer
+	 * @param A
+	 *            left matrix pointer
+	 * @param B
+	 *            right matrix pointer
+	 * @param param
+	 *            BLAS parameters
+	 * @throws DMLRuntimeException
+	 *             if error
+	 */
+	private static void denseSparseMatMult(cusparseHandle handle, String instName, Pointer C, Pointer A, CSRPointer B,
+			CuMatMultParameters param) throws DMLRuntimeException {
+		long t0 = GPUStatistics.DISPLAY_STATISTICS ? System.nanoTime() : 0;
+		String kernel = GPUInstruction.MISC_TIMER_SPARSE_MATRIX_DENSE_MATRIX_LIB;
+		// Ignoring sparse vector dense matrix multiplication and dot product
+		boolean isVector = (param.leftNumRows == 1 && !param.isLeftTransposed)
+				|| (param.leftNumCols == 1 && param.isLeftTransposed);
+		if (isVector) {
+			LOG.debug(" GPU Sparse-Dense Matrix Vector ");
+			int m = toInt(param.rightNumRows);
+			int n = toInt(param.rightNumCols);
+			int transa = reverseCusparseOp(cusparseOp(param.isLeftTransposed));
+			JCusparse.cusparseDcsrmv(handle, transa, m, n, toInt(B.nnz), one(), B.descr, B.val, B.rowPtr, B.colInd, A,
+					zero(), C);
+			kernel = GPUInstruction.MISC_TIMER_SPARSE_MATRIX_DENSE_VECTOR_LIB;
+		} else {
+			int m = toInt(param.rightNumRows);
+			int k = toInt(param.rightNumCols);
+			param.rowToColumnMajor();
+			param.validate();
+			int transa = reverseCusparseOp(cusparseOp(param.isLeftTransposed));
+			int transb = cusparseOp(param.isRightTransposed);
+			LOG.debug(" GPU Sparse-Dense Matrix Multiply (rhs transpose) ");
+			JCusparse.cusparseDcsrmm2(handle, transa, transb, m, param.n, k, toInt(B.nnz), one(), B.descr, B.val,
+					B.rowPtr, B.colInd, A, param.ldb, zero(), C, param.ldc);
+		}
+		if (GPUStatistics.DISPLAY_STATISTICS)
+			GPUStatistics.maintainCPMiscTimes(instName, kernel, System.nanoTime() - t0);
+	}
+
+	/**
+	 * Internal method to invoke the appropriate CuBLAS kernel for matrix
+	 * multiplication for operation: C = op(A) * op(B) This assumes A, B and C
+	 * are allocated in dense format. The caller is expected to invoke
+	 * params.rowToColumnMajor().
+	 * 
+	 * @param handle
+	 *            cublas handle
+	 * @param instName
+	 *            name of the invoking instruction to record{@link Statistics}.
+	 * @param C
+	 *            output matrix pointer
+	 * @param A
+	 *            left matrix pointer
+	 * @param B
+	 *            right matrix pointer
+	 * @param param
+	 *            BLAS parameters
+	 * @throws DMLRuntimeException
+	 *             if error
+	 */
+	private static void denseDenseMatMult(cublasHandle handle, String instName, Pointer C, Pointer A, Pointer B,
+			CuMatMultParameters param) throws DMLRuntimeException {
+		long t0 = GPUStatistics.DISPLAY_STATISTICS ? System.nanoTime() : 0;
+		String kernel = null;
+		param.rowToColumnMajor();
+		param.validate();
+		int transa = cublasOp(param.isLeftTransposed);
+		int transb = cublasOp(param.isRightTransposed);
+		B = swap(A, A = B);
+		if (param.m == 1 && param.n == 1) {
+			// Vector product
+			LOG.debug(" GPU Dense-dense Vector Product");
+			double[] result = { 0 };
+			JCublas2.cublasDdot(handle, param.k, A, 1, B, 1, Pointer.to(result));
+			// By default in CuBlas V2, cublas pointer mode is set to
+			// CUBLAS_POINTER_MODE_HOST.
+			// This means that scalar values passed are on host (as opposed to
+			// on device).
+			// The result is copied from the host back to the device so that the
+			// rest of
+			// infrastructure can treat it uniformly.
+			cudaMemcpy(C, Pointer.to(result), 1 * Sizeof.DOUBLE, cudaMemcpyHostToDevice);
+			kernel = GPUInstruction.MISC_TIMER_DENSE_DOT_LIB;
+		} else if (param.m == 1) {
+			// Vector-matrix multiply
+			LOG.debug(" GPU Dense Vector-Matrix Multiply");
+			transb = reverseCublasOp(transb);
+			int rightNumRows = (transb == CUSPARSE_OPERATION_TRANSPOSE) ? param.k : param.n;
+			int rightNumCols = (transb == CUSPARSE_OPERATION_TRANSPOSE) ? param.n : param.k;
+			JCublas2.cublasDgemv(handle, transb, rightNumRows, rightNumCols, one(), B, param.ldb, A, 1, zero(), C, 1);
+			kernel = GPUInstruction.MISC_TIMER_DENSE_VECTOR_DENSE_MATRIX_LIB;
+		} else if (param.n == 1) {
+			// Matrix-vector multiply
+			LOG.debug(" GPU Dense Matrix-Vector Multiply");
+			int leftNumRows = (transa == CUSPARSE_OPERATION_NON_TRANSPOSE) ? param.m : param.k;
+			int leftNumCols = (transa == CUSPARSE_OPERATION_NON_TRANSPOSE) ? param.k : param.m;
+			JCublas2.cublasDgemv(handle, transa, leftNumRows, leftNumCols, one(), A, param.lda, B, 1, zero(), C, 1);
+			kernel = GPUInstruction.MISC_TIMER_DENSE_MATRIX_DENSE_VECTOR_LIB;
+		} else {
+			LOG.debug(" GPU Dense-Dense Matrix Multiply ");
+			JCublas2.cublasDgemm(handle, transa, transb, param.m, param.n, param.k, one(), A, param.lda, B, param.ldb,
+					zero(), C, param.ldc);
+			kernel = GPUInstruction.MISC_TIMER_DENSE_MATRIX_DENSE_MATRIX_LIB;
+		}
+		if (GPUStatistics.DISPLAY_STATISTICS)
+			GPUStatistics.maintainCPMiscTimes(instName, kernel, System.nanoTime() - t0);
+	}
+
+	// Convenient methods to swap two values
+	// Usage: y = swap(x, x=y);
+	private static long swap(long x, long y) {
+		return x;
+	}
+
+	private static boolean swap(boolean x, boolean y) {
+		return x;
+	}
+
+	private static Pointer swap(Pointer x, Pointer y) {
+		return x;
+	}
+
+	/**
+	 * Convenient wrapper to return appropriate cuSPARSE trans value
+	 * 
+	 * @param isTransposed
+	 *            is op(input) = t(input)
+	 * @return CUSPARSE_OPERATION_TRANSPOSE or CUSPARSE_OPERATION_NON_TRANSPOSE
+	 */
+	private static int cusparseOp(boolean isTransposed) {
+		return isTransposed ? CUSPARSE_OPERATION_TRANSPOSE : CUSPARSE_OPERATION_NON_TRANSPOSE;
+	}
+
+	/**
+	 * Convenient wrapper to return appropriate cuBLAS trans value
+	 * 
+	 * @param isTransposed
+	 *            is op(input) = t(input)
+	 * @return CUBLAS_OP_T or CUBLAS_OP_N
+	 */
+	private static int cublasOp(boolean isTransposed) {
+		return isTransposed ? cublasOperation.CUBLAS_OP_T : cublasOperation.CUBLAS_OP_N;
+	}
+
+	/**
+	 * Flips the cuBLAS trans value
+	 * 
+	 * @param trans
+	 *            can be CUBLAS_OP_T or CUBLAS_OP_N
+	 * @return CUBLAS_OP_N if trans is CUBLAS_OP_T else CUBLAS_OP_T
+	 */
+	private static int reverseCublasOp(int trans) {
+		return trans == cublasOperation.CUBLAS_OP_T ? cublasOperation.CUBLAS_OP_N : cublasOperation.CUBLAS_OP_T;
+	}
+
+	/**
+	 * Flips the cuSPARSE trans value
+	 * 
+	 * @param trans
+	 *            can be CUSPARSE_OPERATION_NON_TRANSPOSE or
+	 *            CUSPARSE_OPERATION_TRANSPOSE
+	 * @return CUSPARSE_OPERATION_NON_TRANSPOSE if trans is
+	 *         CUSPARSE_OPERATION_TRANSPOSE else CUSPARSE_OPERATION_TRANSPOSE
+	 */
+	private static int reverseCusparseOp(int trans) {
+		return trans == CUSPARSE_OPERATION_TRANSPOSE ? CUSPARSE_OPERATION_NON_TRANSPOSE : CUSPARSE_OPERATION_TRANSPOSE;
+	}
+}
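
A note on the row-major/column-major handling above: rowToColumnMajor() and the
operand swap in denseDenseMatMult rely on the fact that a row-major buffer,
reinterpreted as column-major, is the transpose of the original matrix. Feeding a
column-major kernel the operands in swapped order therefore yields C = A %*% B
directly in row-major layout (the t(C) = t(B) %*% t(A) trick). Below is a minimal,
CPU-only Java sketch of that trick, assuming plain double[] buffers and a naive
GEMM loop; it is illustrative only, not the actual cuBLAS/cuSPARSE path.

// Sketch: a row-major buffer read as column-major is the transpose, so a
// column-major GEMM with swapped operands yields C = A %*% B in row-major order.
public class RowColMajorSketch {
	// naive column-major GEMM: C(m x n) = A(m x k) %*% B(k x n)
	static void gemmColMajor(int m, int n, int k, double[] A, double[] B, double[] C) {
		for (int j = 0; j < n; j++)
			for (int i = 0; i < m; i++) {
				double sum = 0;
				for (int p = 0; p < k; p++)
					sum += A[p * m + i] * B[j * k + p];
				C[j * m + i] = sum;
			}
	}

	public static void main(String[] args) {
		double[] A = { 1, 2, 3, 4, 5, 6 };    // 2x3, row-major
		double[] B = { 7, 8, 9, 10, 11, 12 }; // 3x2, row-major
		double[] C = new double[4];           // 2x2 result, row-major
		// swapped operands: the kernel sees t(B) (2x3) %*% t(A) (3x2) = t(C),
		// and t(C) stored column-major is exactly C in row-major order
		gemmColMajor(2, 2, 3, B, A, C);
		System.out.println(java.util.Arrays.toString(C)); // [58.0, 64.0, 139.0, 154.0]
	}
}

The swap(x, x=y) helpers above serve the same purpose: they exchange the operand
pointers and the corresponding dimension/transpose parameters in place before the
cuBLAS/cuSPARSE calls are issued.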

http://git-wip-us.apache.org/repos/asf/systemml/blob/6de8f051/src/test/java/org/apache/sysml/test/gpu/MatrixMultiplicationOpTest.java
----------------------------------------------------------------------
diff --git a/src/test/java/org/apache/sysml/test/gpu/MatrixMultiplicationOpTest.java b/src/test/java/org/apache/sysml/test/gpu/MatrixMultiplicationOpTest.java
index 81bc254..d983716 100644
--- a/src/test/java/org/apache/sysml/test/gpu/MatrixMultiplicationOpTest.java
+++ b/src/test/java/org/apache/sysml/test/gpu/MatrixMultiplicationOpTest.java
@@ -50,9 +50,81 @@ public class MatrixMultiplicationOpTest extends GPUTests {
 	public void matrixMatrixTest1() {
 		String scriptStr = "O = X %*% Y";
 
-		int[] X1 = { 1, 128, 513, 1024 };
-		int[] X2 = { 128, 512, 1024 };
-		int[] Y2 = { 1, 128, 513, 1024 };
+		int[] X1 = { 1, 128, 1024 };
+		int[] X2 = { 1, 128, 1024 };
+		int[] Y2 = { 1, 128, 1024 };
+		double[] SX = { 0.0, 0.03, 0.3, 0.9 };
+		double[] SY = { 0.0, 0.03, 0.3, 0.9 };
+
+		for (int x1 = 0; x1 < X1.length; x1++) {
+			for (int x2 = 0; x2 < X2.length; x2++) {
+				int y1 = x2;
+				for (int y2 = 0; y2 < Y2.length; y2++) {
+					for (int sx = 0; sx < SX.length; sx++) {
+						for (int sy = 0; sy < SY.length; sy++) {
+							assertMatrixMultiplication(scriptStr, X1[x1], X2[x2], X2[y1], Y2[y2], SX[sx], SY[sy]);
+						}
+					}
+				}
+			}
+		}
+	}
+	
+	@Test
+	public void commonCaseMLMatrixMatrixTest1() {
+		String scriptStr = "O = X %*% Y";
+
+		int[] X1 = { 1000000 };
+		int[] X2 = { 1000 };
+		int[] Y2 = { 1, 20 };
+		double[] SX = { 0.0, 0.03, 0.3 };
+		double[] SY = { 0.0, 0.03, 0.3, 0.9 };
+
+		for (int x1 = 0; x1 < X1.length; x1++) {
+			for (int x2 = 0; x2 < X2.length; x2++) {
+				int y1 = x2;
+				for (int y2 = 0; y2 < Y2.length; y2++) {
+					for (int sx = 0; sx < SX.length; sx++) {
+						for (int sy = 0; sy < SY.length; sy++) {
+							assertMatrixMultiplication(scriptStr, X1[x1], X2[x2], X2[y1], Y2[y2], SX[sx], SY[sy]);
+						}
+					}
+				}
+			}
+		}
+	}
+	
+	@Test
+	public void commonCaseDLMatrixMatrixTest1() {
+		String scriptStr = "O = X %*% Y";
+
+		int[] X1 = { 100 };
+		int[] X2 = { 600, 900  };
+		int[] Y2 = { 205800 };
+		double[] SX = { 0.0, 0.03, 0.3 };
+		double[] SY = { 0.0, 0.03, 0.3, 0.9 };
+
+		for (int x1 = 0; x1 < X1.length; x1++) {
+			for (int x2 = 0; x2 < X2.length; x2++) {
+				int y1 = x2;
+				for (int y2 = 0; y2 < Y2.length; y2++) {
+					for (int sx = 0; sx < SX.length; sx++) {
+						for (int sy = 0; sy < SY.length; sy++) {
+							assertMatrixMultiplication(scriptStr, X1[x1], X2[x2], X2[y1], Y2[y2], SX[sx], SY[sy]);
+						}
+					}
+				}
+			}
+		}
+	}
+	
+	@Test
+	public void commonCaseDLMatrixMatrixTest2() {
+		String scriptStr = "O = X %*% Y";
+
+		int[] X1 = { 64 };
+		int[] X2 = { 196608   };
+		int[] Y2 = { 512 };
 		double[] SX = { 0.0, 0.03, 0.3, 0.9 };
 		double[] SY = { 0.0, 0.03, 0.3, 0.9 };
 
@@ -74,9 +146,9 @@ public class MatrixMultiplicationOpTest extends GPUTests {
 	public void matrixMatrixTest2() {
 		String scriptStr = "O = X %*% t(Y)";
 
-		int[] X1 = { 1, 128, 513, 1024 };
-		int[] X2 = { 128, 512, 1024 };
-		int[] Y1 = { 1, 128, 513, 1024 };
+		int[] X1 = { 1, 128, 1024 };
+		int[] X2 = { 1, 128, 1024 };
+		int[] Y1 = { 1, 128, 1024 };
 		double[] SX = { 0.0, 0.03, 0.3, 0.9 };
 		double[] SY = { 0.0, 0.03, 0.3, 0.9 };
 
@@ -98,9 +170,9 @@ public class MatrixMultiplicationOpTest extends GPUTests {
 	public void matrixMatrixTest3() {
 		String scriptStr = "O = t(X) %*% Y";
 
-		int[] X1 = { 1, 128, 513, 1024 };
-		int[] X2 = { 128, 512, 1024 };
-		int[] Y2 = { 1, 128, 513, 1024 };
+		int[] X1 = { 1, 128, 1024 };
+		int[] X2 = { 1, 128, 1024 };
+		int[] Y2 = { 1, 128, 1024 };
 		double[] SX = { 0.0, 0.03, 0.3, 0.9 };
 		double[] SY = { 0.0, 0.03, 0.3, 0.9 };
 
@@ -122,9 +194,9 @@ public class MatrixMultiplicationOpTest extends GPUTests {
 	public void matrixMatrixTest4() {
 		String scriptStr = "O = t(X) %*% t(Y)";
 
-		int[] X1 = { 1, 128, 513, 1024 };
-		int[] X2 = { 128, 512, 1024 };
-		int[] Y1 = { 1, 128, 513, 1024 };
+		int[] X1 = { 1, 128, 1024 };
+		int[] X2 = { 1, 128, 1024 };
+		int[] Y1 = { 1, 128, 1024 };
 		double[] SX = { 0.0, 0.03, 0.3, 0.9 };
 		double[] SY = { 0.0, 0.03, 0.3, 0.9 };
 
@@ -146,7 +218,7 @@ public class MatrixMultiplicationOpTest extends GPUTests {
 	public void transposeSelfMatrixMultiply() {
 		String scriptStr = "O = t(X) %*% X";
 
-		int[] sizes = { 1, 128, 512, 1024, 2049 };
+		int[] sizes = { 1, 128, 1024 };
 		double[] sparsities = { 0.0, 0.03, 0.3, 0.9 };
 
 		for (int i = 0; i < sizes.length; i++) {


[48/50] [abbrv] systemml git commit: [SYSTEMML-1982] Improved IPA pass for recompile_once functions

Posted by re...@apache.org.
[SYSTEMML-1982] Improved IPA pass for recompile_once functions

This patch improves the existing inter-procedural-analysis (IPA) pass for
recompile_once functions, i.e., functions that are recompiled on function
entry. So far, all functions containing loops were marked for
recompile_once, but the required recompilation information is not yet
available during IPA. Hence, we now correct these flags after generating
the runtime program, once the actual recompilation decisions have been made.


Project: http://git-wip-us.apache.org/repos/asf/systemml/repo
Commit: http://git-wip-us.apache.org/repos/asf/systemml/commit/ee6060bf
Tree: http://git-wip-us.apache.org/repos/asf/systemml/tree/ee6060bf
Diff: http://git-wip-us.apache.org/repos/asf/systemml/diff/ee6060bf

Branch: refs/heads/master
Commit: ee6060bfc1576cf777dfa99c48126a5a7b35db3a
Parents: a2f0598
Author: Matthias Boehm <mb...@gmail.com>
Authored: Wed Nov 1 22:51:46 2017 -0700
Committer: Matthias Boehm <mb...@gmail.com>
Committed: Thu Nov 2 00:39:17 2017 -0700

----------------------------------------------------------------------
 .../org/apache/sysml/parser/DMLTranslator.java  | 57 +++++++++-----------
 .../apache/sysml/parser/ForStatementBlock.java  | 15 ++----
 .../apache/sysml/parser/IfStatementBlock.java   | 12 ++---
 .../org/apache/sysml/parser/StatementBlock.java |  7 +--
 .../sysml/parser/WhileStatementBlock.java       | 12 ++---
 5 files changed, 45 insertions(+), 58 deletions(-)
----------------------------------------------------------------------
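
In essence, constructLops now returns a boolean that is propagated bottom-up
(does this block or any of its children require recompilation?), and a function
statement block that was optimistically marked recompile_once during IPA drops
the flag when nothing below it actually requires recompilation. A condensed
sketch of that pattern, using illustrative class names rather than the actual
SystemML parser classes shown in the diffs below:

// Condensed sketch of the bottom-up flag correction (illustrative names only).
abstract class Block {
	java.util.List<Block> children = new java.util.ArrayList<>();
	boolean requiresRecompile;

	// returns true iff this block or any descendant requires recompilation
	boolean constructLops() {
		boolean ret = requiresRecompile;
		for (Block child : children)
			ret |= child.constructLops();
		return ret;
	}
}

class FunctionBlock extends Block {
	boolean recompileOnce = true; // optimistic decision made during IPA

	@Override
	boolean constructLops() {
		boolean ret = super.constructLops();
		if (recompileOnce)       // correct the flag once the real
			recompileOnce = ret; // recompilation decisions are known
		return ret;
	}
}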


http://git-wip-us.apache.org/repos/asf/systemml/blob/ee6060bf/src/main/java/org/apache/sysml/parser/DMLTranslator.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/parser/DMLTranslator.java b/src/main/java/org/apache/sysml/parser/DMLTranslator.java
index 75103d1..fb8404f 100644
--- a/src/main/java/org/apache/sysml/parser/DMLTranslator.java
+++ b/src/main/java/org/apache/sysml/parser/DMLTranslator.java
@@ -317,25 +317,21 @@ public class DMLTranslator
 	}
 	
 	public void constructLops(DMLProgram dmlp) throws ParseException, LanguageException, HopsException, LopsException {
-
 		// for each namespace, handle function program blocks handle function 
-		for (String namespaceKey : dmlp.getNamespaces().keySet()){
-			for (String fname: dmlp.getFunctionStatementBlocks(namespaceKey).keySet()) {
-				FunctionStatementBlock current = dmlp.getFunctionStatementBlock(namespaceKey, fname);
-				constructLops(current);
-			}
-		}
+		for( String namespaceKey : dmlp.getNamespaces().keySet() )
+			for( FunctionStatementBlock fsb : dmlp.getFunctionStatementBlocks(namespaceKey).values() )
+				constructLops(fsb);
 		
 		// handle regular program blocks
-		for (int i = 0; i < dmlp.getNumStatementBlocks(); i++) {
-			StatementBlock current = dmlp.getStatementBlock(i);
-			constructLops(current);
-		}
+		for( StatementBlock sb : dmlp.getStatementBlocks() )
+			constructLops(sb);
 	}
 
-	public void constructLops(StatementBlock sb) 
+	public boolean constructLops(StatementBlock sb) 
 		throws HopsException, LopsException 
-	{	
+	{
+		boolean ret = false;
+		
 		if (sb instanceof WhileStatementBlock)
 		{
 			WhileStatementBlock wsb = (WhileStatementBlock)sb;
@@ -348,13 +344,13 @@ public class DMLTranslator
 			}
 			// step through stmt blocks in while stmt body
 			for (StatementBlock stmtBlock : body){
-				constructLops(stmtBlock);
+				ret |= constructLops(stmtBlock);
 			}
 			
 			// handle while stmt predicate
 			Lop l = wsb.getPredicateHops().constructLops();
 			wsb.set_predicateLops(l);	
-			wsb.updatePredicateRecompilationFlag();
+			ret |= wsb.updatePredicateRecompilationFlag();
 		}
 		
 		else if (sb instanceof IfStatementBlock)
@@ -370,16 +366,16 @@ public class DMLTranslator
 			}
 			// step through stmt blocks in if stmt ifBody
 			for (StatementBlock stmtBlock : ifBody)
-				constructLops(stmtBlock);
+				ret |= constructLops(stmtBlock);
 			
 			// step through stmt blocks in if stmt elseBody
 			for (StatementBlock stmtBlock : elseBody)
-				constructLops(stmtBlock);
+				ret |= constructLops(stmtBlock);
 			
 			// handle if stmt predicate
 			Lop l = isb.getPredicateHops().constructLops();
 			isb.set_predicateLops(l);
-			isb.updatePredicateRecompilationFlag();
+			ret |= isb.updatePredicateRecompilationFlag();
 		}
 		
 		else if (sb instanceof ForStatementBlock) //NOTE: applies to ForStatementBlock and ParForStatementBlock
@@ -394,7 +390,7 @@ public class DMLTranslator
 			}
 			// step through stmt blocks in FOR stmt body
 			for (StatementBlock stmtBlock : body)
-				constructLops(stmtBlock);
+				ret |= constructLops(stmtBlock);
 			
 			// handle for stmt predicate
 			if (fsb.getFromHops() != null){
@@ -409,37 +405,36 @@ public class DMLTranslator
 				Lop llobs = fsb.getIncrementHops().constructLops();
 				fsb.setIncrementLops(llobs);
 			}
-			fsb.updatePredicateRecompilationFlags();
+			ret |= fsb.updatePredicateRecompilationFlags();
 		}
-		else if (sb instanceof FunctionStatementBlock){
+		else if (sb instanceof FunctionStatementBlock) {
+			FunctionStatementBlock fsb = (FunctionStatementBlock) sb;
 			FunctionStatement functStmt = (FunctionStatement)sb.getStatement(0);
 			ArrayList<StatementBlock> body = functStmt.getBody();
-			
 			if (sb.get_hops() != null && !sb.get_hops().isEmpty()) {
 				LOG.error(sb.printBlockErrorLocation() + "FunctionStatementBlock should not have hops");
 				throw new HopsException(sb.printBlockErrorLocation() + "FunctionStatementBlock should not have hops");
 			}
 			// step through stmt blocks in while stmt body
-			for (StatementBlock stmtBlock : body){
-				constructLops(stmtBlock);
-			}
+			for( StatementBlock stmtBlock : body )
+				ret |= constructLops(stmtBlock);
+			if( fsb.isRecompileOnce() )
+				fsb.setRecompileOnce(ret);
 		}
 		
 		// handle default case for regular StatementBlock
 		else {
-			
 			if (sb.get_hops() == null)
 				sb.set_hops(new ArrayList<Hop>());
-			
 			ArrayList<Lop> lops = new ArrayList<>();
-			for (Hop hop : sb.get_hops()) {
+			for (Hop hop : sb.get_hops())
 				lops.add(hop.constructLops());
-			}
 			sb.setLops(lops);
-			sb.updateRecompilationFlag(); 
+			ret |= sb.updateRecompilationFlag(); 
 		}
 		
-	} // end method
+		return ret;
+	}
 	
 	
 	public Program getRuntimeProgram(DMLProgram prog, DMLConfig config) 

http://git-wip-us.apache.org/repos/asf/systemml/blob/ee6060bf/src/main/java/org/apache/sysml/parser/ForStatementBlock.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/parser/ForStatementBlock.java b/src/main/java/org/apache/sysml/parser/ForStatementBlock.java
index 856e151..686ce7a 100644
--- a/src/main/java/org/apache/sysml/parser/ForStatementBlock.java
+++ b/src/main/java/org/apache/sysml/parser/ForStatementBlock.java
@@ -410,29 +410,24 @@ public class ForStatementBlock extends StatementBlock
 	// materialized hops recompilation flags
 	////
 	
-	public void updatePredicateRecompilationFlags() 
-		throws HopsException
-	{
+	public boolean updatePredicateRecompilationFlags() throws HopsException {
 		if( ConfigurationManager.isDynamicRecompilation() ) {
 			_requiresFromRecompile = Recompiler.requiresRecompilation(getFromHops());
 			_requiresToRecompile = Recompiler.requiresRecompilation(getToHops());
 			_requiresIncrementRecompile = Recompiler.requiresRecompilation(getIncrementHops());
 		}
+		return (_requiresFromRecompile || _requiresToRecompile || _requiresIncrementRecompile);
 	}
 	
-	public boolean requiresFromRecompilation()
-	{
+	public boolean requiresFromRecompilation() {
 		return _requiresFromRecompile;
 	}
 	
-	public boolean requiresToRecompilation()
-	{
+	public boolean requiresToRecompilation() {
 		return _requiresToRecompile;
 	}
 	
-	public boolean requiresIncrementRecompilation()
-	{
+	public boolean requiresIncrementRecompilation() {
 		return _requiresIncrementRecompile;
 	}
-	
 }
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/systemml/blob/ee6060bf/src/main/java/org/apache/sysml/parser/IfStatementBlock.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/parser/IfStatementBlock.java b/src/main/java/org/apache/sysml/parser/IfStatementBlock.java
index 2a66857..6803eb2 100644
--- a/src/main/java/org/apache/sysml/parser/IfStatementBlock.java
+++ b/src/main/java/org/apache/sysml/parser/IfStatementBlock.java
@@ -524,15 +524,13 @@ public class IfStatementBlock extends StatementBlock
 	// materialized hops recompilation flags
 	////
 	
-	public void updatePredicateRecompilationFlag() 
-		throws HopsException
-	{
-		_requiresPredicateRecompile =  ConfigurationManager.isDynamicRecompilation() 	
-			                           && Recompiler.requiresRecompilation(getPredicateHops());
+	public boolean updatePredicateRecompilationFlag() throws HopsException {
+		return (_requiresPredicateRecompile =
+			ConfigurationManager.isDynamicRecompilation()
+			&& Recompiler.requiresRecompilation(getPredicateHops()));
 	}
 	
-	public boolean requiresPredicateRecompilation()
-	{
+	public boolean requiresPredicateRecompilation() {
 		return _requiresPredicateRecompile;
 	}
 }

http://git-wip-us.apache.org/repos/asf/systemml/blob/ee6060bf/src/main/java/org/apache/sysml/parser/StatementBlock.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/parser/StatementBlock.java b/src/main/java/org/apache/sysml/parser/StatementBlock.java
index 4a24675..c2a3e01 100644
--- a/src/main/java/org/apache/sysml/parser/StatementBlock.java
+++ b/src/main/java/org/apache/sysml/parser/StatementBlock.java
@@ -1044,9 +1044,10 @@ public class StatementBlock extends LiveVariableAnalysis implements ParseInfo
 	// materialized hops recompilation / updateinplace flags
 	////
 
-	public void updateRecompilationFlag() throws HopsException {
-		_requiresRecompile = ConfigurationManager.isDynamicRecompilation()
-			                 && Recompiler.requiresRecompilation(get_hops());
+	public boolean updateRecompilationFlag() throws HopsException {
+		return (_requiresRecompile =
+			ConfigurationManager.isDynamicRecompilation()
+			&& Recompiler.requiresRecompilation(get_hops()));
 	}
 
 	public boolean requiresRecompilation() {

http://git-wip-us.apache.org/repos/asf/systemml/blob/ee6060bf/src/main/java/org/apache/sysml/parser/WhileStatementBlock.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/parser/WhileStatementBlock.java b/src/main/java/org/apache/sysml/parser/WhileStatementBlock.java
index f3f613e..05e2c2c 100644
--- a/src/main/java/org/apache/sysml/parser/WhileStatementBlock.java
+++ b/src/main/java/org/apache/sysml/parser/WhileStatementBlock.java
@@ -324,15 +324,13 @@ public class WhileStatementBlock extends StatementBlock
 	// materialized hops recompilation flags
 	////
 	
-	public void updatePredicateRecompilationFlag() 
-		throws HopsException
-	{
-		_requiresPredicateRecompile =  ConfigurationManager.isDynamicRecompilation() 
-			                           && Recompiler.requiresRecompilation(getPredicateHops());
+	public boolean updatePredicateRecompilationFlag() throws HopsException {
+		return (_requiresPredicateRecompile = 
+			ConfigurationManager.isDynamicRecompilation() 
+			&& Recompiler.requiresRecompilation(getPredicateHops()));
 	}
 	
-	public boolean requiresPredicateRecompilation()
-	{
+	public boolean requiresPredicateRecompilation() {
 		return _requiresPredicateRecompile;
 	}
 }
\ No newline at end of file


[22/50] [abbrv] systemml git commit: [SYSTEMML-1970] Performance sparse conv2d operations w/ native libs

Posted by re...@apache.org.
[SYSTEMML-1970] Performance sparse conv2d operations w/ native libs

This patch improves the performance of sparse conv2d operations with
enabled native libraries (BLAS and native conv2d ops). So far, we called
the custom native ops for both sparse and dense inputs/filters, which in
certain scenarios internally converted sparse inputs/filters to dense.
With this patch, we now decide on the implementation with sparsity
awareness. Additionally, this includes a minor improvement that avoids
unnecessary nnz maintenance after native BLAS calls.

On an end-to-end cnn application, this patch improved performance from
605s to 349s per epoch w/ selective native ops (490s with Java ops).

Furthermore, this also fixes a recently introduced issue of
thread-local nnz maintenance in the native conv2d backward data op.
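
For illustration only (not part of this commit), the sparsity-aware decision
described above boils down to a per-call backend choice. The following
minimal Java sketch captures the idea; the class and method names are
hypothetical and do not correspond to the actual SystemML API:

public class NativeConv2dDispatchSketch {
    // The native conv2d kernels densify sparse inputs/filters internally,
    // so the native path only pays off when the relevant inputs are dense.
    static boolean useNativeConv2d(boolean nativeEnabled, boolean eligibleShapes,
        boolean inputSparse, boolean filterSparse)
    {
        return nativeEnabled && eligibleShapes && !inputSparse && !filterSparse;
    }

    public static void main(String[] args) {
        System.out.println(useNativeConv2d(true, true, false, false)); // true  -> native op
        System.out.println(useNativeConv2d(true, true, true,  false)); // false -> Java op
    }
}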


Project: http://git-wip-us.apache.org/repos/asf/systemml/repo
Commit: http://git-wip-us.apache.org/repos/asf/systemml/commit/2c37d9f0
Tree: http://git-wip-us.apache.org/repos/asf/systemml/tree/2c37d9f0
Diff: http://git-wip-us.apache.org/repos/asf/systemml/diff/2c37d9f0

Branch: refs/heads/master
Commit: 2c37d9f03117d118c9d10c6839c4b1d60a4a9afc
Parents: 596005a
Author: Matthias Boehm <mb...@gmail.com>
Authored: Mon Oct 23 19:33:04 2017 -0700
Committer: Matthias Boehm <mb...@gmail.com>
Committed: Mon Oct 23 23:44:02 2017 -0700

----------------------------------------------------------------------
 .../cp/ConvolutionCPInstruction.java            |  6 +-
 .../sysml/runtime/matrix/data/LibMatrixDNN.java | 10 ---
 .../LibMatrixDNNConv2dBackwardDataHelper.java   |  3 +-
 .../runtime/matrix/data/LibMatrixDNNHelper.java | 71 +++++++++++++-------
 4 files changed, 50 insertions(+), 40 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/systemml/blob/2c37d9f0/src/main/java/org/apache/sysml/runtime/instructions/cp/ConvolutionCPInstruction.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/instructions/cp/ConvolutionCPInstruction.java b/src/main/java/org/apache/sysml/runtime/instructions/cp/ConvolutionCPInstruction.java
index 2c7b972..c6b4698 100644
--- a/src/main/java/org/apache/sysml/runtime/instructions/cp/ConvolutionCPInstruction.java
+++ b/src/main/java/org/apache/sysml/runtime/instructions/cp/ConvolutionCPInstruction.java
@@ -363,10 +363,8 @@ public class ConvolutionCPInstruction extends UnaryCPInstruction {
 			}
 			else {
 				outputBlock = new MatrixBlock(N, C*H*W, false).allocateBlock();
-				if(instOpcode.equalsIgnoreCase("maxpooling_backward"))
-					LibMatrixDNN.maxpoolingBackward(matBlock, dout, outputBlock, params, false);
-				else
-					LibMatrixDNN.maxpoolingBackward(matBlock, dout, outputBlock, params, true);
+				LibMatrixDNN.maxpoolingBackward(matBlock, dout, outputBlock, params, 
+					!instOpcode.equalsIgnoreCase("maxpooling_backward"));
 			}
 			ec.releaseMatrixInput(_in2.getName(), getExtendedOpcode());
 		}

http://git-wip-us.apache.org/repos/asf/systemml/blob/2c37d9f0/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNN.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNN.java b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNN.java
index ac66e51..096574a 100644
--- a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNN.java
+++ b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNN.java
@@ -34,7 +34,6 @@ import org.apache.sysml.runtime.DMLRuntimeException;
 import org.apache.sysml.runtime.instructions.InstructionUtils;
 import org.apache.sysml.runtime.matrix.operators.BinaryOperator;
 import org.apache.sysml.runtime.util.ConvolutionUtils;
-import org.apache.sysml.utils.Statistics;
 
 /*
  * This class allows users to invoke deep learning related operations 
@@ -161,9 +160,6 @@ public class LibMatrixDNN {
 		if(params.bias != null && params.bias.isInSparseFormat())
 			params.bias.sparseToDense(); // Since bias is extremely small array
 		
-		if(isEligibleForConv2dSparse(params))
-			Statistics.numNativeSparseConv2dCalls.increment();
-		
 		long nnz = execute(LibMatrixDNNHelper.getConv2dWorkers(params), params);
 		
 		//post-processing: maintain nnz
@@ -183,9 +179,6 @@ public class LibMatrixDNN {
 	public static void conv2dBackwardData(MatrixBlock filter, MatrixBlock dout, MatrixBlock outputBlock, ConvolutionParameters params) throws DMLRuntimeException {
 		checkInputsConv2dBackwardData(filter, dout, outputBlock, params);
 		
-		if(isEligibleForConv2dBackwardDataDense(params))
-			Statistics.numNativeSparseConv2dBwdDataCalls.increment();
-		
 		long nnz = execute(LibMatrixDNNHelper.getConv2dBackwardDataWorkers(params), params);
 		
 		//post-processing: maintain nnz
@@ -205,9 +198,6 @@ public class LibMatrixDNN {
 	public static void conv2dBackwardFilter(MatrixBlock input, MatrixBlock dout, MatrixBlock outputBlock, ConvolutionParameters params) throws DMLRuntimeException {
 		checkInputsConv2dBackwardFilter(input, dout, outputBlock, params);
 		
-		if(isEligibleForConv2dBackwardFilterSparseDense(params))
-			Statistics.numNativeSparseConv2dBwdFilterCalls.increment();
-		
 		execute(LibMatrixDNNHelper.getConv2dBackwardFilterWorkers(params), params);
 		
 		//post-processing: maintain nnz

http://git-wip-us.apache.org/repos/asf/systemml/blob/2c37d9f0/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNNConv2dBackwardDataHelper.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNNConv2dBackwardDataHelper.java b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNNConv2dBackwardDataHelper.java
index cd50000..960cea6 100644
--- a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNNConv2dBackwardDataHelper.java
+++ b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNNConv2dBackwardDataHelper.java
@@ -57,7 +57,8 @@ public class LibMatrixDNNConv2dBackwardDataHelper {
 						_params.R, _params.S, _params.stride_h, _params.stride_w, _params.pad_h, _params.pad_w, _params.P, _params.Q, 1);
 				System.arraycopy(ret, 0, _params.output.getDenseBlock(), n*CHW, CHW);
 			}
-			return 0L;
+			//multi-threaded nnz maintenance of current working set
+			return _params.output.recomputeNonZeros(_rl, _ru-1);
 		}
 	}
 	

http://git-wip-us.apache.org/repos/asf/systemml/blob/2c37d9f0/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNNHelper.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNNHelper.java b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNNHelper.java
index 92eb79b..55f6e4c 100644
--- a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNNHelper.java
+++ b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNNHelper.java
@@ -24,11 +24,15 @@ import java.util.concurrent.Callable;
 
 import org.apache.sysml.hops.OptimizerUtils;
 import org.apache.sysml.runtime.DMLRuntimeException;
-import org.apache.sysml.runtime.matrix.data.LibMatrixDNNConv2dBackwardFilterHelper.Conv2dBackwardFilter;
-import org.apache.sysml.runtime.matrix.data.LibMatrixDNNConv2dBackwardFilterHelper.SparseNativeConv2dBackwardFilterDense;
+import org.apache.sysml.runtime.matrix.data.LibMatrixDNNConv2dBackwardDataHelper.*;
+import org.apache.sysml.runtime.matrix.data.LibMatrixDNNConv2dBackwardFilterHelper.*;
+import org.apache.sysml.runtime.matrix.data.LibMatrixDNNConv2dHelper.*;
+import org.apache.sysml.runtime.matrix.data.LibMatrixDNNPoolingBackwardHelper.*;
+import org.apache.sysml.runtime.matrix.data.LibMatrixDNNPoolingHelper.*;
 import org.apache.sysml.runtime.instructions.InstructionUtils;
 import org.apache.sysml.runtime.util.ConvolutionUtils;
 import org.apache.sysml.utils.NativeHelper;
+import org.apache.sysml.utils.Statistics;
 
 
 public class LibMatrixDNNHelper {
@@ -51,9 +55,9 @@ public class LibMatrixDNNHelper {
 		int taskSize = (int)(Math.ceil((double)params.N / k));
 		for(int i = 0; i*taskSize < params.N; i++) {
 			if(params.input1.isInSparseFormat())
-				ret.add(new LibMatrixDNNPoolingHelper.SparseMaxPooling(i*taskSize, Math.min((i+1)*taskSize, params.N), params));
+				ret.add(new SparseMaxPooling(i*taskSize, Math.min((i+1)*taskSize, params.N), params));
 			else
-				ret.add(new LibMatrixDNNPoolingHelper.DenseMaxPooling(i*taskSize, Math.min((i+1)*taskSize, params.N), params));
+				ret.add(new DenseMaxPooling(i*taskSize, Math.min((i+1)*taskSize, params.N), params));
 		}
 		return ret;
 	}
@@ -73,15 +77,15 @@ public class LibMatrixDNNHelper {
 		for(int i = 0; i*taskSize < params.N; i++) {
 			if(!params.input1.isInSparseFormat()) {
 				if(!params.input2.isInSparseFormat()) 
-					ret.add(new LibMatrixDNNPoolingBackwardHelper.PoolingBackwardDenseDense(i*taskSize, Math.min((i+1)*taskSize, params.N), params, performReluBackward));
+					ret.add(new PoolingBackwardDenseDense(i*taskSize, Math.min((i+1)*taskSize, params.N), params, performReluBackward));
 				else
-					ret.add(new LibMatrixDNNPoolingBackwardHelper.PoolingBackwardDenseSparse(i*taskSize, Math.min((i+1)*taskSize, params.N), params, performReluBackward));
+					ret.add(new PoolingBackwardDenseSparse(i*taskSize, Math.min((i+1)*taskSize, params.N), params, performReluBackward));
 			}
 			else {
 				if(!params.input2.isInSparseFormat()) 
-					ret.add(new LibMatrixDNNPoolingBackwardHelper.PoolingBackwardSparseDense(i*taskSize, Math.min((i+1)*taskSize, params.N), params, performReluBackward));
+					ret.add(new PoolingBackwardSparseDense(i*taskSize, Math.min((i+1)*taskSize, params.N), params, performReluBackward));
 				else
-					ret.add(new LibMatrixDNNPoolingBackwardHelper.PoolingBackwardSparseSparse(i*taskSize, Math.min((i+1)*taskSize, params.N), params, performReluBackward));
+					ret.add(new PoolingBackwardSparseSparse(i*taskSize, Math.min((i+1)*taskSize, params.N), params, performReluBackward));
 			}
 		}
 		return ret;
@@ -123,32 +127,36 @@ public class LibMatrixDNNHelper {
 		// TODO: Decide here based on params whether to use LoopedIm2ColConv2dAllChannels or LoopedIm2ColConv2dOneChannel
 		// For now, let's stick to the existing approach of converting [1, CHW] to [CRS, PQ] as it allows matrix multiplication large enough matrix.
 		boolean allChannels = true; ArrayList<MatrixBlock> filters = null;
-		if(!allChannels) {
+		if(!allChannels)
 			filters = splitFilter(params);
-		}
 		
 		MatrixBlock in1 = params.input1;
 		boolean isEmptyDenseInput = !in1.isInSparseFormat() && in1.denseBlock == null;
 		boolean isTransPref = in1.sparse && !params.input2.sparse && 
 			MatrixBlock.evalSparseFormatInMemory(in1.clen, in1.rlen, in1.nonZeros);
+		boolean applyNative = LibMatrixDNN.isEligibleForConv2dSparse(params)
+			&& !(!isEmptyDenseInput && allChannels && isTransPref);
+		if( applyNative )
+			Statistics.numNativeSparseConv2dCalls.increment();
 		
 		//transpose filter once for efficient sparse-dense multiplies in LoopedIm2ColConv2dTransAllChan
 		//in order to share the temporary object and its creation costs across threads
-		if( !LibMatrixDNN.isEligibleForConv2dSparse(params) 
-			&& !isEmptyDenseInput && allChannels && isTransPref ) {
+		if( !applyNative && !isEmptyDenseInput && allChannels && isTransPref ) {
 			params.input2 = LibMatrixReorg.transpose(params.input2, 
 				new MatrixBlock(params.input2.clen, params.input2.rlen, false), k);
 		}
 		
 		for(int i = 0; i*taskSize < params.N; i++) {
-			if(LibMatrixDNN.isEligibleForConv2dSparse(params)) 
-				ret.add(new LibMatrixDNNConv2dHelper.SparseNativeConv2d(i*taskSize, Math.min((i+1)*taskSize, params.N), params));
+			//note: we prefer the java backend for sparse inputs because the native 
+			//implementation simply converts the sparse input into dense rows
+			if( applyNative ) 
+				ret.add(new SparseNativeConv2d(i*taskSize, Math.min((i+1)*taskSize, params.N), params));
 			else if(!isEmptyDenseInput && allChannels && isTransPref)
-				ret.add(new LibMatrixDNNConv2dHelper.LoopedIm2ColConv2dTransAllChan(i*taskSize, Math.min((i+1)*taskSize, params.N), params));
+				ret.add(new LoopedIm2ColConv2dTransAllChan(i*taskSize, Math.min((i+1)*taskSize, params.N), params));
 			else if(!isEmptyDenseInput && allChannels)
-				ret.add(new LibMatrixDNNConv2dHelper.LoopedIm2ColConv2dAllChan(i*taskSize, Math.min((i+1)*taskSize, params.N), params));
+				ret.add(new LoopedIm2ColConv2dAllChan(i*taskSize, Math.min((i+1)*taskSize, params.N), params));
 			else if(!isEmptyDenseInput && !allChannels)
-				ret.add(new LibMatrixDNNConv2dHelper.LoopedIm2ColConv2dOneChan(i*taskSize, Math.min((i+1)*taskSize, params.N), params, filters));
+				ret.add(new LoopedIm2ColConv2dOneChan(i*taskSize, Math.min((i+1)*taskSize, params.N), params, filters));
 			else
 				throw new DMLRuntimeException("Unsupported operator");
 		}
@@ -172,9 +180,15 @@ public class LibMatrixDNNHelper {
 		
 		boolean isEmptyDenseInput = (!params.input1.isInSparseFormat() && params.input1.denseBlock == null) || 
 			(!params.input2.isInSparseFormat() && params.input2.denseBlock == null);
+		boolean applyNative = LibMatrixDNN.isEligibleForConv2dBackwardFilterSparseDense(params)
+			&& !params.input2.isInSparseFormat();
+		if( applyNative )
+			Statistics.numNativeSparseConv2dBwdFilterCalls.increment();
 		
 		for(int i = 0; i*taskSize < params.N; i++) {
-			if(LibMatrixDNN.isEligibleForConv2dBackwardFilterSparseDense(params)) 
+			//note: we prefer the java backend for sparse filters because the native 
+			//implementation simply rotates the sparse filters into dense rows
+			if( applyNative ) 
 				ret.add(new SparseNativeConv2dBackwardFilterDense(i*taskSize, Math.min((i+1)*taskSize, params.N), params));
 			else if(!isEmptyDenseInput)
 				ret.add(new Conv2dBackwardFilter(i*taskSize, Math.min((i+1)*taskSize, params.N), params));
@@ -202,12 +216,18 @@ public class LibMatrixDNNHelper {
 		
 		boolean isEmptyDenseInput = (!params.input1.isInSparseFormat() && params.input1.denseBlock == null) || 
 			(!params.input2.isInSparseFormat() && params.input2.denseBlock == null);
+		boolean applyNative = LibMatrixDNN.isEligibleForConv2dBackwardDataDense(params)
+			&& !params.input2.isInSparseFormat();
+		if( applyNative )
+			Statistics.numNativeSparseConv2dBwdDataCalls.increment();
 		
 		for(int i = 0; i*taskSize < params.N; i++) {
-			if(LibMatrixDNN.isEligibleForConv2dBackwardDataDense(params)) 
-				ret.add(new LibMatrixDNNConv2dBackwardDataHelper.SparseNativeConv2dBackwardDataDense(i*taskSize, Math.min((i+1)*taskSize, params.N), params));
+			//note: we prefer the java backend for sparse filters because the native 
+			//implementation simply converts the sparse filters into dense rows
+			if( applyNative ) 
+				ret.add(new SparseNativeConv2dBackwardDataDense(i*taskSize, Math.min((i+1)*taskSize, params.N), params));
 			else if(!isEmptyDenseInput)
-				ret.add(new LibMatrixDNNConv2dBackwardDataHelper.Conv2dBackwardData(i*taskSize, Math.min((i+1)*taskSize, params.N), params));
+				ret.add(new Conv2dBackwardData(i*taskSize, Math.min((i+1)*taskSize, params.N), params));
 			else
 				throw new DMLRuntimeException("Unsupported operator");
 		}
@@ -319,20 +339,21 @@ public class LibMatrixDNNHelper {
 	// Single-threaded matrix multiplication
 	static void singleThreadedMatMult(MatrixBlock m1, MatrixBlock m2, MatrixBlock ret, 
 			boolean recomputeNNZM1, boolean recomputeNNZM2, ConvolutionParameters params) throws DMLRuntimeException {
-		if(!params.enableNative || m1.isInSparseFormat() || m2.isInSparseFormat()) {
+		if( !params.enableNative || m1.sparse || m2.sparse ) {
 			prepNonZerosForMatrixMult(m1, recomputeNNZM1);
 			prepNonZerosForMatrixMult(m2, recomputeNNZM2);
 			LibMatrixMult.matrixMult(m1, m2, ret, false);
-			ret.setNonZeros((long)ret.rlen*ret.clen);
 		}
 		else {
 			ret.sparse = false;
 			if(ret.getDenseBlock() == null)
 				ret.allocateDenseBlock();
 			NativeHelper.matrixMultDenseDense(m1.denseBlock, m2.denseBlock, 
-					ret.denseBlock, m1.getNumRows(), m1.getNumColumns(), m2.getNumColumns(), 1);
-			ret.recomputeNonZeros();
+				ret.denseBlock, m1.rlen, m1.clen, m2.clen, 1);
 		}
+		
+		//no need to maintain nnz exactly, as consumed by other operations
+		ret.setNonZeros((long)ret.rlen*ret.clen);
 	}
 	
 	static void addBias(int r, double [] out, double [] bias, int K, int PQ) {


[04/50] [abbrv] systemml git commit: [SYSTEMML-1963] Fix missing codegen dense-sparse vector primitives

Posted by re...@apache.org.
[SYSTEMML-1963] Fix missing codegen dense-sparse vector primitives

This patch fixes issues that showed up with the fuse-all heuristic in
different ALS-CG scenarios. In particular, this adds missing vector
primitives for dense-sparse vector operations and modifies the code
generator accordingly. Most of these operations are invariant to the
ordering of inputs and hence simply call the existing primitives with
permuted inputs.
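
As a minimal, self-contained sketch of the "permuted inputs" idea (signatures
simplified relative to LibSpoofPrimitives; the class name is illustrative), a
commutative dense-sparse primitive such as element-wise multiply can simply
delegate to its sparse-dense counterpart with swapped arguments:

public class VectorPrimitiveSketch {
    // sparse-dense multiply: avals/aix hold the sparse side, b is dense
    static double[] vectMultWrite(double[] avals, double[] b, int[] aix,
        int ai, int bi, int alen, int len)
    {
        double[] c = new double[len]; // zero for all positions not in aix
        for (int j = ai; j < ai + alen; j++)
            c[aix[j]] = avals[j] * b[bi + aix[j]];
        return c;
    }

    // dense-sparse multiply: invariant to the ordering of inputs, so just swap
    static double[] vectMultWrite(double[] a, double[] bvals, int ai,
        int[] bix, int bi, int blen, int len)
    {
        return vectMultWrite(bvals, a, bix, bi, ai, blen, len);
    }

    public static void main(String[] args) {
        double[] dense = {1, 2, 3, 4};
        double[] svals = {10, 20}; // sparse values
        int[]    six   = {1, 3};   // sparse column indexes
        double[] c = vectMultWrite(dense, svals, 0, six, 0, 2, 4);
        System.out.println(java.util.Arrays.toString(c)); // [0.0, 20.0, 0.0, 80.0]
    }
}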


Project: http://git-wip-us.apache.org/repos/asf/systemml/repo
Commit: http://git-wip-us.apache.org/repos/asf/systemml/commit/b6b67727
Tree: http://git-wip-us.apache.org/repos/asf/systemml/tree/b6b67727
Diff: http://git-wip-us.apache.org/repos/asf/systemml/diff/b6b67727

Branch: refs/heads/master
Commit: b6b67727b9ec271995520e47dc3044eccaed2b65
Parents: 06b4b9d
Author: Matthias Boehm <mb...@gmail.com>
Authored: Sun Oct 15 20:27:51 2017 -0700
Committer: Matthias Boehm <mb...@gmail.com>
Committed: Sun Oct 15 20:27:51 2017 -0700

----------------------------------------------------------------------
 .../sysml/hops/codegen/cplan/CNodeBinary.java   | 33 ++++-----
 .../runtime/codegen/LibSpoofPrimitives.java     | 70 ++++++++++++++++++++
 2 files changed, 88 insertions(+), 15 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/systemml/blob/b6b67727/src/main/java/org/apache/sysml/hops/codegen/cplan/CNodeBinary.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/hops/codegen/cplan/CNodeBinary.java b/src/main/java/org/apache/sysml/hops/codegen/cplan/CNodeBinary.java
index 1ca4aa6..cac8ab8 100644
--- a/src/main/java/org/apache/sysml/hops/codegen/cplan/CNodeBinary.java
+++ b/src/main/java/org/apache/sysml/hops/codegen/cplan/CNodeBinary.java
@@ -68,16 +68,16 @@ public class CNodeBinary extends CNode
 			return ssComm || vsComm || vvComm;
 		}
 		
-		public String getTemplate(boolean sparse, boolean scalarVector, boolean scalarInput) {
+		public String getTemplate(boolean sparseLhs, boolean sparseRhs, boolean scalarVector, boolean scalarInput) {
 			switch (this) {
 				case DOT_PRODUCT:   
-					return sparse ? "    double %TMP% = LibSpoofPrimitives.dotProduct(%IN1v%, %IN2%, %IN1i%, %POS1%, %POS2%, alen);\n" :
+					return sparseLhs ? "    double %TMP% = LibSpoofPrimitives.dotProduct(%IN1v%, %IN2%, %IN1i%, %POS1%, %POS2%, alen);\n" :
 									"    double %TMP% = LibSpoofPrimitives.dotProduct(%IN1%, %IN2%, %POS1%, %POS2%, %LEN%);\n";
 				case VECT_MATRIXMULT:   
-					return sparse ? "    double[] %TMP% = LibSpoofPrimitives.vectMatrixMult(%IN1v%, %IN2%, %IN1i%, %POS1%, %POS2%, alen, len);\n" :
+					return sparseLhs ? "    double[] %TMP% = LibSpoofPrimitives.vectMatrixMult(%IN1v%, %IN2%, %IN1i%, %POS1%, %POS2%, alen, len);\n" :
 									"    double[] %TMP% = LibSpoofPrimitives.vectMatrixMult(%IN1%, %IN2%, %POS1%, %POS2%, %LEN%);\n";
 				case VECT_OUTERMULT_ADD:   
-					return sparse ? "    LibSpoofPrimitives.vectOuterMultAdd(%IN1v%, %IN2%, %OUT%, %IN1i%, %POS1%, %POS2%, %POSOUT%, alen, %LEN1%, %LEN2%);\n" :
+					return sparseLhs ? "    LibSpoofPrimitives.vectOuterMultAdd(%IN1v%, %IN2%, %OUT%, %IN1i%, %POS1%, %POS2%, %POSOUT%, alen, %LEN1%, %LEN2%);\n" :
 									"    LibSpoofPrimitives.vectOuterMultAdd(%IN1%, %IN2%, %OUT%, %POS1%, %POS2%, %POSOUT%, %LEN1%, %LEN2%);\n";
 				
 				//vector-scalar-add operations
@@ -96,10 +96,10 @@ public class CNodeBinary extends CNode
 				case VECT_GREATEREQUAL_ADD: {
 					String vectName = getVectorPrimitiveName();
 					if( scalarVector )
-						return sparse ? "    LibSpoofPrimitives.vect"+vectName+"Add(%IN1%, %IN2v%, %OUT%, %IN2i%, %POS2%, %POSOUT%, alen, %LEN%);\n" : 
+						return sparseLhs ? "    LibSpoofPrimitives.vect"+vectName+"Add(%IN1%, %IN2v%, %OUT%, %IN2i%, %POS2%, %POSOUT%, alen, %LEN%);\n" : 
 										"    LibSpoofPrimitives.vect"+vectName+"Add(%IN1%, %IN2%, %OUT%, %POS2%, %POSOUT%, %LEN%);\n";
 					else	
-						return sparse ? "    LibSpoofPrimitives.vect"+vectName+"Add(%IN1v%, %IN2%, %OUT%, %IN1i%, %POS1%, %POSOUT%, alen, %LEN%);\n" : 
+						return sparseLhs ? "    LibSpoofPrimitives.vect"+vectName+"Add(%IN1v%, %IN2%, %OUT%, %IN1i%, %POS1%, %POSOUT%, alen, %LEN%);\n" : 
 										"    LibSpoofPrimitives.vect"+vectName+"Add(%IN1%, %IN2%, %OUT%, %POS1%, %POSOUT%, %LEN%);\n";
 				}
 				
@@ -119,10 +119,10 @@ public class CNodeBinary extends CNode
 				case VECT_GREATEREQUAL_SCALAR: {
 					String vectName = getVectorPrimitiveName();
 					if( scalarVector )
-						return sparse ? "    double[] %TMP% = LibSpoofPrimitives.vect"+vectName+"Write(%IN1%, %IN2v%, %IN2i%, %POS2%, alen, %LEN%);\n" : 
+						return sparseLhs ? "    double[] %TMP% = LibSpoofPrimitives.vect"+vectName+"Write(%IN1%, %IN2v%, %IN2i%, %POS2%, alen, %LEN%);\n" : 
 										"    double[] %TMP% = LibSpoofPrimitives.vect"+vectName+"Write(%IN1%, %IN2%, %POS2%, %LEN%);\n";
 					else	
-						return sparse ? "    double[] %TMP% = LibSpoofPrimitives.vect"+vectName+"Write(%IN1v%, %IN2%, %IN1i%, %POS1%, alen, %LEN%);\n" : 
+						return sparseLhs ? "    double[] %TMP% = LibSpoofPrimitives.vect"+vectName+"Write(%IN1v%, %IN2%, %IN1i%, %POS1%, alen, %LEN%);\n" : 
 										"    double[] %TMP% = LibSpoofPrimitives.vect"+vectName+"Write(%IN1%, %IN2%, %POS1%, %LEN%);\n";
 				}
 				
@@ -130,7 +130,7 @@ public class CNodeBinary extends CNode
 					if( scalarInput )
 						return  "    double[] %TMP% = LibSpoofPrimitives.vectCBindWrite(%IN1%, %IN2%);\n";
 					else
-						return sparse ? 
+						return sparseLhs ? 
 								"    double[] %TMP% = LibSpoofPrimitives.vectCBindWrite(%IN1v%, %IN2%, %IN1i%, %POS1%, alen, %LEN%);\n" : 
 								"    double[] %TMP% = LibSpoofPrimitives.vectCBindWrite(%IN1%, %IN2%, %POS1%, %LEN%);\n";
 				
@@ -140,7 +140,7 @@ public class CNodeBinary extends CNode
 				case VECT_MINUS:
 				case VECT_PLUS:
 				case VECT_MIN:
-				case VECT_MAX:	
+				case VECT_MAX:
 				case VECT_EQUAL:
 				case VECT_NOTEQUAL:
 				case VECT_LESS:
@@ -148,8 +148,10 @@ public class CNodeBinary extends CNode
 				case VECT_GREATER:
 				case VECT_GREATEREQUAL: {
 					String vectName = getVectorPrimitiveName();
-					return sparse ? 
+					return sparseLhs ? 
 						"    double[] %TMP% = LibSpoofPrimitives.vect"+vectName+"Write(%IN1v%, %IN2%, %IN1i%, %POS1%, %POS2%, alen, %LEN%);\n" : 
+						   sparseRhs ?
+						"    double[] %TMP% = LibSpoofPrimitives.vect"+vectName+"Write(%IN1%, %IN2v%, %POS1%, %IN2i%, %POS2%, alen, %LEN%);\n" : 
 						"    double[] %TMP% = LibSpoofPrimitives.vect"+vectName+"Write(%IN1%, %IN2%, %POS1%, %POS2%, %LEN%);\n";
 				}
 				
@@ -269,14 +271,15 @@ public class CNodeBinary extends CNode
 		sb.append(_inputs.get(1).codegen(sparse));
 		
 		//generate binary operation (use sparse template, if data input)
-		boolean lsparse = sparse 
-			&& ((_inputs.get(0) instanceof CNodeData && _inputs.get(0).getVarname().startsWith("a"))
-			||(_inputs.get(1) instanceof CNodeData && _inputs.get(1).getVarname().startsWith("a")));
+		boolean lsparseLhs = sparse && _inputs.get(0) instanceof CNodeData 
+			&& _inputs.get(0).getVarname().startsWith("a");
+		boolean lsparseRhs = sparse && _inputs.get(1) instanceof CNodeData 
+			&& _inputs.get(1).getVarname().startsWith("a");	
 		boolean scalarInput = _inputs.get(0).getDataType().isScalar();
 		boolean scalarVector = (_inputs.get(0).getDataType().isScalar()
 			&& _inputs.get(1).getDataType().isMatrix());
 		String var = createVarname();
-		String tmp = _type.getTemplate(lsparse, scalarVector, scalarInput);
+		String tmp = _type.getTemplate(lsparseLhs, lsparseRhs, scalarVector, scalarInput);
 		tmp = tmp.replace("%TMP%", var);
 		
 		//replace input references and start indexes

http://git-wip-us.apache.org/repos/asf/systemml/blob/b6b67727/src/main/java/org/apache/sysml/runtime/codegen/LibSpoofPrimitives.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/codegen/LibSpoofPrimitives.java b/src/main/java/org/apache/sysml/runtime/codegen/LibSpoofPrimitives.java
index 6b4aad7..8444b5f 100644
--- a/src/main/java/org/apache/sysml/runtime/codegen/LibSpoofPrimitives.java
+++ b/src/main/java/org/apache/sysml/runtime/codegen/LibSpoofPrimitives.java
@@ -183,6 +183,11 @@ public class LibSpoofPrimitives
 		return c;
 	}
 	
+	public static double[] vectMultWrite(double[] a, double[] b, int ai, int[] bix, int bi, int blen, int len) {
+		//invariant to the ordering of inputs
+		return vectMultWrite(b, a, bix, ai, bi, blen, len);
+	}
+	
 	public static void vectWrite(double[] a, double[] c, int ci, int len) {
 		if( a == null ) return;
 		System.arraycopy(a, 0, c, ci, len);
@@ -345,6 +350,18 @@ public class LibSpoofPrimitives
 		return c;
 	}
 	
+	public static double[] vectDivWrite(double[] a, double[] b, int ai, int[] bix, int bi, int blen, int len) {
+		double[] c = allocVector(len, false);
+		for( int j = 0; j < len; j++ ) {
+			double aval = a[bi + j];
+			c[j] = (aval==0) ? Double.NaN : (aval>0) ? 
+				Double.POSITIVE_INFINITY : Double.NEGATIVE_INFINITY;
+		}
+		for( int j = bi; j < bi+blen; j++ )
+			c[bix[j]] = a[ai+bix[j]] / b[j];
+		return c;
+	}
+	
 	//custom vector minus
 	
 	public static void vectMinusAdd(double[] a, double bval, double[] c, int ai, int ci, int len) {
@@ -417,6 +434,14 @@ public class LibSpoofPrimitives
 		return c;
 	}
 	
+	public static double[] vectMinusWrite(double[] a, double[] b, int ai, int[] bix, int bi, int blen, int len) {
+		double[] c = allocVector(len, false);
+		System.arraycopy(a, ai, c, 0, len);
+		for( int j = bi; j < bi+blen; j++ )
+			c[bix[j]] -= b[j];
+		return c;
+	}
+	
 	//custom vector plus
 	
 	public static void vectPlusAdd(double[] a, double bval, double[] c, int ai, int ci, int len) {
@@ -475,6 +500,11 @@ public class LibSpoofPrimitives
 		return c;
 	}
 	
+	public static double[] vectPlusWrite(double[] a, double[] b, int ai, int[] bix, int bi, int blen, int len) {
+		//invariant to the ordering of inputs
+		return vectPlusWrite(b, a, bix, bi, ai, blen, len);
+	}
+	
 	//custom vector pow
 	
 	public static void vectPowAdd(double[] a, double bval, double[] c, int ai, int ci, int len) {
@@ -600,6 +630,11 @@ public class LibSpoofPrimitives
 		return c;
 	}
 	
+	public static double[] vectMinWrite(double[] a, double[] b, int ai, int[] bix, int bi, int blen, int len) {
+		//invariant to the ordering of inputs
+		return vectMinWrite(b, a, bix, bi, ai, blen, len);
+	}
+	
 	//custom vector max
 	
 	public static void vectMaxAdd(double[] a, double bval, double[] c, int ai, int ci, int len) {
@@ -661,6 +696,11 @@ public class LibSpoofPrimitives
 			c[aix[j]] = Math.max(a[j], b[bi + aix[j]]);
 		return c;
 	}
+	
+	public static double[] vectMaxWrite(double[] a, double[] b, int ai, int[] bix, int bi, int blen, int len) {
+		//invariant to the ordering of inputs
+		return vectMaxWrite(b, a, bix, bi, ai, blen, len);
+	}
 
 	//custom exp
 	
@@ -1385,6 +1425,11 @@ public class LibSpoofPrimitives
 		return c;
 	}
 	
+	public static double[] vectEqualWrite(double[] a, double[] b, int ai, int[] bix, int bi, int blen, int len) {
+		//invariant to the ordering of inputs
+		return vectEqualWrite(b, a, bix, bi, ai, blen, len);
+	}
+	
 	//custom vector not equal
 	
 	public static void vectNotequalAdd(double[] a, double bval, double[] c, int ai, int ci, int len) {
@@ -1448,6 +1493,11 @@ public class LibSpoofPrimitives
 		return c;
 	}
 	
+	public static double[] vectNotequalWrite(double[] a, double[] b, int ai, int[] bix, int bi, int blen, int len) {
+		//invariant to the ordering of inputs
+		return vectNotequalWrite(b, a, bix, bi, ai, blen, len);
+	}
+	
 	//custom vector less
 	
 	public static void vectLessAdd(double[] a, double bval, double[] c, int ai, int ci, int len) {
@@ -1511,6 +1561,11 @@ public class LibSpoofPrimitives
 		return c;
 	}
 	
+	public static double[] vectLessWrite(double[] a, double[] b, int ai, int[] bix, int bi, int blen, int len) {
+		//invariant to the ordering of inputs
+		return vectGreaterequalWrite(b, a, bix, bi, ai, blen, len);
+	}
+	
 	//custom vector less equal
 	
 	public static void vectLessequalAdd(double[] a, double bval, double[] c, int ai, int ci, int len) {
@@ -1573,6 +1628,11 @@ public class LibSpoofPrimitives
 			c[aix[j]] = (a[j] <= b[bi+aix[j]]) ? 1 : 0;
 		return c;
 	}
+	
+	public static double[] vectLessequalWrite(double[] a, double[] b, int ai, int[] bix, int bi, int blen, int len) {
+		//invariant to the ordering of inputs
+		return vectGreaterWrite(b, a, bix, bi, ai, blen, len);
+	}
 
 	//custom vector greater
 	
@@ -1637,6 +1697,11 @@ public class LibSpoofPrimitives
 		return c;
 	}
 	
+	public static double[] vectGreaterWrite(double[] a, double[] b, int ai, int[] bix, int bi, int blen, int len) {
+		//invariant to the ordering of inputs
+		return vectLessequalWrite(b, a, bix, bi, ai, blen, len);
+	}
+	
 	//custom vector greater equal
 	
 	public static void vectGreaterequalAdd(double[] a, double bval, double[] c, int ai, int ci, int len) {
@@ -1700,6 +1765,11 @@ public class LibSpoofPrimitives
 		return c;
 	}
 	
+	public static double[] vectGreaterequalWrite(double[] a, double[] b, int ai, int[] bix, int bi, int blen, int len) {
+		//invariant to the ordering of inputs
+		return vectLessWrite(b, a, bix, bi, ai, blen, len);
+	}
+	
 	//complex builtin functions that are not directly generated
 	//(included here in order to reduce the number of imports)
 	


[44/50] [abbrv] systemml git commit: [MINOR][SYSTEMML-1979] Fix codegen plan enumeration (cost bound)

Posted by re...@apache.org.
[MINOR][SYSTEMML-1979] Fix codegen plan enumeration (cost bound)

A recent fix re-enabled the structural pruning in the codegen optimizer.
So far, we passed the current cost bound into recursive calls for
conditionally independent subproblems. In rare cases, this led to
invalid pruning because the subproblem is solved with a constant
assignment for points not included in the subproblem.
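
For context, a hedged, self-contained sketch of such a linearized plan
enumeration is shown below (with a toy cost function rather than SystemML's
costing); the relevant point is that the best-cost bound is initialized
locally per invocation instead of being inherited from the caller:

import java.util.function.ToDoubleFunction;

public class EnumPlansSketch {
    static boolean[] enumPlans(int n, ToDoubleFunction<boolean[]> costFn) {
        long len = 1L << n;              // linearized search space of 2^n plans
        boolean[] bestPlan = null;
        double bestC = Double.MAX_VALUE; // local upper bound, not passed in
        for (long i = 0; i < len; i++) {
            boolean[] plan = new boolean[n];
            for (int j = 0; j < n; j++)  // decode index i into a plan assignment
                plan[j] = ((i >>> j) & 1) == 1;
            double c = costFn.applyAsDouble(plan);
            if (c < bestC) { bestC = c; bestPlan = plan; }
        }
        return bestPlan;
    }

    public static void main(String[] args) {
        // toy cost: materializing the first point is cheap, the others are not
        boolean[] best = enumPlans(3,
            p -> (p[0] ? 1 : 5) + (p[1] ? 2 : 0) + (p[2] ? 2 : 0));
        System.out.println(java.util.Arrays.toString(best)); // [true, false, false]
    }
}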


Project: http://git-wip-us.apache.org/repos/asf/systemml/repo
Commit: http://git-wip-us.apache.org/repos/asf/systemml/commit/381d1d6a
Tree: http://git-wip-us.apache.org/repos/asf/systemml/tree/381d1d6a
Diff: http://git-wip-us.apache.org/repos/asf/systemml/diff/381d1d6a

Branch: refs/heads/master
Commit: 381d1d6a9ac356d4b834963c71f8872837acf35e
Parents: d907efc
Author: Matthias Boehm <mb...@gmail.com>
Authored: Mon Oct 30 22:13:33 2017 -0700
Committer: Matthias Boehm <mb...@gmail.com>
Committed: Mon Oct 30 22:13:33 2017 -0700

----------------------------------------------------------------------
 .../hops/codegen/opt/PlanSelectionFuseCostBasedV2.java  | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/systemml/blob/381d1d6a/src/main/java/org/apache/sysml/hops/codegen/opt/PlanSelectionFuseCostBasedV2.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/hops/codegen/opt/PlanSelectionFuseCostBasedV2.java b/src/main/java/org/apache/sysml/hops/codegen/opt/PlanSelectionFuseCostBasedV2.java
index 4d8a7bc..1f670b3 100644
--- a/src/main/java/org/apache/sysml/hops/codegen/opt/PlanSelectionFuseCostBasedV2.java
+++ b/src/main/java/org/apache/sysml/hops/codegen/opt/PlanSelectionFuseCostBasedV2.java
@@ -160,8 +160,8 @@ public class PlanSelectionFuseCostBasedV2 extends PlanSelection
 			}
 
 			//enumerate and cost plans, returns optional plan
-			boolean[] bestPlan = enumPlans(memo, part, costs, rgraph, 
-					part.getMatPointsExt(), 0, Double.MAX_VALUE);
+			boolean[] bestPlan = enumPlans(memo, part,
+				costs, rgraph, part.getMatPointsExt(), 0);
 			
 			//prune memo table wrt best plan and select plans
 			HashSet<Long> visited = new HashSet<>();
@@ -194,17 +194,17 @@ public class PlanSelectionFuseCostBasedV2 extends PlanSelection
 	 * @param rgraph reachability graph of interesting materialization points
 	 * @param matPoints sorted materialization points (defined the search space)
 	 * @param off offset for recursive invocation, indicating the fixed plan part
-	 * @param bestC currently known best plan costs (used of upper bound)
 	 * @return optimal assignment of materialization points
 	 */
 	private static boolean[] enumPlans(CPlanMemoTable memo, PlanPartition part, StaticCosts costs, 
-		ReachabilityGraph rgraph, InterestingPoint[] matPoints, int off, double bestC)
+		ReachabilityGraph rgraph, InterestingPoint[] matPoints, int off)
 	{
 		//scan linearized search space, w/ skips for branch and bound pruning
 		//and structural pruning (where we solve conditionally independent problems)
 		//bestC is monotonically non-increasing and serves as the upper bound
-		long len = UtilFunctions.pow(2, matPoints.length-off);
+		final long len = UtilFunctions.pow(2, matPoints.length-off);
 		boolean[] bestPlan = null;
+		double bestC = Double.MAX_VALUE;
 		long numEvalPlans = 0, numEvalPartPlans = 0;
 		
 		for( long i=0; i<len; i++ ) {
@@ -227,7 +227,7 @@ public class PlanSelectionFuseCostBasedV2 extends PlanSelection
 					if( LOG.isTraceEnabled() )
 						LOG.trace("Enum: Subproblem "+(j+1)+"/"+prob.length+": "+prob[j]);
 					boolean[] bestTmp = enumPlans(memo, part, 
-						costs, null, prob[j].freeMat, prob[j].offset, bestC);
+						costs, null, prob[j].freeMat, prob[j].offset);
 					LibSpoofPrimitives.vectWrite(bestTmp, plan, prob[j].freePos);
 				}
 				


[25/50] [abbrv] systemml git commit: [SYSTEMML-1903, 1968] Fix codegen row templates w/ partial unknowns

Posted by re...@apache.org.
[SYSTEMML-1903,1968] Fix codegen row templates w/ partial unknowns

After recent codegen optimizer changes, GLM was failing during initial
compilation when used through JMLC. The root cause was incorrect handling
of partial unknowns, which led to vector operations even though the output
was known to be scalar. 


Project: http://git-wip-us.apache.org/repos/asf/systemml/repo
Commit: http://git-wip-us.apache.org/repos/asf/systemml/commit/8f4ecdce
Tree: http://git-wip-us.apache.org/repos/asf/systemml/tree/8f4ecdce
Diff: http://git-wip-us.apache.org/repos/asf/systemml/diff/8f4ecdce

Branch: refs/heads/master
Commit: 8f4ecdce23780a4b820cb79865322d05ba1b9411
Parents: 70ab072
Author: Matthias Boehm <mb...@gmail.com>
Authored: Tue Oct 24 20:39:22 2017 -0700
Committer: Matthias Boehm <mb...@gmail.com>
Committed: Tue Oct 24 20:39:22 2017 -0700

----------------------------------------------------------------------
 .../org/apache/sysml/hops/codegen/template/TemplateRow.java     | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/systemml/blob/8f4ecdce/src/main/java/org/apache/sysml/hops/codegen/template/TemplateRow.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/hops/codegen/template/TemplateRow.java b/src/main/java/org/apache/sysml/hops/codegen/template/TemplateRow.java
index 9da04dc..e14fbd3 100644
--- a/src/main/java/org/apache/sysml/hops/codegen/template/TemplateRow.java
+++ b/src/main/java/org/apache/sysml/hops/codegen/template/TemplateRow.java
@@ -361,8 +361,9 @@ public class TemplateRow extends TemplateBase
 			// if one input is a matrix then we need to do vector by scalar operations
 			if( (hop.getInput().get(0).getDim1() > 1 && hop.getInput().get(0).getDim2() > 1)
 				|| (hop.getInput().get(1).getDim1() > 1 && hop.getInput().get(1).getDim2() > 1)
-				|| (!(hop.dimsKnown() && hop.getInput().get(0).dimsKnown() && hop.getInput().get(1).dimsKnown()) 
-						&& (cdata1.getDataType().isMatrix() || cdata2.getDataType().isMatrix())))
+				|| (!(hop.dimsKnown() && hop.getInput().get(0).dimsKnown() && hop.getInput().get(1).dimsKnown())
+					&& (hop.getDim2() != 1) //not a known vector output
+					&& (cdata1.getDataType().isMatrix() || cdata2.getDataType().isMatrix())))
 			{
 				if( HopRewriteUtils.isBinary(hop, SUPPORTED_VECT_BINARY) ) {
 					if( TemplateUtils.isMatrix(cdata1) && (TemplateUtils.isMatrix(cdata2) 


[41/50] [abbrv] systemml git commit: [SYSTEMML-1693] More aggressive function inlining after rewrites

Posted by re...@apache.org.
[SYSTEMML-1693] More aggressive function inlining after rewrites

This patch extends the new IPA pass for function inlining after
rewrites. In addition to inlining small functions, we now also inline
functions that are called once, independent of their size. This is safe
with regard to code size, as inlining a single call site can only decrease
the total code size.
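
For illustration, the extended inlining condition can be summarized by the
following hedged sketch (method names and the threshold value are
placeholders, not the actual SystemML constants):

public class InlineDecisionSketch {
    static final int INLINING_MAX_NUM_OPS = 10; // illustrative threshold

    static boolean inline(boolean lastLevelBlock, boolean containsFunctionCalls,
        int numOps, int numCallSites)
    {
        return lastLevelBlock && !containsFunctionCalls
            && (numCallSites == 1 || numOps <= INLINING_MAX_NUM_OPS);
    }

    public static void main(String[] args) {
        System.out.println(inline(true, false, 50, 1)); // true: large, but called once
        System.out.println(inline(true, false, 50, 2)); // false: large and called twice
    }
}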


Project: http://git-wip-us.apache.org/repos/asf/systemml/repo
Commit: http://git-wip-us.apache.org/repos/asf/systemml/commit/2896f331
Tree: http://git-wip-us.apache.org/repos/asf/systemml/tree/2896f331
Diff: http://git-wip-us.apache.org/repos/asf/systemml/diff/2896f331

Branch: refs/heads/master
Commit: 2896f3316099241b8074615929424e51cf877d4a
Parents: d75a669
Author: Matthias Boehm <mb...@gmail.com>
Authored: Sun Oct 29 17:19:49 2017 -0700
Committer: Matthias Boehm <mb...@gmail.com>
Committed: Sun Oct 29 17:19:49 2017 -0700

----------------------------------------------------------------------
 .../sysml/hops/ipa/IPAPassInlineFunctions.java  |  8 +++-
 .../functions/misc/IPAFunctionInliningTest.java | 20 ++++++++--
 .../scripts/functions/misc/IPAFunInline5.dml    | 40 ++++++++++++++++++++
 3 files changed, 63 insertions(+), 5 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/systemml/blob/2896f331/src/main/java/org/apache/sysml/hops/ipa/IPAPassInlineFunctions.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/hops/ipa/IPAPassInlineFunctions.java b/src/main/java/org/apache/sysml/hops/ipa/IPAPassInlineFunctions.java
index 0527a10..c7ee3e4 100644
--- a/src/main/java/org/apache/sysml/hops/ipa/IPAPassInlineFunctions.java
+++ b/src/main/java/org/apache/sysml/hops/ipa/IPAPassInlineFunctions.java
@@ -54,14 +54,18 @@ public class IPAPassInlineFunctions extends IPAPass
 	public void rewriteProgram( DMLProgram prog, FunctionCallGraph fgraph, FunctionCallSizeInfo fcallSizes ) 
 		throws HopsException
 	{
+		//NOTE: we inline single-statement-block (i.e., last-level block) functions
+		//that do not contain other functions, and either are small or called once
+		
 		for( String fkey : fgraph.getReachableFunctions() ) {
 			FunctionStatementBlock fsb = prog.getFunctionStatementBlock(fkey);
 			FunctionStatement fstmt = (FunctionStatement)fsb.getStatement(0);
 			if( fstmt.getBody().size() == 1 
 				&& HopRewriteUtils.isLastLevelStatementBlock(fstmt.getBody().get(0)) 
 				&& !containsFunctionOp(fstmt.getBody().get(0).get_hops())
-				&& countOperators(fstmt.getBody().get(0).get_hops()) 
-					<= InterProceduralAnalysis.INLINING_MAX_NUM_OPS )
+				&& (fgraph.getFunctionCalls(fkey).size() == 1
+					|| countOperators(fstmt.getBody().get(0).get_hops()) 
+						<= InterProceduralAnalysis.INLINING_MAX_NUM_OPS) )
 			{
 				if( LOG.isDebugEnabled() )
 					LOG.debug("IPA: Inline function '"+fkey+"'");

http://git-wip-us.apache.org/repos/asf/systemml/blob/2896f331/src/test/java/org/apache/sysml/test/integration/functions/misc/IPAFunctionInliningTest.java
----------------------------------------------------------------------
diff --git a/src/test/java/org/apache/sysml/test/integration/functions/misc/IPAFunctionInliningTest.java b/src/test/java/org/apache/sysml/test/integration/functions/misc/IPAFunctionInliningTest.java
index f58d400..39f79c9 100644
--- a/src/test/java/org/apache/sysml/test/integration/functions/misc/IPAFunctionInliningTest.java
+++ b/src/test/java/org/apache/sysml/test/integration/functions/misc/IPAFunctionInliningTest.java
@@ -33,8 +33,10 @@ public class IPAFunctionInliningTest extends AutomatedTestBase
 {
 	private final static String TEST_NAME1 = "IPAFunInline1"; //pos 1
 	private final static String TEST_NAME2 = "IPAFunInline2"; //pos 2
-	private final static String TEST_NAME3 = "IPAFunInline3"; //neg 1 (too large)
-	private final static String TEST_NAME4 = "IPAFunInline4"; //neg 2 (control flow)
+	private final static String TEST_NAME3 = "IPAFunInline3"; //pos 3 (large but called once)
+	private final static String TEST_NAME4 = "IPAFunInline4"; //neg 1 (control flow)
+	private final static String TEST_NAME5 = "IPAFunInline5"; //neg 2 (large and called twice)
+	
 	
 	private final static String TEST_DIR = "functions/misc/";
 	private final static String TEST_CLASS_DIR = TEST_DIR + IPAFunctionInliningTest.class.getSimpleName() + "/";
@@ -46,6 +48,7 @@ public class IPAFunctionInliningTest extends AutomatedTestBase
 		addTestConfiguration( TEST_NAME2, new TestConfiguration(TEST_CLASS_DIR, TEST_NAME2, new String[] { "R" }) );
 		addTestConfiguration( TEST_NAME3, new TestConfiguration(TEST_CLASS_DIR, TEST_NAME3, new String[] { "R" }) );
 		addTestConfiguration( TEST_NAME4, new TestConfiguration(TEST_CLASS_DIR, TEST_NAME4, new String[] { "R" }) );
+		addTestConfiguration( TEST_NAME5, new TestConfiguration(TEST_CLASS_DIR, TEST_NAME5, new String[] { "R" }) );
 	}
 
 	@Test
@@ -69,6 +72,11 @@ public class IPAFunctionInliningTest extends AutomatedTestBase
 	}
 	
 	@Test
+	public void testFunInline5NoIPA() {
+		runIPAFunInlineTest( TEST_NAME5, false );
+	}
+	
+	@Test
 	public void testFunInline1IPA() {
 		runIPAFunInlineTest( TEST_NAME1, true );
 	}
@@ -88,6 +96,11 @@ public class IPAFunctionInliningTest extends AutomatedTestBase
 		runIPAFunInlineTest( TEST_NAME4, true );
 	}
 	
+	@Test
+	public void testFunInline5IPA() {
+		runIPAFunInlineTest( TEST_NAME5, true );
+	}
+	
 	private void runIPAFunInlineTest( String testName, boolean IPA )
 	{
 		boolean oldFlagIPA = OptimizerUtils.ALLOW_INTER_PROCEDURAL_ANALYSIS;
@@ -112,7 +125,8 @@ public class IPAFunctionInliningTest extends AutomatedTestBase
 			Assert.assertTrue("Wrong result: 7 vs "+val, Math.abs(val-7)<Math.pow(10, -14));
 			
 			//compare inlined functions
-			boolean inlined = ( IPA && (testName.equals(TEST_NAME1) || testName.equals(TEST_NAME2)) );
+			boolean inlined = ( IPA && (testName.equals(TEST_NAME1) 
+				|| testName.equals(TEST_NAME2) || testName.equals(TEST_NAME3)) );
 			Assert.assertTrue("Unexpected function call: "+inlined, !heavyHittersContainsSubString("foo")==inlined);
 		}
 		finally {

http://git-wip-us.apache.org/repos/asf/systemml/blob/2896f331/src/test/scripts/functions/misc/IPAFunInline5.dml
----------------------------------------------------------------------
diff --git a/src/test/scripts/functions/misc/IPAFunInline5.dml b/src/test/scripts/functions/misc/IPAFunInline5.dml
new file mode 100644
index 0000000..885468c
--- /dev/null
+++ b/src/test/scripts/functions/misc/IPAFunInline5.dml
@@ -0,0 +1,40 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+# 
+#   http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+
+foo = function(Matrix[Double] A, Integer type) return (Matrix[Double] B) {
+  if( type==1 ) {
+    C = (A * A * A) / 3 + 2;
+    D = (A^2 + A^2 + 7) * A;
+    E = min(C, D)
+    B = ((E != 0) * A) * A * A;
+  }
+  else {
+    B = A - 0.1;
+  } 
+}
+
+X = matrix(0.1, rows=100, cols=10);
+Y = foo(X, 1);
+Z = foo(X, 1);
+z = as.matrix((sum(Y)+sum(Z))/2*7);
+
+write(z, $1);


[40/50] [abbrv] systemml git commit: [MINOR] Fix consistency task partitioning in mm, mmchain, codegen row

Posted by re...@apache.org.
[MINOR] Fix consistency task partitioning in mm, mmchain, codegen row
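
The following self-contained sketch mirrors the getAlignedBlockSizes helper
added in the diff below and shows the resulting partitioning on a small
example (the surrounding class and main method are illustrative only):

import java.util.ArrayList;

public class BlockSizeSketch {
    // split len rows into blocks whose length is rounded up to a multiple of
    // 'align'; the last block simply takes whatever remains
    static ArrayList<Integer> getAlignedBlockSizes(int len, int k, int align) {
        int blklen = (int) Math.ceil((double) len / k);
        blklen += (blklen % align != 0) ? align - blklen % align : 0;
        ArrayList<Integer> ret = new ArrayList<>();
        for (int i = 0; i < len; i += blklen)
            ret.add(Math.min(blklen, len - i));
        return ret;
    }

    public static void main(String[] args) {
        // 1000 rows, 4 threads, blocks aligned to 32 rows -> [256, 256, 256, 232]
        System.out.println(getAlignedBlockSizes(1000, 4, 32));
    }
}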

Project: http://git-wip-us.apache.org/repos/asf/systemml/repo
Commit: http://git-wip-us.apache.org/repos/asf/systemml/commit/d75a669a
Tree: http://git-wip-us.apache.org/repos/asf/systemml/tree/d75a669a
Diff: http://git-wip-us.apache.org/repos/asf/systemml/diff/d75a669a

Branch: refs/heads/master
Commit: d75a669a46381a0a5b54109e7b207613e17ab54e
Parents: 06d5bb0
Author: Matthias Boehm <mb...@gmail.com>
Authored: Sun Oct 29 16:06:55 2017 -0700
Committer: Matthias Boehm <mb...@gmail.com>
Committed: Sun Oct 29 16:07:05 2017 -0700

----------------------------------------------------------------------
 .../sysml/runtime/codegen/SpoofRowwise.java     | 19 +++----
 .../runtime/matrix/data/LibMatrixMult.java      | 56 ++++++++++++--------
 2 files changed, 42 insertions(+), 33 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/systemml/blob/d75a669a/src/main/java/org/apache/sysml/runtime/codegen/SpoofRowwise.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/codegen/SpoofRowwise.java b/src/main/java/org/apache/sysml/runtime/codegen/SpoofRowwise.java
index 9d5675b..b0afd88 100644
--- a/src/main/java/org/apache/sysml/runtime/codegen/SpoofRowwise.java
+++ b/src/main/java/org/apache/sysml/runtime/codegen/SpoofRowwise.java
@@ -39,7 +39,6 @@ import org.apache.sysml.runtime.matrix.data.MatrixBlock;
 import org.apache.sysml.runtime.matrix.data.SparseBlock;
 import org.apache.sysml.runtime.matrix.data.SparseRow;
 import org.apache.sysml.runtime.matrix.data.SparseRowVector;
-import org.apache.sysml.runtime.util.UtilFunctions;
 
 
 public abstract class SpoofRowwise extends SpoofOperator
@@ -198,11 +197,9 @@ public abstract class SpoofRowwise extends SpoofOperator
 		
 		//core parallel execute
 		ExecutorService pool = Executors.newFixedThreadPool( k );
-		int nk = (a instanceof CompressedMatrixBlock) ? k :
-			UtilFunctions.roundToNext(Math.min(8*k,m/32), k);
-		int blklen = (int)(Math.ceil((double)m/nk));
-		if( a instanceof CompressedMatrixBlock )
-			blklen = BitmapEncoder.getAlignedBlocksize(blklen);
+		ArrayList<Integer> blklens = (a instanceof CompressedMatrixBlock) ?
+			LibMatrixMult.getAlignedBlockSizes(m, k, BitmapEncoder.BITMAP_BLOCK_SZ) :
+			LibMatrixMult.getBalancedBlockSizesDefault(m, k, false);
 		
 		try
 		{
@@ -210,9 +207,9 @@ public abstract class SpoofRowwise extends SpoofOperator
 				//execute tasks
 				ArrayList<ParColAggTask> tasks = new ArrayList<>();
 				int outLen = out.getNumRows() * out.getNumColumns();
-				for( int i=0; i<nk & i*blklen<m; i++ )
-					tasks.add(new ParColAggTask(a, b, scalars, n, n2, outLen, i*blklen, Math.min((i+1)*blklen, m)));
-				List<Future<double[]>> taskret = pool.invokeAll(tasks);	
+				for( int i=0, lb=0; i<blklens.size(); lb+=blklens.get(i), i++ )
+					tasks.add(new ParColAggTask(a, b, scalars, n, n2, outLen, lb, lb+blklens.get(i)));
+				List<Future<double[]>> taskret = pool.invokeAll(tasks);
 				//aggregate partial results
 				int len = _type.isColumnAgg() ? out.getNumRows()*out.getNumColumns() : 1;
 				for( Future<double[]> task : taskret )
@@ -222,8 +219,8 @@ public abstract class SpoofRowwise extends SpoofOperator
 			else {
 				//execute tasks
 				ArrayList<ParExecTask> tasks = new ArrayList<>();
-				for( int i=0; i<nk & i*blklen<m; i++ )
-					tasks.add(new ParExecTask(a, b, out, scalars, n, n2, i*blklen, Math.min((i+1)*blklen, m)));
+				for( int i=0, lb=0; i<blklens.size(); lb+=blklens.get(i), i++ )
+					tasks.add(new ParExecTask(a, b, out, scalars, n, n2, lb, lb+blklens.get(i)));
 				List<Future<Long>> taskret = pool.invokeAll(tasks);
 				//aggregate nnz, no need to aggregate results
 				long nnz = 0;

http://git-wip-us.apache.org/repos/asf/systemml/blob/d75a669a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixMult.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixMult.java b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixMult.java
index 684f327..a1f648e 100644
--- a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixMult.java
+++ b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixMult.java
@@ -209,8 +209,7 @@ public class LibMatrixMult
 		try {
 			ExecutorService pool = Executors.newFixedThreadPool( k );
 			ArrayList<MatrixMultTask> tasks = new ArrayList<>();
-			int nk = (pm2r||pm2c) ? k : UtilFunctions.roundToNext(Math.min(8*k,num/32), k);
-			ArrayList<Integer> blklens = getBalancedBlockSizes(num, nk);
+			ArrayList<Integer> blklens = getBalancedBlockSizesDefault(num, k, (pm2r||pm2c));
 			for( int i=0, lb=0; i<blklens.size(); lb+=blklens.get(i), i++ )
 				tasks.add(new MatrixMultTask(m1, m2, ret, tm2, pm2r, pm2c, lb, lb+blklens.get(i)));
 			//execute tasks
@@ -261,7 +260,7 @@ public class LibMatrixMult
 		}
 
 		//Timing time = new Timing(true);
-				
+		
 		//pre-processing: output allocation
 		ret.sparse = false;
 		ret.allocateDenseBlock();
@@ -312,7 +311,7 @@ public class LibMatrixMult
 		}
 		
 		//Timing time = new Timing(true);
-				
+		
 		//pre-processing (no need to check isThreadSafe)
 		ret.sparse = false;
 		ret.allocateDenseBlock();
@@ -321,11 +320,10 @@ public class LibMatrixMult
 		//(currently: always parallelization over number of rows)
 		try {
 			ExecutorService pool = Executors.newFixedThreadPool( k );
+			ArrayList<Integer> blklens = getBalancedBlockSizesDefault(mX.rlen, k, true);
 			ArrayList<MatrixMultChainTask> tasks = new ArrayList<>();
-			int blklen = (int)(Math.ceil((double)mX.rlen/k));
-			blklen += (blklen%24 != 0)?24-blklen%24:0;
-			for( int i=0; i<k & i*blklen<mX.rlen; i++ )
-				tasks.add(new MatrixMultChainTask(mX, mV, mW, ct, i*blklen, Math.min((i+1)*blklen, mX.rlen)));
+			for( int i=0, lb=0; i<blklens.size(); lb+=blklens.get(i), i++ )
+				tasks.add(new MatrixMultChainTask(mX, mV, mW, ct, lb, lb+blklens.get(i)));
 			//execute tasks
 			List<Future<double[]>> taskret = pool.invokeAll(tasks);	
 			pool.shutdown();
@@ -1606,10 +1604,18 @@ public class LibMatrixMult
 		final int blocksizeI = 24; // constraint: factor of 4
 		final int blocksizeJ = 1024;
 		double[] tmp = new double[blocksizeI];
+		final int bn = (ru-rl) % blocksizeI;
+		
+		//compute rest (not aligned to blocksize)
+		for( int i=rl, aix=rl*cd; i < rl+bn; i++, aix+=cd ) {
+			double val = dotProduct(a, b, aix, 0, cd);
+			val *= (weights) ? w[i] : 1;
+			val -= (weights2) ? w[i] : 0;
+			vectMultiplyAdd(val, a, c, aix, 0, cd);
+		}
 		
 		//blockwise mmchain computation
-		final int bn = ru - ru % blocksizeI; //rl blocksize aligned
-		for( int bi=rl; bi < bn; bi+=blocksizeI ) 
+		for( int bi=rl+bn; bi < ru; bi+=blocksizeI ) 
 		{
 			//compute 1st matrix-vector for row block
 			Arrays.fill(tmp, 0);
@@ -1621,10 +1627,10 @@ public class LibMatrixMult
 			
 			//multiply/subtract weights (in-place), if required
 			if( weights ) 
-				vectMultiply(w, tmp, bi, 0, blocksizeI);	
+				vectMultiply(w, tmp, bi, 0, blocksizeI);
 			else if( weights2 )
 				vectSubtract(w, tmp, bi, 0, blocksizeI);
-				
+			
 			//compute 2nd matrix vector for row block and aggregate
 			for( int bj = 0; bj<cd; bj+=blocksizeJ ) {
 				int bjmin = Math.min(cd-bj, blocksizeJ);
@@ -1633,14 +1639,6 @@ public class LibMatrixMult
 						a, c, aix, aix+cd, aix+2*cd, aix+3*cd, bj, bjmin);
 			}
 		}
-		
-		//compute rest (not aligned to blocksize)
-		for( int i=bn, aix=bn*cd; i < ru; i++, aix+=cd ) {
-			double val = dotProduct(a, b, aix, 0, cd);
-			val *= (weights) ? w[i] : 1;
-			val -= (weights2) ? w[i] : 0;
-			vectMultiplyAdd(val, a, c, aix, 0, cd);
-		}
 	}
 
 	private static void matrixMultChainSparse(MatrixBlock mX, MatrixBlock mV, MatrixBlock mW, MatrixBlock ret, ChainType ct, int rl, int ru) 
@@ -3578,9 +3576,9 @@ public class LibMatrixMult
 	
 	public static boolean checkParColumnAgg(MatrixBlock m1, int k, boolean inclFLOPs) {
 		return (8L * m1.clen * k <= MEM_OVERHEAD_THRESHOLD 
-			&& (!inclFLOPs || 4L * m1.rlen * m1.clen >= PAR_MINFLOP_THRESHOLD));
+			&& (!inclFLOPs || 4L * m1.rlen * m1.clen / (m1.sparse?2:1) >= PAR_MINFLOP_THRESHOLD));
 	}
-
+	
 	private static boolean checkParMatrixMultRightInputRows( MatrixBlock m1, MatrixBlock m2, int k ) {
 		//parallelize over rows in rhs matrix if number of rows in lhs/output is very small
 		return (m1.rlen==1 && LOW_LEVEL_OPTIMIZATION && m2.clen>1 && !(m1.isUltraSparse()||m2.isUltraSparse()))
@@ -3676,6 +3674,20 @@ public class LibMatrixMult
 		
 	}
 
+	public static ArrayList<Integer> getBalancedBlockSizesDefault(int len, int k, boolean constK) {
+		int nk = constK ? k : UtilFunctions.roundToNext(Math.min(8*k,len/32), k);
+		return getBalancedBlockSizes(len, nk);
+	}
+	
+	public static ArrayList<Integer> getAlignedBlockSizes(int len, int k, int align) {
+		int blklen = (int)(Math.ceil((double)len/k));
+		blklen += ((blklen%align != 0) ? align-blklen%align : 0);
+		ArrayList<Integer> ret = new ArrayList<>();
+		for(int i=0; i<len; i+=blklen)
+			ret.add(Math.min(blklen, len-i));
+		return ret;
+	}
+	
 	private static ArrayList<Integer> getBalancedBlockSizes(int len, int k) {
 		ArrayList<Integer> ret = new ArrayList<>();
 		int base = len / k;
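
For reference, the new block-sizing helpers added above partition a row range into per-task blocks; a minimal usage sketch (hypothetical length/thread counts and a hypothetical processRows callback, not part of the patch) is:

	// split 1000 rows over 4 tasks, block length rounded up to a multiple of 24
	ArrayList<Integer> blklens = LibMatrixMult.getAlignedBlockSizes(1000, 4, 24); // -> [264, 264, 264, 208]
	for( int i=0, lb=0; i<blklens.size(); lb+=blklens.get(i), i++ )
		processRows(lb, lb + blklens.get(i)); // row range [lb, lb+blklen)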


[08/50] [abbrv] systemml git commit: [SYSTEMML-1903] Fix codegen row candidate exploration w/ row indexing

Posted by re...@apache.org.
[SYSTEMML-1903] Fix codegen row candidate exploration w/ row indexing

For Autoencoder w/ batch=512, the codegen fuse-all heuristic was failing
on cplan construction due to an unknown row type. The root cause was an
invalid partial fusion plan of type row that covered the row batching
from the overall dataset, although the row template only supports
column-range indexing (i.e., indexing that applies to all rows).

Project: http://git-wip-us.apache.org/repos/asf/systemml/repo
Commit: http://git-wip-us.apache.org/repos/asf/systemml/commit/3a7f38e5
Tree: http://git-wip-us.apache.org/repos/asf/systemml/tree/3a7f38e5
Diff: http://git-wip-us.apache.org/repos/asf/systemml/diff/3a7f38e5

Branch: refs/heads/master
Commit: 3a7f38e58586cee2044c611ef5b3222fcf9561e5
Parents: 5adb330
Author: Matthias Boehm <mb...@gmail.com>
Authored: Mon Oct 16 21:45:54 2017 -0700
Committer: Matthias Boehm <mb...@gmail.com>
Committed: Mon Oct 16 22:16:36 2017 -0700

----------------------------------------------------------------------
 .../org/apache/sysml/hops/codegen/template/CPlanMemoTable.java     | 2 +-
 src/main/java/org/apache/sysml/hops/rewrite/HopRewriteUtils.java   | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/systemml/blob/3a7f38e5/src/main/java/org/apache/sysml/hops/codegen/template/CPlanMemoTable.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/hops/codegen/template/CPlanMemoTable.java b/src/main/java/org/apache/sysml/hops/codegen/template/CPlanMemoTable.java
index 882cde2..99ffc8d 100644
--- a/src/main/java/org/apache/sysml/hops/codegen/template/CPlanMemoTable.java
+++ b/src/main/java/org/apache/sysml/hops/codegen/template/CPlanMemoTable.java
@@ -439,7 +439,7 @@ public class CPlanMemoTable
 				sb.append(input(i));
 			}
 			if( !isValid() )
-				sb.append(", x");
+				sb.append("|x");
 			sb.append(")");
 			return sb.toString();
 		}

http://git-wip-us.apache.org/repos/asf/systemml/blob/3a7f38e5/src/main/java/org/apache/sysml/hops/rewrite/HopRewriteUtils.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/hops/rewrite/HopRewriteUtils.java b/src/main/java/org/apache/sysml/hops/rewrite/HopRewriteUtils.java
index b0f46b7..7bbfa52 100644
--- a/src/main/java/org/apache/sysml/hops/rewrite/HopRewriteUtils.java
+++ b/src/main/java/org/apache/sysml/hops/rewrite/HopRewriteUtils.java
@@ -988,7 +988,7 @@ public class HopRewriteUtils
 	
 	public static boolean isColumnRangeIndexing(IndexingOp hop) {
 		return ((isLiteralOfValue(hop.getInput().get(1), 1)
-			&& isLiteralOfValue(hop.getInput().get(2), hop.getDim1()))
+			&& isLiteralOfValue(hop.getInput().get(2), hop.getInput().get(0).getDim1()))
 			|| hop.getDim1() == hop.getInput().get(0).getDim1())
 			&& isLiteralOfValue(hop.getInput().get(3), 1)
 			&& hop.getInput().get(4) instanceof LiteralOp;


[24/50] [abbrv] systemml git commit: [HOTFIX][SYSTEMML-1648] Fix l2svm and msvm algorithm scripts

Posted by re...@apache.org.
[HOTFIX][SYSTEMML-1648] Fix l2svm and msvm algorithm scripts

This patch fixes the recently changed l2svm and msvm algorithm scripts
with regard to (1) the use of non-existing variables, (2) corrupted
convergence checks (evaluated before the corresponding updates), and
(3) various smaller issues (unused variables, commented-out code, formatting).


Project: http://git-wip-us.apache.org/repos/asf/systemml/repo
Commit: http://git-wip-us.apache.org/repos/asf/systemml/commit/70ab072a
Tree: http://git-wip-us.apache.org/repos/asf/systemml/tree/70ab072a
Diff: http://git-wip-us.apache.org/repos/asf/systemml/diff/70ab072a

Branch: refs/heads/master
Commit: 70ab072ae764a9abffaead3431ca11e8e1efec68
Parents: a472ae9
Author: Matthias Boehm <mb...@gmail.com>
Authored: Tue Oct 24 19:48:07 2017 -0700
Committer: Matthias Boehm <mb...@gmail.com>
Committed: Tue Oct 24 19:48:07 2017 -0700

----------------------------------------------------------------------
 scripts/algorithms/l2-svm.dml | 18 ++++++++----------
 scripts/algorithms/m-svm.dml  | 18 +++++++-----------
 2 files changed, 15 insertions(+), 21 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/systemml/blob/70ab072a/scripts/algorithms/l2-svm.dml
----------------------------------------------------------------------
diff --git a/scripts/algorithms/l2-svm.dml b/scripts/algorithms/l2-svm.dml
index 141ef82..2446610 100644
--- a/scripts/algorithms/l2-svm.dml
+++ b/scripts/algorithms/l2-svm.dml
@@ -134,31 +134,32 @@ while(continue & iter < maxiterations)  {
     h = dd + sum(Xd * sv * Xd)
     step_sz = step_sz - g/h
     
-    continue1 = (gg/h >= 0.0000000001);
+    continue1 = (g*g/h >= 0.0000000001);
   }
 
   #update weights
   w = w + step_sz*s
   Xw = Xw + step_sz*Xd
-	
+  
   out = 1 - Y * Xw
   sv = (out > 0)
   out = sv * out
   obj = 0.5 * sum(out * out) + lambda/2 * sum(w * w)
   g_new = t(X) %*% (out * Y) - lambda * w
-
+  
   print("ITER " + iter + ": OBJ=" + obj)
   debug_str = append(debug_str, iter + "," + obj)
-	
+  
   tmp = sum(s * g_old)
-  continue = (step_sz*tmp >= epsilon*obj & sum(s^2) != 0);
-
+  
   #non-linear CG step
   be = sum(g_new * g_new)/sum(g_old * g_old)
   s = be * s + g_new
   g_old = g_new
-    
+  
+  continue = (step_sz*tmp >= epsilon*obj & sum(s^2) != 0);
   iter = iter + 1
+  
 }
 
 extra_model_params = matrix(0, rows=4, cols=1)
@@ -167,11 +168,8 @@ extra_model_params[2,1] = negative_label
 extra_model_params[3,1] = intercept
 extra_model_params[4,1] = dimensions
 
-weights = w
 w = t(cbind(t(w), t(extra_model_params)))
 write(w, $model, format=cmdLine_fmt)
-# write(extra_model_params, " ", format=cmdLine_fmt)
-# write(weights, " ", format=cmdLine_fmt)
 
 logFile = $Log
 if(logFile != " ") {

http://git-wip-us.apache.org/repos/asf/systemml/blob/70ab072a/scripts/algorithms/m-svm.dml
----------------------------------------------------------------------
diff --git a/scripts/algorithms/m-svm.dml b/scripts/algorithms/m-svm.dml
index 6c11811..253764c 100644
--- a/scripts/algorithms/m-svm.dml
+++ b/scripts/algorithms/m-svm.dml
@@ -25,7 +25,7 @@
 # Example Usage:
 # Assume SVM_HOME is set to the home of the dml script
 # Assume input and output directories are on hdfs as INPUT_DIR and OUTPUT_DIR
-# Assume epsilon = 0.001, lambda=1.0, max_iterations = 100
+# Assume epsilon = 0.001, lambda=1.0, maxiterations = 100
 #
 # INPUT PARAMETERS:
 # ---------------------------------------------------------------------------------------------
@@ -92,8 +92,8 @@ lambda = cmdLine_reg
 if(lambda < 0)
   stop("Stopping due to invalid argument: Regularization constant (reg) must be non-negative")
 
-max_iterations = cmdLine_maxiter
-if(max_iterations < 1)
+maxiterations = cmdLine_maxiter
+if(maxiterations < 1)
   stop("Stopping due to invalid argument: Maximum iterations should be a positive integer")
 
 num_samples = nrow(X)
@@ -110,7 +110,7 @@ if(intercept == 1){
 }
 w = matrix(0, rows=num_rows_in_w, cols=num_classes)
 
-debug_mat = matrix(-1, rows=max_iterations, cols=num_classes)
+debug_mat = matrix(-1, rows=maxiterations, cols=num_classes)
 
 parfor(iter_class in 1:num_classes){		  
   Y_local = 2 * (Y == iter_class) - 1
@@ -145,7 +145,6 @@ parfor(iter_class in 1:num_classes){
       step_sz = step_sz - g/h
       
       continue1 = (g*g/h >= 0.0000000001)
-      
     }
     
     #update weights
@@ -162,15 +161,14 @@ parfor(iter_class in 1:num_classes){
   
     train_acc = sum(Y_local*(X%*%w_class) >= 0)/num_samples*100
     print("For class " + iter_class + " iteration " + iter + " training accuracy: " + train_acc)
-    debug_mat[iter+1,iter_class] = obj	   
+    debug_mat[iter+1,iter_class] = obj
    
-    continue = (step_sz*tmp >= epsilon*obj & sum(s^2) != 0);
-  	 
     #non-linear CG step
     be = sum(g_new * g_new)/sum(g_old * g_old)
     s = be * s + g_new
     g_old = g_new
     
+    continue = (step_sz*tmp >= epsilon*obj & sum(s^2) != 0);
     iter = iter + 1
   }
 
@@ -180,11 +178,9 @@ parfor(iter_class in 1:num_classes){
 extra_model_params = matrix(0, rows=2, cols=ncol(w))
 extra_model_params[1, 1] = intercept
 extra_model_params[2, 1] = dimensions
-weights = w
+
 w = t(cbind(t(w), t(extra_model_params)))
 write(w, $model, format=cmdLine_fmt)
-# write(extra_model_params, " ", format=cmdLine_fmt)
-# write(weights, " ", format=cmdLine_fmt)
 
 debug_str = "# Class, Iter, Obj"
 for(iter_class in 1:ncol(debug_mat)){


[10/50] [abbrv] systemml git commit: [SYSTEMML-1966] Fix codegen plan cache false negatives, cleanups

Posted by re...@apache.org.
[SYSTEMML-1966] Fix codegen plan cache false negatives, cleanups

This patch fixes the codegen plan cache to prevent false negatives caused
by hash codes that differ only in their data nodes. Furthermore, it
includes some smaller cleanups such as the compilation of mult2 and pow2
in outer templates, as well as the exploration of valid fusion plans when
using fusion heuristics.
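
The core fix in CNodeData.equals is a missing pair of parentheses around the ternary, which changed the evaluation order of the whole condition; restated compactly from the diff below, the corrected condition is:

	boolean eq = (o instanceof CNodeData) && super.equals(o)
		&& isLiteral() == ((CNode)o).isLiteral()
		&& ((isLiteral() || !_strictEquals) ?
			_name.equals(((CNodeData)o)._name) :  // literal/imprecise: compare by name
			_hopID == ((CNodeData)o)._hopID);     // strict: compare identity of the underlying hop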


Project: http://git-wip-us.apache.org/repos/asf/systemml/repo
Commit: http://git-wip-us.apache.org/repos/asf/systemml/commit/60ad522e
Tree: http://git-wip-us.apache.org/repos/asf/systemml/tree/60ad522e
Diff: http://git-wip-us.apache.org/repos/asf/systemml/diff/60ad522e

Branch: refs/heads/master
Commit: 60ad522eb4107e55cd9418cc8494208e6c36e28c
Parents: 259814e
Author: Matthias Boehm <mb...@gmail.com>
Authored: Tue Oct 17 20:16:33 2017 -0700
Committer: Matthias Boehm <mb...@gmail.com>
Committed: Tue Oct 17 21:39:57 2017 -0700

----------------------------------------------------------------------
 .../org/apache/sysml/hops/codegen/SpoofCompiler.java | 12 ++++++++----
 .../apache/sysml/hops/codegen/cplan/CNodeData.java   |  6 +++---
 .../hops/codegen/template/CPlanCSERewriter.java      |  2 +-
 .../hops/codegen/template/TemplateOuterProduct.java  | 15 ++++++++++-----
 .../sysml/hops/codegen/template/TemplateRow.java     |  9 ++++++---
 .../sysml/runtime/matrix/data/LibMatrixMult.java     |  6 +++---
 6 files changed, 31 insertions(+), 19 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/systemml/blob/60ad522e/src/main/java/org/apache/sysml/hops/codegen/SpoofCompiler.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/hops/codegen/SpoofCompiler.java b/src/main/java/org/apache/sysml/hops/codegen/SpoofCompiler.java
index 5ff90fb..4af8540 100644
--- a/src/main/java/org/apache/sysml/hops/codegen/SpoofCompiler.java
+++ b/src/main/java/org/apache/sysml/hops/codegen/SpoofCompiler.java
@@ -139,6 +139,10 @@ public class SpoofCompiler
 			return this == FUSE_ALL
 				|| this == FUSE_NO_REDUNDANCY;
 		}
+		public boolean isCostBased() {
+			return this == FUSE_COST_BASED_V2
+				|| this == FUSE_COST_BASED;
+		}
 	}
 
 	public enum PlanCachePolicy {
@@ -399,14 +403,14 @@ public class SpoofCompiler
 					
 					//explain debug output cplans or generated source code
 					if( LOG.isTraceEnabled() || DMLScript.EXPLAIN.isHopsType(recompile) ) {
-						LOG.info("Codegen EXPLAIN (generated cplan for HopID: " 
-							+ cplan.getKey() + ", line "+tmp.getValue().getBeginLine() + "):");
+						LOG.info("Codegen EXPLAIN (generated cplan for HopID: " + cplan.getKey() + 
+							", line "+tmp.getValue().getBeginLine() + ", hash="+tmp.getValue().hashCode()+"):");
 						LOG.info(tmp.getValue().getClassname()
 							+ Explain.explainCPlan(cplan.getValue().getValue()));
 					}
 					if( LOG.isTraceEnabled() || DMLScript.EXPLAIN.isRuntimeType(recompile) ) {
-						LOG.info("Codegen EXPLAIN (generated code for HopID: "
-							+ cplan.getKey() + ", line "+tmp.getValue().getBeginLine() + "):");
+						LOG.info("Codegen EXPLAIN (generated code for HopID: " + cplan.getKey() + 
+							", line "+tmp.getValue().getBeginLine() + ", hash="+tmp.getValue().hashCode()+"):");
 						LOG.info(src);
 					}
 					

http://git-wip-us.apache.org/repos/asf/systemml/blob/60ad522e/src/main/java/org/apache/sysml/hops/codegen/cplan/CNodeData.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/hops/codegen/cplan/CNodeData.java b/src/main/java/org/apache/sysml/hops/codegen/cplan/CNodeData.java
index ce343cf..653b50b 100644
--- a/src/main/java/org/apache/sysml/hops/codegen/cplan/CNodeData.java
+++ b/src/main/java/org/apache/sysml/hops/codegen/cplan/CNodeData.java
@@ -106,9 +106,9 @@ public class CNodeData extends CNode
 	public boolean equals(Object o) {
 		return (o instanceof CNodeData 
 			&& super.equals(o)
-			&& isLiteral() == ((CNodeData)o).isLiteral()
-			&& (isLiteral() || !_strictEquals) ? 
+			&& isLiteral() == ((CNode)o).isLiteral()
+			&& ((isLiteral() || !_strictEquals) ? 
 				_name.equals(((CNodeData)o)._name) : 
-				_hopID == ((CNodeData)o)._hopID);
+				_hopID == ((CNodeData)o)._hopID));
 	}
 }

http://git-wip-us.apache.org/repos/asf/systemml/blob/60ad522e/src/main/java/org/apache/sysml/hops/codegen/template/CPlanCSERewriter.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/hops/codegen/template/CPlanCSERewriter.java b/src/main/java/org/apache/sysml/hops/codegen/template/CPlanCSERewriter.java
index 3d12cfe..318e30a 100644
--- a/src/main/java/org/apache/sysml/hops/codegen/template/CPlanCSERewriter.java
+++ b/src/main/java/org/apache/sysml/hops/codegen/template/CPlanCSERewriter.java
@@ -56,7 +56,7 @@ public class CPlanCSERewriter
 		//step 3: reset data nodes to imprecise comparison
 		tpl.resetVisitStatusOutputs();
 		for( CNode out : outputs )
-			rSetStrictDataNodeComparision(out, true);
+			rSetStrictDataNodeComparision(out, false);
 		tpl.resetVisitStatusOutputs();
 		
 		return tpl;

http://git-wip-us.apache.org/repos/asf/systemml/blob/60ad522e/src/main/java/org/apache/sysml/hops/codegen/template/TemplateOuterProduct.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/hops/codegen/template/TemplateOuterProduct.java b/src/main/java/org/apache/sysml/hops/codegen/template/TemplateOuterProduct.java
index f3880b1..256f540 100644
--- a/src/main/java/org/apache/sysml/hops/codegen/template/TemplateOuterProduct.java
+++ b/src/main/java/org/apache/sysml/hops/codegen/template/TemplateOuterProduct.java
@@ -174,6 +174,7 @@ public class TemplateOuterProduct extends TemplateBase {
 		}
 		else if(hop instanceof BinaryOp)
 		{
+			BinaryOp bop = (BinaryOp) hop;
 			CNode cdata1 = tmp.get(hop.getInput().get(0).getHopID());
 			CNode cdata2 = tmp.get(hop.getInput().get(1).getHopID());
 			String primitiveOpName = ((BinaryOp)hop).getOp().toString();
@@ -186,8 +187,12 @@ public class TemplateOuterProduct extends TemplateBase {
 			//add lookups if required
 			cdata1 = TemplateUtils.wrapLookupIfNecessary(cdata1, hop.getInput().get(0));
 			cdata2 = TemplateUtils.wrapLookupIfNecessary(cdata2, hop.getInput().get(1));
-			
-			out = new CNodeBinary(cdata1, cdata2, BinType.valueOf(primitiveOpName));
+			if( bop.getOp()==OpOp2.POW && cdata2.isLiteral() && cdata2.getVarname().equals("2") )
+				out = new CNodeUnary(cdata1, UnaryType.POW2);
+			else if( bop.getOp()==OpOp2.MULT && cdata2.isLiteral() && cdata2.getVarname().equals("2") )
+				out = new CNodeUnary(cdata1, UnaryType.MULT2);
+			else
+				out = new CNodeBinary(cdata1, cdata2, BinType.valueOf(primitiveOpName));
 		}
 		else if(hop instanceof AggBinaryOp)
 		{
@@ -213,14 +218,14 @@ public class TemplateOuterProduct extends TemplateBase {
 			//final left/right matrix mult, see close
 			else {
 				if( cdata1.getDataType().isScalar() )
-					out = new CNodeBinary(cdata2, cdata1, BinType.VECT_MULT_ADD);	
+					out = new CNodeBinary(cdata2, cdata1, BinType.VECT_MULT_ADD);
 				else
-					out = new CNodeBinary(cdata1, cdata2, BinType.VECT_MULT_ADD);	
+					out = new CNodeBinary(cdata1, cdata2, BinType.VECT_MULT_ADD);
 			}
 		}
 		else if( HopRewriteUtils.isTransposeOperation(hop) ) 
 		{
-			out = tmp.get(hop.getInput().get(0).getHopID());	
+			out = tmp.get(hop.getInput().get(0).getHopID());
 		}
 		else if( hop instanceof AggUnaryOp && ((AggUnaryOp)hop).getOp() == AggOp.SUM
 			&& ((AggUnaryOp)hop).getDirection() == Direction.RowCol )

http://git-wip-us.apache.org/repos/asf/systemml/blob/60ad522e/src/main/java/org/apache/sysml/hops/codegen/template/TemplateRow.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/hops/codegen/template/TemplateRow.java b/src/main/java/org/apache/sysml/hops/codegen/template/TemplateRow.java
index 64014da..0389983 100644
--- a/src/main/java/org/apache/sysml/hops/codegen/template/TemplateRow.java
+++ b/src/main/java/org/apache/sysml/hops/codegen/template/TemplateRow.java
@@ -33,6 +33,7 @@ import org.apache.sysml.hops.LiteralOp;
 import org.apache.sysml.hops.ParameterizedBuiltinOp;
 import org.apache.sysml.hops.TernaryOp;
 import org.apache.sysml.hops.UnaryOp;
+import org.apache.sysml.hops.codegen.SpoofCompiler;
 import org.apache.sysml.hops.codegen.cplan.CNode;
 import org.apache.sysml.hops.codegen.cplan.CNodeBinary;
 import org.apache.sysml.hops.codegen.cplan.CNodeBinary.BinType;
@@ -83,7 +84,8 @@ public class TemplateRow extends TemplateBase
 				&& hop.getInput().get(0).getDim1()>1 && hop.getInput().get(0).getDim2()>1)
 			|| (hop instanceof AggBinaryOp && hop.dimsKnown() && LibMatrixMult.isSkinnyRightHandSide(
 				hop.getInput().get(0).getDim1(), hop.getInput().get(0).getDim2(), //MM
-				hop.getInput().get(1).getDim1(), hop.getInput().get(1).getDim2())
+				hop.getInput().get(1).getDim1(), hop.getInput().get(1).getDim2(),
+				SpoofCompiler.PLAN_SEL_POLICY.isCostBased())
 				&& hop.getInput().get(0).getDim1()>1 && hop.getInput().get(0).getDim2()>1
 				&& !HopRewriteUtils.isOuterProductLikeMM(hop))
 			|| (HopRewriteUtils.isTransposeOperation(hop) && hop.getParent().size()==1
@@ -152,8 +154,9 @@ public class TemplateRow extends TemplateBase
 		//check for fusable but not opening matrix multiply (vect_outer-mult)
 		Hop in1 = hop.getInput().get(0); //transpose
 		Hop in2 = hop.getInput().get(1);
-		return LibMatrixMult.isSkinnyRightHandSide(in1.getDim2(), in1.getDim1(), hop.getDim1(), hop.getDim2())
-			|| LibMatrixMult.isSkinnyRightHandSide(in2.getDim1(), in2.getDim2(), hop.getDim2(), hop.getDim1());
+		boolean inclSizes = SpoofCompiler.PLAN_SEL_POLICY.isCostBased();
+		return LibMatrixMult.isSkinnyRightHandSide(in1.getDim2(), in1.getDim1(), hop.getDim1(), hop.getDim2(), inclSizes)
+			|| LibMatrixMult.isSkinnyRightHandSide(in2.getDim1(), in2.getDim2(), hop.getDim2(), hop.getDim1(), inclSizes);
 	}
 	
 	private static boolean isPartOfValidCumAggChain(Hop hop) {

http://git-wip-us.apache.org/repos/asf/systemml/blob/60ad522e/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixMult.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixMult.java b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixMult.java
index eca26f6..fa4d667 100644
--- a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixMult.java
+++ b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixMult.java
@@ -3567,13 +3567,13 @@ public class LibMatrixMult
 	private static boolean checkPrepMatrixMultRightInput( MatrixBlock m1, MatrixBlock m2 ) {
 		//transpose if dense-dense, skinny rhs matrix (not vector), and memory guarded by output 
 		return (LOW_LEVEL_OPTIMIZATION && !m1.sparse && !m2.sparse 
-			&& isSkinnyRightHandSide(m1.rlen, m1.clen, m2.rlen, m2.clen));
+			&& isSkinnyRightHandSide(m1.rlen, m1.clen, m2.rlen, m2.clen, true));
 	}
 	
 	//note: public for use by codegen for consistency
-	public static boolean isSkinnyRightHandSide(long m1rlen, long m1clen, long m2rlen, long m2clen) {
+	public static boolean isSkinnyRightHandSide(long m1rlen, long m1clen, long m2rlen, long m2clen, boolean inclCacheSize) {
 		return m1rlen > m2clen && m2rlen > m2clen && m2clen > 1 
-			&& m2clen < 64 && 8*m2rlen*m2clen < L2_CACHESIZE;
+			&& m2clen < 64 && (!inclCacheSize || 8*m2rlen*m2clen < L2_CACHESIZE);
 	}
 	
 	public static boolean checkParColumnAgg(MatrixBlock m1, int k, boolean inclFLOPs) {


[18/50] [abbrv] systemml git commit: [MINOR] Fix analysis of sparse-safeness for codegen cell/magg ops

Posted by re...@apache.org.
[MINOR] Fix analysis of sparse-safeness for codegen cell/magg ops

Project: http://git-wip-us.apache.org/repos/asf/systemml/repo
Commit: http://git-wip-us.apache.org/repos/asf/systemml/commit/c70cb116
Tree: http://git-wip-us.apache.org/repos/asf/systemml/tree/c70cb116
Diff: http://git-wip-us.apache.org/repos/asf/systemml/diff/c70cb116

Branch: refs/heads/master
Commit: c70cb1166f4ec6c79d10248727a3eb7b85f70360
Parents: 78a3808
Author: Matthias Boehm <mb...@gmail.com>
Authored: Sun Oct 22 18:57:35 2017 -0700
Committer: Matthias Boehm <mb...@gmail.com>
Committed: Sun Oct 22 18:57:35 2017 -0700

----------------------------------------------------------------------
 .../apache/sysml/hops/codegen/template/TemplateCell.java  | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/systemml/blob/c70cb116/src/main/java/org/apache/sysml/hops/codegen/template/TemplateCell.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/hops/codegen/template/TemplateCell.java b/src/main/java/org/apache/sysml/hops/codegen/template/TemplateCell.java
index c9b0734..4f3d4f4 100644
--- a/src/main/java/org/apache/sysml/hops/codegen/template/TemplateCell.java
+++ b/src/main/java/org/apache/sysml/hops/codegen/template/TemplateCell.java
@@ -322,10 +322,12 @@ public class TemplateCell extends TemplateBase
 	protected boolean isSparseSafe(List<Hop> roots, Hop mainInput, List<CNode> outputs, List<AggOp> aggOps, boolean onlySum) {
 		boolean ret = true;
 		for( int i=0; i<outputs.size() && ret; i++ ) {
-			ret &= (HopRewriteUtils.isBinary(roots.get(i), OpOp2.MULT) 
-					&& roots.get(i).getInput().contains(mainInput))
-				|| (HopRewriteUtils.isBinary(roots.get(i), OpOp2.DIV) 
-					&& roots.get(i).getInput().get(0) == mainInput)
+			Hop root = (roots.get(i) instanceof AggUnaryOp || roots.get(i) 
+				instanceof AggBinaryOp) ? roots.get(i).getInput().get(0) : roots.get(i);
+			ret &= (HopRewriteUtils.isBinarySparseSafe(root) 
+					&& root.getInput().contains(mainInput))
+				|| (HopRewriteUtils.isBinary(root, OpOp2.DIV) 
+					&& root.getInput().get(0) == mainInput)
 				|| (TemplateUtils.rIsSparseSafeOnly(outputs.get(i), BinType.MULT)
 					&& TemplateUtils.rContainsInput(outputs.get(i), mainInput.getHopID()));
 			if( onlySum )


[35/50] [abbrv] systemml git commit: [SYSTEMML-1952] Fix codegen row ops w/ non-allocated dense side inputs

Posted by re...@apache.org.
[SYSTEMML-1952] Fix codegen row ops w/ non-allocated dense side inputs

This patch fixes special cases with empty (and not allocated) dense side
inputs. Previously, we always converted sparse and empty dense inputs to
dense blocks. With the introduction of sparse side inputs, the special
case of empty dense blocks was not yet handled correctly. We now simply
construct sparse side inputs and make them robust against non-existing
sparse blocks.

Furthermore, this includes a minor performance improvement: sparse row
side inputs no longer reset the temporary row on each access when the
sparse block does not exist.
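
A minimal sketch of the defensive access pattern used below (the helper name getSparseValue is hypothetical; getSparseBlock and isEmpty appear in the diff, and get(r,c) is assumed to be the standard SparseBlock accessor):

	// value (i,j) of a possibly empty or non-allocated sparse side input
	private static double getSparseValue(MatrixBlock in, int i, int j) {
		SparseBlock sblock = in.getSparseBlock();
		if( sblock == null || sblock.isEmpty(i) ) // non-allocated block or empty row
			return 0;
		return sblock.get(i, j);
	}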


Project: http://git-wip-us.apache.org/repos/asf/systemml/repo
Commit: http://git-wip-us.apache.org/repos/asf/systemml/commit/897d29d0
Tree: http://git-wip-us.apache.org/repos/asf/systemml/tree/897d29d0
Diff: http://git-wip-us.apache.org/repos/asf/systemml/diff/897d29d0

Branch: refs/heads/master
Commit: 897d29d04a8d6b89a7467e65d350aa98362f67b0
Parents: dd513ff
Author: Matthias Boehm <mb...@gmail.com>
Authored: Thu Oct 26 22:45:06 2017 -0700
Committer: Matthias Boehm <mb...@gmail.com>
Committed: Thu Oct 26 23:38:21 2017 -0700

----------------------------------------------------------------------
 .../sysml/runtime/codegen/SpoofOperator.java      | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/systemml/blob/897d29d0/src/main/java/org/apache/sysml/runtime/codegen/SpoofOperator.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/codegen/SpoofOperator.java b/src/main/java/org/apache/sysml/runtime/codegen/SpoofOperator.java
index a614ded..21fdd35 100644
--- a/src/main/java/org/apache/sysml/runtime/codegen/SpoofOperator.java
+++ b/src/main/java/org/apache/sysml/runtime/codegen/SpoofOperator.java
@@ -104,10 +104,10 @@ public abstract class SpoofOperator implements Serializable
 				else {
 					b[i-offset] = new SideInput(DataConverter.convertToDoubleVector(in), null, clen);
 					LOG.warn(getClass().getName()+": Converted "+in.getNumRows()+"x"+in.getNumColumns()+
-						", nnz="+in.getNonZeros()+" sideways input matrix from sparse to dense.");	
+						", nnz="+in.getNonZeros()+" sideways input matrix from sparse to dense.");
 				}
 			}
-			else if( in.isInSparseFormat() && in.isAllocated() ) {
+			else if( in.isInSparseFormat() || !in.isAllocated() ) {
 				b[i-offset] = new SideInput(null, in, clen);
 			}
 			else {
@@ -128,16 +128,14 @@ public abstract class SpoofOperator implements Serializable
 		boolean containsSparse = false;
 		for( int i=0; i<input.length; i++ ) {
 			SideInput tmp = input[i];
-			containsSparse |= (tmp.mdat != null && tmp.mdat.isInSparseFormat() 
-				&& !tmp.mdat.isEmptyBlock(false) && tmp.clen > 1);
+			containsSparse |= (tmp.mdat != null && tmp.clen > 1);
 		}
 		if( !containsSparse )
 			return input;
 		SideInput[] ret = new SideInput[input.length];
 		for( int i=0; i<input.length; i++ ) {
 			SideInput tmp = input[i];
-			ret[i] = (tmp.mdat != null && tmp.mdat.isInSparseFormat()
-				&& !tmp.mdat.isEmptyBlock(false) && tmp.clen > 1) ?
+			ret[i] = (tmp.mdat != null && tmp.clen > 1) ?
 				(row ? new SideInputSparseRow(tmp) : 
 				new SideInputSparseCell(tmp)) : tmp;
 		}
@@ -274,9 +272,10 @@ public abstract class SpoofOperator implements Serializable
 		
 		private void nextRow(int r) {
 			currRowIndex = r;
-			Arrays.fill(values, 0);
 			SparseBlock sblock = mdat.getSparseBlock();
-			if( sblock != null && !sblock.isEmpty(r) ) {
+			if( sblock == null ) return;
+			Arrays.fill(values, 0);
+			if( !sblock.isEmpty(r) ) {
 				int apos = sblock.pos(r);
 				int alen = sblock.size(r);
 				int[] aix = sblock.indexes(r);
@@ -298,7 +297,8 @@ public abstract class SpoofOperator implements Serializable
 			super(in.ddat, in.mdat, in.clen);
 		}
 		public double next(int rowIndex, int colIndex) {
-			if( mdat.getSparseBlock().isEmpty(rowIndex) )
+			SparseBlock sblock = mdat.getSparseBlock();
+			if( sblock == null || sblock.isEmpty(rowIndex) )
 				return 0;
 			//move to next row if necessary
 			if( rowIndex > currRowIndex ) {


[11/50] [abbrv] systemml git commit: [MINOR] Fix missing warning on truncation of matrix/frame toString

Posted by re...@apache.org.
[MINOR] Fix missing warning on truncation of matrix/frame toString 

Project: http://git-wip-us.apache.org/repos/asf/systemml/repo
Commit: http://git-wip-us.apache.org/repos/asf/systemml/commit/5b8d6265
Tree: http://git-wip-us.apache.org/repos/asf/systemml/tree/5b8d6265
Diff: http://git-wip-us.apache.org/repos/asf/systemml/diff/5b8d6265

Branch: refs/heads/master
Commit: 5b8d62659b2e5727bebcaf0d2681fc4ecd4ea85f
Parents: 60ad522
Author: Matthias Boehm <mb...@gmail.com>
Authored: Tue Oct 17 20:54:01 2017 -0700
Committer: Matthias Boehm <mb...@gmail.com>
Committed: Tue Oct 17 21:39:58 2017 -0700

----------------------------------------------------------------------
 .../cp/ParameterizedBuiltinCPInstruction.java       | 16 +++++++++++++++-
 1 file changed, 15 insertions(+), 1 deletion(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/systemml/blob/5b8d6265/src/main/java/org/apache/sysml/runtime/instructions/cp/ParameterizedBuiltinCPInstruction.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/instructions/cp/ParameterizedBuiltinCPInstruction.java b/src/main/java/org/apache/sysml/runtime/instructions/cp/ParameterizedBuiltinCPInstruction.java
index e8a5f4a..f6532d7 100644
--- a/src/main/java/org/apache/sysml/runtime/instructions/cp/ParameterizedBuiltinCPInstruction.java
+++ b/src/main/java/org/apache/sysml/runtime/instructions/cp/ParameterizedBuiltinCPInstruction.java
@@ -26,6 +26,7 @@ import org.apache.sysml.lops.Lop;
 import org.apache.sysml.parser.ParameterizedBuiltinFunctionExpression;
 import org.apache.sysml.parser.Statement;
 import org.apache.sysml.runtime.DMLRuntimeException;
+import org.apache.sysml.runtime.controlprogram.caching.CacheBlock;
 import org.apache.sysml.runtime.controlprogram.caching.CacheableData;
 import org.apache.sysml.runtime.controlprogram.caching.FrameObject;
 import org.apache.sysml.runtime.controlprogram.caching.MatrixObject;
@@ -328,10 +329,12 @@ public class ParameterizedBuiltinCPInstruction extends ComputationCPInstruction
 			String out = null;
 			if( data instanceof MatrixObject ) {
 				MatrixBlock matrix = (MatrixBlock) data.acquireRead();
+				warnOnTrunction(matrix, rows, cols);
 				out = DataConverter.toString(matrix, sparse, separator, lineseparator, rows, cols, decimal);
 			}
 			else if( data instanceof FrameObject ) {
 				FrameBlock frame = (FrameBlock) data.acquireRead();
+				warnOnTrunction(frame, rows, cols);
 				out = DataConverter.toString(frame, sparse, separator, lineseparator, rows, cols, decimal);
 			}
 			else {
@@ -342,6 +345,17 @@ public class ParameterizedBuiltinCPInstruction extends ComputationCPInstruction
 		}
 		else {
 			throw new DMLRuntimeException("Unknown opcode : " + opcode);
-		}		
+		}
+	}
+	
+	private void warnOnTrunction(CacheBlock data, int rows, int cols) {
+		//warn on truncation because users might not be aware and use toString for verification
+		if( (getParam("rows")==null && data.getNumRows()>rows)
+			|| (getParam("cols")==null && data.getNumColumns()>cols) )
+		{
+			LOG.warn("Truncating "+data.getClass().getSimpleName()+" of size "
+				+ data.getNumRows()+"x"+data.getNumColumns()+" to "+rows+"x"+cols+". "
+				+ "Use toString(X, rows=..., cols=...) if necessary.");
+		}
 	}
 }


[36/50] [abbrv] systemml git commit: [SYSTEMML-1693] New IPA pass for function inlining after rewrites

Posted by re...@apache.org.
[SYSTEMML-1693] New IPA pass for function inlining after rewrites

The existing function inlining (of single-statement-block functions)
happens during validate, i.e., before rewrites. However, after constant
propagation, constant folding, branch removal, and statement block
merge, often additional opportunities arise. This patch exploits such
opportunities by adding a new inter-procedural analysis pass for
inlining single-statement-block functions. To limit the potential
exponential increase of program size, we only inline functions with at
most 10 operations, not counting data ops and literals.
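
A compact restatement of the inlining condition applied by the new pass below (containsFunctionOp and countOperators are defined in the patch; the threshold counts all hops except DataOp and LiteralOp):

	boolean inline = fstmt.getBody().size() == 1                            // single statement block
		&& HopRewriteUtils.isLastLevelStatementBlock(fstmt.getBody().get(0))
		&& !containsFunctionOp(fstmt.getBody().get(0).get_hops())           // no nested function calls
		&& countOperators(fstmt.getBody().get(0).get_hops()) <= 10;         // small enough to copy per call site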


Project: http://git-wip-us.apache.org/repos/asf/systemml/repo
Commit: http://git-wip-us.apache.org/repos/asf/systemml/commit/83e01b02
Tree: http://git-wip-us.apache.org/repos/asf/systemml/tree/83e01b02
Diff: http://git-wip-us.apache.org/repos/asf/systemml/diff/83e01b02

Branch: refs/heads/master
Commit: 83e01b02891e54612e2ab82d3f2f805eee2f09f1
Parents: 897d29d
Author: Matthias Boehm <mb...@gmail.com>
Authored: Fri Oct 27 20:23:56 2017 -0700
Committer: Matthias Boehm <mb...@gmail.com>
Committed: Fri Oct 27 20:23:56 2017 -0700

----------------------------------------------------------------------
 .../sysml/hops/ipa/FunctionCallGraph.java       |  31 +++-
 .../sysml/hops/ipa/IPAPassInlineFunctions.java  | 156 +++++++++++++++++++
 .../sysml/hops/ipa/InterProceduralAnalysis.java |   4 +-
 .../org/apache/sysml/parser/DMLTranslator.java  |   8 +-
 .../functions/misc/IPAFunctionInliningTest.java | 122 +++++++++++++++
 .../test/integration/functions/misc/IfTest.java | 155 +++++++++---------
 .../scripts/functions/misc/IPAFunInline1.dml    |  34 ++++
 .../scripts/functions/misc/IPAFunInline2.dml    |  36 +++++
 .../scripts/functions/misc/IPAFunInline3.dml    |  39 +++++
 .../scripts/functions/misc/IPAFunInline4.dml    |  36 +++++
 .../functions/misc/ZPackageSuite.java           |   1 +
 11 files changed, 532 insertions(+), 90 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/systemml/blob/83e01b02/src/main/java/org/apache/sysml/hops/ipa/FunctionCallGraph.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/hops/ipa/FunctionCallGraph.java b/src/main/java/org/apache/sysml/hops/ipa/FunctionCallGraph.java
index d719da7..4735f47 100644
--- a/src/main/java/org/apache/sysml/hops/ipa/FunctionCallGraph.java
+++ b/src/main/java/org/apache/sysml/hops/ipa/FunctionCallGraph.java
@@ -55,8 +55,9 @@ public class FunctionCallGraph
 	//program-wide function call operators per target function
 	//(mapping from function keys to set of its function calls)
 	private final HashMap<String, ArrayList<FunctionOp>> _fCalls;
+	private final HashMap<String, ArrayList<StatementBlock>> _fCallsSB;
 	
-	//subset of direct or indirect recursive functions	
+	//subset of direct or indirect recursive functions
 	private final HashSet<String> _fRecursive;
 	
 	/**
@@ -68,6 +69,7 @@ public class FunctionCallGraph
 	public FunctionCallGraph(DMLProgram prog) {
 		_fGraph = new HashMap<>();
 		_fCalls = new HashMap<>();
+		_fCallsSB = new HashMap<>();
 		_fRecursive = new HashSet<>();
 		
 		constructFunctionCallGraph(prog);
@@ -82,6 +84,7 @@ public class FunctionCallGraph
 	public FunctionCallGraph(StatementBlock sb) {
 		_fGraph = new HashMap<>();
 		_fCalls = new HashMap<>();
+		_fCallsSB = new HashMap<>();
 		_fRecursive = new HashSet<>();
 		
 		constructFunctionCallGraph(sb);
@@ -125,6 +128,21 @@ public class FunctionCallGraph
 	}
 	
 	/**
+	 * Returns all statement blocks that contain a function operator
+	 * calling the given function.
+	 * 
+	 * @param fkey function key of called function,
+	 *      null indicates the main program and returns an empty list
+	 * @return list of statement blocks
+	 */
+	public List<StatementBlock> getFunctionCallsSB(String fkey) {
+		//main program cannot have function calls
+		if( fkey == null )
+			return Collections.emptyList();
+		return _fCallsSB.get(fkey);
+	}
+	
+	/**
 	 * Indicates if the given function is either directly or indirectly recursive.
 	 * An example of an indirect recursive function is foo2 in the following call
 	 * chain: foo1 -&gt; foo2 -&gt; foo1.
@@ -135,7 +153,7 @@ public class FunctionCallGraph
 	 */
 	public boolean isRecursiveFunction(String fnamespace, String fname) {
 		return isRecursiveFunction(
-			DMLProgram.constructFunctionKey(fnamespace, fname));			
+			DMLProgram.constructFunctionKey(fnamespace, fname));
 	}
 	
 	/**
@@ -268,9 +286,12 @@ public class FunctionCallGraph
 					FunctionOp fop = (FunctionOp) h;
 					String lfkey = fop.getFunctionKey();
 					//keep all function operators
-					if( !_fCalls.containsKey(lfkey) )
-						_fCalls.put(lfkey, new ArrayList<FunctionOp>());
+					if( !_fCalls.containsKey(lfkey) ) {
+						_fCalls.put(lfkey, new ArrayList<>());
+						_fCallsSB.put(lfkey, new ArrayList<>());
+					}
 					_fCalls.get(lfkey).add(fop);
+					_fCallsSB.get(lfkey).add(sb);
 					
 					//prevent redundant call edges
 					if( lfset.contains(lfkey) || fop.getFunctionNamespace().equals(DMLProgram.INTERNAL_NAMESPACE) )
@@ -278,7 +299,7 @@ public class FunctionCallGraph
 					
 					if( !_fGraph.containsKey(lfkey) )
 						_fGraph.put(lfkey, new HashSet<String>());
-						
+					
 					//recursively construct function call dag
 					if( !fstack.contains(lfkey) ) {
 						fstack.push(lfkey);

http://git-wip-us.apache.org/repos/asf/systemml/blob/83e01b02/src/main/java/org/apache/sysml/hops/ipa/IPAPassInlineFunctions.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/hops/ipa/IPAPassInlineFunctions.java b/src/main/java/org/apache/sysml/hops/ipa/IPAPassInlineFunctions.java
new file mode 100644
index 0000000..0527a10
--- /dev/null
+++ b/src/main/java/org/apache/sysml/hops/ipa/IPAPassInlineFunctions.java
@@ -0,0 +1,156 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ * 
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.sysml.hops.ipa;
+
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.List;
+
+import org.apache.sysml.hops.DataOp;
+import org.apache.sysml.hops.FunctionOp;
+import org.apache.sysml.hops.Hop;
+import org.apache.sysml.hops.HopsException;
+import org.apache.sysml.hops.LiteralOp;
+import org.apache.sysml.hops.Hop.DataOpTypes;
+import org.apache.sysml.hops.recompile.Recompiler;
+import org.apache.sysml.hops.rewrite.HopRewriteUtils;
+import org.apache.sysml.parser.DMLProgram;
+import org.apache.sysml.parser.FunctionStatement;
+import org.apache.sysml.parser.FunctionStatementBlock;
+import org.apache.sysml.parser.StatementBlock;
+
+/**
+ * This rewrite inlines single statement block functions, which have fewer 
+ * operations than an internal threshold. Function inlining happens during 
+ * validate but after rewrites such as constant folding and branch removal 
+ * there are additional opportunities.
+ * 
+ */
+public class IPAPassInlineFunctions extends IPAPass
+{
+	@Override
+	public boolean isApplicable() {
+		return InterProceduralAnalysis.INLINING_MAX_NUM_OPS > 0;
+	}
+	
+	@Override
+	public void rewriteProgram( DMLProgram prog, FunctionCallGraph fgraph, FunctionCallSizeInfo fcallSizes ) 
+		throws HopsException
+	{
+		for( String fkey : fgraph.getReachableFunctions() ) {
+			FunctionStatementBlock fsb = prog.getFunctionStatementBlock(fkey);
+			FunctionStatement fstmt = (FunctionStatement)fsb.getStatement(0);
+			if( fstmt.getBody().size() == 1 
+				&& HopRewriteUtils.isLastLevelStatementBlock(fstmt.getBody().get(0)) 
+				&& !containsFunctionOp(fstmt.getBody().get(0).get_hops())
+				&& countOperators(fstmt.getBody().get(0).get_hops()) 
+					<= InterProceduralAnalysis.INLINING_MAX_NUM_OPS )
+			{
+				if( LOG.isDebugEnabled() )
+					LOG.debug("IPA: Inline function '"+fkey+"'");
+				
+				//replace all relevant function calls 
+				ArrayList<Hop> hops = fstmt.getBody().get(0).get_hops();
+				List<FunctionOp> fcalls = fgraph.getFunctionCalls(fkey);
+				List<StatementBlock> fcallsSB = fgraph.getFunctionCallsSB(fkey);
+				for(int i=0; i<fcalls.size(); i++) {
+					FunctionOp op = fcalls.get(i);
+					
+					//step 0: robustness for special cases
+					if( op.getInput().size() != fstmt.getInputParams().size()
+						|| op.getOutputVariableNames().length != fstmt.getOutputParams().size() )
+						continue;
+					
+					//step 1: deep copy hop dag
+					ArrayList<Hop> hops2 = Recompiler.deepCopyHopsDag(hops);
+					
+					//step 2: replace inputs
+					HashMap<String,Hop> inMap = new HashMap<>();
+					for(int j=0; j<op.getInput().size(); j++)
+						inMap.put(fstmt.getInputParams().get(j).getName(), op.getInput().get(j));
+					replaceTransientReads(hops2, inMap);
+					
+					//step 3: replace outputs
+					HashMap<String,String> outMap = new HashMap<>();
+					String[] opOutputs = op.getOutputVariableNames();
+					for(int j=0; j<opOutputs.length; j++)
+						outMap.put(fstmt.getOutputParams().get(j).getName(), opOutputs[j]);
+					for(int j=0; j<hops2.size(); j++) {
+						Hop out = hops2.get(j);
+						if( HopRewriteUtils.isData(out, DataOpTypes.TRANSIENTWRITE) )
+							out.setName(outMap.get(out.getName()));
+					}
+					fcallsSB.get(i).get_hops().remove(op);
+					fcallsSB.get(i).get_hops().addAll(hops2);
+				}
+			}
+		}
+	}
+	
+	private static boolean containsFunctionOp(ArrayList<Hop> hops) {
+		if( hops==null || hops.isEmpty() )
+			return false;
+		Hop.resetVisitStatus(hops);
+		boolean ret = HopRewriteUtils.containsOp(hops, FunctionOp.class);
+		Hop.resetVisitStatus(hops);
+		return ret;
+	}
+	
+	private static int countOperators(ArrayList<Hop> hops) {
+		if( hops==null || hops.isEmpty() )
+			return 0;
+		Hop.resetVisitStatus(hops);
+		int count = 0;
+		for( Hop hop : hops )
+			count += rCountOperators(hop);
+		Hop.resetVisitStatus(hops);
+		return count;
+	}
+	
+	private static int rCountOperators(Hop current) {
+		if( current.isVisited() )
+			return 0;
+		int count = !(current instanceof DataOp 
+			|| current instanceof LiteralOp) ? 1 : 0;
+		for( Hop c : current.getInput() )
+			count += rCountOperators(c);
+		current.setVisited();
+		return count;
+	}
+	
+	private static void replaceTransientReads(ArrayList<Hop> hops, HashMap<String, Hop> inMap) {
+		Hop.resetVisitStatus(hops);
+		for( Hop hop : hops )
+			rReplaceTransientReads(hop, inMap);
+		Hop.resetVisitStatus(hops);
+	}
+	
+	private static void rReplaceTransientReads(Hop current, HashMap<String, Hop> inMap) {
+		if( current.isVisited() )
+			return;
+		for( int i=0; i<current.getInput().size(); i++ ) {
+			Hop c = current.getInput().get(i);
+			rReplaceTransientReads(c, inMap);
+			if( HopRewriteUtils.isData(c, DataOpTypes.TRANSIENTREAD) )
+				HopRewriteUtils.replaceChildReference(current, c, inMap.get(c.getName()));
+		}
+		current.setVisited();
+	}
+}

http://git-wip-us.apache.org/repos/asf/systemml/blob/83e01b02/src/main/java/org/apache/sysml/hops/ipa/InterProceduralAnalysis.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/hops/ipa/InterProceduralAnalysis.java b/src/main/java/org/apache/sysml/hops/ipa/InterProceduralAnalysis.java
index 65f7e54..2ab6b3c 100644
--- a/src/main/java/org/apache/sysml/hops/ipa/InterProceduralAnalysis.java
+++ b/src/main/java/org/apache/sysml/hops/ipa/InterProceduralAnalysis.java
@@ -96,12 +96,13 @@ public class InterProceduralAnalysis
 	protected static final boolean PROPAGATE_SCALAR_VARS_INTO_FUN = true; //propagate scalar variables into functions that are called once
 	protected static final boolean PROPAGATE_SCALAR_LITERALS      = true; //propagate and replace scalar literals into functions
 	protected static final boolean APPLY_STATIC_REWRITES          = true; //apply static hop dag and statement block rewrites
+	protected static final int     INLINING_MAX_NUM_OPS           = 10;    //inline single-statement functions w/ #ops <= threshold, other than dataops and literals
 	
 	static {
 		// for internal debugging only
 		if( LDEBUG ) {
 			Logger.getLogger("org.apache.sysml.hops.ipa.InterProceduralAnalysis")
-				  .setLevel((Level) Level.DEBUG);
+				.setLevel((Level) Level.DEBUG);
 		}
 	}
 	
@@ -136,6 +137,7 @@ public class InterProceduralAnalysis
 		_passes.add(new IPAPassRemoveConstantBinaryOps());
 		_passes.add(new IPAPassPropagateReplaceLiterals());
 		_passes.add(new IPAPassApplyStaticHopRewrites());
+		_passes.add(new IPAPassInlineFunctions());
 	}
 	
 	public InterProceduralAnalysis(StatementBlock sb) {

http://git-wip-us.apache.org/repos/asf/systemml/blob/83e01b02/src/main/java/org/apache/sysml/parser/DMLTranslator.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/parser/DMLTranslator.java b/src/main/java/org/apache/sysml/parser/DMLTranslator.java
index 565c367..75103d1 100644
--- a/src/main/java/org/apache/sysml/parser/DMLTranslator.java
+++ b/src/main/java/org/apache/sysml/parser/DMLTranslator.java
@@ -1451,12 +1451,8 @@ public class DMLTranslator
 					}
 
 					//create function op
-					String[] foutputs = new String[mas.getTargetList().size()]; 
-					int count = 0;
-					for ( DataIdentifier paramName : mas.getTargetList() ){
-						foutputs[count++]=paramName.getName();
-					}
-					
+					String[] foutputs = mas.getTargetList().stream()
+						.map(d -> d.getName()).toArray(String[]::new);
 					FunctionType ftype = fsb.getFunctionOpType();
 					FunctionOp fcall = new FunctionOp(ftype, fci.getNamespace(), fci.getName(), finputs, foutputs, false);
 					output.add(fcall);

http://git-wip-us.apache.org/repos/asf/systemml/blob/83e01b02/src/test/java/org/apache/sysml/test/integration/functions/misc/IPAFunctionInliningTest.java
----------------------------------------------------------------------
diff --git a/src/test/java/org/apache/sysml/test/integration/functions/misc/IPAFunctionInliningTest.java b/src/test/java/org/apache/sysml/test/integration/functions/misc/IPAFunctionInliningTest.java
new file mode 100644
index 0000000..f58d400
--- /dev/null
+++ b/src/test/java/org/apache/sysml/test/integration/functions/misc/IPAFunctionInliningTest.java
@@ -0,0 +1,122 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ * 
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.sysml.test.integration.functions.misc;
+
+
+import org.junit.Assert;
+import org.junit.Test;
+
+import org.apache.sysml.hops.OptimizerUtils;
+import org.apache.sysml.runtime.matrix.data.MatrixValue.CellIndex;
+import org.apache.sysml.test.integration.AutomatedTestBase;
+import org.apache.sysml.test.integration.TestConfiguration;
+import org.apache.sysml.test.utils.TestUtils;
+
+public class IPAFunctionInliningTest extends AutomatedTestBase 
+{
+	private final static String TEST_NAME1 = "IPAFunInline1"; //pos 1
+	private final static String TEST_NAME2 = "IPAFunInline2"; //pos 2
+	private final static String TEST_NAME3 = "IPAFunInline3"; //neg 1 (too large)
+	private final static String TEST_NAME4 = "IPAFunInline4"; //neg 2 (control flow)
+	
+	private final static String TEST_DIR = "functions/misc/";
+	private final static String TEST_CLASS_DIR = TEST_DIR + IPAFunctionInliningTest.class.getSimpleName() + "/";
+	
+	@Override
+	public void setUp() {
+		TestUtils.clearAssertionInformation();
+		addTestConfiguration( TEST_NAME1, new TestConfiguration(TEST_CLASS_DIR, TEST_NAME1, new String[] { "R" }) );
+		addTestConfiguration( TEST_NAME2, new TestConfiguration(TEST_CLASS_DIR, TEST_NAME2, new String[] { "R" }) );
+		addTestConfiguration( TEST_NAME3, new TestConfiguration(TEST_CLASS_DIR, TEST_NAME3, new String[] { "R" }) );
+		addTestConfiguration( TEST_NAME4, new TestConfiguration(TEST_CLASS_DIR, TEST_NAME4, new String[] { "R" }) );
+	}
+
+	@Test
+	public void testFunInline1NoIPA() {
+		runIPAFunInlineTest( TEST_NAME1, false );
+	}
+	
+	@Test
+	public void testFunInline2NoIPA() {
+		runIPAFunInlineTest( TEST_NAME2, false );
+	}
+	
+	@Test
+	public void testFunInline3NoIPA() {
+		runIPAFunInlineTest( TEST_NAME3, false );
+	}
+	
+	@Test
+	public void testFunInline4NoIPA() {
+		runIPAFunInlineTest( TEST_NAME4, false );
+	}
+	
+	@Test
+	public void testFunInline1IPA() {
+		runIPAFunInlineTest( TEST_NAME1, true );
+	}
+	
+	@Test
+	public void testFunInline2IPA() {
+		runIPAFunInlineTest( TEST_NAME2, true );
+	}
+	
+	@Test
+	public void testFunInline3IPA() {
+		runIPAFunInlineTest( TEST_NAME3, true );
+	}
+	
+	@Test
+	public void testFunInline4IPA() {
+		runIPAFunInlineTest( TEST_NAME4, true );
+	}
+	
+	private void runIPAFunInlineTest( String testName, boolean IPA )
+	{
+		boolean oldFlagIPA = OptimizerUtils.ALLOW_INTER_PROCEDURAL_ANALYSIS;
+		
+		try
+		{
+			TestConfiguration config = getTestConfiguration(testName);
+			loadTestConfiguration(config);
+			
+			String HOME = SCRIPT_DIR + TEST_DIR;
+			fullDMLScriptName = HOME + testName + ".dml";
+			programArgs = new String[]{"-explain", "-stats", "-args", output("R") };
+			
+			fullRScriptName = HOME + testName + ".R";
+			rCmd = getRCmd(expectedDir());
+
+			OptimizerUtils.ALLOW_INTER_PROCEDURAL_ANALYSIS = IPA;
+
+			//run script and compare output
+			runTest(true, false, null, -1); 
+			double val = readDMLMatrixFromHDFS("R").get(new CellIndex(1,1));
+			Assert.assertTrue("Wrong result: 7 vs "+val, Math.abs(val-7)<Math.pow(10, -14));
+			
+			//compare inlined functions
+			boolean inlined = ( IPA && (testName.equals(TEST_NAME1) || testName.equals(TEST_NAME2)) );
+			Assert.assertTrue("Unexpected function call: "+inlined, !heavyHittersContainsSubString("foo")==inlined);
+		}
+		finally {
+			OptimizerUtils.ALLOW_INTER_PROCEDURAL_ANALYSIS = oldFlagIPA;
+		}
+	}
+}

http://git-wip-us.apache.org/repos/asf/systemml/blob/83e01b02/src/test/java/org/apache/sysml/test/integration/functions/misc/IfTest.java
----------------------------------------------------------------------
diff --git a/src/test/java/org/apache/sysml/test/integration/functions/misc/IfTest.java b/src/test/java/org/apache/sysml/test/integration/functions/misc/IfTest.java
index 82635ae..08deba4 100644
--- a/src/test/java/org/apache/sysml/test/integration/functions/misc/IfTest.java
+++ b/src/test/java/org/apache/sysml/test/integration/functions/misc/IfTest.java
@@ -26,82 +26,81 @@ import org.junit.Test;
 
 public class IfTest extends AutomatedTestBase
 {
-
-    private final static String TEST_DIR = "functions/misc/";
-    private final static String TEST_NAME1 = "IfTest";
-    private final static String TEST_NAME2 = "IfTest2";
-    private final static String TEST_NAME3 = "IfTest3";
-    private final static String TEST_NAME4 = "IfTest4";
-    private final static String TEST_NAME5 = "IfTest5";
-    private final static String TEST_NAME6 = "IfTest6";
-    private final static String TEST_CLASS_DIR = TEST_DIR + IfTest.class.getSimpleName() + "/";
-
-    @Override
-    public void setUp()
-    {
-        addTestConfiguration(TEST_NAME1, new TestConfiguration(TEST_CLASS_DIR, TEST_NAME1, new String[] {}));
-        addTestConfiguration(TEST_NAME2, new TestConfiguration(TEST_CLASS_DIR, TEST_NAME2, new String[] {}));
-        addTestConfiguration(TEST_NAME3, new TestConfiguration(TEST_CLASS_DIR, TEST_NAME3, new String[] {}));
-        addTestConfiguration(TEST_NAME4, new TestConfiguration(TEST_CLASS_DIR, TEST_NAME4, new String[] {}));
-        addTestConfiguration(TEST_NAME5, new TestConfiguration(TEST_CLASS_DIR, TEST_NAME5, new String[] {}));
-        addTestConfiguration(TEST_NAME6, new TestConfiguration(TEST_CLASS_DIR, TEST_NAME6, new String[] {}));
-    }
-
-    @Test
-    public void testIf() { runTest(TEST_NAME1, 1); }
-
-    @Test
-    public void testIfElse() {
-        runTest(TEST_NAME2, 1);
-        runTest(TEST_NAME2, 2);
-    }
-
-    @Test
-    public void testIfElif() {
-        runTest(TEST_NAME3, 1);
-        runTest(TEST_NAME3, 2);
-    }
-
-    @Test
-    public void testIfElifElse() {
-        runTest(TEST_NAME4, 1);
-        runTest(TEST_NAME4, 2);
-        runTest(TEST_NAME4, 3);
-    }
-
-    @Test
-    public void testIfElifElif() {
-        runTest(TEST_NAME5, 1);
-        runTest(TEST_NAME5, 2);
-        runTest(TEST_NAME5, 3);
-    }
-
-    @Test
-    public void testIfElifElifElse() {
-        runTest(TEST_NAME6, 1);
-        runTest(TEST_NAME6, 2);
-        runTest(TEST_NAME6, 3);
-        runTest(TEST_NAME6, 4);
-    }
-
-    private void runTest( String testName, int val )
-    {
-        TestConfiguration config = getTestConfiguration(testName);
-        loadTestConfiguration(config);
-
-        String HOME = SCRIPT_DIR + TEST_DIR;
-        fullDMLScriptName = HOME + testName + ".pydml";
-        programArgs = new String[]{"-python","-nvargs","val=" + Integer.toString(val)};
-
-        if (val == 1)
-            setExpectedStdOut("A");
-        else if (val == 2)
-            setExpectedStdOut("B");
-        else if (val == 3)
-            setExpectedStdOut("C");
-        else
-            setExpectedStdOut("D");
-
-        runTest(true, false, null, -1);
-    }
+	private final static String TEST_DIR = "functions/misc/";
+	private final static String TEST_NAME1 = "IfTest";
+	private final static String TEST_NAME2 = "IfTest2";
+	private final static String TEST_NAME3 = "IfTest3";
+	private final static String TEST_NAME4 = "IfTest4";
+	private final static String TEST_NAME5 = "IfTest5";
+	private final static String TEST_NAME6 = "IfTest6";
+	private final static String TEST_CLASS_DIR = TEST_DIR + IfTest.class.getSimpleName() + "/";
+
+	@Override
+	public void setUp()
+	{
+		addTestConfiguration(TEST_NAME1, new TestConfiguration(TEST_CLASS_DIR, TEST_NAME1, new String[] {}));
+		addTestConfiguration(TEST_NAME2, new TestConfiguration(TEST_CLASS_DIR, TEST_NAME2, new String[] {}));
+		addTestConfiguration(TEST_NAME3, new TestConfiguration(TEST_CLASS_DIR, TEST_NAME3, new String[] {}));
+		addTestConfiguration(TEST_NAME4, new TestConfiguration(TEST_CLASS_DIR, TEST_NAME4, new String[] {}));
+		addTestConfiguration(TEST_NAME5, new TestConfiguration(TEST_CLASS_DIR, TEST_NAME5, new String[] {}));
+		addTestConfiguration(TEST_NAME6, new TestConfiguration(TEST_CLASS_DIR, TEST_NAME6, new String[] {}));
+	}
+
+	@Test
+	public void testIf() { runTest(TEST_NAME1, 1); }
+
+	@Test
+	public void testIfElse() {
+		runTest(TEST_NAME2, 1);
+		runTest(TEST_NAME2, 2);
+	}
+
+	@Test
+	public void testIfElif() {
+		runTest(TEST_NAME3, 1);
+		runTest(TEST_NAME3, 2);
+	}
+
+	@Test
+	public void testIfElifElse() {
+		runTest(TEST_NAME4, 1);
+		runTest(TEST_NAME4, 2);
+		runTest(TEST_NAME4, 3);
+	}
+
+	@Test
+	public void testIfElifElif() {
+		runTest(TEST_NAME5, 1);
+		runTest(TEST_NAME5, 2);
+		runTest(TEST_NAME5, 3);
+	}
+
+	@Test
+	public void testIfElifElifElse() {
+		runTest(TEST_NAME6, 1);
+		runTest(TEST_NAME6, 2);
+		runTest(TEST_NAME6, 3);
+		runTest(TEST_NAME6, 4);
+	}
+
+	private void runTest( String testName, int val )
+	{
+		TestConfiguration config = getTestConfiguration(testName);
+		loadTestConfiguration(config);
+
+		String HOME = SCRIPT_DIR + TEST_DIR;
+		fullDMLScriptName = HOME + testName + ".pydml";
+		programArgs = new String[]{"-python","-nvargs","val=" + Integer.toString(val)};
+
+		if (val == 1)
+			setExpectedStdOut("A");
+		else if (val == 2)
+			setExpectedStdOut("B");
+		else if (val == 3)
+			setExpectedStdOut("C");
+		else
+			setExpectedStdOut("D");
+
+		runTest(true, false, null, -1);
+	}
 }

http://git-wip-us.apache.org/repos/asf/systemml/blob/83e01b02/src/test/scripts/functions/misc/IPAFunInline1.dml
----------------------------------------------------------------------
diff --git a/src/test/scripts/functions/misc/IPAFunInline1.dml b/src/test/scripts/functions/misc/IPAFunInline1.dml
new file mode 100644
index 0000000..6492502
--- /dev/null
+++ b/src/test/scripts/functions/misc/IPAFunInline1.dml
@@ -0,0 +1,34 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+# 
+#   http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+
+foo = function(Matrix[Double] A, Integer type) return (Matrix[Double] B) {
+  if( type==1 )
+    B = A * A * A;
+  else
+    B = A - 0.1;
+}
+
+X = matrix(0.1, rows=100, cols=10);
+Y = foo(X, 1);
+z = as.matrix(sum(Y)*7);
+
+write(z, $1);

http://git-wip-us.apache.org/repos/asf/systemml/blob/83e01b02/src/test/scripts/functions/misc/IPAFunInline2.dml
----------------------------------------------------------------------
diff --git a/src/test/scripts/functions/misc/IPAFunInline2.dml b/src/test/scripts/functions/misc/IPAFunInline2.dml
new file mode 100644
index 0000000..3f7c36d
--- /dev/null
+++ b/src/test/scripts/functions/misc/IPAFunInline2.dml
@@ -0,0 +1,36 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+# 
+#   http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+
+foo = function(Matrix[Double] A, Integer type) return (Matrix[Double] B) {
+  if( type==1 ) {
+    T = matrix(as.scalar(A[1,1]), nrow(A), ncol(A));
+    B = T * T * T;
+  }
+  else
+    B = A - 0.1;
+}
+
+X = matrix(0.1, rows=100, cols=10);
+Y = foo(X, 1);
+z = as.matrix(sum(Y)*7);
+
+write(z, $1);

http://git-wip-us.apache.org/repos/asf/systemml/blob/83e01b02/src/test/scripts/functions/misc/IPAFunInline3.dml
----------------------------------------------------------------------
diff --git a/src/test/scripts/functions/misc/IPAFunInline3.dml b/src/test/scripts/functions/misc/IPAFunInline3.dml
new file mode 100644
index 0000000..f384717
--- /dev/null
+++ b/src/test/scripts/functions/misc/IPAFunInline3.dml
@@ -0,0 +1,39 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+# 
+#   http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+
+foo = function(Matrix[Double] A, Integer type) return (Matrix[Double] B) {
+  if( type==1 ) {
+    C = (A * A * A) / 3 + 2;
+    D = (A^2 + A^2 + 7) * A;
+    E = min(C, D)
+    B = ((E != 0) * A) * A * A;
+  }
+  else {
+    B = A - 0.1;
+  } 
+}
+
+X = matrix(0.1, rows=100, cols=10);
+Y = foo(X, 1);
+z = as.matrix(sum(Y)*7);
+
+write(z, $1);

http://git-wip-us.apache.org/repos/asf/systemml/blob/83e01b02/src/test/scripts/functions/misc/IPAFunInline4.dml
----------------------------------------------------------------------
diff --git a/src/test/scripts/functions/misc/IPAFunInline4.dml b/src/test/scripts/functions/misc/IPAFunInline4.dml
new file mode 100644
index 0000000..42dd29c
--- /dev/null
+++ b/src/test/scripts/functions/misc/IPAFunInline4.dml
@@ -0,0 +1,36 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+# 
+#   http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+
+foo = function(Matrix[Double] A, Integer type) return (Matrix[Double] B) {
+  for(i in 1:2) {
+    if( type==1 )
+      B = A * A * A;
+    else
+      B = A - 0.1; 
+  } 
+}
+
+X = matrix(0.1, rows=100, cols=10);
+Y = foo(X, 1);
+z = as.matrix(sum(Y)*7);
+
+write(z, $1);

http://git-wip-us.apache.org/repos/asf/systemml/blob/83e01b02/src/test_suites/java/org/apache/sysml/test/integration/functions/misc/ZPackageSuite.java
----------------------------------------------------------------------
diff --git a/src/test_suites/java/org/apache/sysml/test/integration/functions/misc/ZPackageSuite.java b/src/test_suites/java/org/apache/sysml/test/integration/functions/misc/ZPackageSuite.java
index e3833f4..cac39e1 100644
--- a/src/test_suites/java/org/apache/sysml/test/integration/functions/misc/ZPackageSuite.java
+++ b/src/test_suites/java/org/apache/sysml/test/integration/functions/misc/ZPackageSuite.java
@@ -36,6 +36,7 @@ import org.junit.runners.Suite;
 	InvalidFunctionAssignmentTest.class,
 	InvalidFunctionSignatureTest.class,
 	IPAConstantFoldingScalarVariablePropagationTest.class,
+	IPAFunctionInliningTest.class,
 	IPALiteralReplacementTest.class,
 	IPANnzPropagationTest.class,
 	IPAScalarRecursionTest.class,


[27/50] [abbrv] systemml git commit: [SYSTEMML-1969] Support single-precision operations on GPU backend

Posted by re...@apache.org.
http://git-wip-us.apache.org/repos/asf/systemml/blob/abbffc55/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuMatMult.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuMatMult.java b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuMatMult.java
index 21a2a35..d962027 100644
--- a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuMatMult.java
+++ b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuMatMult.java
@@ -23,13 +23,6 @@ import static jcuda.jcusparse.cusparseOperation.CUSPARSE_OPERATION_TRANSPOSE;
 import static jcuda.runtime.JCuda.cudaMemcpy;
 import static jcuda.runtime.cudaMemcpyKind.cudaMemcpyHostToDevice;
 import jcuda.Pointer;
-import jcuda.Sizeof;
-import jcuda.jcublas.JCublas2;
-import jcuda.jcublas.cublasHandle;
-import jcuda.jcublas.cublasOperation;
-import jcuda.jcusparse.JCusparse;
-import jcuda.jcusparse.cusparseHandle;
-import jcuda.runtime.JCuda;
 
 import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;
@@ -43,6 +36,11 @@ import org.apache.sysml.runtime.instructions.gpu.context.GPUContext;
 import org.apache.sysml.utils.GPUStatistics;
 import org.apache.sysml.utils.Statistics;
 
+import jcuda.jcusparse.cusparseHandle;
+import jcuda.jcublas.cublasHandle;
+import jcuda.jcublas.cublasOperation;
+import jcuda.runtime.JCuda;
+
 public class LibMatrixCuMatMult extends LibMatrixCUDA {
 
 	private static final Log LOG = LogFactory.getLog(LibMatrixCuMatMult.class.getName());
@@ -175,7 +173,7 @@ public class LibMatrixCuMatMult extends LibMatrixCUDA {
 
 			// Step 3: Invoke the kernel
 			long t1 = GPUStatistics.DISPLAY_STATISTICS ? System.nanoTime() : 0;
-			JCusparse.cusparseDcsrgemm(getCusparseHandle(gCtx), transa, transb, params.m, params.n, params.k, A.descr,
+			cudaSupportFunctions.cusparsecsrgemm(getCusparseHandle(gCtx), transa, transb, params.m, params.n, params.k, A.descr,
 					(int) A.nnz, A.val, A.rowPtr, A.colInd, B.descr, (int) B.nnz, B.val, B.rowPtr, B.colInd, C.descr,
 					C.val, C.rowPtr, C.colInd);
 			if (GPUStatistics.DISPLAY_STATISTICS)
@@ -239,7 +237,7 @@ public class LibMatrixCuMatMult extends LibMatrixCUDA {
 	 * allocated in dense row-major format and A is sparse.
 	 * 
 	 * Other than input and output, this method requires additional memory =
-	 * outRLen * outCLen * Sizeof.DOUBLE
+	 * outRLen * outCLen * sizeOfDataType
 	 * 
 	 * @param gCtx
 	 *            a valid {@link GPUContext}
@@ -276,7 +274,7 @@ public class LibMatrixCuMatMult extends LibMatrixCUDA {
 		// t(C) = t(B) %*% t(A)
 		Pointer output = null;
 		if (outRLen != 1 && outCLen != 1) {
-			output = gCtx.allocate(outRLen * outCLen * Sizeof.DOUBLE);
+			output = gCtx.allocate(outRLen * outCLen * sizeOfDataType);
 		} else {
 			// no transpose required for vector output
 			output = C;
@@ -287,7 +285,7 @@ public class LibMatrixCuMatMult extends LibMatrixCUDA {
 		if (outRLen != 1 && outCLen != 1) {
 			// Transpose: C = t(output)
 			long t0 = GPUStatistics.DISPLAY_STATISTICS ? System.nanoTime() : 0;
-			JCublas2.cublasDgeam(gCtx.getCublasHandle(), cublasOperation.CUBLAS_OP_T, cublasOperation.CUBLAS_OP_T,
+			cudaSupportFunctions.cublasgeam(gCtx.getCublasHandle(), cublasOperation.CUBLAS_OP_T, cublasOperation.CUBLAS_OP_T,
 					toInt(outCLen), toInt(outRLen), one(), output, toInt(outRLen), zero(), new Pointer(),
 					toInt(outRLen), C, toInt(outCLen));
 			if (!DMLScript.EAGER_CUDA_FREE)
@@ -331,7 +329,7 @@ public class LibMatrixCuMatMult extends LibMatrixCUDA {
 			int m = toInt(param.rightNumRows);
 			int n = toInt(param.rightNumCols);
 			int transa = reverseCusparseOp(cusparseOp(param.isLeftTransposed));
-			JCusparse.cusparseDcsrmv(handle, transa, m, n, toInt(B.nnz), one(), B.descr, B.val, B.rowPtr, B.colInd, A,
+			cudaSupportFunctions.cusparsecsrmv(handle, transa, m, n, toInt(B.nnz), one(), B.descr, B.val, B.rowPtr, B.colInd, A,
 					zero(), C);
 			kernel = GPUInstruction.MISC_TIMER_SPARSE_MATRIX_DENSE_VECTOR_LIB;
 		} else {
@@ -342,7 +340,7 @@ public class LibMatrixCuMatMult extends LibMatrixCUDA {
 			int transa = reverseCusparseOp(cusparseOp(param.isLeftTransposed));
 			int transb = cusparseOp(param.isRightTransposed);
 			LOG.debug(" GPU Sparse-Dense Matrix Multiply (rhs transpose) ");
-			JCusparse.cusparseDcsrmm2(handle, transa, transb, m, param.n, k, toInt(B.nnz), one(), B.descr, B.val,
+			cudaSupportFunctions.cusparsecsrmm2(handle, transa, transb, m, param.n, k, toInt(B.nnz), one(), B.descr, B.val,
 					B.rowPtr, B.colInd, A, param.ldb, zero(), C, param.ldc);
 		}
 		if (GPUStatistics.DISPLAY_STATISTICS)
@@ -383,7 +381,7 @@ public class LibMatrixCuMatMult extends LibMatrixCUDA {
 			// Vector product
 			LOG.debug(" GPU Dense-dense Vector Product");
 			double[] result = { 0 };
-			JCublas2.cublasDdot(handle, param.k, A, 1, B, 1, Pointer.to(result));
+			cudaSupportFunctions.cublasdot(handle, param.k, A, 1, B, 1, Pointer.to(result));
 			// By default in CuBlas V2, cublas pointer mode is set to
 			// CUBLAS_POINTER_MODE_HOST.
 			// This means that scalar values passed are on host (as opposed to
@@ -391,7 +389,7 @@ public class LibMatrixCuMatMult extends LibMatrixCUDA {
 			// The result is copied from the host back to the device so that the
 			// rest of
 			// infrastructure can treat it uniformly.
-			cudaMemcpy(C, Pointer.to(result), 1 * Sizeof.DOUBLE, cudaMemcpyHostToDevice);
+			cudaMemcpy(C, Pointer.to(result), 1 * sizeOfDataType, cudaMemcpyHostToDevice);
 			kernel = GPUInstruction.MISC_TIMER_DENSE_DOT_LIB;
 		} else if (param.m == 1) {
 			// Vector-matrix multiply
@@ -399,18 +397,18 @@ public class LibMatrixCuMatMult extends LibMatrixCUDA {
 			transb = reverseCublasOp(transb);
 			int rightNumRows = (transb == CUSPARSE_OPERATION_TRANSPOSE) ? param.k : param.n;
 			int rightNumCols = (transb == CUSPARSE_OPERATION_TRANSPOSE) ? param.n : param.k;
-			JCublas2.cublasDgemv(handle, transb, rightNumRows, rightNumCols, one(), B, param.ldb, A, 1, zero(), C, 1);
+			cudaSupportFunctions.cublasgemv(handle, transb, rightNumRows, rightNumCols, one(), B, param.ldb, A, 1, zero(), C, 1);
 			kernel = GPUInstruction.MISC_TIMER_DENSE_VECTOR_DENSE_MATRIX_LIB;
 		} else if (param.n == 1) {
 			// Matrix-vector multiply
 			LOG.debug(" GPU Dense Matrix-Vector Multiply");
 			int leftNumRows = (transa == CUSPARSE_OPERATION_NON_TRANSPOSE) ? param.m : param.k;
 			int leftNumCols = (transa == CUSPARSE_OPERATION_NON_TRANSPOSE) ? param.k : param.m;
-			JCublas2.cublasDgemv(handle, transa, leftNumRows, leftNumCols, one(), A, param.lda, B, 1, zero(), C, 1);
+			cudaSupportFunctions.cublasgemv(handle, transa, leftNumRows, leftNumCols, one(), A, param.lda, B, 1, zero(), C, 1);
 			kernel = GPUInstruction.MISC_TIMER_DENSE_MATRIX_DENSE_VECTOR_LIB;
 		} else {
 			LOG.debug(" GPU Dense-Dense Matrix Multiply ");
-			JCublas2.cublasDgemm(handle, transa, transb, param.m, param.n, param.k, one(), A, param.lda, B, param.ldb,
+			cudaSupportFunctions.cublasgemm(handle, transa, transb, param.m, param.n, param.k, one(), A, param.lda, B, param.ldb,
 					zero(), C, param.ldc);
 			kernel = GPUInstruction.MISC_TIMER_DENSE_MATRIX_DENSE_MATRIX_LIB;
 		}

http://git-wip-us.apache.org/repos/asf/systemml/blob/abbffc55/src/main/java/org/apache/sysml/runtime/matrix/data/MatrixBlock.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/matrix/data/MatrixBlock.java b/src/main/java/org/apache/sysml/runtime/matrix/data/MatrixBlock.java
index 8ee6f8d..c023890 100644
--- a/src/main/java/org/apache/sysml/runtime/matrix/data/MatrixBlock.java
+++ b/src/main/java/org/apache/sysml/runtime/matrix/data/MatrixBlock.java
@@ -3852,8 +3852,9 @@ public class MatrixBlock extends MatrixValue implements CacheBlock, Externalizab
 	 * @param ru row upper
 	 * @param cl column lower
 	 * @param cu column upper
-	 * @param ret ?
-	 * @return matrix block
+	 * @param deep should perform deep copy
+	 * @param ret output matrix block
+	 * @return matrix block output matrix block
 	 * @throws DMLRuntimeException if DMLRuntimeException occurs
 	 */
 	public MatrixBlock sliceOperations(int rl, int ru, int cl, int cu, boolean deep, CacheBlock ret) 

http://git-wip-us.apache.org/repos/asf/systemml/blob/abbffc55/src/main/java/org/apache/sysml/runtime/matrix/data/SinglePrecisionCudaSupportFunctions.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/matrix/data/SinglePrecisionCudaSupportFunctions.java b/src/main/java/org/apache/sysml/runtime/matrix/data/SinglePrecisionCudaSupportFunctions.java
new file mode 100644
index 0000000..128bb39
--- /dev/null
+++ b/src/main/java/org/apache/sysml/runtime/matrix/data/SinglePrecisionCudaSupportFunctions.java
@@ -0,0 +1,208 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ * 
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.sysml.runtime.matrix.data;
+
+import static jcuda.runtime.JCuda.cudaMemcpy;
+import static jcuda.runtime.cudaMemcpyKind.cudaMemcpyDeviceToHost;
+import static jcuda.runtime.cudaMemcpyKind.cudaMemcpyHostToDevice;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.sysml.runtime.DMLRuntimeException;
+import org.apache.sysml.runtime.instructions.gpu.GPUInstruction;
+import org.apache.sysml.runtime.instructions.gpu.context.GPUContext;
+import org.apache.sysml.utils.GPUStatistics;
+
+import jcuda.Pointer;
+import jcuda.Sizeof;
+import jcuda.jcublas.JCublas2;
+import jcuda.jcublas.cublasHandle;
+import jcuda.jcusolver.JCusolverDn;
+import jcuda.jcusolver.cusolverDnHandle;
+import jcuda.jcusparse.JCusparse;
+import jcuda.jcusparse.cusparseHandle;
+import jcuda.jcusparse.cusparseMatDescr;
+
+public class SinglePrecisionCudaSupportFunctions implements CudaSupportFunctions {
+	
+	private static final Log LOG = LogFactory.getLog(SinglePrecisionCudaSupportFunctions.class.getName());
+
+	@Override
+	public int cusparsecsrgemm(cusparseHandle handle, int transA, int transB, int m, int n, int k,
+			cusparseMatDescr descrA, int nnzA, Pointer csrValA, Pointer csrRowPtrA, Pointer csrColIndA,
+			cusparseMatDescr descrB, int nnzB, Pointer csrValB, Pointer csrRowPtrB, Pointer csrColIndB,
+			cusparseMatDescr descrC, Pointer csrValC, Pointer csrRowPtrC, Pointer csrColIndC) {
+		return JCusparse.cusparseScsrgemm(handle, transA,  transB,  m,  n,  k,
+				 descrA,  nnzA,  csrValA,  csrRowPtrA,  csrColIndA,
+				 descrB,  nnzB,  csrValB,  csrRowPtrB,  csrColIndB,
+				 descrC,  csrValC,  csrRowPtrC,  csrColIndC);
+	}
+
+	@Override
+	public int cublasgeam(cublasHandle handle, int transa, int transb, int m, int n, Pointer alpha, Pointer A, int lda,
+			Pointer beta, Pointer B, int ldb, Pointer C, int ldc) {
+		return JCublas2.cublasSgeam(handle, transa, transb, m, n, alpha, A, lda, beta, B, ldb, C, ldc);
+	}
+
+	@Override
+	public int cusparsecsrmv(cusparseHandle handle, int transA, int m, int n, int nnz, Pointer alpha,
+			cusparseMatDescr descrA, Pointer csrValA, Pointer csrRowPtrA, Pointer csrColIndA, Pointer x, Pointer beta,
+			Pointer y) {
+		return JCusparse.cusparseScsrmv(handle, transA, m, n, nnz, alpha, 
+				descrA, csrValA, csrRowPtrA, csrColIndA, x, beta, y);
+	}
+	
+	@Override
+	public int	cusparsecsrmm2(cusparseHandle handle, int transa, int transb, int m, int n, int k, int nnz, jcuda.Pointer alpha, cusparseMatDescr descrA, 
+			jcuda.Pointer csrValA, jcuda.Pointer csrRowPtrA, jcuda.Pointer csrColIndA, 
+			jcuda.Pointer B, int ldb, jcuda.Pointer beta, jcuda.Pointer C, int ldc) {
+		return JCusparse.cusparseScsrmm2(handle, transa, transb, m, n, k, nnz, alpha, descrA, csrValA, 
+				csrRowPtrA, csrColIndA, B, ldb, beta, C, ldc);
+	}
+
+	@Override
+	public int cublasdot(cublasHandle handle, int n, Pointer x, int incx, Pointer y, int incy, Pointer result) {
+		return JCublas2.cublasSdot(handle, n, x, incx, y, incy, result);
+	}
+
+	@Override
+	public int cublasgemv(cublasHandle handle, int trans, int m, int n, Pointer alpha, Pointer A, int lda, Pointer x,
+			int incx, Pointer beta, Pointer y, int incy) {
+		return JCublas2.cublasSgemv(handle, trans, m, n, alpha, A, lda, x, incx, beta, y, incy);
+	}
+
+	@Override
+	public int cublasgemm(cublasHandle handle, int transa, int transb, int m, int n, int k, Pointer alpha, Pointer A,
+			int lda, Pointer B, int ldb, Pointer beta, Pointer C, int ldc) {
+		return JCublas2.cublasSgemm(handle, transa, transb, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc);
+	}
+
+	@Override
+	public int cusparsecsr2csc(cusparseHandle handle, int m, int n, int nnz, Pointer csrVal, Pointer csrRowPtr,
+			Pointer csrColInd, Pointer cscVal, Pointer cscRowInd, Pointer cscColPtr, int copyValues, int idxBase) {
+		return JCusparse.cusparseScsr2csc(handle, m, n, nnz, csrVal, csrRowPtr, csrColInd, cscVal, cscRowInd, cscColPtr, copyValues, idxBase);
+	}
+
+	@Override
+	public int cublassyrk(cublasHandle handle, int uplo, int trans, int n, int k, Pointer alpha, Pointer A, int lda,
+			Pointer beta, Pointer C, int ldc) {
+		return JCublas2.cublasSsyrk(handle, uplo, trans, n, k, alpha, A, lda, beta, C, ldc);
+	}
+
+	@Override
+	public int cublasaxpy(cublasHandle handle, int n, Pointer alpha, Pointer x, int incx, Pointer y, int incy) {
+		return JCublas2.cublasSaxpy(handle, n, alpha, x, incx, y, incy);
+	}
+
+	@Override
+	public int cublastrsm(cublasHandle handle, int side, int uplo, int trans, int diag, int m, int n, Pointer alpha,
+			Pointer A, int lda, Pointer B, int ldb) {
+		return JCublas2.cublasStrsm(handle, side, uplo, trans, diag, m, n, alpha, A, lda, B, ldb);
+	}
+
+	@Override
+	public int cusolverDngeqrf_bufferSize(cusolverDnHandle handle, int m, int n, Pointer A, int lda, int[] Lwork) {
+		return JCusolverDn.cusolverDnSgeqrf_bufferSize(handle, m, n, A, lda, Lwork);
+	}
+
+	@Override
+	public int cusolverDngeqrf(cusolverDnHandle handle, int m, int n, Pointer A, int lda, Pointer TAU,
+			Pointer Workspace, int Lwork, Pointer devInfo) {
+		return JCusolverDn.cusolverDnSgeqrf(handle, m, n, A, lda, TAU, Workspace, Lwork, devInfo);
+	}
+	
+	@Override
+	public int cusolverDnormqr(cusolverDnHandle handle, int side, int trans, int m, int n, int k, Pointer A, int lda,
+			Pointer tau, Pointer C, int ldc, Pointer work, int lwork, Pointer devInfo) {
+		return JCusolverDn.cusolverDnSormqr(handle, side, trans, m, n, k, A, lda, tau, C, ldc, work, lwork, devInfo);
+	}
+
+	@Override
+	public int cusparsecsrgeam(cusparseHandle handle, int m, int n, Pointer alpha, cusparseMatDescr descrA, int nnzA,
+			Pointer csrValA, Pointer csrRowPtrA, Pointer csrColIndA, Pointer beta, cusparseMatDescr descrB, int nnzB,
+			Pointer csrValB, Pointer csrRowPtrB, Pointer csrColIndB, cusparseMatDescr descrC, Pointer csrValC,
+			Pointer csrRowPtrC, Pointer csrColIndC) {
+		return JCusparse.cusparseScsrgeam(handle, m, n, alpha, descrA, nnzA, 
+				csrValA, csrRowPtrA, csrColIndA, beta, descrB, nnzB, 
+				csrValB, csrRowPtrB, csrColIndB, descrC, csrValC, csrRowPtrC, csrColIndC);
+	}
+
+	@Override
+	public int cusparsecsr2dense(cusparseHandle handle, int m, int n, cusparseMatDescr descrA, Pointer csrValA,
+			Pointer csrRowPtrA, Pointer csrColIndA, Pointer A, int lda) {
+		return JCusparse.cusparseScsr2dense(handle, m, n, descrA, csrValA, csrRowPtrA, csrColIndA, A, lda);
+	}
+	
+	@Override
+	public int cusparsedense2csr(cusparseHandle handle, int m, int n, cusparseMatDescr descrA, Pointer A, int lda,
+			Pointer nnzPerRow, Pointer csrValA, Pointer csrRowPtrA, Pointer csrColIndA) {
+		return JCusparse.cusparseSdense2csr(handle, m, n, descrA, A, lda, nnzPerRow, csrValA, csrRowPtrA, csrColIndA);
+	}
+	
+	@Override
+	public int cusparsennz(cusparseHandle handle, int dirA, int m, int n, cusparseMatDescr descrA, Pointer A, int lda,
+			Pointer nnzPerRowCol, Pointer nnzTotalDevHostPtr) {
+		return JCusparse.cusparseSnnz(handle, dirA, m, n, descrA, A, lda, nnzPerRowCol, nnzTotalDevHostPtr);
+	}
+	
+	@Override
+	public void deviceToHost(GPUContext gCtx, Pointer src, double[] dest, String instName) throws DMLRuntimeException {
+		long t1 = GPUStatistics.DISPLAY_STATISTICS  && instName != null? System.nanoTime() : 0;
+		LOG.debug("Potential OOM: Allocated additional space in deviceToHost");
+		if(PERFORM_CONVERSION_ON_DEVICE) {
+			Pointer deviceDoubleData = gCtx.allocate(((long)dest.length)*Sizeof.DOUBLE);
+			LibMatrixCUDA.float2double(gCtx, src, deviceDoubleData, dest.length);
+			cudaMemcpy(Pointer.to(dest), deviceDoubleData, ((long)dest.length)*Sizeof.DOUBLE, cudaMemcpyDeviceToHost);
+			gCtx.cudaFreeHelper(deviceDoubleData);
+		}
+		else {
+			// TODO: Perform conversion on GPU using double2float and float2double kernels
+			float [] floatData = new float[dest.length];
+			cudaMemcpy(Pointer.to(floatData), src, ((long)dest.length)*Sizeof.FLOAT, cudaMemcpyDeviceToHost);
+			for(int i = 0; i < dest.length; i++) {
+				dest[i] = floatData[i];
+			}
+		}
+		if(GPUStatistics.DISPLAY_STATISTICS && instName != null) 
+			GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_DEVICE_TO_HOST, System.nanoTime() - t1);
+	}
+
+	@Override
+	public void hostToDevice(GPUContext gCtx, double[] src, Pointer dest, String instName) throws DMLRuntimeException {
+		LOG.debug("Potential OOM: Allocated additional space in hostToDevice");
+		// TODO: Perform conversion on GPU using double2float and float2double kernels
+		long t1 = GPUStatistics.DISPLAY_STATISTICS  && instName != null? System.nanoTime() : 0;
+		if(PERFORM_CONVERSION_ON_DEVICE) {
+			Pointer deviceDoubleData = gCtx.allocate(((long)src.length)*Sizeof.DOUBLE);
+			cudaMemcpy(deviceDoubleData, Pointer.to(src), ((long)src.length)*Sizeof.DOUBLE, cudaMemcpyHostToDevice);
+			LibMatrixCUDA.double2float(gCtx, deviceDoubleData, dest, src.length);
+			gCtx.cudaFreeHelper(deviceDoubleData);
+		}
+		else {
+			float [] floatData = new float[src.length];
+			for(int i = 0; i < src.length; i++) {
+				floatData[i] = (float) src[i];
+			}
+			cudaMemcpy(dest, Pointer.to(floatData), ((long)src.length)*Sizeof.FLOAT, cudaMemcpyHostToDevice);
+		}
+		
+		if(GPUStatistics.DISPLAY_STATISTICS && instName != null) 
+			GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_HOST_TO_DEVICE, System.nanoTime() - t1);
+	}
+}

http://git-wip-us.apache.org/repos/asf/systemml/blob/abbffc55/src/test/java/org/apache/sysml/test/gpu/GPUTests.java
----------------------------------------------------------------------
diff --git a/src/test/java/org/apache/sysml/test/gpu/GPUTests.java b/src/test/java/org/apache/sysml/test/gpu/GPUTests.java
index b4e4b62..d7d1ad5 100644
--- a/src/test/java/org/apache/sysml/test/gpu/GPUTests.java
+++ b/src/test/java/org/apache/sysml/test/gpu/GPUTests.java
@@ -51,9 +51,14 @@ public abstract class GPUTests extends AutomatedTestBase {
 	
 	protected final static String TEST_DIR = "org/apache/sysml/api/mlcontext";
 	protected static SparkSession spark;
-	protected final double THRESHOLD = 1e-9;    // for relative error
+	protected final double DOUBLE_PRECISION_THRESHOLD = 1e-9;    // for relative error
 	private static final boolean PRINT_MAT_ERROR = false;
 	
+	// We will use this flag until lower precision is supported on CP. 
+	private final static String DATA_TYPE = "double";  
+	protected final double SINGLE_PRECISION_THRESHOLD = 1e-3;    // for relative error
+	
+	
 	@BeforeClass
 	public static void beforeClass() {
 		spark = createSystemMLSparkSession("GPUTests", "local");
@@ -70,7 +75,9 @@ public abstract class GPUTests extends AutomatedTestBase {
 	 * @return a valid threshold
 	 */
 	protected double getTHRESHOLD() {
-		return THRESHOLD;
+		if(DATA_TYPE.equals("double"))  return DOUBLE_PRECISION_THRESHOLD;
+		else if(DATA_TYPE.equals("float"))  return SINGLE_PRECISION_THRESHOLD;
+		else throw new RuntimeException("Unsupported datatype:" + DATA_TYPE);
 	}
 
 	@After
@@ -228,7 +235,7 @@ public abstract class GPUTests extends AutomatedTestBase {
 	}
 
 	/**
-	 * Asserts that the values in two matrices are in {@link UnaryOpTests#THRESHOLD} of each other
+	 * Asserts that the values in two matrices are in {@link UnaryOpTests#DOUBLE_PRECISION_THRESHOLD} of each other
 	 *
 	 * @param expected expected matrix
 	 * @param actual   actual matrix
@@ -251,11 +258,15 @@ public abstract class GPUTests extends AutomatedTestBase {
 					double actualDouble = actualMB.quickGetValue(i, j);
 					if (expectedDouble != 0.0 && !Double.isNaN(expectedDouble) && Double.isFinite(expectedDouble)) {
 						double relativeError = Math.abs((expectedDouble - actualDouble) / expectedDouble);
+						double absoluteError = Math.abs(expectedDouble - actualDouble);
 						Formatter format = new Formatter();
 						format.format(
 								"Relative error(%f) is more than threshold (%f). Expected = %f, Actual = %f, differed at [%d, %d]",
 								relativeError, getTHRESHOLD(), expectedDouble, actualDouble, i, j);
-						Assert.assertTrue(format.toString(), relativeError < getTHRESHOLD());
+						if(DATA_TYPE.equals("double"))
+							Assert.assertTrue(format.toString(), relativeError < getTHRESHOLD());
+						else
+							Assert.assertTrue(format.toString(), relativeError < getTHRESHOLD() || absoluteError < getTHRESHOLD());
 						format.close();
 					} else {
 						Assert.assertEquals(expectedDouble, actualDouble, getTHRESHOLD());
@@ -313,6 +324,7 @@ public abstract class GPUTests extends AutomatedTestBase {
 	protected List<Object> runOnGPU(SparkSession spark, String scriptStr, Map<String, Object> inputs,
 			List<String> outStrs) {
 		MLContext gpuMLC = new MLContext(spark);
+		gpuMLC.setConfigProperty("sysml.gpu.dataType", DATA_TYPE);
 		gpuMLC.setGPU(true);
 		gpuMLC.setForceGPU(true);
 		gpuMLC.setStatistics(true);

http://git-wip-us.apache.org/repos/asf/systemml/blob/abbffc55/src/test/java/org/apache/sysml/test/gpu/MatrixMultiplicationOpTest.java
----------------------------------------------------------------------
diff --git a/src/test/java/org/apache/sysml/test/gpu/MatrixMultiplicationOpTest.java b/src/test/java/org/apache/sysml/test/gpu/MatrixMultiplicationOpTest.java
index d983716..cbc3563 100644
--- a/src/test/java/org/apache/sysml/test/gpu/MatrixMultiplicationOpTest.java
+++ b/src/test/java/org/apache/sysml/test/gpu/MatrixMultiplicationOpTest.java
@@ -50,9 +50,9 @@ public class MatrixMultiplicationOpTest extends GPUTests {
 	public void matrixMatrixTest1() {
 		String scriptStr = "O = X %*% Y";
 
-		int[] X1 = { 1, 128, 1024 };
-		int[] X2 = { 1, 128, 1024 };
-		int[] Y2 = { 1, 128, 1024 };
+		int[] X1 = { 1, 121 };
+		int[] X2 = { 1, 123 };
+		int[] Y2 = { 1, 122 };
 		double[] SX = { 0.0, 0.03, 0.3, 0.9 };
 		double[] SY = { 0.0, 0.03, 0.3, 0.9 };
 
@@ -74,8 +74,8 @@ public class MatrixMultiplicationOpTest extends GPUTests {
 	public void commonCaseMLMatrixMatrixTest1() {
 		String scriptStr = "O = X %*% Y";
 
-		int[] X1 = { 1000000 };
-		int[] X2 = { 1000 };
+		int[] X1 = { 5000 };
+		int[] X2 = { 50 };
 		int[] Y2 = { 1, 20 };
 		double[] SX = { 0.0, 0.03, 0.3 };
 		double[] SY = { 0.0, 0.03, 0.3, 0.9 };
@@ -98,9 +98,9 @@ public class MatrixMultiplicationOpTest extends GPUTests {
 	public void commonCaseDLMatrixMatrixTest1() {
 		String scriptStr = "O = X %*% Y";
 
-		int[] X1 = { 100 };
-		int[] X2 = { 600, 900  };
-		int[] Y2 = { 205800 };
+		int[] X1 = { 32 };
+		int[] X2 = { 60, 90  };
+		int[] Y2 = { 2058 };
 		double[] SX = { 0.0, 0.03, 0.3 };
 		double[] SY = { 0.0, 0.03, 0.3, 0.9 };
 
@@ -122,9 +122,9 @@ public class MatrixMultiplicationOpTest extends GPUTests {
 	public void commonCaseDLMatrixMatrixTest2() {
 		String scriptStr = "O = X %*% Y";
 
-		int[] X1 = { 64 };
-		int[] X2 = { 196608   };
-		int[] Y2 = { 512 };
+		int[] X1 = { 32 };
+		int[] X2 = { 1966   };
+		int[] Y2 = { 256 };
 		double[] SX = { 0.0, 0.03, 0.3, 0.9 };
 		double[] SY = { 0.0, 0.03, 0.3, 0.9 };
 


[31/50] [abbrv] systemml git commit: [SYSTEMML-446] Bugfix for GPU sparse right indexing with empty output

Posted by re...@apache.org.
[SYSTEMML-446] Bugfix for GPU sparse right indexing with empty output


Project: http://git-wip-us.apache.org/repos/asf/systemml/repo
Commit: http://git-wip-us.apache.org/repos/asf/systemml/commit/d3917eff
Tree: http://git-wip-us.apache.org/repos/asf/systemml/tree/d3917eff
Diff: http://git-wip-us.apache.org/repos/asf/systemml/diff/d3917eff

Branch: refs/heads/master
Commit: d3917effd988de0e0977a310c73c4f232214632e
Parents: abbffc5
Author: Niketan Pansare <np...@us.ibm.com>
Authored: Wed Oct 25 19:57:28 2017 -0700
Committer: Niketan Pansare <np...@us.ibm.com>
Committed: Wed Oct 25 19:57:28 2017 -0700

----------------------------------------------------------------------
 .../gpu/context/ExecutionConfig.java            | 29 ++------------------
 .../runtime/matrix/data/LibMatrixCUDA.java      |  8 ++++--
 2 files changed, 7 insertions(+), 30 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/systemml/blob/d3917eff/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/ExecutionConfig.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/ExecutionConfig.java b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/ExecutionConfig.java
index 7f8eb9e..cae0660 100644
--- a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/ExecutionConfig.java
+++ b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/ExecutionConfig.java
@@ -69,6 +69,8 @@ public class ExecutionConfig {
 	 * @throws DMLRuntimeException if DMLRuntimeException occurs
 	 */
 	public static ExecutionConfig getConfigForSimpleVectorOperations(int numCells) throws DMLRuntimeException {
+		if(numCells == 0)
+			throw new DMLRuntimeException("Attempting to invoke a kernel with 0 threads");
 		int deviceNumber = 0;
 		int blockDimX = getMaxBlockDim(deviceNumber);
 		int gridDimX = (int) Math.ceil((double) numCells / blockDimX);
@@ -76,32 +78,6 @@ public class ExecutionConfig {
 	}
 
 	/**
-	 * Use this for simple matrix operations and use following in the kernel
-	 * <code>
-	 * int ix = blockIdx.x * blockDim.x + threadIdx.x;
-	 * int iy = blockIdx.y * blockDim.y + threadIdx.y;
-	 * </code>
-	 * <p>
-	 * This tries to schedule as minimum grids as possible.
-	 *
-	 * @param rlen number of rows
-	 * @param clen number of columns
-	 * @return execution configuration
-	 * @throws DMLRuntimeException if DMLRuntimeException occurs
-	 */
-	public static ExecutionConfig getConfigForMatrixOperations(int rlen, int clen) throws DMLRuntimeException {
-		int deviceNumber = 0;
-		int maxBlockDim = getMaxBlockDim(deviceNumber);
-		int blockDimX = (int) Math.min(maxBlockDim, rlen);
-		int gridDimX = (int) Math.ceil((double) rlen / blockDimX);
-		int blockDimY = (int) Math.min(Math.floor(((double) maxBlockDim) / blockDimX), clen);
-		int gridDimY = (int) Math.ceil((double) clen / blockDimY);
-		if (gridDimY > 65535)
-			throw new DMLRuntimeException("Internal Error: gridDimY must be less than 65535 for all supported CUDA compute capabilites!");
-		return new ExecutionConfig(gridDimX, gridDimY, blockDimX, blockDimY);
-	}
-
-	/**
 	 * Use this for simple vector operations and use following in the kernel
 	 * <code>
 	 * int index = blockIdx.x * blockDim.x + threadIdx.x
@@ -116,7 +92,6 @@ public class ExecutionConfig {
 		return getConfigForSimpleVectorOperations(rlen * clen);
 	}
 
-
 	public ExecutionConfig(int gridDimX, int blockDimX) {
 		this.gridDimX = gridDimX;
 		this.blockDimX = blockDimX;

http://git-wip-us.apache.org/repos/asf/systemml/blob/d3917eff/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCUDA.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCUDA.java b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCUDA.java
index eb17e69..2cccde0 100644
--- a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCUDA.java
+++ b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCUDA.java
@@ -1821,17 +1821,19 @@ public class LibMatrixCUDA {
 	 */
 	protected static void sliceSparseDense(GPUContext gCtx, String instName, CSRPointer inPointer, Pointer outPointer, 
 			int rl, int ru, int cl, int cu, int inClen) throws DMLRuntimeException {
+		int size = getNnz(inPointer, rl, ru);
+		// Return since nnz of the output is 0 as outPointer is expected to be zeroed out.
+		if(size == 0) return;
+		
 		int retRlen = ru - rl + 1;
 		long t0 = GPUStatistics.DISPLAY_STATISTICS ? System.nanoTime() : 0;
 		int retClen = cu - cl + 1;
 		
-		int size = -1; String kernel = null; String timer = null;
-		
+		String kernel = null; String timer = null;
 		// Note: row-wise parallelization scheme iterates over input rows in single thread 
 		// whereas nnz parallelization scheme iterates over number of output rows in single thread.
 		if(inClen > 10 && retClen > 2*retRlen) {
 			// Perform nnz parallelization for wide and short matrices
-			size = getNnz(inPointer, rl, ru);
 			timer = GPUInstruction.MISC_TIMER_RIX_SPARSE_DENSE_OP_NNZ;
 			kernel = "slice_sparse_dense_nnz";
 		}


[37/50] [abbrv] systemml git commit: [SYSTEMML-1976] Performance codegen outer ops w/ ultra-sparse inputs

Posted by re...@apache.org.
[SYSTEMML-1976] Performance codegen outer ops w/ ultra-sparse inputs

This patch improves the performance of codegen outer operations
(especially left mm) for ultra-sparse inputs. On ultra-sparse inputs,
the allocation and maintenance of column indexes can become a
bottleneck. Accordingly, this patch adds a special case for ultra-sparse
matrices.

For the core update rules of ALS-CG, this patch improved the performance
as follows (10 iterations over the amazon review dataset):
Left (t(t(U) %*% (W * (U %*% t(V))))): 125.7s -> 14.2s
Right ((W * (U %*% t(V))) %*% V): 25.2s -> 17.9s
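
For orientation before reading the diff, the following self-contained Java sketch
illustrates the dispatch idea; it is not SystemML code. The threshold value 0.0004 and
the matrix dimensions are assumptions chosen for illustration only; the authoritative
constant is MatrixBlock.ULTRA_SPARSITY_TURN_POINT and the real logic is
SpoofOuterProduct.executeSparse in the diff below.

  // Sketch: pick the sparse traversal strategy based on input sparsity.
  public class OuterProductDispatchSketch {
    static final double ULTRA_SPARSITY_TURN_POINT = 0.0004;  // assumed value
    public static void main(String[] args) {
      long m = 1_000_000, n = 1_000_000, nnz = 5_000_000;    // assumed sizes
      double sparsity = (double) nnz / m / n;
      // blocksize chosen such that each Ui/Vj vector is reused ~8 times on average
      int blocksizeI = (int) (8L * m * n / nnz);
      if (sparsity < ULTRA_SPARSITY_TURN_POINT)
        // ultra-sparse: scan each sparse row once, no per-row index array (curk[])
        System.out.println("ultra-sparse path (row-wise, no index array)");
      else
        // moderately sparse: cache-blocked scan with per-row index array
        System.out.println("sparse path, blocksizeI=" + blocksizeI);
    }
  }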


Project: http://git-wip-us.apache.org/repos/asf/systemml/repo
Commit: http://git-wip-us.apache.org/repos/asf/systemml/commit/ede870de
Tree: http://git-wip-us.apache.org/repos/asf/systemml/tree/ede870de
Diff: http://git-wip-us.apache.org/repos/asf/systemml/diff/ede870de

Branch: refs/heads/master
Commit: ede870de8a7b01bd44ac3b6bcfe7f0e86b1c93c8
Parents: 83e01b0
Author: Matthias Boehm <mb...@gmail.com>
Authored: Fri Oct 27 22:01:36 2017 -0700
Committer: Matthias Boehm <mb...@gmail.com>
Committed: Fri Oct 27 22:01:36 2017 -0700

----------------------------------------------------------------------
 .../runtime/codegen/SpoofOuterProduct.java      | 77 +++++++++++++-------
 1 file changed, 52 insertions(+), 25 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/systemml/blob/ede870de/src/main/java/org/apache/sysml/runtime/codegen/SpoofOuterProduct.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/codegen/SpoofOuterProduct.java b/src/main/java/org/apache/sysml/runtime/codegen/SpoofOuterProduct.java
index ac8dc57..26a661a 100644
--- a/src/main/java/org/apache/sysml/runtime/codegen/SpoofOuterProduct.java
+++ b/src/main/java/org/apache/sysml/runtime/codegen/SpoofOuterProduct.java
@@ -28,6 +28,7 @@ import java.util.concurrent.ExecutorService;
 import java.util.concurrent.Executors;
 import java.util.concurrent.Future;
 
+import org.apache.sysml.hops.OptimizerUtils;
 import org.apache.sysml.runtime.DMLRuntimeException;
 import org.apache.sysml.runtime.compress.CompressedMatrixBlock;
 import org.apache.sysml.runtime.instructions.cp.DoubleObject;
@@ -391,7 +392,7 @@ public abstract class SpoofOuterProduct extends SpoofOperator
 			c[0] = sum;
 	}
 	
-	private void executeSparse(SparseBlock sblock,  double[] u, double[] v, double[][] b, double[] scalars,
+	private void executeSparse(SparseBlock sblock, double[] u, double[] v, double[][] b, double[] scalars,
 		double[] c, int m, int n, int k, long nnz, OutProdType type, int rl, int ru, int cl, int cu) 
 	{
 		boolean left = (_outerProductType== OutProdType.LEFT_OUTER_PRODUCT);
@@ -401,37 +402,63 @@ public abstract class SpoofOuterProduct extends SpoofOperator
 		//blocksize is chosen such that we reuse each  Ui/Vj vector on average 8 times,
 		//with custom blocksizeJ for wdivmm_left to avoid LLC misses on output.
 		final int blocksizeI = (int) (8L*m*n/nnz);
-		final int blocksizeJ = left ? Math.max(8,Math.min(L2_CACHESIZE/(k*8), blocksizeI)) : blocksizeI;
-		int[] curk = new int[Math.min(blocksizeI,ru-rl)];
 		
-		for( int bi = rl; bi < ru; bi+=blocksizeI ) 
+		if( OptimizerUtils.getSparsity(m, n, nnz) < MatrixBlock.ULTRA_SPARSITY_TURN_POINT ) //ultra-sparse
 		{
-			int bimin = Math.min(ru, bi+blocksizeI);
-			//prepare starting indexes for block row
-			for( int i=bi; i<bimin; i++ ) {
+			//for ultra-sparse matrices, we do not allocate the index array because
+			//its allocation and maintenance can dominate the total runtime.
+			
+			//core wdivmm block matrix mult
+			for( int i=rl, uix=rl*k; i<ru; i++, uix+=k ) {
+				if( sblock.isEmpty(i) ) continue;
+				
+				int wpos = sblock.pos(i);
+				int wlen = sblock.size(i);
+				int[] wix = sblock.indexes(i);
+				double[] wval = sblock.values(i);
+				
 				int index = (cl==0||sblock.isEmpty(i)) ? 0 : sblock.posFIndexGTE(i,cl);
-				curk[i-bi] = (index>=0) ? index : n;
+				index = wpos + ((index>=0) ? index : n);
+				for( ; index<wpos+wlen && wix[index]<cu; index++ ) {
+					genexecDense(wval[index], u, uix, v, wix[index]*k, b, scalars, c,
+						(left ? wix[index]*k : uix), m, n, k, i, wix[index]);
+				}
 			}
+		}
+		else //sparse
+		{
+			final int blocksizeJ = left ? Math.max(8,Math.min(L2_CACHESIZE/(k*8), blocksizeI)) : blocksizeI;
+			int[] curk = new int[Math.min(blocksizeI,ru-rl)];
 			
-			//blocked execution over column blocks
-			for( int bj = cl; bj < cu; bj+=blocksizeJ )
+			for( int bi = rl; bi < ru; bi+=blocksizeI ) 
 			{
-				int bjmin = Math.min(cu, bj+blocksizeJ);
-				//core wdivmm block matrix mult
-				for( int i=bi, uix=bi*k; i<bimin; i++, uix+=k ) {
-					if( sblock.isEmpty(i) ) continue;
-					
-					int wpos = sblock.pos(i);
-					int wlen = sblock.size(i);
-					int[] wix = sblock.indexes(i);
-					double[] wval = sblock.values(i);
-					
-					int index = wpos + curk[i-bi];
-					for( ; index<wpos+wlen && wix[index]<bjmin; index++ ) {
-						genexecDense(wval[index], u, uix, v, wix[index]*k, b, scalars, c,
-							(left ? wix[index]*k : uix), m, n, k, i, wix[index]);
+				int bimin = Math.min(ru, bi+blocksizeI);
+				//prepare starting indexes for block row
+				for( int i=bi; i<bimin; i++ ) {
+					int index = (cl==0||sblock.isEmpty(i)) ? 0 : sblock.posFIndexGTE(i,cl);
+					curk[i-bi] = (index>=0) ? index : n;
+				}
+				
+				//blocked execution over column blocks
+				for( int bj = cl; bj < cu; bj+=blocksizeJ )
+				{
+					int bjmin = Math.min(cu, bj+blocksizeJ);
+					//core wdivmm block matrix mult
+					for( int i=bi, uix=bi*k; i<bimin; i++, uix+=k ) {
+						if( sblock.isEmpty(i) ) continue;
+						
+						int wpos = sblock.pos(i);
+						int wlen = sblock.size(i);
+						int[] wix = sblock.indexes(i);
+						double[] wval = sblock.values(i);
+						
+						int index = wpos + curk[i-bi];
+						for( ; index<wpos+wlen && wix[index]<bjmin; index++ ) {
+							genexecDense(wval[index], u, uix, v, wix[index]*k, b, scalars, c,
+								(left ? wix[index]*k : uix), m, n, k, i, wix[index]);
+						}
+						curk[i-bi] = index - wpos;
 					}
-					curk[i-bi] = index - wpos;
 				}
 			}
 		}


[02/50] [abbrv] systemml git commit: [HOTFIX][SYSTEMML-1959] Fix sparse-sparse transpose w/ CSR input

Posted by re...@apache.org.
[HOTFIX][SYSTEMML-1959] Fix sparse-sparse transpose w/ CSR input

This patch fixes a remaining issue of sparse-sparse transpose related to
the correct handling of sparse blocks in CSR or COO format.
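
The fix hinges on the CSR layout, in which a single pair of column-index/value arrays
is shared by all rows, so every per-row position must be offset by the row's start
position pos(r). The following self-contained Java sketch is illustrative only (not
SystemML code; the tiny example matrix is an assumption) and shows the absolute-position
scan the patch switches to:

  // Sketch of a CSR row scan with absolute positions into the shared arrays.
  public class CsrScanSketch {
    public static void main(String[] args) {
      int[]    rowPtr = {0, 2, 2, 5};            // pos(r); row 1 is empty
      int[]    colIdx = {1, 4,  0, 2, 4};        // shared column-index array
      double[] vals   = {7.0, 3.0,  1.0, 5.0, 9.0}; // shared value array
      for (int r = 0; r < 3; r++) {
        int apos = rowPtr[r];                    // pos(r)
        int alen = rowPtr[r + 1] - apos;         // size(r)
        // iterate absolute positions apos..apos+alen-1 (not 0..alen-1)
        for (int j = apos; j < apos + alen; j++)
          System.out.println("(" + r + "," + colIdx[j] + ") = " + vals[j]);
      }
    }
  }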

Project: http://git-wip-us.apache.org/repos/asf/systemml/repo
Commit: http://git-wip-us.apache.org/repos/asf/systemml/commit/33559144
Tree: http://git-wip-us.apache.org/repos/asf/systemml/tree/33559144
Diff: http://git-wip-us.apache.org/repos/asf/systemml/diff/33559144

Branch: refs/heads/master
Commit: 33559144cd707e324b59ed5ca417e3d5461c2f0a
Parents: a347af3
Author: Matthias Boehm <mb...@gmail.com>
Authored: Sun Oct 15 02:42:20 2017 -0700
Committer: Matthias Boehm <mb...@gmail.com>
Committed: Sun Oct 15 02:42:20 2017 -0700

----------------------------------------------------------------------
 .../sysml/runtime/matrix/data/LibMatrixReorg.java   | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/systemml/blob/33559144/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixReorg.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixReorg.java b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixReorg.java
index 3ae07c5..dd86c27 100644
--- a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixReorg.java
+++ b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixReorg.java
@@ -859,8 +859,8 @@ public class LibMatrixReorg
 			if( cl > 0 ) {
 				for( int i=bi; i<bimin; i++ )
 					if( !a.isEmpty(i) ) {
-						int pos = a.posFIndexGTE(i, cl);
-						ix[i-bi] = (pos>=0) ? pos : a.size(i);
+						int j = a.posFIndexGTE(i, cl);
+						ix[i-bi] = (j>=0) ? j : a.size(i);
 					}
 			}
 			
@@ -868,19 +868,19 @@ public class LibMatrixReorg
 				int bjmin = Math.min(bj+blocksizeJ, cu);
 
 				//core block transpose operation
-				for( int i=bi, iix=0; i<bimin; i++, iix++ ) {
+				for( int i=bi; i<bimin; i++ ) {
 					if( a.isEmpty(i) ) continue;
 					
 					int apos = a.pos(i);
 					int alen = a.size(i);
 					int[] aix = a.indexes(i);
 					double[] avals = a.values(i);
-					int j = ix[iix]; //last block boundary
-					for( ; j<alen && aix[j]<bjmin; j++ ) {
-						c.allocate(aix[apos+j], ennz2,n2);
-						c.append(aix[apos+j], i, avals[apos+j]);
+					int j = ix[i-bi] + apos; //last block boundary
+					for( ; j<apos+alen && aix[j]<bjmin; j++ ) {
+						c.allocate(aix[j], ennz2,n2);
+						c.append(aix[j], i, avals[j]);
 					}
-					ix[iix] = j; //keep block boundary
+					ix[i-bi] = j - apos; //keep block boundary
 				}
 			}
 		}


[23/50] [abbrv] systemml git commit: [SYSTEMML-1972] Fix rewrite remove right indexing (w/ invalid ix range)

Posted by re...@apache.org.
[SYSTEMML-1972] Fix rewrite remove right indexing (w/ invalid ix range)

This patch hardens the existing rewrite of removing unnecessary right
indexing operations whose input and output are of equal size, which is
only valid with valid indexing ranges. Although we check this during
validation, there are scenarios with unknown sizes or index expressions
that can produce invalid results due to invalid index ranges. We now
check that both the row-lower and column-lower indexing ranges are 1
for the rewrite to be valid.
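
To make the new guard concrete, here is a simplified, illustrative Java stand-in for the
added HopRewriteUtils.isUnnecessaryRightIndexing check; the class and method names below
are hypothetical for this sketch, and only the conditions mirror the patch.

  // Sketch: remove X[rl:ru,cl:cu] only if sizes match, the output is not 1x1,
  // and both lower bounds are the literal 1; e.g., X[,2] on a column vector
  // keeps its indexing op (and fails at runtime) instead of silently becoming X.
  public class RightIndexingRewriteSketch {
    static boolean isUnnecessaryRightIndexing(long inRows, long inCols,
        long outRows, long outCols, long rl, long cl) {
      boolean equalSize  = inRows == outRows && inCols == outCols;
      boolean scalarLike = outRows == 1 && outCols == 1;
      return equalSize && !scalarLike && rl == 1 && cl == 1;
    }
    public static void main(String[] args) {
      // X is a 100x1 column vector
      System.out.println(isUnnecessaryRightIndexing(100, 1, 100, 1, 1, 1)); // true
      System.out.println(isUnnecessaryRightIndexing(100, 1, 100, 1, 1, 2)); // false
    }
  }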


Project: http://git-wip-us.apache.org/repos/asf/systemml/repo
Commit: http://git-wip-us.apache.org/repos/asf/systemml/commit/a472ae92
Tree: http://git-wip-us.apache.org/repos/asf/systemml/tree/a472ae92
Diff: http://git-wip-us.apache.org/repos/asf/systemml/diff/a472ae92

Branch: refs/heads/master
Commit: a472ae922827b437e00ca8331ff3db5f6c19f443
Parents: 2c37d9f
Author: Matthias Boehm <mb...@gmail.com>
Authored: Mon Oct 23 23:43:46 2017 -0700
Committer: Matthias Boehm <mb...@gmail.com>
Committed: Mon Oct 23 23:44:04 2017 -0700

----------------------------------------------------------------------
 .../java/org/apache/sysml/hops/IndexingOp.java  |  5 +----
 .../sysml/hops/rewrite/HopRewriteUtils.java     | 13 +++++++++++
 .../RewriteAlgebraicSimplificationDynamic.java  | 23 ++++++--------------
 3 files changed, 21 insertions(+), 20 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/systemml/blob/a472ae92/src/main/java/org/apache/sysml/hops/IndexingOp.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/hops/IndexingOp.java b/src/main/java/org/apache/sysml/hops/IndexingOp.java
index 23d0630..5989c66 100644
--- a/src/main/java/org/apache/sysml/hops/IndexingOp.java
+++ b/src/main/java/org/apache/sysml/hops/IndexingOp.java
@@ -118,10 +118,7 @@ public class IndexingOp extends Hop
 		Hop input = getInput().get(0);
 		
 		//rewrite remove unnecessary right indexing
-		if( dimsKnown() && input.dimsKnown() 
-			&& getDim1() == input.getDim1() && getDim2() == input.getDim2()
-			&& !(getDim1()==1 && getDim2()==1))
-		{
+		if( HopRewriteUtils.isUnnecessaryRightIndexing(this) ) {
 			setLops( input.constructLops() );
 		}
 		//actual lop construction, incl operator selection 

http://git-wip-us.apache.org/repos/asf/systemml/blob/a472ae92/src/main/java/org/apache/sysml/hops/rewrite/HopRewriteUtils.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/hops/rewrite/HopRewriteUtils.java b/src/main/java/org/apache/sysml/hops/rewrite/HopRewriteUtils.java
index 68068eb..ad2392a 100644
--- a/src/main/java/org/apache/sysml/hops/rewrite/HopRewriteUtils.java
+++ b/src/main/java/org/apache/sysml/hops/rewrite/HopRewriteUtils.java
@@ -1000,6 +1000,19 @@ public class HopRewriteUtils
 			&& hop.getInput().get(4) instanceof LiteralOp;
 	}
 	
+	public static boolean isUnnecessaryRightIndexing(Hop hop) {
+		if( !(hop instanceof IndexingOp) )
+			return false;
+		//note: in addition to equal sizes, we also check a valid
+		//starting row and column ranges of 1 in order to guard against
+		//invalid modifications in the presence of invalid index ranges
+		//(e.g., X[,2] on a column vector needs to throw an error)
+		return isEqualSize(hop, hop.getInput().get(0))
+			&& !(hop.getDim1()==1 && hop.getDim2()==1)
+			&& isLiteralOfValue(hop.getInput().get(1), 1)  //rl
+			&& isLiteralOfValue(hop.getInput().get(3), 1); //cl
+	}
+	
 	public static boolean isScalarMatrixBinaryMult( Hop hop ) {
 		return hop instanceof BinaryOp && ((BinaryOp)hop).getOp()==OpOp2.MULT
 			&& ((hop.getInput().get(0).getDataType()==DataType.SCALAR && hop.getInput().get(1).getDataType()==DataType.MATRIX)

http://git-wip-us.apache.org/repos/asf/systemml/blob/a472ae92/src/main/java/org/apache/sysml/hops/rewrite/RewriteAlgebraicSimplificationDynamic.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/hops/rewrite/RewriteAlgebraicSimplificationDynamic.java b/src/main/java/org/apache/sysml/hops/rewrite/RewriteAlgebraicSimplificationDynamic.java
index 5437535..eba06fc 100644
--- a/src/main/java/org/apache/sysml/hops/rewrite/RewriteAlgebraicSimplificationDynamic.java
+++ b/src/main/java/org/apache/sysml/hops/rewrite/RewriteAlgebraicSimplificationDynamic.java
@@ -230,23 +230,14 @@ public class RewriteAlgebraicSimplificationDynamic extends HopRewriteRule
 	
 	private static Hop removeUnnecessaryRightIndexing(Hop parent, Hop hi, int pos)
 	{
-		if( hi instanceof IndexingOp ) //indexing op
-		{
+		if( HopRewriteUtils.isUnnecessaryRightIndexing(hi) ) {
+			//remove unnecessary right indexing
 			Hop input = hi.getInput().get(0);
-			if( HopRewriteUtils.isEqualSize(hi, input)     //equal dims
-				&& !(hi.getDim1()==1 && hi.getDim2()==1) ) //not 1-1 matrix/frame	
-			{
-				//equal dims of right indexing input and output -> no need for indexing
-				//(not applied for 1-1 matrices because low potential and issues w/ error
-				//handling if out of range indexing)
-				
-				//remove unnecessary right indexing
-				HopRewriteUtils.replaceChildReference(parent, hi, input, pos);
-				HopRewriteUtils.cleanupUnreferenced(hi);
-				hi = input;
-				
-				LOG.debug("Applied removeUnnecessaryRightIndexing");
-			}			
+			HopRewriteUtils.replaceChildReference(parent, hi, input, pos);
+			HopRewriteUtils.cleanupUnreferenced(hi);
+			hi = input;
+			
+			LOG.debug("Applied removeUnnecessaryRightIndexing");
 		}
 		
 		return hi;


[21/50] [abbrv] systemml git commit: [SYSTEMML-1648] Making SVM scripts work with MLContext

Posted by re...@apache.org.
[SYSTEMML-1648] Making SVM scripts work with MLContext

Closes #687.


Project: http://git-wip-us.apache.org/repos/asf/systemml/repo
Commit: http://git-wip-us.apache.org/repos/asf/systemml/commit/596005a8
Tree: http://git-wip-us.apache.org/repos/asf/systemml/tree/596005a8
Diff: http://git-wip-us.apache.org/repos/asf/systemml/diff/596005a8

Branch: refs/heads/master
Commit: 596005a80d0b39fef9b33b55145ffda043a4573d
Parents: a51f8e8
Author: j143 <j1...@protonmail.com>
Authored: Sun Oct 22 21:35:08 2017 -0700
Committer: Matthias Boehm <mb...@gmail.com>
Committed: Sun Oct 22 21:35:08 2017 -0700

----------------------------------------------------------------------
 scripts/algorithms/l2-svm-predict.dml |  82 ++++++-------
 scripts/algorithms/l2-svm.dml         | 118 +++++++++---------
 scripts/algorithms/m-svm-predict.dml  |  45 +++----
 scripts/algorithms/m-svm.dml          | 186 ++++++++++++++++-------------
 4 files changed, 222 insertions(+), 209 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/systemml/blob/596005a8/scripts/algorithms/l2-svm-predict.dml
----------------------------------------------------------------------
diff --git a/scripts/algorithms/l2-svm-predict.dml b/scripts/algorithms/l2-svm-predict.dml
index 9052265..73e28b4 100644
--- a/scripts/algorithms/l2-svm-predict.dml
+++ b/scripts/algorithms/l2-svm-predict.dml
@@ -51,6 +51,7 @@ cmdLine_Y = ifdef($Y, " ")
 cmdLine_confusion = ifdef($confusion, " ")
 cmdLine_accuracy = ifdef($accuracy, " ")
 cmdLine_scores = ifdef($scores, " ")
+cmdLine_scoring_only = ifdef($scoring_only, FALSE)
 cmdLine_fmt = ifdef($fmt, "text")
 
 X = read($X)
@@ -59,7 +60,7 @@ w = read($model)
 
 dimensions = as.scalar(w[nrow(w),1])
 if(dimensions != ncol(X))
-	stop("Stopping due to invalid input: Model dimensions do not seem to match input data dimensions")
+  stop("Stopping due to invalid input: Model dimensions do not seem to match input data dimensions")
 	
 intercept = as.scalar(w[nrow(w)-1,1])
 negative_label = as.scalar(w[nrow(w)-2,1])
@@ -68,56 +69,51 @@ w = w[1:(nrow(w)-4),]
 
 b = 0.0
 if(intercept == 1)
-	b = as.scalar(w[nrow(w),1])
+  b = as.scalar(w[nrow(w),1])
 
 scores = b + (X %*% w[1:ncol(X),])
 
 if(cmdLine_scores != " ")
-	write(scores, cmdLine_scores, format=cmdLine_fmt)
+  write(scores, cmdLine_scores, format=cmdLine_fmt)
 
-if(cmdLine_Y != " "){
-	y = read(cmdLine_Y)
+if(!cmdLine_scoring_only){
+  Y = read(cmdLine_Y)
 
-	pred = (scores >= 0)
-	pred_labels = pred*positive_label + (1-pred)*negative_label
-	num_correct = sum(pred_labels == y)
-	acc = 100*num_correct/nrow(X)
+  pred = (scores >= 0)
+  pred_labels = pred*positive_label + (1-pred)*negative_label
+  num_correct = sum(pred_labels == Y)
+  acc = 100*num_correct/nrow(X)
 
-	acc_str = "Accuracy (%): " + acc
-	print(acc_str)
-	if(cmdLine_accuracy != " ")
-		write(acc_str, cmdLine_accuracy)
+  acc_str = "Accuracy (%): " + acc
+  print(acc_str)
+  
+  if(cmdLine_accuracy != " ")
+    write(acc_str, cmdLine_accuracy)
 
-	if(cmdLine_confusion != " "){
-		pred = 2*pred - 1
-		if(negative_label != -1 | positive_label != +1)
-        	y = 2/(positive_label - negative_label)*y - (negative_label + positive_label)/(positive_label - negative_label)
-		
-		pred_is_minus = (pred == -1)
-		pred_is_plus = 1 - pred_is_minus
-		y_is_minus = (y == -1)
-		y_is_plus = 1 - y_is_minus
-
-		check_min_y_minus = sum(pred_is_minus*y_is_minus)
-		check_min_y_plus = sum(pred_is_minus*y_is_plus)
-		check_max_y_minus = sum(pred_is_plus*y_is_minus)
-		check_max_y_plus = sum(pred_is_plus*y_is_plus)
+  if(cmdLine_confusion != " "){
+  
+    pred = 2*pred - 1
+    
+    if(negative_label != -1 | positive_label != +1)
+      Y = 2/(positive_label - negative_label)*Y - (negative_label + positive_label)/(positive_label - negative_label)
+    		
+    pred_is_minus = (pred == -1)
+    pred_is_plus = 1 - pred_is_minus
+    y_is_minus = (Y == -1)
+    y_is_plus = 1 - y_is_minus
 
-		#s = check_min_y_minus + "," + check_min_y_plus
-		#s = append(s, check_max_y_minus + "," + check_max_y_plus)
-		#s = append(s, "")
-		#write(s, cmdLine_confusion)
+    check_min_y_minus = sum(pred_is_minus*y_is_minus)
+    check_min_y_plus = sum(pred_is_minus*y_is_plus)
+    check_max_y_minus = sum(pred_is_plus*y_is_minus)
+    check_max_y_plus = sum(pred_is_plus*y_is_plus)
 		
-		confusion_mat = matrix(0, rows=3, cols=3)
-        confusion_mat[1,2] = negative_label
-        confusion_mat[1,3] = positive_label
-        confusion_mat[2,1] = negative_label
-        confusion_mat[3,1] = positive_label
-        confusion_mat[2,2] = check_min_y_minus
-        confusion_mat[2,3] = check_max_y_minus
-        confusion_mat[3,2] = check_min_y_plus
-        confusion_mat[3,3] = check_max_y_plus
-
-        write(confusion_mat, cmdLine_confusion, format="csv")
-	}
+    confusion_mat = matrix(0, rows=2, cols=2)
+    confusion_mat[1,1] = check_min_y_minus
+    confusion_mat[1,2] = check_min_y_plus
+    confusion_mat[2,1] = check_max_y_minus
+    confusion_mat[2,2] = check_max_y_plus
+	
+    write(confusion_mat, cmdLine_confusion, format="csv")
+  }
 }
+

http://git-wip-us.apache.org/repos/asf/systemml/blob/596005a8/scripts/algorithms/l2-svm.dml
----------------------------------------------------------------------
diff --git a/scripts/algorithms/l2-svm.dml b/scripts/algorithms/l2-svm.dml
index d5c7e02..141ef82 100644
--- a/scripts/algorithms/l2-svm.dml
+++ b/scripts/algorithms/l2-svm.dml
@@ -61,7 +61,7 @@ X = read($X)
 Y = read($Y)
 
 if(nrow(X) < 2)
-	stop("Stopping due to invalid inputs: Not possible to learn a binary class classifier without at least 2 rows")
+  stop("Stopping due to invalid inputs: Not possible to learn a binary class classifier without at least 2 rows")
 
 check_min = min(Y)
 check_max = max(Y)
@@ -69,46 +69,44 @@ num_min = sum(Y == check_min)
 num_max = sum(Y == check_max)
 
 if(check_min == check_max)
-	stop("Stopping due to invalid inputs: Y seems to contain exactly one label")
+  stop("Stopping due to invalid inputs: Y seems to contain exactly one label")
 
 if(num_min + num_max != nrow(Y))
-	stop("Stopping due to invalid inputs: Y seems to contain more than 2 labels")
+  stop("Stopping due to invalid inputs: Y seems to contain more than 2 labels")
 	
 if(check_min != -1 | check_max != +1) 
-	Y = 2/(check_max - check_min)*Y - (check_min + check_max)/(check_max - check_min)
+  Y = 2/(check_max - check_min)*Y - (check_min + check_max)/(check_max - check_min)
 
 positive_label = check_max
 negative_label = check_min
 
-continue = 1
-
 intercept = cmdLine_icpt
 if(intercept != 0 & intercept != 1)
-	stop("Stopping due to invalid argument: Currently supported intercept options are 0 and 1")
+  stop("Stopping due to invalid argument: Currently supported intercept options are 0 and 1")
 
 epsilon = cmdLine_tol
 if(epsilon < 0)
-	stop("Stopping due to invalid argument: Tolerance (tol) must be non-negative")
+  stop("Stopping due to invalid argument: Tolerance (tol) must be non-negative")
 	
 lambda = cmdLine_reg
 if(lambda < 0)
-	stop("Stopping due to invalid argument: Regularization constant (reg) must be non-negative")
+  stop("Stopping due to invalid argument: Regularization constant (reg) must be non-negative")
 	
 maxiterations = cmdLine_maxiter
 if(maxiterations < 1)
-	stop("Stopping due to invalid argument: Maximum iterations should be a positive integer")
+  stop("Stopping due to invalid argument: Maximum iterations should be a positive integer")
 
 num_samples = nrow(X)
 dimensions = ncol(X)
 
 if (intercept == 1) {
-	ones  = matrix(1, rows=num_samples, cols=1)
-	X = cbind(X, ones);
+  ones  = matrix(1, rows=num_samples, cols=1)
+  X = cbind(X, ones);
 }
 
 num_rows_in_w = dimensions
 if(intercept == 1){
-	num_rows_in_w = num_rows_in_w + 1
+  num_rows_in_w = num_rows_in_w + 1
 }
 w = matrix(0, rows=num_rows_in_w, cols=1)
 
@@ -118,54 +116,49 @@ s = g_old
 Xw = matrix(0, rows=nrow(X), cols=1)
 debug_str = "# Iter, Obj"
 iter = 0
-while(continue == 1 & iter < maxiterations)  {
-	# minimizing primal obj along direction s
-    step_sz = 0
-    Xd = X %*% s
-    wd = lambda * sum(w * s)
-    dd = lambda * sum(s * s)
-    continue1 = 1
-    while(continue1 == 1){
-		tmp_Xw = Xw + step_sz*Xd
-      	out = 1 - Y * (tmp_Xw)
-      	sv = (out > 0)
-      	out = out * sv
-      	g = wd + step_sz*dd - sum(out * Y * Xd)
-      	h = dd + sum(Xd * sv * Xd)
-      	step_sz = step_sz - g/h
-      	if (g*g/h < 0.0000000001){
-        	continue1 = 0
-      	}
-    }
-
-    #update weights
-    w = w + step_sz*s
-	Xw = Xw + step_sz*Xd
-	
-    out = 1 - Y * Xw
+continue = TRUE
+while(continue & iter < maxiterations)  {
+  # minimizing primal obj along direction s
+  step_sz = 0
+  Xd = X %*% s
+  wd = lambda * sum(w * s)
+  dd = lambda * sum(s * s)
+  
+  continue1 = TRUE
+  while(continue1){
+    tmp_Xw = Xw + step_sz*Xd
+    out = 1 - Y * (tmp_Xw)
     sv = (out > 0)
-    out = sv * out
-    obj = 0.5 * sum(out * out) + lambda/2 * sum(w * w)
-    g_new = t(X) %*% (out * Y) - lambda * w
-
-    print("ITER " + iter + ": OBJ=" + obj)
-	debug_str = append(debug_str, iter + "," + obj)
+    out = out * sv
+    g = wd + step_sz*dd - sum(out * Y * Xd)
+    h = dd + sum(Xd * sv * Xd)
+    step_sz = step_sz - g/h
+    
+    continue1 = (g*g/h >= 0.0000000001);
+  }
+
+  #update weights
+  w = w + step_sz*s
+  Xw = Xw + step_sz*Xd
 	
-    tmp = sum(s * g_old)
-    if(step_sz*tmp < epsilon*obj){
-    	continue = 0
-    }
-
-    #non-linear CG step
-    be = sum(g_new * g_new)/sum(g_old * g_old)
-    s = be * s + g_new
-    g_old = g_new
-
-	if(sum(s^2) == 0){
-	    continue = 0
-	}
-
-    iter = iter + 1
+  out = 1 - Y * Xw
+  sv = (out > 0)
+  out = sv * out
+  obj = 0.5 * sum(out * out) + lambda/2 * sum(w * w)
+  g_new = t(X) %*% (out * Y) - lambda * w
+
+  print("ITER " + iter + ": OBJ=" + obj)
+  debug_str = append(debug_str, iter + "," + obj)
+	
+  tmp = sum(s * g_old)
+  continue = (step_sz*tmp >= epsilon*obj & sum(s^2) != 0);
+
+  #non-linear CG step
+  be = sum(g_new * g_new)/sum(g_old * g_old)
+  s = be * s + g_new
+  g_old = g_new
+    
+  iter = iter + 1
 }
 
 extra_model_params = matrix(0, rows=4, cols=1)
@@ -174,10 +167,13 @@ extra_model_params[2,1] = negative_label
 extra_model_params[3,1] = intercept
 extra_model_params[4,1] = dimensions
 
+weights = w
 w = t(cbind(t(w), t(extra_model_params)))
 write(w, $model, format=cmdLine_fmt)
+# write(extra_model_params, " ", format=cmdLine_fmt)
+# write(weights, " ", format=cmdLine_fmt)
 
 logFile = $Log
 if(logFile != " ") {
-	write(debug_str, logFile)
-}
\ No newline at end of file
+  write(debug_str, logFile)
+}

http://git-wip-us.apache.org/repos/asf/systemml/blob/596005a8/scripts/algorithms/m-svm-predict.dml
----------------------------------------------------------------------
diff --git a/scripts/algorithms/m-svm-predict.dml b/scripts/algorithms/m-svm-predict.dml
index a959836..8ad8bf0 100644
--- a/scripts/algorithms/m-svm-predict.dml
+++ b/scripts/algorithms/m-svm-predict.dml
@@ -26,13 +26,14 @@
 # accuracy (%) for the predictions
 #
 # Example Usage:
-# hadoop jar SystemML.jar -f m-svm-predict.dml -nvargs X=data Y=labels model=model scores=scores accuracy=accuracy confusion=confusion fmt="text"
+# hadoop jar SystemML.jar -f m-svm-predict.dml -nvargs X=data Y=labels scoring_only=FALSE model=model scores=scores accuracy=accuracy confusion=confusion fmt="text"
 #													 
 
 cmdLine_Y = ifdef($Y, " ")
 cmdLine_confusion = ifdef($confusion, " ")
 cmdLine_accuracy = ifdef($accuracy, " ")
 cmdLine_scores = ifdef($scores, " ")
+cmdLine_scoring_only = ifdef($scoring_only, FALSE)
 cmdLine_fmt = ifdef($fmt, "text")
 
 X = read($X);
@@ -40,7 +41,7 @@ W = read($model);
 
 dimensions = as.scalar(W[nrow(W),1])
 if(dimensions != ncol(X))
-	stop("Stopping due to invalid input: Model dimensions do not seem to match input data dimensions")
+  stop("Stopping due to invalid input: Model dimensions do not seem to match input data dimensions")
 
 intercept = as.scalar(W[nrow(W)-1,1])
 W = W[1:(nrow(W)-2),]
@@ -51,34 +52,34 @@ m=ncol(X);
 
 b = matrix(0, rows=1, cols=num_classes)
 if (intercept == 1)
-	b = W[m+1,]
+  b = W[m+1,]
 
 ones = matrix(1, rows=N, cols=1)
 scores = X %*% W[1:m,] + ones %*% b;
 	
 if(cmdLine_scores != " ")
-	write(scores, cmdLine_scores, format=cmdLine_fmt);
+  write(scores, cmdLine_scores, format=cmdLine_fmt);
 
-if(cmdLine_Y != " "){
-	y = read(cmdLine_Y);
+if(!cmdLine_scoring_only){
+  Y = read(cmdLine_Y);
 	
-	if(min(y) < 1)
-		stop("Stopping due to invalid argument: Label vector (Y) must be recoded")
+  if(min(Y) < 1)
+    stop("Stopping due to invalid argument: Label vector (Y) must be recoded")
 	
-	pred = rowIndexMax(scores);
-	correct_percentage = sum((pred - y) == 0) / N * 100;
-	
-	acc_str = "Accuracy (%): " + correct_percentage
-	print(acc_str)
-	if(cmdLine_accuracy != " ")
-		write(acc_str, cmdLine_accuracy)
+  pred = rowIndexMax(scores);
+  correct_percentage = sum((pred - Y) == 0) / N * 100;
+  
+  acc_str = "Accuracy (%): " + correct_percentage
+  print(acc_str)
+  if(cmdLine_accuracy != " ")
+    write(acc_str, cmdLine_accuracy)
 
-	num_classes_ground_truth = max(y)
-	if(num_classes < num_classes_ground_truth)
-		num_classes = num_classes_ground_truth
+  num_classes_ground_truth = max(Y)
+  if(num_classes < num_classes_ground_truth)
+    num_classes = num_classes_ground_truth
 
-	if(cmdLine_confusion != " "){
-		confusion_mat = table(y, pred, num_classes, num_classes)
-		write(confusion_mat, cmdLine_confusion, format="csv")
-	}
+  if(cmdLine_confusion != " "){
+    confusion_mat = table(Y, pred, num_classes, num_classes)
+    write(confusion_mat, cmdLine_confusion, format="csv")
+  }
 }

http://git-wip-us.apache.org/repos/asf/systemml/blob/596005a8/scripts/algorithms/m-svm.dml
----------------------------------------------------------------------
diff --git a/scripts/algorithms/m-svm.dml b/scripts/algorithms/m-svm.dml
index 8d3d5f3..6c11811 100644
--- a/scripts/algorithms/m-svm.dml
+++ b/scripts/algorithms/m-svm.dml
@@ -26,6 +26,23 @@
 # Assume SVM_HOME is set to the home of the dml script
 # Assume input and output directories are on hdfs as INPUT_DIR and OUTPUT_DIR
 # Assume epsilon = 0.001, lambda=1.0, max_iterations = 100
+#
+# INPUT PARAMETERS:
+# ---------------------------------------------------------------------------------------------
+# NAME      TYPE    DEFAULT     MEANING
+# ---------------------------------------------------------------------------------------------
+# X         String  ---         Location to read the matrix X of feature vectors
+# Y         String  ---         Location to read response matrix Y
+# icpt      Int     0           Intercept presence
+#                               0 = no intercept
+#                               1 = add intercept;
+# tol       Double  0.001       Tolerance (epsilon);
+# reg       Double  1.0         Regularization parameter
+# maxiter   Int     100         Maximum number of conjugate gradient iterations
+# model     String  ---         Location to write model
+# fmt       String  "text"      The output format of the output, such as "text" or "csv"
+# Log       String  ---         [OPTIONAL] Location to write the log file
+# ---------------------------------------------------------------------------------------------
 # 
 # hadoop jar SystemML.jar -f $SVM_HOME/m-svm.dml -nvargs X=$INPUT_DIR/X Y=$INPUT_DIR/y icpt=intercept tol=.001 reg=1.0 maxiter=100 model=$OUTPUT_DIR/w Log=$OUTPUT_DIR/Log fmt="text"
 #
@@ -41,141 +58,144 @@ print("icpt=" + cmdLine_icpt + " tol=" + cmdLine_tol + " reg=" + cmdLine_reg + "
 X = read($X)
 
 if(nrow(X) < 2)
-	stop("Stopping due to invalid inputs: Not possible to learn a classifier without at least 2 rows")
+  stop("Stopping due to invalid inputs: Not possible to learn a classifier without at least 2 rows")
 
 dimensions = ncol(X)
 
 Y = read($Y)
 
 if(nrow(X) != nrow(Y))
-	stop("Stopping due to invalid argument: Numbers of rows in X and Y must match")
+  stop("Stopping due to invalid argument: Numbers of rows in X and Y must match")
 
 intercept = cmdLine_icpt
 if(intercept != 0 & intercept != 1)
-	stop("Stopping due to invalid argument: Currently supported intercept options are 0 and 1")
+  stop("Stopping due to invalid argument: Currently supported intercept options are 0 and 1")
 
 min_y = min(Y)
 if(min_y < 1)
-	stop("Stopping due to invalid argument: Label vector (Y) must be recoded")
+  stop("Stopping due to invalid argument: Label vector (Y) must be recoded")
+  
 num_classes = max(Y)
 if(num_classes == 1)
-	stop("Stopping due to invalid argument: Maximum label value is 1, need more than one class to learn a multi-class classifier")	
+  stop("Stopping due to invalid argument: Maximum label value is 1, need more than one class to learn a multi-class classifier")
+  
 mod1 = Y %% 1
 mod1_should_be_nrow = sum(abs(mod1 == 0))
 if(mod1_should_be_nrow != nrow(Y))
-	stop("Stopping due to invalid argument: Please ensure that Y contains (positive) integral labels")
+  stop("Stopping due to invalid argument: Please ensure that Y contains (positive) integral labels")
 	
 epsilon = cmdLine_tol
 if(epsilon < 0)
-	stop("Stopping due to invalid argument: Tolerance (tol) must be non-negative")
+  stop("Stopping due to invalid argument: Tolerance (tol) must be non-negative")
 
 lambda = cmdLine_reg
 if(lambda < 0)
-	stop("Stopping due to invalid argument: Regularization constant (reg) must be non-negative")
+  stop("Stopping due to invalid argument: Regularization constant (reg) must be non-negative")
 
 max_iterations = cmdLine_maxiter
 if(max_iterations < 1)
-	stop("Stopping due to invalid argument: Maximum iterations should be a positive integer")
+  stop("Stopping due to invalid argument: Maximum iterations should be a positive integer")
 
 num_samples = nrow(X)
 num_features = ncol(X)
 
 if (intercept == 1) {
-	ones  = matrix(1, rows=num_samples, cols=1);
-	X = cbind(X, ones);
+  ones  = matrix(1, rows=num_samples, cols=1);
+  X = cbind(X, ones);
 }
 
 num_rows_in_w = num_features
 if(intercept == 1){
-	num_rows_in_w = num_rows_in_w + 1
+  num_rows_in_w = num_rows_in_w + 1
 }
 w = matrix(0, rows=num_rows_in_w, cols=num_classes)
 
 debug_mat = matrix(-1, rows=max_iterations, cols=num_classes)
+
 parfor(iter_class in 1:num_classes){		  
-	Y_local = 2 * (Y == iter_class) - 1
-	w_class = matrix(0, rows=num_features, cols=1)
-	if (intercept == 1) {
-		zero_matrix = matrix(0, rows=1, cols=1);
-		w_class = t(cbind(t(w_class), zero_matrix));
-	}
- 
-	g_old = t(X) %*% Y_local
-	s = g_old
-
-	Xw = matrix(0, rows=nrow(X), cols=1)
-	iter = 0
-	continue = 1
-	while(continue == 1)  {
-		# minimizing primal obj along direction s
- 		step_sz = 0
- 		Xd = X %*% s
- 		wd = lambda * sum(w_class * s)
-		dd = lambda * sum(s * s)
-		continue1 = 1
-		while(continue1 == 1){
- 			tmp_Xw = Xw + step_sz*Xd
- 			out = 1 - Y_local * (tmp_Xw)
- 			sv = (out > 0)
- 			out = out * sv
- 			g = wd + step_sz*dd - sum(out * Y_local * Xd)
- 			h = dd + sum(Xd * sv * Xd)
- 			step_sz = step_sz - g/h
- 			if (g*g/h < 0.0000000001){
-			continue1 = 0
-		}
-	}
- 
-		#update weights
-		w_class = w_class + step_sz*s
-		Xw = Xw + step_sz*Xd
+  Y_local = 2 * (Y == iter_class) - 1
+  w_class = matrix(0, rows=num_features, cols=1)
+  
+  if (intercept == 1) {
+    zero_matrix = matrix(0, rows=1, cols=1);
+    w_class = t(cbind(t(w_class), zero_matrix));
+  }
+  
+  g_old = t(X) %*% Y_local
+  s = g_old
+
+  Xw = matrix(0, rows=nrow(X), cols=1)
+  iter = 0
+  continue = TRUE
+  while(continue & iter < max_iterations)  {
+    # minimizing primal obj along direction s
+    step_sz = 0
+    Xd = X %*% s
+    wd = lambda * sum(w_class * s)
+    dd = lambda * sum(s * s)
+    
+    continue1 = TRUE
+    while(continue1){
+      tmp_Xw = Xw + step_sz*Xd
+      out = 1 - Y_local * (tmp_Xw)
+      sv = (out > 0)
+      out = out * sv
+      g = wd + step_sz*dd - sum(out * Y_local * Xd)
+      h = dd + sum(Xd * sv * Xd)
+      step_sz = step_sz - g/h
+      
+      continue1 = (g*g/h >= 0.0000000001)
+      
+    }
+    
+    #update weights
+    w_class = w_class + step_sz*s
+    Xw = Xw + step_sz*Xd
  
-		out = 1 - Y_local * Xw
-		sv = (out > 0)
-		out = sv * out
-		obj = 0.5 * sum(out * out) + lambda/2 * sum(w_class * w_class)
-  		g_new = t(X) %*% (out * Y_local) - lambda * w_class
+    out = 1 - Y_local * Xw
+    sv = (out > 0)
+    out = sv * out
+    obj = 0.5 * sum(out * out) + lambda/2 * sum(w_class * w_class)
+    g_new = t(X) %*% (out * Y_local) - lambda * w_class
 
-  		tmp = sum(s * g_old)
+    tmp = sum(s * g_old)
   
-  		train_acc = sum(Y_local*(X%*%w_class) >= 0)/num_samples*100
-  		print("For class " + iter_class + " iteration " + iter + " training accuracy: " + train_acc)
-  		debug_mat[iter+1,iter_class] = obj	   
+    train_acc = sum(Y_local*(X%*%w_class) >= 0)/num_samples*100
+    print("For class " + iter_class + " iteration " + iter + " training accuracy: " + train_acc)
+    debug_mat[iter+1,iter_class] = obj	   
    
-  		if((step_sz*tmp < epsilon*obj) | (iter >= max_iterations-1)){
-   			continue = 0
-  		}
- 
-  		#non-linear CG step
-  		be = sum(g_new * g_new)/sum(g_old * g_old)
-  		s = be * s + g_new
-  		g_old = g_new
-
-		if(sum(s^2) == 0){
-	    	continue = 0
-		}
-
-  		iter = iter + 1
- 	}
-
-	w[,iter_class] = w_class
-}
+    continue = (step_sz*tmp >= epsilon*obj & sum(s^2) != 0);
+  	 
+    #non-linear CG step
+    be = sum(g_new * g_new)/sum(g_old * g_old)
+    s = be * s + g_new
+    g_old = g_new
+    
+    iter = iter + 1
+  }
+
+  w[,iter_class] = w_class
+} # parfor loop
 
 extra_model_params = matrix(0, rows=2, cols=ncol(w))
 extra_model_params[1, 1] = intercept
 extra_model_params[2, 1] = dimensions
+weights = w
 w = t(cbind(t(w), t(extra_model_params)))
 write(w, $model, format=cmdLine_fmt)
+# write(extra_model_params, " ", format=cmdLine_fmt)
+# write(weights, " ", format=cmdLine_fmt)
 
 debug_str = "# Class, Iter, Obj"
 for(iter_class in 1:ncol(debug_mat)){
-	for(iter in 1:nrow(debug_mat)){
-		obj = as.scalar(debug_mat[iter, iter_class])
-		if(obj != -1) 
-			debug_str = append(debug_str, iter_class + "," + iter + "," + obj)
-	}
+  for(iter in 1:nrow(debug_mat)){
+    obj = as.scalar(debug_mat[iter, iter_class])
+    if(obj != -1) 
+      debug_str = append(debug_str, iter_class + "," + iter + "," + obj)
+  }
 }
+
 logFile = $Log
-if(logFile != " ") {
-	write(debug_str, logFile)
-}
\ No newline at end of file
+if(logFile != " ")
+  write(debug_str, logFile)
+


[06/50] [abbrv] systemml git commit: [SYSTEMML-1964] Extended codegen outer template and rework close types

Posted by re...@apache.org.
[SYSTEMML-1964] Extended codegen outer template and rework close types

This patch makes a major change to the codegen outer template OFMC
(open-fuse-merge-close) conditions in order to increase the template's
applicability, which is crucial for sparsity exploitation in algorithms
such as ALS-CG. In order to
guarantee correctness, this patch also cleans up the close types used
during candidate exploration and consolidates the redundant evaluation
of valid entry points during candidate selection.

Furthermore, this patch also improves the code generation of sparse
binary nodes and outer templates with neq 0 on the main input.
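
To make the extended coverage concrete, a small DML sketch of the kind of pattern that can now compile into a single sparsity-exploiting outer-product operator (dimensions are illustrative; the expression mirrors the new miscPattern3 test added below):

  X = rand(rows=10000, cols=10000, sparsity=0.01)
  U = rand(rows=10000, cols=10)
  V = rand(rows=10000, cols=10)
  loss = sum((X != 0) * (U %*% t(V) - X)^2)   # evaluated only over the nonzeros of X
  print(loss)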


Project: http://git-wip-us.apache.org/repos/asf/systemml/repo
Commit: http://git-wip-us.apache.org/repos/asf/systemml/commit/2ca2d8aa
Tree: http://git-wip-us.apache.org/repos/asf/systemml/tree/2ca2d8aa
Diff: http://git-wip-us.apache.org/repos/asf/systemml/diff/2ca2d8aa

Branch: refs/heads/master
Commit: 2ca2d8aa73c4c0463a52d7f299320fc9b3865aea
Parents: 586f822
Author: Matthias Boehm <mb...@gmail.com>
Authored: Mon Oct 16 13:44:40 2017 -0700
Committer: Matthias Boehm <mb...@gmail.com>
Committed: Mon Oct 16 15:38:58 2017 -0700

----------------------------------------------------------------------
 .../org/apache/sysml/hops/OptimizerUtils.java   |  5 ++-
 .../sysml/hops/codegen/SpoofCompiler.java       | 41 ++++++++++++++++++-
 .../sysml/hops/codegen/cplan/CNodeBinary.java   | 12 +++---
 .../sysml/hops/codegen/opt/PlanAnalyzer.java    |  3 +-
 .../sysml/hops/codegen/opt/PlanSelection.java   | 19 +--------
 .../codegen/opt/PlanSelectionFuseCostBased.java |  2 +-
 .../opt/PlanSelectionFuseCostBasedV2.java       |  2 +-
 .../opt/PlanSelectionFuseNoRedundancy.java      |  2 +-
 .../hops/codegen/template/CPlanMemoTable.java   | 42 +++++++++++++-------
 .../hops/codegen/template/TemplateBase.java     | 31 ++++++++++-----
 .../hops/codegen/template/TemplateCell.java     | 11 +++--
 .../hops/codegen/template/TemplateMultiAgg.java |  6 +--
 .../codegen/template/TemplateOuterProduct.java  | 20 ++++++----
 .../hops/codegen/template/TemplateRow.java      |  8 ++--
 .../hops/codegen/template/TemplateUtils.java    | 23 ++++++-----
 .../sysml/hops/rewrite/HopRewriteUtils.java     | 12 +++++-
 .../functions/codegen/MiscPatternTest.java      | 41 +++++++++++++++++--
 .../scripts/functions/codegen/miscPattern3.R    | 34 ++++++++++++++++
 .../scripts/functions/codegen/miscPattern3.dml  | 34 ++++++++++++++++
 .../scripts/functions/codegen/miscPattern4.R    | 35 ++++++++++++++++
 .../scripts/functions/codegen/miscPattern4.dml  | 35 ++++++++++++++++
 21 files changed, 328 insertions(+), 90 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/systemml/blob/2ca2d8aa/src/main/java/org/apache/sysml/hops/OptimizerUtils.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/hops/OptimizerUtils.java b/src/main/java/org/apache/sysml/hops/OptimizerUtils.java
index e44e439..5d831e5 100644
--- a/src/main/java/org/apache/sysml/hops/OptimizerUtils.java
+++ b/src/main/java/org/apache/sysml/hops/OptimizerUtils.java
@@ -1017,7 +1017,10 @@ public class OptimizerUtils
 				||(op==OpOp2.LESS     && val==0)
 				||(op==OpOp2.NOTEQUAL && val==0)
 				||(op==OpOp2.EQUAL    && val!=0)
-				||(op==OpOp2.MINUS    && val==0));
+				||(op==OpOp2.MINUS    && val==0)
+				||(op==OpOp2.PLUS     && val==0)
+				||(op==OpOp2.MAX      && val<=0)
+				||(op==OpOp2.MIN      && val>=0));
 	}
 	
 	public static double getBinaryOpSparsityConditionalSparseSafe( double sp1, OpOp2 op, LiteralOp lit ) {
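
For intuition, a small DML sketch (illustrative, not from the patch) of why the newly added cases are sparse-safe: combined with such a literal, each operation maps a zero input cell to zero, so the result can be computed over the nonzeros of X only.

  X = rand(rows=1000, cols=1000, sparsity=0.05)
  A = X + 0      # plus with literal 0 preserves zeros
  B = max(X, 0)  # max with a literal <= 0 preserves zeros
  C = min(X, 0)  # min with a literal >= 0 preserves zeros
  print(sum(A) + " " + sum(B) + " " + sum(C))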

http://git-wip-us.apache.org/repos/asf/systemml/blob/2ca2d8aa/src/main/java/org/apache/sysml/hops/codegen/SpoofCompiler.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/hops/codegen/SpoofCompiler.java b/src/main/java/org/apache/sysml/hops/codegen/SpoofCompiler.java
index 0e5e194..5ff90fb 100644
--- a/src/main/java/org/apache/sysml/hops/codegen/SpoofCompiler.java
+++ b/src/main/java/org/apache/sysml/hops/codegen/SpoofCompiler.java
@@ -39,6 +39,7 @@ import org.apache.sysml.api.DMLScript.RUNTIME_PLATFORM;
 import org.apache.sysml.conf.ConfigurationManager;
 import org.apache.sysml.conf.DMLConfig;
 import org.apache.sysml.hops.codegen.cplan.CNode;
+import org.apache.sysml.hops.codegen.cplan.CNodeBinary.BinType;
 import org.apache.sysml.hops.codegen.cplan.CNodeCell;
 import org.apache.sysml.hops.codegen.cplan.CNodeData;
 import org.apache.sysml.hops.codegen.cplan.CNodeMultiAgg;
@@ -52,6 +53,7 @@ import org.apache.sysml.hops.codegen.opt.PlanSelectionFuseCostBased;
 import org.apache.sysml.hops.codegen.opt.PlanSelectionFuseCostBasedV2;
 import org.apache.sysml.hops.codegen.opt.PlanSelectionFuseNoRedundancy;
 import org.apache.sysml.hops.codegen.cplan.CNodeTpl;
+import org.apache.sysml.hops.codegen.cplan.CNodeUnary.UnaryType;
 import org.apache.sysml.hops.codegen.template.TemplateBase;
 import org.apache.sysml.hops.codegen.template.TemplateBase.CloseType;
 import org.apache.sysml.hops.codegen.template.TemplateBase.TemplateType;
@@ -66,6 +68,7 @@ import org.apache.sysml.hops.AggUnaryOp;
 import org.apache.sysml.hops.Hop;
 import org.apache.sysml.hops.Hop.OpOp1;
 import org.apache.sysml.hops.HopsException;
+import org.apache.sysml.hops.LiteralOp;
 import org.apache.sysml.hops.OptimizerUtils;
 import org.apache.sysml.hops.rewrite.HopRewriteUtils;
 import org.apache.sysml.hops.rewrite.ProgramRewriteStatus;
@@ -535,8 +538,7 @@ public class SpoofCompiler
 				CloseType ccode = tpl.close(hop);
 				if( ccode == CloseType.CLOSED_INVALID )
 					iter.remove();
-				else if( ccode == CloseType.CLOSED_VALID )
-					me.closed = true;
+				me.ctype = ccode;
 			}
 		}
 		
@@ -721,6 +723,10 @@ public class SpoofCompiler
 			else
 				rFindAndRemoveLookup(tpl.getOutput(), in1, !(tpl instanceof CNodeRow));
 			
+			//remove unnecessary neq 0 on main input of outer template
+			if( tpl instanceof CNodeOuterProduct )
+				rFindAndRemoveBinaryMS(tpl.getOutput(), in1, BinType.NOTEQUAL, "0", "1");
+			
 			//remove invalid row templates (e.g., unsatisfied blocksize constraint)
 			if( tpl instanceof CNodeRow ) {
 				//check for invalid row cplan over column vector
@@ -800,6 +806,37 @@ public class SpoofCompiler
 		}
 	}
 	
+	@SuppressWarnings("unused")
+	private static void rFindAndRemoveUnary(CNode node, CNodeData mainInput, UnaryType type) {
+		for( int i=0; i<node.getInput().size(); i++ ) {
+			CNode tmp = node.getInput().get(i);
+			if( TemplateUtils.isUnary(tmp, type) && tmp.getInput().get(0) instanceof CNodeData
+				&& ((CNodeData)tmp.getInput().get(0)).getHopID()==mainInput.getHopID() )
+			{
+				node.getInput().set(i, tmp.getInput().get(0));
+			}
+			else
+				rFindAndRemoveUnary(tmp, mainInput, type);
+		}
+	}
+	
+	private static void rFindAndRemoveBinaryMS(CNode node, CNodeData mainInput, BinType type, String lit, String replace) {
+		for( int i=0; i<node.getInput().size(); i++ ) {
+			CNode tmp = node.getInput().get(i);
+			if( TemplateUtils.isBinary(tmp, type) && tmp.getInput().get(1).isLiteral()
+				&& tmp.getInput().get(1).getVarname().equals(lit)
+				&& tmp.getInput().get(0) instanceof CNodeData
+				&& ((CNodeData)tmp.getInput().get(0)).getHopID()==mainInput.getHopID() )
+			{
+				CNodeData cnode = new CNodeData(new LiteralOp(replace));
+				cnode.setLiteral(true);
+				node.getInput().set(i, cnode);
+			}
+			else
+				rFindAndRemoveBinaryMS(tmp, mainInput, type, lit, replace);
+		}
+	}
+	
 	private static boolean rHasLookupRC1(CNode node, CNodeData mainInput, boolean includeRC1) {
 		boolean ret = false;
 		for( int i=0; i<node.getInput().size() && !ret; i++ ) {

http://git-wip-us.apache.org/repos/asf/systemml/blob/2ca2d8aa/src/main/java/org/apache/sysml/hops/codegen/cplan/CNodeBinary.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/hops/codegen/cplan/CNodeBinary.java b/src/main/java/org/apache/sysml/hops/codegen/cplan/CNodeBinary.java
index cac8ab8..d188afd 100644
--- a/src/main/java/org/apache/sysml/hops/codegen/cplan/CNodeBinary.java
+++ b/src/main/java/org/apache/sysml/hops/codegen/cplan/CNodeBinary.java
@@ -70,13 +70,13 @@ public class CNodeBinary extends CNode
 		
 		public String getTemplate(boolean sparseLhs, boolean sparseRhs, boolean scalarVector, boolean scalarInput) {
 			switch (this) {
-				case DOT_PRODUCT:   
+				case DOT_PRODUCT:
 					return sparseLhs ? "    double %TMP% = LibSpoofPrimitives.dotProduct(%IN1v%, %IN2%, %IN1i%, %POS1%, %POS2%, alen);\n" :
 									"    double %TMP% = LibSpoofPrimitives.dotProduct(%IN1%, %IN2%, %POS1%, %POS2%, %LEN%);\n";
-				case VECT_MATRIXMULT:   
+				case VECT_MATRIXMULT:
 					return sparseLhs ? "    double[] %TMP% = LibSpoofPrimitives.vectMatrixMult(%IN1v%, %IN2%, %IN1i%, %POS1%, %POS2%, alen, len);\n" :
 									"    double[] %TMP% = LibSpoofPrimitives.vectMatrixMult(%IN1%, %IN2%, %POS1%, %POS2%, %LEN%);\n";
-				case VECT_OUTERMULT_ADD:   
+				case VECT_OUTERMULT_ADD:
 					return sparseLhs ? "    LibSpoofPrimitives.vectOuterMultAdd(%IN1v%, %IN2%, %OUT%, %IN1i%, %POS1%, %POS2%, %POSOUT%, alen, %LEN1%, %LEN2%);\n" :
 									"    LibSpoofPrimitives.vectOuterMultAdd(%IN1%, %IN2%, %OUT%, %POS1%, %POS2%, %POSOUT%, %LEN1%, %LEN2%);\n";
 				
@@ -110,7 +110,7 @@ public class CNodeBinary extends CNode
 				case VECT_PLUS_SCALAR:
 				case VECT_POW_SCALAR:
 				case VECT_MIN_SCALAR:
-				case VECT_MAX_SCALAR:	
+				case VECT_MAX_SCALAR:
 				case VECT_EQUAL_SCALAR:
 				case VECT_NOTEQUAL_SCALAR:
 				case VECT_LESS_SCALAR:
@@ -119,7 +119,7 @@ public class CNodeBinary extends CNode
 				case VECT_GREATEREQUAL_SCALAR: {
 					String vectName = getVectorPrimitiveName();
 					if( scalarVector )
-						return sparseLhs ? "    double[] %TMP% = LibSpoofPrimitives.vect"+vectName+"Write(%IN1%, %IN2v%, %IN2i%, %POS2%, alen, %LEN%);\n" : 
+						return sparseRhs ? "    double[] %TMP% = LibSpoofPrimitives.vect"+vectName+"Write(%IN1%, %IN2v%, %IN2i%, %POS2%, alen, %LEN%);\n" : 
 										"    double[] %TMP% = LibSpoofPrimitives.vect"+vectName+"Write(%IN1%, %IN2%, %POS2%, %LEN%);\n";
 					else	
 						return sparseLhs ? "    double[] %TMP% = LibSpoofPrimitives.vect"+vectName+"Write(%IN1v%, %IN2%, %IN1i%, %POS1%, alen, %LEN%);\n" : 
@@ -274,7 +274,7 @@ public class CNodeBinary extends CNode
 		boolean lsparseLhs = sparse && _inputs.get(0) instanceof CNodeData 
 			&& _inputs.get(0).getVarname().startsWith("a");
 		boolean lsparseRhs = sparse && _inputs.get(1) instanceof CNodeData 
-			&& _inputs.get(1).getVarname().startsWith("a");	
+			&& _inputs.get(1).getVarname().startsWith("a");
 		boolean scalarInput = _inputs.get(0).getDataType().isScalar();
 		boolean scalarVector = (_inputs.get(0).getDataType().isScalar()
 			&& _inputs.get(1).getDataType().isMatrix());

http://git-wip-us.apache.org/repos/asf/systemml/blob/2ca2d8aa/src/main/java/org/apache/sysml/hops/codegen/opt/PlanAnalyzer.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/hops/codegen/opt/PlanAnalyzer.java b/src/main/java/org/apache/sysml/hops/codegen/opt/PlanAnalyzer.java
index db1ee4d..9910814 100644
--- a/src/main/java/org/apache/sysml/hops/codegen/opt/PlanAnalyzer.java
+++ b/src/main/java/org/apache/sysml/hops/codegen/opt/PlanAnalyzer.java
@@ -266,8 +266,7 @@ public class PlanAnalyzer
 			long[] refs = memo.getAllRefs(hopID);
 			for( int i=0; i<3; i++ ) {
 				if( refs[i] < 0 ) continue;
-				List<TemplateType> tmp = memo.getDistinctTemplateTypes(hopID, i);
-				
+				List<TemplateType> tmp = memo.getDistinctTemplateTypes(hopID, i, true);
 				if( memo.containsNotIn(refs[i], tmp, true, true) )
 					ret.add(new InterestingPoint(DecisionType.TEMPLATE_CHANGE, hopID, refs[i]));
 			}

http://git-wip-us.apache.org/repos/asf/systemml/blob/2ca2d8aa/src/main/java/org/apache/sysml/hops/codegen/opt/PlanSelection.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/hops/codegen/opt/PlanSelection.java b/src/main/java/org/apache/sysml/hops/codegen/opt/PlanSelection.java
index 369bf75..5242211 100644
--- a/src/main/java/org/apache/sysml/hops/codegen/opt/PlanSelection.java
+++ b/src/main/java/org/apache/sysml/hops/codegen/opt/PlanSelection.java
@@ -29,7 +29,6 @@ import org.apache.sysml.hops.Hop;
 import org.apache.sysml.hops.codegen.template.CPlanMemoTable;
 import org.apache.sysml.hops.codegen.template.CPlanMemoTable.MemoTableEntry;
 import org.apache.sysml.hops.codegen.template.TemplateBase.TemplateType;
-import org.apache.sysml.hops.rewrite.HopRewriteUtils;
 import org.apache.sysml.runtime.util.UtilFunctions;
 
 public abstract class PlanSelection 
@@ -51,22 +50,6 @@ public abstract class PlanSelection
 	 */
 	public abstract void selectPlans(CPlanMemoTable memo, ArrayList<Hop> roots);
 	
-	/**
-	 * Determines if the given partial fusion plan is a valid entry point
-	 * of a fused operator.
-	 * 
-	 * @param me memo table entry
-	 * @param hop current hop
-	 * @return true if entry is valid as top-level plan
-	 */
-	public static boolean isValid(MemoTableEntry me, Hop hop) {
-		return (me.type == TemplateType.CELL)
-			|| (me.type == TemplateType.MAGG)
-			|| (me.type == TemplateType.ROW && !HopRewriteUtils.isTransposeOperation(hop))
-			|| (me.type == TemplateType.OUTER 
-				&& (me.closed || HopRewriteUtils.isBinaryMatrixMatrixOperation(hop)));
-	}
-	
 	protected void addBestPlan(long hopID, MemoTableEntry me) {
 		if( me == null ) return;
 		if( !_bestPlans.containsKey(hopID) )
@@ -108,7 +91,7 @@ public abstract class PlanSelection
 		if( memo.contains(current.getHopID()) ) {
 			if( currentType == null ) {
 				best = memo.get(current.getHopID()).stream()
-					.filter(p -> isValid(p, current))
+					.filter(p -> p.isValid())
 					.min(BASE_COMPARE).orElse(null);
 			}
 			else {

http://git-wip-us.apache.org/repos/asf/systemml/blob/2ca2d8aa/src/main/java/org/apache/sysml/hops/codegen/opt/PlanSelectionFuseCostBased.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/hops/codegen/opt/PlanSelectionFuseCostBased.java b/src/main/java/org/apache/sysml/hops/codegen/opt/PlanSelectionFuseCostBased.java
index d01ffe2..521ef61 100644
--- a/src/main/java/org/apache/sysml/hops/codegen/opt/PlanSelectionFuseCostBased.java
+++ b/src/main/java/org/apache/sysml/hops/codegen/opt/PlanSelectionFuseCostBased.java
@@ -559,7 +559,7 @@ public class PlanSelectionFuseCostBased extends PlanSelection
 		if( memo.contains(current.getHopID()) ) {
 			if( currentType == null ) {
 				best = memo.get(current.getHopID()).stream()
-					.filter(p -> isValid(p, current))
+					.filter(p -> p.isValid())
 					.filter(p -> hasNoRefToMaterialization(p, M, plan))
 					.min(new BasicPlanComparator()).orElse(null);
 				opened = true;

http://git-wip-us.apache.org/repos/asf/systemml/blob/2ca2d8aa/src/main/java/org/apache/sysml/hops/codegen/opt/PlanSelectionFuseCostBasedV2.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/hops/codegen/opt/PlanSelectionFuseCostBasedV2.java b/src/main/java/org/apache/sysml/hops/codegen/opt/PlanSelectionFuseCostBasedV2.java
index 4b214d0..d2ed3ac 100644
--- a/src/main/java/org/apache/sysml/hops/codegen/opt/PlanSelectionFuseCostBasedV2.java
+++ b/src/main/java/org/apache/sysml/hops/codegen/opt/PlanSelectionFuseCostBasedV2.java
@@ -773,7 +773,7 @@ public class PlanSelectionFuseCostBasedV2 extends PlanSelection
 			//use streams, lambda expressions, etc to avoid unnecessary overhead
 			if( currentType == null ) {
 				for( MemoTableEntry me : memo.get(currentHopId) )
-					best = isValid(me, current) 
+					best = me.isValid() 
 						&& hasNoRefToMatPoint(currentHopId, me, matPoints, plan)
 						&& BasicPlanComparator.icompare(me, best)<0 ? me : best;
 				opened = true;

http://git-wip-us.apache.org/repos/asf/systemml/blob/2ca2d8aa/src/main/java/org/apache/sysml/hops/codegen/opt/PlanSelectionFuseNoRedundancy.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/hops/codegen/opt/PlanSelectionFuseNoRedundancy.java b/src/main/java/org/apache/sysml/hops/codegen/opt/PlanSelectionFuseNoRedundancy.java
index 2fc90d7..fe3789f 100644
--- a/src/main/java/org/apache/sysml/hops/codegen/opt/PlanSelectionFuseNoRedundancy.java
+++ b/src/main/java/org/apache/sysml/hops/codegen/opt/PlanSelectionFuseNoRedundancy.java
@@ -85,7 +85,7 @@ public class PlanSelectionFuseNoRedundancy extends PlanSelection
 		if( memo.contains(current.getHopID()) ) {
 			if( currentType == null ) {
 				best = memo.get(current.getHopID()).stream()
-					.filter(p -> isValid(p, current))
+					.filter(p -> p.isValid())
 					.min(new BasicPlanComparator()).orElse(null);
 			}
 			else {

http://git-wip-us.apache.org/repos/asf/systemml/blob/2ca2d8aa/src/main/java/org/apache/sysml/hops/codegen/template/CPlanMemoTable.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/hops/codegen/template/CPlanMemoTable.java b/src/main/java/org/apache/sysml/hops/codegen/template/CPlanMemoTable.java
index 30672f3..882cde2 100644
--- a/src/main/java/org/apache/sysml/hops/codegen/template/CPlanMemoTable.java
+++ b/src/main/java/org/apache/sysml/hops/codegen/template/CPlanMemoTable.java
@@ -39,6 +39,7 @@ import org.apache.sysml.hops.IndexingOp;
 import org.apache.sysml.hops.codegen.SpoofCompiler;
 import org.apache.sysml.hops.codegen.opt.InterestingPoint;
 import org.apache.sysml.hops.codegen.opt.PlanSelection;
+import org.apache.sysml.hops.codegen.template.TemplateBase.CloseType;
 import org.apache.sysml.hops.codegen.template.TemplateBase.TemplateType;
 import org.apache.sysml.runtime.util.UtilFunctions;
 
@@ -91,7 +92,7 @@ public class CPlanMemoTable
 			return contains(hopID, type[0]);
 		Set<TemplateType> probe = UtilFunctions.asSet(type);
 		return contains(hopID) && get(hopID).stream()
-			.anyMatch(p -> (!checkClose||!p.closed) && probe.contains(p.type));
+			.anyMatch(p -> (!checkClose||!p.isClosed()) && probe.contains(p.type));
 	}
 	
 	public boolean containsNotIn(long hopID, Collection<TemplateType> types, 
@@ -99,7 +100,7 @@ public class CPlanMemoTable
 		return contains(hopID) && get(hopID).stream()
 			.anyMatch(p -> (!checkChildRefs || p.hasPlanRef()) 
 				&& (!excludeCell || p.type!=TemplateType.CELL)
-				&& !types.contains(p.type));
+				&& p.isValid() && !types.contains(p.type));
 	}
 	
 	public int countEntries(long hopID) {
@@ -176,7 +177,7 @@ public class CPlanMemoTable
 		setDistinct(hopID, _plans.get(hopID));
 		
 		//prune closed templates without group references
-		_plans.get(hopID).removeIf(p -> p.closed && !p.hasPlanRef());
+		_plans.get(hopID).removeIf(p -> p.isClosed() && !p.hasPlanRef());
 		
 		//prune dominated plans (e.g., opened plan subsumed by fused plan 
 		//if single consumer of input; however this only applies to fusion
@@ -268,16 +269,21 @@ public class CPlanMemoTable
 			return Collections.emptyList();
 		//return distinct entries wrt type and closed attributes
 		return _plans.get(hopID).stream()
-			.map(p -> TemplateUtils.createTemplate(p.type, p.closed))
+			.map(p -> TemplateUtils.createTemplate(p.type, p.ctype))
 			.distinct().collect(Collectors.toList());
 	}
 	
 	public List<TemplateType> getDistinctTemplateTypes(long hopID, int refAt) {
+		return getDistinctTemplateTypes(hopID, refAt, false);
+	}
+	
+	public List<TemplateType> getDistinctTemplateTypes(long hopID, int refAt, boolean exclInvalOuter) {
 		if(!contains(hopID))
 			return Collections.emptyList();
 		//return distinct template types with reference at given position
 		return _plans.get(hopID).stream()
-			.filter(p -> p.isPlanRef(refAt))
+			.filter(p -> p.isPlanRef(refAt) && (!exclInvalOuter 
+				|| p.type!=TemplateType.OUTER || p.isValid()) )
 			.map(p -> p.type) //extract type
 			.distinct().collect(Collectors.toList());
 	}
@@ -289,7 +295,7 @@ public class CPlanMemoTable
 
 		//single plan per type, get plan w/ best rank in preferred order
 		//but ensure that the plans valid as a top-level plan
-		return tmp.stream().filter(p -> PlanSelection.isValid(p, _hopRefs.get(hopID)))
+		return tmp.stream().filter(p -> p.isValid())
 			.min(Comparator.comparing(p -> p.type.getRank())).orElse(null);
 	}
 	
@@ -319,7 +325,7 @@ public class CPlanMemoTable
 		for( MemoTableEntry me : get(hopID) )
 			for( int i=0; i<3; i++ )
 				if( me.isPlanRef(i) )
-					refs[i] |= me.input(i);
+					refs[i] = me.input(i);
 		return refs;
 	}
 	
@@ -357,17 +363,23 @@ public class CPlanMemoTable
 		public final long input2;
 		public final long input3;
 		public final int size;
-		public boolean closed = false;
+		public CloseType ctype;
 		public MemoTableEntry(TemplateType t, long in1, long in2, long in3, int inlen) {
-			this(t, in1, in2, in3, inlen, false);
+			this(t, in1, in2, in3, inlen, CloseType.OPEN_VALID);
 		}
-		public MemoTableEntry(TemplateType t, long in1, long in2, long in3, int inlen, boolean close) {
+		public MemoTableEntry(TemplateType t, long in1, long in2, long in3, int inlen, CloseType close) {
 			type = t;
 			input1 = in1;
 			input2 = in2;
 			input3 = in3;
 			size = inlen;
-			closed = close;
+			ctype = close;
+		}
+		public boolean isClosed() {
+			return ctype.isClosed();
+		}
+		public boolean isValid() {
+			return ctype.isValid();
 		}
 		public boolean isPlanRef(int index) {
 			return (index==0 && input1 >=0)
@@ -404,7 +416,7 @@ public class CPlanMemoTable
 			h = UtilFunctions.intHashCode(h, Long.hashCode(input2));
 			h = UtilFunctions.intHashCode(h, Long.hashCode(input3));
 			h = UtilFunctions.intHashCode(h, size);
-			h = UtilFunctions.intHashCode(h, Boolean.hashCode(closed));
+			h = UtilFunctions.intHashCode(h, ctype.ordinal());
 			return h;
 		}
 		@Override
@@ -414,7 +426,7 @@ public class CPlanMemoTable
 			MemoTableEntry that = (MemoTableEntry)obj;
 			return type == that.type && input1 == that.input1
 				&& input2 == that.input2 && input3 == that.input3
-				&& size == that.size && closed == that.closed;
+				&& size == that.size && ctype == that.ctype;
 		}
 		@Override
 		public String toString() {
@@ -426,6 +438,8 @@ public class CPlanMemoTable
 					sb.append(",");
 				sb.append(input(i));
 			}
+			if( !isValid() )
+				sb.append(", x");
 			sb.append(")");
 			return sb.toString();
 		}
@@ -439,7 +453,7 @@ public class CPlanMemoTable
 			int pos = (c != null) ? hop.getInput().indexOf(c) : -1;
 			int size = (hop instanceof IndexingOp) ? 1 : hop.getInput().size();
 			plans.add(new MemoTableEntry(tpl.getType(), (pos==0)?c.getHopID():-1,
-				(pos==1)?c.getHopID():-1, (pos==2)?c.getHopID():-1, size, tpl.isClosed()));
+				(pos==1)?c.getHopID():-1, (pos==2)?c.getHopID():-1, size, tpl.getCType()));
 		}
 		
 		public void crossProduct(int pos, Long... refs) {

http://git-wip-us.apache.org/repos/asf/systemml/blob/2ca2d8aa/src/main/java/org/apache/sysml/hops/codegen/template/TemplateBase.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/hops/codegen/template/TemplateBase.java b/src/main/java/org/apache/sysml/hops/codegen/template/TemplateBase.java
index b42eecf..9d4ff9b 100644
--- a/src/main/java/org/apache/sysml/hops/codegen/template/TemplateBase.java
+++ b/src/main/java/org/apache/sysml/hops/codegen/template/TemplateBase.java
@@ -42,35 +42,46 @@ public abstract class TemplateBase
 	}
 	
 	public enum CloseType {
-		CLOSED_VALID,
-		CLOSED_INVALID,
-		OPEN,
+		CLOSED_VALID,   //no further fusion, valid entry point
+		CLOSED_INVALID, //no further fusion, invalid entry point (to be discarded)
+		OPEN_VALID,     //further fusion allowed, valid entry point
+		OPEN_INVALID;   //further fusion allowed, but invalid entry point
+		public boolean isClosed() {
+			return (this == CLOSED_VALID || this == CloseType.CLOSED_INVALID);
+		}
+		public boolean isValid() {
+			return (this == CLOSED_VALID || this == OPEN_VALID);
+		}
 	}
 
 	protected final TemplateType _type;
-	protected final boolean _closed;
+	protected final CloseType _ctype;
 	
 	protected TemplateBase(TemplateType type) {
-		this(type, false);
+		this(type, CloseType.OPEN_VALID);
 	}
 	
-	protected TemplateBase(TemplateType type, boolean closed) {
+	protected TemplateBase(TemplateType type, CloseType ctype) {
 		_type = type;
-		_closed = closed;
+		_ctype = ctype;
 	}
 	
 	public TemplateType getType() {
 		return _type;
 	}
 	
+	public CloseType getCType() {
+		return _ctype;
+	}
+	
 	public boolean isClosed() {
-		return _closed;
+		return _ctype.isClosed();
 	}
 	
 	@Override
 	public int hashCode() {
 		return UtilFunctions.intHashCode(
-			_type.ordinal(), Boolean.hashCode(_closed));
+			_type.ordinal(), _ctype.ordinal());
 	}
 	
 	@Override
@@ -79,7 +90,7 @@ public abstract class TemplateBase
 			return false;
 		TemplateBase that = (TemplateBase)obj;
 		return _type == that._type 
-			&& _closed == that._closed;
+			&& _ctype == that._ctype;
 	}
 	
 	/////////////////////////////////////////////

http://git-wip-us.apache.org/repos/asf/systemml/blob/2ca2d8aa/src/main/java/org/apache/sysml/hops/codegen/template/TemplateCell.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/hops/codegen/template/TemplateCell.java b/src/main/java/org/apache/sysml/hops/codegen/template/TemplateCell.java
index 2b29ce2..c9b0734 100644
--- a/src/main/java/org/apache/sysml/hops/codegen/template/TemplateCell.java
+++ b/src/main/java/org/apache/sysml/hops/codegen/template/TemplateCell.java
@@ -63,15 +63,14 @@ public class TemplateCell extends TemplateBase
 		super(TemplateType.CELL);
 	}
 	
-	public TemplateCell(boolean closed) {
-		super(TemplateType.CELL, closed);
+	public TemplateCell(CloseType ctype) {
+		super(TemplateType.CELL, ctype);
 	}
 	
-	public TemplateCell(TemplateType type, boolean closed) {
-		super(type, closed);
+	public TemplateCell(TemplateType type, CloseType ctype) {
+		super(type, ctype);
 	}
 	
-
 	@Override
 	public boolean open(Hop hop) {
 		return hop.dimsKnown() && isValidOperation(hop)
@@ -108,7 +107,7 @@ public class TemplateCell extends TemplateBase
 		else if( hop instanceof AggUnaryOp || hop instanceof AggBinaryOp )
 			return CloseType.CLOSED_INVALID;
 		else
-			return CloseType.OPEN;
+			return CloseType.OPEN_VALID;
 	}
 
 	@Override

http://git-wip-us.apache.org/repos/asf/systemml/blob/2ca2d8aa/src/main/java/org/apache/sysml/hops/codegen/template/TemplateMultiAgg.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/hops/codegen/template/TemplateMultiAgg.java b/src/main/java/org/apache/sysml/hops/codegen/template/TemplateMultiAgg.java
index ebd6078..740604c 100644
--- a/src/main/java/org/apache/sysml/hops/codegen/template/TemplateMultiAgg.java
+++ b/src/main/java/org/apache/sysml/hops/codegen/template/TemplateMultiAgg.java
@@ -42,11 +42,11 @@ import org.apache.sysml.runtime.matrix.data.Pair;
 public class TemplateMultiAgg extends TemplateCell 
 {	
 	public TemplateMultiAgg() {
-		super(TemplateType.MAGG, false);
+		super(TemplateType.MAGG, CloseType.OPEN_VALID);
 	}
 	
-	public TemplateMultiAgg(boolean closed) {
-		super(TemplateType.MAGG, closed);
+	public TemplateMultiAgg(CloseType ctype) {
+		super(TemplateType.MAGG, ctype);
 	}
 
 	@Override

http://git-wip-us.apache.org/repos/asf/systemml/blob/2ca2d8aa/src/main/java/org/apache/sysml/hops/codegen/template/TemplateOuterProduct.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/hops/codegen/template/TemplateOuterProduct.java b/src/main/java/org/apache/sysml/hops/codegen/template/TemplateOuterProduct.java
index 904fbb3..f3880b1 100644
--- a/src/main/java/org/apache/sysml/hops/codegen/template/TemplateOuterProduct.java
+++ b/src/main/java/org/apache/sysml/hops/codegen/template/TemplateOuterProduct.java
@@ -51,8 +51,8 @@ public class TemplateOuterProduct extends TemplateBase {
 		super(TemplateType.OUTER);
 	}
 	
-	public TemplateOuterProduct(boolean closed) {
-		super(TemplateType.OUTER, closed);
+	public TemplateOuterProduct(CloseType ctype) {
+		super(TemplateType.OUTER, ctype);
 	}
 
 	@Override
@@ -68,7 +68,7 @@ public class TemplateOuterProduct extends TemplateBase {
 			&&((hop instanceof UnaryOp && TemplateUtils.isOperationSupported(hop))  
 			|| (hop instanceof BinaryOp && TemplateUtils.isOperationSupported(hop)
 				&& (TemplateUtils.isBinaryMatrixColVector(hop) || HopRewriteUtils.isBinaryMatrixScalarOperation(hop)
-				|| (HopRewriteUtils.isBinaryMatrixMatrixOperation(hop) && HopRewriteUtils.isBinary(hop, OpOp2.MULT, OpOp2.DIV)) )) 
+				|| (HopRewriteUtils.isBinaryMatrixMatrixOperation(hop)) )) 
 			|| (HopRewriteUtils.isTransposeOperation(hop) && input instanceof AggBinaryOp
 				&& !HopRewriteUtils.isOuterProductLikeMM(input)) 
 			|| (hop instanceof AggBinaryOp && !HopRewriteUtils.isOuterProductLikeMM(hop)
@@ -89,18 +89,24 @@ public class TemplateOuterProduct extends TemplateBase {
 	@Override
 	public CloseType close(Hop hop) {
 		// close on second matrix multiply (after open) or unary aggregate
-		if( hop instanceof AggUnaryOp && HopRewriteUtils.isOuterProductLikeMM(hop.getInput().get(0))
+		if( (hop instanceof AggUnaryOp && (HopRewriteUtils.isOuterProductLikeMM(hop.getInput().get(0))
+				|| !HopRewriteUtils.isBinarySparseSafe(hop.getInput().get(0))))
 			|| (hop instanceof AggBinaryOp && (HopRewriteUtils.isOuterProductLikeMM(hop.getInput().get(0))
-				|| HopRewriteUtils.isOuterProductLikeMM(hop.getInput().get(1)))) )
-			return CloseType.CLOSED_INVALID;
+				|| HopRewriteUtils.isOuterProductLikeMM(hop.getInput().get(1))
+				|| (!HopRewriteUtils.isOuterProductLikeMM(hop)
+					&& !HopRewriteUtils.isBinarySparseSafe(HopRewriteUtils.getLargestInput(hop))))) )
+ 			return CloseType.CLOSED_INVALID;
 		else if( (hop instanceof AggUnaryOp) 
 			|| (hop instanceof AggBinaryOp && !HopRewriteUtils.isOuterProductLikeMM(hop) 
 					&& !HopRewriteUtils.isTransposeOperation(hop.getParent().get(0)))
 			|| (HopRewriteUtils.isTransposeOperation(hop) && hop.getInput().get(0) instanceof AggBinaryOp
 					&& !HopRewriteUtils.isOuterProductLikeMM(hop.getInput().get(0)) ))
 			return CloseType.CLOSED_VALID;
+		else if( HopRewriteUtils.isBinaryMatrixMatrixOperation(hop)
+			&& HopRewriteUtils.isBinary(hop, OpOp2.MULT, OpOp2.DIV) )
+			return CloseType.OPEN_VALID;
 		else
-			return CloseType.OPEN;
+			return CloseType.OPEN_INVALID;
 	}
 
 	@Override

http://git-wip-us.apache.org/repos/asf/systemml/blob/2ca2d8aa/src/main/java/org/apache/sysml/hops/codegen/template/TemplateRow.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/hops/codegen/template/TemplateRow.java b/src/main/java/org/apache/sysml/hops/codegen/template/TemplateRow.java
index 5f14d6b..64014da 100644
--- a/src/main/java/org/apache/sysml/hops/codegen/template/TemplateRow.java
+++ b/src/main/java/org/apache/sysml/hops/codegen/template/TemplateRow.java
@@ -69,8 +69,8 @@ public class TemplateRow extends TemplateBase
 		super(TemplateType.ROW);
 	}
 	
-	public TemplateRow(boolean closed) {
-		super(TemplateType.ROW, closed);
+	public TemplateRow(CloseType ctype) {
+		super(TemplateType.ROW, ctype);
 	}
 	
 	@Override
@@ -136,8 +136,10 @@ public class TemplateRow extends TemplateBase
 		if(    (hop instanceof AggUnaryOp && ((AggUnaryOp)hop).getDirection()!=Direction.Row)
 			|| (hop instanceof AggBinaryOp && HopRewriteUtils.isTransposeOperation(hop.getInput().get(0))))
 			return CloseType.CLOSED_VALID;
+		else if( HopRewriteUtils.isTransposeOperation(hop) )
+			return CloseType.OPEN_INVALID;
 		else
-			return CloseType.OPEN;
+			return CloseType.OPEN_VALID;
 	}
 	
 	private static boolean isValidBinaryOperation(Hop hop) {

http://git-wip-us.apache.org/repos/asf/systemml/blob/2ca2d8aa/src/main/java/org/apache/sysml/hops/codegen/template/TemplateUtils.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/hops/codegen/template/TemplateUtils.java b/src/main/java/org/apache/sysml/hops/codegen/template/TemplateUtils.java
index 4dc0bf2..497dae0 100644
--- a/src/main/java/org/apache/sysml/hops/codegen/template/TemplateUtils.java
+++ b/src/main/java/org/apache/sysml/hops/codegen/template/TemplateUtils.java
@@ -42,6 +42,7 @@ import org.apache.sysml.hops.codegen.cplan.CNodeTernary;
 import org.apache.sysml.hops.codegen.cplan.CNodeUnary;
 import org.apache.sysml.hops.codegen.cplan.CNodeUnary.UnaryType;
 import org.apache.sysml.hops.codegen.template.CPlanMemoTable.MemoTableEntry;
+import org.apache.sysml.hops.codegen.template.TemplateBase.CloseType;
 import org.apache.sysml.hops.codegen.template.TemplateBase.TemplateType;
 import org.apache.sysml.hops.rewrite.HopRewriteUtils;
 import org.apache.sysml.hops.codegen.cplan.CNodeTernary.TernaryType;
@@ -143,27 +144,27 @@ public class TemplateUtils
 	}
 	
 	public static TemplateBase createTemplate(TemplateType type) {
-		return createTemplate(type, false);
+		return createTemplate(type, CloseType.OPEN_VALID);
 	}
 	
-	public static TemplateBase createTemplate(TemplateType type, boolean closed) {
+	public static TemplateBase createTemplate(TemplateType type, CloseType ctype) {
 		TemplateBase tpl = null;
 		switch( type ) {
-			case CELL: tpl = new TemplateCell(closed); break;
-			case ROW: tpl = new TemplateRow(closed); break;
-			case MAGG: tpl = new TemplateMultiAgg(closed); break;
-			case OUTER: tpl = new TemplateOuterProduct(closed); break;
+			case CELL: tpl = new TemplateCell(ctype); break;
+			case ROW: tpl = new TemplateRow(ctype); break;
+			case MAGG: tpl = new TemplateMultiAgg(ctype); break;
+			case OUTER: tpl = new TemplateOuterProduct(ctype); break;
 		}
 		return tpl;
 	}
 	
-	public static TemplateBase[] createCompatibleTemplates(TemplateType type, boolean closed) {
+	public static TemplateBase[] createCompatibleTemplates(TemplateType type, CloseType ctype) {
 		TemplateBase[] tpl = null;
 		switch( type ) {
-			case CELL: tpl = new TemplateBase[]{new TemplateCell(closed), new TemplateRow(closed)}; break;
-			case ROW: tpl = new TemplateBase[]{new TemplateRow(closed)}; break;
-			case MAGG: tpl = new TemplateBase[]{new TemplateMultiAgg(closed)}; break;
-			case OUTER: tpl = new TemplateBase[]{new TemplateOuterProduct(closed)}; break;
+			case CELL: tpl = new TemplateBase[]{new TemplateCell(ctype), new TemplateRow(ctype)}; break;
+			case ROW: tpl = new TemplateBase[]{new TemplateRow(ctype)}; break;
+			case MAGG: tpl = new TemplateBase[]{new TemplateMultiAgg(ctype)}; break;
+			case OUTER: tpl = new TemplateBase[]{new TemplateOuterProduct(ctype)}; break;
 		}
 		return tpl;
 	}

http://git-wip-us.apache.org/repos/asf/systemml/blob/2ca2d8aa/src/main/java/org/apache/sysml/hops/rewrite/HopRewriteUtils.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/hops/rewrite/HopRewriteUtils.java b/src/main/java/org/apache/sysml/hops/rewrite/HopRewriteUtils.java
index d96d1e4..b0f46b7 100644
--- a/src/main/java/org/apache/sysml/hops/rewrite/HopRewriteUtils.java
+++ b/src/main/java/org/apache/sysml/hops/rewrite/HopRewriteUtils.java
@@ -294,6 +294,16 @@ public class HopRewriteUtils
 		return null;
 	}
 	
+	public static Hop getLargestInput(Hop hop) {
+		Hop max = null; long maxSize = -1;
+		for(Hop in : hop.getInput())
+			if(in.getLength() > maxSize) {
+				max = in;
+				maxSize = in.getLength();
+			}
+		return max;
+	}
+	
 	public static Hop createDataGenOp( Hop input, double value ) 
 		throws HopsException
 	{		
@@ -854,7 +864,7 @@ public class HopRewriteUtils
 	public static boolean isBinarySparseSafe(Hop hop) {
 		if( !(hop instanceof BinaryOp) )
 			return false;
-		if( isBinary(hop, OpOp2.MULT) )
+		if( isBinary(hop, OpOp2.MULT, OpOp2.DIV) )
 			return true;
 		BinaryOp bop = (BinaryOp) hop;
 		Hop lit = bop.getInput().get(0) instanceof LiteralOp ? bop.getInput().get(0) :

http://git-wip-us.apache.org/repos/asf/systemml/blob/2ca2d8aa/src/test/java/org/apache/sysml/test/integration/functions/codegen/MiscPatternTest.java
----------------------------------------------------------------------
diff --git a/src/test/java/org/apache/sysml/test/integration/functions/codegen/MiscPatternTest.java b/src/test/java/org/apache/sysml/test/integration/functions/codegen/MiscPatternTest.java
index 75c28eb..b2dfc10 100644
--- a/src/test/java/org/apache/sysml/test/integration/functions/codegen/MiscPatternTest.java
+++ b/src/test/java/org/apache/sysml/test/integration/functions/codegen/MiscPatternTest.java
@@ -38,6 +38,8 @@ public class MiscPatternTest extends AutomatedTestBase
 	private static final String TEST_NAME = "miscPattern";
 	private static final String TEST_NAME1 = TEST_NAME+"1"; //Y + (X * U%*%t(V)) overlapping cell-outer
 	private static final String TEST_NAME2 = TEST_NAME+"2"; //multi-agg w/ large common subexpression 
+	private static final String TEST_NAME3 = TEST_NAME+"3"; //sum((X!=0) * (U %*% t(V) - X)^2) 
+	private static final String TEST_NAME4 = TEST_NAME+"4"; //((X!=0) * (U %*% t(V) - X)) %*% V + Y overlapping row-outer
 	
 	private static final String TEST_DIR = "functions/codegen/";
 	private static final String TEST_CLASS_DIR = TEST_DIR + MiscPatternTest.class.getSimpleName() + "/";
@@ -49,11 +51,11 @@ public class MiscPatternTest extends AutomatedTestBase
 	@Override
 	public void setUp() {
 		TestUtils.clearAssertionInformation();
-		for(int i=1; i<=2; i++)
+		for(int i=1; i<=4; i++)
 			addTestConfiguration( TEST_NAME+i, new TestConfiguration(TEST_CLASS_DIR, TEST_NAME+i, new String[] { String.valueOf(i) }) );
 	}
 	
-	@Test	
+	@Test
 	public void testCodegenMiscRewrite1CP() {
 		testCodegenIntegration( TEST_NAME1, true, ExecType.CP );
 	}
@@ -68,7 +70,7 @@ public class MiscPatternTest extends AutomatedTestBase
 		testCodegenIntegration( TEST_NAME1, false, ExecType.SPARK );
 	}
 	
-	@Test	
+	@Test
 	public void testCodegenMiscRewrite2CP() {
 		testCodegenIntegration( TEST_NAME2, true, ExecType.CP );
 	}
@@ -83,6 +85,36 @@ public class MiscPatternTest extends AutomatedTestBase
 		testCodegenIntegration( TEST_NAME2, false, ExecType.SPARK );
 	}
 	
+	@Test
+	public void testCodegenMiscRewrite3CP() {
+		testCodegenIntegration( TEST_NAME3, true, ExecType.CP );
+	}
+	
+	@Test
+	public void testCodegenMisc3CP() {
+		testCodegenIntegration( TEST_NAME3, false, ExecType.CP );
+	}
+	
+	@Test
+	public void testCodegenMisc3SP() {
+		testCodegenIntegration( TEST_NAME3, false, ExecType.SPARK );
+	}
+	
+	@Test
+	public void testCodegenMiscRewrite4CP() {
+		testCodegenIntegration( TEST_NAME4, true, ExecType.CP );
+	}
+	
+	@Test
+	public void testCodegenMisc4CP() {
+		testCodegenIntegration( TEST_NAME4, false, ExecType.CP );
+	}
+	
+	@Test
+	public void testCodegenMisc4SP() {
+		testCodegenIntegration( TEST_NAME4, false, ExecType.SPARK );
+	}
+	
 	private void testCodegenIntegration( String testname, boolean rewrites, ExecType instType )
 	{	
 		boolean oldFlag = OptimizerUtils.ALLOW_ALGEBRAIC_SIMPLIFICATION;
@@ -128,6 +160,9 @@ public class MiscPatternTest extends AutomatedTestBase
 			else if( testname.equals(TEST_NAME2) )
 				Assert.assertTrue(!heavyHittersContainsSubString("spoof", 2)
 					&& !heavyHittersContainsSubString("sp_spoof", 2));
+			else if( testname.equals(TEST_NAME3) || testname.equals(TEST_NAME4) )
+				Assert.assertTrue(heavyHittersContainsSubString("spoofOP", "sp+spoofOP")
+					&& !heavyHittersContainsSubString("ba+*"));
 		}
 		finally {
 			rtplatform = platformOld;

http://git-wip-us.apache.org/repos/asf/systemml/blob/2ca2d8aa/src/test/scripts/functions/codegen/miscPattern3.R
----------------------------------------------------------------------
diff --git a/src/test/scripts/functions/codegen/miscPattern3.R b/src/test/scripts/functions/codegen/miscPattern3.R
new file mode 100644
index 0000000..e04ac73
--- /dev/null
+++ b/src/test/scripts/functions/codegen/miscPattern3.R
@@ -0,0 +1,34 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+# 
+#   http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+args<-commandArgs(TRUE)
+options(digits=22)
+library("Matrix")
+
+X = matrix(1, 1100, 2200);
+U = matrix(3, 1100, 10);
+V = matrix(4, 2200, 10)
+X[4:900,3:1000] = matrix(0, 897, 998);
+
+R1 = sum((X!=0) * (U %*% t(V) - X)^2)
+R2 = as.matrix(R1);
+
+writeMM(as(R2, "CsparseMatrix"), paste(args[1], "S", sep="")); 

http://git-wip-us.apache.org/repos/asf/systemml/blob/2ca2d8aa/src/test/scripts/functions/codegen/miscPattern3.dml
----------------------------------------------------------------------
diff --git a/src/test/scripts/functions/codegen/miscPattern3.dml b/src/test/scripts/functions/codegen/miscPattern3.dml
new file mode 100644
index 0000000..593bd8a
--- /dev/null
+++ b/src/test/scripts/functions/codegen/miscPattern3.dml
@@ -0,0 +1,34 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+# 
+#   http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+X = matrix(1, 1100, 2200);
+U = matrix(3, 1100, 10);
+V = matrix(4, 2200, 10)
+X[4:900,3:1000] = matrix(0, 897, 998);
+
+while(FALSE){}
+
+R1 = sum((X!=0) * (U %*% t(V) - X)^2)
+
+while(FALSE){}
+
+R2 = as.matrix(R1);
+write(R2, $1)

http://git-wip-us.apache.org/repos/asf/systemml/blob/2ca2d8aa/src/test/scripts/functions/codegen/miscPattern4.R
----------------------------------------------------------------------
diff --git a/src/test/scripts/functions/codegen/miscPattern4.R b/src/test/scripts/functions/codegen/miscPattern4.R
new file mode 100644
index 0000000..b8ea2e7
--- /dev/null
+++ b/src/test/scripts/functions/codegen/miscPattern4.R
@@ -0,0 +1,35 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+# 
+#   http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+args<-commandArgs(TRUE)
+options(digits=22)
+library("Matrix")
+
+X = matrix(1, 1100, 2200);
+Y = matrix(2, 1100, 10);
+U = matrix(3, 1100, 10);
+V = matrix(4, 2200, 10)
+X[4:900,3:1000] = matrix(0, 897, 998);
+
+R1 = ((X!=0) * (U %*% t(V) - X)) %*% V + Y;
+R2 = as.matrix(sum(R1));
+
+writeMM(as(R2, "CsparseMatrix"), paste(args[1], "S", sep="")); 

http://git-wip-us.apache.org/repos/asf/systemml/blob/2ca2d8aa/src/test/scripts/functions/codegen/miscPattern4.dml
----------------------------------------------------------------------
diff --git a/src/test/scripts/functions/codegen/miscPattern4.dml b/src/test/scripts/functions/codegen/miscPattern4.dml
new file mode 100644
index 0000000..c4a93d3
--- /dev/null
+++ b/src/test/scripts/functions/codegen/miscPattern4.dml
@@ -0,0 +1,35 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+# 
+#   http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+X = matrix(1, 1100, 2200);
+Y = matrix(2, 1100, 10);
+U = matrix(3, 1100, 10);
+V = matrix(4, 2200, 10)
+X[4:900,3:1000] = matrix(0, 897, 998);
+
+while(FALSE){}
+
+R1 = ((X!=0) * (U %*% t(V) - X)) %*% V + Y;
+
+while(FALSE){}
+
+R2 = as.matrix(sum(R1));
+write(R2, $1)


[43/50] [abbrv] systemml git commit: [SYSTEMML-1979] Improved codegen optimizer (cost model, various fixes)

Posted by re...@apache.org.
[SYSTEMML-1979] Improved codegen optimizer (cost model, various fixes)

This patch makes a number of improvements to the codegen optimizer,
which help exploit missed fusion potential for Kmeans over large
distributed datasets (i.e., with Spark codegen operations). In detail,
this includes the following changes:

1) Eviction-aware cost model: So far, we only took the write memory
bandwidth into account. With this change, we also account for known
evictions whenever the output and the temporary intermediate inputs are
known not to fit into the buffer pool (see the sketch after this list).

2) Generalized exploration of row fusion plans: This generalization
allows fusing matrix-matrix multiplications onto arbitrary row
operations, which makes it possible to fuse the entire Kmeans inner loop
where beneficial.

3) Row sumSq vector primitives: Additionally, we now compile sumSq
vector primitives instead of sum(pow(,2)), which avoids unnecessary
dense row vector intermediates (see the second sketch below).

4) Fix missing dense-sparse outer vector operations: So far we only
supported sparse-dense outer vector operations. With the above change
(2), the sparse input can also occur on the right hand side.

5) Fix cost model for unary aggregates: The compute costs for all types
of unary aggregates were incorrectly computed based on the output size
instead of the input size.

6) Fix row-to-cell conversion: This patch also makes some minor
corrections to the conversion of row to cell templates if there are no
aggregations or vector operations (e.g., only convert if a cell template
exists with exactly the same fusion references).

7) Fix robustness of temporary memory management: We use a preallocated
ring buffer for row vector intermediates of different sizes. This patch
restricts the size of preallocated vectors to 4MB to avoid OOMs in case
large vectors are not actually used by an operator.
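
For illustration, here is a minimal sketch of the eviction-aware write cost
from (1). It is not the actual implementation: only the bandwidth constants
and the idea of comparing against the buffer-pool size (LazyWriteBuffer) come
from this patch; the class, method, and parameter names are made up for the
example, and the real code applies additional conditions on the driver input.

    // Sketch: charge an extra disk-write penalty whenever the operator output
    // plus its temporary (non-transient-read) inputs exceed the buffer pool.
    public class EvictionCostSketch {
      static final double WRITE_BANDWIDTH_MEM = 2d * 1024 * 1024 * 1024; // 2GB/s
      static final double WRITE_BANDWIDTH_IO  = 512d * 1024 * 1024;      // 512MB/s

      // outBytes: output size in bytes; tmpBytes: output plus intermediate
      // input sizes in bytes; bufferPoolBytes: buffer pool capacity in bytes
      static double writeCosts(double outBytes, double tmpBytes, double bufferPoolBytes) {
        double costs = outBytes / WRITE_BANDWIDTH_MEM;   // write into main memory
        if (tmpBytes > bufferPoolBytes)                  // known evictions
          costs += outBytes / WRITE_BANDWIDTH_IO;        // additional write to disk
        return costs;
      }
    }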

On a 200M x 100 dense input matrix (160GB), this patch improved the
end-to-end runtime of Kmeans (20 iterations, 5 centroids) w/ codegen
from 5319s to 281s.
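
As a concrete example for (3), the new sumSq row primitive computes the sum
of squares directly instead of materializing a pow(X,2) row intermediate. The
actual primitive delegates to LibMatrixMult.dotProduct(a, a, ...); the
following stand-alone method only sketches its dense semantics.

    // Sketch of vectSumsq over a[ai..ai+len): sum of squares in one pass,
    // without allocating a temporary squared row vector.
    static double vectSumsqSketch(double[] a, int ai, int len) {
      double sum = 0;
      for (int i = ai; i < ai + len; i++)
        sum += a[i] * a[i];
      return sum;
    }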


Project: http://git-wip-us.apache.org/repos/asf/systemml/repo
Commit: http://git-wip-us.apache.org/repos/asf/systemml/commit/d907efc1
Tree: http://git-wip-us.apache.org/repos/asf/systemml/tree/d907efc1
Diff: http://git-wip-us.apache.org/repos/asf/systemml/diff/d907efc1

Branch: refs/heads/master
Commit: d907efc17456d7536e1a7344a614aa8a122721ee
Parents: d916ba5
Author: Matthias Boehm <mb...@gmail.com>
Authored: Sun Oct 29 23:21:30 2017 -0700
Committer: Matthias Boehm <mb...@gmail.com>
Committed: Mon Oct 30 18:37:34 2017 -0700

----------------------------------------------------------------------
 .../sysml/hops/codegen/cplan/CNodeBinary.java   |  3 +-
 .../sysml/hops/codegen/cplan/CNodeUnary.java    | 11 +--
 .../opt/PlanSelectionFuseCostBasedV2.java       | 72 +++++++++++++-------
 .../hops/codegen/template/CPlanMemoTable.java   | 10 +++
 .../hops/codegen/template/TemplateRow.java      | 47 ++++++++++---
 .../runtime/codegen/LibSpoofPrimitives.java     | 27 ++++++++
 .../gpu/ConvolutionGPUInstruction.java          |  5 --
 .../functions/codegen/RowAggTmplTest.java       | 18 ++++-
 .../scripts/functions/codegen/rowAggPattern33.R | 36 ++++++++++
 .../functions/codegen/rowAggPattern33.dml       | 33 +++++++++
 10 files changed, 215 insertions(+), 47 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/systemml/blob/d907efc1/src/main/java/org/apache/sysml/hops/codegen/cplan/CNodeBinary.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/hops/codegen/cplan/CNodeBinary.java b/src/main/java/org/apache/sysml/hops/codegen/cplan/CNodeBinary.java
index d188afd..8c3c73d 100644
--- a/src/main/java/org/apache/sysml/hops/codegen/cplan/CNodeBinary.java
+++ b/src/main/java/org/apache/sysml/hops/codegen/cplan/CNodeBinary.java
@@ -77,7 +77,8 @@ public class CNodeBinary extends CNode
 					return sparseLhs ? "    double[] %TMP% = LibSpoofPrimitives.vectMatrixMult(%IN1v%, %IN2%, %IN1i%, %POS1%, %POS2%, alen, len);\n" :
 									"    double[] %TMP% = LibSpoofPrimitives.vectMatrixMult(%IN1%, %IN2%, %POS1%, %POS2%, %LEN%);\n";
 				case VECT_OUTERMULT_ADD:
-					return sparseLhs ? "    LibSpoofPrimitives.vectOuterMultAdd(%IN1v%, %IN2%, %OUT%, %IN1i%, %POS1%, %POS2%, %POSOUT%, alen, %LEN1%, %LEN2%);\n" :
+					return  sparseLhs ? "    LibSpoofPrimitives.vectOuterMultAdd(%IN1v%, %IN2%, %OUT%, %IN1i%, %POS1%, %POS2%, %POSOUT%, alen, %LEN1%, %LEN2%);\n" :
+							sparseRhs ? "    LibSpoofPrimitives.vectOuterMultAdd(%IN1%, %IN2v%, %OUT%, %POS1%, %IN2i%, %POS2%, %POSOUT%, alen, %LEN1%, %LEN2%);\n" :
 									"    LibSpoofPrimitives.vectOuterMultAdd(%IN1%, %IN2%, %OUT%, %POS1%, %POS2%, %POSOUT%, %LEN1%, %LEN2%);\n";
 				
 				//vector-scalar-add operations

http://git-wip-us.apache.org/repos/asf/systemml/blob/d907efc1/src/main/java/org/apache/sysml/hops/codegen/cplan/CNodeUnary.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/hops/codegen/cplan/CNodeUnary.java b/src/main/java/org/apache/sysml/hops/codegen/cplan/CNodeUnary.java
index 3a3dc79..891bfb9 100644
--- a/src/main/java/org/apache/sysml/hops/codegen/cplan/CNodeUnary.java
+++ b/src/main/java/org/apache/sysml/hops/codegen/cplan/CNodeUnary.java
@@ -30,7 +30,7 @@ public class CNodeUnary extends CNode
 {
 	public enum UnaryType {
 		LOOKUP_R, LOOKUP_C, LOOKUP_RC, LOOKUP0, //codegen specific
-		ROW_SUMS, ROW_MINS, ROW_MAXS, ROW_COUNTNNZS, //codegen specific
+		ROW_SUMS, ROW_SUMSQS, ROW_MINS, ROW_MAXS, ROW_COUNTNNZS, //codegen specific
 		VECT_EXP, VECT_POW2, VECT_MULT2, VECT_SQRT, VECT_LOG,
 		VECT_ABS, VECT_ROUND, VECT_CEIL, VECT_FLOOR, VECT_SIGN, 
 		VECT_SIN, VECT_COS, VECT_TAN, VECT_ASIN, VECT_ACOS, VECT_ATAN, 
@@ -51,6 +51,7 @@ public class CNodeUnary extends CNode
 		public String getTemplate(boolean sparse) {
 			switch( this ) {
 				case ROW_SUMS:
+				case ROW_SUMSQS:
 				case ROW_MINS:
 				case ROW_MAXS:
 				case ROW_COUNTNNZS: {
@@ -242,9 +243,10 @@ public class CNodeUnary extends CNode
 	@Override
 	public String toString() {
 		switch(_type) {
-			case ROW_SUMS:  return "u(R+)";
-			case ROW_MINS:  return "u(Rmin)";
-			case ROW_MAXS:  return "u(Rmax)";
+			case ROW_SUMS:   return "u(R+)";
+			case ROW_SUMSQS: return "u(Rsq+)";
+			case ROW_MINS:   return "u(Rmin)";
+			case ROW_MAXS:   return "u(Rmax)";
 			case ROW_COUNTNNZS: return "u(Rnnz)";
 			case VECT_EXP:
 			case VECT_POW2:
@@ -308,6 +310,7 @@ public class CNodeUnary extends CNode
 				break;
 			
 			case ROW_SUMS:
+			case ROW_SUMSQS:
 			case ROW_MINS:
 			case ROW_MAXS:
 			case ROW_COUNTNNZS:

http://git-wip-us.apache.org/repos/asf/systemml/blob/d907efc1/src/main/java/org/apache/sysml/hops/codegen/opt/PlanSelectionFuseCostBasedV2.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/hops/codegen/opt/PlanSelectionFuseCostBasedV2.java b/src/main/java/org/apache/sysml/hops/codegen/opt/PlanSelectionFuseCostBasedV2.java
index 10875e8..4d8a7bc 100644
--- a/src/main/java/org/apache/sysml/hops/codegen/opt/PlanSelectionFuseCostBasedV2.java
+++ b/src/main/java/org/apache/sysml/hops/codegen/opt/PlanSelectionFuseCostBasedV2.java
@@ -42,6 +42,7 @@ import org.apache.sysml.hops.AggUnaryOp;
 import org.apache.sysml.hops.BinaryOp;
 import org.apache.sysml.hops.Hop;
 import org.apache.sysml.hops.Hop.AggOp;
+import org.apache.sysml.hops.Hop.DataOpTypes;
 import org.apache.sysml.hops.Hop.Direction;
 import org.apache.sysml.hops.Hop.OpOp2;
 import org.apache.sysml.hops.IndexingOp;
@@ -60,6 +61,7 @@ import org.apache.sysml.hops.codegen.template.TemplateRow;
 import org.apache.sysml.hops.codegen.template.TemplateUtils;
 import org.apache.sysml.hops.rewrite.HopRewriteUtils;
 import org.apache.sysml.runtime.codegen.LibSpoofPrimitives;
+import org.apache.sysml.runtime.controlprogram.caching.LazyWriteBuffer;
 import org.apache.sysml.runtime.controlprogram.parfor.stat.InfrastructureAnalyzer;
 import org.apache.sysml.runtime.controlprogram.parfor.util.IDSequence;
 import org.apache.sysml.runtime.util.UtilFunctions;
@@ -79,23 +81,24 @@ import org.apache.sysml.utils.Statistics;
  * 
  */
 public class PlanSelectionFuseCostBasedV2 extends PlanSelection
-{	
+{
 	private static final Log LOG = LogFactory.getLog(PlanSelectionFuseCostBasedV2.class.getName());
 	
 	//common bandwidth characteristics, with a conservative write bandwidth in order 
 	//to cover result allocation, write into main memory, and potential evictions
-	private static final double WRITE_BANDWIDTH = 2d*1024*1024*1024;  //2GB/s
-	private static final double READ_BANDWIDTH = 32d*1024*1024*1024;  //32GB/s
-	private static final double READ_BANDWIDTH_BROADCAST = WRITE_BANDWIDTH/4;
-	private static final double COMPUTE_BANDWIDTH = 2d*1024*1024*1024 //2GFLOPs/core
-		* InfrastructureAnalyzer.getLocalParallelism();
+	private static final double WRITE_BANDWIDTH_IO  =      512*1024*1024;  //512MB/s
+	private static final double WRITE_BANDWIDTH_MEM =  2d*1024*1024*1024;  //2GB/s
+	private static final double READ_BANDWIDTH_MEM  = 32d*1024*1024*1024;  //32GB/s
+	private static final double READ_BANDWIDTH_BROADCAST = WRITE_BANDWIDTH_MEM/4;
+	private static final double COMPUTE_BANDWIDTH  =   2d*1024*1024*1024   //2GFLOPs/core
+								* InfrastructureAnalyzer.getLocalParallelism();
 	
 	//sparsity estimate for unknown sparsity to prefer sparse-safe fusion plans
 	private static final double SPARSE_SAFE_SPARSITY_EST = 0.1;
 	
 	//optimizer configuration
 	public static boolean COST_PRUNING = true;
-	public static boolean STRUCTURAL_PRUNING = false;
+	public static boolean STRUCTURAL_PRUNING = true;
 	
 	private static final IDSequence COST_ID = new IDSequence();
 	private static final TemplateRow ROW_TPL = new TemplateRow();
@@ -306,8 +309,8 @@ public class PlanSelectionFuseCostBasedV2 extends PlanSelection
 				matTargets.add(hopID);
 				Hop hop = memo.getHopRefs().get(hopID);
 				long size = getSize(hop);
-				costs += size * 8 / WRITE_BANDWIDTH + 
-						size * 8 / READ_BANDWIDTH;
+				costs += size * 8 / WRITE_BANDWIDTH_MEM + 
+						size * 8 / READ_BANDWIDTH_MEM;
 			}
 		}
 		//points with non-partition consumers
@@ -315,7 +318,7 @@ public class PlanSelectionFuseCostBasedV2 extends PlanSelection
 			if( !matTargets.contains(hopID) ) {
 				matTargets.add(hopID);
 				Hop hop = memo.getHopRefs().get(hopID);
-				costs += getSize(hop) * 8 / WRITE_BANDWIDTH;
+				costs += getSize(hop) * 8 / WRITE_BANDWIDTH_MEM;
 			}
 		
 		return costs;
@@ -326,7 +329,7 @@ public class PlanSelectionFuseCostBasedV2 extends PlanSelection
 		//get partition input reads (at least read once)
 		for( Long hopID : part.getInputs() ) {
 			Hop hop = memo.getHopRefs().get(hopID);
-			costs += getSize(hop) * 8 / READ_BANDWIDTH;
+			costs += getSize(hop) * 8 / READ_BANDWIDTH_MEM;
 		}
 		return costs;
 	}
@@ -335,7 +338,7 @@ public class PlanSelectionFuseCostBasedV2 extends PlanSelection
 		double costs = 0;
 		for( Long hopID : R ) {
 			Hop hop = memo.getHopRefs().get(hopID);
-			costs += getSize(hop) * 8 / WRITE_BANDWIDTH;
+			costs += getSize(hop) * 8 / WRITE_BANDWIDTH_MEM;
 		}
 		return costs;
 	}
@@ -345,6 +348,13 @@ public class PlanSelectionFuseCostBasedV2 extends PlanSelection
 			.mapToDouble(d -> d/COMPUTE_BANDWIDTH).sum();
 	}
 	
+	private static double sumTmpInputOutputSize(CPlanMemoTable memo, CostVector vect) {
+		//size of intermediate inputs and outputs, i.e., output and inputs other than treads
+		return vect.outSize + vect.inSizes.entrySet().stream()
+			.filter(e -> !HopRewriteUtils.isData(memo.getHopRefs().get(e.getKey()), DataOpTypes.TRANSIENTREAD))
+			.mapToDouble(e -> e.getValue()).sum();
+	}
+	
 	private static long getSize(Hop hop) {
 		return Math.max(hop.getDim1(),1) 
 			* Math.max(hop.getDim2(),1);
@@ -593,6 +603,7 @@ public class PlanSelectionFuseCostBasedV2 extends PlanSelection
 	
 	private static boolean isRowAggOp(Hop hop){
 		return (hop instanceof AggUnaryOp || hop instanceof AggBinaryOp
+			|| (hop instanceof IndexingOp && HopRewriteUtils.isColumnRangeIndexing((IndexingOp)hop))
 			|| HopRewriteUtils.isBinary(hop, OpOp2.CBIND));
 	}
 	
@@ -629,7 +640,7 @@ public class PlanSelectionFuseCostBasedV2 extends PlanSelection
 		HashSet<Long> refAggs = getRowAggOpsWithRowRef(memo, part);
 		for( Long hopID : part.getPartition() ) {
 			MemoTableEntry me = memo.getBest(hopID, TemplateType.ROW);
-			if( me != null && me.type == TemplateType.ROW && memo.contains(hopID, TemplateType.CELL)
+			if( me != null && me.type == TemplateType.ROW && memo.contains(hopID, me, TemplateType.CELL)
 				&& rIsRowTemplateWithoutAggOrVects(memo, memo.getHopRefs().get(hopID), new HashSet<Long>(), refAggs.contains(hopID)) ) {
 				List<MemoTableEntry> blacklist = memo.get(hopID, TemplateType.ROW); 
 				memo.remove(memo.getHopRefs().get(hopID), new HashSet<>(blacklist));
@@ -829,12 +840,8 @@ public class PlanSelectionFuseCostBasedV2 extends PlanSelection
 		
 		//add costs for opened fused operator
 		if( opened ) {
-			if( LOG.isTraceEnabled() ) {
-				String type = (best !=null) ? best.type.name() : "HOP";
-				LOG.trace("Cost vector ("+type+" "+currentHopId+"): "+costVect);
-			}
-			double tmpCosts = costVect.outSize * 8 / WRITE_BANDWIDTH
-				+ Math.max(costVect.getInputSize() * 8 / READ_BANDWIDTH,
+			double tmpCosts = costVect.outSize * 8 / WRITE_BANDWIDTH_MEM
+				+ Math.max(costVect.getInputSize() * 8 / READ_BANDWIDTH_MEM,
 				costVect.computeCosts/ COMPUTE_BANDWIDTH);
 			//read correction for distributed computation
 			Hop driver = memo.getHopRefs().get(costVect.getMaxInputSizeHopID());
@@ -843,7 +850,15 @@ public class PlanSelectionFuseCostBasedV2 extends PlanSelection
 			//sparsity correction for outer-product template (and sparse-safe cell)
 			if( best != null && best.type == TemplateType.OUTER )
 				tmpCosts *= driver.dimsKnown(true) ? driver.getSparsity() : SPARSE_SAFE_SPARSITY_EST;
+			//write correction for known evictions in CP
+			else if( driver.getMemEstimate() < OptimizerUtils.getLocalMemBudget()
+				&& sumTmpInputOutputSize(memo, costVect) > LazyWriteBuffer.getWriteBufferSize() )
+				tmpCosts += costVect.outSize * 8 / WRITE_BANDWIDTH_IO;
 			costs += tmpCosts;
+			if( LOG.isTraceEnabled() ) {
+				String type = (best !=null) ? best.type.name() : "HOP";
+				LOG.trace("Cost vector ("+type+" "+currentHopId+"): "+costVect+" -> "+tmpCosts);
+			}
 		}
 		//add costs for non-partition read in the middle of fused operator
 		else if( part.getExtConsumed().contains(current.getHopID()) ) {
@@ -985,13 +1000,18 @@ public class PlanSelectionFuseCostBasedV2 extends PlanSelection
 		}
 		else if( current instanceof AggUnaryOp) {
 			switch(((AggUnaryOp)current).getOp()) {
-			case SUM:    costs = 4; break; 
-			case SUM_SQ: costs = 5; break;
-			case MIN:
-			case MAX:    costs = 1; break;
-			default:
-				LOG.warn("Cost model not "
-					+ "implemented yet for: "+((AggUnaryOp)current).getOp());
+				case SUM:    costs = 4; break; 
+				case SUM_SQ: costs = 5; break;
+				case MIN:
+				case MAX:    costs = 1; break;
+				default:
+					LOG.warn("Cost model not "
+						+ "implemented yet for: "+((AggUnaryOp)current).getOp());
+			}
+			switch(((AggUnaryOp)current).getDirection()) {
+				case Col: costs *= Math.max(current.getInput().get(0).getDim1(),1); break;
+				case Row: costs *= Math.max(current.getInput().get(0).getDim2(),1); break;
+				case RowCol: costs *= getSize(current.getInput().get(0)); break;
 			}
 		}
 		

http://git-wip-us.apache.org/repos/asf/systemml/blob/d907efc1/src/main/java/org/apache/sysml/hops/codegen/template/CPlanMemoTable.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/hops/codegen/template/CPlanMemoTable.java b/src/main/java/org/apache/sysml/hops/codegen/template/CPlanMemoTable.java
index 5eedc7b..0c3bb90 100644
--- a/src/main/java/org/apache/sysml/hops/codegen/template/CPlanMemoTable.java
+++ b/src/main/java/org/apache/sysml/hops/codegen/template/CPlanMemoTable.java
@@ -87,6 +87,11 @@ public class CPlanMemoTable
 			.anyMatch(p -> p.type==type);
 	}
 	
+	public boolean contains(long hopID, MemoTableEntry me, TemplateType type) {
+		return contains(hopID) && get(hopID).stream()
+			.anyMatch(p -> p.type==type && p.equalPlanRefs(me));
+	}
+	
 	public boolean contains(long hopID, boolean checkClose, TemplateType... type) {
 		if( !checkClose && type.length==1 )
 			return contains(hopID, type[0]);
@@ -408,6 +413,11 @@ public class CPlanMemoTable
 			return (input1>=0) ? 0 : (input2>=0) ? 
 				1 : (input3>=0) ? 2 : -1;
 		}
+		public boolean equalPlanRefs(MemoTableEntry that) {
+			return (input1 == that.input1
+				&& input2 == that.input2
+				&& input3 == that.input3);
+		}
 		public long input(int index) {
 			return (index==0) ? input1 : (index==1) ? input2 : input3;
 		}

http://git-wip-us.apache.org/repos/asf/systemml/blob/d907efc1/src/main/java/org/apache/sysml/hops/codegen/template/TemplateRow.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/hops/codegen/template/TemplateRow.java b/src/main/java/org/apache/sysml/hops/codegen/template/TemplateRow.java
index e14fbd3..b862abf 100644
--- a/src/main/java/org/apache/sysml/hops/codegen/template/TemplateRow.java
+++ b/src/main/java/org/apache/sysml/hops/codegen/template/TemplateRow.java
@@ -115,7 +115,8 @@ public class TemplateRow extends TemplateBase
 			|| (hop instanceof AggBinaryOp && hop.dimsKnown() && isFuseSkinnyMatrixMult(hop) //MM
 				&& HopRewriteUtils.isTransposeOperation(hop.getInput().get(0))
 				&& hop.getInput().get(0).getDim1()>1 && hop.getInput().get(0).getDim2()>1)
-			|| isPartOfValidCumAggChain(hop) ); //cum* with transpose
+			|| isPartOfValidCumAggChain(hop) //cum* with transpose
+			|| isPartOfValidTransposeMMChain(hop)); //t(f(X))%*%X
 	}
 
 	@Override
@@ -176,6 +177,22 @@ public class TemplateRow extends TemplateBase
 				&& hop.getInput().get(0).getParent().size()==1);
 		}
 	}
+	
+	private static boolean isPartOfValidTransposeMMChain(Hop hop) {
+		//check if transpose is part of t(f(X))%*%X chain w/ single consumer
+		//for now: we restrict this to tall and skinny matrix multiplications
+		return HopRewriteUtils.isTransposeOperation(hop)
+			&& hop.getParent().size() == 1 
+			&& hop.dimsKnown() && hop.getParent().get(0).dimsKnown()
+			&& hop.getDim2() > 128 * hop.getParent().get(0).getDim1()
+			&& hop.getDim2() > 128 * hop.getParent().get(0).getDim2()
+			&& HopRewriteUtils.isMatrixMultiply(hop.getParent().get(0))
+			&& isFuseSkinnyMatrixMult(hop.getParent().get(0))
+			&& ((hop.getParent().get(0).getInput().get(0) == hop && 
+				HopRewriteUtils.containsInput(hop, hop.getParent().get(0).getInput().get(1)))
+				||(hop.getParent().get(0).getInput().get(1) == hop && 
+				HopRewriteUtils.containsInput(hop, hop.getParent().get(0).getInput().get(0))));
+	}
 
 	@Override
 	public Pair<Hop[], CNodeTpl> constructCplan(Hop hop, CPlanMemoTable memo, boolean compileLiterals) {
@@ -214,7 +231,7 @@ public class TemplateRow extends TemplateBase
 	}
 
 	private void rConstructCplan(Hop hop, CPlanMemoTable memo, HashMap<Long, CNode> tmp, HashSet<Hop> inHops, HashMap<String, Hop> inHops2, boolean compileLiterals) 
-	{	
+	{
 		//memoization for common subexpression elimination and to avoid redundant work 
 		if( tmp.containsKey(hop.getHopID()) )
 			return;
@@ -240,15 +257,20 @@ public class TemplateRow extends TemplateBase
 			if( ((AggUnaryOp)hop).getDirection() == Direction.Row && HopRewriteUtils.isAggUnaryOp(hop, SUPPORTED_ROW_AGG) ) {
 				if(hop.getInput().get(0).getDim2()==1)
 					out = (cdata1.getDataType()==DataType.SCALAR) ? cdata1 : new CNodeUnary(cdata1,UnaryType.LOOKUP_R);
-				else if( HopRewriteUtils.isAggUnaryOp(hop, AggOp.SUM) 
+				else if( HopRewriteUtils.isAggUnaryOp(hop, AggOp.SUM)
 					&& HopRewriteUtils.isBinaryMatrixScalar(hop.getInput().get(0), OpOp2.NOTEQUAL, 0)
 					&& cdata1 instanceof CNodeBinary ) {
 					out = new CNodeUnary(cdata1.getInput().get(0), UnaryType.ROW_COUNTNNZS);
 				}
+				else if( HopRewriteUtils.isAggUnaryOp(hop, AggOp.SUM)
+					&& HopRewriteUtils.isBinaryMatrixScalar(hop.getInput().get(0), OpOp2.POW, 2)
+					&& cdata1 instanceof CNodeBinary ) {
+					out = new CNodeUnary(cdata1.getInput().get(0), UnaryType.ROW_SUMSQS);
+				}
 				else {
 					String opcode = "ROW_"+((AggUnaryOp)hop).getOp().name().toUpperCase()+"S";
 					out = new CNodeUnary(cdata1, UnaryType.valueOf(opcode));
-					if( cdata1 instanceof CNodeData && inHops2.isEmpty() )
+					if( cdata1 instanceof CNodeData && !inHops2.containsKey("X") )
 						inHops2.put("X", hop.getInput().get(0));
 				}
 			}
@@ -275,17 +297,22 @@ public class TemplateRow extends TemplateBase
 				//correct input under transpose
 				cdata1 = TemplateUtils.skipTranspose(cdata1, hop.getInput().get(0), tmp, compileLiterals);
 				inHops.remove(hop.getInput().get(0));
-				inHops.add(hop.getInput().get(0).getInput().get(0));
+				if( cdata1 instanceof CNodeData )
+					inHops.add(hop.getInput().get(0).getInput().get(0));
 				
 				//note: vectorMultAdd applicable to vector-scalar, and vector-vector
 				if( hop.getInput().get(1).getDim2() == 1 )
 					out = new CNodeBinary(cdata1, cdata2, BinType.VECT_MULT_ADD);
 				else {
 					out = new CNodeBinary(cdata1, cdata2, BinType.VECT_OUTERMULT_ADD);
-					if( !inHops2.containsKey("B1") )
+					if( !inHops2.containsKey("B1") ) { //incl modification of X for consistency
+						if( cdata1 instanceof CNodeData )
+							inHops2.put("X", hop.getInput().get(0).getInput().get(0));
 						inHops2.put("B1", hop.getInput().get(1));
+					}
 				}
-				inHops2.put("X", hop.getInput().get(0).getInput().get(0));
+				if( !inHops2.containsKey("X") )
+					inHops2.put("X", hop.getInput().get(0).getInput().get(0));
 			}
 			else
 			{
@@ -321,7 +348,7 @@ public class TemplateRow extends TemplateBase
 				if( HopRewriteUtils.isUnary(hop, SUPPORTED_VECT_UNARY) ) {
 					String opname = "VECT_"+((UnaryOp)hop).getOp().name();
 					out = new CNodeUnary(cdata1, UnaryType.valueOf(opname));
-					if( cdata1 instanceof CNodeData && inHops2.isEmpty() )
+					if( cdata1 instanceof CNodeData && !inHops2.containsKey("X") )
 						inHops2.put("X", hop.getInput().get(0));
 				}
 				else 
@@ -350,7 +377,7 @@ public class TemplateRow extends TemplateBase
 				cdata2 = TemplateUtils.wrapLookupIfNecessary(cdata2, hop.getInput().get(1));
 			}
 			out = new CNodeBinary(cdata1, cdata2, BinType.VECT_CBIND);
-			if( cdata1 instanceof CNodeData )
+			if( cdata1 instanceof CNodeData && !inHops2.containsKey("X") )
 				inHops2.put("X", hop.getInput().get(0));
 		}
 		else if(hop instanceof BinaryOp)
@@ -379,7 +406,7 @@ public class TemplateRow extends TemplateBase
 							cdata2 = new CNodeUnary(cdata2, UnaryType.LOOKUP_R);
 						out = new CNodeBinary(cdata1, cdata2, BinType.valueOf(opname));
 					}
-					if( cdata1 instanceof CNodeData && inHops2.isEmpty()
+					if( cdata1 instanceof CNodeData && !inHops2.containsKey("X")
 						&& !(cdata1.getDataType()==DataType.SCALAR) ) {
 						inHops2.put("X", hop.getInput().get(0));
 					}

http://git-wip-us.apache.org/repos/asf/systemml/blob/d907efc1/src/main/java/org/apache/sysml/runtime/codegen/LibSpoofPrimitives.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/codegen/LibSpoofPrimitives.java b/src/main/java/org/apache/sysml/runtime/codegen/LibSpoofPrimitives.java
index 356c729..1d56f1c 100644
--- a/src/main/java/org/apache/sysml/runtime/codegen/LibSpoofPrimitives.java
+++ b/src/main/java/org/apache/sysml/runtime/codegen/LibSpoofPrimitives.java
@@ -118,6 +118,19 @@ public class LibSpoofPrimitives
 		}
 	}
 	
+	public static void vectOuterMultAdd(double[] a, double[] b, double[] c, int ai, int[] bix, int bi, int ci, int blen, int len1, int len2) {
+		if( isFlipOuter(len1, len2) ) {
+			for( int i=bi; i<bi+blen; i++ ) {
+				final int cix = ci + bix[i] * len1;
+				LibMatrixMult.vectMultiplyAdd(b[i], a, c, ai, cix, len1);
+			}
+		}
+		else {
+			for( int i=0, cix=ci; i < len1; i++, cix+=len2 )
+				LibMatrixMult.vectMultiplyAdd(a[ai+i], b, c, bix, bi, cix, blen);
+		}
+	}
+	
 	public static void vectMultAdd(double[] a, double bval, double[] c, int bi, int ci, int len) {
 		if( a == null || bval == 0 ) return;
 		LibMatrixMult.vectMultiplyAdd(bval, a, c, bi, ci, len);
@@ -257,6 +270,14 @@ public class LibSpoofPrimitives
 		return vectSum(avals, ai, alen);
 	}
 	
+	public static double vectSumsq(double[] a, int ai, int len) { 
+		return LibMatrixMult.dotProduct(a, a, ai, ai, len);
+	}
+	
+	public static double vectSumsq(double[] avals, int[] aix, int ai, int alen, int len) {
+		return LibMatrixMult.dotProduct(avals, avals, ai, ai, alen);
+	}
+	
 	public static double vectMin(double[] a, int ai, int len) { 
 		double val = Double.MAX_VALUE;
 		for( int i = ai; i < ai+len; i++ )
@@ -1837,12 +1858,18 @@ public class LibSpoofPrimitives
 	 * vectors of different sizes are interspersed.
 	 */
 	private static class VectorBuffer {
+		private static final int MAX_SIZE = 512*1024; //4MB
 		private final double[][] _data;
 		private int _pos;
 		private int _len1;
 		private int _len2;
 		
 		public VectorBuffer(int num, int len1, int len2) {
+			//best effort size restriction since large intermediates
+			//not necessarily used (num refers to the total number)
+			len1 = Math.min(len1, MAX_SIZE);
+			len2 = Math.min(len2, MAX_SIZE);
+			//pre-allocate ring buffer
 			int lnum = (len2>0 && len1!=len2) ? 2*num : num;
 			_data = new double[lnum][];
 			for( int i=0; i<num; i++ ) {

http://git-wip-us.apache.org/repos/asf/systemml/blob/d907efc1/src/main/java/org/apache/sysml/runtime/instructions/gpu/ConvolutionGPUInstruction.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/instructions/gpu/ConvolutionGPUInstruction.java b/src/main/java/org/apache/sysml/runtime/instructions/gpu/ConvolutionGPUInstruction.java
index fdb208e..62a20b8 100644
--- a/src/main/java/org/apache/sysml/runtime/instructions/gpu/ConvolutionGPUInstruction.java
+++ b/src/main/java/org/apache/sysml/runtime/instructions/gpu/ConvolutionGPUInstruction.java
@@ -20,17 +20,12 @@ package org.apache.sysml.runtime.instructions.gpu;
 
 import java.util.ArrayList;
 
-import jcuda.Pointer;
-
 import org.apache.sysml.runtime.DMLRuntimeException;
 import org.apache.sysml.runtime.controlprogram.caching.MatrixObject;
 import org.apache.sysml.runtime.controlprogram.context.ExecutionContext;
 import org.apache.sysml.runtime.functionobjects.SwapIndex;
 import org.apache.sysml.runtime.instructions.InstructionUtils;
 import org.apache.sysml.runtime.instructions.cp.CPOperand;
-import org.apache.sysml.runtime.instructions.cp.ConvolutionCPInstruction;
-import org.apache.sysml.runtime.instructions.gpu.context.ExecutionConfig;
-import org.apache.sysml.runtime.instructions.gpu.context.GPUContext;
 import org.apache.sysml.runtime.matrix.data.LibMatrixCUDA;
 import org.apache.sysml.runtime.matrix.data.LibMatrixCuDNN;
 import org.apache.sysml.runtime.matrix.operators.ReorgOperator;

http://git-wip-us.apache.org/repos/asf/systemml/blob/d907efc1/src/test/java/org/apache/sysml/test/integration/functions/codegen/RowAggTmplTest.java
----------------------------------------------------------------------
diff --git a/src/test/java/org/apache/sysml/test/integration/functions/codegen/RowAggTmplTest.java b/src/test/java/org/apache/sysml/test/integration/functions/codegen/RowAggTmplTest.java
index 78305e3..5d2015f 100644
--- a/src/test/java/org/apache/sysml/test/integration/functions/codegen/RowAggTmplTest.java
+++ b/src/test/java/org/apache/sysml/test/integration/functions/codegen/RowAggTmplTest.java
@@ -69,6 +69,7 @@ public class RowAggTmplTest extends AutomatedTestBase
 	private static final String TEST_NAME30 = TEST_NAME+"30"; //Mlogreg inner core, multi-class
 	private static final String TEST_NAME31 = TEST_NAME+"31"; //MLogreg - matrix-vector cbind 0s generalized
 	private static final String TEST_NAME32 = TEST_NAME+"32"; //X[, 1] - rowSums(X)
+	private static final String TEST_NAME33 = TEST_NAME+"33"; //Kmeans, inner loop
 	
 	private static final String TEST_DIR = "functions/codegen/";
 	private static final String TEST_CLASS_DIR = TEST_DIR + RowAggTmplTest.class.getSimpleName() + "/";
@@ -80,7 +81,7 @@ public class RowAggTmplTest extends AutomatedTestBase
 	@Override
 	public void setUp() {
 		TestUtils.clearAssertionInformation();
-		for(int i=1; i<=32; i++)
+		for(int i=1; i<=33; i++)
 			addTestConfiguration( TEST_NAME+i, new TestConfiguration(TEST_CLASS_DIR, TEST_NAME+i, new String[] { String.valueOf(i) }) );
 	}
 	
@@ -564,6 +565,21 @@ public class RowAggTmplTest extends AutomatedTestBase
 		testCodegenIntegration( TEST_NAME32, false, ExecType.SPARK );
 	}
 	
+	@Test
+	public void testCodegenRowAggRewrite33CP() {
+		testCodegenIntegration( TEST_NAME33, true, ExecType.CP );
+	}
+	
+	@Test
+	public void testCodegenRowAgg33CP() {
+		testCodegenIntegration( TEST_NAME33, false, ExecType.CP );
+	}
+	
+	@Test
+	public void testCodegenRowAgg33SP() {
+		testCodegenIntegration( TEST_NAME33, false, ExecType.SPARK );
+	}
+	
 	private void testCodegenIntegration( String testname, boolean rewrites, ExecType instType )
 	{	
 		boolean oldFlag = OptimizerUtils.ALLOW_ALGEBRAIC_SIMPLIFICATION;

http://git-wip-us.apache.org/repos/asf/systemml/blob/d907efc1/src/test/scripts/functions/codegen/rowAggPattern33.R
----------------------------------------------------------------------
diff --git a/src/test/scripts/functions/codegen/rowAggPattern33.R b/src/test/scripts/functions/codegen/rowAggPattern33.R
new file mode 100644
index 0000000..2d2490d
--- /dev/null
+++ b/src/test/scripts/functions/codegen/rowAggPattern33.R
@@ -0,0 +1,36 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+# 
+#   http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+args<-commandArgs(TRUE)
+options(digits=22)
+library("Matrix")
+library("matrixStats")
+
+X = matrix(seq(1,6000)/6000, 600, 10, byrow=TRUE);
+C = matrix(seq(1,40)/40, 4, 10, byrow=TRUE);
+
+D = -2 * (X %*% t(C)) + matrix(1,nrow(X),1)%*%t(rowSums (C ^ 2));
+P = (D <= (rowMins (D) %*% matrix(1, 1, ncol(D))));
+P = P / rowSums (P);
+P_denom = colSums (P);
+R = (t(P) %*% X) / P_denom%*%matrix(1,1,ncol(X));
+
+writeMM(as(R, "CsparseMatrix"), paste(args[2], "S", sep="")); 

http://git-wip-us.apache.org/repos/asf/systemml/blob/d907efc1/src/test/scripts/functions/codegen/rowAggPattern33.dml
----------------------------------------------------------------------
diff --git a/src/test/scripts/functions/codegen/rowAggPattern33.dml b/src/test/scripts/functions/codegen/rowAggPattern33.dml
new file mode 100644
index 0000000..54c277c
--- /dev/null
+++ b/src/test/scripts/functions/codegen/rowAggPattern33.dml
@@ -0,0 +1,33 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+# 
+#   http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+
+X = matrix(seq(1,6000)/6000, 600, 10);
+C = matrix(seq(1,40)/40, 4, 10);
+while(FALSE){}
+
+D = -2 * (X %*% t(C)) + t(rowSums (C ^ 2));
+P = D <= rowMins(D);
+P = P / rowSums (P);
+P_denom = colSums (P);
+R = (t(P) %*% X) / t(P_denom);
+
+write(R, $1)


[29/50] [abbrv] systemml git commit: [SYSTEMML-1969] Support single-precision operations on GPU backend

Posted by re...@apache.org.
http://git-wip-us.apache.org/repos/asf/systemml/blob/abbffc55/src/main/cpp/kernels/SystemML.ptx
----------------------------------------------------------------------
diff --git a/src/main/cpp/kernels/SystemML.ptx b/src/main/cpp/kernels/SystemML.ptx
index 73b057e..d382fc5 100644
--- a/src/main/cpp/kernels/SystemML.ptx
+++ b/src/main/cpp/kernels/SystemML.ptx
@@ -1,8 +1,8 @@
 //
 // Generated by NVIDIA NVVM Compiler
 //
-// Compiler Build ID: CL-21124049
-// Cuda compilation tools, release 8.0, V8.0.44
+// Compiler Build ID: CL-21554848
+// Cuda compilation tools, release 8.0, V8.0.61
 // Based on LLVM 3.4svn
 //
 
@@ -10,7 +10,7 @@
 .target sm_30
 .address_size 64
 
-	// .globl	slice_sparse_dense_row
+	// .globl	double2float_f
 .func  (.param .b64 func_retval0) __internal_trig_reduction_slowpathd
 (
 	.param .b64 __internal_trig_reduction_slowpathd_param_0,
@@ -23,20 +23,97 @@
 	.param .b64 __internal_accurate_pow_param_1
 )
 ;
-.extern .shared .align 8 .b8 sdata[];
+.extern .shared .align 1 .b8 my_sdata[];
+.const .align 4 .b8 __cudart_i2opi_f[24] = {65, 144, 67, 60, 153, 149, 98, 219, 192, 221, 52, 245, 209, 87, 39, 252, 41, 21, 68, 78, 110, 131, 249, 162};
 .const .align 8 .b8 __cudart_i2opi_d[144] = {8, 93, 141, 31, 177, 95, 251, 107, 234, 146, 82, 138, 247, 57, 7, 61, 123, 241, 229, 235, 199, 186, 39, 117, 45, 234, 95, 158, 102, 63, 70, 79, 183, 9, 203, 39, 207, 126, 54, 109, 31, 109, 10, 90, 139, 17, 47, 239, 15, 152, 5, 222, 255, 151, 248, 31, 59, 40, 249, 189, 139, 95, 132, 156, 244, 57, 83, 131, 57, 214, 145, 57, 65, 126, 95, 180, 38, 112, 156, 233, 132, 68, 187, 46, 245, 53, 130, 232, 62, 167, 41, 177, 28, 235, 29, 254, 28, 146, 209, 9, 234, 46, 73, 6, 224, 210, 77, 66, 58, 110, 36, 183, 97, 197, 187, 222, 171, 99, 81, 254, 65, 144, 67, 60, 153, 149, 98, 219, 192, 221, 52, 245, 209, 87, 39, 252, 41, 21, 68, 78, 110, 131, 249, 162};
 .const .align 8 .b8 __cudart_sin_cos_coeffs[128] = {186, 94, 120, 249, 101, 219, 229, 61, 70, 210, 176, 44, 241, 229, 90, 190, 146, 227, 172, 105, 227, 29, 199, 62, 161, 98, 219, 25, 160, 1, 42, 191, 24, 8, 17, 17, 17, 17, 129, 63, 84, 85, 85, 85, 85, 85, 197, 191, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 100, 129, 253, 32, 131, 255, 168, 189, 40, 133, 239, 193, 167, 238, 33, 62, 217, 230, 6, 142, 79, 126, 146, 190, 233, 188, 221, 25, 160, 1, 250, 62, 71, 93, 193, 22, 108, 193, 86, 191, 81, 85, 85, 85, 85, 85, 165, 63, 0, 0, 0, 0, 0, 0, 224, 191, 0, 0, 0, 0, 0, 0, 240, 63};
 
-.visible .entry slice_sparse_dense_row(
-	.param .u64 slice_sparse_dense_row_param_0,
-	.param .u64 slice_sparse_dense_row_param_1,
-	.param .u64 slice_sparse_dense_row_param_2,
-	.param .u64 slice_sparse_dense_row_param_3,
-	.param .u32 slice_sparse_dense_row_param_4,
-	.param .u32 slice_sparse_dense_row_param_5,
-	.param .u32 slice_sparse_dense_row_param_6,
-	.param .u32 slice_sparse_dense_row_param_7,
-	.param .u32 slice_sparse_dense_row_param_8
+.visible .entry double2float_f(
+	.param .u64 double2float_f_param_0,
+	.param .u64 double2float_f_param_1,
+	.param .u32 double2float_f_param_2
+)
+{
+	.reg .pred 	%p<2>;
+	.reg .f32 	%f<2>;
+	.reg .b32 	%r<6>;
+	.reg .f64 	%fd<2>;
+	.reg .b64 	%rd<9>;
+
+
+	ld.param.u64 	%rd1, [double2float_f_param_0];
+	ld.param.u64 	%rd2, [double2float_f_param_1];
+	ld.param.u32 	%r2, [double2float_f_param_2];
+	mov.u32 	%r3, %ctaid.x;
+	mov.u32 	%r4, %ntid.x;
+	mov.u32 	%r5, %tid.x;
+	mad.lo.s32 	%r1, %r4, %r3, %r5;
+	setp.ge.s32	%p1, %r1, %r2;
+	@%p1 bra 	BB0_2;
+
+	cvta.to.global.u64 	%rd3, %rd1;
+	mul.wide.s32 	%rd4, %r1, 8;
+	add.s64 	%rd5, %rd3, %rd4;
+	ld.global.f64 	%fd1, [%rd5];
+	cvt.rn.f32.f64	%f1, %fd1;
+	cvta.to.global.u64 	%rd6, %rd2;
+	mul.wide.s32 	%rd7, %r1, 4;
+	add.s64 	%rd8, %rd6, %rd7;
+	st.global.f32 	[%rd8], %f1;
+
+BB0_2:
+	ret;
+}
+
+	// .globl	float2double_f
+.visible .entry float2double_f(
+	.param .u64 float2double_f_param_0,
+	.param .u64 float2double_f_param_1,
+	.param .u32 float2double_f_param_2
+)
+{
+	.reg .pred 	%p<2>;
+	.reg .f32 	%f<2>;
+	.reg .b32 	%r<6>;
+	.reg .f64 	%fd<2>;
+	.reg .b64 	%rd<9>;
+
+
+	ld.param.u64 	%rd1, [float2double_f_param_0];
+	ld.param.u64 	%rd2, [float2double_f_param_1];
+	ld.param.u32 	%r2, [float2double_f_param_2];
+	mov.u32 	%r3, %ctaid.x;
+	mov.u32 	%r4, %ntid.x;
+	mov.u32 	%r5, %tid.x;
+	mad.lo.s32 	%r1, %r4, %r3, %r5;
+	setp.ge.s32	%p1, %r1, %r2;
+	@%p1 bra 	BB1_2;
+
+	cvta.to.global.u64 	%rd3, %rd1;
+	mul.wide.s32 	%rd4, %r1, 4;
+	add.s64 	%rd5, %rd3, %rd4;
+	ld.global.f32 	%f1, [%rd5];
+	cvt.f64.f32	%fd1, %f1;
+	cvta.to.global.u64 	%rd6, %rd2;
+	mul.wide.s32 	%rd7, %r1, 8;
+	add.s64 	%rd8, %rd6, %rd7;
+	st.global.f64 	[%rd8], %fd1;
+
+BB1_2:
+	ret;
+}
+
+	// .globl	slice_sparse_dense_row_d
+.visible .entry slice_sparse_dense_row_d(
+	.param .u64 slice_sparse_dense_row_d_param_0,
+	.param .u64 slice_sparse_dense_row_d_param_1,
+	.param .u64 slice_sparse_dense_row_d_param_2,
+	.param .u64 slice_sparse_dense_row_d_param_3,
+	.param .u32 slice_sparse_dense_row_d_param_4,
+	.param .u32 slice_sparse_dense_row_d_param_5,
+	.param .u32 slice_sparse_dense_row_d_param_6,
+	.param .u32 slice_sparse_dense_row_d_param_7,
+	.param .u32 slice_sparse_dense_row_d_param_8
 )
 {
 	.reg .pred 	%p<7>;
@@ -45,22 +122,22 @@
 	.reg .b64 	%rd<23>;
 
 
-	ld.param.u64 	%rd9, [slice_sparse_dense_row_param_0];
-	ld.param.u64 	%rd10, [slice_sparse_dense_row_param_1];
-	ld.param.u64 	%rd11, [slice_sparse_dense_row_param_2];
-	ld.param.u64 	%rd12, [slice_sparse_dense_row_param_3];
-	ld.param.u32 	%r15, [slice_sparse_dense_row_param_4];
-	ld.param.u32 	%r16, [slice_sparse_dense_row_param_5];
-	ld.param.u32 	%r12, [slice_sparse_dense_row_param_6];
-	ld.param.u32 	%r13, [slice_sparse_dense_row_param_7];
-	ld.param.u32 	%r14, [slice_sparse_dense_row_param_8];
+	ld.param.u64 	%rd9, [slice_sparse_dense_row_d_param_0];
+	ld.param.u64 	%rd10, [slice_sparse_dense_row_d_param_1];
+	ld.param.u64 	%rd11, [slice_sparse_dense_row_d_param_2];
+	ld.param.u64 	%rd12, [slice_sparse_dense_row_d_param_3];
+	ld.param.u32 	%r15, [slice_sparse_dense_row_d_param_4];
+	ld.param.u32 	%r16, [slice_sparse_dense_row_d_param_5];
+	ld.param.u32 	%r12, [slice_sparse_dense_row_d_param_6];
+	ld.param.u32 	%r13, [slice_sparse_dense_row_d_param_7];
+	ld.param.u32 	%r14, [slice_sparse_dense_row_d_param_8];
 	mov.u32 	%r17, %ntid.x;
 	mov.u32 	%r18, %ctaid.x;
 	mov.u32 	%r19, %tid.x;
 	mad.lo.s32 	%r1, %r17, %r18, %r19;
 	add.s32 	%r2, %r1, %r15;
 	setp.gt.s32	%p1, %r2, %r16;
-	@%p1 bra 	BB0_6;
+	@%p1 bra 	BB2_6;
 
 	cvta.to.global.u64 	%rd13, %rd10;
 	mul.wide.s32 	%rd14, %r2, 4;
@@ -68,7 +145,7 @@
 	ld.global.u32 	%r23, [%rd1];
 	ld.global.u32 	%r22, [%rd1+4];
 	setp.ge.s32	%p2, %r23, %r22;
-	@%p2 bra 	BB0_6;
+	@%p2 bra 	BB2_6;
 
 	cvta.to.global.u64 	%rd2, %rd12;
 	cvta.to.global.u64 	%rd15, %rd9;
@@ -80,12 +157,12 @@
 	mul.wide.s32 	%rd18, %r23, 4;
 	add.s64 	%rd21, %rd16, %rd18;
 
-BB0_3:
+BB2_3:
 	ld.global.u32 	%r8, [%rd21];
 	setp.lt.s32	%p3, %r8, %r12;
 	setp.gt.s32	%p4, %r8, %r13;
 	or.pred  	%p5, %p3, %p4;
-	@%p5 bra 	BB0_5;
+	@%p5 bra 	BB2_5;
 
 	ld.global.f64 	%fd1, [%rd22];
 	add.s32 	%r21, %r5, %r8;
@@ -94,28 +171,106 @@ BB0_3:
 	st.global.f64 	[%rd20], %fd1;
 	ld.global.u32 	%r22, [%rd1+4];
 
-BB0_5:
+BB2_5:
 	add.s64 	%rd22, %rd22, 8;
 	add.s64 	%rd21, %rd21, 4;
 	add.s32 	%r23, %r23, 1;
 	setp.lt.s32	%p6, %r23, %r22;
-	@%p6 bra 	BB0_3;
+	@%p6 bra 	BB2_3;
+
+BB2_6:
+	ret;
+}
+
+	// .globl	slice_sparse_dense_row_f
+.visible .entry slice_sparse_dense_row_f(
+	.param .u64 slice_sparse_dense_row_f_param_0,
+	.param .u64 slice_sparse_dense_row_f_param_1,
+	.param .u64 slice_sparse_dense_row_f_param_2,
+	.param .u64 slice_sparse_dense_row_f_param_3,
+	.param .u32 slice_sparse_dense_row_f_param_4,
+	.param .u32 slice_sparse_dense_row_f_param_5,
+	.param .u32 slice_sparse_dense_row_f_param_6,
+	.param .u32 slice_sparse_dense_row_f_param_7,
+	.param .u32 slice_sparse_dense_row_f_param_8
+)
+{
+	.reg .pred 	%p<7>;
+	.reg .f32 	%f<2>;
+	.reg .b32 	%r<24>;
+	.reg .b64 	%rd<22>;
+
+
+	ld.param.u64 	%rd9, [slice_sparse_dense_row_f_param_0];
+	ld.param.u64 	%rd10, [slice_sparse_dense_row_f_param_1];
+	ld.param.u64 	%rd11, [slice_sparse_dense_row_f_param_2];
+	ld.param.u64 	%rd12, [slice_sparse_dense_row_f_param_3];
+	ld.param.u32 	%r15, [slice_sparse_dense_row_f_param_4];
+	ld.param.u32 	%r16, [slice_sparse_dense_row_f_param_5];
+	ld.param.u32 	%r12, [slice_sparse_dense_row_f_param_6];
+	ld.param.u32 	%r13, [slice_sparse_dense_row_f_param_7];
+	ld.param.u32 	%r14, [slice_sparse_dense_row_f_param_8];
+	mov.u32 	%r17, %ntid.x;
+	mov.u32 	%r18, %ctaid.x;
+	mov.u32 	%r19, %tid.x;
+	mad.lo.s32 	%r1, %r17, %r18, %r19;
+	add.s32 	%r2, %r1, %r15;
+	setp.gt.s32	%p1, %r2, %r16;
+	@%p1 bra 	BB3_6;
+
+	cvta.to.global.u64 	%rd13, %rd10;
+	mul.wide.s32 	%rd14, %r2, 4;
+	add.s64 	%rd1, %rd13, %rd14;
+	ld.global.u32 	%r23, [%rd1];
+	ld.global.u32 	%r22, [%rd1+4];
+	setp.ge.s32	%p2, %r23, %r22;
+	@%p2 bra 	BB3_6;
+
+	cvta.to.global.u64 	%rd2, %rd12;
+	cvta.to.global.u64 	%rd15, %rd9;
+	cvta.to.global.u64 	%rd16, %rd11;
+	mul.lo.s32 	%r20, %r1, %r14;
+	sub.s32 	%r5, %r20, %r12;
+	mul.wide.s32 	%rd17, %r23, 4;
+	add.s64 	%rd21, %rd15, %rd17;
+	add.s64 	%rd20, %rd16, %rd17;
+
+BB3_3:
+	ld.global.u32 	%r8, [%rd20];
+	setp.lt.s32	%p3, %r8, %r12;
+	setp.gt.s32	%p4, %r8, %r13;
+	or.pred  	%p5, %p3, %p4;
+	@%p5 bra 	BB3_5;
+
+	ld.global.f32 	%f1, [%rd21];
+	add.s32 	%r21, %r5, %r8;
+	mul.wide.s32 	%rd18, %r21, 4;
+	add.s64 	%rd19, %rd2, %rd18;
+	st.global.f32 	[%rd19], %f1;
+	ld.global.u32 	%r22, [%rd1+4];
+
+BB3_5:
+	add.s64 	%rd21, %rd21, 4;
+	add.s64 	%rd20, %rd20, 4;
+	add.s32 	%r23, %r23, 1;
+	setp.lt.s32	%p6, %r23, %r22;
+	@%p6 bra 	BB3_3;
 
-BB0_6:
+BB3_6:
 	ret;
 }
 
-	// .globl	slice_sparse_dense_nnz
-.visible .entry slice_sparse_dense_nnz(
-	.param .u64 slice_sparse_dense_nnz_param_0,
-	.param .u64 slice_sparse_dense_nnz_param_1,
-	.param .u64 slice_sparse_dense_nnz_param_2,
-	.param .u64 slice_sparse_dense_nnz_param_3,
-	.param .u32 slice_sparse_dense_nnz_param_4,
-	.param .u32 slice_sparse_dense_nnz_param_5,
-	.param .u32 slice_sparse_dense_nnz_param_6,
-	.param .u32 slice_sparse_dense_nnz_param_7,
-	.param .u32 slice_sparse_dense_nnz_param_8
+	// .globl	slice_sparse_dense_nnz_d
+.visible .entry slice_sparse_dense_nnz_d(
+	.param .u64 slice_sparse_dense_nnz_d_param_0,
+	.param .u64 slice_sparse_dense_nnz_d_param_1,
+	.param .u64 slice_sparse_dense_nnz_d_param_2,
+	.param .u64 slice_sparse_dense_nnz_d_param_3,
+	.param .u32 slice_sparse_dense_nnz_d_param_4,
+	.param .u32 slice_sparse_dense_nnz_d_param_5,
+	.param .u32 slice_sparse_dense_nnz_d_param_6,
+	.param .u32 slice_sparse_dense_nnz_d_param_7,
+	.param .u32 slice_sparse_dense_nnz_d_param_8
 )
 {
 	.reg .pred 	%p<6>;
@@ -124,15 +279,15 @@ BB0_6:
 	.reg .b64 	%rd<22>;
 
 
-	ld.param.u64 	%rd5, [slice_sparse_dense_nnz_param_0];
-	ld.param.u64 	%rd8, [slice_sparse_dense_nnz_param_1];
-	ld.param.u64 	%rd6, [slice_sparse_dense_nnz_param_2];
-	ld.param.u64 	%rd7, [slice_sparse_dense_nnz_param_3];
-	ld.param.u32 	%r5, [slice_sparse_dense_nnz_param_4];
-	ld.param.u32 	%r9, [slice_sparse_dense_nnz_param_5];
-	ld.param.u32 	%r6, [slice_sparse_dense_nnz_param_6];
-	ld.param.u32 	%r7, [slice_sparse_dense_nnz_param_7];
-	ld.param.u32 	%r8, [slice_sparse_dense_nnz_param_8];
+	ld.param.u64 	%rd5, [slice_sparse_dense_nnz_d_param_0];
+	ld.param.u64 	%rd8, [slice_sparse_dense_nnz_d_param_1];
+	ld.param.u64 	%rd6, [slice_sparse_dense_nnz_d_param_2];
+	ld.param.u64 	%rd7, [slice_sparse_dense_nnz_d_param_3];
+	ld.param.u32 	%r5, [slice_sparse_dense_nnz_d_param_4];
+	ld.param.u32 	%r9, [slice_sparse_dense_nnz_d_param_5];
+	ld.param.u32 	%r6, [slice_sparse_dense_nnz_d_param_6];
+	ld.param.u32 	%r7, [slice_sparse_dense_nnz_d_param_7];
+	ld.param.u32 	%r8, [slice_sparse_dense_nnz_d_param_8];
 	mov.u32 	%r10, %ntid.x;
 	mov.u32 	%r11, %ctaid.x;
 	mov.u32 	%r12, %tid.x;
@@ -146,7 +301,7 @@ BB0_6:
 	add.s64 	%rd12, %rd1, %rd11;
 	ld.global.u32 	%r15, [%rd12+4];
 	setp.ge.s32	%p1, %r1, %r15;
-	@%p1 bra 	BB1_5;
+	@%p1 bra 	BB4_5;
 
 	cvta.to.global.u64 	%rd2, %rd7;
 	cvta.to.global.u64 	%rd3, %rd5;
@@ -158,11 +313,11 @@ BB0_6:
 	setp.lt.s32	%p2, %r2, %r6;
 	setp.gt.s32	%p3, %r2, %r7;
 	or.pred  	%p4, %p2, %p3;
-	@%p4 bra 	BB1_5;
+	@%p4 bra 	BB4_5;
 
 	mov.u32 	%r21, %r5;
 
-BB1_3:
+BB4_3:
 	mov.u32 	%r3, %r21;
 	add.s32 	%r4, %r3, 1;
 	mul.wide.s32 	%rd16, %r4, 4;
@@ -170,7 +325,7 @@ BB1_3:
 	ld.global.u32 	%r16, [%rd17];
 	setp.le.s32	%p5, %r16, %r1;
 	mov.u32 	%r21, %r4;
-	@%p5 bra 	BB1_3;
+	@%p5 bra 	BB4_3;
 
 	shl.b64 	%rd18, %rd4, 3;
 	add.s64 	%rd19, %rd3, %rd18;
@@ -183,21 +338,103 @@ BB1_3:
 	add.s64 	%rd21, %rd2, %rd20;
 	st.global.f64 	[%rd21], %fd1;
 
-BB1_5:
+BB4_5:
+	ret;
+}
+
+	// .globl	slice_sparse_dense_nnz_f
+.visible .entry slice_sparse_dense_nnz_f(
+	.param .u64 slice_sparse_dense_nnz_f_param_0,
+	.param .u64 slice_sparse_dense_nnz_f_param_1,
+	.param .u64 slice_sparse_dense_nnz_f_param_2,
+	.param .u64 slice_sparse_dense_nnz_f_param_3,
+	.param .u32 slice_sparse_dense_nnz_f_param_4,
+	.param .u32 slice_sparse_dense_nnz_f_param_5,
+	.param .u32 slice_sparse_dense_nnz_f_param_6,
+	.param .u32 slice_sparse_dense_nnz_f_param_7,
+	.param .u32 slice_sparse_dense_nnz_f_param_8
+)
+{
+	.reg .pred 	%p<6>;
+	.reg .f32 	%f<2>;
+	.reg .b32 	%r<22>;
+	.reg .b64 	%rd<22>;
+
+
+	ld.param.u64 	%rd5, [slice_sparse_dense_nnz_f_param_0];
+	ld.param.u64 	%rd8, [slice_sparse_dense_nnz_f_param_1];
+	ld.param.u64 	%rd6, [slice_sparse_dense_nnz_f_param_2];
+	ld.param.u64 	%rd7, [slice_sparse_dense_nnz_f_param_3];
+	ld.param.u32 	%r5, [slice_sparse_dense_nnz_f_param_4];
+	ld.param.u32 	%r9, [slice_sparse_dense_nnz_f_param_5];
+	ld.param.u32 	%r6, [slice_sparse_dense_nnz_f_param_6];
+	ld.param.u32 	%r7, [slice_sparse_dense_nnz_f_param_7];
+	ld.param.u32 	%r8, [slice_sparse_dense_nnz_f_param_8];
+	mov.u32 	%r10, %ntid.x;
+	mov.u32 	%r11, %ctaid.x;
+	mov.u32 	%r12, %tid.x;
+	mad.lo.s32 	%r13, %r10, %r11, %r12;
+	cvta.to.global.u64 	%rd1, %rd8;
+	mul.wide.s32 	%rd9, %r5, 4;
+	add.s64 	%rd10, %rd1, %rd9;
+	ld.global.u32 	%r14, [%rd10];
+	add.s32 	%r1, %r13, %r14;
+	mul.wide.s32 	%rd11, %r9, 4;
+	add.s64 	%rd12, %rd1, %rd11;
+	ld.global.u32 	%r15, [%rd12+4];
+	setp.ge.s32	%p1, %r1, %r15;
+	@%p1 bra 	BB5_5;
+
+	cvta.to.global.u64 	%rd2, %rd7;
+	cvta.to.global.u64 	%rd3, %rd5;
+	cvta.to.global.u64 	%rd13, %rd6;
+	cvt.s64.s32	%rd4, %r1;
+	mul.wide.s32 	%rd14, %r1, 4;
+	add.s64 	%rd15, %rd13, %rd14;
+	ld.global.u32 	%r2, [%rd15];
+	setp.lt.s32	%p2, %r2, %r6;
+	setp.gt.s32	%p3, %r2, %r7;
+	or.pred  	%p4, %p2, %p3;
+	@%p4 bra 	BB5_5;
+
+	mov.u32 	%r21, %r5;
+
+BB5_3:
+	mov.u32 	%r3, %r21;
+	add.s32 	%r4, %r3, 1;
+	mul.wide.s32 	%rd16, %r4, 4;
+	add.s64 	%rd17, %rd1, %rd16;
+	ld.global.u32 	%r16, [%rd17];
+	setp.le.s32	%p5, %r16, %r1;
+	mov.u32 	%r21, %r4;
+	@%p5 bra 	BB5_3;
+
+	shl.b64 	%rd18, %rd4, 2;
+	add.s64 	%rd19, %rd3, %rd18;
+	ld.global.f32 	%f1, [%rd19];
+	sub.s32 	%r17, %r3, %r5;
+	mul.lo.s32 	%r18, %r17, %r8;
+	sub.s32 	%r19, %r18, %r6;
+	add.s32 	%r20, %r19, %r2;
+	mul.wide.s32 	%rd20, %r20, 4;
+	add.s64 	%rd21, %rd2, %rd20;
+	st.global.f32 	[%rd21], %f1;
+
+BB5_5:
 	ret;
 }
 
-	// .globl	slice_dense_dense
-.visible .entry slice_dense_dense(
-	.param .u64 slice_dense_dense_param_0,
-	.param .u64 slice_dense_dense_param_1,
-	.param .u32 slice_dense_dense_param_2,
-	.param .u32 slice_dense_dense_param_3,
-	.param .u32 slice_dense_dense_param_4,
-	.param .u32 slice_dense_dense_param_5,
-	.param .u32 slice_dense_dense_param_6,
-	.param .u32 slice_dense_dense_param_7,
-	.param .u32 slice_dense_dense_param_8
+	// .globl	slice_dense_dense_d
+.visible .entry slice_dense_dense_d(
+	.param .u64 slice_dense_dense_d_param_0,
+	.param .u64 slice_dense_dense_d_param_1,
+	.param .u32 slice_dense_dense_d_param_2,
+	.param .u32 slice_dense_dense_d_param_3,
+	.param .u32 slice_dense_dense_d_param_4,
+	.param .u32 slice_dense_dense_d_param_5,
+	.param .u32 slice_dense_dense_d_param_6,
+	.param .u32 slice_dense_dense_d_param_7,
+	.param .u32 slice_dense_dense_d_param_8
 )
 {
 	.reg .pred 	%p<4>;
@@ -206,13 +443,13 @@ BB1_5:
 	.reg .b64 	%rd<9>;
 
 
-	ld.param.u64 	%rd1, [slice_dense_dense_param_0];
-	ld.param.u64 	%rd2, [slice_dense_dense_param_1];
-	ld.param.u32 	%r3, [slice_dense_dense_param_2];
-	ld.param.u32 	%r4, [slice_dense_dense_param_4];
-	ld.param.u32 	%r5, [slice_dense_dense_param_6];
-	ld.param.u32 	%r7, [slice_dense_dense_param_7];
-	ld.param.u32 	%r6, [slice_dense_dense_param_8];
+	ld.param.u64 	%rd1, [slice_dense_dense_d_param_0];
+	ld.param.u64 	%rd2, [slice_dense_dense_d_param_1];
+	ld.param.u32 	%r3, [slice_dense_dense_d_param_2];
+	ld.param.u32 	%r4, [slice_dense_dense_d_param_4];
+	ld.param.u32 	%r5, [slice_dense_dense_d_param_6];
+	ld.param.u32 	%r7, [slice_dense_dense_d_param_7];
+	ld.param.u32 	%r6, [slice_dense_dense_d_param_8];
 	mov.u32 	%r8, %ctaid.x;
 	mov.u32 	%r9, %ntid.x;
 	mov.u32 	%r10, %tid.x;
@@ -221,10 +458,10 @@ BB1_5:
 	setp.lt.s32	%p1, %r2, %r7;
 	setp.gt.s32	%p2, %r6, -1;
 	and.pred  	%p3, %p1, %p2;
-	@!%p3 bra 	BB2_2;
-	bra.uni 	BB2_1;
+	@!%p3 bra 	BB6_2;
+	bra.uni 	BB6_1;
 
-BB2_1:
+BB6_1:
 	rem.s32 	%r11, %r1, %r6;
 	cvta.to.global.u64 	%rd3, %rd1;
 	add.s32 	%r12, %r2, %r3;
@@ -238,15 +475,70 @@ BB2_1:
 	add.s64 	%rd8, %rd6, %rd7;
 	st.global.f64 	[%rd8], %fd1;
 
-BB2_2:
+BB6_2:
+	ret;
+}
+
+	// .globl	slice_dense_dense_f
+.visible .entry slice_dense_dense_f(
+	.param .u64 slice_dense_dense_f_param_0,
+	.param .u64 slice_dense_dense_f_param_1,
+	.param .u32 slice_dense_dense_f_param_2,
+	.param .u32 slice_dense_dense_f_param_3,
+	.param .u32 slice_dense_dense_f_param_4,
+	.param .u32 slice_dense_dense_f_param_5,
+	.param .u32 slice_dense_dense_f_param_6,
+	.param .u32 slice_dense_dense_f_param_7,
+	.param .u32 slice_dense_dense_f_param_8
+)
+{
+	.reg .pred 	%p<4>;
+	.reg .f32 	%f<2>;
+	.reg .b32 	%r<15>;
+	.reg .b64 	%rd<9>;
+
+
+	ld.param.u64 	%rd1, [slice_dense_dense_f_param_0];
+	ld.param.u64 	%rd2, [slice_dense_dense_f_param_1];
+	ld.param.u32 	%r3, [slice_dense_dense_f_param_2];
+	ld.param.u32 	%r4, [slice_dense_dense_f_param_4];
+	ld.param.u32 	%r5, [slice_dense_dense_f_param_6];
+	ld.param.u32 	%r7, [slice_dense_dense_f_param_7];
+	ld.param.u32 	%r6, [slice_dense_dense_f_param_8];
+	mov.u32 	%r8, %ctaid.x;
+	mov.u32 	%r9, %ntid.x;
+	mov.u32 	%r10, %tid.x;
+	mad.lo.s32 	%r1, %r9, %r8, %r10;
+	div.s32 	%r2, %r1, %r6;
+	setp.lt.s32	%p1, %r2, %r7;
+	setp.gt.s32	%p2, %r6, -1;
+	and.pred  	%p3, %p1, %p2;
+	@!%p3 bra 	BB7_2;
+	bra.uni 	BB7_1;
+
+BB7_1:
+	rem.s32 	%r11, %r1, %r6;
+	cvta.to.global.u64 	%rd3, %rd1;
+	add.s32 	%r12, %r2, %r3;
+	add.s32 	%r13, %r11, %r4;
+	mad.lo.s32 	%r14, %r12, %r5, %r13;
+	mul.wide.s32 	%rd4, %r14, 4;
+	add.s64 	%rd5, %rd3, %rd4;
+	ld.global.f32 	%f1, [%rd5];
+	cvta.to.global.u64 	%rd6, %rd2;
+	mul.wide.s32 	%rd7, %r1, 4;
+	add.s64 	%rd8, %rd6, %rd7;
+	st.global.f32 	[%rd8], %f1;
+
+BB7_2:
 	ret;
 }
 
-	// .globl	copy_u2l_dense
-.visible .entry copy_u2l_dense(
-	.param .u64 copy_u2l_dense_param_0,
-	.param .u32 copy_u2l_dense_param_1,
-	.param .u32 copy_u2l_dense_param_2
+	// .globl	copy_u2l_dense_d
+.visible .entry copy_u2l_dense_d(
+	.param .u64 copy_u2l_dense_d_param_0,
+	.param .u32 copy_u2l_dense_d_param_1,
+	.param .u32 copy_u2l_dense_d_param_2
 )
 {
 	.reg .pred 	%p<4>;
@@ -255,9 +547,9 @@ BB2_2:
 	.reg .b64 	%rd<7>;
 
 
-	ld.param.u64 	%rd1, [copy_u2l_dense_param_0];
-	ld.param.u32 	%r3, [copy_u2l_dense_param_1];
-	ld.param.u32 	%r4, [copy_u2l_dense_param_2];
+	ld.param.u64 	%rd1, [copy_u2l_dense_d_param_0];
+	ld.param.u32 	%r3, [copy_u2l_dense_d_param_1];
+	ld.param.u32 	%r4, [copy_u2l_dense_d_param_2];
 	mov.u32 	%r5, %ntid.x;
 	mov.u32 	%r6, %ctaid.x;
 	mov.u32 	%r7, %tid.x;
@@ -268,10 +560,10 @@ BB2_2:
 	setp.gt.s32	%p1, %r9, %r8;
 	setp.lt.s32	%p2, %r2, %r4;
 	and.pred  	%p3, %p1, %p2;
-	@!%p3 bra 	BB3_2;
-	bra.uni 	BB3_1;
+	@!%p3 bra 	BB8_2;
+	bra.uni 	BB8_1;
 
-BB3_1:
+BB8_1:
 	cvta.to.global.u64 	%rd2, %rd1;
 	mul.wide.s32 	%rd3, %r1, 8;
 	add.s64 	%rd4, %rd2, %rd3;
@@ -280,16 +572,58 @@ BB3_1:
 	add.s64 	%rd6, %rd2, %rd5;
 	st.global.f64 	[%rd6], %fd1;
 
-BB3_2:
+BB8_2:
+	ret;
+}
+
+	// .globl	copy_u2l_dense_f
+.visible .entry copy_u2l_dense_f(
+	.param .u64 copy_u2l_dense_f_param_0,
+	.param .u32 copy_u2l_dense_f_param_1,
+	.param .u32 copy_u2l_dense_f_param_2
+)
+{
+	.reg .pred 	%p<4>;
+	.reg .f32 	%f<2>;
+	.reg .b32 	%r<10>;
+	.reg .b64 	%rd<7>;
+
+
+	ld.param.u64 	%rd1, [copy_u2l_dense_f_param_0];
+	ld.param.u32 	%r3, [copy_u2l_dense_f_param_1];
+	ld.param.u32 	%r4, [copy_u2l_dense_f_param_2];
+	mov.u32 	%r5, %ntid.x;
+	mov.u32 	%r6, %ctaid.x;
+	mov.u32 	%r7, %tid.x;
+	mad.lo.s32 	%r1, %r5, %r6, %r7;
+	div.s32 	%r8, %r1, %r3;
+	rem.s32 	%r9, %r1, %r3;
+	mad.lo.s32 	%r2, %r9, %r3, %r8;
+	setp.gt.s32	%p1, %r9, %r8;
+	setp.lt.s32	%p2, %r2, %r4;
+	and.pred  	%p3, %p1, %p2;
+	@!%p3 bra 	BB9_2;
+	bra.uni 	BB9_1;
+
+BB9_1:
+	cvta.to.global.u64 	%rd2, %rd1;
+	mul.wide.s32 	%rd3, %r1, 4;
+	add.s64 	%rd4, %rd2, %rd3;
+	ld.global.f32 	%f1, [%rd4];
+	mul.wide.s32 	%rd5, %r2, 4;
+	add.s64 	%rd6, %rd2, %rd5;
+	st.global.f32 	[%rd6], %f1;
+
+BB9_2:
 	ret;
 }
 
-	// .globl	relu
-.visible .entry relu(
-	.param .u64 relu_param_0,
-	.param .u64 relu_param_1,
-	.param .u32 relu_param_2,
-	.param .u32 relu_param_3
+	// .globl	relu_d
+.visible .entry relu_d(
+	.param .u64 relu_d_param_0,
+	.param .u64 relu_d_param_1,
+	.param .u32 relu_d_param_2,
+	.param .u32 relu_d_param_3
 )
 {
 	.reg .pred 	%p<4>;
@@ -298,10 +632,10 @@ BB3_2:
 	.reg .b64 	%rd<8>;
 
 
-	ld.param.u64 	%rd1, [relu_param_0];
-	ld.param.u64 	%rd2, [relu_param_1];
-	ld.param.u32 	%r2, [relu_param_2];
-	ld.param.u32 	%r3, [relu_param_3];
+	ld.param.u64 	%rd1, [relu_d_param_0];
+	ld.param.u64 	%rd2, [relu_d_param_1];
+	ld.param.u32 	%r2, [relu_d_param_2];
+	ld.param.u32 	%r3, [relu_d_param_3];
 	mov.u32 	%r4, %ctaid.x;
 	mov.u32 	%r5, %ntid.x;
 	mov.u32 	%r6, %tid.x;
@@ -310,10 +644,10 @@ BB3_2:
 	setp.lt.s32	%p1, %r7, %r2;
 	setp.gt.s32	%p2, %r3, -1;
 	and.pred  	%p3, %p1, %p2;
-	@!%p3 bra 	BB4_2;
-	bra.uni 	BB4_1;
+	@!%p3 bra 	BB10_2;
+	bra.uni 	BB10_1;
 
-BB4_1:
+BB10_1:
 	cvta.to.global.u64 	%rd3, %rd1;
 	mul.wide.s32 	%rd4, %r1, 8;
 	add.s64 	%rd5, %rd3, %rd4;
@@ -324,17 +658,64 @@ BB4_1:
 	add.s64 	%rd7, %rd6, %rd4;
 	st.global.f64 	[%rd7], %fd3;
 
-BB4_2:
+BB10_2:
+	ret;
+}
+
+	// .globl	relu_f
+.visible .entry relu_f(
+	.param .u64 relu_f_param_0,
+	.param .u64 relu_f_param_1,
+	.param .u32 relu_f_param_2,
+	.param .u32 relu_f_param_3
+)
+{
+	.reg .pred 	%p<4>;
+	.reg .f32 	%f<3>;
+	.reg .b32 	%r<8>;
+	.reg .f64 	%fd<4>;
+	.reg .b64 	%rd<8>;
+
+
+	ld.param.u64 	%rd1, [relu_f_param_0];
+	ld.param.u64 	%rd2, [relu_f_param_1];
+	ld.param.u32 	%r2, [relu_f_param_2];
+	ld.param.u32 	%r3, [relu_f_param_3];
+	mov.u32 	%r4, %ctaid.x;
+	mov.u32 	%r5, %ntid.x;
+	mov.u32 	%r6, %tid.x;
+	mad.lo.s32 	%r1, %r5, %r4, %r6;
+	div.s32 	%r7, %r1, %r3;
+	setp.lt.s32	%p1, %r7, %r2;
+	setp.gt.s32	%p2, %r3, -1;
+	and.pred  	%p3, %p1, %p2;
+	@!%p3 bra 	BB11_2;
+	bra.uni 	BB11_1;
+
+BB11_1:
+	cvta.to.global.u64 	%rd3, %rd1;
+	mul.wide.s32 	%rd4, %r1, 4;
+	add.s64 	%rd5, %rd3, %rd4;
+	ld.global.f32 	%f1, [%rd5];
+	cvt.f64.f32	%fd1, %f1;
+	mov.f64 	%fd2, 0d0000000000000000;
+	max.f64 	%fd3, %fd2, %fd1;
+	cvt.rn.f32.f64	%f2, %fd3;
+	cvta.to.global.u64 	%rd6, %rd2;
+	add.s64 	%rd7, %rd6, %rd4;
+	st.global.f32 	[%rd7], %f2;
+
+BB11_2:
 	ret;
 }
 
-	// .globl	relu_backward
-.visible .entry relu_backward(
-	.param .u64 relu_backward_param_0,
-	.param .u64 relu_backward_param_1,
-	.param .u64 relu_backward_param_2,
-	.param .u32 relu_backward_param_3,
-	.param .u32 relu_backward_param_4
+	// .globl	relu_backward_d
+.visible .entry relu_backward_d(
+	.param .u64 relu_backward_d_param_0,
+	.param .u64 relu_backward_d_param_1,
+	.param .u64 relu_backward_d_param_2,
+	.param .u32 relu_backward_d_param_3,
+	.param .u32 relu_backward_d_param_4
 )
 {
 	.reg .pred 	%p<5>;
@@ -343,11 +724,11 @@ BB4_2:
 	.reg .b64 	%rd<14>;
 
 
-	ld.param.u64 	%rd2, [relu_backward_param_0];
-	ld.param.u64 	%rd3, [relu_backward_param_1];
-	ld.param.u64 	%rd4, [relu_backward_param_2];
-	ld.param.u32 	%r2, [relu_backward_param_3];
-	ld.param.u32 	%r3, [relu_backward_param_4];
+	ld.param.u64 	%rd2, [relu_backward_d_param_0];
+	ld.param.u64 	%rd3, [relu_backward_d_param_1];
+	ld.param.u64 	%rd4, [relu_backward_d_param_2];
+	ld.param.u32 	%r2, [relu_backward_d_param_3];
+	ld.param.u32 	%r3, [relu_backward_d_param_4];
 	mov.u32 	%r4, %ntid.x;
 	mov.u32 	%r5, %ctaid.x;
 	mov.u32 	%r6, %tid.x;
@@ -356,10 +737,10 @@ BB4_2:
 	setp.lt.s32	%p1, %r7, %r2;
 	setp.gt.s32	%p2, %r3, -1;
 	and.pred  	%p3, %p1, %p2;
-	@!%p3 bra 	BB5_4;
-	bra.uni 	BB5_1;
+	@!%p3 bra 	BB12_4;
+	bra.uni 	BB12_1;
 
-BB5_1:
+BB12_1:
 	cvta.to.global.u64 	%rd5, %rd2;
 	cvt.s64.s32	%rd1, %r1;
 	mul.wide.s32 	%rd6, %r1, 8;
@@ -367,42 +748,98 @@ BB5_1:
 	ld.global.f64 	%fd4, [%rd7];
 	mov.f64 	%fd5, 0d0000000000000000;
 	setp.leu.f64	%p4, %fd4, 0d0000000000000000;
-	@%p4 bra 	BB5_3;
+	@%p4 bra 	BB12_3;
 
 	cvta.to.global.u64 	%rd8, %rd3;
 	shl.b64 	%rd9, %rd1, 3;
 	add.s64 	%rd10, %rd8, %rd9;
 	ld.global.f64 	%fd5, [%rd10];
 
-BB5_3:
+BB12_3:
 	cvta.to.global.u64 	%rd11, %rd4;
 	shl.b64 	%rd12, %rd1, 3;
 	add.s64 	%rd13, %rd11, %rd12;
 	st.global.f64 	[%rd13], %fd5;
 
-BB5_4:
+BB12_4:
 	ret;
 }
 
-	// .globl	inplace_add
-.visible .entry inplace_add(
-	.param .u64 inplace_add_param_0,
-	.param .u64 inplace_add_param_1,
-	.param .u32 inplace_add_param_2,
-	.param .u32 inplace_add_param_3
+	// .globl	relu_backward_f
+.visible .entry relu_backward_f(
+	.param .u64 relu_backward_f_param_0,
+	.param .u64 relu_backward_f_param_1,
+	.param .u64 relu_backward_f_param_2,
+	.param .u32 relu_backward_f_param_3,
+	.param .u32 relu_backward_f_param_4
 )
 {
-	.reg .pred 	%p<4>;
+	.reg .pred 	%p<5>;
+	.reg .f32 	%f<6>;
 	.reg .b32 	%r<8>;
-	.reg .f64 	%fd<4>;
-	.reg .b64 	%rd<8>;
+	.reg .b64 	%rd<14>;
 
 
-	ld.param.u64 	%rd1, [inplace_add_param_0];
-	ld.param.u64 	%rd2, [inplace_add_param_1];
-	ld.param.u32 	%r2, [inplace_add_param_2];
-	ld.param.u32 	%r3, [inplace_add_param_3];
-	mov.u32 	%r4, %ctaid.x;
+	ld.param.u64 	%rd2, [relu_backward_f_param_0];
+	ld.param.u64 	%rd3, [relu_backward_f_param_1];
+	ld.param.u64 	%rd4, [relu_backward_f_param_2];
+	ld.param.u32 	%r2, [relu_backward_f_param_3];
+	ld.param.u32 	%r3, [relu_backward_f_param_4];
+	mov.u32 	%r4, %ntid.x;
+	mov.u32 	%r5, %ctaid.x;
+	mov.u32 	%r6, %tid.x;
+	mad.lo.s32 	%r1, %r4, %r5, %r6;
+	div.s32 	%r7, %r1, %r3;
+	setp.lt.s32	%p1, %r7, %r2;
+	setp.gt.s32	%p2, %r3, -1;
+	and.pred  	%p3, %p1, %p2;
+	@!%p3 bra 	BB13_4;
+	bra.uni 	BB13_1;
+
+BB13_1:
+	cvta.to.global.u64 	%rd5, %rd2;
+	cvt.s64.s32	%rd1, %r1;
+	mul.wide.s32 	%rd6, %r1, 4;
+	add.s64 	%rd7, %rd5, %rd6;
+	ld.global.f32 	%f4, [%rd7];
+	mov.f32 	%f5, 0f00000000;
+	setp.leu.f32	%p4, %f4, 0f00000000;
+	@%p4 bra 	BB13_3;
+
+	cvta.to.global.u64 	%rd8, %rd3;
+	shl.b64 	%rd9, %rd1, 2;
+	add.s64 	%rd10, %rd8, %rd9;
+	ld.global.f32 	%f5, [%rd10];
+
+BB13_3:
+	cvta.to.global.u64 	%rd11, %rd4;
+	shl.b64 	%rd12, %rd1, 2;
+	add.s64 	%rd13, %rd11, %rd12;
+	st.global.f32 	[%rd13], %f5;
+
+BB13_4:
+	ret;
+}
+
+	// .globl	inplace_add_d
+.visible .entry inplace_add_d(
+	.param .u64 inplace_add_d_param_0,
+	.param .u64 inplace_add_d_param_1,
+	.param .u32 inplace_add_d_param_2,
+	.param .u32 inplace_add_d_param_3
+)
+{
+	.reg .pred 	%p<4>;
+	.reg .b32 	%r<8>;
+	.reg .f64 	%fd<4>;
+	.reg .b64 	%rd<8>;
+
+
+	ld.param.u64 	%rd1, [inplace_add_d_param_0];
+	ld.param.u64 	%rd2, [inplace_add_d_param_1];
+	ld.param.u32 	%r2, [inplace_add_d_param_2];
+	ld.param.u32 	%r3, [inplace_add_d_param_3];
+	mov.u32 	%r4, %ctaid.x;
 	mov.u32 	%r5, %ntid.x;
 	mov.u32 	%r6, %tid.x;
 	mad.lo.s32 	%r1, %r5, %r4, %r6;
@@ -410,10 +847,10 @@ BB5_4:
 	setp.lt.s32	%p1, %r7, %r2;
 	setp.gt.s32	%p2, %r3, -1;
 	and.pred  	%p3, %p1, %p2;
-	@!%p3 bra 	BB6_2;
-	bra.uni 	BB6_1;
+	@!%p3 bra 	BB14_2;
+	bra.uni 	BB14_1;
 
-BB6_1:
+BB14_1:
 	cvta.to.global.u64 	%rd3, %rd1;
 	mul.wide.s32 	%rd4, %r1, 8;
 	add.s64 	%rd5, %rd3, %rd4;
@@ -424,18 +861,62 @@ BB6_1:
 	add.f64 	%fd3, %fd2, %fd1;
 	st.global.f64 	[%rd7], %fd3;
 
-BB6_2:
+BB14_2:
+	ret;
+}
+
+	// .globl	inplace_add_f
+.visible .entry inplace_add_f(
+	.param .u64 inplace_add_f_param_0,
+	.param .u64 inplace_add_f_param_1,
+	.param .u32 inplace_add_f_param_2,
+	.param .u32 inplace_add_f_param_3
+)
+{
+	.reg .pred 	%p<4>;
+	.reg .f32 	%f<4>;
+	.reg .b32 	%r<8>;
+	.reg .b64 	%rd<8>;
+
+
+	ld.param.u64 	%rd1, [inplace_add_f_param_0];
+	ld.param.u64 	%rd2, [inplace_add_f_param_1];
+	ld.param.u32 	%r2, [inplace_add_f_param_2];
+	ld.param.u32 	%r3, [inplace_add_f_param_3];
+	mov.u32 	%r4, %ctaid.x;
+	mov.u32 	%r5, %ntid.x;
+	mov.u32 	%r6, %tid.x;
+	mad.lo.s32 	%r1, %r5, %r4, %r6;
+	div.s32 	%r7, %r1, %r3;
+	setp.lt.s32	%p1, %r7, %r2;
+	setp.gt.s32	%p2, %r3, -1;
+	and.pred  	%p3, %p1, %p2;
+	@!%p3 bra 	BB15_2;
+	bra.uni 	BB15_1;
+
+BB15_1:
+	cvta.to.global.u64 	%rd3, %rd1;
+	mul.wide.s32 	%rd4, %r1, 4;
+	add.s64 	%rd5, %rd3, %rd4;
+	cvta.to.global.u64 	%rd6, %rd2;
+	add.s64 	%rd7, %rd6, %rd4;
+	ld.global.f32 	%f1, [%rd7];
+	ld.global.f32 	%f2, [%rd5];
+	add.f32 	%f3, %f2, %f1;
+	st.global.f32 	[%rd7], %f3;
+
+BB15_2:
 	ret;
 }
 
-	// .globl	bias_add
-.visible .entry bias_add(
-	.param .u64 bias_add_param_0,
-	.param .u64 bias_add_param_1,
-	.param .u64 bias_add_param_2,
-	.param .u32 bias_add_param_3,
-	.param .u32 bias_add_param_4,
-	.param .u32 bias_add_param_5
+	// .globl	bias_add_d
+.visible .entry bias_add_d(
+	.param .u64 bias_add_d_param_0,
+	.param .u64 bias_add_d_param_1,
+	.param .u64 bias_add_d_param_2,
+	.param .u32 bias_add_d_param_3,
+	.param .u32 bias_add_d_param_4,
+	.param .u32 bias_add_d_param_5
 )
 {
 	.reg .pred 	%p<4>;
@@ -444,12 +925,12 @@ BB6_2:
 	.reg .b64 	%rd<12>;
 
 
-	ld.param.u64 	%rd1, [bias_add_param_0];
-	ld.param.u64 	%rd2, [bias_add_param_1];
-	ld.param.u64 	%rd3, [bias_add_param_2];
-	ld.param.u32 	%r4, [bias_add_param_3];
-	ld.param.u32 	%r2, [bias_add_param_4];
-	ld.param.u32 	%r3, [bias_add_param_5];
+	ld.param.u64 	%rd1, [bias_add_d_param_0];
+	ld.param.u64 	%rd2, [bias_add_d_param_1];
+	ld.param.u64 	%rd3, [bias_add_d_param_2];
+	ld.param.u32 	%r4, [bias_add_d_param_3];
+	ld.param.u32 	%r2, [bias_add_d_param_4];
+	ld.param.u32 	%r3, [bias_add_d_param_5];
 	mov.u32 	%r5, %ctaid.x;
 	mov.u32 	%r6, %ntid.x;
 	mov.u32 	%r7, %tid.x;
@@ -458,10 +939,10 @@ BB6_2:
 	setp.lt.s32	%p1, %r8, %r4;
 	setp.gt.s32	%p2, %r2, -1;
 	and.pred  	%p3, %p1, %p2;
-	@!%p3 bra 	BB7_2;
-	bra.uni 	BB7_1;
+	@!%p3 bra 	BB16_2;
+	bra.uni 	BB16_1;
 
-BB7_1:
+BB16_1:
 	rem.s32 	%r9, %r1, %r2;
 	cvta.to.global.u64 	%rd4, %rd1;
 	mul.wide.s32 	%rd5, %r1, 8;
@@ -477,20 +958,73 @@ BB7_1:
 	add.s64 	%rd11, %rd10, %rd5;
 	st.global.f64 	[%rd11], %fd3;
 
-BB7_2:
+BB16_2:
+	ret;
+}
+
+	// .globl	bias_add_f
+.visible .entry bias_add_f(
+	.param .u64 bias_add_f_param_0,
+	.param .u64 bias_add_f_param_1,
+	.param .u64 bias_add_f_param_2,
+	.param .u32 bias_add_f_param_3,
+	.param .u32 bias_add_f_param_4,
+	.param .u32 bias_add_f_param_5
+)
+{
+	.reg .pred 	%p<4>;
+	.reg .f32 	%f<4>;
+	.reg .b32 	%r<11>;
+	.reg .b64 	%rd<12>;
+
+
+	ld.param.u64 	%rd1, [bias_add_f_param_0];
+	ld.param.u64 	%rd2, [bias_add_f_param_1];
+	ld.param.u64 	%rd3, [bias_add_f_param_2];
+	ld.param.u32 	%r4, [bias_add_f_param_3];
+	ld.param.u32 	%r2, [bias_add_f_param_4];
+	ld.param.u32 	%r3, [bias_add_f_param_5];
+	mov.u32 	%r5, %ctaid.x;
+	mov.u32 	%r6, %ntid.x;
+	mov.u32 	%r7, %tid.x;
+	mad.lo.s32 	%r1, %r6, %r5, %r7;
+	div.s32 	%r8, %r1, %r2;
+	setp.lt.s32	%p1, %r8, %r4;
+	setp.gt.s32	%p2, %r2, -1;
+	and.pred  	%p3, %p1, %p2;
+	@!%p3 bra 	BB17_2;
+	bra.uni 	BB17_1;
+
+BB17_1:
+	rem.s32 	%r9, %r1, %r2;
+	cvta.to.global.u64 	%rd4, %rd1;
+	mul.wide.s32 	%rd5, %r1, 4;
+	add.s64 	%rd6, %rd4, %rd5;
+	div.s32 	%r10, %r9, %r3;
+	cvta.to.global.u64 	%rd7, %rd2;
+	mul.wide.s32 	%rd8, %r10, 4;
+	add.s64 	%rd9, %rd7, %rd8;
+	ld.global.f32 	%f1, [%rd9];
+	ld.global.f32 	%f2, [%rd6];
+	add.f32 	%f3, %f2, %f1;
+	cvta.to.global.u64 	%rd10, %rd3;
+	add.s64 	%rd11, %rd10, %rd5;
+	st.global.f32 	[%rd11], %f3;
+
+BB17_2:
 	ret;
 }
 
-	// .globl	daxpy_matrix_vector
-.visible .entry daxpy_matrix_vector(
-	.param .u64 daxpy_matrix_vector_param_0,
-	.param .u64 daxpy_matrix_vector_param_1,
-	.param .f64 daxpy_matrix_vector_param_2,
-	.param .u64 daxpy_matrix_vector_param_3,
-	.param .u32 daxpy_matrix_vector_param_4,
-	.param .u32 daxpy_matrix_vector_param_5,
-	.param .u32 daxpy_matrix_vector_param_6,
-	.param .u32 daxpy_matrix_vector_param_7
+	// .globl	daxpy_matrix_vector_d
+.visible .entry daxpy_matrix_vector_d(
+	.param .u64 daxpy_matrix_vector_d_param_0,
+	.param .u64 daxpy_matrix_vector_d_param_1,
+	.param .f64 daxpy_matrix_vector_d_param_2,
+	.param .u64 daxpy_matrix_vector_d_param_3,
+	.param .u32 daxpy_matrix_vector_d_param_4,
+	.param .u32 daxpy_matrix_vector_d_param_5,
+	.param .u32 daxpy_matrix_vector_d_param_6,
+	.param .u32 daxpy_matrix_vector_d_param_7
 )
 {
 	.reg .pred 	%p<5>;
@@ -499,13 +1033,13 @@ BB7_2:
 	.reg .b64 	%rd<14>;
 
 
-	ld.param.u64 	%rd3, [daxpy_matrix_vector_param_0];
-	ld.param.u64 	%rd5, [daxpy_matrix_vector_param_1];
-	ld.param.f64 	%fd2, [daxpy_matrix_vector_param_2];
-	ld.param.u64 	%rd4, [daxpy_matrix_vector_param_3];
-	ld.param.u32 	%r5, [daxpy_matrix_vector_param_4];
-	ld.param.u32 	%r3, [daxpy_matrix_vector_param_5];
-	ld.param.u32 	%r4, [daxpy_matrix_vector_param_6];
+	ld.param.u64 	%rd3, [daxpy_matrix_vector_d_param_0];
+	ld.param.u64 	%rd5, [daxpy_matrix_vector_d_param_1];
+	ld.param.f64 	%fd2, [daxpy_matrix_vector_d_param_2];
+	ld.param.u64 	%rd4, [daxpy_matrix_vector_d_param_3];
+	ld.param.u32 	%r5, [daxpy_matrix_vector_d_param_4];
+	ld.param.u32 	%r3, [daxpy_matrix_vector_d_param_5];
+	ld.param.u32 	%r4, [daxpy_matrix_vector_d_param_6];
 	cvta.to.global.u64 	%rd1, %rd5;
 	mov.u32 	%r6, %ntid.x;
 	mov.u32 	%r7, %ctaid.x;
@@ -516,10 +1050,10 @@ BB7_2:
 	setp.lt.s32	%p1, %r1, %r5;
 	setp.gt.s32	%p2, %r3, -1;
 	and.pred  	%p3, %p1, %p2;
-	@!%p3 bra 	BB8_4;
-	bra.uni 	BB8_1;
+	@!%p3 bra 	BB18_4;
+	bra.uni 	BB18_1;
 
-BB8_1:
+BB18_1:
 	cvta.to.global.u64 	%rd6, %rd4;
 	mad.lo.s32 	%r10, %r1, %r3, %r2;
 	cvta.to.global.u64 	%rd7, %rd3;
@@ -528,36 +1062,111 @@ BB8_1:
 	ld.global.f64 	%fd1, [%rd9];
 	add.s64 	%rd2, %rd6, %rd8;
 	setp.eq.s32	%p4, %r4, 1;
-	@%p4 bra 	BB8_3;
-	bra.uni 	BB8_2;
+	@%p4 bra 	BB18_3;
+	bra.uni 	BB18_2;
 
-BB8_3:
+BB18_3:
 	mul.wide.s32 	%rd12, %r2, 8;
 	add.s64 	%rd13, %rd1, %rd12;
 	ld.global.f64 	%fd5, [%rd13];
 	fma.rn.f64 	%fd6, %fd5, %fd2, %fd1;
 	st.global.f64 	[%rd2], %fd6;
-	bra.uni 	BB8_4;
+	bra.uni 	BB18_4;
 
-BB8_2:
+BB18_2:
 	mul.wide.s32 	%rd10, %r1, 8;
 	add.s64 	%rd11, %rd1, %rd10;
 	ld.global.f64 	%fd3, [%rd11];
 	fma.rn.f64 	%fd4, %fd3, %fd2, %fd1;
 	st.global.f64 	[%rd2], %fd4;
 
-BB8_4:
+BB18_4:
+	ret;
+}
+
+	// .globl	daxpy_matrix_vector_f
+.visible .entry daxpy_matrix_vector_f(
+	.param .u64 daxpy_matrix_vector_f_param_0,
+	.param .u64 daxpy_matrix_vector_f_param_1,
+	.param .f64 daxpy_matrix_vector_f_param_2,
+	.param .u64 daxpy_matrix_vector_f_param_3,
+	.param .u32 daxpy_matrix_vector_f_param_4,
+	.param .u32 daxpy_matrix_vector_f_param_5,
+	.param .u32 daxpy_matrix_vector_f_param_6,
+	.param .u32 daxpy_matrix_vector_f_param_7
+)
+{
+	.reg .pred 	%p<5>;
+	.reg .f32 	%f<6>;
+	.reg .b32 	%r<11>;
+	.reg .f64 	%fd<7>;
+	.reg .b64 	%rd<14>;
+
+
+	ld.param.u64 	%rd3, [daxpy_matrix_vector_f_param_0];
+	ld.param.u64 	%rd5, [daxpy_matrix_vector_f_param_1];
+	ld.param.f64 	%fd2, [daxpy_matrix_vector_f_param_2];
+	ld.param.u64 	%rd4, [daxpy_matrix_vector_f_param_3];
+	ld.param.u32 	%r5, [daxpy_matrix_vector_f_param_4];
+	ld.param.u32 	%r3, [daxpy_matrix_vector_f_param_5];
+	ld.param.u32 	%r4, [daxpy_matrix_vector_f_param_6];
+	cvta.to.global.u64 	%rd1, %rd5;
+	mov.u32 	%r6, %ntid.x;
+	mov.u32 	%r7, %ctaid.x;
+	mov.u32 	%r8, %tid.x;
+	mad.lo.s32 	%r9, %r6, %r7, %r8;
+	div.s32 	%r1, %r9, %r3;
+	rem.s32 	%r2, %r9, %r3;
+	setp.lt.s32	%p1, %r1, %r5;
+	setp.gt.s32	%p2, %r3, -1;
+	and.pred  	%p3, %p1, %p2;
+	@!%p3 bra 	BB19_4;
+	bra.uni 	BB19_1;
+
+BB19_1:
+	cvta.to.global.u64 	%rd6, %rd4;
+	mad.lo.s32 	%r10, %r1, %r3, %r2;
+	cvta.to.global.u64 	%rd7, %rd3;
+	mul.wide.s32 	%rd8, %r10, 4;
+	add.s64 	%rd9, %rd7, %rd8;
+	ld.global.f32 	%f1, [%rd9];
+	cvt.f64.f32	%fd1, %f1;
+	add.s64 	%rd2, %rd6, %rd8;
+	setp.eq.s32	%p4, %r4, 1;
+	@%p4 bra 	BB19_3;
+	bra.uni 	BB19_2;
+
+BB19_3:
+	mul.wide.s32 	%rd12, %r2, 4;
+	add.s64 	%rd13, %rd1, %rd12;
+	ld.global.f32 	%f4, [%rd13];
+	cvt.f64.f32	%fd5, %f4;
+	fma.rn.f64 	%fd6, %fd5, %fd2, %fd1;
+	cvt.rn.f32.f64	%f5, %fd6;
+	st.global.f32 	[%rd2], %f5;
+	bra.uni 	BB19_4;
+
+BB19_2:
+	mul.wide.s32 	%rd10, %r1, 4;
+	add.s64 	%rd11, %rd1, %rd10;
+	ld.global.f32 	%f2, [%rd11];
+	cvt.f64.f32	%fd3, %f2;
+	fma.rn.f64 	%fd4, %fd3, %fd2, %fd1;
+	cvt.rn.f32.f64	%f3, %fd4;
+	st.global.f32 	[%rd2], %f3;
+
+BB19_4:
 	ret;
 }
 
-	// .globl	bias_multiply
-.visible .entry bias_multiply(
-	.param .u64 bias_multiply_param_0,
-	.param .u64 bias_multiply_param_1,
-	.param .u64 bias_multiply_param_2,
-	.param .u32 bias_multiply_param_3,
-	.param .u32 bias_multiply_param_4,
-	.param .u32 bias_multiply_param_5
+	// .globl	bias_multiply_d
+.visible .entry bias_multiply_d(
+	.param .u64 bias_multiply_d_param_0,
+	.param .u64 bias_multiply_d_param_1,
+	.param .u64 bias_multiply_d_param_2,
+	.param .u32 bias_multiply_d_param_3,
+	.param .u32 bias_multiply_d_param_4,
+	.param .u32 bias_multiply_d_param_5
 )
 {
 	.reg .pred 	%p<4>;
@@ -566,12 +1175,12 @@ BB8_4:
 	.reg .b64 	%rd<12>;
 
 
-	ld.param.u64 	%rd1, [bias_multiply_param_0];
-	ld.param.u64 	%rd2, [bias_multiply_param_1];
-	ld.param.u64 	%rd3, [bias_multiply_param_2];
-	ld.param.u32 	%r4, [bias_multiply_param_3];
-	ld.param.u32 	%r2, [bias_multiply_param_4];
-	ld.param.u32 	%r3, [bias_multiply_param_5];
+	ld.param.u64 	%rd1, [bias_multiply_d_param_0];
+	ld.param.u64 	%rd2, [bias_multiply_d_param_1];
+	ld.param.u64 	%rd3, [bias_multiply_d_param_2];
+	ld.param.u32 	%r4, [bias_multiply_d_param_3];
+	ld.param.u32 	%r2, [bias_multiply_d_param_4];
+	ld.param.u32 	%r3, [bias_multiply_d_param_5];
 	mov.u32 	%r5, %ctaid.x;
 	mov.u32 	%r6, %ntid.x;
 	mov.u32 	%r7, %tid.x;
@@ -580,10 +1189,10 @@ BB8_4:
 	setp.lt.s32	%p1, %r8, %r4;
 	setp.gt.s32	%p2, %r2, -1;
 	and.pred  	%p3, %p1, %p2;
-	@!%p3 bra 	BB9_2;
-	bra.uni 	BB9_1;
+	@!%p3 bra 	BB20_2;
+	bra.uni 	BB20_1;
 
-BB9_1:
+BB20_1:
 	rem.s32 	%r9, %r1, %r2;
 	cvta.to.global.u64 	%rd4, %rd1;
 	mul.wide.s32 	%rd5, %r1, 8;
@@ -599,110 +1208,89 @@ BB9_1:
 	add.s64 	%rd11, %rd10, %rd5;
 	st.global.f64 	[%rd11], %fd3;
 
-BB9_2:
+BB20_2:
 	ret;
 }
 
-	// .globl	compare_and_set
-.visible .entry compare_and_set(
-	.param .u64 compare_and_set_param_0,
-	.param .u64 compare_and_set_param_1,
-	.param .u32 compare_and_set_param_2,
-	.param .u32 compare_and_set_param_3,
-	.param .f64 compare_and_set_param_4,
-	.param .f64 compare_and_set_param_5,
-	.param .f64 compare_and_set_param_6,
-	.param .f64 compare_and_set_param_7,
-	.param .f64 compare_and_set_param_8
+	// .globl	bias_multiply_f
+.visible .entry bias_multiply_f(
+	.param .u64 bias_multiply_f_param_0,
+	.param .u64 bias_multiply_f_param_1,
+	.param .u64 bias_multiply_f_param_2,
+	.param .u32 bias_multiply_f_param_3,
+	.param .u32 bias_multiply_f_param_4,
+	.param .u32 bias_multiply_f_param_5
 )
 {
-	.reg .pred 	%p<6>;
-	.reg .b32 	%r<10>;
-	.reg .f64 	%fd<9>;
-	.reg .b64 	%rd<8>;
+	.reg .pred 	%p<4>;
+	.reg .f32 	%f<4>;
+	.reg .b32 	%r<11>;
+	.reg .b64 	%rd<12>;
 
 
-	ld.param.u64 	%rd2, [compare_and_set_param_0];
-	ld.param.u64 	%rd3, [compare_and_set_param_1];
-	ld.param.u32 	%r2, [compare_and_set_param_2];
-	ld.param.u32 	%r3, [compare_and_set_param_3];
-	ld.param.f64 	%fd2, [compare_and_set_param_4];
-	ld.param.f64 	%fd3, [compare_and_set_param_5];
-	ld.param.f64 	%fd4, [compare_and_set_param_6];
-	ld.param.f64 	%fd5, [compare_and_set_param_7];
-	ld.param.f64 	%fd6, [compare_and_set_param_8];
-	mov.u32 	%r4, %ctaid.x;
-	mov.u32 	%r5, %ntid.x;
-	mov.u32 	%r6, %tid.x;
-	mad.lo.s32 	%r7, %r5, %r4, %r6;
-	div.s32 	%r8, %r7, %r3;
-	rem.s32 	%r9, %r7, %r3;
-	mad.lo.s32 	%r1, %r8, %r3, %r9;
-	setp.lt.s32	%p1, %r8, %r2;
-	setp.gt.s32	%p2, %r3, -1;
+	ld.param.u64 	%rd1, [bias_multiply_f_param_0];
+	ld.param.u64 	%rd2, [bias_multiply_f_param_1];
+	ld.param.u64 	%rd3, [bias_multiply_f_param_2];
+	ld.param.u32 	%r4, [bias_multiply_f_param_3];
+	ld.param.u32 	%r2, [bias_multiply_f_param_4];
+	ld.param.u32 	%r3, [bias_multiply_f_param_5];
+	mov.u32 	%r5, %ctaid.x;
+	mov.u32 	%r6, %ntid.x;
+	mov.u32 	%r7, %tid.x;
+	mad.lo.s32 	%r1, %r6, %r5, %r7;
+	div.s32 	%r8, %r1, %r2;
+	setp.lt.s32	%p1, %r8, %r4;
+	setp.gt.s32	%p2, %r2, -1;
 	and.pred  	%p3, %p1, %p2;
-	@!%p3 bra 	BB10_6;
-	bra.uni 	BB10_1;
+	@!%p3 bra 	BB21_2;
+	bra.uni 	BB21_1;
 
-BB10_1:
-	cvta.to.global.u64 	%rd4, %rd2;
-	mul.wide.s32 	%rd5, %r1, 8;
+BB21_1:
+	rem.s32 	%r9, %r1, %r2;
+	cvta.to.global.u64 	%rd4, %rd1;
+	mul.wide.s32 	%rd5, %r1, 4;
 	add.s64 	%rd6, %rd4, %rd5;
-	ld.global.f64 	%fd1, [%rd6];
-	sub.f64 	%fd7, %fd1, %fd2;
-	abs.f64 	%fd8, %fd7;
-	setp.lt.f64	%p4, %fd8, %fd3;
-	cvta.to.global.u64 	%rd7, %rd3;
-	add.s64 	%rd1, %rd7, %rd5;
-	@%p4 bra 	BB10_5;
-	bra.uni 	BB10_2;
-
-BB10_5:
-	st.global.f64 	[%rd1], %fd4;
-	bra.uni 	BB10_6;
-
-BB10_2:
-	setp.lt.f64	%p5, %fd1, %fd2;
-	@%p5 bra 	BB10_4;
-	bra.uni 	BB10_3;
-
-BB10_4:
-	st.global.f64 	[%rd1], %fd5;
-	bra.uni 	BB10_6;
-
-BB10_3:
-	st.global.f64 	[%rd1], %fd6;
+	div.s32 	%r10, %r9, %r3;
+	cvta.to.global.u64 	%rd7, %rd2;
+	mul.wide.s32 	%rd8, %r10, 4;
+	add.s64 	%rd9, %rd7, %rd8;
+	ld.global.f32 	%f1, [%rd9];
+	ld.global.f32 	%f2, [%rd6];
+	mul.f32 	%f3, %f2, %f1;
+	cvta.to.global.u64 	%rd10, %rd3;
+	add.s64 	%rd11, %rd10, %rd5;
+	st.global.f32 	[%rd11], %f3;
 
-BB10_6:
+BB21_2:
 	ret;
 }
 
-	// .globl	matrix_matrix_cellwise_op
-.visible .entry matrix_matrix_cellwise_op(
-	.param .u64 matrix_matrix_cellwise_op_param_0,
-	.param .u64 matrix_matrix_cellwise_op_param_1,
-	.param .u64 matrix_matrix_cellwise_op_param_2,
-	.param .u32 matrix_matrix_cellwise_op_param_3,
-	.param .u32 matrix_matrix_cellwise_op_param_4,
-	.param .u32 matrix_matrix_cellwise_op_param_5,
-	.param .u32 matrix_matrix_cellwise_op_param_6,
-	.param .u32 matrix_matrix_cellwise_op_param_7
+	// .globl	matrix_matrix_cellwise_op_d
+.visible .entry matrix_matrix_cellwise_op_d(
+	.param .u64 matrix_matrix_cellwise_op_d_param_0,
+	.param .u64 matrix_matrix_cellwise_op_d_param_1,
+	.param .u64 matrix_matrix_cellwise_op_d_param_2,
+	.param .u32 matrix_matrix_cellwise_op_d_param_3,
+	.param .u32 matrix_matrix_cellwise_op_d_param_4,
+	.param .u32 matrix_matrix_cellwise_op_d_param_5,
+	.param .u32 matrix_matrix_cellwise_op_d_param_6,
+	.param .u32 matrix_matrix_cellwise_op_d_param_7
 )
 {
-	.reg .pred 	%p<77>;
-	.reg .b32 	%r<65>;
-	.reg .f64 	%fd<55>;
+	.reg .pred 	%p<73>;
+	.reg .b32 	%r<66>;
+	.reg .f64 	%fd<56>;
 	.reg .b64 	%rd<19>;
 
 
-	ld.param.u64 	%rd2, [matrix_matrix_cellwise_op_param_0];
-	ld.param.u64 	%rd3, [matrix_matrix_cellwise_op_param_1];
-	ld.param.u64 	%rd4, [matrix_matrix_cellwise_op_param_2];
-	ld.param.u32 	%r14, [matrix_matrix_cellwise_op_param_3];
-	ld.param.u32 	%r10, [matrix_matrix_cellwise_op_param_4];
-	ld.param.u32 	%r11, [matrix_matrix_cellwise_op_param_5];
-	ld.param.u32 	%r12, [matrix_matrix_cellwise_op_param_6];
-	ld.param.u32 	%r13, [matrix_matrix_cellwise_op_param_7];
+	ld.param.u64 	%rd2, [matrix_matrix_cellwise_op_d_param_0];
+	ld.param.u64 	%rd3, [matrix_matrix_cellwise_op_d_param_1];
+	ld.param.u64 	%rd4, [matrix_matrix_cellwise_op_d_param_2];
+	ld.param.u32 	%r14, [matrix_matrix_cellwise_op_d_param_3];
+	ld.param.u32 	%r10, [matrix_matrix_cellwise_op_d_param_4];
+	ld.param.u32 	%r11, [matrix_matrix_cellwise_op_d_param_5];
+	ld.param.u32 	%r12, [matrix_matrix_cellwise_op_d_param_6];
+	ld.param.u32 	%r13, [matrix_matrix_cellwise_op_d_param_7];
 	mov.u32 	%r15, %ntid.x;
 	mov.u32 	%r16, %ctaid.x;
 	mov.u32 	%r17, %tid.x;
@@ -712,93 +1300,93 @@ BB10_6:
 	setp.lt.s32	%p2, %r1, %r14;
 	setp.gt.s32	%p3, %r10, -1;
 	and.pred  	%p4, %p2, %p3;
-	@!%p4 bra 	BB11_73;
-	bra.uni 	BB11_1;
+	@!%p4 bra 	BB22_77;
+	bra.uni 	BB22_1;
 
-BB11_1:
+BB22_1:
 	mad.lo.s32 	%r3, %r1, %r10, %r2;
 	setp.eq.s32	%p5, %r11, 1;
-	mov.u32 	%r63, %r1;
-	@%p5 bra 	BB11_5;
+	mov.u32 	%r64, %r1;
+	@%p5 bra 	BB22_5;
 
 	setp.ne.s32	%p6, %r11, 2;
-	mov.u32 	%r64, %r3;
-	@%p6 bra 	BB11_4;
+	mov.u32 	%r65, %r3;
+	@%p6 bra 	BB22_4;
 
-	mov.u32 	%r64, %r2;
+	mov.u32 	%r65, %r2;
 
-BB11_4:
-	mov.u32 	%r58, %r64;
-	mov.u32 	%r4, %r58;
-	mov.u32 	%r63, %r4;
+BB22_4:
+	mov.u32 	%r59, %r65;
+	mov.u32 	%r4, %r59;
+	mov.u32 	%r64, %r4;
 
-BB11_5:
-	mov.u32 	%r5, %r63;
+BB22_5:
+	mov.u32 	%r5, %r64;
 	setp.eq.s32	%p7, %r12, 1;
-	mov.u32 	%r61, %r1;
-	@%p7 bra 	BB11_9;
+	mov.u32 	%r62, %r1;
+	@%p7 bra 	BB22_9;
 
 	setp.ne.s32	%p8, %r12, 2;
-	mov.u32 	%r62, %r3;
-	@%p8 bra 	BB11_8;
+	mov.u32 	%r63, %r3;
+	@%p8 bra 	BB22_8;
 
-	mov.u32 	%r62, %r2;
+	mov.u32 	%r63, %r2;
 
-BB11_8:
-	mov.u32 	%r61, %r62;
+BB22_8:
+	mov.u32 	%r62, %r63;
 
-BB11_9:
+BB22_9:
 	cvta.to.global.u64 	%rd5, %rd3;
 	cvta.to.global.u64 	%rd6, %rd2;
 	mul.wide.s32 	%rd7, %r5, 8;
 	add.s64 	%rd8, %rd6, %rd7;
 	ld.global.f64 	%fd1, [%rd8];
-	mul.wide.s32 	%rd9, %r61, 8;
+	mul.wide.s32 	%rd9, %r62, 8;
 	add.s64 	%rd10, %rd5, %rd9;
 	ld.global.f64 	%fd2, [%rd10];
-	mov.f64 	%fd54, 0d7FEFFFFFFFFFFFFF;
+	mov.f64 	%fd55, 0d7FEFFFFFFFFFFFFF;
 	setp.gt.s32	%p9, %r13, 8;
-	@%p9 bra 	BB11_26;
+	@%p9 bra 	BB22_26;
 
 	setp.gt.s32	%p23, %r13, 3;
-	@%p23 bra 	BB11_18;
+	@%p23 bra 	BB22_18;
 
 	setp.gt.s32	%p30, %r13, 1;
-	@%p30 bra 	BB11_15;
+	@%p30 bra 	BB22_15;
 
 	setp.eq.s32	%p33, %r13, 0;
-	@%p33 bra 	BB11_71;
-	bra.uni 	BB11_13;
+	@%p33 bra 	BB22_75;
+	bra.uni 	BB22_13;
 
-BB11_71:
-	add.f64 	%fd54, %fd1, %fd2;
-	bra.uni 	BB11_72;
+BB22_75:
+	add.f64 	%fd55, %fd1, %fd2;
+	bra.uni 	BB22_76;
 
-BB11_26:
+BB22_26:
 	setp.gt.s32	%p10, %r13, 13;
-	@%p10 bra 	BB11_35;
+	@%p10 bra 	BB22_35;
 
 	setp.gt.s32	%p17, %r13, 10;
-	@%p17 bra 	BB11_31;
+	@%p17 bra 	BB22_31;
 
 	setp.eq.s32	%p21, %r13, 9;
-	@%p21 bra 	BB11_53;
-	bra.uni 	BB11_29;
+	@%p21 bra 	BB22_55;
+	bra.uni 	BB22_29;
 
-BB11_53:
-	setp.eq.f64	%p50, %fd1, %fd2;
-	selp.f64	%fd54, 0d3FF0000000000000, 0d0000000000000000, %p50;
-	bra.uni 	BB11_72;
+BB22_55:
+	setp.eq.f64	%p48, %fd1, %fd2;
+	selp.f64	%fd55, 0d3FF0000000000000, 0d0000000000000000, %p48;
+	bra.uni 	BB22_76;
 
-BB11_18:
+BB22_18:
 	setp.gt.s32	%p24, %r13, 5;
-	@%p24 bra 	BB11_22;
+	@%p24 bra 	BB22_22;
 
 	setp.eq.s32	%p28, %r13, 4;
-	@%p28 bra 	BB11_56;
-	bra.uni 	BB11_20;
+	@%p28 bra 	BB22_58;
+	bra.uni 	BB22_20;
 
-BB11_56:
+BB22_58:
 	{
 	.reg .b32 %temp; 
 	mov.b64 	{%temp, %r8}, %fd1;
@@ -811,7 +1399,7 @@ BB11_56:
 	add.s32 	%r32, %r31, -1012;
 	mov.b64 	 %rd15, %fd2;
 	shl.b64 	%rd1, %rd15, %r32;
-	setp.eq.s64	%p55, %rd1, -9223372036854775808;
+	setp.eq.s64	%p53, %rd1, -9223372036854775808;
 	abs.f64 	%fd19, %fd1;
 	// Callseq Start 0
 	{
@@ -828,472 +1416,966 @@ BB11_56:
 	param0, 
 	param1
 	);
-	ld.param.f64	%fd53, [retval0+0];
+	ld.param.f64	%fd54, [retval0+0];
 	
 	//{
 	}// Callseq End 0
-	setp.lt.s32	%p56, %r8, 0;
-	and.pred  	%p1, %p56, %p55;
-	@!%p1 bra 	BB11_58;
-	bra.uni 	BB11_57;
+	setp.lt.s32	%p54, %r8, 0;
+	and.pred  	%p1, %p54, %p53;
+	@!%p1 bra 	BB22_60;
+	bra.uni 	BB22_59;
 
-BB11_57:
+BB22_59:
 	{
 	.reg .b32 %temp; 
-	mov.b64 	{%temp, %r33}, %fd53;
+	mov.b64 	{%temp, %r33}, %fd54;
 	}
 	xor.b32  	%r34, %r33, -2147483648;
 	{
 	.reg .b32 %temp; 
-	mov.b64 	{%r35, %temp}, %fd53;
+	mov.b64 	{%r35, %temp}, %fd54;
 	}
-	mov.b64 	%fd53, {%r35, %r34};
+	mov.b64 	%fd54, {%r35, %r34};
 
-BB11_58:
-	mov.f64 	%fd52, %fd53;
-	setp.eq.f64	%p57, %fd1, 0d0000000000000000;
-	@%p57 bra 	BB11_61;
-	bra.uni 	BB11_59;
+BB22_60:
+	mov.f64 	%fd53, %fd54;
+	setp.eq.f64	%p55, %fd1, 0d0000000000000000;
+	@%p55 bra 	BB22_63;
+	bra.uni 	BB22_61;
 
-BB11_61:
-	selp.b32	%r36, %r8, 0, %p55;
+BB22_63:
+	selp.b32	%r36, %r8, 0, %p53;
 	or.b32  	%r37, %r36, 2146435072;
-	setp.lt.s32	%p61, %r9, 0;
-	selp.b32	%r38, %r37, %r36, %p61;
+	setp.lt.s32	%p59, %r9, 0;
+	selp.b32	%r38, %r37, %r36, %p59;
 	mov.u32 	%r39, 0;
-	mov.b64 	%fd52, {%r39, %r38};
-	bra.uni 	BB11_62;
+	mov.b64 	%fd53, {%r39, %r38};
+	bra.uni 	BB22_64;
 
-BB11_35:
+BB22_35:
 	setp.gt.s32	%p11, %r13, 15;
-	@%p11 bra 	BB11_39;
+	@%p11 bra 	BB22_39;
 
 	setp.eq.s32	%p15, %r13, 14;
-	@%p15 bra 	BB11_50;
-	bra.uni 	BB11_37;
+	@%p15 bra 	BB22_52;
+	bra.uni 	BB22_37;
 
-BB11_50:
+BB22_52:
 	cvt.rni.s64.f64	%rd11, %fd1;
 	cvt.rni.s64.f64	%rd12, %fd2;
 	cvt.u32.u64	%r25, %rd11;
 	cvt.u32.u64	%r26, %rd12;
 	or.b32  	%r27, %r26, %r25;
-	setp.eq.s32	%p47, %r27, 0;
-	selp.f64	%fd54, 0d0000000000000000, 0d3FF0000000000000, %p47;
-	bra.uni 	BB11_72;
+	setp.eq.s32	%p45, %r27, 0;
+	selp.f64	%fd55, 0d0000000000000000, 0d3FF0000000000000, %p45;
+	bra.uni 	BB22_76;
 
-BB11_15:
+BB22_15:
 	setp.eq.s32	%p31, %r13, 2;
-	@%p31 bra 	BB11_70;
-	bra.uni 	BB11_16;
+	@%p31 bra 	BB22_74;
+	bra.uni 	BB22_16;
 
-BB11_70:
-	mul.f64 	%fd54, %fd1, %fd2;
-	bra.uni 	BB11_72;
+BB22_74:
+	mul.f64 	%fd55, %fd1, %fd2;
+	bra.uni 	BB22_76;
 
-BB11_31:
+BB22_31:
 	setp.eq.s32	%p18, %r13, 11;
-	@%p18 bra 	BB11_52;
+	@%p18 bra 	BB22_54;
 
 	setp.eq.s32	%p19, %r13, 12;
-	@%p19 bra 	BB11_51;
-	bra.uni 	BB11_33;
+	@%p19 bra 	BB22_53;
+	bra.uni 	BB22_33;
 
-BB11_51:
-	max.f64 	%fd54, %fd1, %fd2;
-	bra.uni 	BB11_72;
+BB22_53:
+	max.f64 	%fd55, %fd1, %fd2;
+	bra.uni 	BB22_76;
 
-BB11_22:
+BB22_22:
 	setp.eq.s32	%p25, %r13, 6;
-	@%p25 bra 	BB11_55;
+	@%p25 bra 	BB22_57;
 
 	setp.eq.s32	%p26, %r13, 7;
-	@%p26 bra 	BB11_54;
-	bra.uni 	BB11_24;
+	@%p26 bra 	BB22_56;
+	bra.uni 	BB22_24;
 
-BB11_54:
-	setp.gt.f64	%p52, %fd1, %fd2;
-	selp.f64	%fd54, 0d3FF0000000000000, 0d0000000000000000, %p52;
-	bra.uni 	BB11_72;
+BB22_56:
+	setp.gt.f64	%p50, %fd1, %fd2;
+	selp.f64	%fd55, 0d3FF0000000000000, 0d0000000000000000, %p50;
+	bra.uni 	BB22_76;
 
-BB11_39:
+BB22_39:
 	setp.eq.s32	%p12, %r13, 16;
-	@%p12 bra 	BB11_49;
+	@%p12 bra 	BB22_51;
 
 	setp.eq.s32	%p13, %r13, 17;
-	@%p13 bra 	BB11_45;
-	bra.uni 	BB11_41;
+	@%p13 bra 	BB22_46;
+	bra.uni 	BB22_41;
 
-BB11_45:
-	setp.eq.f64	%p39, %fd2, 0d0000000000000000;
-	setp.eq.f64	%p40, %fd2, 0d8000000000000000;
-	or.pred  	%p41, %p39, %p40;
-	mov.f64 	%fd54, 0d7FF8000000000000;
-	@%p41 bra 	BB11_72;
+BB22_46:
+	setp.eq.f64	%p38, %fd2, 0d0000000000000000;
+	setp.eq.f64	%p39, %fd2, 0d8000000000000000;
+	or.pred  	%p40, %p38, %p39;
+	mov.f64 	%fd55, 0d7FF8000000000000;
+	@%p40 bra 	BB22_76;
 
-	div.rn.f64 	%fd54, %fd1, %fd2;
-	abs.f64 	%fd39, %fd54;
-	setp.gtu.f64	%p42, %fd39, 0d7FF0000000000000;
-	@%p42 bra 	BB11_72;
+	div.rn.f64 	%fd55, %fd1, %fd2;
+	abs.f64 	%fd39, %fd55;
+	setp.gtu.f64	%p41, %fd39, 0d7FF0000000000000;
+	@%p41 bra 	BB22_76;
 
 	{
 	.reg .b32 %temp; 
-	mov.b64 	{%r22, %temp}, %fd54;
+	mov.b64 	{%temp, %r22}, %fd55;
 	}
+	and.b32  	%r23, %r22, 2147483647;
+	setp.ne.s32	%p42, %r23, 2146435072;
+	@%p42 bra 	BB22_50;
+
 	{
 	.reg .b32 %temp; 
-	mov.b64 	{%temp, %r23}, %fd54;
+	mov.b64 	{%r24, %temp}, %fd55;
 	}
-	and.b32  	%r24, %r23, 2147483647;
-	setp.ne.s32	%p43, %r24, 2146435072;
-	setp.ne.s32	%p44, %r22, 0;
-	or.pred  	%p45, %p43, %p44;
-	@!%p45 bra 	BB11_72;
-	bra.uni 	BB11_48;
-
-BB11_48:
-	cvt.rmi.f64.f64	%fd40, %fd54;
+	setp.eq.s32	%p43, %r24, 0;
+	@%p43 bra 	BB22_76;
+
+BB22_50:
+	cvt.rmi.f64.f64	%fd40, %fd55;
 	mul.f64 	%fd41, %fd2, %fd40;
-	sub.f64 	%fd54, %fd1, %fd41;
-	bra.uni 	BB11_72;
+	sub.f64 	%fd55, %fd1, %fd41;
+	bra.uni 	BB22_76;
 
-BB11_13:
+BB22_13:
 	setp.eq.s32	%p34, %r13, 1;
-	@%p34 bra 	BB11_14;
-	bra.uni 	BB11_72;
+	@%p34 bra 	BB22_14;
+	bra.uni 	BB22_76;
 
-BB11_14:
-	sub.f64 	%fd54, %fd1, %fd2;
-	bra.uni 	BB11_72;
+BB22_14:
+	sub.f64 	%fd55, %fd1, %fd2;
+	bra.uni 	BB22_76;
 
-BB11_29:
+BB22_29:
 	setp.eq.s32	%p22, %r13, 10;
-	@%p22 bra 	BB11_30;
-	bra.uni 	BB11_72;
+	@%p22 bra 	BB22_30;
+	bra.uni 	BB22_76;
 
-BB11_30:
-	setp.neu.f64	%p49, %fd1, %fd2;
-	selp.f64	%fd54, 0d3FF0000000000000, 0d0000000000000000, %p49;
-	bra.uni 	BB11_72;
+BB22_30:
+	setp.neu.f64	%p47, %fd1, %fd2;
+	selp.f64	%fd55, 0d3FF0000000000000, 0d0000000000000000, %p47;
+	bra.uni 	BB22_76;
 
-BB11_20:
+BB22_20:
 	setp.eq.s32	%p29, %r13, 5;
-	@%p29 bra 	BB11_21;
-	bra.uni 	BB11_72;
+	@%p29 bra 	BB22_21;
+	bra.uni 	BB22_76;
 
-BB11_21:
-	setp.lt.f64	%p54, %fd1, %fd2;
-	selp.f64	%fd54, 0d3FF0000000000000, 0d0000000000000000, %p54;
-	bra.uni 	BB11_72;
+BB22_21:
+	setp.lt.f64	%p52, %fd1, %fd2;
+	selp.f64	%fd55, 0d3FF0000000000000, 0d0000000000000000, %p52;
+	bra.uni 	BB22_76;
 
-BB11_37:
+BB22_37:
 	setp.eq.s32	%p16, %r13, 15;
-	@%p16 bra 	BB11_38;
-	bra.uni 	BB11_72;
+	@%p16 bra 	BB22_38;
+	bra.uni 	BB22_76;
 
-BB11_38:
+BB22_38:
 	mul.f64 	%fd43, %fd1, %fd2;
 	mov.f64 	%fd44, 0d3FF0000000000000;
-	sub.f64 	%fd54, %fd44, %fd43;
-	bra.uni 	BB11_72;
+	sub.f64 	%fd55, %fd44, %fd43;
+	bra.uni 	BB22_76;
 
-BB11_16:
+BB22_16:
 	setp.eq.s32	%p32, %r13, 3;
-	@%p32 bra 	BB11_17;
-	bra.uni 	BB11_72;
+	@%p32 bra 	BB22_17;
+	bra.uni 	BB22_76;
 
-BB11_17:
-	div.rn.f64 	%fd54, %fd1, %fd2;
-	bra.uni 	BB11_72;
+BB22_17:
+	div.rn.f64 	%fd55, %fd1, %fd2;
+	bra.uni 	BB22_76;
 
-BB11_52:
-	min.f64 	%fd54, %fd1, %fd2;
-	bra.uni 	BB11_72;
+BB22_54:
+	min.f64 	%fd55, %fd1, %fd2;
+	bra.uni 	BB22_76;
 
-BB11_33:
+BB22_33:
 	setp.eq.s32	%p20, %r13, 13;
-	@%p20 bra 	BB11_34;
-	bra.uni 	BB11_72;
+	@%p20 bra 	BB22_34;
+	bra.uni 	BB22_76;
 
-BB11_34:
+BB22_34:
 	cvt.rni.s64.f64	%rd13, %fd1;
 	cvt.rni.s64.f64	%rd14, %fd2;
 	cvt.u32.u64	%r28, %rd13;
 	cvt.u32.u64	%r29, %rd14;
 	and.b32  	%r30, %r29, %r28;
-	setp.eq.s32	%p48, %r30, 0;
-	selp.f64	%fd54, 0d0000000000000000, 0d3FF0000000000000, %p48;
-	bra.uni 	BB11_72;
+	setp.eq.s32	%p46, %r30, 0;
+	selp.f64	%fd55, 0d0000000000000000, 0d3FF0000000000000, %p46;
+	bra.uni 	BB22_76;
 
-BB11_55:
-	setp.le.f64	%p53, %fd1, %fd2;
-	selp.f64	%fd54, 0d3FF0000000000000, 0d0000000000000000, %p53;
-	bra.uni 	BB11_72;
+BB22_57:
+	setp.gtu.f64	%p51, %fd1, %fd2;
+	selp.f64	%fd55, 0d0000000000000000, 0d3FF0000000000000, %p51;
+	bra.uni 	BB22_76;
 
-BB11_24:
+BB22_24:
 	setp.eq.s32	%p27, %r13, 8;
-	@%p27 bra 	BB11_25;
-	bra.uni 	BB11_72;
+	@%p27 bra 	BB22_25;
+	bra.uni 	BB22_76;
 
-BB11_25:
-	setp.ge.f64	%p51, %fd1, %fd2;
-	selp.f64	%fd54, 0d3FF0000000000000, 0d0000000000000000, %p51;
-	bra.uni 	BB11_72;
+BB22_25:
+	setp.ltu.f64	%p49, %fd1, %fd2;
+	selp.f64	%fd55, 0d0000000000000000, 0d3FF0000000000000, %p49;
+	bra.uni 	BB22_76;
 
-BB11_49:
-	setp.neu.f64	%p46, %fd1, 0d0000000000000000;
+BB22_51:
+	setp.neu.f64	%p44, %fd1, 0d0000000000000000;
 	sub.f64 	%fd42, %fd1, %fd2;
-	selp.f64	%fd54, %fd42, 0d0000000000000000, %p46;
-	bra.uni 	BB11_72;
+	selp.f64	%fd55, %fd42, 0d0000000000000000, %p44;
+	bra.uni 	BB22_76;
 
-BB11_41:
+BB22_41:
 	setp.ne.s32	%p14, %r13, 18;
-	@%p14 bra 	BB11_72;
+	@%p14 bra 	BB22_76;
 
-	div.rn.f64 	%fd54, %fd1, %fd2;
-	abs.f64 	%fd37, %fd54;
+	div.rn.f64 	%fd55, %fd1, %fd2;
+	abs.f64 	%fd37, %fd55;
 	setp.gtu.f64	%p35, %fd37, 0d7FF0000000000000;
-	@%p35 bra 	BB11_72;
+	@%p35 bra 	BB22_76;
 
 	{
 	.reg .b32 %temp; 
-	mov.b64 	{%r19, %temp}, %fd54;
+	mov.b64 	{%temp, %r19}, %fd55;
 	}
+	and.b32  	%r20, %r19, 2147483647;
+	setp.ne.s32	%p36, %r20, 2146435072;
+	@%p36 bra 	BB22_45;
+
 	{
 	.reg .b32 %temp; 
-	mov.b64 	{%temp, %r20}, %fd54;
+	mov.b64 	{%r21, %temp}, %fd55;
 	}
-	and.b32  	%r21, %r20, 2147483647;
-	setp.ne.s32	%p36, %r21, 2146435072;
-	setp.ne.s32	%p37, %r19, 0;
-	or.pred  	%p38, %p36, %p37;
-	@!%p38 bra 	BB11_72;
-	bra.uni 	BB11_44;
+	setp.eq.s32	%p37, %r21, 0;
+	@%p37 bra 	BB22_76;
 
-BB11_44:
-	cvt.rmi.f64.f64	%fd54, %fd54;
-	bra.uni 	BB11_72;
+BB22_45:
+	cvt.rmi.f64.f64	%fd55, %fd55;
+	bra.uni 	BB22_76;
 
-BB11_59:
-	setp.gt.s32	%p58, %r8, -1;
-	@%p58 bra 	BB11_62;
+BB22_61:
+	setp.gt.s32	%p56, %r8, -1;
+	@%p56 bra 	BB22_64;
 
 	cvt.rzi.f64.f64	%fd45, %fd2;
-	setp.neu.f64	%p59, %fd45, %fd2;
-	selp.f64	%fd52, 0dFFF8000000000000, %fd52, %p59;
+	setp.neu.f64	%p57, %fd45, %fd2;
+	selp.f64	%fd53, 0dFFF8000000000000, %fd53, %p57;
 
-BB11_62:
-	mov.f64 	%fd25, %fd52;
+BB22_64:
+	mov.f64 	%fd25, %fd53;
 	add.f64 	%fd26, %fd1, %fd2;
 	{
 	.reg .b32 %temp; 
 	mov.b64 	{%temp, %r40}, %fd26;
 	}
 	and.b32  	%r41, %r40, 2146435072;
-	setp.ne.s32	%p62, %r41, 2146435072;
-	mov.f64 	%fd51, %fd25;
-	@%p62 bra 	BB11_69;
+	setp.ne.s32	%p60, %r41, 2146435072;
+	mov.f64 	%fd52, %fd25;
+	@%p60 bra 	BB22_73;
 
-	setp.gtu.f64	%p63, %fd19, 0d7FF0000000000000;
-	mov.f64 	%fd51, %fd26;
-	@%p63 bra 	BB11_69;
+	setp.gtu.f64	%p61, %fd19, 0d7FF0000000000000;
+	mov.f64 	%fd52, %fd26;
+	@%p61 bra 	BB22_73;
 
 	abs.f64 	%fd46, %fd2;
-	setp.gtu.f64	%p64, %fd46, 0d7FF0000000000000;
-	mov.f64 	%fd50, %fd26;
-	mov.f64 	%fd51, %fd50;
-	@%p64 bra 	BB11_69;
+	setp.gtu.f64	%p62, %fd46, 0d7FF0000000000000;
+	mov.f64 	%fd51, %fd26;
+	mov.f64 	%fd52, %fd51;
+	@%p62 bra 	BB22_73;
+
+	and.b32  	%r42, %r9, 2147483647;
+	setp.ne.s32	%p63, %r42, 2146435072;
+	@%p63 bra 	BB22_69;
 
 	{
 	.reg .b32 %temp; 
-	mov.b64 	{%r42, %temp}, %fd2;
+	mov.b64 	{%r43, %temp}, %fd2;
 	}
-	and.b32  	%r43, %r9, 2147483647;
-	setp.eq.s32	%p65, %r43, 2146435072;
-	setp.eq.s32	%p66, %r42, 0;
-	and.pred  	%p67, %p65, %p66;
-	@%p67 bra 	BB11_68;
-	bra.uni 	BB11_66;
-
-BB11_68:
-	setp.gt.f64	%p71, %fd19, 0d3FF0000000000000;
-	selp.b32	%r51, 2146435072, 0, %p71;
-	xor.b32  	%r52, %r51, 2146435072;
-	setp.lt.s32	%p72, %r9, 0;
-	selp.b32	%r53, %r52, %r51, %p72;
-	setp.eq.f64	%p73, %fd1, 0dBFF0000000000000;
-	selp.b32	%r54, 1072693248, %r53, %p73;
-	mov.u32 	%r55, 0;
-	mov.b64 	%fd51, {%r55, %r54};
-	bra.uni 	BB11_69;
-
-BB11_66:
+	setp.eq.s32	%p64, %r43, 0;
+	@%p64 bra 	BB22_72;
+
+BB22_69:
+	and.b32  	%r44, %r8, 2147483647;
+	setp.ne.s32	%p65, %r44, 2146435072;
+	mov.f64 	%fd49, %fd25;
+	mov.f64 	%fd52, %fd49;
+	@%p65 bra 	BB22_73;
+
 	{
 	.reg .b32 %temp; 
-	mov.b64 	{%r44, %temp}, %fd1;
+	mov.b64 	{%r45, %temp}, %fd1;
 	}
-	and.b32  	%r45, %r8, 2147483647;
-	setp.eq.s32	%p68, %r45, 2146435072;
-	setp.eq.s32	%p69, %r44, 0;
-	and.pred  	%p70, %p68, %p69;
-	mov.f64 	%fd51, %fd25;
-	@!%p70 bra 	BB11_69;
-	bra.uni 	BB11_67;
-
-BB11_67:
+	setp.ne.s32	%p66, %r45, 0;
+	mov.f64 	%fd52, %fd25;
+	@%p66 bra 	BB22_73;
+
 	shr.s32 	%r46, %r9, 31;
 	and.b32  	%r47, %r46, -2146435072;
-	selp.b32	%r48, -1048576, 2146435072, %p1;
-	add.s32 	%r49, %r48, %r47;
-	mov.u32 	%r50, 0;
-	mov.b64 	%fd51, {%r50, %r49};
-
-BB11_69:
-	setp.eq.f64	%p74, %fd2, 0d0000000000000000;
-	setp.eq.f64	%p75, %fd1, 0d3FF0000000000000;
-	or.pred  	%p76, %p75, %p74;
-	selp.f64	%fd54, 0d3FF0000000000000, %fd51, %p76;
-
-BB11_72:
+	add.s32 	%r48, %r47, 2146435072;
+	or.b32  	%r49, %r48, -2147483648;
+	selp.b32	%r50, %r49, %r48, %p1;
+	mov.u32 	%r51, 0;
+	mov.b64 	%fd52, {%r51, %r50};
+	bra.uni 	BB22_73;
+
+BB22_72:
+	setp.gt.f64	%p67, %fd19, 0d3FF0000000000000;
+	selp.b32	%r52, 2146435072, 0, %p67;
+	xor.b32  	%r53, %r52, 2146435072;
+	setp.lt.s32	%p68, %r9, 0;
+	selp.b32	%r54, %r53, %r52, %p68;
+	setp.eq.f64	%p69, %fd1, 0dBFF0000000000000;
+	selp.b32	%r55, 1072693248, %r54, %p69;
+	mov.u32 	%r56, 0;
+	mov.b64 	%fd52, {%r56, %r55};
+
+BB22_73:
+	setp.eq.f64	%p70, %fd2, 0d0000000000000000;
+	setp.eq.f64	%p71, %fd1, 0d3FF0000000000000;
+	or.pred  	%p72, %p71, %p70;
+	selp.f64	%fd55, 0d3FF0000000000000, %fd52, %p72;
+
+BB22_76:
 	cvta.to.global.u64 	%rd16, %rd4;
 	mul.wide.s32 	%rd17, %r3, 8;
 	add.s64 	%rd18, %rd16, %rd17;
-	st.global.f64 	[%rd18], %fd54;
+	st.global.f64 	[%rd18], %fd55;
 	bar.sync 	0;
 
-BB11_73:
+BB22_77:
 	ret;
 }
 
-	// .globl	matrix_scalar_op
-.visible .entry matrix_scalar_op(
-	.param .u64 matrix_scalar_op_param_0,
-	.param .f64 matrix_scalar_op_param_1,
-	.param .u64 matrix_scalar_op_param_2,
-	.param .u32 matrix_scalar_op_param_3,
-	.param .u32 matrix_scalar_op_param_4,
-	.param .u32 matrix_scalar_op_param_5
+	// .globl	matrix_matrix_cellwise_op_f
+.visible .entry matrix_matrix_cellwise_op_f(
+	.param .u64 matrix_matrix_cellwise_op_f_param_0,
+	.param .u64 matrix_matrix_cellwise_op_f_param_1,
+	.param .u64 matrix_matrix_cellwise_op_f_param_2,
+	.param .u32 matrix_matrix_cellwise_op_f_param_3,
+	.param .u32 matrix_matrix_cellwise_op_f_param_4,
+	.param .u32 matrix_matrix_cellwise_op_f_param_5,
+	.param .u32 matrix_matrix_cellwise_op_f_param_6,
+	.param .u32 matrix_matrix_cellwise_op_f_param_7
 )
 {
-	.reg .pred 	%p<141>;
-	.reg .b32 	%r<86>;
-	.reg .f64 	%fd<107>;
-	.reg .b64 	%rd<20>;
+	.reg .pred 	%p<76>;
+	.reg .f32 	%f<134>;
+	.reg .b32 	%r<51>;
+	.reg .b64 	%rd<17>;
 
 
-	ld.param.u64 	%rd4, [matrix_scalar_op_param_0];
-	ld.param.f64 	%fd68, [matrix_scalar_op_param_1];
-	ld.param.u64 	%rd5, [matrix_scalar_op_param_2];
-	ld.param.u32 	%r8, [matrix_scalar_op_param_3];
-	ld.param.u32 	%r6, [matrix_scalar_op_param_4];
-	ld.param.u32 	%r7, [matrix_scalar_op_param_5];
-	mov.u32 	%r9, %ntid.x;
-	mov.u32 	%r10, %ctaid.x;
-	mov.u32 	%r11, %tid.x;
-	mad.lo.s32 	%r1, %r9, %r10, %r11;
-	setp.ge.s32	%p3, %r1, %r8;
-	@%p3 bra 	BB12_130;
+	ld.param.u64 	%rd1, [matrix_matrix_cellwise_op_f_param_0];
+	ld.param.u64 	%rd2, [matrix_matrix_cellwise_op_f_param_1];
+	ld.param.u64 	%rd3, [matrix_matrix_cellwise_op_f_param_2];
+	ld.param.u32 	%r12, [matrix_matrix_cellwise_op_f_param_3];
+	ld.param.u32 	%r8, [matrix_matrix_cellwise_op_f_param_4];
+	ld.param.u32 	%r9, [matrix_matrix_cellwise_op_f_param_5];
+	ld.param.u32 	%r10, [matrix_matrix_cellwise_op_f_param_6];
+	ld.param.u32 	%r11, [matrix_matrix_cellwise_op_f_param_7];
+	mov.u32 	%r13, %ntid.x;
+	mov.u32 	%r14, %ctaid.x;
+	mov.u32 	%r15, %tid.x;
+	mad.lo.s32 	%r16, %r13, %r14, %r15;
+	div.s32 	%r1, %r16, %r8;
+	rem.s32 	%r2, %r16, %r8;
+	setp.lt.s32	%p2, %r1, %r12;
+	setp.gt.s32	%p3, %r8, -1;
+	and.pred  	%p4, %p2, %p3;
+	@!%p4 bra 	BB23_71;
+	bra.uni 	BB23_1;
 
-	cvta.to.global.u64 	%rd6, %rd5;
-	cvta.to.global.u64 	%rd7, %rd4;
-	mul.wide.s32 	%rd8, %r1, 8;
-	add.s64 	%rd9, %rd7, %rd8;
-	ld.global.f64 	%fd1, [%rd9];
-	add.s64 	%rd1, %rd6, %rd8;
-	setp.eq.s32	%p4, %r7, 0;
-	@%p4 bra 	BB12_66;
+BB23_1:
+	mad.lo.s32 	%r3, %r1, %r8, %r2;
+	setp.eq.s32	%p5, %r9, 1;
+	mov.u32 	%r49, %r1;
+	@%p5 bra 	BB23_5;
 
-	mov.f64 	%fd98, 0d7FEFFFFFFFFFFFFF;
-	setp.gt.s32	%p5, %r6, 8;
-	@%p5 bra 	BB12_19;
+	setp.ne.s32	%p6, %r9, 2;
+	mov.u32 	%r50, %r3;
+	@%p6 bra 	BB23_4;
 
-	setp.gt.s32	%p19, %r6, 3;
-	@%p19 bra 	BB12_11;
+	mov.u32 	%r50, %r2;
 
-	setp.gt.s32	%p26, %r6, 1;
-	@%p26 bra 	BB12_8;
+BB23_4:
+	mov.u32 	%r44, %r50;
+	mov.u32 	%r4, %r44;
+	mov.u32 	%r49, %r4;
 
-	setp.eq.s32	%p29, %r6, 0;
-	@%p29 bra 	BB12_64;
-	bra.uni 	BB12_6;
+BB23_5:
+	mov.u32 	%r5, %r49;
+	setp.eq.s32	%p7, %r10, 1;
+	mov.u32 	%r47, %r1;
+	@%p7 bra 	BB23_9;
 
-BB12_64:
-	add.f64 	%fd98, %fd1, %fd68;
-	bra.uni 	BB12_65;
+	setp.ne.s32	%p8, %r10, 2;
+	mov.u32 	%r48, %r3;
+	@%p8 bra 	BB23_8;
 
-BB12_66:
-	mov.f64 	%fd106, 0d7FEFFFFFFFFFFFFF;
-	setp.gt.s32	%p73, %r6, 8;
-	@%p73 bra 	BB12_83;
+	mov.u32 	%r48, %r2;
 
-	setp.gt.s32	%p87, %r6, 3;
-	@%p87 bra 	BB12_75;
+BB23_8:
+	mov.u32 	%r47, %r48;
 
-	setp.gt.s32	%p94, %r6, 1;
-	@%p94 bra 	BB12_72;
+BB23_9:
+	cvta.to.global.u64 	%rd4, %rd2;
+	cvta.to.global.u64 	%rd5, %rd1;
+	mul.wide.s32 	%rd6, %r5, 4;
+	add.s64 	%rd7, %rd5, %rd6;
+	ld.global.f32 	%f1, [%rd7];
+	mul.wide.s32 	%rd8, %r47, 4;
+	add.s64 	%rd9, %rd4, %rd8;
+	ld.global.f32 	%f2, [%rd9];
+	mov.f32 	%f133, 0f7F7FFFFF;
+	setp.gt.s32	%p9, %r11, 8;
+	@%p9 bra 	BB23_26;
 
-	setp.eq.s32	%p97, %r6, 0;
-	@%p97 bra 	BB12_128;
-	bra.uni 	BB12_70;
+	setp.gt.s32	%p23, %r11, 3;
+	@%p23 bra 	BB23_18;
 
-BB12_128:
-	add.f64 	%fd106, %fd1, %fd68;
-	bra.uni 	BB12_129;
+	setp.gt.s32	%p30, %r11, 1;
+	@%p30 bra 	BB23_15;
 
-BB12_19:
-	setp.gt.s32	%p6, %r6, 13;
-	@%p6 bra 	BB12_28;
+	setp.eq.s32	%p33, %r11, 0;
+	@%p33 bra 	BB23_69;
+	bra.uni 	BB23_13;
 
-	setp.gt.s32	%p13, %r6, 10;
-	@%p13 bra 	BB12_24;
+BB23_69:
+	add.f32 	%f133, %f1, %f2;
+	bra.uni 	BB23_70;
 
-	setp.eq.s32	%p17, %r6, 9;
-	@%p17 bra 	BB12_46;
-	bra.uni 	BB12_22;
+BB23_26:
+	setp.gt.s32	%p10, %r11, 13;
+	@%p10 bra 	BB23_35;
 
-BB12_46:
-	setp.eq.f64	%p46, %fd1, %fd68;
-	selp.f64	%fd98, 0d3FF0000000000000, 0d0000000000000000, %p46;
-	bra.uni 	BB12_65;
+	setp.gt.s32	%p17, %r11, 10;
+	@%p17 bra 	BB23_31;
 
-BB12_83:
-	setp.gt.s32	%p74, %r6, 13;
-	@%p74 bra 	BB12_92;
+	setp.eq.s32	%p21, %r11, 9;
+	@%p21 bra 	BB23_51;
+	bra.uni 	BB23_29;
+
+BB23_51:
+	setp.eq.f32	%p44, %f1, %f2;
+	selp.f32	%f133, 0f3F800000, 0f00000000, %p44;
+	bra.uni 	BB23_70;
+
+BB23_18:
+	setp.gt.s32	%p24, %r11, 5;
+	@%p24 bra 	BB23_22;
+
+	setp.eq.s32	%p28, %r11, 4;
+	@%p28 bra 	BB23_54;
+	bra.uni 	BB23_20;
+
+BB23_54:
+	mul.f32 	%f53, %f2, 0f3F000000;
+	cvt.rzi.f32.f32	%f54, %f53;
+	fma.rn.f32 	%f55, %f54, 0fC0000000, %f2;
+	abs.f32 	%f19, %f55;
+	abs.f32 	%f20, %f1;
+	setp.lt.f32	%p49, %f20, 0f00800000;
+	mul.f32 	%f56, %f20, 0f4B800000;
+	selp.f32	%f57, 0fC3170000, 0fC2FE0000, %p49;
+	selp.f32	%f58, %f56, %f20, %p49;
+	mov.b32 	 %r23, %f58;
+	and.b32  	%r24, %r23, 8388607;
+	or.b32  	%r25, %r24, 1065353216;
+	mov.b32 	 %f59, %r25;
+	shr.u32 	%r26, %r23, 23;
+	cvt.rn.f32.u32	%f60, %r26;
+	add.f32 	%f61, %f57, %f60;
+	setp.gt.f32	%p50, %f59, 0f3FB504F3;
+	mul.f32 	%f62, %f59, 0f3F000000;
+	add.f32 	%f63, %f61, 0f3F800000;
+	selp.f32	%f64, %f62, %f59, %p50;
+	selp.f32	%f65, %f63, %f61, %p50;
+	add.f32 	%f66, %f64, 0fBF800000;
+	add.f32 	%f50, %f64, 0f3F800000;
+	// inline asm
+	rcp.approx.ftz.f32 %f49,%f50;
+	// inline asm
+	add.f32 	%f67, %f66, %f66;
+	mul.f32 	%f68, %f49, %f67;
+	mul.f32 	%f69, %f68, %f68;
+	mov.f32 	%f70, 0f3C4CAF63;
+	mov.f32 	%f71, 0f3B18F0FE;
+	fma.rn.f32 	%f72, %f71, %f69, %f70;
+	mov.f32 	%f73, 0f3DAAAABD;
+	fma.rn.f32 	%f74, %f72, %f69, %f73;
+	mul.rn.f32 	%f75, %f74, %f69;
+	mul.rn.f32 	%f76, %f75, %f68;
+	sub.f32 	%f77, %f66, %f68;
+	neg.f32 	%f78, %f68;
+	add.f32 	%f79, %f77, %f77;
+	fma.rn.f32 	%f80, %f78, %f66, %f79;
+	mul.rn.f32 	%f81, %f49, %f80;
+	add.f32 	%f82, %f76, %f68;
+	sub.f32 	%f83, %f68, %f82;
+	add.f32 	%f84, %f76, %f83;
+	add.f32 	%f85, %f81, %f84;
+	add.f32 	%f86, %f82, %f85;
+	sub.f32 	%f87, %f82, %f86;
+	add.f32 	%f88, %f85, %f87;
+	mov.f32 	%f89, 0f3F317200;
+	mul.rn.f32 	%f90, %f65, %f89;
+	mov.f32 	%f91, 0f35BFBE8E;
+	mul.rn.f32 	%f92, %f65, %f91;
+	add.f32 	%f93, %f90, %f86;
+	sub.f32 	%f94, %f90, %f93;
+	add.f32 	%f95, %f86, %f94;
+	add.f32 	%f96, %f88, %f95;
+	add.f32 	%f97, %f92, %f96;
+	add.f32 	%f98, %f93, %f97;
+	sub.f32 	%f99, %f93, %f98;
+	add.f32 	%f100, %f97, %f99;
+	abs.f32 	%f21, %f2;
+	setp.gt.f32	%p51, %f21, 0f77F684DF;
+	mul.f32 	%f101, %f2, 0f39000000;
+	selp.f32	%f102, %f101, %f2, %p51;
+	mul.rn.f32 	%f103, %f102, %f98;
+	neg.f32 	%f104, %f103;
+	fma.rn.f32 	%f105, %f102, %f98, %f104;
+	fma.rn.f32 	%f106, %f102, %f100, %f105;
+	mov.f32 	%f107, 0f00000000;
+	fma.rn.f32 	%f108, %f107, %f98, %f106;
+	add.rn.f32 	%f109, %f103, %f108;
+	neg.f32 	%f110, %f109;
+	add.rn.f32 	%f111, %f103, %f110;
+	add.rn.f32 	%f112, %f111, %f108;
+	mov.b32 	 %r27, %f109;
+	setp.eq.s32	%p52, %r27, 1118925336;
+	add.s32 	%r28, %r27, -1;
+	mov.b32 	 %f113, %r28;
+	add.f32 	%f114, %f112, 0f37000000;
+	selp.f32	%f115, %f113, %f109, %p52;
+	selp.f32	%f22, %f114, %f112, %p52;
+	mul.f32 	%f116, %f115, 0f3FB8AA3B;
+	cvt.rzi.f32.f32	%f117, %f116;
+	mov.f32 	%f118, 0fBF317200;
+	fma.rn.f32 	%f119, %f117, %f118, %f115;
+	mov.f32 	%f120, 0fB5BFBE8E;
+	fma.rn.f32 	%f121, %f117, %f120, %f119;
+	mul.f32 	%f52, %f121, 0f3FB8AA3B;
+	// inline asm
+	ex2.approx.ftz.f32 %f51,%f52;
+	// inline asm
+	add.f32 	%f122, %f117, 0f00000000;
+	ex2.approx.f32 	%f123, %f122;
+	mul.f32 	%f124, %f51, %f123;
+	setp.lt.f32	%p53, %f115, 0fC2D20000;
+	selp.f32	%f125, 0f00000000, %f124, %p53;
+	setp.gt.f32	%p54, %f115, 0f42D20000;
+	selp.f32	%f131, 0f7F800000, %f125, %p54;
+	setp.eq.f32	%p55, %f131, 0f7F800000;
+	@%p55 bra 	BB23_56;
+
+	fma.rn.f32 	%f131, %f131, %f22, %f131;
+
+BB23_56:
+	setp.lt.f32	%p56, %f1, 0f00000000;
+	setp.eq.f32	%p57, %f19, 0f3F800000;
+	and.pred  	%p1, %p56, %p57;
+	mov.b32 	 %r29, %f131;
+	xor.b32  	%r30, %r29, -2147483648;
+	mov.b32 	 %f126, %r30;
+	selp.f32	%f132, %f126, %f131, %p1;
+	setp.eq.f32	%p58, %f1, 0f00000000;
+	@%p58 bra 	BB23_59;
+	bra.uni 	BB23_57;
+
+BB23_59:
+	add.f32 	%f128, %f1, %f1;
+	mov.b32 	 %r31, %f128;
+	selp.b32	%r32, %r31, 0, %p57;
+	or.b32  	%r33, %r32, 2139095040;
+	setp.lt.f32	%p62, %f2, 0f00000000;
+	selp.b32	%r34, %r33, %r32, %p62;
+	mov.b32 	 %f132, %r34;
+	bra.uni 	BB23_60;
 
-	setp.gt.s32	%p81, %r6, 10;
-	@%p81 bra 	BB12_88;
+BB23_35:
+	setp.gt.s32	%p11, %r11, 15;
+	@%p11 bra 	BB23_39;
+
+	setp.eq.s32	%p15, %r11, 14;
+	@%p15 bra 	BB23_48;
+	bra.uni 	BB23_37;
+
+BB23_48:
+	cvt.rni.s64.f32	%rd10, %f1;
+	cvt.rni.s64.f32	%rd11, %f2;
+	cvt.u32.u64	%r17, %rd10;
+	cvt.u32.u64	%r18, %rd11;
+	or.b32  	%r19, %r18, %r17;
+	setp.eq.s32	%p41, %r19, 0;
+	selp.f32	%f133, 0f00000000, 0f3F800000, %p41;
+	bra.uni 	BB23_70;
 
-	setp.eq.s32	%p85, %r6, 9;
-	@%p85 bra 	BB12_110;
-	bra.uni 	BB12_86;
+BB23_15:
+	setp.eq.s32	%p31, %r11, 2;
+	@%p31 bra 	BB23_68;
+	bra.uni 	BB23_16;
 
-BB12_110:
-	setp.eq.f64	%p114, %fd1, %fd68;
-	selp.f64	%fd106, 0d3FF0000000000000, 0d0000000000000000, %p114;
-	bra.uni 	BB12_129;
+BB23_68:
+	mul.f32 	%f133, %f1, %f2;
+	bra.uni 	BB23_70;
 
-BB12_11:
-	setp.gt.s32	%p20, %r6, 5;
-	@%p20 bra 	BB12_15;
+BB23_31:
+	setp.eq.s32	%p18, %r11, 11;
+	@%p18 bra 	BB23_50;
 
-	setp.eq.s32	%p24, %r6, 4;
-	@%p24 bra 	BB12_49;
-	bra.uni 	BB12_13;
+	setp.eq.s32	%p19, %r11, 12;
+	@%p19 bra 	BB23_49;
+	bra.uni 	BB23_33;
 
-BB12_49:
-	{
-	.reg .b32 %temp; 
-	mov.b64 	{%temp, %r2}, %fd68;
-	}
-	{
-	.reg .b32 %temp; 
-	mov.b64 	{%temp, %r3}, %fd1;
-	}
-	bfe.u32 	%r24, %r3, 20, 11;
+BB23_49:
+	max.f32 	%f133, %f1, %f2;
+	bra.uni 	BB23_70;
+
+BB23_22:
+	setp.eq.s32	%p25, %r11, 6;
+	@%p25 bra 	BB23_53;
+
+	setp.eq.s32	%p26, %r11, 7;
+	@%p26 bra 	BB23_52;
+	bra.uni 	BB23_24;
+
+BB23_52:
+	setp.gt.f32	%p46, %f1, %f2;
+	selp.f32	%f133, 0f3F800000, 0f00000000, %p46;
+	bra.uni 	BB23_70;
+
+BB23_39:
+	setp.eq.s32	%p12, %r11, 16;
+	@%p12 bra 	BB23_47;
+
+	setp.eq.s32	%p13, %r11, 17;
+	@%p13 bra 	BB23_44;
+	bra.uni 	BB23_41;
+
+BB23_44:
+	setp.eq.f32	%p36, %f2, 0f00000000;
+	setp.eq.f32	%p37, %f2, 0f80000000;
+	or.pred  	%p38, %p36, %p37;
+	mov.f32 	%f133, 0f7FC00000;
+	@%p38 bra 	BB23_70;
+
+	div.rn.f32 	%f133, %f1, %f2;
+	abs.f32 	%f43, %f133;
+	setp.geu.f32	%p39, %f43, 0f7F800000;
+	@%p39 bra 	BB23_70;
+
+	cvt.rmi.f32.f32	%f44, %f133;
+	mul.f32 	%f45, %f2, %f44;
+	sub.f32 	%f133, %f1, %f45;
+	bra.uni 	BB23_70;
+
+BB23_13:
+	setp.eq.s32	%p34, %r11, 1;
+	@%p34 bra 	BB23_14;
+	bra.uni 	BB23_70;
+
+BB23_14:
+	sub.f32 	%f133, %f1, %f2;
+	bra.uni 	BB23_70;
+
+BB23_29:
+	setp.eq.s32	%p22, %r11, 10;
+	@%p22 bra 	BB23_30;
+	bra.uni 	BB23_70;
+
+BB23_30:
+	setp.neu.f32	%p43, %f1, %f2;
+	selp.f32	%f133, 0f3F800000, 0f00000000, %p43;
+	bra.uni 	BB23_70;
+
+BB23_20:
+	setp.eq.s32	%p29, %r11, 5;
+	@%p29 bra 	BB23_21;
+	bra.uni 	BB23_70;
+
+BB23_21:
+	setp.lt.f32	%p48, %f1, %f2;
+	selp.f32	%f133, 0f3F800000, 0f00000000, %p48;
+	bra.uni 	BB23_70;
+
+BB23_37:
+	setp.eq.s32	%p16, %r11, 15;
+	@%p16 bra 	BB23_38;
+	bra.uni 	BB23_70;
+
+BB23_38:
+	mul.f32 	%f47, %f1, %f2;
+	mov.f32 	%f48, 0f3F800000;
+	sub.f32 	%f133, %f48, %f47;
+	bra.uni 	BB23_70;
+
+BB23_16:
+	setp.eq.s32	%p32, %r11, 3;
+	@%p32 bra 	BB23_17;
+	bra.uni 	BB23_70;
+
+BB23_17:
+	div.rn.f32 	%f133, %f1, %f2;
+	bra.uni 	BB23_70;
+
+BB23_50:
+	min.f32 	%f133, %f1, %f2;
+	bra.uni 	BB23_70;
+
+BB23_33:
+	setp.eq.s32	%p20, %r11, 13;
+	@%p20 bra 	BB23_34;
+	bra.uni 	BB23_70;
+
+BB23_34:
+	cvt.rni.s64.f32	%rd12, %f1;
+	cvt.rni.s64.f32	%rd13, %f2;
+	cvt.u32.u64	%r20, %rd12;
+	cvt.u32.u64	%r21, %rd13;
+	and.b32  	%r22, %r21, %r20;
+	setp.eq.s32	%p42, %r22, 0;
+	selp.f32	%f133, 0f00000000, 0f3F800000, %p42;
+	bra.uni 	BB23_70;
+
+BB23_53:
+	setp.gtu.f32	%p47, %f1, %f2;
+	selp.f32	%f133, 0f00000000, 0f3F800000, %p47;
+	bra.uni 	BB23_70;
+
+BB23_24:
+	setp.eq.s32	%p27, %r11, 8;
+	@%p27 bra 	BB23_25;
+	bra.uni 	BB23_70;
+
+BB23_25:
+	setp.ltu.f32	%p45, %f1, %f2;
+	selp.f32	%f133, 0f00000000, 0f3F800000, %p45;
+	bra.uni 	BB23_70;
+
+BB23_47:
+	setp.neu.f32	%p40, %f1, 0f00000000;
+	sub.f32 	%f46, %f1, %f2;
+	selp.f32	%f133, %f46, 0f00000000, %p40;
+	bra.uni 	BB23_70;
+
+BB23_41:
+	setp.ne.s32	%p14, %r11, 18;
+	@%p14 bra 	BB23_70;
+
+	div.rn.f32 	%f133, %f1, %f2;
+	abs.f32 	%f41, %f133;
+	setp.geu.f32	%p35, %f41, 0f7F800000;
+	@%p35 bra 	BB23_70;
+
+	cvt.rmi.f32.f32	%f133, %f133;
+	bra.uni 	BB23_70;
+
+BB23_57:
+	setp.geu.f32	%p59, %f1, 0f00000000;
+	@%p59 bra 	BB23_60;
+
+	cvt.rzi.f32.f32	%f127, %f2;
+	setp.neu.f32	%p60, %f127, %f2;
+	selp.f32	%f132, 0f7FFFFFFF, %f132, %p60;
+
+BB23_60:
+	add.f32 	%f129, %f20, %f21;
+	mov.b32 	 %r35, %f129;
+	setp.lt.s32	%p63, %r35, 2139095040;
+	@%p63 bra 	BB23_67;
+
+	setp.gtu.f32	%p64, %f20, 0f7F800000;
+	setp.gtu.f32	%p65, %f21, 0f7F800000;
+	or.pred  	%p66, %p64, %p65;
+	@%p66 bra 	BB23_66;
+	bra.uni 	BB23_62;
+
+BB23_66:
+	add.f32 	%f132, %f1, %f2;
+	bra.uni 	BB23_67;
+
+BB23_62:
+	setp.eq.f32	%p67, %f21, 0f7F800000;
+	@%p67 bra 	BB23_65;
+	bra.uni 	BB23_63;
+
+BB23_65:
+	setp.gt.f32	%p70, %f20, 0f3F800000;
+	selp.b32	%r39, 2139095040, 0, %p70;
+	xor.b32  	%r40, %r39, 2139095040;
+	setp.lt.f32	%p71, %f2, 0f00000000;
+	selp.b32	%r41, %r40, %r39, %p71;
+	mov.b32 	 %f130, %r41;
+	setp.eq.f32	%p72, %f1, 0fBF800000;
+	selp.f32	%f132, 0f3F800000, %f130, %p72;
+	bra.uni 	BB23_67;
+
+BB23_63:
+	setp.neu.f32	%p68, %f20, 0f7F800000;
+	@%p68 bra 	BB23_67;
+
+	setp.ltu.f32	%p69, %f2, 0f00000000;
+	selp.b32	%r36, 0, 2139095040, %p69;
+	or.b32  	%r37, %r36, -2147483648;
+	selp.b32	%r38, %r37, %r36, %p1;
+	mov.b32 	 %f132, %r38;
+
+BB23_67:
+	setp.eq.f32	%p73, %f2, 0f00000000;
+	setp.eq.f32	%p74, %f1, 0f3F800000;
+	or.pred  	%p75, %p74, %p73;
+	selp.f32	%f133, 0f3F800000, %f132, %p75;
+
+BB23_70:
+	cvta.to.global.u64 	%rd14, %rd3;
+	mul.wide.s32 	%rd15, %r3, 4;
+	add.s64 	%rd16, %rd14, %rd15;
+	st.global.f32 	[%rd16], %f133;
+	bar.sync 	0;
+
+BB23_71:
+	ret;
+}
+
+	// .globl	matrix_scalar_op_d
+.visible .entry matrix_scalar_op_d(
+	.param .u64 matrix_scalar_op_d_param_0,
+	.param .f64 matrix_scalar_op_d_param_1,
+	.param .u64 matrix_scalar_op_d_param_2,
+	.param .u32 matrix_scalar_op_d_param_3,
+	.param .u32 matrix_scalar_op_d_param_4,
+	.param .u32 matrix_scalar_op_d_param_5
+)
+{
+	.reg .pred 	%p<133>;
+	.reg .b32 	%r<88>;
+	.reg .f64 	%fd<109>;
+	.reg .b64 	%rd<20>;
+
+
+	ld.param.u64 	%rd4, [matrix_scalar_op_d_param_0];
+	ld.param.f64 	%fd68, [matrix_scalar_op_d_param_1];
+	ld.param.u64 	%rd5, [matrix_scalar_op_d_param_2];
+	ld.param.u32 	%r8, [matrix_scalar_op_d_param_3];
+	ld.param.u32 	%r6, [matrix_scalar_op_d_param_4];
+	ld.param.u32 	%r7, [matrix_scalar_op_d_param_5];
+	mov.u32 	%r9, %ntid.x;
+	mov.u32 	%r10, %ctaid.x;
+	mov.u32 	%r11, %tid.x;
+	mad.lo.s32 	%r1, %r9, %r10, %r11;
+	setp.ge.s32	%p3, %r1, %r8;
+	@%p3 bra 	BB24_138;
+
+	cvta.to.global.u64 	%rd6, %rd5;
+	cvta.to.global.u64 	%rd7, %rd4;
+	mul.wide.s32 	%rd8, %r1, 8;
+	add.s64 	%rd9, %rd7, %rd8;
+	ld.global.f64 	%fd1, [%rd9];
+	add.s64 	%rd1, %rd6, %rd8;
+	setp.eq.s32	%p4, %r7, 0;
+	@%p4 bra 	BB24_70;
+
+	mov.f64 	%fd99, 0d7FEFFFFFFFFFFFFF;
+	setp.gt.s32	%p5, %r6, 8;
+	@%p5 bra 	BB24_19;
+
+	setp.gt.s32	%p19, %r6, 3;
+	@%p19 bra 	BB24_11;
+
+	setp.gt.s32	%p26, %r6, 1;
+	@%p26 bra 	BB24_8;
+
+	setp.eq.s32	%p29, %r6, 0;
+	@%p29 bra 	BB24_68;
+	bra.uni 	BB24_6;
+
+BB24_68:
+	add.f64 	%fd99, %fd1, %fd68;
+	bra.uni 	BB24_69;
+
+BB24_70:
+	mov.f64 	%fd108, 0d7FEFFFFFFFFFFFFF;
+	setp.gt.s32	%p69, %r6, 8;
+	@%p69 bra 	BB24_87;
+
+	setp.gt.s32	%p83, %r6, 3;
+	@%p83 bra 	BB24_79;
+
+	setp.gt.s32	%p90, %r6, 1;
+	@%p90 bra 	BB24_76;
+
+	setp.eq.s32	%p93, %r6, 0;
+	@%p93 bra 	BB24_136;
+	bra.uni 	BB24_74;
+
+BB24_136:
+	add.f64 	%fd108, %fd1, %fd68;
+	bra.uni 	BB24_137;
+
+BB24_19:
+	setp.gt.s32	%p6, %r6, 13;
+	@%p6 bra 	BB24_28;
+
+	setp.gt.s32	%p13, %r6, 10;
+	@%p13 bra 	BB24_24;
+
+	setp.eq.s32	%p17, %r6, 9;
+	@%p17 bra 	BB24_48;
+	bra.uni 	BB24_22;
+
+BB24_48:
+	setp.eq.f64	%p44, %fd1, %fd68;
+	selp.f64	%fd99, 0d3FF0000000000000, 0d0000000000000000, %p44;
+	bra.uni 	BB24_69;
+
+BB24_87:
+	setp.gt.s32	%p70, %r6, 13;
+	@%p70 bra 	BB24_96;
+
+	setp.gt.s32	%p77, %r6, 10;
+	@%p77 bra 	BB24_92;
+
+	setp.eq.s32	%p81, %r6, 9;
+	@%p81 bra 	BB24_116;
+	bra.uni 	BB24_90;
+
+BB24_116:
+	setp.eq.f64	%p108, %fd1, %fd68;
+	selp.f64	%fd108, 0d3FF0000000000000, 0d0000000000000000, %p108;
+	bra.uni 	BB24_137;
+
+BB24_11:
+	setp.gt.s32	%p20, %r6, 5;
+	@%p20 bra 	BB24_15;
+
+	setp.eq.s32	%p24, %r6, 4;
+	@%p24 bra 	BB24_51;
+	bra.uni 	BB24_13;
+
+BB24_51:
+	{
+	.reg .b32 %temp; 
+	mov.b64 	{%temp, %r2}, %fd68;
+	}
+	{
+	.reg .b32 %temp; 
+	mov.b64 	{%temp, %r3}, %fd1;
+	}
+	bfe.u32 	%r24, %r3, 20, 11;
 	add.s32 	%r25, %r24, -1012;
 	mov.b64 	 %rd14, %fd1;
 	shl.b64 	%rd2, %rd14, %r25;
-	setp.eq.s64	%p51, %rd2, -9223372036854775808;
+	setp.eq.s64	%p49, %rd2, -9223372036854775808;
 	abs.f64 	%fd18, %fd68;
 	// Callseq Start 1
 	{
@@ -1310,69 +2392,69 @@ BB12_49:
 	param0, 
 	param1
 	);
-	ld.param.f64	%fd97, [retval0+0];
+	ld.param.f64	%fd98, [retval0+0];
 	
 	//{
 	}// Callseq End 1
-	setp.lt.s32	%p52, %r2, 0;
-	and.pred  	%p1, %p52, %p51;
-	@!%p1 bra 	BB12_51;
-	bra.uni 	BB12_50;
+	setp.lt.s32	%p50, %r2, 0;
+	and.pred  	%p1, %p50, %p49;
+	@!%p1 bra 	BB24_53;
+	bra.uni 	BB24_52;
 
-BB12_50:
+BB24_52:
 	{
 	.reg .b32 %temp; 
-	mov.b64 	{%temp, %r26}, %fd97;
+	mov.b64 	{%temp, %r26}, %fd98;
 	}
 	xor.b32  	%r27, %r26, -2147483648;
 	{
 	.reg .b32 %temp; 
-	mov.b64 	{%r28, %temp}, %fd97;
+	mov.b64 	{%r28, %temp}, %fd98;
 	}
-	mov.b64 	%fd97, {%r28, %r27};
+	mov.b64 	%fd98, {%r28, %r27};
 
-BB12_51:
-	mov.f64 	%fd96, %fd97;
-	setp.eq.f64	%p53, %fd68, 0d0000000000000000;
-	@%p53 bra 	BB12_54;
-	bra.uni 	BB12_52;
+BB24_53:
+	mov.f64 	%fd97, %fd98;
+	setp.eq.f64	%p51, %fd68, 0d0000000000000000;
+	@%p51 bra 	BB24_56;
+	bra.uni 	BB24_54;
 
-BB12_54:
-	selp.b32	%r29, %r2, 0, %p51;
+BB24_56:
+	selp.b32	%r29, %r2, 0, %p49;
 	or.b32  	%r30, %r29, 2146435072;
-	setp.lt.s32	%p57, %r3, 0;
-	selp.b32	%r31, %r30, %r29, %p57;
+	setp.lt.s32	%p55, %r3, 0;
+	selp.b32	%r31, %r30, %r29, %p55;
 	mov.u32 	%r32, 0;
-	mov.b64 	%fd96, {%r32, %r31};
-	bra.uni 	BB12_55;
+	mov.b64 	%fd97, {%r32, %r31};
+	bra.uni 	BB24_57;
 
-BB12_28:
+BB24_28:
 	setp.gt.s32	%p7, %r6, 15;
-	@%p7 bra 	BB12_32;
+	@%p7 bra 	BB24_32;
 
 	setp.eq.s32	%p11, %r6, 14;
-	@%p11 bra 	BB12_43;
-	bra.uni 	BB12_30;
+	@%p11 bra 	BB24_45;
+	bra.uni 	BB24_30;
 
-BB12_43:
+BB24_45:
 	cvt.rni.s64.f64	%rd10, %fd68;
 	cvt.rni.s64.f64	%rd11, %fd1;
 	cvt.u32.u64	%r18, %rd10;
 	cvt.u32.u64	%r19, %rd11;
 	or.b32  	%r20, %r19, %r18;
-	setp.eq.s32	%p43, %r20, 0;
-	selp.f64	%fd98, 0d0000000000000000, 0d3FF0000000000000, %p43;
-	bra.uni 	BB12_65;
+	setp.eq.s32	%p41, %r20, 0;
+	selp.f64	%fd99, 0d0000000000000000, 0d3FF0000000000000, %p41;
+	bra.uni 	BB24_69;
 
-BB12_75:
-	setp.gt.s32	%p88, %r6, 5;
-	@%p88 bra 	BB12_79;
+BB24_79:
+	setp.gt.s32	%p84, %r6, 5;
+	@%p84 bra 	BB24_83;
 
-	setp.eq.s32	%p92, %r6, 4;
-	@%p92 bra 	BB12_113;
-	bra.uni 	BB12_77;
+	setp.eq.s32	%p88, %r6, 4;
+	@%p88 bra 	BB24_119;
+	bra.uni 	BB24_81;
 
-BB12_113:
+BB24_119:
 	{
 	.reg .b32 %temp; 
 	mov.b64 	{%temp, %r4}, %fd1;
@@ -1381,11 +2463,11 @@ BB12_113:
 	.reg .b32 %temp; 
 	mov.b64 	{%temp, %r5}, %fd68;
 	}
-	bfe.u32 	%r61, %r5, 20, 11;
-	add.s32 	%r62, %r61, -1012;
+	bfe.u32 	%r62, %r5, 20, 11;
+	add.s32 	%r63, %r62, -1012;
 	mov.b64 	 %rd19, %fd68;
-	shl.b64 	%rd3, %rd19, %r62;
-	setp.eq.s64	%p119, %rd3, -9223372036854775808;
+	shl.b64 	%rd3, %rd19, %r63;
+	setp.eq.s64	%p113, %rd3, -9223372036854775808;
 	abs.f64 	%fd51, %fd1;
 	// Callseq Start 2
 	{
@@ -1402,621 +2484,1482 @@ BB12_113:
 	param0, 
 	param1
 	);
-	ld.param.f64	%fd105, [retval0+0];
+	ld.param.f64	%fd107, [retval0+0];
 	
 	//{
 	}// Callseq End 2
-	setp.lt.s32	%p120, %r4, 0;
-	and.pred  	%p2, %p120, %p119;
-	@!%p2 bra 	BB12_115;
-	bra.uni 	BB12_114;
+	setp.lt.s32	%p114, %r4, 0;
+	and.pred  	%p2, %p114, %p113;
+	@!%p2 bra 	BB24_121;
+	bra.uni 	BB24_120;
 
-BB12_114:
+BB24_120:
 	{
 	.reg .b32 %temp; 
-	mov.b64 	{%temp, %r63}, %fd105;
+	mov.b64 	{%temp, %r64}, %fd107;
 	}
-	xor.b32  	%r64, %r63, -2147483648;
+	xor.b32  	%r65, %r64, -2147483648;
 	{
 	.reg .b32 %temp; 
-	mov.b64 	{%r65, %temp}, %fd105;
+	mov.b64 	{%r66, %temp}, %fd107;
 	}
-	mov.b64 	%fd105, {%r65, %r64};
-
-BB12_115:
-	mov.f64 	%fd104, %fd105;
-	setp.eq.f64	%p121, %fd1, 0d0000000000000000;
-	@%p121 bra 	BB12_118;
-	bra.uni 	BB12_116;
-
-BB12_118:
-	selp.b32	%r66, %r4, 0, %p119;
-	or.b32  	%r67, %r66, 2146435072;
-	setp.lt.s32	%p125, %r5, 0;
-	selp.b32	%r68, %r67, %r66, %p125;
-	mov.u32 	%r69, 0;
-	mov.b64 	%fd104, {%r69, %r68};
-	bra.uni 	BB12_119;
-
-BB12_92:
-	setp.gt.s32	%p75, %r6, 15;
-	@%p75 bra 	BB12_96;
-
-	setp.eq.s32	%p79, %r6, 14;
-	@%p79 bra 	BB12_107;
-	bra.uni 	BB12_94;
-
-BB12_107:
+	mov.b64 	%fd107, {%r66, %r65};
+
+BB24_121:
+	mov.f64 	%fd106, %fd107;
+	setp.eq.f64	%p115, %fd1, 0d0000000000000000;
+	@%p115 bra 	BB24_124;
+	bra.uni 	BB24_122;
+
+BB24_124:
+	selp.b32	%r67, %r4, 0, %p113;
+	or.b32  	%r68, %r67, 2146435072;
+	setp.lt.s32	%p119, %r5, 0;
+	selp.b32	%r69, %r68, %r67, %p119;
+	mov.u32 	%r70, 0;
+	mov.b64 	%fd106, {%r70, %r69};
+	bra.uni 	BB24_125;
+
+BB24_96:
+	setp.gt.s32	%p71, %r6, 15;
+	@%p71 bra 	BB24_100;
+
+	setp.eq.s32	%p75, %r6, 14;
+	@%p75 bra 	BB24_113;
+	bra.uni 	BB24_98;
+
+BB24_113:
 	cvt.rni.s64.f64	%rd15, %fd1;
 	cvt.rni.s64.f64	%rd16, %fd68;
-	cvt.u32.u64	%r55, %rd15;
-	cvt.u32.u64	%r56, %rd16;
-	or.b32  	%r57, %r56, %r55;
-	setp.eq.s32	%p111, %r57, 0;
-	selp.f64	%fd106, 0d0000000000000000, 0d3FF0000000000000, %p111;
-	bra.uni 	BB12_129;
-
-BB12_8:
+	cvt.u32.u64	%r56, %rd15;
+	cvt.u32.u64	%r57, %rd16;
+	or.b32  	%r58, %r57, %r56;
+	setp.eq.s32	%p105, %r58, 0;
+	selp.f64	%fd108, 0d0000000000000000, 0d3FF0000000000000, %p105;
+	bra.uni 	BB24_137;
+
+BB24_8:
 	setp.eq.s32	%p27, %r6, 2;
-	@%p27 bra 	BB12_63;
-	bra.uni 	BB12_9;
+	@%p27 bra 	BB24_67;
+	bra.uni 	BB24_9;
 
-BB12_63:
-	mul.f64 	%fd98, %fd1, %fd68;
-	bra.uni 	BB12_65;
+BB24_67:
+	mul.f64 	%fd99, %fd1, %fd68;
+	bra.uni 	BB24_69;
 
-BB12_24:
+BB24_24:
 	setp.eq.s32	%p14, %r6, 11;
-	@%p14 bra 	BB12_45;
+	@%p14 bra 	BB24_47;
 
 	setp.eq.s32	%p15, %r6, 12;
-	@%p15 bra 	BB12_44;
-	bra.uni 	BB12_26;
+	@%p15 bra 	BB24_46;
+	bra.uni 	BB24_26;
 
-BB12_44:
-	max.f64 	%fd98, %fd68, %fd1;
-	bra.uni 	BB12_65;
+BB24_46:
+	max.f64 	%fd99, %fd68, %fd1;
+	bra.uni 	BB24_69;
 
-BB12_15:
+BB24_15:
 	setp.eq.s32	%p21, %r6, 6;
-	@%p21 bra 	BB12_48;
+	@%p21 bra 	BB24_50;
 
 	setp.eq.s32	%p22, %r6, 7;
-	@%p22 bra 	BB12_47;
-	bra.uni 	BB12_17;
+	@%p22 bra 	BB24_49;
+	bra.uni 	BB24_17;
 
-BB12_47:
-	setp.lt.f64	%p48, %fd1, %fd68;
-	selp.f64	%fd98, 0d3FF0000000000000, 0d0000000000000000, %p48;
-	bra.uni 	BB12_65;
+BB24_49:
+	setp.lt.f64	%p46, %fd1, %fd68;
+	selp.f64	%fd99, 0d3FF0000000000000, 0d0000000000000000, %p46;
+	bra.uni 	BB24_69;
 
-BB12_32:
+BB24_32:
 	setp.eq.s32	%p8, %r6, 16;
-	@%p8 bra 	BB12_42;
+	@%p8 bra 	BB24_44;
 
 	setp.eq.s32	%p9, %r6, 17;
-	@%p9 bra 	BB12_38;
-	bra.uni 	BB12_34;
+	@%p9 bra 	BB24_39;
+	bra.uni 	BB24_34;
 
-BB12_38:
-	setp.eq.f64	%p35, %fd1, 0d0000000000000000;
-	setp.eq.f64	%p36, %fd1, 0d8000000000000000;
-	or.pred  	%p37, %p35, %p36;
-	mov.f64 	%fd98, 0d7FF8000000000000;
-	@%p37 bra 	BB12_65;
+BB24_39:
+	setp.eq.f64	%p34, %fd1, 0d0000000000000000;
+	setp.eq.f64	%p35, %fd1, 0d8000000000000000;
+	or.pred  	%p36, %p34, %p35;
+	mov.f64 	%fd99, 0d7FF8000000000000;
+	@%p36 bra 	BB24_69;
 
-	div.rn.f64 	%fd98, %fd68, %fd1;
-	abs.f64 	%fd72, %fd98;
-	setp.gtu.f64	%p38, %fd72, 0d7FF0000000000000;
-	@%p38 bra 	BB12_65;
+	div.rn.f64 	%fd99, %fd68, %fd1;
+	abs.f64 	%fd72, %fd99;
+	setp.gtu.f64	%p37, %fd72, 0d7FF0000000000000;
+	@%p37 bra 	BB24_69;
 
 	{
 	.reg .b32 %temp; 
-	mov.b64 	{%r15, %temp}, %fd98;
+	mov.b64 	{%temp, %r15}, %fd99;
 	}
+	and.b32  	%r16, %r15, 2147483647;
+	setp.ne.s32	%p38, %r16, 2146435072;
+	@%p38 bra 	BB24_43;
+
 	{
 	.reg .b32 %temp; 
-	mov.b64 	{%temp, %r16}, %fd98;
+	mov.b64 	{%r17, %temp}, %fd99;
 	}
-	and.b32  	%r17, %r16, 2147483647;
-	setp.ne.s32	%p39, %r17, 2146435072;
-	setp.ne.s32	%p40, %r15, 0;
-	or.pred  	%p41, %p39, %p40;
-	@!%p41 bra 	BB12_65;
-	bra.uni 	BB12_41;
-
-BB12_41:
-	cvt.rmi.f64.f64	%fd73, %fd98;
+	setp.eq.s32	%p39, %r17, 0;
+	@%p39 bra 	BB24_69;
+
+BB24_43:
+	cvt.rmi.f64.f64	%fd73, %fd99;
 	mul.f64 	%fd74, %fd1, %fd73;
-	sub.f64 	%fd98, %fd68, %fd74;
-	bra.uni 	BB12_65;
-
-BB12_72:
-	setp.eq.s32	%p95, %r6, 2;
-	@%p95 bra 	BB12_127;
-	bra.uni 	BB12_73;
-
-BB12_127:
-	mul.f64 	%fd106, %fd1, %fd68;
-	bra.uni 	BB12_129;
-
-BB12_88:
-	setp.eq.s32	%p82, %r6, 11;
-	@%p82 bra 	BB12_109;
-
-	setp.eq.s32	%p83, %r6, 12;
-	@%p83 bra 	BB12_108;
-	bra.uni 	BB12_90;
-
-BB12_108:
-	max.f64 	%fd106, %fd1, %fd68;
-	bra.uni 	BB12_129;
-
-BB12_79:
-	setp.eq.s32	%p89, %r6, 6;
-	@%p89 bra 	BB12_112;
-
-	setp.eq.s32	%p90, %r6, 7;
-	@%p90 bra 	BB12_111;
-	bra.uni 	BB12_81;
-
-BB12_111:
-	setp.gt.f64	%p116, %fd1, %fd68;
-	selp.f64	%fd106, 0d3FF0000000000000, 0d0000000000000000, %p116;
-	bra.uni 	BB12_129;
-
-BB12_96:
-	setp.eq.s32	%p76, %r6, 16;
-	@%p76 bra 	BB12_106;
-
-	setp.eq.s32	%p77, %r6, 17;
-	@%p77 bra 	BB12_102;
-	bra.uni 	BB12_98;
-
-BB12_102:
-	setp.eq.f64	%p103, %fd68, 0d0000000000000000;
-	setp.eq.f64	%p104, %fd68, 0d8000000000000000;
-	or.pred  	%p105, %p103, %p104;
-	mov.f64 	%fd106, 0d7FF8000000000000;
-	@%p105 bra 	BB12_129;
-
-	div.rn.f64 	%fd106, %fd1, %fd68;
-	abs.f64 	%fd83, %fd106;
-	setp.gtu.f64	%p106, %fd83, 0d7FF0000000000000;
-	@%p106 bra 	BB12_129;
+	sub.f64 	%fd99, %fd68, %fd74;
+	bra.uni 	BB24_69;
+
+BB24_76:
+	setp.eq.s32	%p91, %r6, 2;
+	@%p91 bra 	BB24_135;
+	bra.uni 	BB24_77;
+
+BB24_135:
+	mul.f64 	%fd108, %fd1, %fd68;
+	bra.uni 	BB24_137;
+
+BB24_92:
+	setp.eq.s32	%p78, %r6, 11;
+	@%p78 bra 	BB24_115;
+
+	setp.eq.s32	%p79, %r6, 12;
+	@%p79 bra 	BB24_114;
+	bra.uni 	BB24_94;
+
+BB24_114:
+	max.f64 	%fd108, %fd1, %fd68;
+	bra.uni 	BB24_137;
+
+BB24_83:
+	setp.eq.s32	%p85, %r6, 6;
+	@%p85 bra 	BB24_118;
+
+	setp.eq.s32	%p86, %r6, 7;
+	@%p86 bra 	BB24_117;
+	bra.uni 	BB24_85;
+
+BB24_117:
+	setp.gt.f64	%p110, %fd1, %fd68;
+	selp.f64	%fd108, 0d3FF0000000000000, 0d0000000000000000, %p110;
+	bra.uni 	BB24_137;
+
+BB24_100:
+	setp.eq.s32	%p72, %r6, 16;
+	@%p72 bra 	BB24_112;
+
+	setp.eq.s32	%p73, %r6, 17;
+	@%p73 bra 	BB24_107;
+	bra.uni 	BB24_102;
+
+BB24_107:
+	setp.eq.f64	%p98, %fd68, 0d0000000000000000;
+	setp.eq.f64	%p99, %fd68, 0d8000000000000000;
+	or.pred  	%p100, %p98, %p99;
+	mov.f64 	%fd108, 0d7FF8000000000000;
+	@%p100 bra 	BB24_137;
+
+	div.rn.f64 	%fd108, %fd1, %fd68;
+	abs.f64 	%fd83, %fd108;
+	setp.gtu.f64	%p101, %fd83, 0d7FF0000000000000;
+	@%p101 bra 	BB24_137;
 
 	{
 	.reg .b32 %temp; 
-	mov.b64 	{%r52, %temp}, %fd106;
+	mov.b64 	{%temp, %r53}, %fd108;
 	}
+	and.b32  	%r54, %r53, 2147483647;
+	setp.ne.s32	%p102, %r54, 2146435072;
+	@%p102 bra 	BB24_111;
+
 	{
 	.reg .b32 %temp; 
-	mov.b64 	{%temp, %r53}, %fd106;
+	mov.b64 	{%r55, %temp}, %fd108;
 	}
-	and.b32  	%r54, %r53, 2147483647;
-	setp.ne.s32	%p107, %r54, 2146435072;
-	setp.ne.s32	%p108, %r52, 0;
-	or.pred  	%p109, %p107, %p108;
-	@!%p109 bra 	BB12_129;
-	bra.uni 	BB12_105;
-
-BB12_105:
-	cvt.rmi.f64.f64	%fd84, %fd106;
+	setp.eq.s32	%p103, %r55, 0;
+	@%p103 bra 	BB24_137;
+
+BB24_111:
+	cvt.rmi.f64.f64	%fd84, %fd108;
 	mul.f64 	%fd85, %fd84, %fd68;
-	sub.f64 	%fd106, %fd1, %fd85;
-	bra.uni 	BB12_129;
+	sub.f64 	%fd108, %fd1, %fd85;
+	bra.uni 	BB24_137;
 
-BB12_6:
+BB24_6:
 	setp.eq.s32	%p30, %r6, 1;
-	@%p30 bra 	BB12_7;
-	bra.uni 	BB12_65;
+	@%p30 bra 	BB24_7;
+	bra.uni 	BB24_69;
 
-BB12_7:
-	sub.f64 	%fd98, %fd68, %fd1;
-	bra.uni 	BB12_65;
+BB24_7:
+	sub.f64 	%fd99, %fd68, %fd1;
+	bra.uni 	BB24_69;
 
-BB12_22:
+BB24_22:
 	setp.eq.s32	%p18, %r6, 10;
-	@%p18 bra 	BB12_23;
-	bra.uni 	BB12_65;
+	@%p18 bra 	BB24_23;
+	bra.uni 	BB24_69;
 
-BB12_23:
-	setp.neu.f64	%p45, %fd1, %fd68;
-	selp.f64	%fd98, 0d3FF0000000000000, 0d0000000000000000, %p45;
-	bra.uni 	BB12_65;
+BB24_23:
+	setp.neu.f64	%p43, %fd1, %fd68;
+	selp.f64	%fd99, 0d3FF0000000000000, 0d0000000000000000, %p43;
+	bra.uni 	BB24_69;
 
-BB12_13:
+BB24_13:
 	setp.eq.s32	%p25, %r6, 5;
-	@%p25 bra 	BB12_14;
-	bra.uni 	BB12_65;
+	@%p25 bra 	BB24_14;
+	bra.uni 	BB24_69;
 
-BB12_14:
-	setp.gt.f64	%p50, %fd1, %fd68;
-	selp.f64	%fd98, 0d3FF0000000000000, 0d0000000000000000, %p50;
-	bra.uni 	BB12_65;
+BB24_14:
+	setp.gt.f64	%p48, %fd1, %fd68;
+	selp.f64	%fd99, 0d3FF0000000000000, 0d0000000000000000, %p48;
+	bra.uni 	BB24_69;
 
-BB12_30:
+BB24_30:
 	setp.eq.s32	%p12, %r6, 15;
-	@%p12 bra 	BB12_31;
-	bra.uni 	BB12_65;
+	@%p12 bra 	BB24_31;
+	bra.uni 	BB24_69;
 
-BB12_31:
+BB24_31:
 	mul.f64 	%fd76, %fd1, %fd68;
 	mov.f64 	%fd77, 0d3FF0000000000000;
-	sub.f64 	%fd98, %fd77, %fd76;
-	bra.uni 	BB12_65;
+	sub.f64 	%fd99, %fd77, %fd76;
+	bra.uni 	BB24_69;
 
-BB12_9:
+BB24_9:
 	setp.eq.s32	%p28, %r6, 3;
-	@%p28 bra 	BB12_10;
-	bra.uni 	BB12_65;
+	@%p28 bra 	BB24_10;
+	bra.uni 	BB24_69;
 
-BB12_10:
-	div.rn.f64 	%fd98, %fd68, %fd1;
-	bra.uni 	BB12_65;
+BB24_10:
+	div.rn.f64 	%fd99, %fd68, %fd1;
+	bra.uni 	BB24_69;
 
-BB12_45:
-	min.f64 	%fd98, %fd68, %fd1;
-	bra.uni 	BB12_65;
+BB24_47:
+	min.f64 	%fd99, %fd68, %fd1;
+	bra.uni 	BB24_69;
 
-BB12_26:
+BB24_26:
 	setp.eq.s32	%p16, %r6, 13;
-	@%p16 bra 	BB12_27;
-	bra.uni 	BB12_65;
+	@%p16 bra 	BB24_27;
+	bra.uni 	BB24_69;
 
-BB12_27:
+BB24_27:
 	cvt.rni.s64.f64	%rd12, %fd68;
 	cvt.rni.s64.f64	%rd13, %fd1;
 	cvt.u32.u64	%r21, %rd12;
 	cvt.u32.u64	%r22, %rd13;
 	and.b32  	%r23, %r22, %r21;
-	setp.eq.s32	%p44, %r23, 0;
-	selp.f64	%fd98, 0d0000000000000000, 0d3FF0000000000000, %p44;
-	bra.uni 	BB12_65;
+	setp.eq.s32	%p42, %r23, 0;
+	selp.f64	%fd99, 0d0000000000000000, 0d3FF0000000000000, %p42;
+	bra.uni 	BB24_69;
 
-BB12_48:

<TRUNCATED>

[32/50] [abbrv] systemml git commit: [MINOR] Enable single precision GPU tests

Posted by re...@apache.org.
[MINOR] Enable single precision GPU tests

Project: http://git-wip-us.apache.org/repos/asf/systemml/repo
Commit: http://git-wip-us.apache.org/repos/asf/systemml/commit/f0406746
Tree: http://git-wip-us.apache.org/repos/asf/systemml/tree/f0406746
Diff: http://git-wip-us.apache.org/repos/asf/systemml/diff/f0406746

Branch: refs/heads/master
Commit: f040674661ae818d0379abbcac624a726d3b3e3a
Parents: d3917ef
Author: Niketan Pansare <np...@us.ibm.com>
Authored: Wed Oct 25 20:29:55 2017 -0700
Committer: Niketan Pansare <np...@us.ibm.com>
Committed: Wed Oct 25 20:29:55 2017 -0700

----------------------------------------------------------------------
 src/test/java/org/apache/sysml/test/gpu/GPUTests.java | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/systemml/blob/f0406746/src/test/java/org/apache/sysml/test/gpu/GPUTests.java
----------------------------------------------------------------------
diff --git a/src/test/java/org/apache/sysml/test/gpu/GPUTests.java b/src/test/java/org/apache/sysml/test/gpu/GPUTests.java
index d7d1ad5..a83b110 100644
--- a/src/test/java/org/apache/sysml/test/gpu/GPUTests.java
+++ b/src/test/java/org/apache/sysml/test/gpu/GPUTests.java
@@ -55,7 +55,7 @@ public abstract class GPUTests extends AutomatedTestBase {
 	private static final boolean PRINT_MAT_ERROR = false;
 	
 	// We will use this flag until lower precision is supported on CP. 
-	private final static String DATA_TYPE = "double";  
+	private final static String FLOATING_POINT_PRECISION = "double";  
 	protected final double SINGLE_PRECISION_THRESHOLD = 1e-3;    // for relative error
 	
 	
@@ -75,9 +75,9 @@ public abstract class GPUTests extends AutomatedTestBase {
 	 * @return a valid threshold
 	 */
 	protected double getTHRESHOLD() {
-		if(DATA_TYPE.equals("double"))  return DOUBLE_PRECISION_THRESHOLD;
-		else if(DATA_TYPE.equals("float"))  return SINGLE_PRECISION_THRESHOLD;
-		else throw new RuntimeException("Unsupported datatype:" + DATA_TYPE);
+		if(FLOATING_POINT_PRECISION.equals("double"))  return DOUBLE_PRECISION_THRESHOLD;
+		else if(FLOATING_POINT_PRECISION.equals("single"))  return SINGLE_PRECISION_THRESHOLD;
+		else throw new RuntimeException("Unsupported precision:" + FLOATING_POINT_PRECISION);
 	}
 
 	@After
@@ -263,7 +263,7 @@ public abstract class GPUTests extends AutomatedTestBase {
 						format.format(
 								"Relative error(%f) is more than threshold (%f). Expected = %f, Actual = %f, differed at [%d, %d]",
 								relativeError, getTHRESHOLD(), expectedDouble, actualDouble, i, j);
-						if(DATA_TYPE.equals("double"))
+						if(FLOATING_POINT_PRECISION.equals("double"))
 							Assert.assertTrue(format.toString(), relativeError < getTHRESHOLD());
 						else
 							Assert.assertTrue(format.toString(), relativeError < getTHRESHOLD() || absoluteError < getTHRESHOLD());
@@ -324,7 +324,7 @@ public abstract class GPUTests extends AutomatedTestBase {
 	protected List<Object> runOnGPU(SparkSession spark, String scriptStr, Map<String, Object> inputs,
 			List<String> outStrs) {
 		MLContext gpuMLC = new MLContext(spark);
-		gpuMLC.setConfigProperty("sysml.gpu.dataType", DATA_TYPE);
+		gpuMLC.setConfigProperty("sysml.floating.point.precision", FLOATING_POINT_PRECISION);
 		gpuMLC.setGPU(true);
 		gpuMLC.setForceGPU(true);
 		gpuMLC.setStatistics(true);
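
For reference, a minimal usage sketch of selecting the floating-point precision via MLContext, following the calls shown in the diff above. The import path and the SparkSession setup are assumptions based on the SystemML MLContext API and may differ; only the configuration calls are taken from the diff.

```
import org.apache.spark.sql.SparkSession;
import org.apache.sysml.api.mlcontext.MLContext;

public class SinglePrecisionGpuExample {
  public static void main(String[] args) {
    SparkSession spark = SparkSession.builder()
        .appName("gpu-single-precision").master("local[*]").getOrCreate();
    MLContext mlc = new MLContext(spark);
    // select single precision for GPU operations (default is "double")
    mlc.setConfigProperty("sysml.floating.point.precision", "single");
    mlc.setGPU(true);
    mlc.setForceGPU(true);
    mlc.setStatistics(true);
    // ... build and run scripts via the MLContext API here ...
    spark.stop();
  }
}
```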


[47/50] [abbrv] systemml git commit: [SYSTEMML-1981] Fix graceful value type casts on function invocations

Posted by re...@apache.org.
[SYSTEMML-1981] Fix graceful value type casts on function invocations

The existing graceful value type casts - on function invocations with a
wrong value type - incorrectly took the value type of the bound input
operand instead of the declared function parameter type for comparison.
This patch fixes this casting issue, which avoids unnecessary warnings on
function invocations and recompilation exceptions for boolean parameters.
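
As a standalone illustration of the corrected behavior, the sketch below uses simplified, hypothetical types (the actual fix in FunctionCallCPInstruction below uses ScalarObjectFactory and the runtime's value types): a scalar argument is converted to the declared formal parameter type, not to the type of the call operand.

```
public class FormalTypeCastSketch {
  enum ValueType { INT, DOUBLE, BOOLEAN }

  // convert a scalar argument to the *declared* formal parameter type
  static Object castToFormal(Object value, ValueType formalType) {
    if (value instanceof Double && formalType == ValueType.INT)
      return Math.round((Double) value);        // double -> int (long)
    if (value instanceof Long && formalType == ValueType.DOUBLE)
      return ((Long) value).doubleValue();      // int -> double
    if (value instanceof Double && formalType == ValueType.BOOLEAN)
      return ((Double) value) != 0d;            // double -> boolean
    return value;                               // types already compatible
  }

  public static void main(String[] args) {
    // e.g., FunctionReturnInt.dml below binds xmax = max(X), a double,
    // to the int parameter 'input' of foo -- the cast targets INT:
    System.out.println(castToFormal(100.0, ValueType.INT)); // 100
  }
}
```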


Project: http://git-wip-us.apache.org/repos/asf/systemml/repo
Commit: http://git-wip-us.apache.org/repos/asf/systemml/commit/a2f0598c
Tree: http://git-wip-us.apache.org/repos/asf/systemml/tree/a2f0598c
Diff: http://git-wip-us.apache.org/repos/asf/systemml/diff/a2f0598c

Branch: refs/heads/master
Commit: a2f0598c606db16e75790cdbc3dbe37dc32d89a0
Parents: fc47891
Author: Matthias Boehm <mb...@gmail.com>
Authored: Wed Nov 1 21:37:49 2017 -0700
Committer: Matthias Boehm <mb...@gmail.com>
Committed: Thu Nov 2 00:39:16 2017 -0700

----------------------------------------------------------------------
 .../cp/FunctionCallCPInstruction.java           | 29 ++++----
 .../functions/misc/FunctionReturnTest.java      | 78 ++++++++++++++++++++
 .../functions/misc/FunctionReturnBoolean.dml    | 34 +++++++++
 .../functions/misc/FunctionReturnInt.dml        | 34 +++++++++
 .../functions/misc/ZPackageSuite.java           |  1 +
 5 files changed, 160 insertions(+), 16 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/systemml/blob/a2f0598c/src/main/java/org/apache/sysml/runtime/instructions/cp/FunctionCallCPInstruction.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/instructions/cp/FunctionCallCPInstruction.java b/src/main/java/org/apache/sysml/runtime/instructions/cp/FunctionCallCPInstruction.java
index 402d4a5..b901dfc 100644
--- a/src/main/java/org/apache/sysml/runtime/instructions/cp/FunctionCallCPInstruction.java
+++ b/src/main/java/org/apache/sysml/runtime/instructions/cp/FunctionCallCPInstruction.java
@@ -60,7 +60,6 @@ public class FunctionCallCPInstruction extends CPInstruction {
 	private FunctionCallCPInstruction(String namespace, String functName, ArrayList<CPOperand> boundInParamOperands,
 			ArrayList<String> boundInParamNames, ArrayList<String> boundOutParamNames, String istr) {
 		super(null, functName, istr);
-
 		_cptype = CPINSTRUCTION_TYPE.External;
 		_functionName = functName;
 		_namespace = namespace;
@@ -72,7 +71,7 @@ public class FunctionCallCPInstruction extends CPInstruction {
 
 	public static FunctionCallCPInstruction parseInstruction(String str) 
 		throws DMLRuntimeException 
-	{	
+	{
 		//schema: extfunct, fname, num inputs, num outputs, inputs, outputs
 		String[] parts = InstructionUtils.getInstructionPartsWithValueType ( str );
 		String namespace = parts[1];
@@ -94,8 +93,7 @@ public class FunctionCallCPInstruction extends CPInstruction {
 		return new FunctionCallCPInstruction ( namespace,functionName, 
 				boundInParamOperands, boundInParamNames, boundOutParamNames, str );
 	}
-
-		
+	
 	@Override
 	public Instruction preprocessInstruction(ExecutionContext ec)
 		throws DMLRuntimeException 
@@ -114,7 +112,7 @@ public class FunctionCallCPInstruction extends CPInstruction {
 	@Override
 	public void processInstruction(ExecutionContext ec) 
 		throws DMLRuntimeException 
-	{		
+	{
 		if( LOG.isTraceEnabled() ){
 			LOG.trace("Executing instruction : " + this.toString());
 		}
@@ -130,19 +128,19 @@ public class FunctionCallCPInstruction extends CPInstruction {
 		
 		// create bindings to formal parameters for given function call
 		// These are the bindings passed to the FunctionProgramBlock for function execution 
-		LocalVariableMap functionVariables = new LocalVariableMap();		
+		LocalVariableMap functionVariables = new LocalVariableMap();
 		for( int i=0; i<fpb.getInputParams().size(); i++) 
-		{				
+		{
 			DataIdentifier currFormalParam = fpb.getInputParams().get(i);
 			String currFormalParamName = currFormalParam.getName();
 			Data currFormalParamValue = null; 
-				
+			
 			CPOperand operand = _boundInputParamOperands.get(i);
 			String varname = operand.getName();
 			//error handling non-existing variables
 			if( !operand.isLiteral() && !ec.containsVariable(varname) ) {
 				throw new DMLRuntimeException("Input variable '"+varname+"' not existing on call of " + 
-						DMLProgram.constructFunctionKey(_namespace, _functionName) + " (line "+getLineNum()+").");
+					DMLProgram.constructFunctionKey(_namespace, _functionName) + " (line "+getLineNum()+").");
 			}
 			//get input matrix/frame/scalar
 			currFormalParamValue = (operand.getDataType()!=DataType.SCALAR) ? ec.getVariable(varname) : 
@@ -150,19 +148,18 @@ public class FunctionCallCPInstruction extends CPInstruction {
 			
 			//graceful value type conversion for scalar inputs with wrong type
 			if( currFormalParamValue.getDataType() == DataType.SCALAR
-				&& currFormalParamValue.getValueType() != operand.getValueType() )
+				&& currFormalParamValue.getValueType() != currFormalParam.getValueType() ) 
 			{
-				ScalarObject so = (ScalarObject) currFormalParamValue;
-				currFormalParamValue = ScalarObjectFactory
-					.createScalarObject(operand.getValueType(), so);
+				currFormalParamValue = ScalarObjectFactory.createScalarObject(
+					currFormalParam.getValueType(), (ScalarObject) currFormalParamValue);
 			}
 			
-			functionVariables.put(currFormalParamName, currFormalParamValue);						
+			functionVariables.put(currFormalParamName, currFormalParamValue);
 		}
 		
 		// Pin the input variables so that they do not get deleted 
 		// from pb's symbol table at the end of execution of function
-	    HashMap<String,Boolean> pinStatus = ec.pinVariables(_boundInputParamNames);
+		HashMap<String,Boolean> pinStatus = ec.pinVariables(_boundInputParamNames);
 		
 		// Create a symbol table under a new execution context for the function invocation,
 		// and copy the function arguments into the created table. 
@@ -185,7 +182,7 @@ public class FunctionCallCPInstruction extends CPInstruction {
 			String fname = DMLProgram.constructFunctionKey(_namespace, _functionName);
 			throw new DMLRuntimeException("error executing function " + fname, e);
 		}
-		LocalVariableMap retVars = fn_ec.getVariables();  
+		LocalVariableMap retVars = fn_ec.getVariables();
 		
 		// cleanup all returned variables w/o binding 
 		Collection<String> retVarnames = new LinkedList<>(retVars.keySet());

http://git-wip-us.apache.org/repos/asf/systemml/blob/a2f0598c/src/test/java/org/apache/sysml/test/integration/functions/misc/FunctionReturnTest.java
----------------------------------------------------------------------
diff --git a/src/test/java/org/apache/sysml/test/integration/functions/misc/FunctionReturnTest.java b/src/test/java/org/apache/sysml/test/integration/functions/misc/FunctionReturnTest.java
new file mode 100644
index 0000000..b83ac39
--- /dev/null
+++ b/src/test/java/org/apache/sysml/test/integration/functions/misc/FunctionReturnTest.java
@@ -0,0 +1,78 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ * 
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.sysml.test.integration.functions.misc;
+
+import org.junit.Test;
+
+import org.apache.sysml.hops.OptimizerUtils;
+import org.apache.sysml.test.integration.AutomatedTestBase;
+import org.apache.sysml.test.integration.TestConfiguration;
+
+public class FunctionReturnTest extends AutomatedTestBase 
+{
+	private final static String TEST_DIR = "functions/misc/";
+	private final static String TEST_NAME1 = "FunctionReturnInt";
+	private final static String TEST_NAME2 = "FunctionReturnBoolean";
+	private final static String TEST_CLASS_DIR = TEST_DIR + FunctionReturnTest.class.getSimpleName() + "/";
+	
+	@Override
+	public void setUp() {
+		addTestConfiguration( TEST_NAME1, new TestConfiguration(TEST_CLASS_DIR, TEST_NAME1, new String[] { "Rout" }) );
+		addTestConfiguration( TEST_NAME2, new TestConfiguration(TEST_CLASS_DIR, TEST_NAME2, new String[] { "Rout" }) );
+	}
+
+	@Test
+	public void testFunctionReturnInt() {
+		runFunctionReturnTest(TEST_NAME1, false);
+	}
+	
+	@Test
+	public void testFunctionReturnBool() {
+		runFunctionReturnTest(TEST_NAME2, false);
+	}
+	
+	@Test
+	public void testFunctionReturnIntIPA() {
+		runFunctionReturnTest(TEST_NAME1, true);
+	}
+	
+	@Test
+	public void testFunctionReturnBoolIPA() {
+		runFunctionReturnTest(TEST_NAME2, true);
+	}
+
+	private void runFunctionReturnTest( String testname, boolean IPA ) {
+		boolean oldIPA = OptimizerUtils.ALLOW_INTER_PROCEDURAL_ANALYSIS;
+		OptimizerUtils.ALLOW_INTER_PROCEDURAL_ANALYSIS = IPA;
+		try {
+			TestConfiguration config = getTestConfiguration(testname);
+			loadTestConfiguration(config);
+			
+			String HOME = SCRIPT_DIR + TEST_DIR;
+			fullDMLScriptName = HOME + testname + ".dml";
+			programArgs = new String[]{"-explain"};
+	
+			runTest(true, false, null, -1); 
+		}
+		finally {
+			OptimizerUtils.ALLOW_INTER_PROCEDURAL_ANALYSIS = oldIPA;
+		}
+	}
+}

http://git-wip-us.apache.org/repos/asf/systemml/blob/a2f0598c/src/test/scripts/functions/misc/FunctionReturnBoolean.dml
----------------------------------------------------------------------
diff --git a/src/test/scripts/functions/misc/FunctionReturnBoolean.dml b/src/test/scripts/functions/misc/FunctionReturnBoolean.dml
new file mode 100644
index 0000000..afa563b
--- /dev/null
+++ b/src/test/scripts/functions/misc/FunctionReturnBoolean.dml
@@ -0,0 +1,34 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+# 
+#   http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+
+foo = function(Matrix[double] X, boolean input) return (boolean out) {
+  out = input;
+  for(i in 1:2) {
+    tmp = sum(X);
+    out = input & as.logical(tmp);
+  }
+}
+
+X = seq(1,100);
+xmax = max(X);
+y = foo(X, xmax);
+print(y);

http://git-wip-us.apache.org/repos/asf/systemml/blob/a2f0598c/src/test/scripts/functions/misc/FunctionReturnInt.dml
----------------------------------------------------------------------
diff --git a/src/test/scripts/functions/misc/FunctionReturnInt.dml b/src/test/scripts/functions/misc/FunctionReturnInt.dml
new file mode 100644
index 0000000..ba916a4
--- /dev/null
+++ b/src/test/scripts/functions/misc/FunctionReturnInt.dml
@@ -0,0 +1,34 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+# 
+#   http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+
+foo = function(Matrix[double] X, int input) return (int out) {
+  out = input;
+  for(i in 1:2) {
+    tmp = sum(X);
+    out = input + as.integer(tmp);
+  }
+}
+
+X = seq(1,100);
+xmax = max(X);
+y = foo(X, xmax);
+print(y);

http://git-wip-us.apache.org/repos/asf/systemml/blob/a2f0598c/src/test_suites/java/org/apache/sysml/test/integration/functions/misc/ZPackageSuite.java
----------------------------------------------------------------------
diff --git a/src/test_suites/java/org/apache/sysml/test/integration/functions/misc/ZPackageSuite.java b/src/test_suites/java/org/apache/sysml/test/integration/functions/misc/ZPackageSuite.java
index cac39e1..a453cbd 100644
--- a/src/test_suites/java/org/apache/sysml/test/integration/functions/misc/ZPackageSuite.java
+++ b/src/test_suites/java/org/apache/sysml/test/integration/functions/misc/ZPackageSuite.java
@@ -31,6 +31,7 @@ import org.junit.runners.Suite;
 	DataTypeChangeTest.class,
 	FunctionInliningTest.class,
 	FunctionNamespaceTest.class,
+	FunctionReturnTest.class,
 	IfTest.class,
 	InvalidBuiltinFunctionCallTest.class,
 	InvalidFunctionAssignmentTest.class,


[13/50] [abbrv] systemml git commit: [SYSTEMML-1903] Fix robustness codegen row ops w/ unknowns

Posted by re...@apache.org.
[SYSTEMML-1903] Fix robustness codegen row ops w/ unknowns

This patch fixes special cases of codegen row templates with partial
unknowns, which is important for robustness during initial compilation
even though such unknowns trigger dynamic recompilation at runtime.
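
A self-contained sketch (with hypothetical names) of the guard pattern introduced here: when dimensions are unknown at compile time, the row template falls back to the vector (matrix) code path whenever the already-compiled intermediate is a matrix, instead of failing on the scalar path. See the TemplateRow diff below for the actual conditions.

```
public class RowTemplateGuardSketch {
  // vector ops for proper matrices, or for unknown dimensions
  // if the compiled intermediate is already a matrix
  static boolean useVectorPrimitive(long dim1, long dim2,
      boolean dimsKnown, boolean intermediateIsMatrix) {
    return (dim1 > 1 && dim2 > 1) || (!dimsKnown && intermediateIsMatrix);
  }

  public static void main(String[] args) {
    System.out.println(useVectorPrimitive(1000, 100, true, true)); // true
    System.out.println(useVectorPrimitive(-1, -1, false, true));   // true (unknowns)
    System.out.println(useVectorPrimitive(1000, 1, true, false));  // false (column vector)
  }
}
```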


Project: http://git-wip-us.apache.org/repos/asf/systemml/repo
Commit: http://git-wip-us.apache.org/repos/asf/systemml/commit/323dd72a
Tree: http://git-wip-us.apache.org/repos/asf/systemml/tree/323dd72a
Diff: http://git-wip-us.apache.org/repos/asf/systemml/diff/323dd72a

Branch: refs/heads/master
Commit: 323dd72a8ed18687aa3019387c4ab7b0598bd9d5
Parents: 4f29b34
Author: Matthias Boehm <mb...@gmail.com>
Authored: Thu Oct 19 15:07:54 2017 -0700
Committer: Matthias Boehm <mb...@gmail.com>
Committed: Thu Oct 19 16:06:14 2017 -0700

----------------------------------------------------------------------
 .../hops/codegen/template/TemplateRow.java      | 38 ++++++++++----------
 .../hops/codegen/template/TemplateUtils.java    |  2 +-
 2 files changed, 21 insertions(+), 19 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/systemml/blob/323dd72a/src/main/java/org/apache/sysml/hops/codegen/template/TemplateRow.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/hops/codegen/template/TemplateRow.java b/src/main/java/org/apache/sysml/hops/codegen/template/TemplateRow.java
index 0389983..e664b9f 100644
--- a/src/main/java/org/apache/sysml/hops/codegen/template/TemplateRow.java
+++ b/src/main/java/org/apache/sysml/hops/codegen/template/TemplateRow.java
@@ -250,7 +250,7 @@ public class TemplateRow extends TemplateBase
 			else if (((AggUnaryOp)hop).getDirection() == Direction.Col && ((AggUnaryOp)hop).getOp() == AggOp.SUM ) {
 				//vector add without temporary copy
 				if( cdata1 instanceof CNodeBinary && ((CNodeBinary)cdata1).getType().isVectorScalarPrimitive() )
-					out = new CNodeBinary(cdata1.getInput().get(0), cdata1.getInput().get(1), 
+					out = new CNodeBinary(cdata1.getInput().get(0), cdata1.getInput().get(1),
 							((CNodeBinary)cdata1).getType().getVectorAddPrimitive());
 				else	
 					out = cdata1;
@@ -269,7 +269,7 @@ public class TemplateRow extends TemplateBase
 			{
 				//correct input under transpose
 				cdata1 = TemplateUtils.skipTranspose(cdata1, hop.getInput().get(0), tmp, compileLiterals);
-				inHops.remove(hop.getInput().get(0)); 
+				inHops.remove(hop.getInput().get(0));
 				inHops.add(hop.getInput().get(0).getInput().get(0));
 				
 				//note: vectorMultAdd applicable to vector-scalar, and vector-vector
@@ -310,7 +310,8 @@ public class TemplateRow extends TemplateBase
 			CNode cdata1 = tmp.get(hop.getInput().get(0).getHopID());
 			
 			// if one input is a matrix then we need to do vector by scalar operations
-			if(hop.getInput().get(0).getDim1() > 1 && hop.getInput().get(0).getDim2() > 1 ) 
+			if(hop.getInput().get(0).getDim1() > 1 && hop.getInput().get(0).getDim2() > 1 
+				|| (!hop.dimsKnown() && cdata1.getDataType()==DataType.MATRIX ) ) 
 			{
 				if( HopRewriteUtils.isUnary(hop, SUPPORTED_VECT_UNARY) ) {
 					String opname = "VECT_"+((UnaryOp)hop).getOp().name();
@@ -320,12 +321,11 @@ public class TemplateRow extends TemplateBase
 				}
 				else 
 					throw new RuntimeException("Unsupported unary matrix "
-							+ "operation: " + ((UnaryOp)hop).getOp().name());
+						+ "operation: " + ((UnaryOp)hop).getOp().name());
 			}
 			else //general scalar case
 			{
 				cdata1 = TemplateUtils.wrapLookupIfNecessary(cdata1, hop.getInput().get(0));
-				
 				String primitiveOpName = ((UnaryOp)hop).getOp().toString();
 				out = new CNodeUnary(cdata1, UnaryType.valueOf(primitiveOpName));
 			}
@@ -355,7 +355,9 @@ public class TemplateRow extends TemplateBase
 			
 			// if one input is a matrix then we need to do vector by scalar operations
 			if( (hop.getInput().get(0).getDim1() > 1 && hop.getInput().get(0).getDim2() > 1)
-				|| (hop.getInput().get(1).getDim1() > 1 && hop.getInput().get(1).getDim2() > 1))
+				|| (hop.getInput().get(1).getDim1() > 1 && hop.getInput().get(1).getDim2() > 1)
+				|| (!(hop.dimsKnown() && hop.getInput().get(0).dimsKnown() && hop.getInput().get(1).dimsKnown()) 
+						&& (cdata1.getDataType().isMatrix() || cdata2.getDataType().isMatrix())))
 			{
 				if( HopRewriteUtils.isBinary(hop, SUPPORTED_VECT_BINARY) ) {
 					if( TemplateUtils.isMatrix(cdata1) && (TemplateUtils.isMatrix(cdata2) 
@@ -371,14 +373,14 @@ public class TemplateRow extends TemplateBase
 							cdata2 = new CNodeUnary(cdata2, UnaryType.LOOKUP_R);
 						out = new CNodeBinary(cdata1, cdata2, BinType.valueOf(opname));
 					}
-					if( cdata1 instanceof CNodeData && inHops2.isEmpty() 
+					if( cdata1 instanceof CNodeData && inHops2.isEmpty()
 						&& !(cdata1.getDataType()==DataType.SCALAR) ) {
 						inHops2.put("X", hop.getInput().get(0));
 					}
 				}
 				else 
 					throw new RuntimeException("Unsupported binary matrix "
-							+ "operation: " + ((BinaryOp)hop).getOp().name());
+						+ "operation: " + ((BinaryOp)hop).getOp().name());
 			}
 			else //one input is a vector/scalar other is a scalar
 			{
@@ -389,7 +391,7 @@ public class TemplateRow extends TemplateBase
 					|| (TemplateUtils.isColVector(hop.getInput().get(0)) && cdata2 instanceof CNodeData
 						&& hop.getInput().get(1).getDataType().isMatrix()))
 					cdata2 = new CNodeUnary(cdata2, UnaryType.LOOKUP_R);
-				out = new CNodeBinary(cdata1, cdata2, BinType.valueOf(primitiveOpName));	
+				out = new CNodeBinary(cdata1, cdata2, BinType.valueOf(primitiveOpName));
 			}
 		}
 		else if(hop instanceof TernaryOp) 
@@ -405,16 +407,16 @@ public class TemplateRow extends TemplateBase
 			
 			//construct ternary cnode, primitive operation derived from OpOp3
 			out = new CNodeTernary(cdata1, cdata2, cdata3, 
-					TernaryType.valueOf(top.getOp().toString()));
+				TernaryType.valueOf(top.getOp().toString()));
 		}
-		else if( hop instanceof ParameterizedBuiltinOp ) 
+		else if( hop instanceof ParameterizedBuiltinOp )
 		{
 			CNode cdata1 = tmp.get(((ParameterizedBuiltinOp)hop).getTargetHop().getHopID());
 			cdata1 = TemplateUtils.wrapLookupIfNecessary(cdata1, hop.getInput().get(0));
 			
 			CNode cdata2 = tmp.get(((ParameterizedBuiltinOp)hop).getParameterHop("pattern").getHopID());
 			CNode cdata3 = tmp.get(((ParameterizedBuiltinOp)hop).getParameterHop("replacement").getHopID());
-			TernaryType ttype = (cdata2.isLiteral() && cdata2.getVarname().equals("Double.NaN")) ? 
+			TernaryType ttype = (cdata2.isLiteral() && cdata2.getVarname().equals("Double.NaN")) ?
 					TernaryType.REPLACE_NAN : TernaryType.REPLACE;
 			out = new CNodeTernary(cdata1, cdata2, cdata3, ttype);
 		}
@@ -422,7 +424,7 @@ public class TemplateRow extends TemplateBase
 		{
 			CNode cdata1 = tmp.get(hop.getInput().get(0).getHopID());
 			out = new CNodeTernary(cdata1, 
-				TemplateUtils.createCNodeData(new LiteralOp(hop.getInput().get(0).getDim2()), true), 
+				TemplateUtils.createCNodeData(new LiteralOp(hop.getInput().get(0).getDim2()), true),
 				TemplateUtils.createCNodeData(hop.getInput().get(4), true),
 				(!hop.dimsKnown()||hop.getDim2()>1) ? TernaryType.LOOKUP_RVECT1 : TernaryType.LOOKUP_RC1);
 		}
@@ -456,13 +458,13 @@ public class TemplateRow extends TemplateBase
 		
 		@Override
 		public int compare(Hop h1, Hop h2) {
-			long ncells1 = h1.isScalar() ? Long.MIN_VALUE : 
-				(h1==_X) ? Long.MAX_VALUE : (h1==_B1) ? Long.MAX_VALUE-1 : 
+			long ncells1 = h1.isScalar() ? Long.MIN_VALUE :
+				(h1==_X) ? Long.MAX_VALUE : (h1==_B1) ? Long.MAX_VALUE-1 :
 				h1.dimsKnown() ? h1.getLength() : Long.MAX_VALUE-2;
-			long ncells2 = h2.isScalar() ? Long.MIN_VALUE : 
-				(h2==_X) ? Long.MAX_VALUE : (h2==_B1) ? Long.MAX_VALUE-1 : 
+			long ncells2 = h2.isScalar() ? Long.MIN_VALUE :
+				(h2==_X) ? Long.MAX_VALUE : (h2==_B1) ? Long.MAX_VALUE-1 :
 				h2.dimsKnown() ? h2.getLength() : Long.MAX_VALUE-2;
-			return (ncells1 > ncells2) ? -1 : (ncells1 < ncells2) ? 1 : 0; 
+			return (ncells1 > ncells2) ? -1 : (ncells1 < ncells2) ? 1 : 0;
 		}
 	}
 }

http://git-wip-us.apache.org/repos/asf/systemml/blob/323dd72a/src/main/java/org/apache/sysml/hops/codegen/template/TemplateUtils.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/hops/codegen/template/TemplateUtils.java b/src/main/java/org/apache/sysml/hops/codegen/template/TemplateUtils.java
index 497dae0..96e15cb 100644
--- a/src/main/java/org/apache/sysml/hops/codegen/template/TemplateUtils.java
+++ b/src/main/java/org/apache/sysml/hops/codegen/template/TemplateUtils.java
@@ -184,7 +184,7 @@ public class TemplateUtils
 	public static RowType getRowType(Hop output, Hop... inputs) {
 		Hop X = inputs[0];
 		Hop B1 = (inputs.length>1) ? inputs[1] : null;
-		if( (X!=null && HopRewriteUtils.isEqualSize(output, X)) || X==null )
+		if( (X!=null && HopRewriteUtils.isEqualSize(output, X)) || X==null || !X.dimsKnown() )
 			return RowType.NO_AGG;
 		else if( ((B1!=null && output.getDim1()==X.getDim1() && output.getDim2()==B1.getDim2())
 			|| (output instanceof IndexingOp && HopRewriteUtils.isColumnRangeIndexing((IndexingOp)output)))


[46/50] [abbrv] systemml git commit: [SYSTEMML-1980] HopDagValidator: Accept Integer Matrices

Posted by re...@apache.org.
[SYSTEMML-1980] HopDagValidator: Accept Integer Matrices

Under rare conditions, a matrix can have INT ValueType and execute correctly. For example, the program

```
X= Rand( rows=2, cols=2, min=1, max=2)
R = cbind(as.matrix(nrow(X)) * 2, as.matrix(ncol(X)))
```

would cause the HopDagValidator to throw an exception, because the produced Hops look like this
(here, "MI" means matrix data type and integer value type):

```
----GENERIC (lines 28-0) [recompile=false]
------(33) u(cast_as_matrix) ([2]) [1,1,1000,1000,-1]MI [0,0,0 -> -MB]
------(35) b(*) (33,[2]) [1,1,1000,1000,-1]MI [0,0,0 -> -MB], CP
------(37) u(cast_as_matrix) ([2]) [1,1,1000,1000,-1]MI [0,0,0 -> -MB]
------(38) b(cbind) (35,37) [1,2,1000,1000,-1]MI [0,0,0 -> -MB], CP
------(44) PWrite R (38,[target/tes...],[false],[TEXT],[false],[,]) [1,2,-1,-1,-1]MI [0,0,0 -> -MB], CP
```

This patch relaxes the HopDagValidator to allow integer matrices.

Closes #695.


Project: http://git-wip-us.apache.org/repos/asf/systemml/repo
Commit: http://git-wip-us.apache.org/repos/asf/systemml/commit/fc478916
Tree: http://git-wip-us.apache.org/repos/asf/systemml/tree/fc478916
Diff: http://git-wip-us.apache.org/repos/asf/systemml/diff/fc478916

Branch: refs/heads/master
Commit: fc47891656e5f804ce8a9ba1085a79d04153a138
Parents: cb1d792
Author: Dylan Hutchison <dh...@cs.washington.edu>
Authored: Wed Nov 1 21:55:56 2017 -0700
Committer: Matthias Boehm <mb...@gmail.com>
Committed: Wed Nov 1 21:55:57 2017 -0700

----------------------------------------------------------------------
 src/main/java/org/apache/sysml/hops/rewrite/HopDagValidator.java | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/systemml/blob/fc478916/src/main/java/org/apache/sysml/hops/rewrite/HopDagValidator.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/hops/rewrite/HopDagValidator.java b/src/main/java/org/apache/sysml/hops/rewrite/HopDagValidator.java
index 7d14532..ce4648a 100644
--- a/src/main/java/org/apache/sysml/hops/rewrite/HopDagValidator.java
+++ b/src/main/java/org/apache/sysml/hops/rewrite/HopDagValidator.java
@@ -126,7 +126,7 @@ public class HopDagValidator {
 
 		// check Matrix data type Hops must have Double Value type
 		if (dt == Expression.DataType.MATRIX )
-			check(vt == Expression.ValueType.DOUBLE, hop,
+			check(vt == Expression.ValueType.DOUBLE || vt == Expression.ValueType.INT, hop,
 				"has Matrix type but Value Type %s is not DOUBLE", vt);
 
 		//recursively process children


[19/50] [abbrv] systemml git commit: [SYSTEMML-1971] New codegen vector primitive for counting nnz

Posted by re...@apache.org.
[SYSTEMML-1971] New codegen vector primitive for counting nnz

This patch adds a new codegen vector primitive for rowSums(X!=0), i.e.,
for counting the number of non-zeros per row, which avoids unnecessary
dense row intermediates and is realized as a pure metadata operation for
sparse row inputs.

After recent optimizer changes, we compiled a row template in ALS-CG for
rowSums(X!=0), which showed severe performance issues on the Amazon books
dataset. This was because Amazon is an ultra-sparse dataset with a huge
number of features (2,330,066), for which the dense row intermediates are
more than 7 orders of magnitude larger than the sparse row input. This
patch completely removes these performance issues.
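
The two primitives added in LibSpoofPrimitives (see the diff below) can be summarized by this self-contained sketch: the dense variant scans the row, while the sparse variant simply returns the number of stored values, i.e., a pure metadata operation.

```
public class CountNnzSketch {
  // dense row: count entries different from zero
  static double vectCountnnz(double[] a, int ai, int len) {
    int count = 0;
    for (int i = ai; i < ai + len; i++)
      count += (a[i] != 0) ? 1 : 0;
    return count;
  }

  // sparse row (values/indexes): the count is just the row's nnz
  static double vectCountnnz(double[] avals, int[] aix, int ai, int alen, int len) {
    return alen; // pure metadata operation
  }

  public static void main(String[] args) {
    double[] dense = {0, 3, 0, 0, 7};
    System.out.println(vectCountnnz(dense, 0, dense.length));                      // 2.0
    System.out.println(vectCountnnz(new double[]{3, 7}, new int[]{1, 4}, 0, 2, 5)); // 2.0
  }
}
```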


Project: http://git-wip-us.apache.org/repos/asf/systemml/repo
Commit: http://git-wip-us.apache.org/repos/asf/systemml/commit/1191dbfe
Tree: http://git-wip-us.apache.org/repos/asf/systemml/tree/1191dbfe
Diff: http://git-wip-us.apache.org/repos/asf/systemml/diff/1191dbfe

Branch: refs/heads/master
Commit: 1191dbfe2a20d85bf79f0106312f37df210053cf
Parents: c70cb11
Author: Matthias Boehm <mb...@gmail.com>
Authored: Sun Oct 22 19:53:52 2017 -0700
Committer: Matthias Boehm <mb...@gmail.com>
Committed: Sun Oct 22 19:53:52 2017 -0700

----------------------------------------------------------------------
 .../org/apache/sysml/hops/codegen/cplan/CNodeUnary.java |  9 ++++++---
 .../apache/sysml/hops/codegen/template/TemplateRow.java |  5 +++++
 .../sysml/hops/codegen/template/TemplateUtils.java      |  3 ++-
 .../org/apache/sysml/hops/rewrite/HopRewriteUtils.java  |  6 ++++++
 .../sysml/runtime/codegen/LibSpoofPrimitives.java       | 12 ++++++++++++
 5 files changed, 31 insertions(+), 4 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/systemml/blob/1191dbfe/src/main/java/org/apache/sysml/hops/codegen/cplan/CNodeUnary.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/hops/codegen/cplan/CNodeUnary.java b/src/main/java/org/apache/sysml/hops/codegen/cplan/CNodeUnary.java
index b66423e..3a3dc79 100644
--- a/src/main/java/org/apache/sysml/hops/codegen/cplan/CNodeUnary.java
+++ b/src/main/java/org/apache/sysml/hops/codegen/cplan/CNodeUnary.java
@@ -30,7 +30,7 @@ public class CNodeUnary extends CNode
 {
 	public enum UnaryType {
 		LOOKUP_R, LOOKUP_C, LOOKUP_RC, LOOKUP0, //codegen specific
-		ROW_SUMS, ROW_MINS, ROW_MAXS, //codegen specific
+		ROW_SUMS, ROW_MINS, ROW_MAXS, ROW_COUNTNNZS, //codegen specific
 		VECT_EXP, VECT_POW2, VECT_MULT2, VECT_SQRT, VECT_LOG,
 		VECT_ABS, VECT_ROUND, VECT_CEIL, VECT_FLOOR, VECT_SIGN, 
 		VECT_SIN, VECT_COS, VECT_TAN, VECT_ASIN, VECT_ACOS, VECT_ATAN, 
@@ -52,8 +52,9 @@ public class CNodeUnary extends CNode
 			switch( this ) {
 				case ROW_SUMS:
 				case ROW_MINS:
-				case ROW_MAXS: {
-					String vectName = StringUtils.capitalize(this.toString().substring(4,7).toLowerCase());
+				case ROW_MAXS:
+				case ROW_COUNTNNZS: {
+					String vectName = StringUtils.capitalize(name().substring(4, name().length()-1).toLowerCase());
 					return sparse ? "    double %TMP% = LibSpoofPrimitives.vect"+vectName+"(%IN1v%, %IN1i%, %POS1%, alen, len);\n": 
 									"    double %TMP% = LibSpoofPrimitives.vect"+vectName+"(%IN1%, %POS1%, %LEN%);\n"; 
 				}
@@ -244,6 +245,7 @@ public class CNodeUnary extends CNode
 			case ROW_SUMS:  return "u(R+)";
 			case ROW_MINS:  return "u(Rmin)";
 			case ROW_MAXS:  return "u(Rmax)";
+			case ROW_COUNTNNZS: return "u(Rnnz)";
 			case VECT_EXP:
 			case VECT_POW2:
 			case VECT_MULT2:
@@ -308,6 +310,7 @@ public class CNodeUnary extends CNode
 			case ROW_SUMS:
 			case ROW_MINS:
 			case ROW_MAXS:
+			case ROW_COUNTNNZS:
 			case EXP:
 			case LOOKUP_R:
 			case LOOKUP_C:

http://git-wip-us.apache.org/repos/asf/systemml/blob/1191dbfe/src/main/java/org/apache/sysml/hops/codegen/template/TemplateRow.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/hops/codegen/template/TemplateRow.java b/src/main/java/org/apache/sysml/hops/codegen/template/TemplateRow.java
index e664b9f..9da04dc 100644
--- a/src/main/java/org/apache/sysml/hops/codegen/template/TemplateRow.java
+++ b/src/main/java/org/apache/sysml/hops/codegen/template/TemplateRow.java
@@ -240,6 +240,11 @@ public class TemplateRow extends TemplateBase
 			if( ((AggUnaryOp)hop).getDirection() == Direction.Row && HopRewriteUtils.isAggUnaryOp(hop, SUPPORTED_ROW_AGG) ) {
 				if(hop.getInput().get(0).getDim2()==1)
 					out = (cdata1.getDataType()==DataType.SCALAR) ? cdata1 : new CNodeUnary(cdata1,UnaryType.LOOKUP_R);
+				else if( HopRewriteUtils.isAggUnaryOp(hop, AggOp.SUM) 
+					&& HopRewriteUtils.isBinaryMatrixScalar(hop.getInput().get(0), OpOp2.NOTEQUAL, 0)
+					&& cdata1 instanceof CNodeBinary ) {
+					out = new CNodeUnary(cdata1.getInput().get(0), UnaryType.ROW_COUNTNNZS);
+				}
 				else {
 					String opcode = "ROW_"+((AggUnaryOp)hop).getOp().name().toUpperCase()+"S";
 					out = new CNodeUnary(cdata1, UnaryType.valueOf(opcode));

http://git-wip-us.apache.org/repos/asf/systemml/blob/1191dbfe/src/main/java/org/apache/sysml/hops/codegen/template/TemplateUtils.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/hops/codegen/template/TemplateUtils.java b/src/main/java/org/apache/sysml/hops/codegen/template/TemplateUtils.java
index 96e15cb..e07c410 100644
--- a/src/main/java/org/apache/sysml/hops/codegen/template/TemplateUtils.java
+++ b/src/main/java/org/apache/sysml/hops/codegen/template/TemplateUtils.java
@@ -312,7 +312,8 @@ public class TemplateUtils
 	public static boolean hasSingleOperation(CNodeTpl tpl) {
 		CNode output = tpl.getOutput();
 		return ((output instanceof CNodeUnary 
-				&& !TemplateUtils.isUnary(output, UnaryType.EXP, UnaryType.LOG)) 
+				&& !TemplateUtils.isUnary(output, 
+					UnaryType.EXP, UnaryType.LOG, UnaryType.ROW_COUNTNNZS)) 
 			|| (output instanceof CNodeBinary
 				&& !TemplateUtils.isBinary(output, BinType.VECT_OUTERMULT_ADD))) 
 			&& hasOnlyDataNodeOrLookupInputs(output);

http://git-wip-us.apache.org/repos/asf/systemml/blob/1191dbfe/src/main/java/org/apache/sysml/hops/rewrite/HopRewriteUtils.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/hops/rewrite/HopRewriteUtils.java b/src/main/java/org/apache/sysml/hops/rewrite/HopRewriteUtils.java
index 7bbfa52..68068eb 100644
--- a/src/main/java/org/apache/sysml/hops/rewrite/HopRewriteUtils.java
+++ b/src/main/java/org/apache/sysml/hops/rewrite/HopRewriteUtils.java
@@ -893,6 +893,12 @@ public class HopRewriteUtils
 		return ret;
 	}
 	
+	public static boolean isBinaryMatrixScalar(Hop hop, OpOp2 type, double val) {
+		return isBinary(hop, type)
+			&& (isLiteralOfValue(hop.getInput().get(0), val)
+			|| isLiteralOfValue(hop.getInput().get(1), val));
+	}
+	
 	public static boolean containsInput(Hop current, Hop probe) {
 		return rContainsInput(current, probe, new HashSet<Long>());	
 	}

http://git-wip-us.apache.org/repos/asf/systemml/blob/1191dbfe/src/main/java/org/apache/sysml/runtime/codegen/LibSpoofPrimitives.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/codegen/LibSpoofPrimitives.java b/src/main/java/org/apache/sysml/runtime/codegen/LibSpoofPrimitives.java
index 91fde5e..356c729 100644
--- a/src/main/java/org/apache/sysml/runtime/codegen/LibSpoofPrimitives.java
+++ b/src/main/java/org/apache/sysml/runtime/codegen/LibSpoofPrimitives.java
@@ -281,6 +281,18 @@ public class LibSpoofPrimitives
 		return (alen<len) ? Math.max(val, 0) : val;
 	}
 	
+	public static double vectCountnnz(double[] a, int ai, int len) { 
+		int count = 0;
+		for( int i = ai; i < ai+len; i++ )
+			count += (a[i] != 0) ? 1 : 0;
+		return count;
+	} 
+	
+	public static double vectCountnnz(double[] avals, int[] aix, int ai, int alen, int len) {
+		//pure meta data operation
+		return alen;
+	}
+	
 	//custom vector div
 	
 	public static void vectDivAdd(double[] a, double bval, double[] c, int ai, int ci, int len) {

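For orientation, below is a minimal hand-written sketch (not the actual
generated operator code; the class and method names are illustrative
only) of how the new LibSpoofPrimitives.vectCountnnz primitives are
meant to be used for the rowSums(X != 0) pattern that TemplateRow now
maps to UnaryType.ROW_COUNTNNZS:

  import org.apache.sysml.runtime.codegen.LibSpoofPrimitives;

  public class RowCountNnzSketch {
    //dense input: a holds the row-major values of an m x n matrix
    public static double[] rowCountNnzDense(double[] a, int m, int n) {
      double[] c = new double[m];
      for( int i=0, aix=0; i<m; i++, aix+=n )
        c[i] = LibSpoofPrimitives.vectCountnnz(a, aix, n);
      return c;
    }

    //sparse row: avals/aix hold the alen non-zeros starting at apos;
    //per the primitive above, this is a pure meta data operation
    public static double rowCountNnzSparse(double[] avals, int[] aix,
      int apos, int alen, int n) {
      return LibSpoofPrimitives.vectCountnnz(avals, aix, apos, alen, n);
    }
  }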

[20/50] [abbrv] systemml git commit: [MINOR] Improved autoencoder scripts (ordering row-shuffle/z-transform)

Posted by re...@apache.org.
[MINOR] Improved autoencoder scripts (ordering row-shuffle/z-transform)

This patch makes a minor performance improvement to the autoencoder
script. So far, we first applied the z-transform followed by a random
row shuffling. Since the z-transform turns sparse datasets into dense
ones, we now perform the row shuffling first, which makes the
permutation matrix multiply significantly faster and can avoid
unnecessary evictions.


Project: http://git-wip-us.apache.org/repos/asf/systemml/repo
Commit: http://git-wip-us.apache.org/repos/asf/systemml/commit/a51f8e81
Tree: http://git-wip-us.apache.org/repos/asf/systemml/tree/a51f8e81
Diff: http://git-wip-us.apache.org/repos/asf/systemml/diff/a51f8e81

Branch: refs/heads/master
Commit: a51f8e8190281e4c32978d430c08c1b83083faa2
Parents: 1191dbf
Author: Matthias Boehm <mb...@gmail.com>
Authored: Sun Oct 22 21:01:37 2017 -0700
Committer: Matthias Boehm <mb...@gmail.com>
Committed: Sun Oct 22 21:01:37 2017 -0700

----------------------------------------------------------------------
 scripts/staging/autoencoder-2layer.dml | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/systemml/blob/a51f8e81/scripts/staging/autoencoder-2layer.dml
----------------------------------------------------------------------
diff --git a/scripts/staging/autoencoder-2layer.dml b/scripts/staging/autoencoder-2layer.dml
index a17d86e..9eee8ae 100644
--- a/scripts/staging/autoencoder-2layer.dml
+++ b/scripts/staging/autoencoder-2layer.dml
@@ -150,15 +150,15 @@ max_epochs = $EPOCH
 n = nrow(X)
 m = ncol(X)
 
+#randomly reordering rows
+permut = table(seq(1,n,1), order(target=Rand(rows=n, cols=1, min=0, max=1, pdf="uniform"), by=1, index.return=TRUE), n, n)
+X = permut %*% X
+
 #z-transform, whitening operator is better
 means = colSums(X)/n
 stds = sqrt((colSums(X^2)/n - means*means)*n/(n-1)) + 1e-17
 X = (X - means)/stds
 
-#randomly reordering rows
-permut = table(seq(1,n,1), order(target=Rand(rows=n, cols=1, min=0, max=1, pdf="uniform"), by=1, index.return=TRUE), n, n)
-X = permut %*% X
-
 W1 = sqrt(6)/sqrt(m + num_hidden1) * Rand(rows=num_hidden1, cols=m, min=-1, max=1, pdf="uniform")
 b1 = matrix(0, rows=num_hidden1, cols=1)
 W2 = sqrt(6)/sqrt(num_hidden1 + num_hidden2) * Rand(rows=num_hidden2, cols=num_hidden1, min=-1, max=1, pdf="uniform")


[15/50] [abbrv] systemml git commit: [SYSTEMML-1968] Improved codegen optimizer (cost, mat points, pruning)

Posted by re...@apache.org.
[SYSTEMML-1968] Improved codegen optimizer (cost, mat points, pruning)

This patch improves the cost-based codegen optimizer to address wrong
fusion decisions for large-scale computations. In detail, this includes:

1) Cost model: The cost model now accounts for the broadcast cost of
side inputs in distributed Spark operations. Furthermore, this also
includes a fix of the compute-cost calculation for a mix of row and
cell operations of different dimensions (a minimal sketch of the
revised formula follows at the end of this message).

2) Interesting points: To enable reasoning about side inputs, we now
also consider template switches from cell to row templates as
interesting points.

3) Pruning of row templates: The above changes also revealed hidden
issues in the pruning of unnecessary row templates (i.e., their
conversion to cell templates), which mistakenly removed necessary row
templates and ultimately led to runtime errors.

On a large-scale scenario of L2SVM over a 200M x 100 dense input
(160GB), this patch improved the end-to-end runtime for 20 outer
iterations from 942s to 273s (w/o codegen: 644s).
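
As referenced above, the following is a minimal standalone sketch of
the revised cost formula for a single fused operator; it is not the
actual implementation (which lives in PlanSelectionFuseCostBasedV2 and
its CostVector helper), and the method and parameter names as well as
the driverExceedsLocalMem flag are illustrative assumptions:

  public class FusedOpCostSketch {
    //constants as defined in PlanSelectionFuseCostBasedV2 (the real
    //COMPUTE_BANDWIDTH is additionally scaled by the local parallelism)
    private static final double WRITE_BANDWIDTH = 2d*1024*1024*1024;  //2GB/s
    private static final double READ_BANDWIDTH = 32d*1024*1024*1024;  //32GB/s
    private static final double READ_BANDWIDTH_BROADCAST = WRITE_BANDWIDTH/4;
    private static final double COMPUTE_BANDWIDTH = 2d*1024*1024*1024; //2GFLOPs

    //estimated time [s] of one fused operator: write the output, overlap
    //reading all inputs with compute, and add a broadcast-read correction
    //for side inputs if the operator runs distributed (i.e., its largest
    //input does not fit into the local memory budget)
    public static double fusedOpCosts(double outSize, double inputSize,
      double sideInputSize, double computeCosts, boolean driverExceedsLocalMem) {
      double costs = outSize * 8 / WRITE_BANDWIDTH
        + Math.max(inputSize * 8 / READ_BANDWIDTH,
          computeCosts / COMPUTE_BANDWIDTH);
      if( driverExceedsLocalMem )
        costs += sideInputSize * 8 / READ_BANDWIDTH_BROADCAST;
      return costs;
    }
  }

Note that computeCosts is assumed to be already scaled by the output
size of each fused operation (cf. the change in getComputeCosts below),
which is what corrects the costing for a mix of row and cell operations.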


Project: http://git-wip-us.apache.org/repos/asf/systemml/repo
Commit: http://git-wip-us.apache.org/repos/asf/systemml/commit/311e4aac
Tree: http://git-wip-us.apache.org/repos/asf/systemml/tree/311e4aac
Diff: http://git-wip-us.apache.org/repos/asf/systemml/diff/311e4aac

Branch: refs/heads/master
Commit: 311e4aac9833397908a083d0a48d5bd3ba086283
Parents: 6de8f05
Author: Matthias Boehm <mb...@gmail.com>
Authored: Sat Oct 21 16:41:53 2017 -0700
Committer: Matthias Boehm <mb...@gmail.com>
Committed: Sat Oct 21 17:15:38 2017 -0700

----------------------------------------------------------------------
 .../sysml/hops/codegen/opt/PlanAnalyzer.java    |   2 +-
 .../opt/PlanSelectionFuseCostBasedV2.java       | 131 ++++++++++---------
 .../hops/codegen/template/CPlanMemoTable.java   |  25 ++--
 .../runtime/codegen/LibSpoofPrimitives.java     |   6 +-
 4 files changed, 91 insertions(+), 73 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/systemml/blob/311e4aac/src/main/java/org/apache/sysml/hops/codegen/opt/PlanAnalyzer.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/hops/codegen/opt/PlanAnalyzer.java b/src/main/java/org/apache/sysml/hops/codegen/opt/PlanAnalyzer.java
index 9910814..7d522b3 100644
--- a/src/main/java/org/apache/sysml/hops/codegen/opt/PlanAnalyzer.java
+++ b/src/main/java/org/apache/sysml/hops/codegen/opt/PlanAnalyzer.java
@@ -267,7 +267,7 @@ public class PlanAnalyzer
 			for( int i=0; i<3; i++ ) {
 				if( refs[i] < 0 ) continue;
 				List<TemplateType> tmp = memo.getDistinctTemplateTypes(hopID, i, true);
-				if( memo.containsNotIn(refs[i], tmp, true, true) )
+				if( memo.containsNotIn(refs[i], tmp, true) )
 					ret.add(new InterestingPoint(DecisionType.TEMPLATE_CHANGE, hopID, refs[i]));
 			}
 		}

http://git-wip-us.apache.org/repos/asf/systemml/blob/311e4aac/src/main/java/org/apache/sysml/hops/codegen/opt/PlanSelectionFuseCostBasedV2.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/hops/codegen/opt/PlanSelectionFuseCostBasedV2.java b/src/main/java/org/apache/sysml/hops/codegen/opt/PlanSelectionFuseCostBasedV2.java
index d2ed3ac..10875e8 100644
--- a/src/main/java/org/apache/sysml/hops/codegen/opt/PlanSelectionFuseCostBasedV2.java
+++ b/src/main/java/org/apache/sysml/hops/codegen/opt/PlanSelectionFuseCostBasedV2.java
@@ -86,6 +86,7 @@ public class PlanSelectionFuseCostBasedV2 extends PlanSelection
 	//to cover result allocation, write into main memory, and potential evictions
 	private static final double WRITE_BANDWIDTH = 2d*1024*1024*1024;  //2GB/s
 	private static final double READ_BANDWIDTH = 32d*1024*1024*1024;  //32GB/s
+	private static final double READ_BANDWIDTH_BROADCAST = WRITE_BANDWIDTH/4;
 	private static final double COMPUTE_BANDWIDTH = 2d*1024*1024*1024 //2GFLOPs/core
 		* InfrastructureAnalyzer.getLocalParallelism();
 	
@@ -146,7 +147,7 @@ public class PlanSelectionFuseCostBasedV2 extends PlanSelection
 				getComputeCosts(memo.getHopRefs().get(hopID), computeCosts);
 			
 			//prepare pruning helpers and prune memo table w/ determined mat points
-			StaticCosts costs = new StaticCosts(computeCosts, getComputeCost(computeCosts, memo), 
+			StaticCosts costs = new StaticCosts(computeCosts, sumComputeCost(computeCosts), 
 				getReadCost(part, memo), getWriteCost(part.getRoots(), memo));
 			ReachabilityGraph rgraph = STRUCTURAL_PRUNING ? new ReachabilityGraph(part, memo) : null;
 			if( STRUCTURAL_PRUNING ) {
@@ -339,14 +340,9 @@ public class PlanSelectionFuseCostBasedV2 extends PlanSelection
 		return costs;
 	}
 	
-	private static double getComputeCost(HashMap<Long, Double> computeCosts, CPlanMemoTable memo) {
-		double costs = 0;
-		for( Entry<Long,Double> e : computeCosts.entrySet() ) {
-			Hop mainInput = memo.getHopRefs()
-				.get(e.getKey()).getInput().get(0);
-			costs += getSize(mainInput) * e.getValue() / COMPUTE_BANDWIDTH;
-		}
-		return costs;
+	private static double sumComputeCost(HashMap<Long, Double> computeCosts) {
+		return computeCosts.values().stream()
+			.mapToDouble(d -> d/COMPUTE_BANDWIDTH).sum();
 	}
 	
 	private static long getSize(Hop hop) {
@@ -567,33 +563,39 @@ public class PlanSelectionFuseCostBasedV2 extends PlanSelection
 		}
 	}
 	
-	private static boolean isRowTemplateWithoutAggOrVects(CPlanMemoTable memo, Hop current, HashSet<Long> visited) {
-		//consider all aggregations other than root operation
-		MemoTableEntry me = memo.getBest(current.getHopID(), TemplateType.ROW);
-		boolean ret = true;
-		for(int i=0; i<3; i++)
-			if( me.isPlanRef(i) )
-				ret &= rIsRowTemplateWithoutAggOrVects(memo, 
-					current.getInput().get(i), visited);
-		return ret;
+	private static HashSet<Long> getRowAggOpsWithRowRef(CPlanMemoTable memo, PlanPartition part) {
+		HashSet<Long> refAggs = new HashSet<>();
+		for( Long hopID : part.getPartition() ) {
+			if( !memo.contains(hopID, TemplateType.ROW) ) continue;
+			MemoTableEntry me = memo.getBest(hopID, TemplateType.ROW);
+			for(int i=0; i<3; i++)
+				if( me.isPlanRef(i) && memo.contains(me.input(i), TemplateType.ROW) 
+					&& isRowAggOp(memo.getHopRefs().get(me.input(i))))
+					refAggs.add(me.input(i));
+		}
+		return refAggs;
 	}
 	
-	private static boolean rIsRowTemplateWithoutAggOrVects(CPlanMemoTable memo, Hop current, HashSet<Long> visited) {
+	private static boolean rIsRowTemplateWithoutAggOrVects(CPlanMemoTable memo, Hop current, HashSet<Long> visited, boolean inclRoot) {
 		if( visited.contains(current.getHopID()) )
 			return true;
 		
-		boolean ret = true;
 		MemoTableEntry me = memo.getBest(current.getHopID(), TemplateType.ROW);
-		for(int i=0; i<3; i++)
+		boolean ret = !inclRoot || !isRowAggOp(current);
+		for(int i=0; i<3 && ret; i++)
 			if( me!=null && me.isPlanRef(i) )
-				ret &= rIsRowTemplateWithoutAggOrVects(memo, current.getInput().get(i), visited);
-		ret &= !(current instanceof AggUnaryOp || current instanceof AggBinaryOp
-			|| HopRewriteUtils.isBinary(current, OpOp2.CBIND));
+				ret &= rIsRowTemplateWithoutAggOrVects(memo, 
+					current.getInput().get(i), visited, true);
 		
 		visited.add(current.getHopID());
 		return ret;
 	}
 	
+	private static boolean isRowAggOp(Hop hop){
+		return (hop instanceof AggUnaryOp || hop instanceof AggBinaryOp
+			|| HopRewriteUtils.isBinary(hop, OpOp2.CBIND));
+	}
+	
 	private static void pruneInvalidAndSpecialCasePlans(CPlanMemoTable memo, PlanPartition part) 
 	{	
 		//prune invalid row entries w/ violated blocksize constraint
@@ -613,9 +615,8 @@ public class PlanSelectionFuseCostBasedV2 extends PlanSelection
 						&& HopRewriteUtils.isTransposeOperation(in));
 				if( isSpark && !validNcol ) {
 					List<MemoTableEntry> blacklist = memo.get(hopID, TemplateType.ROW);
-					memo.remove(memo.getHopRefs().get(hopID), new HashSet<>(blacklist));
-					if( !memo.contains(hopID) )
-						memo.removeAllRefTo(hopID);
+					memo.remove(memo.getHopRefs().get(hopID), TemplateType.ROW);
+					memo.removeAllRefTo(hopID, TemplateType.ROW);
 					if( LOG.isTraceEnabled() ) {
 						LOG.trace("Removed row memo table entries w/ violated blocksize constraint ("+hopID+"): "
 							+ Arrays.toString(blacklist.toArray(new MemoTableEntry[0])));
@@ -625,10 +626,11 @@ public class PlanSelectionFuseCostBasedV2 extends PlanSelection
 		}
 		
 		//prune row aggregates with pure cellwise operations
+		HashSet<Long> refAggs = getRowAggOpsWithRowRef(memo, part);
 		for( Long hopID : part.getPartition() ) {
 			MemoTableEntry me = memo.getBest(hopID, TemplateType.ROW);
 			if( me != null && me.type == TemplateType.ROW && memo.contains(hopID, TemplateType.CELL)
-				&& isRowTemplateWithoutAggOrVects(memo, memo.getHopRefs().get(hopID), new HashSet<Long>())) {
+				&& rIsRowTemplateWithoutAggOrVects(memo, memo.getHopRefs().get(hopID), new HashSet<Long>(), refAggs.contains(hopID)) ) {
 				List<MemoTableEntry> blacklist = memo.get(hopID, TemplateType.ROW); 
 				memo.remove(memo.getHopRefs().get(hopID), new HashSet<>(blacklist));
 				if( LOG.isTraceEnabled() ) {
@@ -698,28 +700,25 @@ public class PlanSelectionFuseCostBasedV2 extends PlanSelection
 		//i.e., plans that become invalid after the previous pruning step
 		long hopID = current.getHopID();
 		if( part.getPartition().contains(hopID) && memo.contains(hopID, TemplateType.ROW) ) {
-			for( MemoTableEntry me : memo.get(hopID) ) {
-				if( me.type==TemplateType.ROW ) {
-					//convert leaf node with pure vector inputs
-					if( !me.hasPlanRef() && !TemplateUtils.hasMatrixInput(current) ) {
+			for( MemoTableEntry me : memo.get(hopID, TemplateType.ROW) ) {
+				//convert leaf node with pure vector inputs
+				if( !me.hasPlanRef() && !TemplateUtils.hasMatrixInput(current) ) {
+					me.type = TemplateType.CELL;
+					if( LOG.isTraceEnabled() )
+						LOG.trace("Converted leaf memo table entry from row to cell: "+me);
+				}
+				
+				//convert inner node without row template input
+				if( me.hasPlanRef() && !ROW_TPL.open(current) ) {
+					boolean hasRowInput = false;
+					for( int i=0; i<3; i++ )
+						if( me.isPlanRef(i) )
+							hasRowInput |= memo.contains(me.input(i), TemplateType.ROW);
+					if( !hasRowInput ) {
 						me.type = TemplateType.CELL;
 						if( LOG.isTraceEnabled() )
-							LOG.trace("Converted leaf memo table entry from row to cell: "+me);
-					}
-					
-					//convert inner node without row template input
-					if( me.hasPlanRef() && !ROW_TPL.open(current) ) {
-						boolean hasRowInput = false;
-						for( int i=0; i<3; i++ )
-							if( me.isPlanRef(i) )
-								hasRowInput |= memo.contains(me.input(i), TemplateType.ROW);
-						if( !hasRowInput ) {
-							me.type = TemplateType.CELL;
-							if( LOG.isTraceEnabled() )
-								LOG.trace("Converted inner memo table entry from row to cell: "+me);	
-						}
+							LOG.trace("Converted inner memo table entry from row to cell: "+me);	
 					}
-					
 				}
 			}
 		}
@@ -834,14 +833,16 @@ public class PlanSelectionFuseCostBasedV2 extends PlanSelection
 				String type = (best !=null) ? best.type.name() : "HOP";
 				LOG.trace("Cost vector ("+type+" "+currentHopId+"): "+costVect);
 			}
-			double tmpCosts = costVect.outSize * 8 / WRITE_BANDWIDTH //time for output write
-				+ Math.max(costVect.getSumInputSizes() * 8 / READ_BANDWIDTH,
-				costVect.computeCosts*costVect.getMaxInputSize()/ COMPUTE_BANDWIDTH);
+			double tmpCosts = costVect.outSize * 8 / WRITE_BANDWIDTH
+				+ Math.max(costVect.getInputSize() * 8 / READ_BANDWIDTH,
+				costVect.computeCosts/ COMPUTE_BANDWIDTH);
+			//read correction for distributed computation
+			Hop driver = memo.getHopRefs().get(costVect.getMaxInputSizeHopID());
+			if( driver.getMemEstimate() > OptimizerUtils.getLocalMemBudget() )
+				tmpCosts += costVect.getSideInputSize() * 8 / READ_BANDWIDTH_BROADCAST;
 			//sparsity correction for outer-product template (and sparse-safe cell)
-			if( best != null && best.type == TemplateType.OUTER ) {
-				Hop driver = memo.getHopRefs().get(costVect.getMaxInputSizeHopID());
+			if( best != null && best.type == TemplateType.OUTER )
 				tmpCosts *= driver.dimsKnown(true) ? driver.getSparsity() : SPARSE_SAFE_SPARSITY_EST;
-			}
 			costs += tmpCosts;
 		}
 		//add costs for non-partition read in the middle of fused operator
@@ -978,12 +979,9 @@ public class PlanSelectionFuseCostBasedV2 extends PlanSelection
 			costs = 1;
 		}
 		else if( current instanceof AggBinaryOp ) {
-			//outer product template
-			if( HopRewriteUtils.isOuterProductLikeMM(current) )
-				costs = 2 * current.getInput().get(0).getDim2();
-			//row template w/ matrix-vector or matrix-matrix
-			else
-				costs = 2 * current .getDim2();
+			//outer product template w/ matrix-matrix 
+			//or row template w/ matrix-vector or matrix-matrix
+			costs = 2 * current.getInput().get(0).getDim2();
 		}
 		else if( current instanceof AggUnaryOp) {
 			switch(((AggUnaryOp)current).getOp()) {
@@ -993,10 +991,15 @@ public class PlanSelectionFuseCostBasedV2 extends PlanSelection
 			case MAX:    costs = 1; break;
 			default:
 				LOG.warn("Cost model not "
-					+ "implemented yet for: "+((AggUnaryOp)current).getOp());			
+					+ "implemented yet for: "+((AggUnaryOp)current).getOp());
 			}
 		}
 		
+		//scale by current output size in order to correctly reflect
+		//a mix of row and cell operations in the same fused operator
+		//(e.g., row template with fused column vector operations)
+		costs *= getSize(current);
+		
 		computeCosts.put(current.getHopID(), costs);
 	}
 	
@@ -1025,8 +1028,14 @@ public class PlanSelectionFuseCostBasedV2 extends PlanSelection
 			//ensures that input sizes are not double counted
 			inSizes.put(hopID, inputSize);
 		}
-		public double getSumInputSizes() {
+		public double getInputSize() {
+			return inSizes.values().stream()
+				.mapToDouble(d -> d.doubleValue()).sum();
+		}
+		public double getSideInputSize() {
+			double max = getMaxInputSize();
 			return inSizes.values().stream()
+				.filter(d -> d < max)
 				.mapToDouble(d -> d.doubleValue()).sum();
 		}
 		public double getMaxInputSize() {

http://git-wip-us.apache.org/repos/asf/systemml/blob/311e4aac/src/main/java/org/apache/sysml/hops/codegen/template/CPlanMemoTable.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/hops/codegen/template/CPlanMemoTable.java b/src/main/java/org/apache/sysml/hops/codegen/template/CPlanMemoTable.java
index 99ffc8d..5eedc7b 100644
--- a/src/main/java/org/apache/sysml/hops/codegen/template/CPlanMemoTable.java
+++ b/src/main/java/org/apache/sysml/hops/codegen/template/CPlanMemoTable.java
@@ -95,11 +95,10 @@ public class CPlanMemoTable
 			.anyMatch(p -> (!checkClose||!p.isClosed()) && probe.contains(p.type));
 	}
 	
-	public boolean containsNotIn(long hopID, Collection<TemplateType> types, 
-		boolean checkChildRefs, boolean excludeCell) {
+	public boolean containsNotIn(long hopID, 
+		Collection<TemplateType> types, boolean checkChildRefs) {
 		return contains(hopID) && get(hopID).stream()
-			.anyMatch(p -> (!checkChildRefs || p.hasPlanRef()) 
-				&& (!excludeCell || p.type!=TemplateType.CELL)
+			.anyMatch(p -> (!checkChildRefs || p.hasPlanRef())
 				&& p.isValid() && !types.contains(p.type));
 	}
 	
@@ -153,14 +152,22 @@ public class CPlanMemoTable
 			.removeIf(p -> blackList.contains(p));
 	}
 	
+	public void remove(Hop hop, TemplateType type) {
+		_plans.get(hop.getHopID())
+			.removeIf(p -> p.type == type);
+	}
+	
 	public void removeAllRefTo(long hopID) {
+		removeAllRefTo(hopID, null);
+	}
+	
+	public void removeAllRefTo(long hopID, TemplateType type) {
 		//recursive removal of references
 		for( Entry<Long, List<MemoTableEntry>> e : _plans.entrySet() ) {
-			if( !e.getValue().isEmpty() ) {
-				e.getValue().removeIf(p -> p.hasPlanRefTo(hopID));
-				if( e.getValue().isEmpty() )
-					removeAllRefTo(e.getKey());
-			}
+			if( e.getValue().isEmpty() || e.getKey()==hopID ) 
+				continue;
+			e.getValue().removeIf(p -> p.hasPlanRefTo(hopID)
+				&& (type==null || p.type==type));
 		}
 	}
 	

http://git-wip-us.apache.org/repos/asf/systemml/blob/311e4aac/src/main/java/org/apache/sysml/runtime/codegen/LibSpoofPrimitives.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/codegen/LibSpoofPrimitives.java b/src/main/java/org/apache/sysml/runtime/codegen/LibSpoofPrimitives.java
index 7624d96..91fde5e 100644
--- a/src/main/java/org/apache/sysml/runtime/codegen/LibSpoofPrimitives.java
+++ b/src/main/java/org/apache/sysml/runtime/codegen/LibSpoofPrimitives.java
@@ -1788,11 +1788,13 @@ public class LibSpoofPrimitives
 	//dynamic memory management
 	
 	public static void setupThreadLocalMemory(int numVectors, int len) {
-		setupThreadLocalMemory(numVectors, len, -1);
+		if( numVectors > 0 )
+			setupThreadLocalMemory(numVectors, len, -1);
 	}
 	
 	public static void setupThreadLocalMemory(int numVectors, int len, int len2) {
-		memPool.set(new VectorBuffer(numVectors, len, len2));
+		if( numVectors > 0 )
+			memPool.set(new VectorBuffer(numVectors, len, len2));
 	}
 	
 	public static void cleanupThreadLocalMemory() {