You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@systemml.apache.org by mb...@apache.org on 2017/10/12 08:12:43 UTC

[1/5] systemml git commit: [SYSTEMML-1955] Memory-efficient ultra-sparse matrix multiplication

Repository: systemml
Updated Branches:
  refs/heads/master 426d7fa0d -> a97bc53f7


[SYSTEMML-1955] Memory-efficient ultra-sparse matrix multiplication

So far the codepath for ultra-sparse matrix multiply was only used if
any of the two inputs classified as ultra-sparse in terms of its
sparsity and absolute number of non-zeros. For special cases of
permutation matrix multiply with large ultra-sparse matrices this is not
the case leading to sparse-dense matrix multiplication which allocates
the output in dense format. 

This patch improves the code path selection for ultra-sparse matrix
multiplication to include such permutation matrix multiplies, which
allocates the output in sparse format and thus is much more memory
efficient. Additionally, it also improves performance by avoiding
repeated reallocations. 

For example, on a scenario of 58825360 x 2519371 (nnz=58825360) times
2519371 x 300 (nnz=755810998), this patch enables the successful
execution in CP (before it crashed due to allocating a dense output
larger than 16GB). By avoiding reallocations, this patch further
improved the execution time from 112s to 24s due to significantly
reduced GC overhead.


Project: http://git-wip-us.apache.org/repos/asf/systemml/repo
Commit: http://git-wip-us.apache.org/repos/asf/systemml/commit/0b5480b7
Tree: http://git-wip-us.apache.org/repos/asf/systemml/tree/0b5480b7
Diff: http://git-wip-us.apache.org/repos/asf/systemml/diff/0b5480b7

Branch: refs/heads/master
Commit: 0b5480b77de3391bec0ee1e60ab7d71de80e9605
Parents: 426d7fa
Author: Matthias Boehm <mb...@gmail.com>
Authored: Wed Oct 11 14:20:00 2017 -0700
Committer: Matthias Boehm <mb...@gmail.com>
Committed: Thu Oct 12 01:13:05 2017 -0700

----------------------------------------------------------------------
 .../runtime/matrix/data/LibMatrixMult.java      | 88 +++++++++++---------
 .../sysml/runtime/matrix/data/MatrixBlock.java  | 22 +++--
 2 files changed, 66 insertions(+), 44 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/systemml/blob/0b5480b7/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixMult.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixMult.java b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixMult.java
index 0885508..aedf975 100644
--- a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixMult.java
+++ b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixMult.java
@@ -29,6 +29,7 @@ import java.util.concurrent.Executors;
 import java.util.concurrent.Future;
 
 import org.apache.commons.math3.util.FastMath;
+import org.apache.sysml.hops.OptimizerUtils;
 import org.apache.sysml.lops.MapMultChain.ChainType;
 import org.apache.sysml.lops.WeightedCrossEntropy.WCeMMType;
 import org.apache.sysml.lops.WeightedDivMM.WDivMMType;
@@ -111,19 +112,20 @@ public class LibMatrixMult
 	
 	public static void matrixMult(MatrixBlock m1, MatrixBlock m2, MatrixBlock ret, int rl, int ru, boolean examSparsity) 
 		throws DMLRuntimeException
-	{	
+	{
 		//check inputs / outputs
 		if( m1.isEmptyBlock(false) || m2.isEmptyBlock(false) ) {
 			ret.examSparsity(); //turn empty dense into sparse
 			return;
 		}
-			
+		
 		//Timing time = new Timing(true);
-			
+		
 		//pre-processing: output allocation
+		boolean ultraSparse = isUltraSparseMatrixMult(m1, m2);
 		boolean tm2 = checkPrepMatrixMultRightInput(m1,m2);
 		m2 = prepMatrixMultRightInput(m1, m2);
-		ret.sparse = (m1.isUltraSparse() || m2.isUltraSparse());
+		ret.sparse = ultraSparse;
 		if( !ret.sparse )
 			ret.allocateDenseBlock();
 		
@@ -133,7 +135,7 @@ public class LibMatrixMult
 		int cu = m2.clen;
 		
 		//core matrix mult computation
-		if( m1.isUltraSparse() || m2.isUltraSparse() )
+		if( ultraSparse )
 			matrixMultUltraSparse(m1, m2, ret, 0, ru2);
 		else if(!m1.sparse && !m2.sparse)
 			matrixMultDenseDense(m1, m2, ret, tm2, pm2, 0, ru2, 0, cu);
@@ -147,13 +149,11 @@ public class LibMatrixMult
 		//post-processing: nnz/representation
 		if( !ret.sparse )
 			ret.recomputeNonZeros();
-		
 		if(examSparsity)
 			ret.examSparsity();
 		
-		
 		//System.out.println("MM ("+m1.isInSparseFormat()+","+m1.getNumRows()+","+m1.getNumColumns()+","+m1.getNonZeros()+")x" +
-		//		              "("+m2.isInSparseFormat()+","+m2.getNumRows()+","+m2.getNumColumns()+","+m2.getNonZeros()+") in "+time.stop());
+		//		"("+m2.isInSparseFormat()+","+m2.getNumRows()+","+m2.getNumColumns()+","+m2.getNonZeros()+") in "+time.stop());
 	}
 	
 	/**
@@ -168,13 +168,13 @@ public class LibMatrixMult
 	 */
 	public static void matrixMult(MatrixBlock m1, MatrixBlock m2, MatrixBlock ret, int k) 
 		throws DMLRuntimeException
-	{	
+	{
 		//check inputs / outputs
 		if( m1.isEmptyBlock(false) || m2.isEmptyBlock(false) ) {
 			ret.examSparsity(); //turn empty dense into sparse
 			return;
 		}
-			
+		
 		//check too high additional vector-matrix memory requirements (fallback to sequential)
 		//check too small workload in terms of flops (fallback to sequential too)
 		if( m1.rlen == 1 && (8L * m2.clen * k > MEM_OVERHEAD_THRESHOLD || !LOW_LEVEL_OPTIMIZATION || m2.clen==1 || m1.isUltraSparse() || m2.isUltraSparse()) 
@@ -188,15 +188,13 @@ public class LibMatrixMult
 		
 		//pre-processing: output allocation (in contrast to single-threaded,
 		//we need to allocate sparse as well in order to prevent synchronization)
+		boolean ultraSparse = isUltraSparseMatrixMult(m1, m2);
 		boolean tm2 = checkPrepMatrixMultRightInput(m1,m2);
 		m2 = prepMatrixMultRightInput(m1, m2);
-		ret.sparse = (m1.isUltraSparse() || m2.isUltraSparse());
-		if( !ret.sparse )
-			ret.allocateDenseBlock();
-		else
-			ret.allocateSparseRowsBlock();
+		ret.sparse = ultraSparse;
+		ret.allocateBlock();
 		
-		if (!ret.isThreadSafe()){
+		if (!ret.isThreadSafe()) {
 			matrixMult(m1, m2, ret);
 			return;
 		}
@@ -216,7 +214,7 @@ public class LibMatrixMult
 			for( int i=0, lb=0; i<blklens.size(); lb+=blklens.get(i), i++ )
 				tasks.add(new MatrixMultTask(m1, m2, ret, tm2, pm2r, pm2c, lb, lb+blklens.get(i)));
 			//execute tasks
-			List<Future<Object>> taskret = pool.invokeAll(tasks);	
+			List<Future<Object>> taskret = pool.invokeAll(tasks);
 			pool.shutdown();
 			//aggregate partial results (nnz, ret for vector/matrix)
 			ret.nonZeros = 0; //reset after execute
@@ -233,12 +231,11 @@ public class LibMatrixMult
 			throw new DMLRuntimeException(ex);
 		}
 		
-		
 		//post-processing (nnz maintained in parallel)
 		ret.examSparsity();
 		
 		//System.out.println("MM k="+k+" ("+m1.isInSparseFormat()+","+m1.getNumRows()+","+m1.getNumColumns()+","+m1.getNonZeros()+")x" +
-		//		              "("+m2.isInSparseFormat()+","+m2.getNumRows()+","+m2.getNumColumns()+","+m2.getNonZeros()+") in "+time.stop());
+		//		"("+m2.isInSparseFormat()+","+m2.getNumRows()+","+m2.getNumColumns()+","+m2.getNonZeros()+") in "+time.stop());
 	}
 	
 	/**
@@ -752,7 +749,7 @@ public class LibMatrixMult
 		try 
 		{			
 			ExecutorService pool = Executors.newFixedThreadPool(k);
-			ArrayList<MatrixMultWDivTask> tasks = new ArrayList<MatrixMultWDivTask>();			
+			ArrayList<MatrixMultWDivTask> tasks = new ArrayList<MatrixMultWDivTask>();
 			//create tasks (for wdivmm-left, parallelization over columns;
 			//for wdivmm-right, parallelization over rows; both ensure disjoint results)
 			if( wt.isLeft() ) {
@@ -827,7 +824,7 @@ public class LibMatrixMult
 		ret.allocateDenseBlock();
 		
 		try 
-		{			
+		{
 			ExecutorService pool = Executors.newFixedThreadPool(k);
 			ArrayList<MatrixMultWCeTask> tasks = new ArrayList<MatrixMultWCeTask>();
 			int blklen = (int)(Math.ceil((double)mW.rlen/k));
@@ -1476,9 +1473,8 @@ public class LibMatrixMult
 	private static void matrixMultUltraSparse(MatrixBlock m1, MatrixBlock m2, MatrixBlock ret, int rl, int ru) 
 		throws DMLRuntimeException 
 	{
-		//TODO perf sparse block, consider iterators
-		
-		boolean leftUS = m1.isUltraSparse();
+		final boolean leftUS = m1.isUltraSparse()
+			|| (m1.isUltraSparse(false) && !m2.isUltraSparse());
 		final int m  = m1.rlen;
 		final int cd = m1.clen;
 		final int n  = m2.clen;
@@ -1486,6 +1482,7 @@ public class LibMatrixMult
 		if( leftUS ) //left is ultra-sparse (IKJ)
 		{
 			SparseBlock a = m1.sparseBlock;
+			SparseBlock c = ret.sparseBlock;
 			boolean rightSparse = m2.sparse;
 			
 			for( int i=rl; i<ru; i++ )
@@ -1504,13 +1501,19 @@ public class LibMatrixMult
 							if( !m2.sparseBlock.isEmpty(aix) ) {
 								ret.rlen=m;
 								ret.allocateSparseRowsBlock(false); //allocation on demand
-								ret.sparseBlock.set(i, m2.sparseBlock.get(aix), true); 
+								boolean ldeep = (m2.sparseBlock instanceof SparseBlockMCSR);
+								ret.sparseBlock.set(i, m2.sparseBlock.get(aix), ldeep);
 								ret.nonZeros += ret.sparseBlock.size(i);
 							}
 						}
 						else { //dense right matrix (append all values)
-							for( int j=0; j<n; j++ )
-								ret.appendValue(i, j, m2.quickGetValue(aix, j));
+							int lnnz = (int)m2.recomputeNonZeros(aix, aix, 0, n-1);
+							if( lnnz > 0 ) {
+								c.allocate(i, lnnz); //allocate once
+								for( int j=0; j<n; j++ )
+									c.append(i, j, m2.quickGetValue(aix, j));
+								ret.nonZeros += lnnz;
+							}
 						}
 					}
 					else //GENERAL CASE
@@ -1536,13 +1539,13 @@ public class LibMatrixMult
 			SparseBlock b = m2.sparseBlock;
 			
 			for(int k = 0; k < cd; k++ ) 
-			{			
+			{
 				if( !b.isEmpty(k) ) 
 				{
 					int bpos = b.pos(k);
 					int blen = b.size(k);
 					int[] bixs = b.indexes(k);
-					double[] bvals = b.values(k);								
+					double[] bvals = b.values(k);
 					for( int j=bpos; j<bpos+blen; j++ )
 					{
 						double bval = bvals[j];
@@ -3552,6 +3555,14 @@ public class LibMatrixMult
 				&& m2.clen > k * 1024 && m1.rlen < k * 32 && !pm2r
 				&& 8*m1.rlen*m1.clen < 256*1024 ); //lhs fits in L2 cache
 	}
+	
+	public static boolean isUltraSparseMatrixMult(MatrixBlock m1, MatrixBlock m2) {
+		//note: ultra-sparse matrix mult implies also sparse outputs, hence we need
+		//to be conservative an cannot use this for all ultra-sparse matrices.
+		return (m1.isUltraSparse() || m2.isUltraSparse()) //base case
+			|| (m1.isUltraSparsePermutationMatrix() 
+				&& OptimizerUtils.getSparsity(m2.rlen, m2.clen, m2.nonZeros)<1.0);
+	}
 
 	private static MatrixBlock prepMatrixMultRightInput( MatrixBlock m1, MatrixBlock m2 ) 
 		throws DMLRuntimeException
@@ -3643,17 +3654,16 @@ public class LibMatrixMult
 
 	private static class MatrixMultTask implements Callable<Object> 
 	{
-		private MatrixBlock _m1  = null;
-		private MatrixBlock _m2  = null;
+		private final MatrixBlock _m1;
+		private final MatrixBlock _m2;
 		private MatrixBlock _ret = null;
-		private boolean _tm2 = false; //transposed m2
-		private boolean _pm2r = false; //par over m2 rows
-		private boolean _pm2c = false; //par over m2 rows
-		
-		private int _rl = -1;
-		private int _ru = -1;
+		private final boolean _tm2; //transposed m2
+		private final boolean _pm2r; //par over m2 rows
+		private final boolean _pm2c; //par over m2 rows
+		private final int _rl;
+		private final int _ru;
 
-		protected MatrixMultTask( MatrixBlock m1, MatrixBlock m2, MatrixBlock ret, 
+		protected MatrixMultTask( MatrixBlock m1, MatrixBlock m2, MatrixBlock ret,
 				boolean tm2, boolean pm2r, boolean pm2c, int rl, int ru )
 		{
 			_m1 = m1;
@@ -3687,7 +3697,7 @@ public class LibMatrixMult
 				_ret.allocateDenseBlock();
 			
 			//compute block matrix multiplication
-			if( _m1.isUltraSparse() || _m2.isUltraSparse() )
+			if( _ret.sparse ) //ultra-sparse
 				matrixMultUltraSparse(_m1, _m2, _ret, rl, ru);
 			else if(!_m1.sparse && !_m2.sparse)
 				matrixMultDenseDense(_m1, _m2, _ret, _tm2, _pm2r, rl, ru, cl, cu);

http://git-wip-us.apache.org/repos/asf/systemml/blob/0b5480b7/src/main/java/org/apache/sysml/runtime/matrix/data/MatrixBlock.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/matrix/data/MatrixBlock.java b/src/main/java/org/apache/sysml/runtime/matrix/data/MatrixBlock.java
index ff05fa0..8f9ae9e 100644
--- a/src/main/java/org/apache/sysml/runtime/matrix/data/MatrixBlock.java
+++ b/src/main/java/org/apache/sysml/runtime/matrix/data/MatrixBlock.java
@@ -916,16 +916,28 @@ public class MatrixBlock extends MatrixValue implements CacheBlock, Externalizab
 	 * 
 	 * @return true if sparse
 	 */
-	public boolean isInSparseFormat()
-	{
+	public boolean isInSparseFormat() {
 		return sparse;
 	}
+	
+	public boolean isUltraSparse() {
+		return isUltraSparse(true);
+	}
 
-	public boolean isUltraSparse()
-	{
+	public boolean isUltraSparse(boolean checkNnz) {
 		double sp = ((double)nonZeros/rlen)/clen;
 		//check for sparse representation in order to account for vectors in dense
-		return sparse && sp<ULTRA_SPARSITY_TURN_POINT && nonZeros<40;
+		return sparse && sp<ULTRA_SPARSITY_TURN_POINT && (!checkNnz || nonZeros<40);
+	}
+	
+	public boolean isUltraSparsePermutationMatrix() {
+		if( !isUltraSparse(false) )
+			return false;
+		boolean isPM = true;
+		SparseBlock sblock = getSparseBlock();
+		for( int i=0; i<rlen & isPM; i++ )
+			isPM &= sblock.isEmpty(i) || sblock.size(i) == 1;
+		return isPM;
 	}
 
 	/**


[5/5] systemml git commit: [SYSTEMML-1954] Memory-efficient rexpand output (CSR instead of MCSR)

Posted by mb...@apache.org.
[SYSTEMML-1954] Memory-efficient rexpand output (CSR instead of MCSR)

The rexpand operator is compiled for operations such as table(seq(1,N),
v) and (for the case of seq on the lhs), simply horizontally expands the
given value into the respective column index. For large N, this creates
a huge but ultra-sparse matrix. In past we introduced scalar rows for
the MCSR sparse block to reduce the memory overhead of sparse rows with
a single entry.

However, this approach was still memory-inefficient and causes
unnecessary GC overhead. This patch now creates the output for rexpand
columns in CSR instead of MCSR representation. In order to avoid
unnecessary CSR row pointer updates on append, we also provide the CSR
primitives to construct the block directly from a given column index
array. On a scenario of rexpand 60M rows, this patch improved the
rexpand performance from 39.4s to 3.8s with additional end-to-end
improvements due to reduce GC overhead which also affects subsequent
operations.


Project: http://git-wip-us.apache.org/repos/asf/systemml/repo
Commit: http://git-wip-us.apache.org/repos/asf/systemml/commit/a97bc53f
Tree: http://git-wip-us.apache.org/repos/asf/systemml/tree/a97bc53f
Diff: http://git-wip-us.apache.org/repos/asf/systemml/diff/a97bc53f

Branch: refs/heads/master
Commit: a97bc53f7c2ab74374733376e0137174f821bd95
Parents: a8f6a3c
Author: Matthias Boehm <mb...@gmail.com>
Authored: Thu Oct 12 01:05:03 2017 -0700
Committer: Matthias Boehm <mb...@gmail.com>
Committed: Thu Oct 12 01:13:09 2017 -0700

----------------------------------------------------------------------
 .../data/LibMatrixCuDNNInputRowFetcher.java     |  2 +-
 .../runtime/matrix/data/LibMatrixReorg.java     | 28 +++++++++++++++++---
 .../sysml/runtime/matrix/data/MatrixBlock.java  |  6 ++---
 .../runtime/matrix/data/SparseBlockCSR.java     | 27 +++++++++++++++++++
 4 files changed, 55 insertions(+), 8 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/systemml/blob/a97bc53f/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuDNNInputRowFetcher.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuDNNInputRowFetcher.java b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuDNNInputRowFetcher.java
index b9619c8..a21dae0 100644
--- a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuDNNInputRowFetcher.java
+++ b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuDNNInputRowFetcher.java
@@ -56,7 +56,7 @@ public class LibMatrixCuDNNInputRowFetcher implements java.lang.AutoCloseable {
 	 * Copy the nth row and return the dense pointer
 	 * @param n zero-based row index
 	 * @return dense pointer containing the nth row. This row is reused in the next iteration
-	 * @throws DMLRuntimeException
+	 * @throws DMLRuntimeException ?
 	 */
 	public Pointer getNthRow(int n) throws DMLRuntimeException {
 		if(isInputInSparseFormat) {

http://git-wip-us.apache.org/repos/asf/systemml/blob/a97bc53f/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixReorg.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixReorg.java b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixReorg.java
index 0bd4402..d2d3d86 100644
--- a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixReorg.java
+++ b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixReorg.java
@@ -71,6 +71,9 @@ public class LibMatrixReorg
 	//allow reuse of temporary blocks for certain operations
 	public static final boolean ALLOW_BLOCK_REUSE = false;
 	
+	//use csr instead of mcsr sparse block for rexpand columns
+	public static final boolean REXPAND_COLS_IN_CSR = true;
+	
 	private enum ReorgType {
 		TRANSPOSE,
 		REV,
@@ -1907,7 +1910,7 @@ public class LibMatrixReorg
 		
 		return ret;
 	}
-
+	
 	private static MatrixBlock rexpandColumns(MatrixBlock in, MatrixBlock ret, int max, boolean cast, boolean ignore, int k) 
 		throws DMLRuntimeException
 	{
@@ -1921,7 +1924,8 @@ public class LibMatrixReorg
 		
 		//execute rexpand columns
 		long rnnz = 0; //real nnz (due to cutoff max)
-		if( k <= 1 || in.getNumRows() <= PAR_NUMCELL_THRESHOLD ) {
+		if( k <= 1 || in.getNumRows() <= PAR_NUMCELL_THRESHOLD
+			|| (sp && REXPAND_COLS_IN_CSR) ) {
 			rnnz = rexpandColumns(in, ret, max, cast, ignore, 0, rlen);
 		}
 		else {
@@ -1951,6 +1955,14 @@ public class LibMatrixReorg
 	private static long rexpandColumns(MatrixBlock in, MatrixBlock ret, int max, boolean cast, boolean ignore, int rl, int ru) 
 		throws DMLRuntimeException
 	{
+		//initialize auxiliary data structures 
+		int lnnz = 0;
+		int[] cix = null;
+		if( ret.sparse && REXPAND_COLS_IN_CSR ) {
+			cix = new int[in.rlen];
+			Arrays.fill(cix, -1);
+		}
+		
 		//expand input horizontally (input vector likely dense 
 		//but generic implementation for general case)
 		for( int i=rl; i<ru; i++ )
@@ -1967,17 +1979,25 @@ public class LibMatrixReorg
 			//set expanded value if matching
 			if( val == Math.floor(val) && val >= 1 && val <= max ) {
 				//update target without global nnz maintenance
-				if( ret.isInSparseFormat() ) {
+				if( cix != null ) {
+					cix[i] = (int)(val-1);
+				}
+				else if( ret.isInSparseFormat() ) {
 					ret.sparseBlock.allocate(i, 1);
 					ret.sparseBlock.append(i, (int)(val-1), 1);
 				}
 				else
 					ret.setValueDenseUnsafe(i, (int)(val-1), 1);
+				lnnz ++;
 			}
 		}
 		
+		//init CSR block once to avoid repeated updates of row pointers on append
+		if( cix != null )
+			ret.sparseBlock = new SparseBlockCSR(in.rlen, lnnz, cix);
+		
 		//recompute nnz of partition
-		return ret.recomputeNonZeros(rl, ru-1, 0, ret.getNumColumns()-1);
+		return ret.setNonZeros(lnnz);
 	}
 	
 	private static void copyColVector( MatrixBlock in, int ixin, double[] tmp, int[] tmpi, int len)

http://git-wip-us.apache.org/repos/asf/systemml/blob/a97bc53f/src/main/java/org/apache/sysml/runtime/matrix/data/MatrixBlock.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/matrix/data/MatrixBlock.java b/src/main/java/org/apache/sysml/runtime/matrix/data/MatrixBlock.java
index 8f9ae9e..822c6f9 100644
--- a/src/main/java/org/apache/sysml/runtime/matrix/data/MatrixBlock.java
+++ b/src/main/java/org/apache/sysml/runtime/matrix/data/MatrixBlock.java
@@ -463,8 +463,8 @@ public class MatrixBlock extends MatrixValue implements CacheBlock, Externalizab
 		return nonZeros;
 	}
 	
-	public void setNonZeros(long nnz) {
-		nonZeros = nnz;
+	public long setNonZeros(long nnz) {
+		return (nonZeros = nnz);
 	}
 	
 	public boolean isVector() {
@@ -4810,7 +4810,7 @@ public class MatrixBlock extends MatrixValue implements CacheBlock, Externalizab
 	 * This function computes the total weight
 	 * 
 	 * @return sum weight for quantile
-	 * @throws DMLRuntimeException
+	 * @throws DMLRuntimeException on error
 	 */
 	public double sumWeightForQuantile() 
 		throws DMLRuntimeException 

http://git-wip-us.apache.org/repos/asf/systemml/blob/a97bc53f/src/main/java/org/apache/sysml/runtime/matrix/data/SparseBlockCSR.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/matrix/data/SparseBlockCSR.java b/src/main/java/org/apache/sysml/runtime/matrix/data/SparseBlockCSR.java
index bc817c7..35ffedf 100644
--- a/src/main/java/org/apache/sysml/runtime/matrix/data/SparseBlockCSR.java
+++ b/src/main/java/org/apache/sysml/runtime/matrix/data/SparseBlockCSR.java
@@ -166,6 +166,33 @@ public class SparseBlockCSR extends SparseBlock
 	}
 	
 	/**
+	 * Copy constructor for given array of column indexes, which
+	 * identifies rows by position and implies values of 1.
+	 * 
+	 * @param rows number of rows
+	 * @param nnz number of non-zeros
+	 * @param colInd column indexes
+	 */
+	public SparseBlockCSR(int rows, int nnz, int[] colInd) {
+		_ptr = new int[rows+1];
+		_indexes = (rows==nnz) ? colInd : new int[nnz];
+		_values = new double[nnz];
+		Arrays.fill(_values, 1);
+		_size = nnz;
+		
+		//single-pass construction of row pointers
+		//and copy of column indexes if necessary
+		for(int i=0, pos=0; i<rows; i++) {
+			if( colInd[i] >= 0 ) {
+				if( rows > nnz )
+					_indexes[pos] = colInd[i];
+				pos++;
+			}
+			_ptr[i+1] = pos;
+		}
+	}
+	
+	/**
 	 * Initializes the CSR sparse block from an ordered input
 	 * stream of ultra-sparse ijv triples. 
 	 * 


[2/5] systemml git commit: [SYSTEMML-1956] Performance sparse N:1 matrix-matrix reshape

Posted by mb...@apache.org.
[SYSTEMML-1956] Performance sparse N:1 matrix-matrix reshape

This patch improves performance for the special case of reshaping a
sparse m x n matrix into m2 x n2 where n%n2==0. In this case, we can
easily determine the number of non-zeros per target row from the input
row block, which allows to allocate the output once without unnecessary
repeated allocations. 

On a 58825360 x 300 sparse input, this patch improved performance from
79s to 30.6s.


Project: http://git-wip-us.apache.org/repos/asf/systemml/repo
Commit: http://git-wip-us.apache.org/repos/asf/systemml/commit/92bad9ee
Tree: http://git-wip-us.apache.org/repos/asf/systemml/tree/92bad9ee
Diff: http://git-wip-us.apache.org/repos/asf/systemml/diff/92bad9ee

Branch: refs/heads/master
Commit: 92bad9ee3fc5389124faad0eb61eda748871e102
Parents: 0b5480b
Author: Matthias Boehm <mb...@gmail.com>
Authored: Wed Oct 11 15:39:41 2017 -0700
Committer: Matthias Boehm <mb...@gmail.com>
Committed: Thu Oct 12 01:13:06 2017 -0700

----------------------------------------------------------------------
 .../runtime/matrix/data/LibMatrixReorg.java     | 24 ++++++++++++++++++--
 1 file changed, 22 insertions(+), 2 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/systemml/blob/92bad9ee/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixReorg.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixReorg.java b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixReorg.java
index c64bbc6..0bd4402 100644
--- a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixReorg.java
+++ b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixReorg.java
@@ -1161,7 +1161,7 @@ public class LibMatrixReorg
 			// * vector-matrix not really different from general
 			// * clen=1 and cols=1 will never be sparse.
 			
-			if( rows==1 ) //MATRIX->VECTOR	
+			if( rows==1 ) //MATRIX->VECTOR
 			{
 				//note: cache-friendly on a and c; append-only
 				c.allocate(0, estnnz, cols);
@@ -1177,6 +1177,26 @@ public class LibMatrixReorg
 					}
 				}
 			}
+			else if( cols%clen==0 ) { //SPECIAL N:1 MATRIX->MATRIX
+				int n = cols/clen;
+				for(int bi=0, ci=0; bi<rlen; bi+=n, ci++) {
+					//allocate output row once (w/o re-allocations)
+					long lnnz = a.size(bi, bi+n);
+					c.allocate(ci, (int)lnnz);
+					//copy N input rows into output row
+					for( int i=bi, cix=0; i<bi+n; i++, cix+=clen ) {
+						if(a.isEmpty(i)) continue;
+						int apos = a.pos(i);
+						int alen = a.size(i);
+						int[] aix = a.indexes(i);
+						double[] avals = a.values(i);
+						for( int j=apos; j<apos+alen; j++ ) {
+							int cj = (int)((cix+aix[j])%cols);
+							c.append(ci, cj, avals[j]);
+						}
+					}
+				}
+			}
 			else //GENERAL CASE: MATRIX->MATRIX
 			{
 				//note: cache-friendly on a but not c; append-only
@@ -1193,7 +1213,7 @@ public class LibMatrixReorg
 						for( int j=apos; j<apos+alen; j++ )
 						{
 							int ci = (int)((cix+aix[j])/cols);
-							int cj = (int)((cix+aix[j])%cols);       
+							int cj = (int)((cix+aix[j])%cols);
 							c.allocate(ci, estnnz, cols);
 							c.append(ci, cj, avals[j]);
 						}


[3/5] systemml git commit: [SYSTEMML-1956] Fix robustness frame rbind w/ mismatching schemas

Posted by mb...@apache.org.
[SYSTEMML-1956] Fix robustness frame rbind w/ mismatching schemas

This patch makes the frame rbind more robust by allowing graceful schema
conversions. We now try - in a best effort manner - to convert the
values of the second input into the types of the first input, which
determine the schema of the output. 


Project: http://git-wip-us.apache.org/repos/asf/systemml/repo
Commit: http://git-wip-us.apache.org/repos/asf/systemml/commit/deb4baf0
Tree: http://git-wip-us.apache.org/repos/asf/systemml/tree/deb4baf0
Diff: http://git-wip-us.apache.org/repos/asf/systemml/diff/deb4baf0

Branch: refs/heads/master
Commit: deb4baf06c3a9d204523dc868f72ea23e307f4c4
Parents: 92bad9e
Author: Matthias Boehm <mb...@gmail.com>
Authored: Wed Oct 11 17:12:31 2017 -0700
Committer: Matthias Boehm <mb...@gmail.com>
Committed: Thu Oct 12 01:13:07 2017 -0700

----------------------------------------------------------------------
 .../sysml/runtime/matrix/data/FrameBlock.java   | 31 ++++++++++++++++++--
 1 file changed, 29 insertions(+), 2 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/systemml/blob/deb4baf0/src/main/java/org/apache/sysml/runtime/matrix/data/FrameBlock.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/matrix/data/FrameBlock.java b/src/main/java/org/apache/sysml/runtime/matrix/data/FrameBlock.java
index 6079956..23dbd8b 100644
--- a/src/main/java/org/apache/sysml/runtime/matrix/data/FrameBlock.java
+++ b/src/main/java/org/apache/sysml/runtime/matrix/data/FrameBlock.java
@@ -554,6 +554,20 @@ public class FrameBlock implements Writable, CacheBlock, Externalizable
 	}
 	
 	/**
+	 * Get a row iterator over the frame where all fields are encoded
+	 * as boxed objects according to the value types of the provided
+	 * target schema.
+	 * 
+	 * @param schema target schema of objects
+	 * @return object array iterator
+	 */
+	public Iterator<Object[]> getObjectRowIterator(ValueType[] schema) {
+		ObjectRowIterator iter = new ObjectRowIterator(0, _numRows);
+		iter.setSchema(schema);
+		return iter;
+	}
+	
+	/**
 	 * Get a row iterator over the frame where all selected fields are 
 	 * encoded as boxed objects according to their value types.  
 	 * 
@@ -992,7 +1006,7 @@ public class FrameBlock implements Writable, CacheBlock, Externalizable
 			ret._coldata = new Array[getNumColumns()];
 			for( int j=0; j<getNumColumns(); j++ )
 				ret._coldata[j] = _coldata[j].clone();
-			Iterator<Object[]> iter = that.getObjectRowIterator();
+			Iterator<Object[]> iter = that.getObjectRowIterator(_schema);
 			while( iter.hasNext() )
 				ret.appendRow(iter.next());
 		}
@@ -1221,6 +1235,8 @@ public class FrameBlock implements Writable, CacheBlock, Externalizable
 	}
 
 	private class ObjectRowIterator extends RowIterator<Object> {
+		private ValueType[] _tgtSchema = null;
+		
 		public ObjectRowIterator(int rl, int ru) {
 			super(rl, ru);
 		}
@@ -1229,6 +1245,10 @@ public class FrameBlock implements Writable, CacheBlock, Externalizable
 			super(rl, ru, cols);
 		}
 		
+		public void setSchema(ValueType[] schema) {
+			_tgtSchema = schema;
+		}
+		
 		@Override
 		protected Object[] createRow(int size) {
 			return new Object[size];
@@ -1237,10 +1257,17 @@ public class FrameBlock implements Writable, CacheBlock, Externalizable
 		@Override
 		public Object[] next( ) {
 			for( int j=0; j<_cols.length; j++ )
-				_curRow[j] = get(_curPos, _cols[j]-1);
+				_curRow[j] = getValue(_curPos, _cols[j]-1);
 			_curPos++;
 			return _curRow;
 		}
+		
+		private Object getValue(int i, int j) {
+			Object val = get(i, j);
+			if( _tgtSchema != null )
+				val = UtilFunctions.objectToObject(_tgtSchema[j], val);
+			return val;
+		}
 	}
 	
 	///////


[4/5] systemml git commit: [SYSTEMML-1958] Performance sparse conv2d and dense-sparse mm (minor)

Posted by mb...@apache.org.
[SYSTEMML-1958] Performance sparse conv2d and dense-sparse mm (minor)

This patch makes two minor performance improvement, namely the
preallocation of matrix multiplication output blocks in sparse conv2d
(to amortize the allocation cost across images), and minor cleanups in
the core dense-sparse block matrix multiply. However, this dense-sparse
mm operations requires further improvement, but initial attempt were
unsuccessful. Finally, this patch also includes a fix for the recently
modified ultra-sparse mm code path.


Project: http://git-wip-us.apache.org/repos/asf/systemml/repo
Commit: http://git-wip-us.apache.org/repos/asf/systemml/commit/a8f6a3cb
Tree: http://git-wip-us.apache.org/repos/asf/systemml/tree/a8f6a3cb
Diff: http://git-wip-us.apache.org/repos/asf/systemml/diff/a8f6a3cb

Branch: refs/heads/master
Commit: a8f6a3cb68af2cfae618fc0c3411997f2ab3ec83
Parents: deb4baf
Author: Matthias Boehm <mb...@gmail.com>
Authored: Wed Oct 11 23:40:39 2017 -0700
Committer: Matthias Boehm <mb...@gmail.com>
Committed: Thu Oct 12 01:13:08 2017 -0700

----------------------------------------------------------------------
 .../matrix/data/LibMatrixDNNConv2dHelper.java   |  46 +++---
 .../matrix/data/LibMatrixDNNIm2ColHelper.java   | 146 ++++++++-----------
 .../runtime/matrix/data/LibMatrixMult.java      |  78 +++++-----
 3 files changed, 117 insertions(+), 153 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/systemml/blob/a8f6a3cb/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNNConv2dHelper.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNNConv2dHelper.java b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNNConv2dHelper.java
index 2685f52..1036af7 100644
--- a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNNConv2dHelper.java
+++ b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNNConv2dHelper.java
@@ -128,8 +128,10 @@ public class LibMatrixDNNConv2dHelper {
 		@Override
 		public Long call() throws Exception {
 			int PQ = _params.P*_params.Q; int K = _params.K; int CRS = _params.C*_params.R*_params.S;
-			MatrixBlock im2ColOutBlock = new MatrixBlock(CRS, PQ, false);
-			LibMatrixDNNIm2ColHelper.Im2colWorker im2ColWorker = LibMatrixDNNIm2ColHelper.Im2colWorker.getWorker( _params.input1, im2ColOutBlock, _params, true);
+			MatrixBlock outIm2col = new MatrixBlock(CRS, PQ, false);
+			MatrixBlock outMM = new MatrixBlock(K, PQ, false);
+			LibMatrixDNNIm2ColHelper.Im2colWorker im2ColWorker = 
+					LibMatrixDNNIm2ColHelper.Im2colWorker.getWorker( _params.input1, outIm2col, _params, true);
 			long time1 = 0; long time2 = 0;
 			for(int n = _rl; n < _ru; n++)  {
 				// im2col(input) => _im2ColOutBlock
@@ -138,8 +140,8 @@ public class LibMatrixDNNConv2dHelper {
 				long t2 = DMLScript.STATISTICS && LibMatrixDNN.DISPLAY_STATISTICS ? System.nanoTime() : 0;
 				
 				// filter %*% _im2ColOutBlock => matMultOutBlock
-				MatrixBlock matMultOutBlock = new MatrixBlock(K, PQ, false);
-				LibMatrixDNNHelper.singleThreadedMatMult(_params.input2, im2ColOutBlock, matMultOutBlock, false, true, _params);
+				outMM.reset(outMM.rlen, outMM.clen, false);
+				LibMatrixDNNHelper.singleThreadedMatMult(_params.input2, outIm2col, outMM, false, true, _params);
 				long t3 = DMLScript.STATISTICS && LibMatrixDNN.DISPLAY_STATISTICS ? System.nanoTime() : 0;
 				
 				if(DMLScript.STATISTICS && LibMatrixDNN.DISPLAY_STATISTICS) {
@@ -148,7 +150,7 @@ public class LibMatrixDNNConv2dHelper {
 				}
 				
 				// Copy the matrix matMultOutBlock of shape [K X PQ] to params.output.denseBlock + destPos
-				partialCopy1(matMultOutBlock, _params.output.getDenseBlock(), n*K*PQ, K, PQ);
+				partialCopy1(outMM, _params.output.getDenseBlock(), n*K*PQ, K, PQ);
 			}
 			if(_params.bias != null) {
 				// bias is always converted to dense format
@@ -165,27 +167,23 @@ public class LibMatrixDNNConv2dHelper {
 		private static void partialCopy1(MatrixBlock src, double [] dest, int destPos, int K, int PQ) {
 			// Copying is required as LibMatrixMult.matrixMult (and/or Java) is not pointer aware.
 			// This is not required in Native implementation
-			if(!src.isEmptyBlock()) {
-				if(src.isInSparseFormat()) {
-					// Copy the sparse matrix matMultOutBlock of shape [K X PQ] to 
-					// params.output.denseBlock + destPos
-					for(int k = 0; k < src.getNumRows(); k++) {
-						if( !src.sparseBlock.isEmpty(k) ) {
-							int apos = src.sparseBlock.pos(k);
-							int alen = src.sparseBlock.size(k);
-							int[] aix = src.sparseBlock.indexes(k);
-							double[] avals = src.sparseBlock.values(k);
-							int desPosK = destPos + k*PQ;
-							for(int j = apos; j < apos+alen; j++) {
-								int pqIndex = aix[j];
-								dest[desPosK + pqIndex ] = avals[j];
-							}
-						}
-					}
+			if( src.isEmptyBlock() )
+				return;
+			if(src.isInSparseFormat()) {
+				SparseBlock sblock = src.sparseBlock;
+				for(int k = 0; k < src.getNumRows(); k++) {
+					if( sblock.isEmpty(k) ) continue;
+					int apos = sblock.pos(k);
+					int alen = sblock.size(k);
+					int[] aix = sblock.indexes(k);
+					double[] avals = sblock.values(k);
+					int desPosK = destPos + k*PQ;
+					for(int j = apos; j < apos+alen; j++)
+						dest[desPosK+aix[j]] = avals[j];
 				}
-				else 
-					System.arraycopy(src.denseBlock, 0, dest, destPos, K * PQ);
 			}
+			else 
+				System.arraycopy(src.denseBlock, 0, dest, destPos, K * PQ);
 		}
 	}
 	

http://git-wip-us.apache.org/repos/asf/systemml/blob/a8f6a3cb/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNNIm2ColHelper.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNNIm2ColHelper.java b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNNIm2ColHelper.java
index d427a26..3296c7f 100644
--- a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNNIm2ColHelper.java
+++ b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNNIm2ColHelper.java
@@ -31,61 +31,38 @@ public class LibMatrixDNNIm2ColHelper {
 	static interface Im2colWorker {
 		public void execute(int n);
 		public void execute(int n, int c);
-		public static Im2colWorker getWorker(MatrixBlock input, MatrixBlock im2ColOutBlock, ConvolutionParameters params, boolean allChannels) {
-			if(allChannels) {
-				if(!input.isInSparseFormat()) {
-					// Note: Only dense im2col operators require the im2ColOutBlock to be allocated in the dense format.
-					im2ColOutBlock.allocateDenseBlock();
-					if (params.stride_h == 1 && params.stride_w == 1 && params.pad_h == 0 && params.pad_w == 0)  {
-						if(LOG.isTraceEnabled()) LOG.trace("Using DenseIm2colWorkerStride1Pad0AllChannels operator to perform im2col.");
-						return new DenseIm2colWorkerStride1Pad0AllChannels(input.getDenseBlock(), im2ColOutBlock.getDenseBlock(), params);
-					}
-					else {
-						if(LOG.isTraceEnabled()) LOG.trace("Using DenseIm2colWorkerAllChannels operator to perform im2col.");
-						return new DenseIm2colWorkerAllChannels(input.getDenseBlock(), im2ColOutBlock.getDenseBlock(), params);
-					}
-				}
-				else {
-					if(LOG.isTraceEnabled()) LOG.trace("Using SparseIm2colWorkerAllChannels operator to perform im2col.");
-					double sparsity = Math.min(MatrixBlock.SPARSITY_TURN_POINT, (input.getNonZeros()*2.0) / (input.getNumRows()*input.getNumColumns()));
-					initializeSparseIm2ColBlock(im2ColOutBlock, (long)Math.ceil(params.P*params.Q*sparsity));
-					return new SparseSparseIm2colWorkerAllChannels(input, im2ColOutBlock, params);
-				}
+		public static Im2colWorker getWorker(MatrixBlock input, MatrixBlock out, ConvolutionParameters params, boolean allChannels) {
+			if(!input.isInSparseFormat()) {
+				boolean stride1Pad0 = params.stride_h == 1 && params.stride_w == 1 
+						&& params.pad_h == 0 && params.pad_w == 0;
+				// Note: Only dense im2col operators require the im2ColOutBlock to be allocated in the dense format.
+				out.reset(out.rlen, out.clen, false);
+				out.allocateDenseBlock();
+				if( LOG.isTraceEnabled() ) 
+					LOG.trace("Using DenseIm2colWorkerAllChannels operator to perform "
+						+ "im2col (stride1pad0="+stride1Pad0+", allChannels="+allChannels+").");
+				if(allChannels && stride1Pad0 )
+					return new DenseIm2colWorkerStride1Pad0AllChannels(input.getDenseBlock(), out.getDenseBlock(), params);
+				else if( allChannels )
+					return new DenseIm2colWorkerAllChannels(input.getDenseBlock(), out.getDenseBlock(), params);
+				else if( stride1Pad0 )
+					return new DenseIm2colWorkerStride1Pad0(input.getDenseBlock(), out.getDenseBlock(), params);
+				else
+					return new DenseIm2colWorker(input.getDenseBlock(), out.getDenseBlock(), params);
 			}
 			else {
-				if(!input.isInSparseFormat()) {
-					// Note: Only dense im2col operators require the im2ColOutBlock to be allocated in the dense format.
-					im2ColOutBlock.allocateDenseBlock();
-					if (params.stride_h == 1 && params.stride_w == 1 && params.pad_h == 0 && params.pad_w == 0) {
-						if(LOG.isTraceEnabled()) LOG.trace("Using DenseIm2colWorkerStride1Pad0 operator to perform im2col.");
-						return new DenseIm2colWorkerStride1Pad0(input.getDenseBlock(), im2ColOutBlock.getDenseBlock(), params);
-					}
-					else {
-						if(LOG.isTraceEnabled()) LOG.trace("Using DenseIm2colWorker operator to perform im2col.");
-						return new DenseIm2colWorker(input.getDenseBlock(), im2ColOutBlock.getDenseBlock(), params);
-					}
-				}
-				else {
-					if(LOG.isTraceEnabled()) LOG.trace("Using SparseIm2colWorker operator to perform im2col.");
-					double sparsity = Math.min(MatrixBlock.SPARSITY_TURN_POINT, (input.getNonZeros()*2.0) / (input.getNumRows()*input.getNumColumns()));
-					initializeSparseIm2ColBlock(im2ColOutBlock, (long)Math.ceil(params.P*params.Q*sparsity));
-					return new SparseSparseIm2colWorker(input, im2ColOutBlock, params);
-				}
-			}
-		}
-		
-		static void initializeSparseIm2ColBlock(MatrixBlock im2ColOutBlock, long worstCaseNNPerRow) {
-			if(worstCaseNNPerRow >= Integer.MAX_VALUE)
-				throw new RuntimeException("The dimension of intermediate im2col matrix exceeded:" + worstCaseNNPerRow);
-			// Set to sparse
-			im2ColOutBlock.sparse = true;
-			im2ColOutBlock.denseBlock = null;
-			im2ColOutBlock.allocateSparseRowsBlock();
-			
-			for(int r = 0; r < im2ColOutBlock.getNumRows(); r++) {
-				im2ColOutBlock.getSparseBlock().allocate(r, (int) worstCaseNNPerRow);
+				if(LOG.isTraceEnabled()) 
+					LOG.trace("Using SparseIm2colWorker operator to perform im2col.");
+				out.reset(out.rlen, out.clen, true);
+				out.allocateSparseRowsBlock();
+				//preallocate sparse-rows
+				double sparsity = Math.min(MatrixBlock.SPARSITY_TURN_POINT, 
+					(input.getNonZeros()*2.0) / (input.getNumRows()*input.getNumColumns()));
+				for(int r = 0; r < out.rlen; r++)
+					out.getSparseBlock().allocate(r, (int)Math.ceil(params.P*params.Q*sparsity));
+				
+				return new SparseSparseIm2colWorkerAllChannels(input, out, params);
 			}
-			im2ColOutBlock.setNonZeros(0);
 		}
 	}
 	
@@ -275,17 +252,15 @@ public class LibMatrixDNNIm2ColHelper {
 	/**
 	 * Performing sparse im2col for all channels for a given row n of the input matrix.
 	 */
-	static class SparseSparseIm2colWorkerAllChannels implements Im2colWorker {
-		MatrixBlock input;  MatrixBlock output;
-		int CRS; int S; int R; int P; int Q; int H; int W; int RS; int HW;
-		int stride_h; int stride_w; int pad_h; int pad_w;
+	private static class SparseSparseIm2colWorkerAllChannels implements Im2colWorker {
+		final MatrixBlock input, output;
+		final int S, R, P, Q, W, HW;
+		final int stride_h, stride_w, pad_h, pad_w;
 		public SparseSparseIm2colWorkerAllChannels(MatrixBlock input, MatrixBlock im2ColOutBlock, ConvolutionParameters params) {
 			this.input = input;
 			this.output = im2ColOutBlock;
-			this.CRS = params.C * params.R * params.S;
-			this.RS = params.R * params.S;
 			this.HW = params.H * params.W;
-			this.H = params.H; this.W = params.W; this.R = params.R; this.S = params.S; this.P = params.P; this.Q = params.Q;
+			this.W = params.W; this.R = params.R; this.S = params.S; this.P = params.P; this.Q = params.Q;
 			this.stride_h = params.stride_h; this.stride_w = params.stride_w;
 			this.pad_h = params.pad_h; this.pad_w = params.pad_w;
 			if(!input.isInSparseFormat()) 
@@ -299,33 +274,32 @@ public class LibMatrixDNNIm2ColHelper {
 
 		@Override
 		public void execute(int n) {
-			if( !input.sparseBlock.isEmpty(n) ) {
-				output.sparseBlock.reset();
-				output.setNonZeros(0);
-				int apos = input.sparseBlock.pos(n);
-				int alen = input.sparseBlock.size(n);
-				int[] aix = input.sparseBlock.indexes(n);
-				double[] avals = input.sparseBlock.values(n);
-				
-				// Iterate over the sparse block
-				for(int j=apos; j<apos+alen; j++) {
-					// Note: the input is of shape [N, CHW]
-					int chw = aix[j];
-					
-					// Get individual zero-based c,h,w indexes from zero-based 'chw'
-					int cInput = chw / HW;
-					int hInput = (chw - cInput*HW)/W;
-					int wInput = chw % W; 
-					
-					appendInputValueToIm2colOutput(output, cInput, hInput, wInput, avals[j], 
-							R, S, P, Q, stride_h, stride_w, pad_h, pad_w);
-				}
-				// Since the chw are appended in sorted order, no need to sort the output rows
-				// if(meta.sortRows) output.sortSparseRows();
+			output.reset();
+			
+			SparseBlock sblock = input.sparseBlock;
+			if( sblock.isEmpty(n) ) {
+				return;
 			}
-			else {
-				output.setNonZeros(0);
+			
+			int apos = input.sparseBlock.pos(n);
+			int alen = input.sparseBlock.size(n);
+			int[] aix = input.sparseBlock.indexes(n);
+			double[] avals = input.sparseBlock.values(n);
+			
+			// Iterate over the sparse block
+			for(int j=apos; j<apos+alen; j++) {
+				// Note: the input is of shape [N, CHW]
+				int chw = aix[j];
+				
+				// Get individual zero-based c,h,w indexes from zero-based 'chw'
+				int cInput = chw / HW;
+				int hInput = (chw - cInput*HW)/W;
+				int wInput = chw % W; 
+				
+				appendInputValueToIm2colOutput(output, cInput, hInput, wInput, avals[j], 
+						R, S, P, Q, stride_h, stride_w, pad_h, pad_w);
 			}
+			// Since the chw are appended in sorted order, no need to sort the output rows
 		}
 	}
 	
@@ -365,7 +339,7 @@ public class LibMatrixDNNIm2ColHelper {
 		int sMax = Math.min(S-1, wInput + pad_w);
 		// Constraint 3: (hInput - r + pad_h) % stride_h == 0
 		while((hInput - rMin + pad_h) % stride_h != 0 && rMin <= rMax) rMin++;
-		while((wInput - sMin + pad_w) % stride_w != 0 && sMin <= sMax) sMin++;	
+		while((wInput - sMin + pad_w) % stride_w != 0 && sMin <= sMax) sMin++;
 		
 		for(int r = rMin; r <= rMax; r += stride_h) {
 			// Only append value if h == hInput, where h = (r - pad_h) + p*stride_h and 0 <= p < P
@@ -378,8 +352,6 @@ public class LibMatrixDNNIm2ColHelper {
 				// chw -> [crs, pq]
 				output.appendValue(outRowIndex + s, pQ + q, value);
 				// Since the chw are appended in sorted order, no need to sort the output rows
-				// if(meta.lastIndexPerRow[outRowIndex + s] > p*Q + q) meta.sortRows = true;
-				// meta.lastIndexPerRow[outRowIndex + s] = p*Q + q;
 			}
 		}
 	}

http://git-wip-us.apache.org/repos/asf/systemml/blob/a8f6a3cb/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixMult.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixMult.java b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixMult.java
index aedf975..181ff98 100644
--- a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixMult.java
+++ b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixMult.java
@@ -126,8 +126,7 @@ public class LibMatrixMult
 		boolean tm2 = checkPrepMatrixMultRightInput(m1,m2);
 		m2 = prepMatrixMultRightInput(m1, m2);
 		ret.sparse = ultraSparse;
-		if( !ret.sparse )
-			ret.allocateDenseBlock();
+		ret.allocateBlock();
 		
 		//prepare row-upper for special cases of vector-matrix
 		boolean pm2 = checkParMatrixMultRightInputRows(m1, m2, Integer.MAX_VALUE);
@@ -1094,7 +1093,7 @@ public class LibMatrixMult
 
 	private static void matrixMultDenseSparse(MatrixBlock m1, MatrixBlock m2, MatrixBlock ret, boolean pm2, int rl, int ru) 
 		throws DMLRuntimeException 
-	{	
+	{
 		double[] a = m1.denseBlock;
 		double[] c = ret.denseBlock;
 		int m = m1.rlen;
@@ -1104,42 +1103,37 @@ public class LibMatrixMult
 		// MATRIX-MATRIX (VV, MV not applicable here because V always dense)
 		if( LOW_LEVEL_OPTIMIZATION )
 		{
-			final int blocksizeI = 32; //256KB c block (typical L2 size per core), 32KB a block 
-			final int blocksizeK = 32; 
-			//note: in contrast to dense-dense, no blocking over j (would require maintaining blocksizeK indexes, counter-productive on skew)
-			
 			SparseBlock b = m2.sparseBlock;
 			
 			if( pm2 && m==1 )          //VECTOR-MATRIX
 			{
 				//parallelization over rows in rhs matrix
 				for( int k=rl; k<ru; k++ )
-					if( a[k] != 0 && !b.isEmpty(k) ) {								
+					if( a[k] != 0 && !b.isEmpty(k) ) {
 						vectMultiplyAdd(a[k], b.values(k), c, b.indexes(k), b.pos(k), 0, b.size(k));
 					}
 			}
 			else                       //MATRIX-MATRIX
 			{
+				//best effort blocking, without blocking over J because it is 
+				//counter-productive, even with front of current indexes
+				final int blocksizeK = 32;
+				final int blocksizeI = 32;
+				
 				//blocked execution
 				for( int bi = rl; bi < ru; bi+=blocksizeI )
-					for( int bk = 0, bimin = Math.min(ru, bi+blocksizeI); bk < cd; bk+=blocksizeK ) 
-					{
-						int bklen = Math.min(cd, bk+blocksizeK)-bk;
-						
+					for( int bk = 0, bimin = Math.min(ru, bi+blocksizeI); bk < cd; bk+=blocksizeK ) {
+						int bkmin = Math.min(cd, bk+blocksizeK);
 						//core sub block matrix multiplication
-			    		for( int i = bi; i < bimin; i++) 
-			    		{
-			    			int aixi = i * cd + bk; //start index on a
-			    			int cixj = i * n + 0; //scan index on c
-			    			
-			    			for( int k = 0; k < bklen; k++ )
-							{
-								double val = a[aixi+k];
-								if( val != 0 && !b.isEmpty(bk+k) ) {
-									vectMultiplyAdd(val, b.values(bk+k), c, b.indexes(bk+k), b.pos(bk+k), cixj, b.size(bk+k));
-								}
+						for(int i = bi, aix=bi*cd, cix=bi*n; i < bimin; i++, aix+=cd, cix+=n) {
+							for( int k = bk; k < bkmin; k++ ) {
+								double aval = a[aix+k];
+								if( aval == 0 || b.isEmpty(k) )
+									continue;
+								vectMultiplyAdd(aval, b.values(k), c, 
+									b.indexes(k), b.pos(k), cix, b.size(k));
 							}
-			    		}
+						}
 					}
 			}
 		}
@@ -1159,10 +1153,10 @@ public class LibMatrixMult
 							int[] bix = b.indexes(k);
 							double[] bvals = b.values(k);	
 							for(int j = bpos; j < bpos+blen; j++)
-								c[cix+bix[j]] += val * bvals[j];								
+								c[cix+bix[j]] += val * bvals[j];
 						}
 					}
-				}		
+				}
 		}
 	}
 
@@ -1190,7 +1184,7 @@ public class LibMatrixMult
 			{
 				for( int i=rl; i<ru; i++ )
 					if( !a.isEmpty(i) )
-						c[i] = dotProduct(a.values(i), b, a.indexes(i), a.pos(i), 0, a.size(i));							
+						c[i] = dotProduct(a.values(i), b, a.indexes(i), a.pos(i), 0, a.size(i));
 			}
 			else if( n==1 )               //MATRIX-VECTOR (tall rhs)
 			{
@@ -1206,13 +1200,13 @@ public class LibMatrixMult
 							int apos = a.pos(i);
 							int alen = a.size(i);
 							int[] aix = a.indexes(i);
-							double[] avals = a.values(i);					
-							int k = curk[i-bi] + apos;									
+							double[] avals = a.values(i);
+							int k = curk[i-bi] + apos;
 							for( ; k<apos+alen && aix[k]<bkmin; k++ )
 								c[i] += avals[k] * b[aix[k]];
 							curk[i-bi] = k - apos;
 						}
-					}	
+					}
 				}
 			}
 			else if( pm2 && m==1 )        //VECTOR-MATRIX
@@ -1222,7 +1216,7 @@ public class LibMatrixMult
 				{
 					int alen = a.size(0);
 					int[] aix = a.indexes(0);
-					double[] avals = a.values(0);					
+					double[] avals = a.values(0);
 					int rlix = (rl==0) ? 0 : a.posFIndexGTE(0,rl);
 					rlix = (rlix>=0) ? rlix : alen;
 					
@@ -1243,7 +1237,7 @@ public class LibMatrixMult
 						int apos = a.pos(i);
 						int alen = a.size(i);
 						int[] aix = a.indexes(i);
-						double[] avals = a.values(i);					
+						double[] avals = a.values(i);
 						
 						int k1 = (rl==0) ? apos : a.posFIndexGTE(i, rl);
 						k1 = (k1>=0) ? k1 : apos+alen;
@@ -1274,7 +1268,7 @@ public class LibMatrixMult
 					int apos = a.pos(i);
 					int alen = a.size(i);
 					int[] aix = a.indexes(i);
-					double[] avals = a.values(i);					
+					double[] avals = a.values(i);
 					//rest not aligned to blocks of 4 rows
 					int bn = alen%4;
 					for( int k=apos; k<apos+bn; k++ )
@@ -1286,7 +1280,7 @@ public class LibMatrixMult
 				}	
 			}
 			else                          //MATRIX-MATRIX
-			{							
+			{
 				//blocksizes to fit blocks of B (dense) and several rows of A/C in common L2 cache size, 
 				//while blocking A/C for L1/L2 yet allowing long scans (2 pages) in the inner loop over j
 				//in case of almost ultra-sparse matrices, we cannot ensure the blocking for the rhs and
@@ -1312,9 +1306,9 @@ public class LibMatrixMult
 									int apos = a.pos(i);
 									int alen = a.size(i);
 									int[] aix = a.indexes(i);
-									double[] avals = a.values(i);					
+									double[] avals = a.values(i);
 									
-									int k = curk[i-bi] + apos;			
+									int k = curk[i-bi] + apos;
 					    			//rest not aligned to blocks of 4 rows
 									int bn = alen%4;
 									for( ; k<apos+bn && aix[k]<bkmin; k++ )
@@ -1343,13 +1337,13 @@ public class LibMatrixMult
 					int apos = a.pos(i);
 					int alen = a.size(i);
 					int[] aix = a.indexes(i);
-					double[] avals = a.values(i);					
+					double[] avals = a.values(i);
 					
 					for(int k = apos; k < apos+alen; k++) {
 						double val = avals[k];
 						for(int j = 0, bix=aix[k]*n; j < n; j++)
-							c[cix+j] += val * b[bix+j];								
-					}						
+							c[cix+j] += val * b[bix+j];
+					}
 				}
 			}
 		}
@@ -1375,7 +1369,7 @@ public class LibMatrixMult
 				{
 					int alen = a.size(0);
 					int[] aix = a.indexes(0);
-					double[] avals = a.values(0);					
+					double[] avals = a.values(0);
 					int rlix = (rl==0) ? 0 : a.posFIndexGTE(0,rl);
 					rlix = (rlix>=0) ? rlix : alen;
 					
@@ -1384,9 +1378,9 @@ public class LibMatrixMult
 							int bpos = b.pos(aix[k]);
 							int blen = b.size(aix[k]);
 							int[] bix = b.indexes(aix[k]);
-							double[] bvals = b.values(aix[k]);								
+							double[] bvals = b.values(aix[k]);
 							vectMultiplyAdd(avals[k], bvals, c, bix, bpos, 0, blen);
-						}			
+						}
 				}
 			}	
 			else                       //MATRIX-MATRIX