You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@systemml.apache.org by mb...@apache.org on 2017/10/15 09:52:53 UTC

[2/3] systemml git commit: [MINOR] Refactoring lib matrixmult/bincell (instruction footprint)

[MINOR] Refactoring lib matrixmult/bincell (instruction footprint)

This patch makes a minor refactoring of the libraries for matrix
multiplications and binary cell-wise operations in order to reduce the
instruction footprint and simplify JIT compilation. Specifically, the
methods for dense-dense mm, sparse-dense mm, and safe binary operations
have been split into methods for the major individual cases.

On an end-to-end CNN application, this patch reduced the number of
L1-icache misses from 6,055,257,066 to 5,663,272,573 and the number of
iTLB misses from 289,601,812 to 161,268,707.


Project: http://git-wip-us.apache.org/repos/asf/systemml/repo
Commit: http://git-wip-us.apache.org/repos/asf/systemml/commit/a347af3b
Tree: http://git-wip-us.apache.org/repos/asf/systemml/tree/a347af3b
Diff: http://git-wip-us.apache.org/repos/asf/systemml/diff/a347af3b

Branch: refs/heads/master
Commit: a347af3b7b488ac3b296b9b9692f7172d60ac6f5
Parents: 17c5d5a
Author: Matthias Boehm <mb...@gmail.com>
Authored: Sun Oct 15 02:22:55 2017 -0700
Committer: Matthias Boehm <mb...@gmail.com>
Committed: Sun Oct 15 02:22:55 2017 -0700

----------------------------------------------------------------------
 .../runtime/matrix/data/LibMatrixBincell.java   | 446 +++++++------
 .../runtime/matrix/data/LibMatrixMult.java      | 665 ++++++++++---------
 2 files changed, 580 insertions(+), 531 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/systemml/blob/a347af3b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixBincell.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixBincell.java b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixBincell.java
index 7622137..2878b3b 100644
--- a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixBincell.java
+++ b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixBincell.java
@@ -210,12 +210,9 @@ public class LibMatrixBincell
 		{
 			return;
 		}
-	
-		int rlen = m1.rlen;
-		int clen = m1.clen;
-		BinaryAccessType atype = getBinaryAccessType(m1, m2);
 		
-		if(    atype == BinaryAccessType.MATRIX_COL_VECTOR //MATRIX - VECTOR
+		BinaryAccessType atype = getBinaryAccessType(m1, m2);
+		if( atype == BinaryAccessType.MATRIX_COL_VECTOR //MATRIX - VECTOR
 			|| atype == BinaryAccessType.MATRIX_ROW_VECTOR)  
 		{
 			//note: m2 vector and hence always dense
@@ -232,213 +229,24 @@ public class LibMatrixBincell
 		}
 		else //MATRIX - MATRIX
 		{
-			if(m1.sparse && m2.sparse)
-			{
-				if(ret.sparse)
-					ret.allocateSparseRowsBlock();	
-				
-				//both sparse blocks existing
-				if(m1.sparseBlock!=null && m2.sparseBlock!=null)
-				{
-					SparseBlock lsblock = m1.sparseBlock;
-					SparseBlock rsblock = m2.sparseBlock;
-					
-					if( ret.sparse && lsblock.isAligned(rsblock) )
-					{
-						SparseBlock c = ret.sparseBlock;
-						for(int r=0; r<rlen; r++) 
-							if( !lsblock.isEmpty(r) ) {
-								int alen = lsblock.size(r);
-								int apos = lsblock.pos(r);
-								int[] aix = lsblock.indexes(r);
-								double[] avals = lsblock.values(r);
-								double[] bvals = rsblock.values(r);
-								c.allocate(r, alen);
-								for( int j=apos; j<apos+alen; j++ ) {
-									double tmp = op.fn.execute(avals[j], bvals[j]);
-									c.append(r, aix[j], tmp);
-								}
-								ret.nonZeros += c.size(r);
-							}
-					}
-					else //general case
-					{	
-						for(int r=0; r<rlen; r++)
-						{
-							if( !lsblock.isEmpty(r) && !rsblock.isEmpty(r) ) {
-								mergeForSparseBinary(op, lsblock.values(r), lsblock.indexes(r), lsblock.pos(r), lsblock.size(r),
-										rsblock.values(r), rsblock.indexes(r), rsblock.pos(r), rsblock.size(r), r, ret);	
-							}
-							else if( !rsblock.isEmpty(r) ) {
-								appendRightForSparseBinary(op, rsblock.values(r), rsblock.indexes(r), 
-										rsblock.pos(r), rsblock.size(r), 0, r, ret);
-							}
-							else if( !lsblock.isEmpty(r) ){
-								appendLeftForSparseBinary(op, lsblock.values(r), lsblock.indexes(r), 
-										lsblock.pos(r), lsblock.size(r), 0, r, ret);
-							}
-							// do nothing if both not existing
-						}
-					}
-				}
-				//right sparse block existing
-				else if( m2.sparseBlock!=null )
-				{
-					SparseBlock rsblock = m2.sparseBlock;
-					
-					for(int r=0; r<Math.min(rlen, rsblock.numRows()); r++)
-						if( !rsblock.isEmpty(r) )
-						{
-							appendRightForSparseBinary(op, rsblock.values(r), rsblock.indexes(r), 
-									rsblock.pos(r), rsblock.size(r), 0, r, ret);
-						}
-				}
-				//left sparse block existing
-				else
-				{
-					SparseBlock lsblock = m1.sparseBlock;
-					
-					for(int r=0; r<rlen; r++)
-						if( !lsblock.isEmpty(r) )
-						{
-							appendLeftForSparseBinary(op, lsblock.values(r), lsblock.indexes(r), 
-									lsblock.pos(r), lsblock.size(r), 0, r, ret);
-						}
-				}
+			if(m1.sparse && m2.sparse) {
+				safeBinaryMMSparseSparse(m1, m2, ret, op);
 			}
 			else if( !ret.sparse && (m1.sparse || m2.sparse) &&
-					(op.fn instanceof Plus || op.fn instanceof Minus ||
-					op.fn instanceof PlusMultiply || op.fn instanceof MinusMultiply ||
-					(op.fn instanceof Multiply && !m2.sparse )))
-			{
-				//specific case in order to prevent binary search on sparse inputs (see quickget and quickset)
-				ret.allocateDenseBlock();
-				final int m = ret.rlen;
-				final int n = ret.clen;
-				double[] c = ret.denseBlock;
-				
-				//1) process left input: assignment
-				
-				if( m1.sparse ) //SPARSE left
-				{
-					if( m1.sparseBlock != null )
-					{
-						SparseBlock a = m1.sparseBlock;
-						
-						for( int i=0, ix=0; i<m; i++, ix+=n ) {
-							if( !a.isEmpty(i) )
-							{
-								int apos = a.pos(i);
-								int alen = a.size(i);
-								int[] aix = a.indexes(i);
-								double[] avals = a.values(i);
-								for(int k = apos; k < apos+alen; k++) 
-									c[ix+aix[k]] = avals[k];
-							}
-						}
-					}
-				}
-				else //DENSE left
-				{
-					if( !m1.isEmptyBlock(false) ) 
-						System.arraycopy(m1.denseBlock, 0, c, 0, m*n);
-					else
-						Arrays.fill(ret.denseBlock, 0, m*n, 0); 
-				}
-				
-				//2) process right input: op.fn (+,-,*), * only if dense
-				long lnnz = 0;
-				if( m2.sparse ) //SPARSE right
-				{				
-					if(m2.sparseBlock!=null)
-					{
-						SparseBlock a = m2.sparseBlock;
-						
-						for( int i=0, ix=0; i<m; i++, ix+=n ) {
-							if( !a.isEmpty(i) ) {
-								int apos = a.pos(i);
-								int alen = a.size(i);
-								int[] aix = a.indexes(i);
-								double[] avals = a.values(i);
-								for(int k = apos; k < apos+alen; k++) 
-									c[ix+aix[k]] = op.fn.execute(c[ix+aix[k]], avals[k]);
-							}
-							//exploit temporal locality of rows
-							lnnz += ret.recomputeNonZeros(i, i, 0, clen-1);
-						}
-					}
-				}
-				else //DENSE right
-				{
-					if( !m2.isEmptyBlock(false) ) {
-						double[] a = m2.denseBlock;
-						for( int i=0; i<m*n; i++ ) {
-							c[i] = op.fn.execute(c[i], a[i]);
-							lnnz += (c[i]!=0) ? 1 : 0;
-						}
-					}
-					else if(op.fn instanceof Multiply)
-						Arrays.fill(ret.denseBlock, 0, m*n, 0); 
-				}
-				
-				//3) recompute nnz
-				ret.setNonZeros(lnnz);
+				(op.fn instanceof Plus || op.fn instanceof Minus ||
+				op.fn instanceof PlusMultiply || op.fn instanceof MinusMultiply ||
+				(op.fn instanceof Multiply && !m2.sparse ))) {
+				safeBinaryMMSparseDenseDense(m1, m2, ret, op);
 			}
 			else if( !ret.sparse && !m1.sparse && !m2.sparse 
-					&& m1.denseBlock!=null && m2.denseBlock!=null )
-			{
-				ret.allocateDenseBlock();
-				final int m = ret.rlen;
-				final int n = ret.clen;
-				double[] a = m1.denseBlock;
-				double[] b = m2.denseBlock;
-				double[] c = ret.denseBlock;
-				ValueFunction fn = op.fn;
-				
-				//compute dense-dense binary, maintain nnz on-the-fly
-				int lnnz = 0;
-				for( int i=0; i<m*n; i++ ) {
-					c[i] = fn.execute(a[i], b[i]);
-					lnnz += (c[i]!=0)? 1 : 0;
-				}
-				ret.setNonZeros(lnnz);
+				&& m1.denseBlock!=null && m2.denseBlock!=null ) {
+				safeBinaryMMDenseDenseDense(m1, m2, ret, op);
 			}
-			else if( skipEmpty && (m1.sparse || m2.sparse) ) 
-			{
-				SparseBlock a = m1.sparse ? m1.sparseBlock : m2.sparseBlock;
-				if( a == null )
-					return;
-				
-				//prepare second input and allocate output
-				MatrixBlock b = m1.sparse ? m2 : m1;
-				ret.allocateBlock();
-				
-				for( int i=0; i<a.numRows(); i++ ) {
-					if( a.isEmpty(i) ) continue;
-					int apos = a.pos(i);
-					int alen = a.size(i);
-					int[] aix = a.indexes(i);
-					double[] avals = a.values(i);
-					if( ret.sparse && !b.sparse )
-						ret.sparseBlock.allocate(i, alen);
-					for(int k = apos; k < apos+alen; k++) {
-						double in2 = b.quickGetValue(i, aix[k]);
-						if( in2==0 ) continue;
-						double val = op.fn.execute(avals[k], in2);
-						ret.appendValue(i, aix[k], val);
-					}
-				}
+			else if( skipEmpty && (m1.sparse || m2.sparse) ) {
+				safeBinaryMMSparseDenseSkip(m1, m2, ret, op);
 			}
-			else //generic case
-			{
-				for(int r=0; r<rlen; r++)
-					for(int c=0; c<clen; c++) {
-						double in1 = m1.quickGetValue(r, c);
-						double in2 = m2.quickGetValue(r, c);
-						if( in1==0 && in2==0) continue;
-						double val = op.fn.execute(in1, in2);
-						ret.appendValue(r, c, val);
-					}
+			else { //generic case
+				safeBinaryMMGeneric(m1, m2, ret, op);
 			}
 		}
 	}
@@ -694,7 +502,7 @@ public class LibMatrixBincell
 						for( int j=0; j<blen; j++ ) {
 							double v1 = m1.quickGetValue(i, bix[j]);
 							double v = op.fn.execute( v1, bvals[j] );
-							ret.appendValue(i, bix[j], v);					
+							ret.appendValue(i, bix[j], v);
 						}
 					}
 				}
@@ -731,19 +539,231 @@ public class LibMatrixBincell
 		}
 		else {
 			for(int r=0; r<rlen; r++) {
-				double v1 = m1.quickGetValue(r, 0);		
+				double v1 = m1.quickGetValue(r, 0);
 				for(int c=0; c<clen; c++)
 				{
 					double v2 = m2.quickGetValue(0, c);
 					double v = op.fn.execute( v1, v2 );
-					ret.appendValue(r, c, v);	
+					ret.appendValue(r, c, v);
 				}
-			}	
-		}	
-			
+			}
+		}
+		
 		//no need to recomputeNonZeros since maintained in append value
 	}
 	
+	private static void safeBinaryMMSparseSparse(MatrixBlock m1, MatrixBlock m2, MatrixBlock ret, BinaryOperator op) 
+		throws DMLRuntimeException 
+	{
+		final int rlen = m1.rlen;
+		if(ret.sparse)
+			ret.allocateSparseRowsBlock();
+		
+		//both sparse blocks existing
+		if(m1.sparseBlock!=null && m2.sparseBlock!=null)
+		{
+			SparseBlock lsblock = m1.sparseBlock;
+			SparseBlock rsblock = m2.sparseBlock;
+			
+			if( ret.sparse && lsblock.isAligned(rsblock) )
+			{
+				SparseBlock c = ret.sparseBlock;
+				for(int r=0; r<rlen; r++) 
+					if( !lsblock.isEmpty(r) ) {
+						int alen = lsblock.size(r);
+						int apos = lsblock.pos(r);
+						int[] aix = lsblock.indexes(r);
+						double[] avals = lsblock.values(r);
+						double[] bvals = rsblock.values(r);
+						c.allocate(r, alen);
+						for( int j=apos; j<apos+alen; j++ ) {
+							double tmp = op.fn.execute(avals[j], bvals[j]);
+							c.append(r, aix[j], tmp);
+						}
+						ret.nonZeros += c.size(r);
+					}
+			}
+			else //general case
+			{
+				for(int r=0; r<rlen; r++) {
+					if( !lsblock.isEmpty(r) && !rsblock.isEmpty(r) ) {
+						mergeForSparseBinary(op, lsblock.values(r), lsblock.indexes(r), lsblock.pos(r), lsblock.size(r),
+							rsblock.values(r), rsblock.indexes(r), rsblock.pos(r), rsblock.size(r), r, ret);
+					}
+					else if( !rsblock.isEmpty(r) ) {
+						appendRightForSparseBinary(op, rsblock.values(r), rsblock.indexes(r), 
+							rsblock.pos(r), rsblock.size(r), 0, r, ret);
+					}
+					else if( !lsblock.isEmpty(r) ){
+						appendLeftForSparseBinary(op, lsblock.values(r), lsblock.indexes(r), 
+							lsblock.pos(r), lsblock.size(r), 0, r, ret);
+					}
+					// do nothing if both not existing
+				}
+			}
+		}
+		//right sparse block existing
+		else if( m2.sparseBlock!=null )
+		{
+			SparseBlock rsblock = m2.sparseBlock;
+			for(int r=0; r<Math.min(rlen, rsblock.numRows()); r++) {
+				if( rsblock.isEmpty(r) ) continue;
+				appendRightForSparseBinary(op, rsblock.values(r), rsblock.indexes(r), 
+					rsblock.pos(r), rsblock.size(r), 0, r, ret);
+			}
+		}
+		//left sparse block existing
+		else
+		{
+			SparseBlock lsblock = m1.sparseBlock;
+			for(int r=0; r<rlen; r++) {
+				if( lsblock.isEmpty(r) ) continue;
+				appendLeftForSparseBinary(op, lsblock.values(r), lsblock.indexes(r), 
+					lsblock.pos(r), lsblock.size(r), 0, r, ret);
+			}
+		}
+	}
+	
+	private static void safeBinaryMMSparseDenseDense(MatrixBlock m1, MatrixBlock m2, MatrixBlock ret, BinaryOperator op) 
+		throws DMLRuntimeException 
+	{
+		//specific case in order to prevent binary search on sparse inputs (see quickget and quickset)
+		ret.allocateDenseBlock();
+		final int m = ret.rlen;
+		final int n = ret.clen;
+		double[] c = ret.denseBlock;
+		
+		//1) process left input: assignment
+		
+		if( m1.sparse ) //SPARSE left
+		{
+			if( m1.sparseBlock != null )
+			{
+				SparseBlock a = m1.sparseBlock;
+				
+				for( int i=0, ix=0; i<m; i++, ix+=n ) {
+					if( !a.isEmpty(i) )
+					{
+						int apos = a.pos(i);
+						int alen = a.size(i);
+						int[] aix = a.indexes(i);
+						double[] avals = a.values(i);
+						for(int k = apos; k < apos+alen; k++) 
+							c[ix+aix[k]] = avals[k];
+					}
+				}
+			}
+		}
+		else //DENSE left
+		{
+			if( !m1.isEmptyBlock(false) ) 
+				System.arraycopy(m1.denseBlock, 0, c, 0, m*n);
+			else
+				Arrays.fill(ret.denseBlock, 0, m*n, 0); 
+		}
+		
+		//2) process right input: op.fn (+,-,*), * only if dense
+		long lnnz = 0;
+		if( m2.sparse ) //SPARSE right
+		{
+			if(m2.sparseBlock!=null)
+			{
+				SparseBlock a = m2.sparseBlock;
+				
+				for( int i=0, ix=0; i<m; i++, ix+=n ) {
+					if( !a.isEmpty(i) ) {
+						int apos = a.pos(i);
+						int alen = a.size(i);
+						int[] aix = a.indexes(i);
+						double[] avals = a.values(i);
+						for(int k = apos; k < apos+alen; k++) 
+							c[ix+aix[k]] = op.fn.execute(c[ix+aix[k]], avals[k]);
+					}
+					//exploit temporal locality of rows
+					lnnz += ret.recomputeNonZeros(i, i, 0, n-1);
+				}
+			}
+		}
+		else //DENSE right
+		{
+			if( !m2.isEmptyBlock(false) ) {
+				double[] a = m2.denseBlock;
+				for( int i=0; i<m*n; i++ ) {
+					c[i] = op.fn.execute(c[i], a[i]);
+					lnnz += (c[i]!=0) ? 1 : 0;
+				}
+			}
+			else if(op.fn instanceof Multiply)
+				Arrays.fill(ret.denseBlock, 0, m*n, 0); 
+		}
+		
+		//3) recompute nnz
+		ret.setNonZeros(lnnz);
+	}
+	
+	private static void safeBinaryMMDenseDenseDense(MatrixBlock m1, MatrixBlock m2, MatrixBlock ret, BinaryOperator op) 
+		throws DMLRuntimeException 
+	{
+		ret.allocateDenseBlock();
+		final int m = ret.rlen;
+		final int n = ret.clen;
+		double[] a = m1.denseBlock;
+		double[] b = m2.denseBlock;
+		double[] c = ret.denseBlock;
+		ValueFunction fn = op.fn;
+		
+		//compute dense-dense binary, maintain nnz on-the-fly
+		int lnnz = 0;
+		for( int i=0; i<m*n; i++ ) {
+			c[i] = fn.execute(a[i], b[i]);
+			lnnz += (c[i]!=0)? 1 : 0;
+		}
+		ret.setNonZeros(lnnz);
+	}
+	
+	private static void safeBinaryMMSparseDenseSkip(MatrixBlock m1, MatrixBlock m2, MatrixBlock ret, BinaryOperator op) 
+		throws DMLRuntimeException 
+	{
+		SparseBlock a = m1.sparse ? m1.sparseBlock : m2.sparseBlock;
+		if( a == null )
+			return;
+		
+		//prepare second input and allocate output
+		MatrixBlock b = m1.sparse ? m2 : m1;
+		ret.allocateBlock();
+		
+		for( int i=0; i<a.numRows(); i++ ) {
+			if( a.isEmpty(i) ) continue;
+			int apos = a.pos(i);
+			int alen = a.size(i);
+			int[] aix = a.indexes(i);
+			double[] avals = a.values(i);
+			if( ret.sparse && !b.sparse )
+				ret.sparseBlock.allocate(i, alen);
+			for(int k = apos; k < apos+alen; k++) {
+				double in2 = b.quickGetValue(i, aix[k]);
+				if( in2==0 ) continue;
+				double val = op.fn.execute(avals[k], in2);
+				ret.appendValue(i, aix[k], val);
+			}
+		}
+	}
+	
+	private static void safeBinaryMMGeneric(MatrixBlock m1, MatrixBlock m2, MatrixBlock ret, BinaryOperator op) 
+		throws DMLRuntimeException 
+	{
+		int rlen = m1.rlen;
+		int clen = m2.clen;
+		for(int r=0; r<rlen; r++)
+			for(int c=0; c<clen; c++) {
+				double in1 = m1.quickGetValue(r, c);
+				double in2 = m2.quickGetValue(r, c);
+				if( in1==0 && in2==0) continue;
+				double val = op.fn.execute(in1, in2);
+				ret.appendValue(r, c, val);
+			}
+	}
+	
 	/**
 	 * 
 	 * This will do cell wise operation for &lt;, &lt;=, &gt;, &gt;=, == and != operators.
@@ -1254,6 +1274,4 @@ public class LibMatrixBincell
 			result.appendValue(resultRow, cols2[j], v);
 		}
 	}
-	
 }
-

http://git-wip-us.apache.org/repos/asf/systemml/blob/a347af3b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixMult.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixMult.java b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixMult.java
index fee73c5..eca26f6 100644
--- a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixMult.java
+++ b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixMult.java
@@ -632,7 +632,7 @@ public class LibMatrixMult
 		ret.allocateBlock();
 		
 		try 
-		{			
+		{
 			ExecutorService pool = Executors.newFixedThreadPool(k);
 			ArrayList<MatrixMultWSigmoidTask> tasks = new ArrayList<>();
 			int blklen = (int)(Math.ceil((double)mW.rlen/k));
@@ -927,169 +927,189 @@ public class LibMatrixMult
 
 	private static void matrixMultDenseDense(MatrixBlock m1, MatrixBlock m2, MatrixBlock ret, boolean tm2, boolean pm2, int rl, int ru, int cl, int cu) 
 		throws DMLRuntimeException
-	{			
+	{
 		double[] a = m1.denseBlock;
 		double[] b = m2.denseBlock;
 		double[] c = ret.denseBlock;
 		final int m = m1.rlen;
 		final int n = m2.clen;
 		final int cd = m1.clen;
-
-		if( LOW_LEVEL_OPTIMIZATION )
-		{
-			if( m==1 && n==1 ) 		      //DOT PRODUCT
-			{
+		
+		if( LOW_LEVEL_OPTIMIZATION ) {
+			if( m==1 && n==1 ) {            //DOT PRODUCT
 				c[0] = dotProduct(a, b, cd);
 			}
-			else if( n>1 && cd == 1 )     //OUTER PRODUCT
-			{
+			else if( n>1 && cd == 1 ) {     //OUTER PRODUCT
 				for( int i=rl, cix=rl*n; i < ru; i++, cix+=n) {
 					if( a[i] == 1 )
 						System.arraycopy(b, 0, c, cix, n);
-				    else if( a[i] != 0 )
+					else if( a[i] != 0 )
 						vectMultiplyWrite(a[i], b, c, 0, cix, n);
 					else
 						Arrays.fill(c, cix, cix+n, 0);
 				}
 			}
-			else if( n==1 && cd == 1 )    //VECTOR-SCALAR
-			{
+			else if( n==1 && cd == 1 ) {    //VECTOR-SCALAR
 				vectMultiplyWrite(b[0], a, c, rl, rl, ru-rl);
 			}
-			else if( n==1 && cd<=2*1024 ) //MATRIX-VECTOR (short rhs)
-			{
-				for( int i=rl, aix=rl*cd; i < ru; i++, aix+=cd) 
-					c[i] = dotProduct(a, b, aix, 0, cd);	
+			else if( n==1 && cd<=2*1024 ) { //MATRIX-VECTOR (short rhs)
+				matrixMultDenseDenseMVShortRHS(a, b, c, cd, rl, ru);
 			}
-			else if( n==1 )               //MATRIX-VECTOR (tall rhs)
-			{
-				final int blocksizeI = 32;
-				final int blocksizeK = 2*1024; //16KB vector blocks (L1) 
-				for( int bi=rl; bi<ru; bi+=blocksizeI ) {
-					int bimin = Math.min(bi+blocksizeI, ru);
-					for( int bk=0; bk<cd; bk+=blocksizeK ) {
-						int bkmin = Math.min(bk+blocksizeK, cd);
-						for( int i=bi, aix=bi*cd+bk; i<bimin; i++, aix+=cd) 
-							c[i] += dotProduct(a, b, aix, bk, bkmin-bk);	
-					}
-				}
+			else if( n==1 ) {               //MATRIX-VECTOR (tall rhs)
+				matrixMultDenseDenseMVTallRHS(a, b, c, cd, rl, ru);
 			}
-			else if( pm2 && m==1 )        //VECTOR-MATRIX
-			{
-				//parallelization over rows in rhs matrix
-				//rest not aligned to blocks of 2 rows
-				final int kn = (ru-rl)%2;
-				if( kn == 1 && a[rl] != 0 )
-					vectMultiplyAdd(a[rl], b, c, rl*n, 0, n);
-				
-				//compute blocks of 2 rows (2 instead of 4 for small n<64) 
-				for( int k=rl+kn, bix=(rl+kn)*n; k<ru; k+=2, bix+=2*n ){
-					if( a[k] != 0 && a[k+1] != 0  )
-						vectMultiplyAdd2(a[k], a[k+1], b, c, bix, bix+n, 0, n);
-					else if( a[k] != 0 )
-						vectMultiplyAdd(a[k], b, c, bix, 0, n);
-					else if( a[k+1] != 0 )	
-						vectMultiplyAdd(a[k+1], b, c, bix+n, 0, n);
-				}
+			else if( pm2 && m==1 ) {        //VECTOR-MATRIX
+				matrixMultDenseDenseVM(a, b, c, n, cd, rl, ru);
 			}
-			else if( pm2 && m<=16 )       //MATRIX-MATRIX (short lhs) 
-			{
-				//cache-conscious parallelization over rows in rhs matrix
-				final int kn = (ru-rl)%4;				
-				
-				//rest not aligned to blocks of 2 rows
-				for( int i=0, aix=0, cix=0; i<m; i++, aix+=cd, cix+=n )
-					for( int k=rl, bix=rl*n; k<rl+kn; k++, bix+=n )
-						if( a[aix+k] != 0 )
-							vectMultiplyAdd(a[aix+k], b, c, bix, cix, n);
-				
-				final int blocksizeK = 48;
-				final int blocksizeJ = 1024;
-				
-				//blocked execution
-				for( int bk = rl+kn; bk < ru; bk+=blocksizeK ) 
-					for( int bj = 0, bkmin = Math.min(ru, bk+blocksizeK); bj < n; bj+=blocksizeJ ) 
-					{
-						//compute blocks of 4 rows in rhs w/ IKJ
-						int bjlen = Math.min(n, bj+blocksizeJ)-bj;
-						for( int i=0, aix=0, cix=bj; i<m; i++, aix+=cd, cix+=n )
-							for( int k=bk, bix=bk*n+bj; k<bkmin; k+=4, bix+=4*n ) {
-								vectMultiplyAdd4(a[aix+k], a[aix+k+1], a[aix+k+2], a[aix+k+3],
-										b, c, bix, bix+n, bix+2*n, bix+3*n, cix, bjlen);
-							}
-					}
+			else if( pm2 && m<=16 ) {       //MATRIX-MATRIX (short lhs) 
+				matrixMultDenseDenseMMShortLHS(a, b, c, m, n, cd, rl, ru);
 			}
-			else if( tm2 )                //MATRIX-MATRIX (skinny rhs)
-			{
-				//note: prepared rhs input via transpose for: m > n && cd > 64 && n < 64
-				//however, explicit flag required since dimension change m2
-				final int n2 = m2.rlen;
-				for( int i=rl, aix=rl*cd, cix=rl*n2; i < ru; i++, aix+=cd, cix+=n2 ) 
-					for( int j=0, bix=0; j<n2; j++, bix+=cd )
-						c[cix+j] = dotProduct(a, b, aix, bix, cd);
-			}
-			else                          //MATRIX-MATRIX
-			{	
-				//1) Unrolled inner loop (for better instruction-level parallelism)
-				//2) Blocked execution (for less cache trashing in parallel exec) 	
-				//3) Asymmetric block sizes (for less misses in inner loop, yet blocks in L1/L2)
-				
-				final int blocksizeI = 32; //64//256KB c block (typical L2 size per core), 32KB a block 
-				final int blocksizeK = 24; //64//256KB b block (typical L2 size per core), used while read 512B of a / read/write 4KB of c 
-				final int blocksizeJ = 1024; //512//4KB (typical main-memory page size), for scan 
-
-				//temporary arrays (nnz a, b index)
-				double[] ta = new double[ blocksizeK ];
-				int[]  tbi  = new int[ blocksizeK ];
-				
-				//blocked execution
-				for( int bi = rl; bi < ru; bi+=blocksizeI )
-					for( int bk = 0, bimin = Math.min(ru, bi+blocksizeI); bk < cd; bk+=blocksizeK ) 
-						for( int bj = cl, bkmin = Math.min(cd, bk+blocksizeK); bj < cu; bj+=blocksizeJ ) 
-						{
-							int bklen = bkmin-bk;
-							int bjlen = Math.min(cu, bj+blocksizeJ)-bj;
-							
-							//core sub block matrix multiplication
-				    		for( int i = bi; i < bimin; i++) 
-				    		{
-				    			int aixi = i * cd + bk; //start index on a
-				    			int cixj = i * n + bj; //scan index on c
-				    			
-				    			//determine nnz of a (for sparsity-aware skipping of rows)
-				    			int knnz = copyNonZeroElements(a, aixi, bk, bj, n, ta, tbi, bklen);
-				    			//if( knnz > 0 ) //for skipping empty rows
-				    			
-			    				//rest not aligned to blocks of 4 rows
-				    			final int bn = knnz % 4;
-				    			switch( bn ){
-					    			case 1: vectMultiplyAdd(ta[0], b, c, tbi[0], cixj, bjlen); break;
-					    	    	case 2: vectMultiplyAdd2(ta[0],ta[1], b, c, tbi[0], tbi[1], cixj, bjlen); break;
-					    			case 3: vectMultiplyAdd3(ta[0],ta[1],ta[2], b, c, tbi[0], tbi[1],tbi[2], cixj, bjlen); break;
-				    			}
-				    			
-				    			//compute blocks of 4 rows (core inner loop)
-				    			for( int k = bn; k<knnz; k+=4 ){
-				    				vectMultiplyAdd4( ta[k], ta[k+1], ta[k+2], ta[k+3], b, c, 
-				    						          tbi[k], tbi[k+1], tbi[k+2], tbi[k+3], cixj, bjlen );
-				    			}
-				    		}
-						}
+			else if( tm2 ) {                //MATRIX-MATRIX (skinny rhs)
+				matrixMultDenseDenseMMSkinnyRHS(a, b, c, m2.rlen, cd, rl, ru);
+			}
+			else {                          //MATRIX-MATRIX
+				matrixMultDenseDenseMM(a, b, c, n, cd, rl, ru, cl, cu);
 			}
 		}
-		else
-		{
-			double val;
+		else {
 			for( int i = rl, aix=rl*cd, cix=rl*n; i < ru; i++, cix+=n) 
-				for( int k = 0, bix=0; k < cd; k++, aix++, bix+=n)
-				{			
-					val = a[ aix ];
+				for( int k = 0, bix=0; k < cd; k++, aix++, bix+=n) {
+					double val = a[ aix ];
 					if( val != 0 )
 						for( int j = 0; j < n; j++) 
 							c[ cix+j ] += val * b[ bix+j ];
-				}	
+				}
 		}
+	}
+	
+	private static void matrixMultDenseDenseMVShortRHS(double[] a, double[] b, double[] c, int cd, int rl, int ru) 
+		throws DMLRuntimeException
+	{
+		for( int i=rl, aix=rl*cd; i < ru; i++, aix+=cd) 
+			c[i] = dotProduct(a, b, aix, 0, cd);
+	}
+	
+	private static void matrixMultDenseDenseMVTallRHS(double[] a, double[] b, double[] c, int cd, int rl, int ru) 
+		throws DMLRuntimeException
+	{
+		final int blocksizeI = 32;
+		final int blocksizeK = 2*1024; //16KB vector blocks (L1)
+		for( int bi=rl; bi<ru; bi+=blocksizeI ) {
+			int bimin = Math.min(bi+blocksizeI, ru);
+			for( int bk=0; bk<cd; bk+=blocksizeK ) {
+				int bkmin = Math.min(bk+blocksizeK, cd);
+				for( int i=bi, aix=bi*cd+bk; i<bimin; i++, aix+=cd) 
+					c[i] += dotProduct(a, b, aix, bk, bkmin-bk);
+			}
+		}
+	}
+	
+	private static void matrixMultDenseDenseVM(double[] a, double[] b, double[] c, int n, int cd, int rl, int ru) 
+		throws DMLRuntimeException
+	{
+		//parallelization over rows in rhs matrix
+		//rest not aligned to blocks of 2 rows
+		final int kn = (ru-rl)%2;
+		if( kn == 1 && a[rl] != 0 )
+			vectMultiplyAdd(a[rl], b, c, rl*n, 0, n);
 		
+		//compute blocks of 2 rows (2 instead of 4 for small n<64) 
+		for( int k=rl+kn, bix=(rl+kn)*n; k<ru; k+=2, bix+=2*n ){
+			if( a[k] != 0 && a[k+1] != 0  )
+				vectMultiplyAdd2(a[k], a[k+1], b, c, bix, bix+n, 0, n);
+			else if( a[k] != 0 )
+				vectMultiplyAdd(a[k], b, c, bix, 0, n);
+			else if( a[k+1] != 0 )	
+				vectMultiplyAdd(a[k+1], b, c, bix+n, 0, n);
+		}
+	}
+	
+	private static void matrixMultDenseDenseMMShortLHS(double[] a, double[] b, double[] c, int m, int n, int cd, int rl, int ru)
+		throws DMLRuntimeException
+	{
+		//cache-conscious parallelization over rows in rhs matrix
+		final int kn = (ru-rl)%4;
+		
+		//rest not aligned to blocks of 2 rows
+		for( int i=0, aix=0, cix=0; i<m; i++, aix+=cd, cix+=n )
+			for( int k=rl, bix=rl*n; k<rl+kn; k++, bix+=n )
+				if( a[aix+k] != 0 )
+					vectMultiplyAdd(a[aix+k], b, c, bix, cix, n);
+		
+		final int blocksizeK = 48;
+		final int blocksizeJ = 1024;
+		
+		//blocked execution
+		for( int bk = rl+kn; bk < ru; bk+=blocksizeK ) 
+			for( int bj = 0, bkmin = Math.min(ru, bk+blocksizeK); bj < n; bj+=blocksizeJ ) {
+				//compute blocks of 4 rows in rhs w/ IKJ
+				int bjlen = Math.min(n, bj+blocksizeJ)-bj;
+				for( int i=0, aix=0, cix=bj; i<m; i++, aix+=cd, cix+=n )
+					for( int k=bk, bix=bk*n+bj; k<bkmin; k+=4, bix+=4*n ) {
+						vectMultiplyAdd4(a[aix+k], a[aix+k+1], a[aix+k+2], a[aix+k+3],
+							b, c, bix, bix+n, bix+2*n, bix+3*n, cix, bjlen);
+					}
+			}
+	}
+	
+	private static void matrixMultDenseDenseMMSkinnyRHS(double[] a, double[] b, double[] c, int n2, int cd, int rl, int ru) 
+		throws DMLRuntimeException
+	{
+		//note: prepared rhs input via transpose for: m > n && cd > 64 && n < 64
+		//however, explicit flag required since dimension change m2
+		for( int i=rl, aix=rl*cd, cix=rl*n2; i < ru; i++, aix+=cd, cix+=n2 ) 
+			for( int j=0, bix=0; j<n2; j++, bix+=cd )
+				c[cix+j] = dotProduct(a, b, aix, bix, cd);
+	}
+	
+	private static void matrixMultDenseDenseMM(double[] a, double[] b, double[] c, int n, int cd, int rl, int ru, int cl, int cu) 
+		throws DMLRuntimeException
+	{
+		//1) Unrolled inner loop (for better instruction-level parallelism)
+		//2) Blocked execution (for less cache trashing in parallel exec) 	
+		//3) Asymmetric block sizes (for less misses in inner loop, yet blocks in L1/L2)
+		
+		final int blocksizeI = 32; //64//256KB c block (typical L2 size per core), 32KB a block 
+		final int blocksizeK = 24; //64//256KB b block (typical L2 size per core), used while read 512B of a / read/write 4KB of c 
+		final int blocksizeJ = 1024; //512//4KB (typical main-memory page size), for scan 
+
+		//temporary arrays (nnz a, b index)
+		double[] ta = new double[ blocksizeK ];
+		int[]  tbi  = new int[ blocksizeK ];
+		
+		//blocked execution
+		for( int bi = rl; bi < ru; bi+=blocksizeI )
+			for( int bk = 0, bimin = Math.min(ru, bi+blocksizeI); bk < cd; bk+=blocksizeK ) 
+				for( int bj = cl, bkmin = Math.min(cd, bk+blocksizeK); bj < cu; bj+=blocksizeJ ) 
+				{
+					int bklen = bkmin-bk;
+					int bjlen = Math.min(cu, bj+blocksizeJ)-bj;
+					
+					//core sub block matrix multiplication
+					for( int i = bi; i < bimin; i++) 
+					{
+						int aixi = i * cd + bk; //start index on a
+						int cixj = i * n + bj; //scan index on c
+						
+						//determine nnz of a (for sparsity-aware skipping of rows)
+						int knnz = copyNonZeroElements(a, aixi, bk, bj, n, ta, tbi, bklen);
+						//if( knnz > 0 ) //for skipping empty rows
+						
+						//rest not aligned to blocks of 4 rows
+						final int bn = knnz % 4;
+						switch( bn ){
+							case 1: vectMultiplyAdd(ta[0], b, c, tbi[0], cixj, bjlen); break;
+							case 2: vectMultiplyAdd2(ta[0],ta[1], b, c, tbi[0], tbi[1], cixj, bjlen); break;
+							case 3: vectMultiplyAdd3(ta[0],ta[1],ta[2], b, c, tbi[0], tbi[1],tbi[2], cixj, bjlen); break;
+						}
+						
+						//compute blocks of 4 rows (core inner loop)
+						for( int k = bn; k<knnz; k+=4 ){
+							vectMultiplyAdd4( ta[k], ta[k+1], ta[k+2], ta[k+3], b, c, 
+									tbi[k], tbi[k+1], tbi[k+2], tbi[k+3], cixj, bjlen );
+						}
+					}
+				}
 	}
 
 	private static void matrixMultDenseSparse(MatrixBlock m1, MatrixBlock m2, MatrixBlock ret, boolean pm2, int rl, int ru) 
@@ -1163,7 +1183,8 @@ public class LibMatrixMult
 
 	private static void matrixMultSparseDense(MatrixBlock m1, MatrixBlock m2, MatrixBlock ret, boolean pm2, int rl, int ru) 
 		throws DMLRuntimeException
-	{	
+	{
+		SparseBlock a = m1.sparseBlock;
 		double[] b = m2.denseBlock;
 		double[] c = ret.denseBlock;
 		final int m = m1.rlen;
@@ -1171,179 +1192,195 @@ public class LibMatrixMult
 		final int cd = m2.rlen;
 		final long xsp = (long)m*cd/m1.nonZeros;
 
-		if( LOW_LEVEL_OPTIMIZATION )
-		{
-			SparseBlock a = m1.sparseBlock;
-			
-			if( m==1 && n==1 )            //DOT PRODUCT
-			{
-				if( !a.isEmpty(0) ) {
+		if( LOW_LEVEL_OPTIMIZATION ) {
+			if( m==1 && n==1 ) {            //DOT PRODUCT
+				if( !a.isEmpty(0) )
 					c[0] = dotProduct(a.values(0), b, a.indexes(0), a.pos(0), 0, a.size(0));
-				}
 			}
-			else if( n==1 && cd<=2*1024 ) //MATRIX-VECTOR (short rhs)
-			{
-				for( int i=rl; i<ru; i++ )
-					if( !a.isEmpty(i) )
-						c[i] = dotProduct(a.values(i), b, a.indexes(i), a.pos(i), 0, a.size(i));
+			else if( n==1 && cd<=2*1024 ) { //MATRIX-VECTOR (short rhs)
+				matrixMultSparseDenseMVShortRHS(a, b, c, rl, ru);
 			}
-			else if( n==1 )               //MATRIX-VECTOR (tall rhs)
-			{
-				final int blocksizeI = 32;
-				final int blocksizeK = (int)Math.max(2*1024,2*1024*xsp/32); //~ 16KB L1  
-				int[] curk = new int[blocksizeI];
-				
-				for( int bi = rl; bi < ru; bi+=blocksizeI ) {
-					Arrays.fill(curk, 0); //reset positions
-					for( int bk=0, bimin = Math.min(ru, bi+blocksizeI); bk<cd; bk+=blocksizeK ) {
-						for( int i=bi, bkmin = Math.min(bk+blocksizeK, cd); i<bimin; i++) {
-							if( a.isEmpty(i) ) continue;
-							int apos = a.pos(i);
-							int alen = a.size(i);
-							int[] aix = a.indexes(i);
-							double[] avals = a.values(i);
-							int k = curk[i-bi] + apos;
-							for( ; k<apos+alen && aix[k]<bkmin; k++ )
-								c[i] += avals[k] * b[aix[k]];
-							curk[i-bi] = k - apos;
-						}
-					}
-				}
+			else if( n==1 ) {               //MATRIX-VECTOR (tall rhs)
+				matrixMultSparseDenseMVTallRHS(a, b, c, cd, xsp, rl, ru);
 			}
-			else if( pm2 && m==1 )        //VECTOR-MATRIX
-			{
-				//parallelization over rows in rhs matrix
-				if( !a.isEmpty(0) ) 
-				{
-					int alen = a.size(0);
-					int[] aix = a.indexes(0);
-					double[] avals = a.values(0);
-					int rlix = (rl==0) ? 0 : a.posFIndexGTE(0,rl);
-					rlix = (rlix>=0) ? rlix : alen;
-					
-					for( int k=rlix; k<alen && aix[k]<ru; k++ ) {
-						if( k+1<alen && aix[k+1]<ru )
-							vectMultiplyAdd2(avals[k], avals[k+1], b, c, aix[k]*n, aix[++k]*n, 0, n);
-						else
-							vectMultiplyAdd(avals[k], b, c, aix[k]*n, 0, n);
-					}
-				}
+			else if( pm2 && m==1 ) {        //VECTOR-MATRIX
+				matrixMultSparseDenseVM(a, b, c, n, rl, ru);
 			}
-			else if( pm2 && m<=16 )       //MATRIX-MATRIX (short lhs) 
-			{
-				int arlen = a.numRows();
-				for( int i=0, cix=0; i<arlen; i++, cix+=n )
-					if( !a.isEmpty(i) ) 
-					{
-						int apos = a.pos(i);
-						int alen = a.size(i);
-						int[] aix = a.indexes(i);
-						double[] avals = a.values(i);
-						
-						int k1 = (rl==0) ? 0 : a.posFIndexGTE(i, rl);
-						k1 = (k1>=0) ? apos+k1 : apos+alen;
-						int k2 = (ru==cd) ? alen : a.posFIndexGTE(i, ru);
-						k2 = (k2>=0) ? apos+k2 : apos+alen;
-						
-						//rest not aligned to blocks of 4 rows
-		    			final int bn = (k2-k1) % 4;
-		    			switch( bn ){
-			    			case 1: vectMultiplyAdd(avals[k1], b, c, aix[k1]*n, cix, n); break;
-			    	    	case 2: vectMultiplyAdd2(avals[k1],avals[k1+1], b, c, aix[k1]*n, aix[k1+1]*n, cix, n); break;
-			    			case 3: vectMultiplyAdd3(avals[k1],avals[k1+1],avals[k1+2], b, c, aix[k1]*n, aix[k1+1]*n, aix[k1+2]*n, cix, n); break;
-		    			}
-		    			
-		    			//compute blocks of 4 rows (core inner loop)
-		    			for( int k = k1+bn; k<k2; k+=4 ) {
-		    				vectMultiplyAdd4( avals[k], avals[k+1], avals[k+2], avals[k+3], b, c, 
-		    						          aix[k]*n, aix[k+1]*n, aix[k+2]*n, aix[k+3]*n, cix, n );
-		    			}
-					}
+			else if( pm2 && m<=16 ) {       //MATRIX-MATRIX (short lhs) 
+				matrixMultSparseDenseMMShortLHS(a, b, c, n, cd, rl, ru);
 			}
-			else if( n<=64 )              //MATRIX-MATRIX (skinny rhs)
-			{
-				//no blocking since b and c fit into cache anyway
-				for( int i=rl, cix=rl*n; i<ru; i++, cix+=n ) {
-					if( a.isEmpty(i) ) 
-						continue;
-					int apos = a.pos(i);
-					int alen = a.size(i);
-					int[] aix = a.indexes(i);
-					double[] avals = a.values(i);
-					//rest not aligned to blocks of 4 rows
-					int bn = alen%4;
-					for( int k=apos; k<apos+bn; k++ )
-	    				vectMultiplyAdd(avals[k], b, c, aix[k]*n, cix, n); 
-	    			//compute blocks of 4 rows (core inner loop)
-	    			for( int k=apos+bn; k<apos+alen; k+=4 )
-	    				vectMultiplyAdd4( avals[k], avals[k+1], avals[k+2], avals[k+3], b, c, 
-	    					aix[k]*n, aix[k+1]*n, aix[k+2]*n, aix[k+3]*n, cix, n );
-				}	
+			else if( n<=64 ) {              //MATRIX-MATRIX (skinny rhs)
+				matrixMultSparseDenseMMSkinnyRHS(a, b, c, n, rl, ru);
 			}
-			else                          //MATRIX-MATRIX
-			{
-				//blocksizes to fit blocks of B (dense) and several rows of A/C in common L2 cache size, 
-				//while blocking A/C for L1/L2 yet allowing long scans (2 pages) in the inner loop over j
-				//in case of almost ultra-sparse matrices, we cannot ensure the blocking for the rhs and
-				//output - however, in this case it's unlikely that we consume every cache line in the rhs
-				
-				final int blocksizeI = (int) (8L*m*cd/m1.nonZeros);
-				final int blocksizeK = (int) (8L*m*cd/m1.nonZeros);
-				final int blocksizeJ = 1024; 
-				
-				//temporary array of current sparse positions
-				int[] curk = new int[blocksizeI];
-				
-				//blocked execution over IKJ 
-				for( int bi = rl; bi < ru; bi+=blocksizeI ) {
-					Arrays.fill(curk, 0); //reset positions
-					for( int bk = 0, bimin = Math.min(ru, bi+blocksizeI); bk < cd; bk+=blocksizeK ) {
-						for( int bj = 0, bkmin = Math.min(cd, bk+blocksizeK); bj < n; bj+=blocksizeJ ) {
-							int bjlen = Math.min(n, bj+blocksizeJ)-bj;
-							
-							//core sub block matrix multiplication
-							for( int i=bi, cix=bi*n+bj; i<bimin; i++, cix+=n ) {
-								if( !a.isEmpty(i) ) {
-									int apos = a.pos(i);
-									int alen = a.size(i);
-									int[] aix = a.indexes(i);
-									double[] avals = a.values(i);
-									
-									int k = curk[i-bi] + apos;
-					    			//rest not aligned to blocks of 4 rows
-									int bn = alen%4;
-									for( ; k<apos+bn && aix[k]<bkmin; k++ )
-					    				vectMultiplyAdd(avals[k], b, c, aix[k]*n+bj, cix, bjlen); 
-					    			//compute blocks of 4 rows (core inner loop), allowed to exceed bkmin
-					    			for( ; k<apos+alen && aix[k]<bkmin; k+=4 )
-					    				vectMultiplyAdd4( avals[k], avals[k+1], avals[k+2], avals[k+3], b, c, 
-					    					aix[k]*n+bj, aix[k+1]*n+bj, aix[k+2]*n+bj, aix[k+3]*n+bj, cix, bjlen );
-					    			//update positions on last bj block
-					    			if( bj+bjlen==n )
-					    				curk[i-bi] = k - apos;
-								}
-							}
-						}
-					}
+			else {                          //MATRIX-MATRIX
+				matrixMultSparseDenseMM(a, b, c, n, cd, xsp, rl, ru);
+			}
+		}
+		else {
+			for( int i=rl, cix=rl*n; i<ru; i++, cix+=n ) {
+				if( a.isEmpty(i) ) continue; 
+				int apos = a.pos(i);
+				int alen = a.size(i);
+				int[] aix = a.indexes(i);
+				double[] avals = a.values(i);
+				for(int k = apos; k < apos+alen; k++) {
+					double val = avals[k];
+					for(int j = 0, bix=aix[k]*n; j < n; j++)
+						c[cix+j] += val * b[bix+j];
 				}
 			}
 		}
-		else
-		{
-			SparseBlock a = m1.sparseBlock;
-			for( int i=rl, cix=rl*n; i<ru; i++, cix+=n )
-			{
-				if( !a.isEmpty(i) ) 
-				{
+	}
+	
+	private static void matrixMultSparseDenseMVShortRHS(SparseBlock a, double[] b, double[] c, int rl, int ru) 
+		throws DMLRuntimeException
+	{
+		for( int i=rl; i<ru; i++ )
+			if( !a.isEmpty(i) )
+				c[i] = dotProduct(a.values(i), b, a.indexes(i), a.pos(i), 0, a.size(i));
+	}
+	
+	private static void matrixMultSparseDenseMVTallRHS(SparseBlock a, double[] b, double[] c, int cd, long xsp, int rl, int ru) 
+		throws DMLRuntimeException
+	{	
+		final int blocksizeI = 32;
+		final int blocksizeK = (int)Math.max(2*1024,2*1024*xsp/32); //~ 16KB L1
+		int[] curk = new int[blocksizeI];
+		
+		for( int bi = rl; bi < ru; bi+=blocksizeI ) {
+			Arrays.fill(curk, 0); //reset positions
+			for( int bk=0, bimin = Math.min(ru, bi+blocksizeI); bk<cd; bk+=blocksizeK ) {
+				for( int i=bi, bkmin = Math.min(bk+blocksizeK, cd); i<bimin; i++) {
+					if( a.isEmpty(i) ) continue;
 					int apos = a.pos(i);
 					int alen = a.size(i);
 					int[] aix = a.indexes(i);
 					double[] avals = a.values(i);
+					int k = curk[i-bi] + apos;
+					for( ; k<apos+alen && aix[k]<bkmin; k++ )
+						c[i] += avals[k] * b[aix[k]];
+					curk[i-bi] = k - apos;
+				}
+			}
+		}
+	}
+	
+	private static void matrixMultSparseDenseVM(SparseBlock a, double[] b, double[] c, int n, int rl, int ru) 
+		throws DMLRuntimeException
+	{
+		if( a.isEmpty(0) )
+			return;
+		
+		//parallelization over rows in rhs matrix
+		int alen = a.size(0);
+		int[] aix = a.indexes(0);
+		double[] avals = a.values(0);
+		int rlix = (rl==0) ? 0 : a.posFIndexGTE(0,rl);
+		rlix = (rlix>=0) ? rlix : alen;
+		
+		for( int k=rlix; k<alen && aix[k]<ru; k++ ) {
+			if( k+1<alen && aix[k+1]<ru )
+				vectMultiplyAdd2(avals[k], avals[k+1], b, c, aix[k]*n, aix[++k]*n, 0, n);
+			else
+				vectMultiplyAdd(avals[k], b, c, aix[k]*n, 0, n);
+		}
+	}
+	
+	private static void matrixMultSparseDenseMMShortLHS(SparseBlock a, double[] b, double[] c, int n, int cd, int rl, int ru) 
+		throws DMLRuntimeException
+	{	
+		int arlen = a.numRows();
+		for( int i=0, cix=0; i<arlen; i++, cix+=n ) {
+			if( a.isEmpty(i) ) continue;
+			int apos = a.pos(i);
+			int alen = a.size(i);
+			int[] aix = a.indexes(i);
+			double[] avals = a.values(i);
+			
+			int k1 = (rl==0) ? 0 : a.posFIndexGTE(i, rl);
+			k1 = (k1>=0) ? apos+k1 : apos+alen;
+			int k2 = (ru==cd) ? alen : a.posFIndexGTE(i, ru);
+			k2 = (k2>=0) ? apos+k2 : apos+alen;
+			
+			//rest not aligned to blocks of 4 rows
+			final int bn = (k2-k1) % 4;
+			switch( bn ){
+				case 1: vectMultiplyAdd(avals[k1], b, c, aix[k1]*n, cix, n); break;
+				case 2: vectMultiplyAdd2(avals[k1],avals[k1+1], b, c, aix[k1]*n, aix[k1+1]*n, cix, n); break;
+				case 3: vectMultiplyAdd3(avals[k1],avals[k1+1],avals[k1+2], b, c, aix[k1]*n, aix[k1+1]*n, aix[k1+2]*n, cix, n); break;
+			}
+			
+			//compute blocks of 4 rows (core inner loop)
+			for( int k = k1+bn; k<k2; k+=4 ) {
+				vectMultiplyAdd4( avals[k], avals[k+1], avals[k+2], avals[k+3], b, c, 
+					aix[k]*n, aix[k+1]*n, aix[k+2]*n, aix[k+3]*n, cix, n );
+			}
+		}
+	}
+	
+	private static void matrixMultSparseDenseMMSkinnyRHS(SparseBlock a, double[] b, double[] c, int n, int rl, int ru) 
+		throws DMLRuntimeException
+	{	
+		//no blocking since b and c fit into cache anyway
+		for( int i=rl, cix=rl*n; i<ru; i++, cix+=n ) {
+			if( a.isEmpty(i) ) continue;
+			int apos = a.pos(i);
+			int alen = a.size(i);
+			int[] aix = a.indexes(i);
+			double[] avals = a.values(i);
+			//rest not aligned to blocks of 4 rows
+			int bn = alen%4;
+			for( int k=apos; k<apos+bn; k++ )
+				vectMultiplyAdd(avals[k], b, c, aix[k]*n, cix, n);
+			//compute blocks of 4 rows (core inner loop)
+			for( int k=apos+bn; k<apos+alen; k+=4 )
+				vectMultiplyAdd4( avals[k], avals[k+1], avals[k+2], avals[k+3], b, c,
+					aix[k]*n, aix[k+1]*n, aix[k+2]*n, aix[k+3]*n, cix, n );
+		}
+	}
+	
+	private static void matrixMultSparseDenseMM(SparseBlock a, double[] b, double[] c, int n, int cd, long xsp, int rl, int ru) 
+		throws DMLRuntimeException
+	{	
+		//blocksizes to fit blocks of B (dense) and several rows of A/C in common L2 cache size, 
+		//while blocking A/C for L1/L2 yet allowing long scans (2 pages) in the inner loop over j
+		//in case of almost ultra-sparse matrices, we cannot ensure the blocking for the rhs and
+		//output - however, in this case it's unlikely that we consume every cache line in the rhs
+		final int blocksizeI = (int) (8L*xsp);
+		final int blocksizeK = (int) (8L*xsp);
+		final int blocksizeJ = 1024; 
+		
+		//temporary array of current sparse positions
+		int[] curk = new int[blocksizeI];
+		
+		//blocked execution over IKJ 
+		for( int bi = rl; bi < ru; bi+=blocksizeI ) {
+			Arrays.fill(curk, 0); //reset positions
+			for( int bk = 0, bimin = Math.min(ru, bi+blocksizeI); bk < cd; bk+=blocksizeK ) {
+				for( int bj = 0, bkmin = Math.min(cd, bk+blocksizeK); bj < n; bj+=blocksizeJ ) {
+					int bjlen = Math.min(n, bj+blocksizeJ)-bj;
 					
-					for(int k = apos; k < apos+alen; k++) {
-						double val = avals[k];
-						for(int j = 0, bix=aix[k]*n; j < n; j++)
-							c[cix+j] += val * b[bix+j];
+					//core sub block matrix multiplication
+					for( int i=bi, cix=bi*n+bj; i<bimin; i++, cix+=n ) {
+						if( !a.isEmpty(i) ) {
+							int apos = a.pos(i);
+							int alen = a.size(i);
+							int[] aix = a.indexes(i);
+							double[] avals = a.values(i);
+							
+							int k = curk[i-bi] + apos;
+							//rest not aligned to blocks of 4 rows
+							int bn = alen%4;
+							for( ; k<apos+bn && aix[k]<bkmin; k++ )
+								vectMultiplyAdd(avals[k], b, c, aix[k]*n+bj, cix, bjlen); 
+							//compute blocks of 4 rows (core inner loop), allowed to exceed bkmin
+							for( ; k<apos+alen && aix[k]<bkmin; k+=4 )
+								vectMultiplyAdd4( avals[k], avals[k+1], avals[k+2], avals[k+3], b, c, 
+									aix[k]*n+bj, aix[k+1]*n+bj, aix[k+2]*n+bj, aix[k+3]*n+bj, cix, bjlen );
+							//update positions on last bj block
+							if( bj+bjlen==n )
+								curk[i-bi] = k - apos;
+						}
 					}
 				}
 			}
@@ -1408,8 +1445,8 @@ public class LibMatrixMult
 								int[] aix = a.indexes(i);
 								double[] avals = a.values(i);	
 								
-								int k = curk[i-bi] + apos;									
-				    			for(; k < apos+alen && aix[k]<bkmin; k++) {
+								int k = curk[i-bi] + apos;
+								for(; k < apos+alen && aix[k]<bkmin; k++) {
 									if( !b.isEmpty(aix[k]) )
 										vectMultiplyAdd(avals[k], b.values(aix[k]), c, 
 											b.indexes(aix[k]), b.pos(aix[k]), cix, b.size(aix[k]));
@@ -1423,28 +1460,22 @@ public class LibMatrixMult
 		}
 		else
 		{
-			for( int i=rl, cix=rl*n; i<Math.min(ru, a.numRows()); i++, cix+=n )
-			{
-				if( !a.isEmpty(i) ) 
-				{
-					int apos = a.pos(i);
-					int alen = a.size(i);
-					int[] aix = a.indexes(i);
-					double[] avals = a.values(i);					
-					
-					for(int k = apos; k < apos+alen; k++) 
-					{
-						double val = avals[k];
-						if( !b.isEmpty(aix[k]) ) 
-						{
-							int bpos = b.pos(aix[k]);
-							int blen = b.size(aix[k]);
-							int[] bix = b.indexes(aix[k]);
-							double[] bvals = b.values(aix[k]);	
-							for(int j = bpos; j < bpos+blen; j++)
-								c[cix+bix[j]] += val * bvals[j];								
-						}
-					}						
+			for( int i=rl, cix=rl*n; i<Math.min(ru, a.numRows()); i++, cix+=n ) {
+				if( a.isEmpty(i) ) continue;
+				int apos = a.pos(i);
+				int alen = a.size(i);
+				int[] aix = a.indexes(i);
+				double[] avals = a.values(i);
+				
+				for(int k = apos; k < apos+alen; k++) {
+					if( b.isEmpty(aix[k]) ) continue;
+					double val = avals[k];
+					int bpos = b.pos(aix[k]);
+					int blen = b.size(aix[k]);
+					int[] bix = b.indexes(aix[k]);
+					double[] bvals = b.values(aix[k]);
+					for(int j = bpos; j < bpos+blen; j++)
+						c[cix+bix[j]] += val * bvals[j];
 				}
 			}
 		}