You are viewing a plain text version of this content. The canonical link for it is here.

Posted to commits@systemml.apache.org by mb...@apache.org on 2017/10/15 09:52:52 UTC

[1/3] systemml git commit: [SYSTEMML-1961] Performance dense max pooling (init, cases, nnz)

Repository: systemml
Updated Branches:
  refs/heads/master 98ee9b7d8 -> 33559144c


[SYSTEMML-1961] Performance dense max pooling (init, cases, nnz)

This patch makes a number of performance improvements to the existing
maxpooling builtin functions:

1) New special cases for (a) stride=1, pad=0, and (b) stride=1, pad=0,
P=1, Q=1, W=1, without materialization of start-end index arrays and
significantly simplified loop structures.

2) Thread local output initialization and nnz maintenance to reduce the
the serial fraction and improve temporal locality for multi-threaded
operations.

On a special case scenario of a 1K x 200*2048 input (~3.3GB) and
stride=1, pad=0, P=1, Q=1, and W=1, this patch improved performance from
253ms to 131ms, which is now at peak memory bandwidth. 


Project: http://git-wip-us.apache.org/repos/asf/systemml/repo
Commit: http://git-wip-us.apache.org/repos/asf/systemml/commit/17c5d5aa
Tree: http://git-wip-us.apache.org/repos/asf/systemml/tree/17c5d5aa
Diff: http://git-wip-us.apache.org/repos/asf/systemml/diff/17c5d5aa

Branch: refs/heads/master
Commit: 17c5d5aae9669049164f59fccf3820b1c822c83f
Parents: 98ee9b7
Author: Matthias Boehm <mb...@gmail.com>
Authored: Sat Oct 14 23:38:51 2017 -0700
Committer: Matthias Boehm <mb...@gmail.com>
Committed: Sat Oct 14 23:38:51 2017 -0700

----------------------------------------------------------------------
 .../cp/ConvolutionCPInstruction.java            |   3 -
 .../matrix/data/ConvolutionParameters.java      | 113 +++++++++++--------
 .../sysml/runtime/matrix/data/LibMatrixDNN.java |  33 +++---
 .../matrix/data/LibMatrixDNNPoolingHelper.java  |  88 ++++++++++-----
 .../integration/functions/tensor/PoolTest.java  | 110 ++++++++----------
 src/test/scripts/functions/tensor/PoolTest.R    |   7 +-
 src/test/scripts/functions/tensor/PoolTest.dml  |   3 +
 7 files changed, 193 insertions(+), 164 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/systemml/blob/17c5d5aa/src/main/java/org/apache/sysml/runtime/instructions/cp/ConvolutionCPInstruction.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/instructions/cp/ConvolutionCPInstruction.java b/src/main/java/org/apache/sysml/runtime/instructions/cp/ConvolutionCPInstruction.java
index f6ff998..2c7b972 100644
--- a/src/main/java/org/apache/sysml/runtime/instructions/cp/ConvolutionCPInstruction.java
+++ b/src/main/java/org/apache/sysml/runtime/instructions/cp/ConvolutionCPInstruction.java
@@ -20,7 +20,6 @@
 package org.apache.sysml.runtime.instructions.cp;
 
 import java.util.ArrayList;
-import java.util.Arrays;
 
 import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;
@@ -354,8 +353,6 @@ public class ConvolutionCPInstruction extends UnaryCPInstruction {
 			}
 			else {
 				outputBlock = new MatrixBlock(N, C*P*Q, false).allocateBlock();
-				if(instOpcode.equalsIgnoreCase("maxpooling"))
-					Arrays.fill(outputBlock.getDenseBlock(), -Double.MAX_VALUE);
 				LibMatrixDNN.maxpooling(matBlock, outputBlock, params);
 			}
 		}

http://git-wip-us.apache.org/repos/asf/systemml/blob/17c5d5aa/src/main/java/org/apache/sysml/runtime/matrix/data/ConvolutionParameters.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/matrix/data/ConvolutionParameters.java b/src/main/java/org/apache/sysml/runtime/matrix/data/ConvolutionParameters.java
index c66b5c6..d64a261 100644
--- a/src/main/java/org/apache/sysml/runtime/matrix/data/ConvolutionParameters.java
+++ b/src/main/java/org/apache/sysml/runtime/matrix/data/ConvolutionParameters.java
@@ -29,11 +29,13 @@ import org.apache.sysml.runtime.util.ConvolutionUtils;
  * This class is container that stores parameters required for executing following operations:
  * conv2d, conv2d_backward_data, conv2d_backward_filter, maxpooling, maxpooling_backward 
  */
-public class ConvolutionParameters implements Serializable {
+public class ConvolutionParameters implements Serializable 
+{
 	private static final long serialVersionUID = -212362627205772829L;
-	public int N; public int C; public int H; public int W;
-	public int K; public int R; public int S; public int stride_h; public int stride_w; public int pad_h; public int pad_w;
-	public int P; public int Q; public int numThreads;
+	
+	public int N, C, H, W, K, R, S, P, Q;
+	public int stride_h, stride_w, pad_h, pad_w;
+	public int numThreads;
 	
 	// Optional variables used by ConvolutionCPInstruction
 	public boolean enableNative = false;
@@ -43,52 +45,9 @@ public class ConvolutionParameters implements Serializable {
 	public MatrixBlock bias;
 	public int [] start_indexes_h, end_indexes_h, start_indexes_w, end_indexes_w; 
 	
-	private static int convertToInt(long val) throws DMLRuntimeException {
-		if( val > Integer.MAX_VALUE )
-			throw new DMLRuntimeException("The value for ConvolutionParameters is too large:" + val);
-		return (int) val;
-	}
-	
-	public boolean compare(ConvolutionParameters that) {
-		if(this.N == that.N && this.C == that.C && this.H == that.H && this.W == that.W
-				&& this.K == that.K && this.R == that.R && this.S == that.S && this.stride_h == that.stride_h
-				 && this.stride_w == that.stride_w  && this.pad_h == that.pad_h
-				  && this.pad_w == that.pad_w   && this.numThreads == that.numThreads) {
-			return true;
-		}
-		return false;
-	}
-	
-	@Override
-	public String toString() {
-		return "(NCHW=[" + N + " " + C + " " + H + " " + W + "], KCRS=[" + K + " " + R + " " + S + "], stride=[" + stride_h + "," + stride_w  + 
-				"], pad=[" + pad_h + "," + pad_w + "])";  
-	}
-	
-	public void setIfUnknown(Hop N, Hop C, Hop H, Hop W,
-			Hop K, Hop R, Hop S, Hop stride_h, Hop stride_w, Hop pad_h, Hop pad_w, int numThreads) throws DMLRuntimeException {
-		if(this.N < 0) this.N = convertToInt(Hop.computeSizeInformation(N));
-		if(this.C < 0) this.C = convertToInt(Hop.computeSizeInformation(C));
-		if(this.H < 0) this.H = convertToInt(Hop.computeSizeInformation(H));
-		if(this.W < 0) this.W = convertToInt(Hop.computeSizeInformation(W));
-		if(this.K < 0) this.K = convertToInt(Hop.computeSizeInformation(K));
-		if(this.R < 0) this.R = convertToInt(Hop.computeSizeInformation(R));
-		if(this.S < 0) this.S = convertToInt(Hop.computeSizeInformation(S));
-		if(this.stride_h < 0) this.stride_h = convertToInt(Hop.computeSizeInformation(stride_h));
-		if(this.stride_w < 0) this.stride_w = convertToInt(Hop.computeSizeInformation(stride_w));
-		if(this.pad_h < 0) this.pad_h = convertToInt(Hop.computeSizeInformation(pad_h));
-		if(this.pad_w < 0) this.pad_w = convertToInt(Hop.computeSizeInformation(pad_w));
-		if(this.P < 0 && this.H >= 0 && this.R >= 0 && this.stride_h >= 0 && this.pad_h >= 0) {
-			this.P = (int) ConvolutionUtils.getP(this.H, this.R, this.stride_h, this.pad_h);
-		}
-		if(this.Q < 0 && this.W >= 0 && this.S >= 0 && this.stride_w >= 0 && this.pad_w >= 0) {
-			this.Q = (int) ConvolutionUtils.getQ(this.W, this.S, this.stride_w, this.pad_w);
-		}
-		this.numThreads = numThreads;
-	}
-	
 	public ConvolutionParameters(long N, long C, long H, long W,
-			long K, long R, long S, long stride_h, long stride_w, long pad_h, long pad_w, int numThreads) throws DMLRuntimeException {
+			long K, long R, long S, long stride_h, long stride_w, 
+			long pad_h, long pad_w, int numThreads) throws DMLRuntimeException {
 		this.N = convertToInt(N);
 		this.C = convertToInt(C);
 		this.H = convertToInt(H);
@@ -139,7 +98,63 @@ public class ConvolutionParameters implements Serializable {
 		this.numThreads = numThreads;
 	}
 	
+	private static int convertToInt(long val) throws DMLRuntimeException {
+		if( val > Integer.MAX_VALUE )
+			throw new DMLRuntimeException("The value for ConvolutionParameters is too large:" + val);
+		return (int) val;
+	}
+	
+	public boolean compare(ConvolutionParameters that) {
+		if(this.N == that.N && this.C == that.C && this.H == that.H && this.W == that.W
+				&& this.K == that.K && this.R == that.R && this.S == that.S && this.stride_h == that.stride_h
+				 && this.stride_w == that.stride_w  && this.pad_h == that.pad_h
+				  && this.pad_w == that.pad_w   && this.numThreads == that.numThreads) {
+			return true;
+		}
+		return false;
+	}
+	
+	@Override
+	public String toString() {
+		return "(NCHW=[" + N + " " + C + " " + H + " " + W + "], KCRS=[" + K + " " + R + " " + S + "], stride=[" + stride_h + "," + stride_w  + 
+				"], pad=[" + pad_h + "," + pad_w + "])";  
+	}
+	
+	public void setIfUnknown(Hop N, Hop C, Hop H, Hop W,
+			Hop K, Hop R, Hop S, Hop stride_h, Hop stride_w, Hop pad_h, Hop pad_w, int numThreads) throws DMLRuntimeException {
+		if(this.N < 0) this.N = convertToInt(Hop.computeSizeInformation(N));
+		if(this.C < 0) this.C = convertToInt(Hop.computeSizeInformation(C));
+		if(this.H < 0) this.H = convertToInt(Hop.computeSizeInformation(H));
+		if(this.W < 0) this.W = convertToInt(Hop.computeSizeInformation(W));
+		if(this.K < 0) this.K = convertToInt(Hop.computeSizeInformation(K));
+		if(this.R < 0) this.R = convertToInt(Hop.computeSizeInformation(R));
+		if(this.S < 0) this.S = convertToInt(Hop.computeSizeInformation(S));
+		if(this.stride_h < 0) this.stride_h = convertToInt(Hop.computeSizeInformation(stride_h));
+		if(this.stride_w < 0) this.stride_w = convertToInt(Hop.computeSizeInformation(stride_w));
+		if(this.pad_h < 0) this.pad_h = convertToInt(Hop.computeSizeInformation(pad_h));
+		if(this.pad_w < 0) this.pad_w = convertToInt(Hop.computeSizeInformation(pad_w));
+		if(this.P < 0 && this.H >= 0 && this.R >= 0 && this.stride_h >= 0 && this.pad_h >= 0) {
+			this.P = (int) ConvolutionUtils.getP(this.H, this.R, this.stride_h, this.pad_h);
+		}
+		if(this.Q < 0 && this.W >= 0 && this.S >= 0 && this.stride_w >= 0 && this.pad_w >= 0) {
+			this.Q = (int) ConvolutionUtils.getQ(this.W, this.S, this.stride_w, this.pad_w);
+		}
+		this.numThreads = numThreads;
+	}
+	
 	public boolean isOutputThreadSafe() {
 		return output.isThreadSafe();
 	}
+	
+	public boolean isStride1Pad0() {
+		return (stride_h==1 && stride_w==1
+			&& pad_h==0 && pad_w==0);
+	}
+	
+	public boolean isAllOnes(Integer...params) {
+		boolean ret = true;
+		for(int param : params)
+			ret &= (param == 1);
+		return ret;
+	}
 }

http://git-wip-us.apache.org/repos/asf/systemml/blob/17c5d5aa/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNN.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNN.java b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNN.java
index 3f67b1a..b967780 100644
--- a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNN.java
+++ b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNN.java
@@ -356,19 +356,15 @@ public class LibMatrixDNN {
 		params.end_indexes_h = new int[params.P];
 		params.start_indexes_w = new int[params.Q];
 		params.end_indexes_w = new int[params.Q];
-		for (int p = 0; p < params.P; p++) {
-			int start_index_h = p * params.stride_h - params.pad_h;
-			int end_index_h = start_index_h + params.R;
+		for( int p=0, ix=-params.pad_h; p < params.P; p++, ix+=params.stride_h ) {
 			// Note: We do not treat pad as zero
-			params.start_indexes_h[p] = Math.max(start_index_h, 0);
-			params.end_indexes_h[p] = Math.min(end_index_h, params.H);
+			params.start_indexes_h[p] = Math.max(ix, 0);
+			params.end_indexes_h[p] = Math.min(ix+params.R, params.H);
 		}
-		for (int q = 0; q < params.Q; q++) {
-			int start_index_w =  q * params.stride_w - params.pad_w;
-			int end_index_w = start_index_w + params.S;
+		for( int q=0, ix=-params.pad_w; q < params.Q; q++, ix+=params.stride_w) {
 			// Note: We do not treat pad as zero
-			params.start_indexes_w[q] = Math.max(start_index_w, 0);
-			params.end_indexes_w[q] = Math.min(end_index_w, params.W);
+			params.start_indexes_w[q] = Math.max(ix, 0);
+			params.end_indexes_w[q] = Math.min(ix+params.S, params.W);
 		}
 	}
 	
@@ -528,21 +524,24 @@ public class LibMatrixDNN {
 		}
 	}
 	
-	public static void maxpooling(MatrixBlock input, MatrixBlock outputBlock, ConvolutionParameters params) throws DMLRuntimeException {
+	public static void maxpooling(MatrixBlock input, MatrixBlock output, ConvolutionParameters params) throws DMLRuntimeException {
 		params.input1 = input;
-		params.output = outputBlock;
+		params.output = output;
 		
 		if(input.getNumColumns() != params.C*params.H*params.W || input.getNumRows() != params.N) {
-			throw new DMLRuntimeException("Incorrect input dimensions in maxpooling:" + input.getNumRows() + " " + input.getNumColumns() + " " + params.N + " " + params.C*params.H*params.W);
+			throw new DMLRuntimeException("Incorrect input dimensions in maxpooling:" + input.getNumRows() + " " 
+				+ input.getNumColumns() + " " + params.N + " " + params.C*params.H*params.W);
 		}
 		
-		fillIndexesArray(params);
+		//materialize indexes unless basic case with stride=1 and pad=0
+		if( !params.isStride1Pad0() || input.sparse )
+			fillIndexesArray(params);
 		
-		execute(LibMatrixDNNHelper.getMaxPoolingWorkers(params), params);
+		long nnz = execute(LibMatrixDNNHelper.getMaxPoolingWorkers(params), params);
 		
 		// post-processing: maintain nnz
-		outputBlock.recomputeNonZeros(); 
-		outputBlock.examSparsity();
+		output.setNonZeros(nnz);
+		output.examSparsity();
 	}
 	
 	/**

http://git-wip-us.apache.org/repos/asf/systemml/blob/17c5d5aa/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNNPoolingHelper.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNNPoolingHelper.java b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNNPoolingHelper.java
index 19c3f71..52cdbcd 100644
--- a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNNPoolingHelper.java
+++ b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNNPoolingHelper.java
@@ -31,43 +31,58 @@ public class LibMatrixDNNPoolingHelper {
 	 */
 	public static class DenseMaxPooling implements Callable<Long> 
 	{
-		public int _rl; public int _ru; 
+		private final int _rl, _ru; 
 		private final ConvolutionParameters _params;
-		double [] inputArray; double [] outputArray;
-		int C; int P; int Q; int W;
+		
 		public DenseMaxPooling(int rl, int ru, ConvolutionParameters params) {
 			_rl = rl; _ru = ru;
 			_params = params;
-			inputArray = params.input1.getDenseBlock();
-			outputArray = params.output.getDenseBlock();
-			C = params.C; P = params.P; Q = params.Q; W = params.W;
 		}
 		
 		@Override
 		public Long call() throws Exception {
+			final int C = _params.C, P = _params.P, Q = _params.Q;
+			final int R = _params.R, S = _params.S, H = _params.H, W = _params.W;
 			final int HW = _params.H*_params.W;
 			final int CHW = _params.C*_params.H*_params.W;
 			final int CPQ = C*P*Q;
-			for(int n = _rl; n < _ru; n++)  {
-				final int inOffset = n*CHW;
-				int out_index = n*CPQ;
-				for (int c = 0; c < C; c++) {
-					final int inOffset1 = inOffset + c*HW;
-					for (int p = 0; p < P; p++) {
-						for (int q = 0; q < Q; q++, out_index++) {
-							double tmp = outputArray[out_index];
-							for (int h = _params.start_indexes_h[p]; h < _params.end_indexes_h[p]; h++) {
-								int inputIndex = inOffset1 +  h*W + _params.start_indexes_w[q];
-								for (int w = _params.start_indexes_w[q]; w < _params.end_indexes_w[q]; w++, inputIndex++) {
-									tmp = Math.max(tmp, inputArray[inputIndex]);
-								}
-							}
-							outputArray[out_index] = tmp;
-						}
-					}
-				}
+			double[] in = _params.input1.getDenseBlock();
+			double[] out = _params.output.getDenseBlock();
+			
+			//thread-local initialization of output block 
+			if( !(_params.isStride1Pad0() && _params.isAllOnes(P, Q, W)) )
+				Arrays.fill(out, _rl*CPQ, _ru*CPQ, -Double.MAX_VALUE);
+			
+			if( _params.isStride1Pad0() && _params.isAllOnes(P, Q, W) ) { 
+				//quick-path w/o materialized index arrays and 
+				//simplified inner loops for P = 1, Q = 1, W = 1
+				int lenh = Math.min(R,H);
+				for(int i = _rl, oix=_rl*C; i < _ru; i++, oix+=C)
+					for (int c = 0, off=i*CHW; c < C; c++, off+=H)
+						out[oix+c] = max(-Double.MAX_VALUE, in, off, lenh);
+			}
+			else if( _params.isStride1Pad0() ) {
+				//quick-path w/o materialized index arrays 
+				for(int i = _rl; i < _ru; i++)
+					for (int c = 0, off=i*CHW, oix=i*CPQ; c < C; c++, off+=HW)
+						for (int p = 0; p < P; p++, oix+=Q)
+							for (int h = p; h < Math.min(p+R,H); h++)
+								for (int q = 0, off2=off+h*W; q < Q; q++)
+									out[oix+q] = max(out[oix+q], in, off2+q, Math.min(S,W-q));
+			}
+			else { //general case
+				int[] hl = _params.start_indexes_h, hu = _params.end_indexes_h;
+				int[] wl = _params.start_indexes_w, wu = _params.end_indexes_w;
+				for(int i = _rl; i < _ru; i++)
+					for (int c = 0, off=i*CHW, oix=i*CPQ; c < C; c++, off+=HW)
+						for (int p = 0; p < P; p++, oix+=Q)
+							for (int h = hl[p]; h < hu[p]; h++)
+								for (int q = 0, off2=off+h*W; q < Q; q++)
+									out[oix+q] = max(out[oix+q], in, off2+wl[q], wu[q]-wl[q]);
 			}
-			return 0L;
+			
+			//thread-local recomputation of non-zeros
+			return _params.output.recomputeNonZeros(_rl, _ru-1);
 		}
 	}
 	
@@ -76,24 +91,26 @@ public class LibMatrixDNNPoolingHelper {
 	 */
 	public static class SparseMaxPooling implements Callable<Long> 
 	{
-		public int _rl; public int _ru; 
+		private final int _rl, _ru; 
 		private final ConvolutionParameters _params;
-		final int HW;
-		double [] outputArray;
-		final int C; final int P; final int Q; final int W; final int H; final int CPQ; final int PQ;
+		private double [] outputArray;
+		private final int C, P, Q, W, H, CPQ, PQ;
+		
 		public SparseMaxPooling(int rl, int ru, ConvolutionParameters params) {
 			_rl = rl; _ru = ru;
 			_params = params;
 			outputArray = params.output.getDenseBlock();
 			C = params.C; P = params.P; Q = params.Q; H = params.H; 
 			W = params.W;
-			HW = _params.H*_params.W;
 			CPQ = C*P*Q;
 			PQ = P*Q;
 		}
 		
 		@Override
 		public Long call() throws Exception {
+			//thread-local initialization of output block 
+			Arrays.fill(outputArray, _rl *CPQ, _ru*CPQ, -Double.MAX_VALUE);
+			
 			for(int n = _rl; n < _ru; n++)  {
 				if( !_params.input1.sparseBlock.isEmpty(n) ) {
 					final int apos = _params.input1.sparseBlock.pos(n);
@@ -136,7 +153,16 @@ public class LibMatrixDNNPoolingHelper {
 					Arrays.fill(outputArray, n*CPQ, (n+1)*CPQ, 0);
 				}
 			}
-			return 0L;
+			
+			//thread-local recomputation of non-zeros
+			return _params.output.recomputeNonZeros(_rl, _ru-1);
 		}
 	}
+	
+	private static double max(final double aval, double[] b, final int bi, final int len) {
+		double ret = aval;
+		for( int i = bi; i < bi+len; i++ )
+			ret = Math.max(ret, b[i]);
+		return ret;
+	}
 }

http://git-wip-us.apache.org/repos/asf/systemml/blob/17c5d5aa/src/test/java/org/apache/sysml/test/integration/functions/tensor/PoolTest.java
----------------------------------------------------------------------
diff --git a/src/test/java/org/apache/sysml/test/integration/functions/tensor/PoolTest.java b/src/test/java/org/apache/sysml/test/integration/functions/tensor/PoolTest.java
index e1c84c5..1acb14c 100644
--- a/src/test/java/org/apache/sysml/test/integration/functions/tensor/PoolTest.java
+++ b/src/test/java/org/apache/sysml/test/integration/functions/tensor/PoolTest.java
@@ -51,146 +51,130 @@ public class PoolTest extends AutomatedTestBase
 	}
 	
 	@Test
-	public void testMaxPool2DDense2() 
-	{
+	public void testMaxPool2DDense2() {
 		int numImg = 2; int imgSize = 6; int numChannels = 1;  int stride = 1; int pad = 0; int poolSize1 = 2; int poolSize2 = 2;
 		runPoolTest(ExecType.CP, imgSize, numImg, numChannels, stride, pad, poolSize1, poolSize2, "max", false);
 	}
 	
-	
 	@Test
-	public void testMaxPool2DDense3() 
-	{
+	public void testMaxPool2DDense3() {
 		int numImg = 3; int imgSize = 7; int numChannels = 2;  int stride = 2; int pad = 0; int poolSize1 = 3; int poolSize2 = 3;
 		runPoolTest(ExecType.CP, imgSize, numImg, numChannels, stride, pad, poolSize1, poolSize2, "max", false);
 	}
 	
 	@Test
-	public void testMaxPool2DDense4() 
-	{
+	public void testMaxPool2DDense4() {
 		int numImg = 2; int imgSize = 4; int numChannels = 2;  int stride = 1; int pad = 0; int poolSize1 = 3; int poolSize2 = 3;
 		runPoolTest(ExecType.CP, imgSize, numImg, numChannels, stride, pad, poolSize1, poolSize2, "max", false);
 	}
 	
 	@Test
-	public void testMaxPool2DSparse1() 
-	{
+	public void testMaxPool2DDense5() {
+		int numImg = 2; int imgSize = 8; int numChannels = 4;  int stride = 1; int pad = 0; int poolSize1 = imgSize*imgSize; int poolSize2 = 1;
+		runPoolTest(ExecType.CP, imgSize, numImg, numChannels, stride, pad, poolSize1, poolSize2, "max2", false);
+	}
+	
+	@Test
+	public void testMaxPool2DSparse1() {
 		int numImg = 1; int imgSize = 6; int numChannels = 1;  int stride = 2; int pad = 0; int poolSize1 = 2; int poolSize2 = 2;
 		runPoolTest(ExecType.CP, imgSize, numImg, numChannels, stride, pad, poolSize1, poolSize2, "max", true);
 	}
 	
 	@Test
-	public void testMaxPool2DSparse2() 
-	{
+	public void testMaxPool2DSparse2() {
 		int numImg = 2; int imgSize = 6; int numChannels = 1;  int stride = 1; int pad = 0; int poolSize1 = 2; int poolSize2 = 2;
 		runPoolTest(ExecType.CP, imgSize, numImg, numChannels, stride, pad, poolSize1, poolSize2, "max", true);
 	}
 	
-	
 	@Test
-	public void testMaxPool2DSparse3() 
-	{
+	public void testMaxPool2DSparse3() {
 		int numImg = 3; int imgSize = 7; int numChannels = 2;  int stride = 2; int pad = 0; int poolSize1 = 3; int poolSize2 = 3;
 		runPoolTest(ExecType.CP, imgSize, numImg, numChannels, stride, pad, poolSize1, poolSize2, "max", true);
 	}
 	
 	@Test
-	public void testMaxPool2DSparse4() 
-	{
+	public void testMaxPool2DSparse4() {
 		int numImg = 2; int imgSize = 4; int numChannels = 2;  int stride = 1; int pad = 0; int poolSize1 = 3; int poolSize2 = 3;
 		runPoolTest(ExecType.CP, imgSize, numImg, numChannels, stride, pad, poolSize1, poolSize2, "max", true);
 	}
 	
+	@Test
+	public void testMaxPool2DSparse5() {
+		int numImg = 2; int imgSize = 32; int numChannels = 4;  int stride = 1; int pad = 0; int poolSize1 = imgSize*imgSize; int poolSize2 = 1;
+		runPoolTest(ExecType.CP, imgSize, numImg, numChannels, stride, pad, poolSize1, poolSize2, "max2", true);
+	}
+	
 	// ----------------------------------------
 	
 	@Test
-	public void testMaxPool2DDense1SP() 
-	{
+	public void testMaxPool2DDense1SP() {
 		int numImg = 1; int imgSize = 50; int numChannels = 1;  int stride = 2; int pad = 0; int poolSize1 = 2; int poolSize2 = 2;
 		runPoolTest(ExecType.SPARK, imgSize, numImg, numChannels, stride, pad, poolSize1, poolSize2, "max", false);
 	}
 	
 	@Test
-	public void testMaxPool2DDense2SP() 
-	{
+	public void testMaxPool2DDense2SP() {
 		int numImg = 2; int imgSize = 6; int numChannels = 1;  int stride = 1; int pad = 0; int poolSize1 = 2; int poolSize2 = 2;
 		runPoolTest(ExecType.SPARK, imgSize, numImg, numChannels, stride, pad, poolSize1, poolSize2, "max", false);
 	}
 	
-	
 	@Test
-	public void testMaxPool2DDense3SP() 
-	{
+	public void testMaxPool2DDense3SP() {
 		int numImg = 3; int imgSize = 7; int numChannels = 2;  int stride = 2; int pad = 0; int poolSize1 = 3; int poolSize2 = 3;
 		runPoolTest(ExecType.SPARK, imgSize, numImg, numChannels, stride, pad, poolSize1, poolSize2, "max", false);
 	}
 	
 	@Test
-	public void testMaxPool2DDense4SP() 
-	{
+	public void testMaxPool2DDense4SP() {
 		int numImg = 2; int imgSize = 4; int numChannels = 2;  int stride = 1; int pad = 0; int poolSize1 = 3; int poolSize2 = 3;
 		runPoolTest(ExecType.SPARK, imgSize, numImg, numChannels, stride, pad, poolSize1, poolSize2, "max", false);
 	}
 	
-	/**
-	 * 
-	 * @param et
-	 * @param sparse
-	 */
 	public void runPoolTest( ExecType et, int imgSize, int numImg, int numChannels, int stride, 
 			int pad, int poolSize1, int poolSize2, String poolMode, boolean sparse) 
 	{
-		RUNTIME_PLATFORM oldRTP = rtplatform;
-			
+		RUNTIME_PLATFORM platformOld = rtplatform;
+		switch( et ){
+			case MR: rtplatform = RUNTIME_PLATFORM.HADOOP; break;
+			case SPARK: rtplatform = RUNTIME_PLATFORM.SPARK; break;
+			default: rtplatform = RUNTIME_PLATFORM.HYBRID; break;
+		}
 		boolean sparkConfigOld = DMLScript.USE_LOCAL_SPARK_CONFIG;
+		if( rtplatform == RUNTIME_PLATFORM.SPARK )
+			DMLScript.USE_LOCAL_SPARK_CONFIG = true;
 		
 		try
 		{
-			String sparseVal = (""+sparse).toUpperCase();
-	    TestConfiguration config = getTestConfiguration(TEST_NAME);
-	    if(et == ExecType.SPARK) {
-	    	rtplatform = RUNTIME_PLATFORM.SPARK;
-	    }
-	    else {
-	    	rtplatform = (et==ExecType.MR)? RUNTIME_PLATFORM.HADOOP : RUNTIME_PLATFORM.SINGLE_NODE;
-	    }
-			if( rtplatform == RUNTIME_PLATFORM.SPARK )
-				DMLScript.USE_LOCAL_SPARK_CONFIG = true;
+			String sparseVal = String.valueOf(sparse).toUpperCase();
 			
+			TestConfiguration config = getTestConfiguration(TEST_NAME);
 			loadTestConfiguration(config);
-	        
-			/* This is for running the junit test the new way, i.e., construct the arguments directly */
+	
 			String RI_HOME = SCRIPT_DIR + TEST_DIR;
 			fullDMLScriptName = RI_HOME + TEST_NAME + ".dml";
-			
-			programArgs = new String[]{"-explain", "-args",  "" + imgSize, "" + numImg, 
-					"" + numChannels, "" + poolSize1, "" + poolSize2, 
-					"" + stride, "" + pad, poolMode, 
-					output("B"), sparseVal};
-			        
-			boolean exceptionExpected = false;
-			int expectedNumberOfJobs = -1;
-			runTest(true, exceptionExpected, null, expectedNumberOfJobs);
+			programArgs = new String[]{"-explain", "-args", String.valueOf(imgSize), 
+				String.valueOf(numImg), String.valueOf(numChannels),
+				String.valueOf(poolSize1), String.valueOf(poolSize2),
+				String.valueOf(stride), String.valueOf(pad), poolMode,
+				output("B"), sparseVal};
 			
 			fullRScriptName = RI_HOME + TEST_NAME + ".R";
 			rCmd = "Rscript" + " " + fullRScriptName + " " + imgSize + " " + numImg + 
-					" " + numChannels + " " + poolSize1 + 
-					" " + poolSize2 + " " + stride + " " + pad + " " + expectedDir() + " " + sparseVal; 
+				" " + numChannels + " " + poolSize1 + " " + poolSize2 + " " + stride + 
+				" " + pad + " " + expectedDir() + " " + sparseVal + " " + poolMode; 
 			
-			// Run comparison R script
+			// run scripts
+			runTest(true, false, null, -1);
 			runRScript(true);
-			HashMap<CellIndex, Double> bHM = readRMatrixFromFS("B");
 			
+			//compare results
+			HashMap<CellIndex, Double> bHM = readRMatrixFromFS("B");
 			HashMap<CellIndex, Double> dmlfile = readDMLMatrixFromHDFS("B");
 			TestUtils.compareMatrices(dmlfile, bHM, epsilon, "B-DML", "NumPy");
-			
 		}
-		finally
-		{
-			rtplatform = oldRTP;
+		finally {
+			rtplatform = platformOld;
 			DMLScript.USE_LOCAL_SPARK_CONFIG = sparkConfigOld;
 		}
 	}
-	
-	
 }

http://git-wip-us.apache.org/repos/asf/systemml/blob/17c5d5aa/src/test/scripts/functions/tensor/PoolTest.R
----------------------------------------------------------------------
diff --git a/src/test/scripts/functions/tensor/PoolTest.R b/src/test/scripts/functions/tensor/PoolTest.R
index aef0384..fdcba56 100644
--- a/src/test/scripts/functions/tensor/PoolTest.R
+++ b/src/test/scripts/functions/tensor/PoolTest.R
@@ -28,6 +28,7 @@ poolSize1=as.integer(args[4])
 poolSize2=as.integer(args[5])
 stride=as.integer(args[6])
 pad=as.integer(args[7])
+mode=args[10]
 
 # Assumption: NCHW image format
 x=matrix(seq(1, numImg*numChannels*imgSize*imgSize), numImg, numChannels*imgSize*imgSize, byrow=TRUE)
@@ -98,6 +99,10 @@ max_pool <- function(X, N, C, Hin, Win, Hf, Wf,
   out
 }
 
-output = max_pool(x, numImg, numChannels, imgSize, imgSize, poolSize1, poolSize2, stride, stride)
+if( mode=="max" ) {
+  output = max_pool(x, numImg, numChannels, imgSize, imgSize, poolSize1, poolSize2, stride, stride)
+} else {
+  output = max_pool(x, numImg, numChannels, imgSize*imgSize, 1, poolSize1, poolSize2, stride, stride)
+}
 
 writeMM(as(output,"CsparseMatrix"), paste(args[8], "B", sep=""))
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/systemml/blob/17c5d5aa/src/test/scripts/functions/tensor/PoolTest.dml
----------------------------------------------------------------------
diff --git a/src/test/scripts/functions/tensor/PoolTest.dml b/src/test/scripts/functions/tensor/PoolTest.dml
index 5246a2d..5f20fae 100644
--- a/src/test/scripts/functions/tensor/PoolTest.dml
+++ b/src/test/scripts/functions/tensor/PoolTest.dml
@@ -39,6 +39,9 @@ else {
 if(poolMode == "max") {
 	output = max_pool(x, stride=[stride, stride], padding=[pad, pad], input_shape=[numImg, numChannels, imgSize, imgSize], pool_size=[poolSize1, poolSize2])
 }
+else if(poolMode == "max2") {
+	output = max_pool(x, stride=[stride, stride], padding=[pad, pad], input_shape=[numImg, numChannels, imgSize*imgSize, 1], pool_size=[poolSize1, poolSize2])
+}
 #else {
 	#output = avg_pool(x, stride=[stride, stride], padding=[pad, pad], input_shape=[numImg, numChannels, imgSize, imgSize], pool_size=[poolSize1, poolSize2])
 #}

[3/3] systemml git commit: [HOTFIX][SYSTEMML-1959] Fix sparse-sparse transpose w/ CSR input

Posted by mb...@apache.org.

[HOTFIX][SYSTEMML-1959] Fix sparse-sparse transpose w/ CSR input

This patch fixes a remaining issue of sparse-sparse transpose related to
the correct handling of sparse blocks in CSR or COO format.

Project: http://git-wip-us.apache.org/repos/asf/systemml/repo
Commit: http://git-wip-us.apache.org/repos/asf/systemml/commit/33559144
Tree: http://git-wip-us.apache.org/repos/asf/systemml/tree/33559144
Diff: http://git-wip-us.apache.org/repos/asf/systemml/diff/33559144

Branch: refs/heads/master
Commit: 33559144cd707e324b59ed5ca417e3d5461c2f0a
Parents: a347af3
Author: Matthias Boehm <mb...@gmail.com>
Authored: Sun Oct 15 02:42:20 2017 -0700
Committer: Matthias Boehm <mb...@gmail.com>
Committed: Sun Oct 15 02:42:20 2017 -0700

----------------------------------------------------------------------
 .../sysml/runtime/matrix/data/LibMatrixReorg.java   | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/systemml/blob/33559144/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixReorg.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixReorg.java b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixReorg.java
index 3ae07c5..dd86c27 100644
--- a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixReorg.java
+++ b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixReorg.java
@@ -859,8 +859,8 @@ public class LibMatrixReorg
 			if( cl > 0 ) {
 				for( int i=bi; i<bimin; i++ )
 					if( !a.isEmpty(i) ) {
-						int pos = a.posFIndexGTE(i, cl);
-						ix[i-bi] = (pos>=0) ? pos : a.size(i);
+						int j = a.posFIndexGTE(i, cl);
+						ix[i-bi] = (j>=0) ? j : a.size(i);
 					}
 			}
 			
@@ -868,19 +868,19 @@ public class LibMatrixReorg
 				int bjmin = Math.min(bj+blocksizeJ, cu);
 
 				//core block transpose operation
-				for( int i=bi, iix=0; i<bimin; i++, iix++ ) {
+				for( int i=bi; i<bimin; i++ ) {
 					if( a.isEmpty(i) ) continue;
 					
 					int apos = a.pos(i);
 					int alen = a.size(i);
 					int[] aix = a.indexes(i);
 					double[] avals = a.values(i);
-					int j = ix[iix]; //last block boundary
-					for( ; j<alen && aix[j]<bjmin; j++ ) {
-						c.allocate(aix[apos+j], ennz2,n2);
-						c.append(aix[apos+j], i, avals[apos+j]);
+					int j = ix[i-bi] + apos; //last block boundary
+					for( ; j<apos+alen && aix[j]<bjmin; j++ ) {
+						c.allocate(aix[j], ennz2,n2);
+						c.append(aix[j], i, avals[j]);
 					}
-					ix[iix] = j; //keep block boundary
+					ix[i-bi] = j - apos; //keep block boundary
 				}
 			}
 		}

[2/3] systemml git commit: [MINOR] Refactoring lib matrixmult/bincell (instruction footprint)

Posted by mb...@apache.org.

[MINOR] Refactoring lib matrixmult/bincell (instruction footprint)

This patch makes a minor refactoring of the libraries for matrix
multiplications and binary cell-wise operations in order to reduce the
instruction footprint and simplify JIT compilation. Specifically, the
methods for dense-dense mm, sparse-dense mm, and safe binary operations
have been split into methods for the major individual cases.

On an end-to-end cnn application, this patch reduced the number of
L1-icache misses from 6,055,257,066 to 5,663,272,573 and the number of
iTLB misses from 289,601,812 to 161,268,707.


Project: http://git-wip-us.apache.org/repos/asf/systemml/repo
Commit: http://git-wip-us.apache.org/repos/asf/systemml/commit/a347af3b
Tree: http://git-wip-us.apache.org/repos/asf/systemml/tree/a347af3b
Diff: http://git-wip-us.apache.org/repos/asf/systemml/diff/a347af3b

Branch: refs/heads/master
Commit: a347af3b7b488ac3b296b9b9692f7172d60ac6f5
Parents: 17c5d5a
Author: Matthias Boehm <mb...@gmail.com>
Authored: Sun Oct 15 02:22:55 2017 -0700
Committer: Matthias Boehm <mb...@gmail.com>
Committed: Sun Oct 15 02:22:55 2017 -0700

----------------------------------------------------------------------
 .../runtime/matrix/data/LibMatrixBincell.java   | 446 +++++++------
 .../runtime/matrix/data/LibMatrixMult.java      | 665 ++++++++++---------
 2 files changed, 580 insertions(+), 531 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/systemml/blob/a347af3b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixBincell.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixBincell.java b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixBincell.java
index 7622137..2878b3b 100644
--- a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixBincell.java
+++ b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixBincell.java
@@ -210,12 +210,9 @@ public class LibMatrixBincell
 		{
 			return;
 		}
-	
-		int rlen = m1.rlen;
-		int clen = m1.clen;
-		BinaryAccessType atype = getBinaryAccessType(m1, m2);
 		
-		if(    atype == BinaryAccessType.MATRIX_COL_VECTOR //MATRIX - VECTOR
+		BinaryAccessType atype = getBinaryAccessType(m1, m2);
+		if( atype == BinaryAccessType.MATRIX_COL_VECTOR //MATRIX - VECTOR
 			|| atype == BinaryAccessType.MATRIX_ROW_VECTOR)  
 		{
 			//note: m2 vector and hence always dense
@@ -232,213 +229,24 @@ public class LibMatrixBincell
 		}
 		else //MATRIX - MATRIX
 		{
-			if(m1.sparse && m2.sparse)
-			{
-				if(ret.sparse)
-					ret.allocateSparseRowsBlock();	
-				
-				//both sparse blocks existing
-				if(m1.sparseBlock!=null && m2.sparseBlock!=null)
-				{
-					SparseBlock lsblock = m1.sparseBlock;
-					SparseBlock rsblock = m2.sparseBlock;
-					
-					if( ret.sparse && lsblock.isAligned(rsblock) )
-					{
-						SparseBlock c = ret.sparseBlock;
-						for(int r=0; r<rlen; r++) 
-							if( !lsblock.isEmpty(r) ) {
-								int alen = lsblock.size(r);
-								int apos = lsblock.pos(r);
-								int[] aix = lsblock.indexes(r);
-								double[] avals = lsblock.values(r);
-								double[] bvals = rsblock.values(r);
-								c.allocate(r, alen);
-								for( int j=apos; j<apos+alen; j++ ) {
-									double tmp = op.fn.execute(avals[j], bvals[j]);
-									c.append(r, aix[j], tmp);
-								}
-								ret.nonZeros += c.size(r);
-							}
-					}
-					else //general case
-					{	
-						for(int r=0; r<rlen; r++)
-						{
-							if( !lsblock.isEmpty(r) && !rsblock.isEmpty(r) ) {
-								mergeForSparseBinary(op, lsblock.values(r), lsblock.indexes(r), lsblock.pos(r), lsblock.size(r),
-										rsblock.values(r), rsblock.indexes(r), rsblock.pos(r), rsblock.size(r), r, ret);	
-							}
-							else if( !rsblock.isEmpty(r) ) {
-								appendRightForSparseBinary(op, rsblock.values(r), rsblock.indexes(r), 
-										rsblock.pos(r), rsblock.size(r), 0, r, ret);
-							}
-							else if( !lsblock.isEmpty(r) ){
-								appendLeftForSparseBinary(op, lsblock.values(r), lsblock.indexes(r), 
-										lsblock.pos(r), lsblock.size(r), 0, r, ret);
-							}
-							// do nothing if both not existing
-						}
-					}
-				}
-				//right sparse block existing
-				else if( m2.sparseBlock!=null )
-				{
-					SparseBlock rsblock = m2.sparseBlock;
-					
-					for(int r=0; r<Math.min(rlen, rsblock.numRows()); r++)
-						if( !rsblock.isEmpty(r) )
-						{
-							appendRightForSparseBinary(op, rsblock.values(r), rsblock.indexes(r), 
-									rsblock.pos(r), rsblock.size(r), 0, r, ret);
-						}
-				}
-				//left sparse block existing
-				else
-				{
-					SparseBlock lsblock = m1.sparseBlock;
-					
-					for(int r=0; r<rlen; r++)
-						if( !lsblock.isEmpty(r) )
-						{
-							appendLeftForSparseBinary(op, lsblock.values(r), lsblock.indexes(r), 
-									lsblock.pos(r), lsblock.size(r), 0, r, ret);
-						}
-				}
+			if(m1.sparse && m2.sparse) {
+				safeBinaryMMSparseSparse(m1, m2, ret, op);
 			}
 			else if( !ret.sparse && (m1.sparse || m2.sparse) &&
-					(op.fn instanceof Plus || op.fn instanceof Minus ||
-					op.fn instanceof PlusMultiply || op.fn instanceof MinusMultiply ||
-					(op.fn instanceof Multiply && !m2.sparse )))
-			{
-				//specific case in order to prevent binary search on sparse inputs (see quickget and quickset)
-				ret.allocateDenseBlock();
-				final int m = ret.rlen;
-				final int n = ret.clen;
-				double[] c = ret.denseBlock;
-				
-				//1) process left input: assignment
-				
-				if( m1.sparse ) //SPARSE left
-				{
-					if( m1.sparseBlock != null )
-					{
-						SparseBlock a = m1.sparseBlock;
-						
-						for( int i=0, ix=0; i<m; i++, ix+=n ) {
-							if( !a.isEmpty(i) )
-							{
-								int apos = a.pos(i);
-								int alen = a.size(i);
-								int[] aix = a.indexes(i);
-								double[] avals = a.values(i);
-								for(int k = apos; k < apos+alen; k++) 
-									c[ix+aix[k]] = avals[k];
-							}
-						}
-					}
-				}
-				else //DENSE left
-				{
-					if( !m1.isEmptyBlock(false) ) 
-						System.arraycopy(m1.denseBlock, 0, c, 0, m*n);
-					else
-						Arrays.fill(ret.denseBlock, 0, m*n, 0); 
-				}
-				
-				//2) process right input: op.fn (+,-,*), * only if dense
-				long lnnz = 0;
-				if( m2.sparse ) //SPARSE right
-				{				
-					if(m2.sparseBlock!=null)
-					{
-						SparseBlock a = m2.sparseBlock;
-						
-						for( int i=0, ix=0; i<m; i++, ix+=n ) {
-							if( !a.isEmpty(i) ) {
-								int apos = a.pos(i);
-								int alen = a.size(i);
-								int[] aix = a.indexes(i);
-								double[] avals = a.values(i);
-								for(int k = apos; k < apos+alen; k++) 
-									c[ix+aix[k]] = op.fn.execute(c[ix+aix[k]], avals[k]);
-							}
-							//exploit temporal locality of rows
-							lnnz += ret.recomputeNonZeros(i, i, 0, clen-1);
-						}
-					}
-				}
-				else //DENSE right
-				{
-					if( !m2.isEmptyBlock(false) ) {
-						double[] a = m2.denseBlock;
-						for( int i=0; i<m*n; i++ ) {
-							c[i] = op.fn.execute(c[i], a[i]);
-							lnnz += (c[i]!=0) ? 1 : 0;
-						}
-					}
-					else if(op.fn instanceof Multiply)
-						Arrays.fill(ret.denseBlock, 0, m*n, 0); 
-				}
-				
-				//3) recompute nnz
-				ret.setNonZeros(lnnz);
+				(op.fn instanceof Plus || op.fn instanceof Minus ||
+				op.fn instanceof PlusMultiply || op.fn instanceof MinusMultiply ||
+				(op.fn instanceof Multiply && !m2.sparse ))) {
+				safeBinaryMMSparseDenseDense(m1, m2, ret, op);
 			}
 			else if( !ret.sparse && !m1.sparse && !m2.sparse 
-					&& m1.denseBlock!=null && m2.denseBlock!=null )
-			{
-				ret.allocateDenseBlock();
-				final int m = ret.rlen;
-				final int n = ret.clen;
-				double[] a = m1.denseBlock;
-				double[] b = m2.denseBlock;
-				double[] c = ret.denseBlock;
-				ValueFunction fn = op.fn;
-				
-				//compute dense-dense binary, maintain nnz on-the-fly
-				int lnnz = 0;
-				for( int i=0; i<m*n; i++ ) {
-					c[i] = fn.execute(a[i], b[i]);
-					lnnz += (c[i]!=0)? 1 : 0;
-				}
-				ret.setNonZeros(lnnz);
+				&& m1.denseBlock!=null && m2.denseBlock!=null ) {
+				safeBinaryMMDenseDenseDense(m1, m2, ret, op);
 			}
-			else if( skipEmpty && (m1.sparse || m2.sparse) ) 
-			{
-				SparseBlock a = m1.sparse ? m1.sparseBlock : m2.sparseBlock;
-				if( a == null )
-					return;
-				
-				//prepare second input and allocate output
-				MatrixBlock b = m1.sparse ? m2 : m1;
-				ret.allocateBlock();
-				
-				for( int i=0; i<a.numRows(); i++ ) {
-					if( a.isEmpty(i) ) continue;
-					int apos = a.pos(i);
-					int alen = a.size(i);
-					int[] aix = a.indexes(i);
-					double[] avals = a.values(i);
-					if( ret.sparse && !b.sparse )
-						ret.sparseBlock.allocate(i, alen);
-					for(int k = apos; k < apos+alen; k++) {
-						double in2 = b.quickGetValue(i, aix[k]);
-						if( in2==0 ) continue;
-						double val = op.fn.execute(avals[k], in2);
-						ret.appendValue(i, aix[k], val);
-					}
-				}
+			else if( skipEmpty && (m1.sparse || m2.sparse) ) {
+				safeBinaryMMSparseDenseSkip(m1, m2, ret, op);
 			}
-			else //generic case
-			{
-				for(int r=0; r<rlen; r++)
-					for(int c=0; c<clen; c++) {
-						double in1 = m1.quickGetValue(r, c);
-						double in2 = m2.quickGetValue(r, c);
-						if( in1==0 && in2==0) continue;
-						double val = op.fn.execute(in1, in2);
-						ret.appendValue(r, c, val);
-					}
+			else { //generic case
+				safeBinaryMMGeneric(m1, m2, ret, op);
 			}
 		}
 	}
@@ -694,7 +502,7 @@ public class LibMatrixBincell
 						for( int j=0; j<blen; j++ ) {
 							double v1 = m1.quickGetValue(i, bix[j]);
 							double v = op.fn.execute( v1, bvals[j] );
-							ret.appendValue(i, bix[j], v);					
+							ret.appendValue(i, bix[j], v);
 						}
 					}
 				}
@@ -731,19 +539,231 @@ public class LibMatrixBincell
 		}
 		else {
 			for(int r=0; r<rlen; r++) {
-				double v1 = m1.quickGetValue(r, 0);		
+				double v1 = m1.quickGetValue(r, 0);
 				for(int c=0; c<clen; c++)
 				{
 					double v2 = m2.quickGetValue(0, c);
 					double v = op.fn.execute( v1, v2 );
-					ret.appendValue(r, c, v);	
+					ret.appendValue(r, c, v);
 				}
-			}	
-		}	
-			
+			}
+		}
+		
 		//no need to recomputeNonZeros since maintained in append value
 	}
 	
+	private static void safeBinaryMMSparseSparse(MatrixBlock m1, MatrixBlock m2, MatrixBlock ret, BinaryOperator op) 
+		throws DMLRuntimeException 
+	{
+		final int rlen = m1.rlen;
+		if(ret.sparse)
+			ret.allocateSparseRowsBlock();
+		
+		//both sparse blocks existing
+		if(m1.sparseBlock!=null && m2.sparseBlock!=null)
+		{
+			SparseBlock lsblock = m1.sparseBlock;
+			SparseBlock rsblock = m2.sparseBlock;
+			
+			if( ret.sparse && lsblock.isAligned(rsblock) )
+			{
+				SparseBlock c = ret.sparseBlock;
+				for(int r=0; r<rlen; r++) 
+					if( !lsblock.isEmpty(r) ) {
+						int alen = lsblock.size(r);
+						int apos = lsblock.pos(r);
+						int[] aix = lsblock.indexes(r);
+						double[] avals = lsblock.values(r);
+						double[] bvals = rsblock.values(r);
+						c.allocate(r, alen);
+						for( int j=apos; j<apos+alen; j++ ) {
+							double tmp = op.fn.execute(avals[j], bvals[j]);
+							c.append(r, aix[j], tmp);
+						}
+						ret.nonZeros += c.size(r);
+					}
+			}
+			else //general case
+			{
+				for(int r=0; r<rlen; r++) {
+					if( !lsblock.isEmpty(r) && !rsblock.isEmpty(r) ) {
+						mergeForSparseBinary(op, lsblock.values(r), lsblock.indexes(r), lsblock.pos(r), lsblock.size(r),
+							rsblock.values(r), rsblock.indexes(r), rsblock.pos(r), rsblock.size(r), r, ret);
+					}
+					else if( !rsblock.isEmpty(r) ) {
+						appendRightForSparseBinary(op, rsblock.values(r), rsblock.indexes(r), 
+							rsblock.pos(r), rsblock.size(r), 0, r, ret);
+					}
+					else if( !lsblock.isEmpty(r) ){
+						appendLeftForSparseBinary(op, lsblock.values(r), lsblock.indexes(r), 
+							lsblock.pos(r), lsblock.size(r), 0, r, ret);
+					}
+					// do nothing if both not existing
+				}
+			}
+		}
+		//right sparse block existing
+		else if( m2.sparseBlock!=null )
+		{
+			SparseBlock rsblock = m2.sparseBlock;
+			for(int r=0; r<Math.min(rlen, rsblock.numRows()); r++) {
+				if( rsblock.isEmpty(r) ) continue;
+				appendRightForSparseBinary(op, rsblock.values(r), rsblock.indexes(r), 
+					rsblock.pos(r), rsblock.size(r), 0, r, ret);
+			}
+		}
+		//left sparse block existing
+		else
+		{
+			SparseBlock lsblock = m1.sparseBlock;
+			for(int r=0; r<rlen; r++) {
+				if( lsblock.isEmpty(r) ) continue;
+				appendLeftForSparseBinary(op, lsblock.values(r), lsblock.indexes(r), 
+					lsblock.pos(r), lsblock.size(r), 0, r, ret);
+			}
+		}
+	}
+	
+	private static void safeBinaryMMSparseDenseDense(MatrixBlock m1, MatrixBlock m2, MatrixBlock ret, BinaryOperator op) 
+		throws DMLRuntimeException 
+	{
+		//specific case in order to prevent binary search on sparse inputs (see quickget and quickset)
+		ret.allocateDenseBlock();
+		final int m = ret.rlen;
+		final int n = ret.clen;
+		double[] c = ret.denseBlock;
+		
+		//1) process left input: assignment
+		
+		if( m1.sparse ) //SPARSE left
+		{
+			if( m1.sparseBlock != null )
+			{
+				SparseBlock a = m1.sparseBlock;
+				
+				for( int i=0, ix=0; i<m; i++, ix+=n ) {
+					if( !a.isEmpty(i) )
+					{
+						int apos = a.pos(i);
+						int alen = a.size(i);
+						int[] aix = a.indexes(i);
+						double[] avals = a.values(i);
+						for(int k = apos; k < apos+alen; k++) 
+							c[ix+aix[k]] = avals[k];
+					}
+				}
+			}
+		}
+		else //DENSE left
+		{
+			if( !m1.isEmptyBlock(false) ) 
+				System.arraycopy(m1.denseBlock, 0, c, 0, m*n);
+			else
+				Arrays.fill(ret.denseBlock, 0, m*n, 0); 
+		}
+		
+		//2) process right input: op.fn (+,-,*), * only if dense
+		long lnnz = 0;
+		if( m2.sparse ) //SPARSE right
+		{
+			if(m2.sparseBlock!=null)
+			{
+				SparseBlock a = m2.sparseBlock;
+				
+				for( int i=0, ix=0; i<m; i++, ix+=n ) {
+					if( !a.isEmpty(i) ) {
+						int apos = a.pos(i);
+						int alen = a.size(i);
+						int[] aix = a.indexes(i);
+						double[] avals = a.values(i);
+						for(int k = apos; k < apos+alen; k++) 
+							c[ix+aix[k]] = op.fn.execute(c[ix+aix[k]], avals[k]);
+					}
+					//exploit temporal locality of rows
+					lnnz += ret.recomputeNonZeros(i, i, 0, n-1);
+				}
+			}
+		}
+		else //DENSE right
+		{
+			if( !m2.isEmptyBlock(false) ) {
+				double[] a = m2.denseBlock;
+				for( int i=0; i<m*n; i++ ) {
+					c[i] = op.fn.execute(c[i], a[i]);
+					lnnz += (c[i]!=0) ? 1 : 0;
+				}
+			}
+			else if(op.fn instanceof Multiply)
+				Arrays.fill(ret.denseBlock, 0, m*n, 0); 
+		}
+		
+		//3) recompute nnz
+		ret.setNonZeros(lnnz);
+	}
+	
+	private static void safeBinaryMMDenseDenseDense(MatrixBlock m1, MatrixBlock m2, MatrixBlock ret, BinaryOperator op) 
+		throws DMLRuntimeException 
+	{
+		ret.allocateDenseBlock();
+		final int m = ret.rlen;
+		final int n = ret.clen;
+		double[] a = m1.denseBlock;
+		double[] b = m2.denseBlock;
+		double[] c = ret.denseBlock;
+		ValueFunction fn = op.fn;
+		
+		//compute dense-dense binary, maintain nnz on-the-fly
+		int lnnz = 0;
+		for( int i=0; i<m*n; i++ ) {
+			c[i] = fn.execute(a[i], b[i]);
+			lnnz += (c[i]!=0)? 1 : 0;
+		}
+		ret.setNonZeros(lnnz);
+	}
+	
+	private static void safeBinaryMMSparseDenseSkip(MatrixBlock m1, MatrixBlock m2, MatrixBlock ret, BinaryOperator op) 
+		throws DMLRuntimeException 
+	{
+		SparseBlock a = m1.sparse ? m1.sparseBlock : m2.sparseBlock;
+		if( a == null )
+			return;
+		
+		//prepare second input and allocate output
+		MatrixBlock b = m1.sparse ? m2 : m1;
+		ret.allocateBlock();
+		
+		for( int i=0; i<a.numRows(); i++ ) {
+			if( a.isEmpty(i) ) continue;
+			int apos = a.pos(i);
+			int alen = a.size(i);
+			int[] aix = a.indexes(i);
+			double[] avals = a.values(i);
+			if( ret.sparse && !b.sparse )
+				ret.sparseBlock.allocate(i, alen);
+			for(int k = apos; k < apos+alen; k++) {
+				double in2 = b.quickGetValue(i, aix[k]);
+				if( in2==0 ) continue;
+				double val = op.fn.execute(avals[k], in2);
+				ret.appendValue(i, aix[k], val);
+			}
+		}
+	}
+	
+	private static void safeBinaryMMGeneric(MatrixBlock m1, MatrixBlock m2, MatrixBlock ret, BinaryOperator op) 
+		throws DMLRuntimeException 
+	{
+		int rlen = m1.rlen;
+		int clen = m2.clen;
+		for(int r=0; r<rlen; r++)
+			for(int c=0; c<clen; c++) {
+				double in1 = m1.quickGetValue(r, c);
+				double in2 = m2.quickGetValue(r, c);
+				if( in1==0 && in2==0) continue;
+				double val = op.fn.execute(in1, in2);
+				ret.appendValue(r, c, val);
+			}
+	}
+	
 	/**
 	 * 
 	 * This will do cell wise operation for &lt;, &lt;=, &gt;, &gt;=, == and != operators.
@@ -1254,6 +1274,4 @@ public class LibMatrixBincell
 			result.appendValue(resultRow, cols2[j], v);
 		}
 	}
-	
 }
-

http://git-wip-us.apache.org/repos/asf/systemml/blob/a347af3b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixMult.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixMult.java b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixMult.java
index fee73c5..eca26f6 100644
--- a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixMult.java
+++ b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixMult.java
@@ -632,7 +632,7 @@ public class LibMatrixMult
 		ret.allocateBlock();
 		
 		try 
-		{			
+		{
 			ExecutorService pool = Executors.newFixedThreadPool(k);
 			ArrayList<MatrixMultWSigmoidTask> tasks = new ArrayList<>();
 			int blklen = (int)(Math.ceil((double)mW.rlen/k));
@@ -927,169 +927,189 @@ public class LibMatrixMult
 
 	private static void matrixMultDenseDense(MatrixBlock m1, MatrixBlock m2, MatrixBlock ret, boolean tm2, boolean pm2, int rl, int ru, int cl, int cu) 
 		throws DMLRuntimeException
-	{			
+	{
 		double[] a = m1.denseBlock;
 		double[] b = m2.denseBlock;
 		double[] c = ret.denseBlock;
 		final int m = m1.rlen;
 		final int n = m2.clen;
 		final int cd = m1.clen;
-
-		if( LOW_LEVEL_OPTIMIZATION )
-		{
-			if( m==1 && n==1 ) 		      //DOT PRODUCT
-			{
+		
+		if( LOW_LEVEL_OPTIMIZATION ) {
+			if( m==1 && n==1 ) {            //DOT PRODUCT
 				c[0] = dotProduct(a, b, cd);
 			}
-			else if( n>1 && cd == 1 )     //OUTER PRODUCT
-			{
+			else if( n>1 && cd == 1 ) {     //OUTER PRODUCT
 				for( int i=rl, cix=rl*n; i < ru; i++, cix+=n) {
 					if( a[i] == 1 )
 						System.arraycopy(b, 0, c, cix, n);
-				    else if( a[i] != 0 )
+					else if( a[i] != 0 )
 						vectMultiplyWrite(a[i], b, c, 0, cix, n);
 					else
 						Arrays.fill(c, cix, cix+n, 0);
 				}
 			}
-			else if( n==1 && cd == 1 )    //VECTOR-SCALAR
-			{
+			else if( n==1 && cd == 1 ) {    //VECTOR-SCALAR
 				vectMultiplyWrite(b[0], a, c, rl, rl, ru-rl);
 			}
-			else if( n==1 && cd<=2*1024 ) //MATRIX-VECTOR (short rhs)
-			{
-				for( int i=rl, aix=rl*cd; i < ru; i++, aix+=cd) 
-					c[i] = dotProduct(a, b, aix, 0, cd);	
+			else if( n==1 && cd<=2*1024 ) { //MATRIX-VECTOR (short rhs)
+				matrixMultDenseDenseMVShortRHS(a, b, c, cd, rl, ru);
 			}
-			else if( n==1 )               //MATRIX-VECTOR (tall rhs)
-			{
-				final int blocksizeI = 32;
-				final int blocksizeK = 2*1024; //16KB vector blocks (L1) 
-				for( int bi=rl; bi<ru; bi+=blocksizeI ) {
-					int bimin = Math.min(bi+blocksizeI, ru);
-					for( int bk=0; bk<cd; bk+=blocksizeK ) {
-						int bkmin = Math.min(bk+blocksizeK, cd);
-						for( int i=bi, aix=bi*cd+bk; i<bimin; i++, aix+=cd) 
-							c[i] += dotProduct(a, b, aix, bk, bkmin-bk);	
-					}
-				}
+			else if( n==1 ) {               //MATRIX-VECTOR (tall rhs)
+				matrixMultDenseDenseMVTallRHS(a, b, c, cd, rl, ru);
 			}
-			else if( pm2 && m==1 )        //VECTOR-MATRIX
-			{
-				//parallelization over rows in rhs matrix
-				//rest not aligned to blocks of 2 rows
-				final int kn = (ru-rl)%2;
-				if( kn == 1 && a[rl] != 0 )
-					vectMultiplyAdd(a[rl], b, c, rl*n, 0, n);
-				
-				//compute blocks of 2 rows (2 instead of 4 for small n<64) 
-				for( int k=rl+kn, bix=(rl+kn)*n; k<ru; k+=2, bix+=2*n ){
-					if( a[k] != 0 && a[k+1] != 0  )
-						vectMultiplyAdd2(a[k], a[k+1], b, c, bix, bix+n, 0, n);
-					else if( a[k] != 0 )
-						vectMultiplyAdd(a[k], b, c, bix, 0, n);
-					else if( a[k+1] != 0 )	
-						vectMultiplyAdd(a[k+1], b, c, bix+n, 0, n);
-				}
+			else if( pm2 && m==1 ) {        //VECTOR-MATRIX
+				matrixMultDenseDenseVM(a, b, c, n, cd, rl, ru);
 			}
-			else if( pm2 && m<=16 )       //MATRIX-MATRIX (short lhs) 
-			{
-				//cache-conscious parallelization over rows in rhs matrix
-				final int kn = (ru-rl)%4;				
-				
-				//rest not aligned to blocks of 2 rows
-				for( int i=0, aix=0, cix=0; i<m; i++, aix+=cd, cix+=n )
-					for( int k=rl, bix=rl*n; k<rl+kn; k++, bix+=n )
-						if( a[aix+k] != 0 )
-							vectMultiplyAdd(a[aix+k], b, c, bix, cix, n);
-				
-				final int blocksizeK = 48;
-				final int blocksizeJ = 1024;
-				
-				//blocked execution
-				for( int bk = rl+kn; bk < ru; bk+=blocksizeK ) 
-					for( int bj = 0, bkmin = Math.min(ru, bk+blocksizeK); bj < n; bj+=blocksizeJ ) 
-					{
-						//compute blocks of 4 rows in rhs w/ IKJ
-						int bjlen = Math.min(n, bj+blocksizeJ)-bj;
-						for( int i=0, aix=0, cix=bj; i<m; i++, aix+=cd, cix+=n )
-							for( int k=bk, bix=bk*n+bj; k<bkmin; k+=4, bix+=4*n ) {
-								vectMultiplyAdd4(a[aix+k], a[aix+k+1], a[aix+k+2], a[aix+k+3],
-										b, c, bix, bix+n, bix+2*n, bix+3*n, cix, bjlen);
-							}
-					}
+			else if( pm2 && m<=16 ) {       //MATRIX-MATRIX (short lhs) 
+				matrixMultDenseDenseMMShortLHS(a, b, c, m, n, cd, rl, ru);
 			}
-			else if( tm2 )                //MATRIX-MATRIX (skinny rhs)
-			{
-				//note: prepared rhs input via transpose for: m > n && cd > 64 && n < 64
-				//however, explicit flag required since dimension change m2
-				final int n2 = m2.rlen;
-				for( int i=rl, aix=rl*cd, cix=rl*n2; i < ru; i++, aix+=cd, cix+=n2 ) 
-					for( int j=0, bix=0; j<n2; j++, bix+=cd )
-						c[cix+j] = dotProduct(a, b, aix, bix, cd);
-			}
-			else                          //MATRIX-MATRIX
-			{	
-				//1) Unrolled inner loop (for better instruction-level parallelism)
-				//2) Blocked execution (for less cache trashing in parallel exec) 	
-				//3) Asymmetric block sizes (for less misses in inner loop, yet blocks in L1/L2)
-				
-				final int blocksizeI = 32; //64//256KB c block (typical L2 size per core), 32KB a block 
-				final int blocksizeK = 24; //64//256KB b block (typical L2 size per core), used while read 512B of a / read/write 4KB of c 
-				final int blocksizeJ = 1024; //512//4KB (typical main-memory page size), for scan 
-
-				//temporary arrays (nnz a, b index)
-				double[] ta = new double[ blocksizeK ];
-				int[]  tbi  = new int[ blocksizeK ];
-				
-				//blocked execution
-				for( int bi = rl; bi < ru; bi+=blocksizeI )
-					for( int bk = 0, bimin = Math.min(ru, bi+blocksizeI); bk < cd; bk+=blocksizeK ) 
-						for( int bj = cl, bkmin = Math.min(cd, bk+blocksizeK); bj < cu; bj+=blocksizeJ ) 
-						{
-							int bklen = bkmin-bk;
-							int bjlen = Math.min(cu, bj+blocksizeJ)-bj;
-							
-							//core sub block matrix multiplication
-				    		for( int i = bi; i < bimin; i++) 
-				    		{
-				    			int aixi = i * cd + bk; //start index on a
-				    			int cixj = i * n + bj; //scan index on c
-				    			
-				    			//determine nnz of a (for sparsity-aware skipping of rows)
-				    			int knnz = copyNonZeroElements(a, aixi, bk, bj, n, ta, tbi, bklen);
-				    			//if( knnz > 0 ) //for skipping empty rows
-				    			
-			    				//rest not aligned to blocks of 4 rows
-				    			final int bn = knnz % 4;
-				    			switch( bn ){
-					    			case 1: vectMultiplyAdd(ta[0], b, c, tbi[0], cixj, bjlen); break;
-					    	    	case 2: vectMultiplyAdd2(ta[0],ta[1], b, c, tbi[0], tbi[1], cixj, bjlen); break;
-					    			case 3: vectMultiplyAdd3(ta[0],ta[1],ta[2], b, c, tbi[0], tbi[1],tbi[2], cixj, bjlen); break;
-				    			}
-				    			
-				    			//compute blocks of 4 rows (core inner loop)
-				    			for( int k = bn; k<knnz; k+=4 ){
-				    				vectMultiplyAdd4( ta[k], ta[k+1], ta[k+2], ta[k+3], b, c, 
-				    						          tbi[k], tbi[k+1], tbi[k+2], tbi[k+3], cixj, bjlen );
-				    			}
-				    		}
-						}
+			else if( tm2 ) {                //MATRIX-MATRIX (skinny rhs)
+				matrixMultDenseDenseMMSkinnyRHS(a, b, c, m2.rlen, cd, rl, ru);
+			}
+			else {                          //MATRIX-MATRIX
+				matrixMultDenseDenseMM(a, b, c, n, cd, rl, ru, cl, cu);
 			}
 		}
-		else
-		{
-			double val;
+		else {
 			for( int i = rl, aix=rl*cd, cix=rl*n; i < ru; i++, cix+=n) 
-				for( int k = 0, bix=0; k < cd; k++, aix++, bix+=n)
-				{			
-					val = a[ aix ];
+				for( int k = 0, bix=0; k < cd; k++, aix++, bix+=n) {
+					double val = a[ aix ];
 					if( val != 0 )
 						for( int j = 0; j < n; j++) 
 							c[ cix+j ] += val * b[ bix+j ];
-				}	
+				}
 		}
+	}
+	
+	private static void matrixMultDenseDenseMVShortRHS(double[] a, double[] b, double[] c, int cd, int rl, int ru) 
+		throws DMLRuntimeException
+	{
+		for( int i=rl, aix=rl*cd; i < ru; i++, aix+=cd) 
+			c[i] = dotProduct(a, b, aix, 0, cd);
+	}
+	
+	private static void matrixMultDenseDenseMVTallRHS(double[] a, double[] b, double[] c, int cd, int rl, int ru) 
+		throws DMLRuntimeException
+	{
+		final int blocksizeI = 32;
+		final int blocksizeK = 2*1024; //16KB vector blocks (L1)
+		for( int bi=rl; bi<ru; bi+=blocksizeI ) {
+			int bimin = Math.min(bi+blocksizeI, ru);
+			for( int bk=0; bk<cd; bk+=blocksizeK ) {
+				int bkmin = Math.min(bk+blocksizeK, cd);
+				for( int i=bi, aix=bi*cd+bk; i<bimin; i++, aix+=cd) 
+					c[i] += dotProduct(a, b, aix, bk, bkmin-bk);
+			}
+		}
+	}
+	
+	private static void matrixMultDenseDenseVM(double[] a, double[] b, double[] c, int n, int cd, int rl, int ru) 
+		throws DMLRuntimeException
+	{
+		//parallelization over rows in rhs matrix
+		//rest not aligned to blocks of 2 rows
+		final int kn = (ru-rl)%2;
+		if( kn == 1 && a[rl] != 0 )
+			vectMultiplyAdd(a[rl], b, c, rl*n, 0, n);
 		
+		//compute blocks of 2 rows (2 instead of 4 for small n<64) 
+		for( int k=rl+kn, bix=(rl+kn)*n; k<ru; k+=2, bix+=2*n ){
+			if( a[k] != 0 && a[k+1] != 0  )
+				vectMultiplyAdd2(a[k], a[k+1], b, c, bix, bix+n, 0, n);
+			else if( a[k] != 0 )
+				vectMultiplyAdd(a[k], b, c, bix, 0, n);
+			else if( a[k+1] != 0 )	
+				vectMultiplyAdd(a[k+1], b, c, bix+n, 0, n);
+		}
+	}
+	
+	private static void matrixMultDenseDenseMMShortLHS(double[] a, double[] b, double[] c, int m, int n, int cd, int rl, int ru)
+		throws DMLRuntimeException
+	{
+		//cache-conscious parallelization over rows in rhs matrix
+		final int kn = (ru-rl)%4;
+		
+		//rest not aligned to blocks of 2 rows
+		for( int i=0, aix=0, cix=0; i<m; i++, aix+=cd, cix+=n )
+			for( int k=rl, bix=rl*n; k<rl+kn; k++, bix+=n )
+				if( a[aix+k] != 0 )
+					vectMultiplyAdd(a[aix+k], b, c, bix, cix, n);
+		
+		final int blocksizeK = 48;
+		final int blocksizeJ = 1024;
+		
+		//blocked execution
+		for( int bk = rl+kn; bk < ru; bk+=blocksizeK ) 
+			for( int bj = 0, bkmin = Math.min(ru, bk+blocksizeK); bj < n; bj+=blocksizeJ ) {
+				//compute blocks of 4 rows in rhs w/ IKJ
+				int bjlen = Math.min(n, bj+blocksizeJ)-bj;
+				for( int i=0, aix=0, cix=bj; i<m; i++, aix+=cd, cix+=n )
+					for( int k=bk, bix=bk*n+bj; k<bkmin; k+=4, bix+=4*n ) {
+						vectMultiplyAdd4(a[aix+k], a[aix+k+1], a[aix+k+2], a[aix+k+3],
+							b, c, bix, bix+n, bix+2*n, bix+3*n, cix, bjlen);
+					}
+			}
+	}
+	
+	private static void matrixMultDenseDenseMMSkinnyRHS(double[] a, double[] b, double[] c, int n2, int cd, int rl, int ru) 
+		throws DMLRuntimeException
+	{
+		//note: prepared rhs input via transpose for: m > n && cd > 64 && n < 64
+		//however, explicit flag required since dimension change m2
+		for( int i=rl, aix=rl*cd, cix=rl*n2; i < ru; i++, aix+=cd, cix+=n2 ) 
+			for( int j=0, bix=0; j<n2; j++, bix+=cd )
+				c[cix+j] = dotProduct(a, b, aix, bix, cd);
+	}
+	
+	private static void matrixMultDenseDenseMM(double[] a, double[] b, double[] c, int n, int cd, int rl, int ru, int cl, int cu) 
+		throws DMLRuntimeException
+	{
+		//1) Unrolled inner loop (for better instruction-level parallelism)
+		//2) Blocked execution (for less cache trashing in parallel exec) 	
+		//3) Asymmetric block sizes (for less misses in inner loop, yet blocks in L1/L2)
+		
+		final int blocksizeI = 32; //64//256KB c block (typical L2 size per core), 32KB a block 
+		final int blocksizeK = 24; //64//256KB b block (typical L2 size per core), used while read 512B of a / read/write 4KB of c 
+		final int blocksizeJ = 1024; //512//4KB (typical main-memory page size), for scan 
+
+		//temporary arrays (nnz a, b index)
+		double[] ta = new double[ blocksizeK ];
+		int[]  tbi  = new int[ blocksizeK ];
+		
+		//blocked execution
+		for( int bi = rl; bi < ru; bi+=blocksizeI )
+			for( int bk = 0, bimin = Math.min(ru, bi+blocksizeI); bk < cd; bk+=blocksizeK ) 
+				for( int bj = cl, bkmin = Math.min(cd, bk+blocksizeK); bj < cu; bj+=blocksizeJ ) 
+				{
+					int bklen = bkmin-bk;
+					int bjlen = Math.min(cu, bj+blocksizeJ)-bj;
+					
+					//core sub block matrix multiplication
+					for( int i = bi; i < bimin; i++) 
+					{
+						int aixi = i * cd + bk; //start index on a
+						int cixj = i * n + bj; //scan index on c
+						
+						//determine nnz of a (for sparsity-aware skipping of rows)
+						int knnz = copyNonZeroElements(a, aixi, bk, bj, n, ta, tbi, bklen);
+						//if( knnz > 0 ) //for skipping empty rows
+						
+						//rest not aligned to blocks of 4 rows
+						final int bn = knnz % 4;
+						switch( bn ){
+							case 1: vectMultiplyAdd(ta[0], b, c, tbi[0], cixj, bjlen); break;
+							case 2: vectMultiplyAdd2(ta[0],ta[1], b, c, tbi[0], tbi[1], cixj, bjlen); break;
+							case 3: vectMultiplyAdd3(ta[0],ta[1],ta[2], b, c, tbi[0], tbi[1],tbi[2], cixj, bjlen); break;
+						}
+						
+						//compute blocks of 4 rows (core inner loop)
+						for( int k = bn; k<knnz; k+=4 ){
+							vectMultiplyAdd4( ta[k], ta[k+1], ta[k+2], ta[k+3], b, c, 
+									tbi[k], tbi[k+1], tbi[k+2], tbi[k+3], cixj, bjlen );
+						}
+					}
+				}
 	}
 
 	private static void matrixMultDenseSparse(MatrixBlock m1, MatrixBlock m2, MatrixBlock ret, boolean pm2, int rl, int ru) 
@@ -1163,7 +1183,8 @@ public class LibMatrixMult
 
 	private static void matrixMultSparseDense(MatrixBlock m1, MatrixBlock m2, MatrixBlock ret, boolean pm2, int rl, int ru) 
 		throws DMLRuntimeException
-	{	
+	{
+		SparseBlock a = m1.sparseBlock;
 		double[] b = m2.denseBlock;
 		double[] c = ret.denseBlock;
 		final int m = m1.rlen;
@@ -1171,179 +1192,195 @@ public class LibMatrixMult
 		final int cd = m2.rlen;
 		final long xsp = (long)m*cd/m1.nonZeros;
 
-		if( LOW_LEVEL_OPTIMIZATION )
-		{
-			SparseBlock a = m1.sparseBlock;
-			
-			if( m==1 && n==1 )            //DOT PRODUCT
-			{
-				if( !a.isEmpty(0) ) {
+		if( LOW_LEVEL_OPTIMIZATION ) {
+			if( m==1 && n==1 ) {            //DOT PRODUCT
+				if( !a.isEmpty(0) )
 					c[0] = dotProduct(a.values(0), b, a.indexes(0), a.pos(0), 0, a.size(0));
-				}
 			}
-			else if( n==1 && cd<=2*1024 ) //MATRIX-VECTOR (short rhs)
-			{
-				for( int i=rl; i<ru; i++ )
-					if( !a.isEmpty(i) )
-						c[i] = dotProduct(a.values(i), b, a.indexes(i), a.pos(i), 0, a.size(i));
+			else if( n==1 && cd<=2*1024 ) { //MATRIX-VECTOR (short rhs)
+				matrixMultSparseDenseMVShortRHS(a, b, c, rl, ru);
 			}
-			else if( n==1 )               //MATRIX-VECTOR (tall rhs)
-			{
-				final int blocksizeI = 32;
-				final int blocksizeK = (int)Math.max(2*1024,2*1024*xsp/32); //~ 16KB L1  
-				int[] curk = new int[blocksizeI];
-				
-				for( int bi = rl; bi < ru; bi+=blocksizeI ) {
-					Arrays.fill(curk, 0); //reset positions
-					for( int bk=0, bimin = Math.min(ru, bi+blocksizeI); bk<cd; bk+=blocksizeK ) {
-						for( int i=bi, bkmin = Math.min(bk+blocksizeK, cd); i<bimin; i++) {
-							if( a.isEmpty(i) ) continue;
-							int apos = a.pos(i);
-							int alen = a.size(i);
-							int[] aix = a.indexes(i);
-							double[] avals = a.values(i);
-							int k = curk[i-bi] + apos;
-							for( ; k<apos+alen && aix[k]<bkmin; k++ )
-								c[i] += avals[k] * b[aix[k]];
-							curk[i-bi] = k - apos;
-						}
-					}
-				}
+			else if( n==1 ) {               //MATRIX-VECTOR (tall rhs)
+				matrixMultSparseDenseMVTallRHS(a, b, c, cd, xsp, rl, ru);
 			}
-			else if( pm2 && m==1 )        //VECTOR-MATRIX
-			{
-				//parallelization over rows in rhs matrix
-				if( !a.isEmpty(0) ) 
-				{
-					int alen = a.size(0);
-					int[] aix = a.indexes(0);
-					double[] avals = a.values(0);
-					int rlix = (rl==0) ? 0 : a.posFIndexGTE(0,rl);
-					rlix = (rlix>=0) ? rlix : alen;
-					
-					for( int k=rlix; k<alen && aix[k]<ru; k++ ) {
-						if( k+1<alen && aix[k+1]<ru )
-							vectMultiplyAdd2(avals[k], avals[k+1], b, c, aix[k]*n, aix[++k]*n, 0, n);
-						else
-							vectMultiplyAdd(avals[k], b, c, aix[k]*n, 0, n);
-					}
-				}
+			else if( pm2 && m==1 ) {        //VECTOR-MATRIX
+				matrixMultSparseDenseVM(a, b, c, n, rl, ru);
 			}
-			else if( pm2 && m<=16 )       //MATRIX-MATRIX (short lhs) 
-			{
-				int arlen = a.numRows();
-				for( int i=0, cix=0; i<arlen; i++, cix+=n )
-					if( !a.isEmpty(i) ) 
-					{
-						int apos = a.pos(i);
-						int alen = a.size(i);
-						int[] aix = a.indexes(i);
-						double[] avals = a.values(i);
-						
-						int k1 = (rl==0) ? 0 : a.posFIndexGTE(i, rl);
-						k1 = (k1>=0) ? apos+k1 : apos+alen;
-						int k2 = (ru==cd) ? alen : a.posFIndexGTE(i, ru);
-						k2 = (k2>=0) ? apos+k2 : apos+alen;
-						
-						//rest not aligned to blocks of 4 rows
-		    			final int bn = (k2-k1) % 4;
-		    			switch( bn ){
-			    			case 1: vectMultiplyAdd(avals[k1], b, c, aix[k1]*n, cix, n); break;
-			    	    	case 2: vectMultiplyAdd2(avals[k1],avals[k1+1], b, c, aix[k1]*n, aix[k1+1]*n, cix, n); break;
-			    			case 3: vectMultiplyAdd3(avals[k1],avals[k1+1],avals[k1+2], b, c, aix[k1]*n, aix[k1+1]*n, aix[k1+2]*n, cix, n); break;
-		    			}
-		    			
-		    			//compute blocks of 4 rows (core inner loop)
-		    			for( int k = k1+bn; k<k2; k+=4 ) {
-		    				vectMultiplyAdd4( avals[k], avals[k+1], avals[k+2], avals[k+3], b, c, 
-		    						          aix[k]*n, aix[k+1]*n, aix[k+2]*n, aix[k+3]*n, cix, n );
-		    			}
-					}
+			else if( pm2 && m<=16 ) {       //MATRIX-MATRIX (short lhs) 
+				matrixMultSparseDenseMMShortLHS(a, b, c, n, cd, rl, ru);
 			}
-			else if( n<=64 )              //MATRIX-MATRIX (skinny rhs)
-			{
-				//no blocking since b and c fit into cache anyway
-				for( int i=rl, cix=rl*n; i<ru; i++, cix+=n ) {
-					if( a.isEmpty(i) ) 
-						continue;
-					int apos = a.pos(i);
-					int alen = a.size(i);
-					int[] aix = a.indexes(i);
-					double[] avals = a.values(i);
-					//rest not aligned to blocks of 4 rows
-					int bn = alen%4;
-					for( int k=apos; k<apos+bn; k++ )
-	    				vectMultiplyAdd(avals[k], b, c, aix[k]*n, cix, n); 
-	    			//compute blocks of 4 rows (core inner loop)
-	    			for( int k=apos+bn; k<apos+alen; k+=4 )
-	    				vectMultiplyAdd4( avals[k], avals[k+1], avals[k+2], avals[k+3], b, c, 
-	    					aix[k]*n, aix[k+1]*n, aix[k+2]*n, aix[k+3]*n, cix, n );
-				}	
+			else if( n<=64 ) {              //MATRIX-MATRIX (skinny rhs)
+				matrixMultSparseDenseMMSkinnyRHS(a, b, c, n, rl, ru);
 			}
-			else                          //MATRIX-MATRIX
-			{
-				//blocksizes to fit blocks of B (dense) and several rows of A/C in common L2 cache size, 
-				//while blocking A/C for L1/L2 yet allowing long scans (2 pages) in the inner loop over j
-				//in case of almost ultra-sparse matrices, we cannot ensure the blocking for the rhs and
-				//output - however, in this case it's unlikely that we consume every cache line in the rhs
-				
-				final int blocksizeI = (int) (8L*m*cd/m1.nonZeros);
-				final int blocksizeK = (int) (8L*m*cd/m1.nonZeros);
-				final int blocksizeJ = 1024; 
-				
-				//temporary array of current sparse positions
-				int[] curk = new int[blocksizeI];
-				
-				//blocked execution over IKJ 
-				for( int bi = rl; bi < ru; bi+=blocksizeI ) {
-					Arrays.fill(curk, 0); //reset positions
-					for( int bk = 0, bimin = Math.min(ru, bi+blocksizeI); bk < cd; bk+=blocksizeK ) {
-						for( int bj = 0, bkmin = Math.min(cd, bk+blocksizeK); bj < n; bj+=blocksizeJ ) {
-							int bjlen = Math.min(n, bj+blocksizeJ)-bj;
-							
-							//core sub block matrix multiplication
-							for( int i=bi, cix=bi*n+bj; i<bimin; i++, cix+=n ) {
-								if( !a.isEmpty(i) ) {
-									int apos = a.pos(i);
-									int alen = a.size(i);
-									int[] aix = a.indexes(i);
-									double[] avals = a.values(i);
-									
-									int k = curk[i-bi] + apos;
-					    			//rest not aligned to blocks of 4 rows
-									int bn = alen%4;
-									for( ; k<apos+bn && aix[k]<bkmin; k++ )
-					    				vectMultiplyAdd(avals[k], b, c, aix[k]*n+bj, cix, bjlen); 
-					    			//compute blocks of 4 rows (core inner loop), allowed to exceed bkmin
-					    			for( ; k<apos+alen && aix[k]<bkmin; k+=4 )
-					    				vectMultiplyAdd4( avals[k], avals[k+1], avals[k+2], avals[k+3], b, c, 
-					    					aix[k]*n+bj, aix[k+1]*n+bj, aix[k+2]*n+bj, aix[k+3]*n+bj, cix, bjlen );
-					    			//update positions on last bj block
-					    			if( bj+bjlen==n )
-					    				curk[i-bi] = k - apos;
-								}
-							}
-						}
-					}
+			else {                          //MATRIX-MATRIX
+				matrixMultSparseDenseMM(a, b, c, n, cd, xsp, rl, ru);
+			}
+		}
+		else {
+			for( int i=rl, cix=rl*n; i<ru; i++, cix+=n ) {
+				if( a.isEmpty(i) ) continue; 
+				int apos = a.pos(i);
+				int alen = a.size(i);
+				int[] aix = a.indexes(i);
+				double[] avals = a.values(i);
+				for(int k = apos; k < apos+alen; k++) {
+					double val = avals[k];
+					for(int j = 0, bix=aix[k]*n; j < n; j++)
+						c[cix+j] += val * b[bix+j];
 				}
 			}
 		}
-		else
-		{
-			SparseBlock a = m1.sparseBlock;
-			for( int i=rl, cix=rl*n; i<ru; i++, cix+=n )
-			{
-				if( !a.isEmpty(i) ) 
-				{
+	}
+	
+	private static void matrixMultSparseDenseMVShortRHS(SparseBlock a, double[] b, double[] c, int rl, int ru) 
+		throws DMLRuntimeException
+	{
+		for( int i=rl; i<ru; i++ )
+			if( !a.isEmpty(i) )
+				c[i] = dotProduct(a.values(i), b, a.indexes(i), a.pos(i), 0, a.size(i));
+	}
+	
+	private static void matrixMultSparseDenseMVTallRHS(SparseBlock a, double[] b, double[] c, int cd, long xsp, int rl, int ru) 
+		throws DMLRuntimeException
+	{	
+		final int blocksizeI = 32;
+		final int blocksizeK = (int)Math.max(2*1024,2*1024*xsp/32); //~ 16KB L1
+		int[] curk = new int[blocksizeI];
+		
+		for( int bi = rl; bi < ru; bi+=blocksizeI ) {
+			Arrays.fill(curk, 0); //reset positions
+			for( int bk=0, bimin = Math.min(ru, bi+blocksizeI); bk<cd; bk+=blocksizeK ) {
+				for( int i=bi, bkmin = Math.min(bk+blocksizeK, cd); i<bimin; i++) {
+					if( a.isEmpty(i) ) continue;
 					int apos = a.pos(i);
 					int alen = a.size(i);
 					int[] aix = a.indexes(i);
 					double[] avals = a.values(i);
+					int k = curk[i-bi] + apos;
+					for( ; k<apos+alen && aix[k]<bkmin; k++ )
+						c[i] += avals[k] * b[aix[k]];
+					curk[i-bi] = k - apos;
+				}
+			}
+		}
+	}
+	
+	private static void matrixMultSparseDenseVM(SparseBlock a, double[] b, double[] c, int n, int rl, int ru) 
+		throws DMLRuntimeException
+	{
+		if( a.isEmpty(0) )
+			return;
+		
+		//parallelization over rows in rhs matrix
+		int alen = a.size(0);
+		int[] aix = a.indexes(0);
+		double[] avals = a.values(0);
+		int rlix = (rl==0) ? 0 : a.posFIndexGTE(0,rl);
+		rlix = (rlix>=0) ? rlix : alen;
+		
+		for( int k=rlix; k<alen && aix[k]<ru; k++ ) {
+			if( k+1<alen && aix[k+1]<ru )
+				vectMultiplyAdd2(avals[k], avals[k+1], b, c, aix[k]*n, aix[++k]*n, 0, n);
+			else
+				vectMultiplyAdd(avals[k], b, c, aix[k]*n, 0, n);
+		}
+	}
+	
+	private static void matrixMultSparseDenseMMShortLHS(SparseBlock a, double[] b, double[] c, int n, int cd, int rl, int ru) 
+		throws DMLRuntimeException
+	{	
+		int arlen = a.numRows();
+		for( int i=0, cix=0; i<arlen; i++, cix+=n ) {
+			if( a.isEmpty(i) ) continue;
+			int apos = a.pos(i);
+			int alen = a.size(i);
+			int[] aix = a.indexes(i);
+			double[] avals = a.values(i);
+			
+			int k1 = (rl==0) ? 0 : a.posFIndexGTE(i, rl);
+			k1 = (k1>=0) ? apos+k1 : apos+alen;
+			int k2 = (ru==cd) ? alen : a.posFIndexGTE(i, ru);
+			k2 = (k2>=0) ? apos+k2 : apos+alen;
+			
+			//rest not aligned to blocks of 4 rows
+			final int bn = (k2-k1) % 4;
+			switch( bn ){
+				case 1: vectMultiplyAdd(avals[k1], b, c, aix[k1]*n, cix, n); break;
+				case 2: vectMultiplyAdd2(avals[k1],avals[k1+1], b, c, aix[k1]*n, aix[k1+1]*n, cix, n); break;
+				case 3: vectMultiplyAdd3(avals[k1],avals[k1+1],avals[k1+2], b, c, aix[k1]*n, aix[k1+1]*n, aix[k1+2]*n, cix, n); break;
+			}
+			
+			//compute blocks of 4 rows (core inner loop)
+			for( int k = k1+bn; k<k2; k+=4 ) {
+				vectMultiplyAdd4( avals[k], avals[k+1], avals[k+2], avals[k+3], b, c, 
+					aix[k]*n, aix[k+1]*n, aix[k+2]*n, aix[k+3]*n, cix, n );
+			}
+		}
+	}
+	
+	private static void matrixMultSparseDenseMMSkinnyRHS(SparseBlock a, double[] b, double[] c, int n, int rl, int ru) 
+		throws DMLRuntimeException
+	{	
+		//no blocking since b and c fit into cache anyway
+		for( int i=rl, cix=rl*n; i<ru; i++, cix+=n ) {
+			if( a.isEmpty(i) ) continue;
+			int apos = a.pos(i);
+			int alen = a.size(i);
+			int[] aix = a.indexes(i);
+			double[] avals = a.values(i);
+			//rest not aligned to blocks of 4 rows
+			int bn = alen%4;
+			for( int k=apos; k<apos+bn; k++ )
+				vectMultiplyAdd(avals[k], b, c, aix[k]*n, cix, n);
+			//compute blocks of 4 rows (core inner loop)
+			for( int k=apos+bn; k<apos+alen; k+=4 )
+				vectMultiplyAdd4( avals[k], avals[k+1], avals[k+2], avals[k+3], b, c,
+					aix[k]*n, aix[k+1]*n, aix[k+2]*n, aix[k+3]*n, cix, n );
+		}
+	}
+	
+	private static void matrixMultSparseDenseMM(SparseBlock a, double[] b, double[] c, int n, int cd, long xsp, int rl, int ru) 
+		throws DMLRuntimeException
+	{	
+		//blocksizes to fit blocks of B (dense) and several rows of A/C in common L2 cache size, 
+		//while blocking A/C for L1/L2 yet allowing long scans (2 pages) in the inner loop over j
+		//in case of almost ultra-sparse matrices, we cannot ensure the blocking for the rhs and
+		//output - however, in this case it's unlikely that we consume every cache line in the rhs
+		final int blocksizeI = (int) (8L*xsp);
+		final int blocksizeK = (int) (8L*xsp);
+		final int blocksizeJ = 1024; 
+		
+		//temporary array of current sparse positions
+		int[] curk = new int[blocksizeI];
+		
+		//blocked execution over IKJ 
+		for( int bi = rl; bi < ru; bi+=blocksizeI ) {
+			Arrays.fill(curk, 0); //reset positions
+			for( int bk = 0, bimin = Math.min(ru, bi+blocksizeI); bk < cd; bk+=blocksizeK ) {
+				for( int bj = 0, bkmin = Math.min(cd, bk+blocksizeK); bj < n; bj+=blocksizeJ ) {
+					int bjlen = Math.min(n, bj+blocksizeJ)-bj;
 					
-					for(int k = apos; k < apos+alen; k++) {
-						double val = avals[k];
-						for(int j = 0, bix=aix[k]*n; j < n; j++)
-							c[cix+j] += val * b[bix+j];
+					//core sub block matrix multiplication
+					for( int i=bi, cix=bi*n+bj; i<bimin; i++, cix+=n ) {
+						if( !a.isEmpty(i) ) {
+							int apos = a.pos(i);
+							int alen = a.size(i);
+							int[] aix = a.indexes(i);
+							double[] avals = a.values(i);
+							
+							int k = curk[i-bi] + apos;
+							//rest not aligned to blocks of 4 rows
+							int bn = alen%4;
+							for( ; k<apos+bn && aix[k]<bkmin; k++ )
+								vectMultiplyAdd(avals[k], b, c, aix[k]*n+bj, cix, bjlen); 
+							//compute blocks of 4 rows (core inner loop), allowed to exceed bkmin
+							for( ; k<apos+alen && aix[k]<bkmin; k+=4 )
+								vectMultiplyAdd4( avals[k], avals[k+1], avals[k+2], avals[k+3], b, c, 
+									aix[k]*n+bj, aix[k+1]*n+bj, aix[k+2]*n+bj, aix[k+3]*n+bj, cix, bjlen );
+							//update positions on last bj block
+							if( bj+bjlen==n )
+								curk[i-bi] = k - apos;
+						}
 					}
 				}
 			}
@@ -1408,8 +1445,8 @@ public class LibMatrixMult
 								int[] aix = a.indexes(i);
 								double[] avals = a.values(i);	
 								
-								int k = curk[i-bi] + apos;									
-				    			for(; k < apos+alen && aix[k]<bkmin; k++) {
+								int k = curk[i-bi] + apos;
+								for(; k < apos+alen && aix[k]<bkmin; k++) {
 									if( !b.isEmpty(aix[k]) )
 										vectMultiplyAdd(avals[k], b.values(aix[k]), c, 
 											b.indexes(aix[k]), b.pos(aix[k]), cix, b.size(aix[k]));
@@ -1423,28 +1460,22 @@ public class LibMatrixMult
 		}
 		else
 		{
-			for( int i=rl, cix=rl*n; i<Math.min(ru, a.numRows()); i++, cix+=n )
-			{
-				if( !a.isEmpty(i) ) 
-				{
-					int apos = a.pos(i);
-					int alen = a.size(i);
-					int[] aix = a.indexes(i);
-					double[] avals = a.values(i);					
-					
-					for(int k = apos; k < apos+alen; k++) 
-					{
-						double val = avals[k];
-						if( !b.isEmpty(aix[k]) ) 
-						{
-							int bpos = b.pos(aix[k]);
-							int blen = b.size(aix[k]);
-							int[] bix = b.indexes(aix[k]);
-							double[] bvals = b.values(aix[k]);	
-							for(int j = bpos; j < bpos+blen; j++)
-								c[cix+bix[j]] += val * bvals[j];								
-						}
-					}						
+			for( int i=rl, cix=rl*n; i<Math.min(ru, a.numRows()); i++, cix+=n ) {
+				if( a.isEmpty(i) ) continue;
+				int apos = a.pos(i);
+				int alen = a.size(i);
+				int[] aix = a.indexes(i);
+				double[] avals = a.values(i);
+				
+				for(int k = apos; k < apos+alen; k++) {
+					if( b.isEmpty(aix[k]) ) continue;
+					double val = avals[k];
+					int bpos = b.pos(aix[k]);
+					int blen = b.size(aix[k]);
+					int[] bix = b.indexes(aix[k]);
+					double[] bvals = b.values(aix[k]);
+					for(int j = bpos; j < bpos+blen; j++)
+						c[cix+bix[j]] += val * bvals[j];
 				}
 			}
 		}