You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@systemml.apache.org by mb...@apache.org on 2017/02/10 07:02:35 UTC
[2/3] incubator-systemml git commit: [SYSTEMML-1078] Fix missing nnz maintenance conv2d ops, incl cleanups

[SYSTEMML-1078] Fix missing nnz maintenance conv2d ops, incl cleanups

This patch extends all conv2d operations by (so far unoptimized) nnz
maintenance in order to prevent side effects with update-in-place and
other operations that incrementally maintain the number of non-zeros. 


Project: http://git-wip-us.apache.org/repos/asf/incubator-systemml/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-systemml/commit/d0b23d60
Tree: http://git-wip-us.apache.org/repos/asf/incubator-systemml/tree/d0b23d60
Diff: http://git-wip-us.apache.org/repos/asf/incubator-systemml/diff/d0b23d60

Branch: refs/heads/master
Commit: d0b23d607998e0bebd3d9d051faf748dd5530ce8
Parents: 827cdba
Author: Matthias Boehm <mb...@gmail.com>
Authored: Fri Feb 10 05:50:52 2017 +0100
Committer: Matthias Boehm <mb...@gmail.com>
Committed: Fri Feb 10 07:55:52 2017 +0100

----------------------------------------------------------------------
 .../runtime/controlprogram/ProgramBlock.java    |   4 +-
 .../cp/ConvolutionCPInstruction.java            |  90 ++++---
 .../sysml/runtime/matrix/data/LibMatrixDNN.java | 233 ++++++++++---------
 3 files changed, 171 insertions(+), 156 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/d0b23d60/src/main/java/org/apache/sysml/runtime/controlprogram/ProgramBlock.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/controlprogram/ProgramBlock.java b/src/main/java/org/apache/sysml/runtime/controlprogram/ProgramBlock.java
index eb504ca..739b1cf 100644
--- a/src/main/java/org/apache/sysml/runtime/controlprogram/ProgramBlock.java
+++ b/src/main/java/org/apache/sysml/runtime/controlprogram/ProgramBlock.java
@@ -400,10 +400,10 @@ public class ProgramBlock
 					
 					if( nnz1 != nnz2 )
 						throw new DMLRuntimeException("Matrix nnz meta data was incorrect: ("+varname+", actual="+nnz1+", expected="+nnz2+", inst="+lastInst+")");
-							
 					
 					if( sparse1 != sparse2 )
-						throw new DMLRuntimeException("Matrix was in wrong data representation: ("+varname+", actual="+sparse1+", expected="+sparse2+", nnz="+nnz1+", inst="+lastInst+")");
+						throw new DMLRuntimeException("Matrix was in wrong data representation: ("+varname+", actual="+sparse1+", expected="+sparse2 + 
+								", nrow="+mb.getNumRows()+", ncol="+mb.getNumColumns()+", nnz="+nnz1+", inst="+lastInst+")");
 				}
 			}
 		}

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/d0b23d60/src/main/java/org/apache/sysml/runtime/instructions/cp/ConvolutionCPInstruction.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/instructions/cp/ConvolutionCPInstruction.java b/src/main/java/org/apache/sysml/runtime/instructions/cp/ConvolutionCPInstruction.java
index ed0b548..3513201 100644
--- a/src/main/java/org/apache/sysml/runtime/instructions/cp/ConvolutionCPInstruction.java
+++ b/src/main/java/org/apache/sysml/runtime/instructions/cp/ConvolutionCPInstruction.java
@@ -21,8 +21,6 @@ package org.apache.sysml.runtime.instructions.cp;
 
 import java.util.ArrayList;
 import java.util.Arrays;
-import org.apache.sysml.parser.Expression.DataType;
-import org.apache.sysml.parser.Expression.ValueType;
 import org.apache.sysml.runtime.DMLRuntimeException;
 import org.apache.sysml.runtime.controlprogram.context.ExecutionContext;
 import org.apache.sysml.runtime.functionobjects.SwapIndex;
@@ -33,8 +31,8 @@ import org.apache.sysml.runtime.matrix.data.MatrixBlock;
 import org.apache.sysml.runtime.matrix.operators.ReorgOperator;
 import org.apache.sysml.runtime.util.ConvolutionUtils;
 
-public class ConvolutionCPInstruction extends UnaryCPInstruction {
-	
+public class ConvolutionCPInstruction extends UnaryCPInstruction 
+{	
 	private CPOperand _in2;
 	private CPOperand _in3; 
 	private ArrayList<CPOperand> _input_shape;
@@ -101,8 +99,6 @@ public class ConvolutionCPInstruction extends UnaryCPInstruction {
 
 	public static ConvolutionCPInstruction parseInstruction(String str)
 			throws DMLRuntimeException {
-		CPOperand in = new CPOperand("", ValueType.UNKNOWN, DataType.UNKNOWN);
-		CPOperand out = new CPOperand("", ValueType.UNKNOWN, DataType.UNKNOWN);
 
 		String[] parts = InstructionUtils.getInstructionPartsWithValueType(str);
 		String opcode = parts[0];
@@ -111,8 +107,8 @@ public class ConvolutionCPInstruction extends UnaryCPInstruction {
 			// stride1, stride2, padding1, padding2
 			// input_shape1, input_shape2, input_shape3, input_shape4,
 			// filter_shape1, filter_shape2, filter_shape3, filter_shape4, k
-			in.split(parts[1]);
-			out.split(parts[14]);
+			CPOperand in = new CPOperand(parts[1]);
+			CPOperand out = new CPOperand(parts[14]);
 
 			ArrayList<CPOperand> stride = new ArrayList<CPOperand>();
 			ArrayList<CPOperand> padding = new ArrayList<CPOperand>();
@@ -143,10 +139,9 @@ public class ConvolutionCPInstruction extends UnaryCPInstruction {
 			// dout, stride1, stride2, padding1, padding2
 			// input_shape1, input_shape2, input_shape3, input_shape4,
 			// filter_shape1, filter_shape2, filter_shape3, filter_shape4, k
-			in.split(parts[1]);
-			CPOperand in2 = new CPOperand("", ValueType.UNKNOWN, DataType.UNKNOWN);
-			in2.split(parts[2]);
-			out.split(parts[15]);
+			CPOperand in = new CPOperand(parts[1]);
+			CPOperand in2 = new CPOperand(parts[2]);
+			CPOperand out = new CPOperand(parts[15]);
 
 			ArrayList<CPOperand> stride = new ArrayList<CPOperand>();
 			ArrayList<CPOperand> padding = new ArrayList<CPOperand>();
@@ -174,12 +169,10 @@ public class ConvolutionCPInstruction extends UnaryCPInstruction {
 			// dout, stride1, stride2, padding1, padding2
 			// input_shape1, input_shape2, input_shape3, input_shape4,
 			// filter_shape1, filter_shape2, filter_shape3, filter_shape4, k
-			in.split(parts[1]);
-			CPOperand in2 = new CPOperand("", ValueType.UNKNOWN, DataType.UNKNOWN);
-			in2.split(parts[2]);
-			CPOperand in3 = new CPOperand("", ValueType.UNKNOWN, DataType.UNKNOWN);
-			in3.split(parts[3]);
-			out.split(parts[16]);
+			CPOperand in = new CPOperand(parts[1]);
+			CPOperand in2 = new CPOperand(parts[2]);
+			CPOperand in3 = new CPOperand(parts[3]);
+			CPOperand out = new CPOperand(parts[16]);
 
 			ArrayList<CPOperand> stride = new ArrayList<CPOperand>();
 			ArrayList<CPOperand> padding = new ArrayList<CPOperand>();
@@ -204,10 +197,9 @@ public class ConvolutionCPInstruction extends UnaryCPInstruction {
 		}
 		else if (opcode.equalsIgnoreCase("bias_add") || opcode.equals("relu_backward")) {
 			InstructionUtils.checkNumFields(parts, 4);
-			in.split(parts[1]);
-			CPOperand in2 = new CPOperand("", ValueType.UNKNOWN, DataType.UNKNOWN);
-			in2.split(parts[2]);
-			out.split(parts[3]);
+			CPOperand in = new CPOperand(parts[1]);
+			CPOperand in2 = new CPOperand(parts[2]);
+			CPOperand out = new CPOperand(parts[3]);
 			int k = Integer.parseInt(parts[4]);
 			return new ConvolutionCPInstruction(in, in2, out, opcode, str, k);
 		}
@@ -216,24 +208,23 @@ public class ConvolutionCPInstruction extends UnaryCPInstruction {
 		}
 	}
 
-	private int getScalarInput(ExecutionContext ec, ArrayList<CPOperand> aL,
-			int index) throws DMLRuntimeException {
+	private int getScalarInput(ExecutionContext ec, ArrayList<CPOperand> aL, int index) 
+			throws DMLRuntimeException {
 		return (int) ec.getScalarInput(aL.get(index).getName(),
 				aL.get(index).getValueType(), aL.get(index).isLiteral())
 				.getLongValue();
 	}
 	
+	@SuppressWarnings("unused")
 	public void processReluBackwardInstruction(ExecutionContext ec) throws DMLRuntimeException {
 		// (X > 0) * dout
-		MatrixBlock outputBlock = null;
 		MatrixBlock input = ec.getMatrixInput(input1.getName());
 		MatrixBlock dout = ec.getMatrixInput(_in2.getName());
+		MatrixBlock outputBlock =  new MatrixBlock(input.getNumRows(), input.getNumColumns(), 
+			LibMatrixDNN.SUPPORTS_SPARSE_OUTPUTS && (input.isInSparseFormat() || dout.isInSparseFormat()));
 		
-		if(input.isEmptyBlock() || dout.isEmptyBlock()) {
-			outputBlock = new MatrixBlock(input.getNumRows(), input.getNumColumns(), true, 0);
-		}
-		else {
-			outputBlock = getDenseOutputBlock(ec, input.getNumRows(), input.getNumColumns());
+		if( !input.isEmptyBlock() && !dout.isEmptyBlock() ) {
+			outputBlock.allocateDenseOrSparseBlock();
 			LibMatrixDNN.reluBackward(input, dout, outputBlock, _numThreads);
 		}
 		
@@ -244,24 +235,24 @@ public class ConvolutionCPInstruction extends UnaryCPInstruction {
 	}
 	
 	public void processBiasAddInstruction(ExecutionContext ec) throws DMLRuntimeException {
-		MatrixBlock outputBlock = null;
 		MatrixBlock input = ec.getMatrixInput(input1.getName());
 		MatrixBlock bias = ec.getMatrixInput(_in2.getName());
+		MatrixBlock outputBlock = null;
 		
 		if(bias.getNumColumns() != 1) {
 			throw new DMLRuntimeException("Expected the number of columns of bias matrix to be 1, but found " + bias.getNumColumns());
 		}
 		
 		if(input.isEmptyBlock() && bias.isEmptyBlock()) {
-			outputBlock = new MatrixBlock(input.getNumRows(), input.getNumColumns(), true, 0);
+			outputBlock = new MatrixBlock(input.getNumRows(), input.getNumColumns(), true);
 		}
 		else if(bias.isEmptyBlock()) {
-			outputBlock = new MatrixBlock(input.getNumRows(), input.getNumColumns(), input.isInSparseFormat());
-			outputBlock.copy(input);
+			outputBlock = new MatrixBlock(input);
 		}
 		else {
 			// As we always fill the output first with bias
-			outputBlock = getDenseOutputBlock(ec, input.getNumRows(), input.getNumColumns());
+			outputBlock = new MatrixBlock(input.getNumRows(), input.getNumColumns(), false);
+			outputBlock.allocateDenseBlock();
 			LibMatrixDNN.biasAdd(input, bias, outputBlock, _numThreads);
 		}
 		
@@ -307,10 +298,10 @@ public class ConvolutionCPInstruction extends UnaryCPInstruction {
 		ConvolutionParameters params = new ConvolutionParameters(N, C, H, W, K, R, S, stride_h, stride_w, pad_h, pad_w, _numThreads);
 		if (instOpcode.equalsIgnoreCase("maxpooling") || instOpcode.equalsIgnoreCase("relu_maxpooling")) {
 			if(matBlock.isEmptyBlock()) {
-				outputBlock = new MatrixBlock(N, C*P*Q, true, 0);
+				outputBlock = new MatrixBlock(N, C*P*Q, true);
 			}
 			else {
-				outputBlock = getDenseOutputBlock(ec, N, C*P*Q);
+				outputBlock = getDenseOutputBlock(N, C*P*Q);
 				if(instOpcode.equalsIgnoreCase("maxpooling"))
 					Arrays.fill(outputBlock.getDenseBlock(), -Double.MAX_VALUE);
 				LibMatrixDNN.maxpooling(matBlock, outputBlock, params);
@@ -319,10 +310,10 @@ public class ConvolutionCPInstruction extends UnaryCPInstruction {
 		else if (instOpcode.equalsIgnoreCase("maxpooling_backward")) {
 			MatrixBlock dout = ec.getMatrixInput(_in2.getName());
 			if(matBlock.isEmptyBlock() || dout.isEmptyBlock()) {
-				outputBlock = new MatrixBlock(N, C*H*W, true, 0);
+				outputBlock = new MatrixBlock(N, C*H*W, true);
 			}
 			else {
-				outputBlock = getDenseOutputBlock(ec, N, C*H*W);
+				outputBlock = getDenseOutputBlock(N, C*H*W);
 				LibMatrixDNN.maxpoolingBackward(matBlock, dout, outputBlock, params);
 			}
 			ec.releaseMatrixInput(_in2.getName());
@@ -330,10 +321,10 @@ public class ConvolutionCPInstruction extends UnaryCPInstruction {
 		else if (instOpcode.equalsIgnoreCase("conv2d")) {
 			MatrixBlock filter = ec.getMatrixInput(_in2.getName());
 			if(filter.isEmptyBlock() || matBlock.isEmptyBlock()) {
-				outputBlock = new MatrixBlock(N, K*P*Q, true, 0);
+				outputBlock = new MatrixBlock(N, K*P*Q, true);
 			}
 			else {
-				outputBlock = getDenseOutputBlock(ec, N, K*P*Q);
+				outputBlock = getDenseOutputBlock(N, K*P*Q);
 				LibMatrixDNN.conv2d(matBlock, filter, outputBlock, params);
 			}
 			ec.releaseMatrixInput(_in2.getName());
@@ -342,10 +333,10 @@ public class ConvolutionCPInstruction extends UnaryCPInstruction {
 			MatrixBlock filter = ec.getMatrixInput(_in3.getName());
 			MatrixBlock bias = ec.getMatrixInput(_in2.getName());
 			if((filter.isEmptyBlock() || matBlock.isEmptyBlock()) && bias.isEmptyBlock()) {
-				outputBlock = new MatrixBlock(N, K*P*Q, true, 0);
+				outputBlock = new MatrixBlock(N, K*P*Q, true);
 			}
 			else {
-				outputBlock = getDenseOutputBlock(ec, N, K*P*Q);
+				outputBlock = getDenseOutputBlock(N, K*P*Q);
 				if(!bias.isEmptyBlock())
 					params.bias = bias;
 				LibMatrixDNN.conv2d(matBlock, filter, outputBlock, params);
@@ -356,10 +347,10 @@ public class ConvolutionCPInstruction extends UnaryCPInstruction {
 		else if (instOpcode.equalsIgnoreCase("conv2d_backward_filter")) {
 			MatrixBlock dout = ec.getMatrixInput(_in2.getName());
 			if(dout.isEmptyBlock() || matBlock.isEmptyBlock()) {
-				outputBlock = new MatrixBlock(K, C*R*S, true, 0);
+				outputBlock = new MatrixBlock(K, C*R*S, true);
 			}
 			else {
-				outputBlock = getDenseOutputBlock(ec, K, C*R*S);
+				outputBlock = getDenseOutputBlock(K, C*R*S);
 				LibMatrixDNN.conv2dBackwardFilter(matBlock, dout, outputBlock, params);
 			}
 			ec.releaseMatrixInput(_in2.getName());
@@ -367,10 +358,10 @@ public class ConvolutionCPInstruction extends UnaryCPInstruction {
 		else if (instOpcode.equalsIgnoreCase("conv2d_backward_data")) {
 			MatrixBlock dout = ec.getMatrixInput(_in2.getName());
 			if(dout.isEmptyBlock() || matBlock.isEmptyBlock()) {
-				outputBlock = new MatrixBlock(N, C * H * W, true, 0);
+				outputBlock = new MatrixBlock(N, C * H * W, true);
 			}
 			else {
-				outputBlock = getDenseOutputBlock(ec, N, C * H * W);
+				outputBlock = getDenseOutputBlock(N, C * H * W);
 				LibMatrixDNN.conv2dBackwardData(matBlock, dout, outputBlock, params);
 			}
 			ec.releaseMatrixInput(_in2.getName());
@@ -384,10 +375,9 @@ public class ConvolutionCPInstruction extends UnaryCPInstruction {
 		ec.setMatrixOutput(getOutputVariableName(), outputBlock);
 	}
 	
-	private MatrixBlock getDenseOutputBlock(ExecutionContext ec, int numRows, int numCols) throws DMLRuntimeException {
-		MatrixBlock outputBlock = new MatrixBlock(numRows, numCols, false, numRows * numCols);
+	private MatrixBlock getDenseOutputBlock(int numRows, int numCols) throws DMLRuntimeException {
+		MatrixBlock outputBlock = new MatrixBlock(numRows, numCols, false);
 		outputBlock.allocateDenseBlock();
-		outputBlock.setNonZeros(-1);
 		return outputBlock;
 	}
 }

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/d0b23d60/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNN.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNN.java b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNN.java
index 9207171..29e59bd 100644
--- a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNN.java
+++ b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNN.java
@@ -24,7 +24,6 @@ import java.util.Iterator;
 import java.util.List;
 import java.util.concurrent.Callable;
 import java.util.concurrent.ConcurrentLinkedQueue;
-import java.util.concurrent.ExecutionException;
 import java.util.concurrent.ExecutorService;
 import java.util.concurrent.Executors;
 import java.util.concurrent.Future;
@@ -48,13 +47,14 @@ import org.apache.sysml.runtime.DMLRuntimeException;
 public class LibMatrixDNN {
 	
 	protected static final Log LOG =  LogFactory.getLog(LibMatrixDNN.class.getName());
-	// ------------------------------------------------------------------------------------------------
-	// Useful flags for performance testing:
-	private static boolean DISPLAY_STATISTICS = false;
-	private static final boolean ALLOW_MULTI_THREADED_OPS = true;
-	// ------------------------------------------------------------------------------------------------
 	
-	enum TaskType {
+	//library configurations and external contracts
+	public static final boolean SUPPORTS_SPARSE_OUTPUTS = false; //operations able to handle sparse outputs 
+	private static final boolean DISPLAY_STATISTICS = false; //conv2d summaries in stats output
+	private static final boolean ALLOW_MULTI_THREADED_OPS = true; //enable multi-threading in cp
+	private static final int NUM_TASK_FACTOR = 2; //number of tasks is vcores scaled by this factor
+	
+	private enum TaskType {
 		MaxPooling_Forward, MaxPooling_Backward, 
 		// Alternate approaches that we tried but the performance was unsatisfactory be included: direct, non-looped im2col
 		LoopedIm2ColConv2d, LoopedIm2ColConv2dBwdFilter, LoopedIm2ColConv2dBwdData,
@@ -79,6 +79,7 @@ public class LibMatrixDNN {
 	private static AtomicLong loopedConvBwdDataMatMultTime = new AtomicLong(0);
 	private static AtomicLong loopedConvBwdDataCol2ImTime = new AtomicLong(0);
 	
+	@SuppressWarnings("unused")
 	public static void appendStatistics(StringBuilder sb) {
 		if(DMLScript.STATISTICS && DISPLAY_STATISTICS && (conv2dDenseCount.get() != 0 || conv2dSparseCount.get() != 0)) {
 			sb.append("LibMatrixDNN dense count (conv/bwdF/bwdD/im2col/maxBwd):\t" 
@@ -135,6 +136,7 @@ public class LibMatrixDNN {
 	 * @param params convolution parameters
 	 * @throws DMLRuntimeException if DMLRuntimeException occurs
 	 */
+	@SuppressWarnings("unused")
 	public static void conv2dBackwardData(MatrixBlock filter, MatrixBlock dout, MatrixBlock outputBlock, ConvolutionParameters params) throws DMLRuntimeException {
 		params.input1 = filter;
 		params.input2 = dout;
@@ -157,6 +159,9 @@ public class LibMatrixDNN {
 		}
 		
 		runConvTask(TaskType.LoopedIm2ColConv2dBwdData, params);
+		
+		//post-processing: maintain nnz
+		outputBlock.recomputeNonZeros();
 	}
 	
 	/**
@@ -168,6 +173,7 @@ public class LibMatrixDNN {
 	 * @param params convolution parameters
 	 * @throws DMLRuntimeException if DMLRuntimeException occurs
 	 */
+	@SuppressWarnings("unused")
 	public static void conv2dBackwardFilter(MatrixBlock input, MatrixBlock dout, MatrixBlock outputBlock, ConvolutionParameters params) throws DMLRuntimeException {
 		params.input1 = input;
 		params.input2 = dout;
@@ -190,6 +196,9 @@ public class LibMatrixDNN {
 		}
 		
 		runConvTask(TaskType.LoopedIm2ColConv2dBwdFilter, params);
+		
+		//post-processing: maintain nnz
+		outputBlock.recomputeNonZeros();
 	}
 	
 	/**
@@ -259,6 +268,7 @@ public class LibMatrixDNN {
 		}
 	}
 	
+	@SuppressWarnings("unused")
 	private static void doLoopedIm2ColConv2dBwdData(int n, MatrixBlock dout_reshaped, ConvolutionParameters params) throws DMLRuntimeException {
 		MatrixBlock filter = params.input1;
 		MatrixBlock dout = params.input2;
@@ -277,6 +287,7 @@ public class LibMatrixDNN {
 		}
 	}
 	
+	@SuppressWarnings("unused")
 	private static MatrixBlock doLoopedIm2ColConv2dBwdFilter(int n, 
 			MatrixBlock im2ColOutBlock, MatrixBlock dout_reshaped, MatrixBlock partialRetBlock, ConvolutionParameters params) throws DMLRuntimeException {
 		long t1 = DMLScript.STATISTICS && DISPLAY_STATISTICS ? System.nanoTime() : 0;
@@ -306,6 +317,7 @@ public class LibMatrixDNN {
 		ret[2] = j % W;
 	}
 	
+	@SuppressWarnings("unused")
 	public static void conv2d(MatrixBlock input, MatrixBlock filter, MatrixBlock outputBlock, ConvolutionParameters params) throws DMLRuntimeException {
 		params.input1 = input;
 		params.input2 = filter;
@@ -333,8 +345,12 @@ public class LibMatrixDNN {
 		}
 		
 		runConvTask(TaskType.LoopedIm2ColConv2d, params);
+		
+		//post-processing: maintain nnz
+		outputBlock.recomputeNonZeros();
 	}
 	
+	@SuppressWarnings("unused")
 	private static void doLoopedIm2ColConv2d(int n, MatrixBlock im2ColOutBlock, ConvolutionParameters params) throws DMLRuntimeException {
 		long t1 = DMLScript.STATISTICS && DISPLAY_STATISTICS ? System.nanoTime() : 0;
 		doIm2col(n, im2ColOutBlock, params);
@@ -372,6 +388,9 @@ public class LibMatrixDNN {
 				System.arraycopy(matMultOutBlock.denseBlock, 0, params.output.denseBlock, destPos, length);
 		}
 		// -----------------------------------------------------------------------------
+		
+		//post-processing: maintain nnz
+		params.output.recomputeNonZeros(); 
 	}
 	
 	/**
@@ -383,6 +402,7 @@ public class LibMatrixDNN {
 	 * @param params convolution parameters
 	 * @throws DMLRuntimeException if DMLRuntimeException occurs
 	 */
+	@SuppressWarnings("unused")
 	public static void maxpoolingBackward(MatrixBlock input, MatrixBlock dout, MatrixBlock outputBlock, ConvolutionParameters params) throws DMLRuntimeException {
 		params.input1 = input;
 		params.input2 = dout;
@@ -409,6 +429,9 @@ public class LibMatrixDNN {
 
 		fillIndexesArray(params);
 		runConvTask(TaskType.MaxPooling_Backward, params);
+		
+		//post-processing: maintain nnz 
+		outputBlock.recomputeNonZeros();
 	}
 	
 	private static void fillIndexesArray(ConvolutionParameters params) {
@@ -611,10 +634,13 @@ public class LibMatrixDNN {
 			throw new DMLRuntimeException("Incorrect dimensions for relu_backward:" + 
 				input.getNumRows() + " != " + dout.getNumRows() + " || " + input.getNumColumns() + " != " + dout.getNumColumns());
 		}
+		
 		runConvTask(TaskType.ReluBackward, params);
+		
+		//note: no post-processing as nnz maintained per task
 	}
 	
-	private static void doReluBackward(int n, ConvolutionParameters params) throws DMLRuntimeException {
+	private static long doReluBackward(ConvolutionParameters params, int rl, int ru) throws DMLRuntimeException {
 		// (X > 0) * dout
 		double [] outputArray = params.output.getDenseBlock();
 		int numOutCols = params.input1.getNumColumns();
@@ -622,14 +648,14 @@ public class LibMatrixDNN {
 		if(!params.input1.isInSparseFormat() && !params.input2.isInSparseFormat()) {
 			double [] inputArr = params.input1.getDenseBlock();
 			double [] doutArr = params.input2.getDenseBlock();
-			for(int i = n*numOutCols; i < (n+1)*numOutCols; i++) {
+			for(int i = rl*numOutCols; i < ru*numOutCols; i++) {
 				outputArray[i] = inputArr[i] > 0 ? doutArr[i] : 0;
 			}
 		}
 		else {
 			// Perform (X > 0)
 			if(params.input1.isInSparseFormat()) {
-				Iterator<IJV> iter = params.input1.sparseBlock.getIterator(n, n+1);
+				Iterator<IJV> iter = params.input1.sparseBlock.getIterator(rl, ru);
 				while(iter.hasNext()) {
 					IJV ijv = iter.next();
 					int i = ijv.getI();
@@ -639,13 +665,13 @@ public class LibMatrixDNN {
 			}
 			else {
 				double [] inputArr = params.input1.getDenseBlock();
-				for(int i = n*numOutCols; i < (n+1)*numOutCols; i++) {
+				for(int i = rl*numOutCols; i < ru*numOutCols; i++) {
 					outputArray[i] = inputArr[i] > 0 ? 1 : 0;
 				}
 			}
 			// Then perform (X > 0) * dout
 			if(params.input2.isInSparseFormat()) {
-				Iterator<IJV> iter = params.input2.sparseBlock.getIterator(n, n+1);
+				Iterator<IJV> iter = params.input2.sparseBlock.getIterator(rl, ru);
 				while(iter.hasNext()) {
 					IJV ijv = iter.next();
 					int i = ijv.getI();
@@ -655,11 +681,14 @@ public class LibMatrixDNN {
 			}
 			else {
 				double [] doutArr = params.input2.getDenseBlock();
-				for(int i = n*numOutCols; i < (n+1)*numOutCols; i++) {
+				for(int i = rl*numOutCols; i < ru*numOutCols; i++) {
 					outputArray[i] *= doutArr[i];
 				}
 			}
 		}
+		
+		//post-processing: maintain nnz
+		return params.output.recomputeNonZeros(rl, ru-1, 0, numOutCols-1);
 	}
 	
 	
@@ -704,9 +733,12 @@ public class LibMatrixDNN {
 		else {
 			runConvTask(TaskType.BiasAdd, params);
 		}
+		
+		//post-processing: maintain nnz
+		params.output.recomputeNonZeros();
 	}
 	
-	private static void doBiasAdd(int n1, int n2, ConvolutionParameters params) throws DMLRuntimeException {
+	private static void doBiasAdd(ConvolutionParameters params, int rl, int ru) throws DMLRuntimeException {
 		double [] outputArray = params.output.getDenseBlock();
 		int PQ = params.C;
 		int numOutCols = params.input1.getNumColumns();
@@ -715,8 +747,8 @@ public class LibMatrixDNN {
 			double [] inputArr = params.input1.getDenseBlock();
 			double [] biasArr = params.input2.getDenseBlock();
 			int K = params.K;
-			int index = n1*K*PQ;
-			for(int n = n1; n < n2; n++) {
+			int index = rl*K*PQ;
+			for(int n = rl; n < ru; n++) {
 				for(int k = 0; k < K; k++) {
 					for(int pq = 0; pq < PQ; pq++, index++) {
 						outputArray[index] = inputArr[index] + biasArr[k];
@@ -725,9 +757,9 @@ public class LibMatrixDNN {
 			}
 		}
 		else {
-			fillBias(params.input2, outputArray, n1, n2, params.N, params.K, PQ);
+			fillBias(params.input2, outputArray, rl, ru, params.N, params.K, PQ);
 			if(params.input1.isInSparseFormat()) {
-				Iterator<IJV> iter = params.input1.sparseBlock.getIterator(n1, n2);
+				Iterator<IJV> iter = params.input1.sparseBlock.getIterator(rl, ru);
 				while(iter.hasNext()) {
 					IJV ijv = iter.next();
 					int i = ijv.getI();
@@ -737,7 +769,7 @@ public class LibMatrixDNN {
 			}
 			else {
 				double [] inputArr = params.input1.getDenseBlock();
-				for(int i = n1*numOutCols; i < n2*numOutCols; i++) {
+				for(int i = rl*numOutCols; i < ru*numOutCols; i++) {
 					outputArray[i] += inputArr[i];
 				}
 			}
@@ -780,6 +812,9 @@ public class LibMatrixDNN {
 		
 		fillIndexesArray(params);
 		runConvTask(TaskType.MaxPooling_Forward, params);
+		
+		//post-processing: maintain nnz
+		outputBlock.recomputeNonZeros();
 	}
 
 	private static void doPooling(int n, ConvolutionParameters params) throws DMLRuntimeException {
@@ -872,75 +907,63 @@ public class LibMatrixDNN {
 		for(int i = 0; i < poolSize; i++) {
 			if(type == TaskType.LoopedIm2ColConv2d || type == TaskType.LoopedIm2ColConv2dBwdFilter) {
 				MatrixBlock im2ColOutBlock = new MatrixBlock(params.C*params.R*params.S, params.P*params.Q, false);
-				im2ColOutBlock.allocateDenseBlock(true);
+				im2ColOutBlock.allocateDenseBlock();
 				im2ColOutBlocks.add(im2ColOutBlock);
 			}
 			
 			if(type == TaskType.LoopedIm2ColConv2dBwdFilter) {
 				MatrixBlock partialRetBlock = new MatrixBlock(params.C*params.R*params.S, params.K, false);
-				partialRetBlock.allocateDenseBlock(true);
+				partialRetBlock.allocateDenseBlock();
 				partialRetBlocks.add(partialRetBlock);
 			}
 			
 			if(type == TaskType.LoopedIm2ColConv2dBwdData || type == TaskType.LoopedIm2ColConv2dBwdFilter) {
 				MatrixBlock doutReshapedBlock = new MatrixBlock(params.P*params.Q, params.K, false);
-				doutReshapedBlock.allocateDenseBlock(true);
+				doutReshapedBlock.allocateDenseBlock();
 				doutReshapedBlocks.add(doutReshapedBlock);
 			}
 		}
 	}
 	// Methods to execute convolution-related tasks using multiple threads.
 	private static void runConvTask(TaskType type, ConvolutionParameters params) throws DMLRuntimeException {
-		int constrainedNumThreads = OptimizerUtils.getConstrainedNumThreads(params.numThreads);
+		int k = OptimizerUtils.getConstrainedNumThreads(params.numThreads);
 		ConcurrentLinkedQueue<MatrixBlock> im2ColOutBlocks = new ConcurrentLinkedQueue<MatrixBlock>();
 		ConcurrentLinkedQueue<MatrixBlock> doutReshapedBlocks = new ConcurrentLinkedQueue<MatrixBlock>();
 		ConcurrentLinkedQueue<MatrixBlock> partialRetBlocks = new ConcurrentLinkedQueue<MatrixBlock>();
-		if (ALLOW_MULTI_THREADED_OPS && params.isOutputThreadSafe() && constrainedNumThreads > 1) {
-			int poolSize = Math.min(constrainedNumThreads, params.N);
+		
+		if (ALLOW_MULTI_THREADED_OPS && params.isOutputThreadSafe() && k > 1) {
+			int poolSize = Math.min(k, params.N);
 			addMatrixBlocks(poolSize, type, params, im2ColOutBlocks, doutReshapedBlocks, partialRetBlocks);
+			
 			ArrayList<ConvTask> tasks = new ArrayList<ConvTask>();
-			int NSize = params.N - poolSize;
-			if(NSize >= constrainedNumThreads) {
-				for(int n = 0; n < params.N; n++) 
-					tasks.add(new ConvTask(n, n+1, type, params, im2ColOutBlocks, doutReshapedBlocks, partialRetBlocks));
-			}
-			else {
-				int numNTasks = (int) Math.ceil(((double) NSize) / constrainedNumThreads);
-				for (int n = 0; n < NSize; n += numNTasks) {
-					tasks.add(new ConvTask(n, Math.min(NSize, n+numNTasks), type, params, im2ColOutBlocks, doutReshapedBlocks, partialRetBlocks));
-				}
-				for (int n = NSize; n < params.N; n++)
-					tasks.add(new ConvTask(n, n+1, type, params, im2ColOutBlocks, doutReshapedBlocks, partialRetBlocks));
-			}
+			int blklen = (int)(Math.ceil((double)params.N/poolSize/NUM_TASK_FACTOR));
+			for( int i=0; i<poolSize*NUM_TASK_FACTOR && i*blklen<params.N; i++ )
+				tasks.add(new ConvTask(i*blklen, Math.min((i+1)*blklen, params.N), 
+						type, params, im2ColOutBlocks, doutReshapedBlocks, partialRetBlocks));
 			
-			ExecutorService pool = Executors.newFixedThreadPool( poolSize );
-			List<Future<Object>> taskret;
 			try {
-				taskret = pool.invokeAll(tasks);
+				ExecutorService pool = Executors.newFixedThreadPool( poolSize );
+				List<Future<Long>> taskret = pool.invokeAll(tasks);
 				pool.shutdown();
-				for( Future<Object> task : taskret ) {
-					task.get();
-				}
+				for( Future<Long> task : taskret )
+					params.output.nonZeros += task.get();
 				if(type == TaskType.LoopedIm2ColConv2dBwdFilter) {
 					for(MatrixBlock partialRetBlock : partialRetBlocks) {
 						elementWiseInPlaceTransposedAddition(params.output, partialRetBlock);
 					}
 				}
-			} catch (InterruptedException e) {
-				throw new DMLRuntimeException("Error while executing multi-threaded " + type.name(), e);
-			} catch (ExecutionException e) {
+			} 
+			catch (Exception e) {
 				throw new DMLRuntimeException("Error while executing multi-threaded " + type.name(), e);
 			}
 		}
 		else {
 			addMatrixBlocks(1, type, params, im2ColOutBlocks, doutReshapedBlocks, partialRetBlocks);
-			ConvTask task = new ConvTask(0, 0, type, params, im2ColOutBlocks, doutReshapedBlocks, partialRetBlocks);
 			try {
-				for(int n = 0; n < params.N; n++) {
-					task.n1 = n;
-					task.n2 = n+1;
-					task.call();
-				}
+				//execute single task and maintain nnz if supported
+				params.output.setNonZeros(new ConvTask(0, params.N, type, params, im2ColOutBlocks, 
+						doutReshapedBlocks, partialRetBlocks).call());
+				
 				if(type == TaskType.LoopedIm2ColConv2dBwdFilter) {
 					for(MatrixBlock partialRetBlock : partialRetBlocks) {
 						elementWiseInPlaceTransposedAddition(params.output, partialRetBlock);
@@ -958,92 +981,94 @@ public class LibMatrixDNN {
 	 * to be executed in multi-thread manner.
 	 * 
 	 */
-	private static class ConvTask implements Callable<Object> {
-		public int n1; public int n2; 
-		ConvolutionParameters params;
-		TaskType type;
-		ConcurrentLinkedQueue<MatrixBlock> im2ColOutBlocks;
-		ConcurrentLinkedQueue<MatrixBlock> partialRetBlocks;
-		ConcurrentLinkedQueue<MatrixBlock> doutReshapedBlocks;
-		public ConvTask(int n1, int n2, TaskType type, ConvolutionParameters params, 
+	private static class ConvTask implements Callable<Long> 
+	{
+		public int _rl; 
+		public int _ru; 
+		private final ConvolutionParameters _params;
+		private final TaskType _type;
+		private final ConcurrentLinkedQueue<MatrixBlock> _im2ColOutBlocks;
+		private final ConcurrentLinkedQueue<MatrixBlock> _partialRetBlocks;
+		private final ConcurrentLinkedQueue<MatrixBlock> _doutReshapedBlocks;
+		
+		public ConvTask(int rl, int ru, TaskType type, ConvolutionParameters params, 
 				ConcurrentLinkedQueue<MatrixBlock> im2ColOutBlocks,
 				ConcurrentLinkedQueue<MatrixBlock> doutReshapedBlocks,
 				ConcurrentLinkedQueue<MatrixBlock> partialRetBlocks) {
-			this.n1 = n1;
-			this.n2 = n2;
-			this.type = type;
-			this.params = params;
-			this.im2ColOutBlocks = im2ColOutBlocks;
-			this.partialRetBlocks = partialRetBlocks;
-			this.doutReshapedBlocks = doutReshapedBlocks;
+			_rl = rl;
+			_ru = ru;
+			_type = type;
+			_params = params;
+			_im2ColOutBlocks = im2ColOutBlocks;
+			_partialRetBlocks = partialRetBlocks;
+			_doutReshapedBlocks = doutReshapedBlocks;
 		}
 		
 		@Override
-		public Object call() throws DMLRuntimeException {
-			switch(type) {
+		public Long call() throws DMLRuntimeException {
+			long lnnz = 0; //nnz per partition
+			
+			switch(_type) {
 				case MaxPooling_Forward:
-				{
-					for(int n = n1; n < n2; n++) {
-						doPooling(n, params);
-					}
+					for(int n = _rl; n < _ru; n++)
+						doPooling(n, _params);
 					break;
-				}
 				case MaxPooling_Backward:
-					for(int n = n1; n < n2; n++) 
-						doPoolingBackward(n, params);
+					for(int n = _rl; n < _ru; n++) 
+						doPoolingBackward(n, _params);
 					break;
 				case BiasAdd:
-					doBiasAdd(n1, n2, params);
+					doBiasAdd(_params, _rl, _ru);
 					break;
 				case ReluBackward:
-					for(int n = n1; n < n2; n++) 
-						doReluBackward(n, params);
+					lnnz = doReluBackward(_params, _rl, _ru);
 					break;
 				case LoopedIm2ColConv2d:
 				{	
-					MatrixBlock im2ColOutBlock = im2ColOutBlocks.remove();
-					for(int n = n1; n < n2; n++) 
-						doLoopedIm2ColConv2d(n, im2ColOutBlock, params);
-					im2ColOutBlocks.add(im2ColOutBlock);
-					if(params.bias != null)
-						addBias(n1, n2, params);
+					MatrixBlock im2ColOutBlock = _im2ColOutBlocks.remove();
+					for(int n = _rl; n < _ru; n++) 
+						doLoopedIm2ColConv2d(n, im2ColOutBlock, _params);
+					_im2ColOutBlocks.add(im2ColOutBlock);
+					if(_params.bias != null)
+						addBias(_params, _rl, _ru);
 					break;
 				}
 				case LoopedIm2ColConv2dBwdFilter:
 				{
-					MatrixBlock im2ColOutBlock = im2ColOutBlocks.remove();
-					MatrixBlock partialRetBlock = partialRetBlocks.remove();
-					MatrixBlock doutReshapedBlock = doutReshapedBlocks.remove();
-					for(int n = n1; n < n2; n++) 
-						partialRetBlock = doLoopedIm2ColConv2dBwdFilter(n, im2ColOutBlock, doutReshapedBlock, partialRetBlock, params);
-					im2ColOutBlocks.add(im2ColOutBlock);
-					partialRetBlocks.add(partialRetBlock);
-					doutReshapedBlocks.add(doutReshapedBlock);
+					MatrixBlock im2ColOutBlock = _im2ColOutBlocks.remove();
+					MatrixBlock partialRetBlock = _partialRetBlocks.remove();
+					MatrixBlock doutReshapedBlock = _doutReshapedBlocks.remove();
+					for(int n = _rl; n < _ru; n++) 
+						partialRetBlock = doLoopedIm2ColConv2dBwdFilter(n, im2ColOutBlock, doutReshapedBlock, partialRetBlock, _params);
+					_im2ColOutBlocks.add(im2ColOutBlock);
+					_partialRetBlocks.add(partialRetBlock);
+					_doutReshapedBlocks.add(doutReshapedBlock);
 					break;
 				}
 				case LoopedIm2ColConv2dBwdData:
 				{
-					MatrixBlock doutReshapedBlock = doutReshapedBlocks.remove();
-					for(int n = n1; n < n2; n++) 
-						doLoopedIm2ColConv2dBwdData(n, doutReshapedBlock, params);
-					doutReshapedBlocks.add(doutReshapedBlock);
+					MatrixBlock doutReshapedBlock = _doutReshapedBlocks.remove();
+					for(int n = _rl; n < _ru; n++) 
+						doLoopedIm2ColConv2dBwdData(n, doutReshapedBlock, _params);
+					_doutReshapedBlocks.add(doutReshapedBlock);
 					break;
 				}
 				default:
-					throw new DMLRuntimeException("Unsupported ConvTask:" + type.name());
+					throw new DMLRuntimeException("Unsupported ConvTask:" + _type.name());
 			}
-			return null;
+			
+			return lnnz;
 		}
 	}
 	
-	private static void addBias(int n1, int n2, ConvolutionParameters params) {
+	private static void addBias(ConvolutionParameters params, int rl, int ru) {
 		int PQ = params.P*params.Q;
 		int K = params.K;
 		double [] outputArr = params.output.getDenseBlock();
 		if(!params.bias.isInSparseFormat()) {
 			double [] biasArr = params.bias.getDenseBlock();
-			int index = n1*K*PQ;
-			for(int n = n1; n < n2; n++) {
+			int index = rl*K*PQ;
+			for(int n = rl; n < ru; n++) {
 				for(int k = 0; k < K; k++) {
 					for(int pq = 0; pq < PQ; pq++, index++) {
 						outputArr[index] += biasArr[k];
@@ -1057,7 +1082,7 @@ public class LibMatrixDNN {
 				IJV ijv = iter.next();
 				int k = ijv.getI();
 				double val = ijv.getV();
-				for(int n = n1; n < n2; n++) {
+				for(int n = rl; n < ru; n++) {
 					int index = n*K*PQ + k*PQ;
 					for(int pq = 0; pq < PQ; pq++, index++) {
 						outputArr[index] += val;