Posted to commits@systemml.apache.org by ni...@apache.org on 2017/04/30 18:16:57 UTC

[2/3] incubator-systemml git commit: [SYSTEMML-769] Adding support for native BLAS for Linux

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/39a37ae4/src/main/java/org/apache/sysml/runtime/instructions/cp/ConvolutionCPInstruction.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/instructions/cp/ConvolutionCPInstruction.java b/src/main/java/org/apache/sysml/runtime/instructions/cp/ConvolutionCPInstruction.java
index f839990..c794932 100644
--- a/src/main/java/org/apache/sysml/runtime/instructions/cp/ConvolutionCPInstruction.java
+++ b/src/main/java/org/apache/sysml/runtime/instructions/cp/ConvolutionCPInstruction.java
@@ -21,15 +21,20 @@ package org.apache.sysml.runtime.instructions.cp;
 
 import java.util.ArrayList;
 import java.util.Arrays;
+
+import org.apache.sysml.conf.ConfigurationManager;
+import org.apache.sysml.conf.DMLConfig;
 import org.apache.sysml.runtime.DMLRuntimeException;
 import org.apache.sysml.runtime.controlprogram.context.ExecutionContext;
 import org.apache.sysml.runtime.functionobjects.SwapIndex;
 import org.apache.sysml.runtime.instructions.InstructionUtils;
 import org.apache.sysml.runtime.matrix.data.ConvolutionParameters;
 import org.apache.sysml.runtime.matrix.data.LibMatrixDNN;
+import org.apache.sysml.runtime.matrix.data.LibMatrixNative;
 import org.apache.sysml.runtime.matrix.data.MatrixBlock;
 import org.apache.sysml.runtime.matrix.operators.ReorgOperator;
 import org.apache.sysml.runtime.util.ConvolutionUtils;
+import org.apache.sysml.utils.NativeHelper;
 
 public class ConvolutionCPInstruction extends UnaryCPInstruction 
 {	
@@ -131,7 +136,7 @@ public class ConvolutionCPInstruction extends UnaryCPInstruction
 			return new ConvolutionCPInstruction(in, out, opcode, str, stride,
 					padding, input_shape, filter_shape, k);
 		} 
-		else if (opcode.equalsIgnoreCase("maxpooling_backward")
+		else if (opcode.equalsIgnoreCase("maxpooling_backward") || opcode.equalsIgnoreCase("relu_maxpooling_backward")
 				|| opcode.equalsIgnoreCase("conv2d")
 				|| opcode.equalsIgnoreCase("conv2d_backward_filter")
 				|| opcode.equalsIgnoreCase("conv2d_backward_data")) {
@@ -288,6 +293,17 @@ public class ConvolutionCPInstruction extends UnaryCPInstruction
 		ec.setMatrixOutput(getOutputVariableName(), outputBlock);
 	}
 	
+	// Assumption: enableNative && NativeHelper.isNativeLibraryLoaded() is true
+	// Densifying small filters increases the number of native calls, for example in the cases where the filter is sparse but the input is dense.
+	private boolean isFilterSparse(MatrixBlock filter) throws DMLRuntimeException {
+		long numElems = ((long) filter.getNumRows()) * filter.getNumColumns();
+		// Convert a sparse filter to dense if it has fewer than 1e7 cells (i.e., less than
+		// ~80 MB in dense double format), which covers almost all cases. In fact, a threshold
+		// corresponding to 1 MB would still be sufficient for common CNNs.
+		if(filter.isInSparseFormat() && numElems < 10e+6)
+			filter.sparseToDense();
+		return filter.isInSparseFormat();
+	}
+	
 	
 	@Override
 	public void processInstruction(ExecutionContext ec)
@@ -326,6 +342,7 @@ public class ConvolutionCPInstruction extends UnaryCPInstruction
 		int Q = (int) ConvolutionUtils.getQ(W, S, stride_w, pad_w);
 		
 		ConvolutionParameters params = new ConvolutionParameters(N, C, H, W, K, R, S, stride_h, stride_w, pad_h, pad_w, _numThreads);
+		params.enableNative = ConfigurationManager.getDMLConfig().getBooleanValue(DMLConfig.NATIVE_BLAS) && NativeHelper.isNativeLibraryLoaded();
 		if (instOpcode.equalsIgnoreCase("maxpooling") || instOpcode.equalsIgnoreCase("relu_maxpooling")) {
 			if(matBlock.isEmptyBlock()) {
 				outputBlock = new MatrixBlock(N, C*P*Q, true);
@@ -337,14 +354,17 @@ public class ConvolutionCPInstruction extends UnaryCPInstruction
 				LibMatrixDNN.maxpooling(matBlock, outputBlock, params);
 			}
 		}
-		else if (instOpcode.equalsIgnoreCase("maxpooling_backward")) {
+		else if (instOpcode.equalsIgnoreCase("maxpooling_backward") || instOpcode.equalsIgnoreCase("relu_maxpooling_backward")) {
 			MatrixBlock dout = ec.getMatrixInput(_in2.getName());
 			if(matBlock.isEmptyBlock() || dout.isEmptyBlock()) {
 				outputBlock = new MatrixBlock(N, C*H*W, true);
 			}
 			else {
 				outputBlock = getDenseOutputBlock(N, C*H*W);
-				LibMatrixDNN.maxpoolingBackward(matBlock, dout, outputBlock, params);
+				if(instOpcode.equalsIgnoreCase("maxpooling_backward"))
+					LibMatrixDNN.maxpoolingBackward(matBlock, dout, outputBlock, params, false);
+				else
+					LibMatrixDNN.maxpoolingBackward(matBlock, dout, outputBlock, params, true);
 			}
 			ec.releaseMatrixInput(_in2.getName());
 		}
@@ -355,7 +375,10 @@ public class ConvolutionCPInstruction extends UnaryCPInstruction
 			}
 			else {
 				outputBlock = getDenseOutputBlock(N, K*P*Q);
-				LibMatrixDNN.conv2d(matBlock, filter, outputBlock, params);
+				if(params.enableNative && !isFilterSparse(filter) && !matBlock.isInSparseFormat())
+					LibMatrixNative.conv2d(matBlock, filter, outputBlock, params);
+				else
+					LibMatrixDNN.conv2d(matBlock, filter, outputBlock, params);
 			}
 			ec.releaseMatrixInput(_in2.getName());
 		}
@@ -367,9 +390,13 @@ public class ConvolutionCPInstruction extends UnaryCPInstruction
 			}
 			else {
 				outputBlock = getDenseOutputBlock(N, K*P*Q);
-				if(!bias.isEmptyBlock())
+				if(!bias.isEmptyBlock()) {
 					params.bias = bias;
-				LibMatrixDNN.conv2d(matBlock, filter, outputBlock, params);
+				}
+				if(params.enableNative && !isFilterSparse(filter) && !matBlock.isInSparseFormat())
+					LibMatrixNative.conv2d(matBlock, filter, outputBlock, params);
+				else
+					LibMatrixDNN.conv2d(matBlock, filter, outputBlock, params);
 			}
 			ec.releaseMatrixInput(_in3.getName());
 			ec.releaseMatrixInput(_in2.getName());
@@ -381,7 +408,10 @@ public class ConvolutionCPInstruction extends UnaryCPInstruction
 			}
 			else {
 				outputBlock = getDenseOutputBlock(K, C*R*S);
-				LibMatrixDNN.conv2dBackwardFilter(matBlock, dout, outputBlock, params);
+				if(params.enableNative && !matBlock.isInSparseFormat() && !dout.isInSparseFormat())
+					LibMatrixNative.conv2dBackwardFilter(matBlock, dout, outputBlock, params);
+				else
+					LibMatrixDNN.conv2dBackwardFilter(matBlock, dout, outputBlock, params);
 			}
 			ec.releaseMatrixInput(_in2.getName());
 		}
@@ -392,7 +422,10 @@ public class ConvolutionCPInstruction extends UnaryCPInstruction
 			}
 			else {
 				outputBlock = getDenseOutputBlock(N, C * H * W);
-				LibMatrixDNN.conv2dBackwardData(matBlock, dout, outputBlock, params);
+				if(params.enableNative && !isFilterSparse(matBlock) && !dout.isInSparseFormat())
+					LibMatrixNative.conv2dBackwardData(matBlock, dout, outputBlock, params);
+				else
+					LibMatrixDNN.conv2dBackwardData(matBlock, dout, outputBlock, params);
 			}
 			ec.releaseMatrixInput(_in2.getName());
 		}
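
The CP instruction above now decides per call whether to take the native path: the
DMLConfig.NATIVE_BLAS flag must be enabled, the native library must have loaded, and both
operands must end up dense; small sparse filters are densified first so that more calls
qualify. The following is a minimal, self-contained sketch of that decision logic
(SimpleMatrix, DENSIFY_THRESHOLD, filterStaysSparse and the returned strings are
illustrative assumptions, not SystemML API):

// Hypothetical sketch of the native-dispatch rule; not SystemML code.
public class NativeDispatchSketch {
	static final long DENSIFY_THRESHOLD = 10_000_000L; // 1e7 cells, ~80 MB as dense doubles

	static class SimpleMatrix {
		final int rows, cols;
		boolean sparse;
		SimpleMatrix(int rows, int cols, boolean sparse) { this.rows = rows; this.cols = cols; this.sparse = sparse; }
		long cells() { return (long) rows * cols; }
		void toDense() { sparse = false; } // stand-in for MatrixBlock.sparseToDense()
	}

	// Mirrors isFilterSparse(): densify small sparse filters, then report the final format.
	static boolean filterStaysSparse(SimpleMatrix filter) {
		if (filter.sparse && filter.cells() < DENSIFY_THRESHOLD)
			filter.toDense();
		return filter.sparse;
	}

	static String dispatchConv2d(boolean enableNative, SimpleMatrix input, SimpleMatrix filter) {
		if (enableNative && !filterStaysSparse(filter) && !input.sparse)
			return "LibMatrixNative.conv2d"; // BLAS-backed path
		return "LibMatrixDNN.conv2d";        // Java fallback
	}

	public static void main(String[] args) {
		SimpleMatrix input  = new SimpleMatrix(64, 3 * 224 * 224, false);
		SimpleMatrix filter = new SimpleMatrix(96, 3 * 11 * 11, true); // small, sparse
		System.out.println(dispatchConv2d(true, input, filter)); // prints LibMatrixNative.conv2d
	}
}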

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/39a37ae4/src/main/java/org/apache/sysml/runtime/instructions/gpu/ConvolutionGPUInstruction.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/instructions/gpu/ConvolutionGPUInstruction.java b/src/main/java/org/apache/sysml/runtime/instructions/gpu/ConvolutionGPUInstruction.java
index 497b182..9d4cd1f 100644
--- a/src/main/java/org/apache/sysml/runtime/instructions/gpu/ConvolutionGPUInstruction.java
+++ b/src/main/java/org/apache/sysml/runtime/instructions/gpu/ConvolutionGPUInstruction.java
@@ -141,7 +141,7 @@ public class ConvolutionGPUInstruction extends GPUInstruction
 			return new ConvolutionGPUInstruction(in1, in2, in3, out, opcode, str, stride,
 					padding, input_shape, filter_shape);
 		}
-		else if (opcode.equalsIgnoreCase("maxpooling") || opcode.equalsIgnoreCase("relu_maxpooling")) {
+		else if (opcode.equalsIgnoreCase("maxpooling")) {
 			InstructionUtils.checkNumFields(parts, 14);
 			CPOperand in1 = new CPOperand(parts[1]);
 			CPOperand out = new CPOperand(parts[14]);
@@ -303,7 +303,7 @@ public class ConvolutionGPUInstruction extends GPUInstruction
 			LibMatrixCUDA.conv2dBackwardData(ec.getGPUContext(), getExtendedOpcode(), filter, dout, out, N, C, H, W,
 					K, R, S, pad_h, pad_w, stride_h, stride_w, P, Q);
 		}
-		else if (instOpcode.equalsIgnoreCase("maxpooling") || instOpcode.equalsIgnoreCase("relu_maxpooling")) {
+		else if (instOpcode.equalsIgnoreCase("maxpooling")) {
 			MatrixObject image = getMatrixInputForGPUInstruction(ec, _input1.getName());
 
 			if(image.getNumRows() != N || image.getNumColumns() != C*H*W) 
@@ -315,9 +315,6 @@ public class ConvolutionGPUInstruction extends GPUInstruction
 			if(instOpcode.equalsIgnoreCase("maxpooling"))
 				LibMatrixCUDA.maxpooling(ec.getGPUContext(), getExtendedOpcode(), image, out, N, C, H, W,
 					K, R, S, pad_h, pad_w, stride_h, stride_w, P, Q);
-			else
-				LibMatrixCUDA.reluMaxpooling(ec.getGPUContext(), getExtendedOpcode(), image, out, N, C, H, W,
-						K, R, S, pad_h, pad_w, stride_h, stride_w, P, Q);
 		}
 		else if (instOpcode.equalsIgnoreCase("maxpooling_backward")) {
 			MatrixObject image = getMatrixInputForGPUInstruction(ec, _input1.getName());
@@ -341,7 +338,7 @@ public class ConvolutionGPUInstruction extends GPUInstruction
 		// release inputs/outputs
 		ec.releaseMatrixInputForGPUInstruction(_input1.getName());
 
-		if (!( instOpcode.equalsIgnoreCase("maxpooling") || instOpcode.equalsIgnoreCase("relu_maxpooling")) )
+		if ( !instOpcode.equalsIgnoreCase("maxpooling") )
 			ec.releaseMatrixInputForGPUInstruction(_input2.getName());
 
 		if (instOpcode.equalsIgnoreCase("conv2d_bias_add"))

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/39a37ae4/src/main/java/org/apache/sysml/runtime/instructions/spark/ConvolutionSPInstruction.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/instructions/spark/ConvolutionSPInstruction.java b/src/main/java/org/apache/sysml/runtime/instructions/spark/ConvolutionSPInstruction.java
index b485233..f687cc0 100644
--- a/src/main/java/org/apache/sysml/runtime/instructions/spark/ConvolutionSPInstruction.java
+++ b/src/main/java/org/apache/sysml/runtime/instructions/spark/ConvolutionSPInstruction.java
@@ -25,6 +25,8 @@ import java.util.Iterator;
 import org.apache.spark.api.java.JavaPairRDD;
 import org.apache.spark.api.java.function.PairFlatMapFunction;
 import org.apache.spark.broadcast.Broadcast;
+import org.apache.sysml.conf.ConfigurationManager;
+import org.apache.sysml.conf.DMLConfig;
 import org.apache.sysml.parser.Expression.DataType;
 import org.apache.sysml.parser.Expression.ValueType;
 import org.apache.sysml.runtime.DMLRuntimeException;
@@ -41,6 +43,7 @@ import org.apache.sysml.runtime.matrix.MatrixFormatMetaData;
 import org.apache.sysml.runtime.matrix.data.ConvolutionParameters;
 import org.apache.sysml.runtime.matrix.data.InputInfo;
 import org.apache.sysml.runtime.matrix.data.LibMatrixDNN;
+import org.apache.sysml.runtime.matrix.data.LibMatrixNative;
 import org.apache.sysml.runtime.matrix.data.MatrixBlock;
 import org.apache.sysml.runtime.matrix.data.MatrixIndexes;
 import org.apache.sysml.runtime.matrix.data.OutputInfo;
@@ -281,7 +284,8 @@ public class ConvolutionSPInstruction extends UnarySPInstruction {
 			int Q = (int) ConvolutionUtils.getQ(W, S, stride_w, pad_w);
 			
 			ConvolutionParameters params = new ConvolutionParameters(numRowsPerBlock, C, H, W, K, R, S, stride_h, stride_w, pad_h, pad_w, 1);
-			JavaPairRDD<MatrixIndexes,MatrixBlock> out = inputRDD.mapPartitionsToPair(new RDDConv2dMapMMFunction(filterBroadcast, params, instOpcode, biasBroadcast, mcRdd.getRows()), true);
+			boolean enableNativeBLAS = ConfigurationManager.getDMLConfig().getBooleanValue(DMLConfig.NATIVE_BLAS); 
+			JavaPairRDD<MatrixIndexes,MatrixBlock> out = inputRDD.mapPartitionsToPair(new RDDConv2dMapMMFunction(filterBroadcast, params, instOpcode, biasBroadcast, mcRdd.getRows(), enableNativeBLAS), true);
 			
 			//put output RDD handle into symbol table
 			sec.setRDDHandleForVariable(output.getName(), out);
@@ -316,15 +320,16 @@ public class ConvolutionSPInstruction extends UnarySPInstruction {
 		Broadcast<MatrixBlock> filterBroadcast = null;
 		Broadcast<MatrixBlock> biasBroadcast = null;
 		ConvolutionParameters params = null;
-		String instOpcode = null;
+		String instOpcode = null;
+		boolean enableNative;
 		long numRows = 0;
 		public RDDConv2dMapMMFunction(Broadcast<MatrixBlock> filterBroadcast, 
-				ConvolutionParameters params, String instOpcode, Broadcast<MatrixBlock> biasBroadcast, long numRows) {
+				ConvolutionParameters params, String instOpcode, Broadcast<MatrixBlock> biasBroadcast, long numRows, boolean enableNativeBLAS) {
 			this.filterBroadcast = filterBroadcast;
 			this.params = params;
 			this.instOpcode = instOpcode;
 			this.biasBroadcast = biasBroadcast;
 			this.numRows = numRows;
+			this.enableNative = enableNativeBLAS;
 		}
 		
 		private MatrixBlock processRectangularBlock(MatrixBlock matBlock) throws Exception {
@@ -336,7 +341,10 @@ public class ConvolutionSPInstruction extends UnarySPInstruction {
 				}
 				else {
 					outputBlock = getDenseOutputBlock(params.N, params.K*params.P*params.Q);
-					LibMatrixDNN.conv2d(matBlock, filter, outputBlock, params);
+					if(enableNative)
+						LibMatrixNative.conv2d(matBlock, filter, outputBlock, params);
+					else
+						LibMatrixDNN.conv2d(matBlock, filter, outputBlock, params);
 				}
 			}
 			else if (instOpcode.equalsIgnoreCase("conv2d_bias_add")) {
@@ -349,7 +357,10 @@ public class ConvolutionSPInstruction extends UnarySPInstruction {
 					outputBlock = getDenseOutputBlock(params.N, params.K*params.P*params.Q);
 					if(!bias.isEmptyBlock())
 						params.bias = bias;
-					LibMatrixDNN.conv2d(matBlock, filter, outputBlock, params);
+					if(enableNative)
+						LibMatrixNative.conv2d(matBlock, filter, outputBlock, params);
+					else
+						LibMatrixDNN.conv2d(matBlock, filter, outputBlock, params);
 				}
 			}
 			else if(instOpcode.equalsIgnoreCase("maxpooling") || instOpcode.equalsIgnoreCase("relu_maxpooling")) {
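
In the Spark path above, ConfigurationManager is only consulted on the driver; the
resulting boolean is passed through the RDDConv2dMapMMFunction constructor so it is
serialized with the closure and available on the executors. A minimal sketch of the same
pattern follows (the class and method names here are illustrative assumptions):

import java.io.Serializable;

// Illustrative stand-in for RDDConv2dMapMMFunction: driver-side configuration
// is captured in a final field so Spark can ship it inside the serialized closure.
public class Conv2dClosureSketch implements Serializable {
	private static final long serialVersionUID = 1L;

	private final boolean enableNative; // read once on the driver, serialized to executors

	public Conv2dClosureSketch(boolean enableNativeBLAS) {
		// e.g., populated from ConfigurationManager.getDMLConfig()
		//       .getBooleanValue(DMLConfig.NATIVE_BLAS) on the driver
		this.enableNative = enableNativeBLAS;
	}

	// Mirrors processRectangularBlock(): the flag alone picks the entry point;
	// LibMatrixNative.conv2d itself falls back to Java for sparse inputs.
	public String conv2dEntryPoint() {
		return enableNative ? "LibMatrixNative.conv2d" : "LibMatrixDNN.conv2d";
	}

	public static void main(String[] args) {
		System.out.println(new Conv2dClosureSketch(true).conv2dEntryPoint()); // LibMatrixNative.conv2d
	}
}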

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/39a37ae4/src/main/java/org/apache/sysml/runtime/matrix/data/ConvolutionParameters.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/matrix/data/ConvolutionParameters.java b/src/main/java/org/apache/sysml/runtime/matrix/data/ConvolutionParameters.java
index 3f0437f..11e74ca 100644
--- a/src/main/java/org/apache/sysml/runtime/matrix/data/ConvolutionParameters.java
+++ b/src/main/java/org/apache/sysml/runtime/matrix/data/ConvolutionParameters.java
@@ -34,7 +34,7 @@ public class ConvolutionParameters implements Serializable {
 	public int K; public int R; public int S; public int stride_h; public int stride_w; public int pad_h; public int pad_w;
 	public int P; public int Q; public int numThreads;
 	
-	
+	public boolean enableNative = false;
 	public MatrixBlock input1; public MatrixBlock input2; public MatrixBlock output;
 	
 	public MatrixBlock bias;

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/39a37ae4/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCUDA.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCUDA.java b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCUDA.java
index 9ddd87a..56360f8 100644
--- a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCUDA.java
+++ b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCUDA.java
@@ -1043,41 +1043,6 @@ public class LibMatrixCUDA {
 	}
 	
 	/**
-	 * performs relu followed by maxpooling on GPU by exploiting cudnnPoolingForward(...)
-	 * @param gCtx   a valid {@link GPUContext}
-	 * @param instName the invoking instruction's name for record {@link Statistics}.
-	 * @param image image as matrix object
-	 * @param outputBlock output matrix
-	 * @param N				batch size
-	 * @param C				number of channels
-	 * @param H				height of image
-	 * @param W				width of image
-	 * @param K				number of filters
-	 * @param R				height of filter
-	 * @param S				width of filter
-	 * @param pad_h			vertical padding
-	 * @param pad_w			horizontal padding
-	 * @param stride_h		horizontal stride
-	 * @param stride_w		vertical stride
-	 * @param P				(H - R + 1 + 2*pad_h)/stride_h
-	 * @param Q				(W - S + 1 + 2*pad_w)/stride_w
-	 * @throws DMLRuntimeException if DMLRuntimeException occurs
-	 */
-	public static void reluMaxpooling(GPUContext gCtx, String instName, MatrixObject image,
-			MatrixObject outputBlock, int N, int C, int H, int W, int K, int R,
-			int S, int pad_h, int pad_w, int stride_h, int stride_w, int P,
-			int Q) throws DMLRuntimeException {
-        LOG.trace("GPU : reluMaxpooling" + ", GPUContext=" + gCtx);
-        cudnnTensorDescriptor srcTensorDesc = allocateTensorDescriptor(gCtx, image, N, C, H, W);
-		long size  = image.getNumRows() * image.getNumColumns() * Sizeof.DOUBLE;
-		Pointer tmp = gCtx.allocate(size);
-		performCuDNNReLU(gCtx, instName, image, tmp, srcTensorDesc);
-		//cudaDeviceSynchronize; // It seemed like the cudnn operation in performCuDNNReLU was being done aysnchronously, this adds the neccesary sync
-		performMaxpooling(gCtx, instName, tmp, srcTensorDesc, outputBlock, N, C, H, W, K, R, S, pad_h, pad_w, stride_h, stride_w, P, Q);
-		gCtx.cudaFreeHelper(tmp);
-	}
-
-	/**
 	 * Performs maxpoolingBackward on GPU by exploiting cudnnPoolingBackward(...)
 	 * This method computes the backpropagation errors for the previous layer of the maxpooling operation
 	 * @param gCtx   a valid {@link GPUContext}

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/39a37ae4/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNN.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNN.java b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNN.java
index cd88b95..6a43917 100644
--- a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNN.java
+++ b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNN.java
@@ -36,6 +36,8 @@ import org.apache.sysml.runtime.DMLRuntimeException;
 import org.apache.sysml.runtime.instructions.InstructionUtils;
 import org.apache.sysml.runtime.matrix.operators.BinaryOperator;
 import org.apache.sysml.runtime.util.ConvolutionUtils;
+import org.apache.sysml.utils.NativeHelper;
+import org.apache.sysml.utils.Statistics;
 
 /**
  * This class allows users to invoke deep learning related operations 
@@ -57,10 +59,10 @@ public class LibMatrixDNN {
 	public static boolean DISPLAY_STATISTICS = false; //conv2d summaries in stats output
 
 	private enum TaskType {
-		MaxPooling_Forward, MaxPooling_Backward, 
+		MaxPooling_Forward, MaxPooling_Backward, MaxPooling_Relu_Backward,
 		// Alternate approaches that we tried but whose performance was unsatisfactory include: direct, non-looped im2col
 		LoopedIm2ColConv2d, LoopedIm2ColConv2dBwdFilter, LoopedIm2ColConv2dBwdData,
-		BiasAdd, ReluBackward, BiasMultiply
+		ReluBackward
 	}
 	
 	// ------------------------------------------------------------------------------------------------
@@ -141,6 +143,30 @@ public class LibMatrixDNN {
 	// ------------------------------------------------------------------------------------------------
 	
 	/**
+	 * This method performs the convolution (i.e. cross-correlation) operation on the input
+	 * 
+	 * @param input input batch 
+	 * @param filter filter
+	 * @param outputBlock output of convolution
+	 * @param params convolution parameters
+	 * @throws DMLRuntimeException if DMLRuntimeException occurs
+	 */
+	public static void conv2d(MatrixBlock input, MatrixBlock filter, MatrixBlock outputBlock, ConvolutionParameters params) throws DMLRuntimeException {
+		LibMatrixDNN.checkInputsConv2d(input, filter, outputBlock, params);
+		
+		if(params.bias != null && params.bias.isInSparseFormat())
+			params.bias.sparseToDense(); // since the bias is an extremely small array
+		
+		if(isEligibleForConv2dSparse(params))
+			Statistics.numNativeLibMatrixDNNCalls.increment();
+		
+		runConvTask(TaskType.LoopedIm2ColConv2d, params);
+		
+		//post-processing: maintain nnz
+		outputBlock.recomputeNonZeros();
+	}
+	
+	/**
 	 * This method computes the backpropagation errors for the previous layer of the convolution operation
 	 * 
 	 * @param filter filter used in conv2d 
@@ -150,25 +176,10 @@ public class LibMatrixDNN {
 	 * @throws DMLRuntimeException if DMLRuntimeException occurs
 	 */
 	public static void conv2dBackwardData(MatrixBlock filter, MatrixBlock dout, MatrixBlock outputBlock, ConvolutionParameters params) throws DMLRuntimeException {
-		params.input1 = filter;
-		params.input2 = dout;
-		params.output = outputBlock;
-		if(filter.getNumRows() != params.K || filter.getNumColumns() != params.C*params.R*params.S || 
-				dout.getNumRows() != params.N || dout.getNumColumns() != params.K*params.P*params.Q) {
-			throw new DMLRuntimeException("Incorrect input to conv2d_backward_filter");
-		}
-		if(params.stride_h <= 0 || params.stride_w <= 0) {
-			throw new DMLRuntimeException("Only positive strides supported");
-		}
+		checkInputsConv2dBackwardData(filter, dout, outputBlock, params);
 		
-		if(DMLScript.STATISTICS && DISPLAY_STATISTICS) {
-			if(filter.isInSparseFormat() || dout.isInSparseFormat()) {
-				conv2dBwdDataSparseCount.addAndGet(1);
-			}
-			else {
-				conv2dBwdDataDenseCount.addAndGet(1);
-			}
-		}
+		if(isEligibleForConv2dBackwardDataDense(params))
+			Statistics.numNativeLibMatrixDNNCalls.increment();
 		
 		runConvTask(TaskType.LoopedIm2ColConv2dBwdData, params);
 		
@@ -186,17 +197,71 @@ public class LibMatrixDNN {
 	 * @throws DMLRuntimeException if DMLRuntimeException occurs
 	 */
 	public static void conv2dBackwardFilter(MatrixBlock input, MatrixBlock dout, MatrixBlock outputBlock, ConvolutionParameters params) throws DMLRuntimeException {
-		params.input1 = input;
+		checkInputsConv2dBackwardFilter(input, dout, outputBlock, params);
+		
+		if(isEligibleForConv2dBackwardFilterSparseDense(params))
+			Statistics.numNativeLibMatrixDNNCalls.increment();
+		
+		runConvTask(TaskType.LoopedIm2ColConv2dBwdFilter, params);
+		
+		//post-processing: maintain nnz
+		outputBlock.recomputeNonZeros();
+	}
+	
+	
+	private static void checkOrThrowException(String msg, long lhs, long rhs) throws DMLRuntimeException {
+		if(lhs != rhs)
+			throw new DMLRuntimeException(msg + ":" + lhs + " != " + rhs);
+	}
+	private static void checkOrThrowException(String msg, long lhs, long rhs1, long rhs2, long rhs3) throws DMLRuntimeException {
+		if(lhs != (rhs1*rhs2*rhs3))
+			throw new DMLRuntimeException(msg + ":" + lhs + " != (" + rhs1 + " * " + rhs2 + " * " + rhs3 + ")");
+	}
+	
+	static void checkInputsConv2dBackwardData(MatrixBlock filter, MatrixBlock dout, MatrixBlock outputBlock, ConvolutionParameters params)  throws DMLRuntimeException {
+		params.input1 = filter;
 		params.input2 = dout;
 		params.output = outputBlock;
-		if(input.getNumRows() != params.N || input.getNumColumns() != params.C*params.H*params.W || 
-				dout.getNumRows() != params.N || dout.getNumColumns() != params.K*params.P*params.Q) {
-			throw new DMLRuntimeException("Incorrect input to conv2d_backward_filter");
-		}
-		if(params.stride_h <= 0 || params.stride_w <= 0) {
-			throw new DMLRuntimeException("Only positive strides supported");
+		checkOrThrowException("Incorrect input to conv2d_backward_data: Number of rows of input filter != "
+				+ "number of filters in filter_shape", filter.getNumRows(), params.K);
+		checkOrThrowException("Incorrect input to conv2d_backward_data: Number of columns of input filter != "
+				+ "channels*filter_height*filter_width in filter_shape", filter.getNumColumns(), params.C, params.R, params.S);
+		checkOrThrowException("Incorrect input to conv2d_backward_data: Number of rows of input errors != "
+				+ "batch size in input_shape", dout.getNumRows(), params.N);
+		checkOrThrowException("Incorrect input to conv2d_backward_data: Number of columns of input errors != "
+				+ "expected input error channels*height*width", dout.getNumColumns(), params.K, params.P, params.Q);
+		if(params.stride_h <= 0 || params.stride_w <= 0) 
+			throw new DMLRuntimeException("Only positive strides supported:" + params.stride_h + ", " + params.stride_w);
+		
+		if(DMLScript.STATISTICS && DISPLAY_STATISTICS) {
+			if(filter.isInSparseFormat() || dout.isInSparseFormat()) {
+				conv2dBwdDataSparseCount.addAndGet(1);
+			}
+			else {
+				conv2dBwdDataDenseCount.addAndGet(1);
+			}
 		}
 		
+		int constrainedNumThreads = OptimizerUtils.getConstrainedNumThreads(params.numThreads);
+		if (!(ALLOW_MULTI_THREADED_OPS && params.isOutputThreadSafe() && constrainedNumThreads > 1))
+			params.numThreads = 1;
+	}
+	
+	static void checkInputsConv2dBackwardFilter(MatrixBlock input, MatrixBlock dout, MatrixBlock outputBlock, ConvolutionParameters params)  throws DMLRuntimeException {
+		params.input1 = input;
+		params.input2 = dout;
+		params.output = outputBlock;
+		checkOrThrowException("Incorrect input to conv2d_backward_filter: Number of rows of input data != "
+				+ "batch size in input_shape", input.getNumRows(), params.N);
+		checkOrThrowException("Incorrect input to conv2d_backward_filter: Number of columns of input data != "
+				+ "channels*input_height*input_width in input_shape", input.getNumColumns(), params.C, params.H, params.W);
+		checkOrThrowException("Incorrect input to conv2d_backward_filter: Number of rows of input errors != "
+				+ "batch size in input_shape", dout.getNumRows(), params.N);
+		checkOrThrowException("Incorrect input to conv2d_backward_filter: Number of columns of input errors != "
+				+ "expected input error channels*height*width", dout.getNumColumns(), params.K, params.P, params.Q);
+		if(params.stride_h <= 0 || params.stride_w <= 0) 
+			throw new DMLRuntimeException("Only positive strides supported:" + params.stride_h + ", " + params.stride_w);
+		
 		if(DMLScript.STATISTICS && DISPLAY_STATISTICS) {
 			if(input.isInSparseFormat() || dout.isInSparseFormat()) {
 				conv2dBwdFilterSparseCount.addAndGet(1);
@@ -205,11 +270,6 @@ public class LibMatrixDNN {
 				conv2dBwdFilterDenseCount.addAndGet(1);
 			}
 		}
-		
-		runConvTask(TaskType.LoopedIm2ColConv2dBwdFilter, params);
-		
-		//post-processing: maintain nnz
-		outputBlock.recomputeNonZeros();
 	}
 	
 	/**
@@ -252,11 +312,10 @@ public class LibMatrixDNN {
 		MatrixBlock filter = params.input1;
 		MatrixBlock dout = params.input2;
 		doRotate180(n, 0, dout, dout_reshaped.denseBlock, params, true);
-		dout_reshaped.recomputeNonZeros();
 		
 		MatrixBlock temp = new MatrixBlock(params.P*params.Q, params.C*params.R*params.S, false);
 		long t1 = DMLScript.STATISTICS && DISPLAY_STATISTICS ? System.nanoTime() : 0;
-		LibMatrixMult.matrixMult(dout_reshaped, filter, temp, false);
+		singleThreadedMatMult(dout_reshaped, filter, temp, true, false, params);
 		long t2 = DMLScript.STATISTICS && DISPLAY_STATISTICS ? System.nanoTime() : 0 ;
 		doCol2imOverSingleImage(n, temp, params);
 		long t3 = DMLScript.STATISTICS && DISPLAY_STATISTICS ? System.nanoTime() : 0 ;
@@ -270,15 +329,13 @@ public class LibMatrixDNN {
 			MatrixBlock im2ColOutBlock, MatrixBlock dout_reshaped, MatrixBlock partialRetBlock, ConvolutionParameters params, double []  tempIm2ColArr) throws DMLRuntimeException {
 		long t1 = DMLScript.STATISTICS && DISPLAY_STATISTICS ? System.nanoTime() : 0;
 		doIm2col(n, im2ColOutBlock, params, tempIm2ColArr);
-		im2ColOutBlock.recomputeNonZeros();
 		long t2 = DMLScript.STATISTICS && DISPLAY_STATISTICS ? System.nanoTime() : 0 ;
 		
 		doRotate180(n, 0, params.input2, dout_reshaped.denseBlock, params, true);
-		dout_reshaped.recomputeNonZeros();
 		
 		MatrixBlock temp = new MatrixBlock(params.C*params.R*params.S, params.K, false);
 		long t3 = DMLScript.STATISTICS && DISPLAY_STATISTICS ? System.nanoTime() : 0 ;
-		LibMatrixMult.matrixMult(im2ColOutBlock, dout_reshaped, temp, false);
+		singleThreadedMatMult(im2ColOutBlock, dout_reshaped, temp, true, true, params);
 		long t4 = DMLScript.STATISTICS && DISPLAY_STATISTICS ? System.nanoTime() : 0 ;
 		if(DMLScript.STATISTICS && DISPLAY_STATISTICS) {
 			loopedConvBwdFilterMatMultTime.addAndGet(t4-t3);
@@ -298,15 +355,21 @@ public class LibMatrixDNN {
 		ret[2] = j % W;
 	}
 	
-	public static void conv2d(MatrixBlock input, MatrixBlock filter, MatrixBlock outputBlock, ConvolutionParameters params) throws DMLRuntimeException {
+	static void checkInputsConv2d(MatrixBlock input, MatrixBlock filter, MatrixBlock outputBlock, ConvolutionParameters params) throws DMLRuntimeException {
 		params.input1 = input;
 		params.input2 = filter;
 		params.output = outputBlock;
 		
-		if(input.getNumRows() != params.N || input.getNumColumns() != params.C*params.H*params.W || 
-				filter.getNumRows() != params.K || filter.getNumColumns() != params.C*params.R*params.S) {
-			throw new DMLRuntimeException("Incorrect input to conv2d: " + input.getNumRows());
-		}
+		checkOrThrowException("Incorrect input to conv2d: Number of rows of input filter != "
+				+ "number of filters in filter_shape", filter.getNumRows(), params.K);
+		checkOrThrowException("Incorrect input to conv2d: Number of columns of input filter != "
+				+ "channels*filter_height*filter_width in filter_shape", filter.getNumColumns(), params.C, params.R, params.S);
+		checkOrThrowException("Incorrect input to conv2d: Number of rows of input data != "
+				+ "batch size in input_shape", input.getNumRows(), params.N);
+		checkOrThrowException("Incorrect input to conv2d: Number of columns of input data != "
+				+ "channels*input_height*input_width in input_shape", input.getNumColumns(), params.C, params.H, params.W);
+		if(params.stride_h <= 0 || params.stride_w <= 0) 
+			throw new DMLRuntimeException("Only positive strides supported:" + params.stride_h + ", " + params.stride_w);
 		
 		if(DMLScript.STATISTICS && DISPLAY_STATISTICS) {
 			if(input.isInSparseFormat() || filter.isInSparseFormat()) {
@@ -316,21 +379,36 @@ public class LibMatrixDNN {
 				conv2dDenseCount.addAndGet(1);
 			}
 		}
-		
-		runConvTask(TaskType.LoopedIm2ColConv2d, params);
-		
-		//post-processing: maintain nnz
-		outputBlock.recomputeNonZeros();
+	}
+	
+	// Single-threaded matrix multiplication
+	private static void singleThreadedMatMult(MatrixBlock m1, MatrixBlock m2, MatrixBlock ret, 
+			boolean recomputeNNZM1, boolean recomputeNNZM2, ConvolutionParameters params) throws DMLRuntimeException {
+		if(!params.enableNative || m1.isInSparseFormat() || m2.isInSparseFormat()) {
+			if(recomputeNNZM1)
+				m1.recomputeNonZeros();
+			if(recomputeNNZM2)
+				m2.recomputeNonZeros();
+			LibMatrixMult.matrixMult(m1, m2, ret, false);
+		}
+		else {
+			ret.sparse = false;
+			if(ret.getDenseBlock() == null)
+				ret.allocateDenseBlock();
+			NativeHelper.matrixMultDenseDense(m1.denseBlock, m2.denseBlock, 
+					ret.denseBlock, m1.getNumRows(), m1.getNumColumns(), m2.getNumColumns(), 1);
+			ret.recomputeNonZeros();
+		}
 	}
 	
 	private static void doLoopedIm2ColConv2d(int n, MatrixBlock im2ColOutBlock, ConvolutionParameters params, double []  temp) throws DMLRuntimeException {
 		long t1 = DMLScript.STATISTICS && DISPLAY_STATISTICS ? System.nanoTime() : 0;
 		doIm2col(n, im2ColOutBlock, params, temp);
-		im2ColOutBlock.recomputeNonZeros();
 		long t2 = DMLScript.STATISTICS && DISPLAY_STATISTICS ? System.nanoTime() : 0;
 		
 		MatrixBlock matMultOutBlock = new MatrixBlock(params.K, params.P*params.Q, false);
-		LibMatrixMult.matrixMult(params.input2, im2ColOutBlock, matMultOutBlock, false);
+		singleThreadedMatMult(params.input2, im2ColOutBlock, matMultOutBlock, false, true, params);
+		
 		long t3 = DMLScript.STATISTICS && DISPLAY_STATISTICS ? System.nanoTime() : 0;
 		
 		if(DMLScript.STATISTICS && DISPLAY_STATISTICS) {
@@ -380,9 +458,11 @@ public class LibMatrixDNN {
 	 * @param dout dout matrix
 	 * @param outputBlock output matrix
 	 * @param params convolution parameters
+	 * @param performReluBackward perform ReLU backward
 	 * @throws DMLRuntimeException if DMLRuntimeException occurs
 	 */
-	public static void maxpoolingBackward(MatrixBlock input, MatrixBlock dout, MatrixBlock outputBlock, ConvolutionParameters params) throws DMLRuntimeException {
+	public static void maxpoolingBackward(MatrixBlock input, MatrixBlock dout, MatrixBlock outputBlock, 
+			ConvolutionParameters params, boolean performReluBackward) throws DMLRuntimeException {
 		params.input1 = input;
 		params.input2 = dout;
 		params.output = outputBlock;
@@ -407,7 +487,10 @@ public class LibMatrixDNN {
 			throw new DMLRuntimeException("Sparse maxpooling_backward is not supported");
 
 		fillIndexesArray(params);
-		runConvTask(TaskType.MaxPooling_Backward, params);
+		if(performReluBackward)
+			runConvTask(TaskType.MaxPooling_Relu_Backward, params);
+		else
+			runConvTask(TaskType.MaxPooling_Backward, params);
 		
 		//post-processing: maintain nnz 
 		outputBlock.recomputeNonZeros();
@@ -440,7 +523,7 @@ public class LibMatrixDNN {
 		}
 	}
 	
-	private static void doPoolingBackward(int n, ConvolutionParameters params) throws DMLRuntimeException {
+	private static void doPoolingBackward(int n, ConvolutionParameters params, boolean performReluBackward) throws DMLRuntimeException {
 		double [] inputArray = null;
 		if (!params.input1.isInSparseFormat())
 			inputArray = params.input1.getDenseBlock();
@@ -455,19 +538,19 @@ public class LibMatrixDNN {
 			
 		if(inputArray != null) {
 			if(doutArray != null)
-				doPoolingBackwardDenseDense(n, inputArray, doutArray, outputArray, params);
+				doPoolingBackwardDenseDense(n, inputArray, doutArray, outputArray, params, performReluBackward);
 			else
-				doPoolingBackwardDenseSparse(n, inputArray, params.input2, outputArray, params);
+				doPoolingBackwardDenseSparse(n, inputArray, params.input2, outputArray, params, performReluBackward);
 		}
 		else {
 			if(doutArray != null)
-				doPoolingBackwardSparseDense(n, doutArray, outputArray, params);
+				doPoolingBackwardSparseDense(n, doutArray, outputArray, params, performReluBackward);
 			else
-				doPoolingBackwardSparseSparse(n, outputArray, params);
+				doPoolingBackwardSparseSparse(n, outputArray, params, performReluBackward);
 		}
 	}
 	
-	private static void doPoolingBackwardSparseDense(int n, double [] doutArray,  double [] outputArray, ConvolutionParameters params) throws DMLRuntimeException {
+	private static void doPoolingBackwardSparseDense(int n, double [] doutArray,  double [] outputArray, ConvolutionParameters params, boolean performReluBackward) throws DMLRuntimeException {
 		if (!params.input1.isInSparseFormat())
 			throw new DMLRuntimeException("Incorrect usage: Call optimized versions");
 		
@@ -477,7 +560,7 @@ public class LibMatrixDNN {
 					double inVal = doutArray[n*params.C*params.P*params.Q + c*params.P*params.Q +  p * params.Q + q];
 					if(inVal != 0) {
 						final int inputOffset = n*params.C*params.H*params.W + c*params.H*params.W;
-						int maxIndex = getMaxIndexSparse(p, q, inputOffset, n, c, params.input1, params);
+						int maxIndex = getMaxIndexSparse(p, q, inputOffset, n, c, params.input1, params, performReluBackward);
 						if(maxIndex != -1)
 							outputArray[maxIndex] += inVal;
 					}
@@ -486,7 +569,7 @@ public class LibMatrixDNN {
 		}
 	}
 	
-	private static void doPoolingBackwardSparseSparse(int n, double [] outputArray, ConvolutionParameters params) throws DMLRuntimeException {
+	private static void doPoolingBackwardSparseSparse(int n, double [] outputArray, ConvolutionParameters params, boolean performReluBackward) throws DMLRuntimeException {
 		if (!params.input1.isInSparseFormat())
 			throw new DMLRuntimeException("Incorrect usage: Call optimized versions");
 		
@@ -502,7 +585,7 @@ public class LibMatrixDNN {
 				int p = tensorIndexes[1];
 				int q = tensorIndexes[2];
 				final int inputOffset = n*params.C*params.H*params.W + c*params.H*params.W;
-				int maxIndex = getMaxIndexSparse(p, q, inputOffset, n, c, params.input1, params);
+				int maxIndex = getMaxIndexSparse(p, q, inputOffset, n, c, params.input1, params, performReluBackward);
 				if(maxIndex != -1)
 					outputArray[maxIndex] += avals[j];
 			}
@@ -510,7 +593,7 @@ public class LibMatrixDNN {
 	}
 	
 	private static void doPoolingBackwardDenseSparse(int n, double [] inputArray, 
-			MatrixBlock dout, double [] outputArray, ConvolutionParameters params) throws DMLRuntimeException {
+			MatrixBlock dout, double [] outputArray, ConvolutionParameters params, boolean performReluBackward) throws DMLRuntimeException {
 		if( !dout.sparseBlock.isEmpty(n) ) {
 			int [] tensorIndexes = new int[3];
 			int apos = dout.sparseBlock.pos(n);
@@ -523,7 +606,7 @@ public class LibMatrixDNN {
 				int p = tensorIndexes[1];
 				int q = tensorIndexes[2];
 				final int inputOffset = n*params.C*params.H*params.W + c*params.H*params.W;
-				int maxIndex = getMaxIndex(p, q, inputOffset, inputArray, params);
+				int maxIndex = getMaxIndex(p, q, inputOffset, inputArray, params, performReluBackward);
 				if(maxIndex != -1)
 					outputArray[maxIndex] += avals[j];
 			}
@@ -531,14 +614,14 @@ public class LibMatrixDNN {
 	}
 	
 	private static void doPoolingBackwardDenseDense(int n, double [] inputArray, double [] doutArray, 
-			double [] outputArray, ConvolutionParameters params) {
+			double [] outputArray, ConvolutionParameters params, boolean performReluBackward) {
 		for (int c = 0; c < params.C; c++) {
 			final int inputOffset = n*params.C*params.H*params.W + c*params.H*params.W;
 			final int outputOffset = n*params.C*params.P*params.Q + c*params.P*params.Q;
 			
 			for (int p = 0; p < params.P; p++) {
 				for (int q = 0; q < params.Q; q++) {
-					int maxIndex = getMaxIndex(p, q, inputOffset, inputArray, params);
+					int maxIndex = getMaxIndex(p, q, inputOffset, inputArray, params, performReluBackward);
 					if(maxIndex != -1)
 						outputArray[maxIndex] += doutArray[outputOffset +  p * params.Q + q];
 				}
@@ -556,10 +639,11 @@ public class LibMatrixDNN {
 	 * @param c number of channels 
 	 * @param input input matrix
 	 * @param params convolution parameters
+	 * @param performReluBackward perform ReLU on input
 	 * @return index of the cell with maximum value
 	 * @throws DMLRuntimeException if error occurs
 	 */
-	private static int getMaxIndexSparse(int p, int q, int inputOffset, int n, int c, MatrixBlock input, ConvolutionParameters params) throws DMLRuntimeException {
+	private static int getMaxIndexSparse(int p, int q, int inputOffset, int n, int c, MatrixBlock input, ConvolutionParameters params, boolean performReluBackward) throws DMLRuntimeException {
 		if(!input.isInSparseFormat())
 			throw new DMLRuntimeException("Incorrect usage: Only sparse format supported");
 		
@@ -591,9 +675,10 @@ public class LibMatrixDNN {
 				int h = tensorIndexes[1];
 				int w = tensorIndexes[2];
 				if(h >= start_index_h && h < end_index_h && w >= start_index_w && w < end_index_w) {
-					if(maxVal < avals[j]) {
+					double val = performReluBackward && avals[j] < 0 ? 0 : avals[j]; 
+					if(maxVal < val) {
 						maxIndex = inputOffset +  h*params.W + w;
-						maxVal = avals[j];
+						maxVal = val;
 					}
 				}
 			}
@@ -612,9 +697,10 @@ public class LibMatrixDNN {
 	 * @param inputOffset offset to be used for input index
 	 * @param inputArray input array
 	 * @param params convolution parameters
+	 * @param performReluBackward perform ReLU backward
 	 * @return index of cell with maximum value
 	 */
-	private static int getMaxIndex(int p, int q, int inputOffset, double [] inputArray, ConvolutionParameters params) {
+	private static int getMaxIndex(int p, int q, int inputOffset, double [] inputArray, ConvolutionParameters params, boolean performReluBackward) {
 		int start_index_h = params.start_indexes_h[p];
 		int end_index_h = params.end_indexes_h[p];
 		int start_index_w = params.start_indexes_w[q];
@@ -632,6 +718,7 @@ public class LibMatrixDNN {
 		for (int h = start_index_h; h < end_index_h; h++) {
 			for (int w = start_index_w; w < end_index_w; w++) {
 				currDoutVal = inputArray[inputOffset +  h*params.W + w];
+				currDoutVal = performReluBackward && currDoutVal < 0 ? 0 : currDoutVal;
 				if(maxVal < currDoutVal) {
 					maxIndex = inputOffset +  h*params.W + w;
 					maxVal = currDoutVal;
@@ -894,11 +981,15 @@ public class LibMatrixDNN {
 	private static void addMatrixBlocks(int poolSize, TaskType type, ConvolutionParameters params, 
 			ConcurrentLinkedQueue<MatrixBlock> im2ColOutBlocks, ConcurrentLinkedQueue<MatrixBlock> doutReshapedBlocks,
 			ConcurrentLinkedQueue<MatrixBlock> partialRetBlocks) {
+		boolean isEligibleForConv2dSparse = (type == TaskType.LoopedIm2ColConv2d) && isEligibleForConv2dSparse(params);
+		boolean isEligibleForConv2dBackwardFilterSparseDense = (type == TaskType.LoopedIm2ColConv2dBwdFilter) && isEligibleForConv2dBackwardFilterSparseDense(params) ;
 		for(int i = 0; i < poolSize; i++) {
 			if(type == TaskType.LoopedIm2ColConv2d || type == TaskType.LoopedIm2ColConv2dBwdFilter) {
-				MatrixBlock im2ColOutBlock = new MatrixBlock(params.C*params.R*params.S, params.P*params.Q, false);
-				im2ColOutBlock.allocateDenseBlock();
-				im2ColOutBlocks.add(im2ColOutBlock);
+				if(!isEligibleForConv2dSparse && !isEligibleForConv2dBackwardFilterSparseDense) {
+					MatrixBlock im2ColOutBlock = new MatrixBlock(params.C*params.R*params.S, params.P*params.Q, false);
+					im2ColOutBlock.allocateDenseBlock();
+					im2ColOutBlocks.add(im2ColOutBlock);
+				}
 			}
 			
 			if(type == TaskType.LoopedIm2ColConv2dBwdFilter) {
@@ -962,6 +1053,21 @@ public class LibMatrixDNN {
 	}
 	// ----------------------------------------------------------------------------------------------------------------
 	
+	private static boolean isEligibleForConv2dBackwardFilterSparseDense(ConvolutionParameters params) {
+		// Use NativeHelper.conv2dBackwardFilterSparseDense only if the input (params.input1) is sparse.
+		// dout is converted to dense if sparse.
+		return params.enableNative && params.input1.isInSparseFormat();
+	}
+	private static boolean isEligibleForConv2dSparse(ConvolutionParameters params) {
+		// Use NativeHelper.conv2dSparse only if the filter is dense and the input is sparse
+		return params.enableNative && params.input1.isInSparseFormat() && !params.input2.isInSparseFormat();
+	}
+	private static boolean isEligibleForConv2dBackwardDataDense(ConvolutionParameters params) {
+		// Use NativeHelper.conv2dBackwardDataDense only if the filter is dense.
+		// dout is converted to dense if sparse.
+		return params.enableNative && !params.input1.isInSparseFormat();
+	}
+	
 	/**
 	 * The ConvTask allows the convolution operations (such as conv2d, conv2d_backward, maxpooling, etc)
 	 * to be executed in a multi-threaded manner.
@@ -1001,55 +1107,108 @@ public class LibMatrixDNN {
 					break;
 				case MaxPooling_Backward:
 					for(int n = _rl; n < _ru; n++) 
-						doPoolingBackward(n, _params);
-					break;
-				case BiasAdd:
-				{
-					double [] dest = _params.output.getDenseBlock();
-					ConvolutionUtils.binaryBiasOperations(_params.input1, _params.bias, dest, _params.K, _params.P*_params.Q, 
-							_rl, _ru, _binaryElementWiseAddition);
+						doPoolingBackward(n, _params, false);
 					break;
-				}
-				case BiasMultiply:
-				{
-					double [] dest = _params.output.getDenseBlock();
-					ConvolutionUtils.binaryBiasOperations(_params.input1, _params.bias, dest, _params.K, _params.P*_params.Q, 
-							_rl, _ru, _binaryElementWiseMultiplication);
+				case MaxPooling_Relu_Backward:
+					for(int n = _rl; n < _ru; n++) 
+						doPoolingBackward(n, _params, true);
 					break;
-				}
 				case ReluBackward:
 					lnnz = doReluBackward(_params, _rl, _ru);
 					break;
 				case LoopedIm2ColConv2d:
 				{	
-					MatrixBlock im2ColOutBlock = _im2ColOutBlocks.remove();
-					double [] temp = _params.input1.isInSparseFormat() ? new double[_params.input1.getNumColumns()] : null;
-					for(int n = _rl; n < _ru; n++) 
-						doLoopedIm2ColConv2d(n, im2ColOutBlock, _params, temp);
-					_im2ColOutBlocks.add(im2ColOutBlock);
-					if(_params.bias != null)
-						ConvolutionUtils.binaryBiasOperationInPlace(_params.bias, _params.output.getDenseBlock(), _params.K, 
-								_params.P*_params.Q, _rl, _ru, _binaryElementWiseAddition);
+					if(isEligibleForConv2dSparse(_params)) {
+						// NativeHelper.conv2dSparse only if filter is dense and input is sparse
+						int KPQ = _params.K*_params.P*_params.Q;
+						double[] temp = new double[KPQ];
+						for(int n = _rl; n < _ru; n++)  {
+							if( !_params.input1.getSparseBlock().isEmpty(n) ) {
+								int apos = _params.input1.getSparseBlock().pos(n);
+								int alen = _params.input1.getSparseBlock().size(n);
+								int[] aix = _params.input1.getSparseBlock().indexes(n);
+								double[] avals = _params.input1.getSparseBlock().values(n);
+								NativeHelper.conv2dSparse(apos, alen, aix, avals, _params.input2.getDenseBlock(), temp, 
+										1, _params.C, _params.H, _params.W, _params.K, _params.R, _params.S, 
+										_params.stride_h, _params.stride_w, _params.pad_h, _params.pad_w, _params.P, _params.Q, 1);
+								System.arraycopy(temp, 0, _params.output.denseBlock, n*KPQ, KPQ);
+							}
+						}
+					}
+					else {
+						// In all other cases, perform im2col in Java + matmult (either native or java).
+						MatrixBlock im2ColOutBlock = _im2ColOutBlocks.remove();
+						double [] temp = _params.input1.isInSparseFormat() ? new double[_params.input1.getNumColumns()] : null;
+						for(int n = _rl; n < _ru; n++) 
+							doLoopedIm2ColConv2d(n, im2ColOutBlock, _params, temp);
+						_im2ColOutBlocks.add(im2ColOutBlock);
+					}
+					if(_params.bias != null) {
+						// bias is always converted to dense format
+						double [] biasArr = _params.bias.getDenseBlock();
+						int PQ = _params.P*_params.Q;
+						int index = _rl*_params.K*PQ;
+						for(int n = _rl; n < _ru; n++) {
+							for(int k = 0; k < _params.K; k++) {
+								for(int pq = 0; pq < PQ; pq++, index++) {
+									_params.output.denseBlock[index] += biasArr[k];
+								}
+							}
+						}
+					}
 					break;
 				}
 				case LoopedIm2ColConv2dBwdFilter:
 				{
-					MatrixBlock im2ColOutBlock = _im2ColOutBlocks.remove();
 					MatrixBlock partialRetBlock = _partialRetBlocks.remove();
 					MatrixBlock doutReshapedBlock = _doutReshapedBlocks.remove();
-					double [] temp = _params.input1.isInSparseFormat() ? new double[_params.input1.getNumColumns()] : null;
-					for(int n = _rl; n < _ru; n++) 
-						partialRetBlock = doLoopedIm2ColConv2dBwdFilter(n, im2ColOutBlock, doutReshapedBlock, partialRetBlock, _params, temp);
-					_im2ColOutBlocks.add(im2ColOutBlock);
-					_partialRetBlocks.add(partialRetBlock);
+					if(isEligibleForConv2dBackwardFilterSparseDense(_params)) {
+						double [] dout_n = doutReshapedBlock.getDenseBlock();
+						for(int n = _rl; n < _ru; n++) {
+							if( !_params.input1.getSparseBlock().isEmpty(n) ) {
+								doRotate180(n, 0, _params.input2, dout_n, _params, true);
+								int apos = _params.input1.getSparseBlock().pos(n);
+								int alen = _params.input1.getSparseBlock().size(n);
+								int[] aix = _params.input1.getSparseBlock().indexes(n);
+								double[] avals = _params.input1.getSparseBlock().values(n);
+								NativeHelper.conv2dBackwardFilterSparseDense(apos, alen, aix, avals, 
+										dout_n, partialRetBlock.getDenseBlock(), 1, _params.C, _params.H, _params.W, _params.K, 
+										_params.R, _params.S, _params.stride_h, _params.stride_w, _params.pad_h, _params.pad_w, _params.P, _params.Q, 1);
+							}
+						}
+					}
+					else {
+						MatrixBlock im2ColOutBlock = _im2ColOutBlocks.remove();
+						double [] temp = _params.input1.isInSparseFormat() ? new double[_params.input1.getNumColumns()] : null;
+						for(int n = _rl; n < _ru; n++) 
+							partialRetBlock = doLoopedIm2ColConv2dBwdFilter(n, im2ColOutBlock, doutReshapedBlock, partialRetBlock, _params, temp);
+						_im2ColOutBlocks.add(im2ColOutBlock);
+					}
 					_doutReshapedBlocks.add(doutReshapedBlock);
+					_partialRetBlocks.add(partialRetBlock);
 					break;
 				}
 				case LoopedIm2ColConv2dBwdData:
 				{
 					MatrixBlock doutReshapedBlock = _doutReshapedBlocks.remove();
-					for(int n = _rl; n < _ru; n++) 
-						doLoopedIm2ColConv2dBwdData(n, doutReshapedBlock, _params);
+					if(isEligibleForConv2dBackwardDataDense(_params)) {
+						int CHW = _params.C*_params.H*_params.W;
+						double [] ret = new double[CHW];
+						double [] filterArr = _params.input1.getDenseBlock();
+						for(int n = _rl; n < _ru; n++) {
+							double [] dout_n = getRowInDenseFormat(_params.input2, n, doutReshapedBlock.getDenseBlock());
+							if(n > _rl)
+								Arrays.fill(ret, 0);
+							NativeHelper.conv2dBackwardDataDense(filterArr, dout_n, ret, 1, 
+									_params.C, _params.H, _params.W, _params.K, 
+									_params.R, _params.S, _params.stride_h, _params.stride_w, _params.pad_h, _params.pad_w, _params.P, _params.Q, 1);
+							System.arraycopy(ret, 0, _params.output.getDenseBlock(), n*CHW, CHW);
+						}
+					}
+					else {
+						for(int n = _rl; n < _ru; n++) 
+							doLoopedIm2ColConv2dBwdData(n, doutReshapedBlock, _params);
+					}
 					_doutReshapedBlocks.add(doutReshapedBlock);
 					break;
 				}
@@ -1192,16 +1351,24 @@ public class LibMatrixDNN {
 	}
 	
 	// Returns the row of matrix in dense format
-	private static double [] getRowInDenseFormat(MatrixBlock input, int n, double []  temp) {
+	private static double [] getRowInDenseFormat(MatrixBlock input, int n, double []  temp) throws DMLRuntimeException {
+		if(input.getNumColumns() != temp.length) {
+			throw new DMLRuntimeException("Invalid temp array length: " + temp.length + " != " + input.getNumColumns());
+		}
 		// Use temporary array to avoid binary search
-		Arrays.fill(temp, 0);
-		if( !input.sparseBlock.isEmpty(n) ) {
-			int apos = input.sparseBlock.pos(n);
-			int alen = input.sparseBlock.size(n);
-			int[] aix = input.sparseBlock.indexes(n);
-			double[] avals = input.sparseBlock.values(n);
-			for(int j=apos; j<apos+alen; j++)
-				temp[ aix[j] ] = avals[j];
+		if(input.isInSparseFormat()) {
+			Arrays.fill(temp, 0);
+			if( !input.sparseBlock.isEmpty(n) ) {
+				int apos = input.sparseBlock.pos(n);
+				int alen = input.sparseBlock.size(n);
+				int[] aix = input.sparseBlock.indexes(n);
+				double[] avals = input.sparseBlock.values(n);
+				for(int j=apos; j<apos+alen; j++)
+					temp[ aix[j] ] = avals[j];
+			}
+		}
+		else {
+			System.arraycopy(input.getDenseBlock(), n*input.getNumColumns(), temp, 0, input.getNumColumns());
 		}
 		return temp;
 	}
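
The fused relu_maxpooling_backward introduced above relies on a small observation: the
gradient of max_pool(relu(X)) flows to the argmax of max(X, 0), so getMaxIndex(...) only
needs to clamp negative input values to zero before the comparison. Below is a standalone
toy version of that argmax over a 1-D window, under the same convention (the names are
illustrative, not SystemML code):

// Toy sketch of the clamp-before-argmax trick used by getMaxIndex(...).
public class ReluMaxpoolBackwardSketch {
	static int argmaxWithOptionalRelu(double[] in, int start, int end, boolean reluBackward) {
		int maxIndex = -1;
		double maxVal = -Double.MAX_VALUE;
		for (int i = start; i < end; i++) {
			double v = reluBackward && in[i] < 0 ? 0 : in[i]; // clamp like the patch does
			if (maxVal < v) { maxVal = v; maxIndex = i; }
		}
		return maxIndex;
	}

	public static void main(String[] args) {
		double[] x = { -3.0, -1.0, -2.0 };
		// plain maxpool backward routes the gradient to index 1 (value -1);
		// relu_maxpooling backward sees all zeros and the first index wins
		System.out.println(argmaxWithOptionalRelu(x, 0, 3, false)); // 1
		System.out.println(argmaxWithOptionalRelu(x, 0, 3, true));  // 0
	}
}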

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/39a37ae4/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixMult.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixMult.java b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixMult.java
index 5c7c2ef..983ce53 100644
--- a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixMult.java
+++ b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixMult.java
@@ -17,7 +17,6 @@
  * under the License.
  */
 
-
 package org.apache.sysml.runtime.matrix.data;
 
 import java.util.ArrayList;
@@ -123,7 +122,7 @@ public class LibMatrixMult
 	{
 		matrixMult(m1, m2, ret, rl, ru, true);
 	}
-
+	
 	public static void matrixMult(MatrixBlock m1, MatrixBlock m2, MatrixBlock ret, int rl, int ru, boolean examSparsity) 
 		throws DMLRuntimeException
 	{	
@@ -132,9 +131,9 @@ public class LibMatrixMult
 			ret.examSparsity(); //turn empty dense into sparse
 			return;
 		}
-		
+			
 		//Timing time = new Timing(true);
-		
+			
 		//pre-processing: output allocation
 		boolean tm2 = checkPrepMatrixMultRightInput(m1,m2);
 		m2 = prepMatrixMultRightInput(m1, m2);
@@ -166,6 +165,7 @@ public class LibMatrixMult
 		if(examSparsity)
 			ret.examSparsity();
 		
+		
 		//System.out.println("MM ("+m1.isInSparseFormat()+","+m1.getNumRows()+","+m1.getNumColumns()+","+m1.getNonZeros()+")x" +
 		//		              "("+m2.isInSparseFormat()+","+m2.getNumRows()+","+m2.getNumColumns()+","+m2.getNonZeros()+") in "+time.stop());
 	}
@@ -188,7 +188,7 @@ public class LibMatrixMult
 			ret.examSparsity(); //turn empty dense into sparse
 			return;
 		}
-		
+			
 		//check too high additional vector-matrix memory requirements (fallback to sequential)
 		//check too small workload in terms of flops (fallback to sequential too)
 		if( m1.rlen == 1 && (8L * m2.clen * k > MEM_OVERHEAD_THRESHOLD || !LOW_LEVEL_OPTIMIZATION || m2.clen==1 || m1.isUltraSparse() || m2.isUltraSparse()) 
@@ -247,6 +247,7 @@ public class LibMatrixMult
 			throw new DMLRuntimeException(ex);
 		}
 		
+		
 		//post-processing (nnz maintained in parallel)
 		ret.examSparsity();
 		
@@ -3960,4 +3961,4 @@ public class LibMatrixMult
 			return _ret.recomputeNonZeros(_rl, _ru-1, 0, _ret.getNumColumns()-1);
 		}
 	}
-}
\ No newline at end of file
+}

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/39a37ae4/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixNative.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixNative.java b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixNative.java
new file mode 100644
index 0000000..4b12596
--- /dev/null
+++ b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixNative.java
@@ -0,0 +1,200 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ * 
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.sysml.runtime.matrix.data;
+
+import org.apache.sysml.hops.OptimizerUtils;
+import org.apache.sysml.runtime.DMLRuntimeException;
+import org.apache.sysml.utils.NativeHelper;
+import org.apache.sysml.utils.Statistics;
+
+public class LibMatrixNative {
+	
+	// Additional heuristics could be encapsulated in this function.
+	// For now, we only consider matrix-vector operations to be memory bound.
+	private static boolean isMatMultMemoryBound(int m1Rlen, int m1Clen, int m2Clen) {
+		return m1Rlen == 1 || m1Clen == 1 || m2Clen == 1;
+	}
+
+	/**
+	 * Performs matrix multiplication using the native library if BLAS is available,
+	 * and otherwise falls back to the Java implementation (LibMatrixMult).
+	 * 
+	 * @param m1 lhs matrix block
+	 * @param m2 rhs matrix block
+	 * @param ret output matrix block
+	 * @param k number of threads
+	 * @throws DMLRuntimeException if error occurs
+	 */
+	public static void matrixMult(MatrixBlock m1, MatrixBlock m2, MatrixBlock ret, int k) throws DMLRuntimeException {
+		matrixMult(m1, m2, ret, k, true);
+	}
+	
+	public static void matrixMult(MatrixBlock m1, MatrixBlock m2, MatrixBlock ret, int k, boolean examSparsity) throws DMLRuntimeException {
+		// Sanity check:
+		k = k <= 0 ? NativeHelper.getMaxNumThreads() : k;
+		
+		// check inputs / outputs
+		if (m1.isEmptyBlock() || m2.isEmptyBlock()) {
+			ret.setNonZeros(0);
+			if(examSparsity)
+				ret.examSparsity(); // turn empty dense into sparse
+			return;
+		}
+		if (NativeHelper.isNativeLibraryLoaded() && 
+				!isMatMultMemoryBound(m1.rlen, m1.clen, m2.clen) && !m1.isInSparseFormat() && !m2.isInSparseFormat()) {
+			ret.sparse = false;
+			ret.allocateDenseBlock();
+			if (NativeHelper.matrixMultDenseDense(m1.denseBlock, m2.denseBlock, 
+					ret.denseBlock, m1.getNumRows(), m1.getNumColumns(), m2.getNumColumns(), k)) {
+				Statistics.numNativeLibMatrixMultCalls.increment();
+				ret.recomputeNonZeros();
+				// post-processing (nnz maintained in parallel)
+				if(examSparsity)
+					ret.examSparsity();
+				return;
+			} else {
+				// Else fall back to Java
+				Statistics.incrementNativeFailuresCounter();
+			}
+		}
+		if (k == 1)
+			LibMatrixMult.matrixMult(m1, m2, ret, examSparsity);
+		else
+			LibMatrixMult.matrixMult(m1, m2, ret, k);
+	}
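+	
+	// Illustrative usage (a sketch only; the random inputs below are assumptions for the example):
+	//   MatrixBlock a = MatrixBlock.randOperations(1000, 500, 1.0, 0, 1, "uniform", 7);
+	//   MatrixBlock b = MatrixBlock.randOperations(500, 200, 1.0, 0, 1, "uniform", 3);
+	//   MatrixBlock c = new MatrixBlock(1000, 200, false);
+	//   LibMatrixNative.matrixMult(a, b, c, 4); // uses native BLAS if loaded, else the Java fallback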
+	
+	/**
+	 * This method performs the conv2d (i.e. cross-correlation) operation on the input batch
+	 * 
+	 * @param input input batch 
+	 * @param filter filter
+	 * @param outputBlock output of convolution
+	 * @param params convolution parameters
+	 * @throws DMLRuntimeException if error occurs
+	 */
+	public static void conv2d(MatrixBlock input, MatrixBlock filter, MatrixBlock outputBlock, ConvolutionParameters params) throws DMLRuntimeException {
+		LibMatrixDNN.checkInputsConv2d(input, filter, outputBlock, params);
+		params.numThreads = params.numThreads <= 0 ? NativeHelper.getMaxNumThreads() : params.numThreads;
+		if(NativeHelper.isNativeLibraryLoaded() && !input.isInSparseFormat() && !filter.isInSparseFormat()) {
+			setNumThreads(params);
+			if(params.bias == null) {
+				if(NativeHelper.conv2dDense(input.denseBlock, filter.denseBlock, outputBlock.denseBlock, params.N, params.C, params.H, params.W, 
+						params.K, params.R, params.S, params.stride_h, params.stride_w, params.pad_h, params.pad_w, 
+						params.P, params.Q, params.numThreads)) {
+					Statistics.numNativeLibMatrixDNNCalls.increment();
+					// post-processing: maintain nnz
+					outputBlock.recomputeNonZeros();
+					return;
+				}
+				else {
+					// fall back to Java on native failure
+					Statistics.incrementNativeFailuresCounter();
+				}
+			}
+			else {
+				if(params.bias.isInSparseFormat())
+					params.bias.sparseToDense(); // Bias matrix is usually extremely small
+				if(NativeHelper.conv2dBiasAddDense(input.denseBlock, params.bias.denseBlock, filter.denseBlock, outputBlock.denseBlock, 
+						params.N, params.C, params.H, params.W, 
+						params.K, params.R, params.S, params.stride_h, params.stride_w, params.pad_h, params.pad_w, 
+						params.P, params.Q, params.numThreads)) {
+					Statistics.numNativeLibMatrixDNNCalls.increment();
+					// post-processing: maintain nnz
+					outputBlock.recomputeNonZeros();
+					return;
+				}
+				else {
+					// fall back to Java on native failure
+					Statistics.incrementNativeFailuresCounter();
+				}
+			}
+		}
+		
+		// fall back to Java on native failure or sparse inputs
+		LibMatrixDNN.conv2d(input, filter, outputBlock, params);
+	}
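+	
+	// For reference, the output spatial dimensions P and Q used above follow the standard
+	// conv2d relations (floor division):
+	//   P = (H + 2*pad_h - R) / stride_h + 1,  Q = (W + 2*pad_w - S) / stride_w + 1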
+	
+	private static void setNumThreads(ConvolutionParameters params) {
+		params.numThreads = OptimizerUtils.getConstrainedNumThreads(params.numThreads);
+		if (!(params.isOutputThreadSafe() && params.numThreads > 1))
+			params.numThreads = 1;
+	}
+	
+	/**
+	 * This method computes the backpropagation errors with respect to the filter of the conv2d operation
+	 * 
+	 * @param input input image 
+	 * @param dout errors from next layer
+	 * @param outputBlock  output errors
+	 * @param params convolution parameters
+	 * @throws DMLRuntimeException if error occurs
+	 */
+	public static void conv2dBackwardFilter(MatrixBlock input, MatrixBlock dout, MatrixBlock outputBlock, ConvolutionParameters params) throws DMLRuntimeException {
+		LibMatrixDNN.checkInputsConv2dBackwardFilter(input, dout, outputBlock, params);
+		params.numThreads = params.numThreads <= 0 ? NativeHelper.getMaxNumThreads() : params.numThreads;
+		if(NativeHelper.isNativeLibraryLoaded() && !dout.isInSparseFormat() && !input.isInSparseFormat()) {
+			setNumThreads(params);
+			if(NativeHelper.conv2dBackwardFilterDense(input.denseBlock, dout.denseBlock, outputBlock.denseBlock, params.N, params.C, params.H, params.W, 
+						params.K, params.R, params.S, params.stride_h, params.stride_w, params.pad_h, params.pad_w, 
+						params.P, params.Q, params.numThreads)) {
+				Statistics.numNativeLibMatrixDNNCalls.increment();
+				// post-processing: maintain nnz
+				outputBlock.recomputeNonZeros();
+				return;
+			}
+			else {
+				// fall back to Java on native failure
+				Statistics.incrementNativeFailuresCounter();
+			}
+		}
+		// fall back to Java on native failure or sparse inputs
+		LibMatrixDNN.conv2dBackwardFilter(input, dout, outputBlock, params);
+	}
+	
+	/**
+	 * This method computes the backpropagation errors with respect to the input (i.e. the previous layer) of the conv2d operation
+	 * 
+	 * @param filter filter used in conv2d 
+	 * @param dout errors from next layer
+	 * @param outputBlock  output errors
+	 * @param params convolution parameters
+	 * @throws DMLRuntimeException if error occurs
+	 */
+	public static void conv2dBackwardData(MatrixBlock filter, MatrixBlock dout, MatrixBlock outputBlock, ConvolutionParameters params) throws DMLRuntimeException {
+		LibMatrixDNN.checkInputsConv2dBackwardData(filter, dout, outputBlock, params);
+		params.numThreads = params.numThreads <= 0 ? NativeHelper.getMaxNumThreads() : params.numThreads;
+		if(NativeHelper.isNativeLibraryLoaded() && !dout.isInSparseFormat() && !filter.isInSparseFormat()) {
+			setNumThreads(params);
+			if(NativeHelper.conv2dBackwardDataDense(filter.denseBlock, dout.denseBlock, outputBlock.denseBlock, params.N, params.C, params.H, params.W, 
+						params.K, params.R, params.S, params.stride_h, params.stride_w, params.pad_h, params.pad_w, 
+						params.P, params.Q, params.numThreads)) {
+				Statistics.numNativeLibMatrixDNNCalls.increment();
+				// post-processing: maintain nnz
+				outputBlock.recomputeNonZeros();
+				return;
+			}
+			else {
+				// fall back to Java on native failure
+				Statistics.incrementNativeFailuresCounter();
+			}
+		}
+		// fall back to Java on native failure or sparse inputs
+		LibMatrixDNN.conv2dBackwardData(filter, dout, outputBlock, params);
+	}
+}

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/39a37ae4/src/main/java/org/apache/sysml/runtime/matrix/data/MatrixBlock.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/matrix/data/MatrixBlock.java b/src/main/java/org/apache/sysml/runtime/matrix/data/MatrixBlock.java
index c458b07..8fac947 100644
--- a/src/main/java/org/apache/sysml/runtime/matrix/data/MatrixBlock.java
+++ b/src/main/java/org/apache/sysml/runtime/matrix/data/MatrixBlock.java
@@ -35,6 +35,7 @@ import java.util.stream.LongStream;
 import org.apache.commons.math3.random.Well1024a;
 import org.apache.hadoop.io.DataInputBuffer;
 import org.apache.sysml.conf.ConfigurationManager;
+import org.apache.sysml.conf.DMLConfig;
 import org.apache.sysml.hops.Hop.OpOp2;
 import org.apache.sysml.hops.OptimizerUtils;
 import org.apache.sysml.lops.MMTSJ.MMTSJType;
@@ -4873,15 +4874,26 @@ public class MatrixBlock extends MatrixValue implements CacheBlock, Externalizab
 		
 		return sum_wt;
 	}
+	
+	public MatrixValue aggregateBinaryOperations(MatrixIndexes m1Index, MatrixValue m1Value, MatrixIndexes m2Index, MatrixValue m2Value, 
+			MatrixValue result, AggregateBinaryOperator op ) throws DMLRuntimeException
+	{
+		boolean enableNativeBLAS = ConfigurationManager.getDMLConfig().getBooleanValue(DMLConfig.NATIVE_BLAS);
+		return aggregateBinaryOperations(m1Value, m2Value, result, op, enableNativeBLAS);
+	}
 
 	public MatrixValue aggregateBinaryOperations(MatrixIndexes m1Index, MatrixValue m1Value, MatrixIndexes m2Index, MatrixValue m2Value, 
-			MatrixValue result, AggregateBinaryOperator op ) 
-		throws DMLRuntimeException
+			MatrixValue result, AggregateBinaryOperator op, boolean enableNativeBLAS ) throws DMLRuntimeException
 	{
-		return aggregateBinaryOperations(m1Value, m2Value, result, op);
+		return aggregateBinaryOperations(m1Value, m2Value, result, op, enableNativeBLAS);
+	}
+	
+	public MatrixValue aggregateBinaryOperations(MatrixValue m1Value, MatrixValue m2Value, MatrixValue result, AggregateBinaryOperator op) throws DMLRuntimeException {
+		boolean enableNativeBLAS = ConfigurationManager.getDMLConfig().getBooleanValue(DMLConfig.NATIVE_BLAS); 
+		return aggregateBinaryOperations(m1Value, m2Value, result, op, enableNativeBLAS);
 	}
 
-	public MatrixValue aggregateBinaryOperations(MatrixValue m1Value, MatrixValue m2Value, MatrixValue result, AggregateBinaryOperator op) 
+	public MatrixValue aggregateBinaryOperations(MatrixValue m1Value, MatrixValue m2Value, MatrixValue result, AggregateBinaryOperator op, boolean nativeMatMult) 
 		throws DMLRuntimeException
 	{
 		//check input types, dimensions, configuration
@@ -4907,7 +4919,9 @@ public class MatrixBlock extends MatrixValue implements CacheBlock, Externalizab
 			ret.reset(rl, cl, sp.sparse, sp.estimatedNonZeros);
 		
 		//compute matrix multiplication (only supported binary aggregate operation)
-		if( op.getNumThreads() > 1 )
+		if( nativeMatMult )
+			LibMatrixNative.matrixMult(m1, m2, ret, op.getNumThreads());
+		else if( op.getNumThreads() > 1 )
 			LibMatrixMult.matrixMult(m1, m2, ret, op.getNumThreads());
 		else
 			LibMatrixMult.matrixMult(m1, m2, ret);
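+		// Note: LibMatrixNative.matrixMult itself falls back to the (multi-threaded) Java
+		// matrix multiply above if the native call fails or the inputs are sparse or vector-shaped.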

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/39a37ae4/src/main/java/org/apache/sysml/runtime/util/ConvolutionUtils.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/util/ConvolutionUtils.java b/src/main/java/org/apache/sysml/runtime/util/ConvolutionUtils.java
index b988546..8e51405 100644
--- a/src/main/java/org/apache/sysml/runtime/util/ConvolutionUtils.java
+++ b/src/main/java/org/apache/sysml/runtime/util/ConvolutionUtils.java
@@ -22,6 +22,8 @@ package org.apache.sysml.runtime.util;
 import java.util.Arrays;
 
 import org.apache.sysml.runtime.DMLRuntimeException;
+import org.apache.sysml.runtime.functionobjects.Multiply;
+import org.apache.sysml.runtime.functionobjects.Plus;
 import org.apache.sysml.runtime.matrix.data.MatrixBlock;
 import org.apache.sysml.runtime.matrix.operators.BinaryOperator;
 import org.apache.sysml.runtime.matrix.operators.ScalarOperator;
@@ -58,12 +60,20 @@ public class ConvolutionUtils {
 		}
 		return ret;
 	}
+
 	
-	// Performs dest[destPos ...] <- src[src_rl:src_ru, ]
-	//Assumes that dest is zeroed-out before calling
-	public static void copy(MatrixBlock src, double [] dest, int destPos, int destNumCols, int src_rl, int src_ru) {
+	// Performs dest[destPos...] op= src[src_rl:src_ru,]
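+	// i.e., for each row i in [src_rl, src_ru) and column j:
+	//   dest[destPos + (i-src_rl)*destNumCols + j] op= src[i][j]
+	// For multiply, unstored zeros in a sparse src also zero out the corresponding dest entries.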
+	public static void binaryOperationInPlace(MatrixBlock src, double [] dest, 
+			int destPos, int destNumCols, int src_rl, int src_ru, BinaryOperator op) throws DMLRuntimeException {
 		if(src.isInSparseFormat()) {
-			if(!src.isEmptyBlock()) {
+			if(src.isEmptyBlock() && op.fn == Plus.getPlusFnObject()) {
+				// Do nothing: Inplace addition by zero
+			}
+			else if(src.isEmptyBlock() && op.fn == Multiply.getMultiplyFnObject()) {
+				// Inplace multiplication by zero
+				Arrays.fill(dest, destPos, destPos + (src_ru-src_rl)*destNumCols, 0);
+			}
+			else if(op.fn == Plus.getPlusFnObject()) {
 				for(int i = src_rl, cix = destPos; i < src_ru; i++, cix += destNumCols) {
 					if( !src.getSparseBlock().isEmpty(i) ) {
 						int apos = src.getSparseBlock().pos(i);
@@ -71,37 +81,54 @@ public class ConvolutionUtils {
 						int[] aix = src.getSparseBlock().indexes(i);
 						double[] avals = src.getSparseBlock().values(i);
 						for(int j = apos; j < apos+alen; j++) {
-							dest[ cix+aix[j] ] = avals[j];
+							dest[ cix+aix[j] ] += avals[j];
 						}
 					}
 				}
 			}
-		}
-		else {
-			System.arraycopy(src.getDenseBlock(), src_rl*src.getNumColumns(), dest, destPos, (src_ru-src_rl)*src.getNumColumns());
-		}
-	}
-	
-	// Performs dest[destPos...] op= thatValue[src_rl:src_ru,]
-	public static void binaryOperationInPlace(MatrixBlock src, double [] dest, 
-			int destPos, int destNumCols, int src_rl, int src_ru, BinaryOperator op) throws DMLRuntimeException {
-		if(src.isInSparseFormat()) {
-			for(int i = src_rl, cix = destPos; i < src_ru; i++, cix += destNumCols) {
-				if( !src.getSparseBlock().isEmpty(i) ) {
-					int apos = src.getSparseBlock().pos(i);
-					int alen = src.getSparseBlock().size(i);
-					int[] aix = src.getSparseBlock().indexes(i);
-					double[] avals = src.getSparseBlock().values(i);
-					for(int j = apos; j < apos+alen; j++) {
-						dest[ cix+aix[j] ] = op.fn.execute(dest[ cix+aix[j] ], avals[j]);
+			else if(op.fn == Multiply.getMultiplyFnObject()) {
+				// Sparse-unsafe operation: unstored zeros in src must zero out the corresponding dest entries
+				for(int i = src_rl, cix = destPos; i < src_ru; i++, cix += destNumCols) {
+					if( !src.getSparseBlock().isEmpty(i) ) {
+						int apos = src.getSparseBlock().pos(i);
+						int alen = src.getSparseBlock().size(i);
+						int[] aix = src.getSparseBlock().indexes(i);
+						double[] avals = src.getSparseBlock().values(i);
+						int prevDestIndex = 0;
+						for(int j = apos; j < apos+alen; j++) {
+							// zero out the gap of unstored (zero) entries; assumes aix is sorted
+							Arrays.fill(dest, cix+prevDestIndex, cix+aix[j], 0);
+							prevDestIndex = aix[j]+1;
+							dest[ cix+aix[j] ] *= avals[j];
+						}
+						Arrays.fill(dest, cix+prevDestIndex, cix+destNumCols, 0);
+					}
+					else {
+						Arrays.fill(dest, cix, cix + destNumCols, 0);
 					}
 				}
 			}
+			else {
+				// Other operations may be sparse-safe or sparse-unsafe; fail fast so that unsupported cases are caught at development time.
+				throw new DMLRuntimeException("Unimplemented sparse operation");
+			}
 		}
 		else {
 			double [] inputArr = src.getDenseBlock();
-			for(int i = destPos; i < src_ru*destNumCols; i++) {
-				dest[i] = op.fn.execute(dest[i], inputArr[i]);
+			if(op.fn == Plus.getPlusFnObject()) {
+				for(int i = destPos; i < src_ru*destNumCols; i++) {
+					dest[i] += inputArr[i];
+				}
+			}
+			else if(op.fn == Multiply.getMultiplyFnObject()) {
+				for(int i = destPos; i < src_ru*destNumCols; i++) {
+					dest[i] *= inputArr[i];
+				}
+			}
+			else {
+				for(int i = destPos; i < src_ru*destNumCols; i++) {
+					dest[i] = op.fn.execute(dest[i], inputArr[i]);
+				}
 			}
 		}
 	}
@@ -130,45 +157,6 @@ public class ConvolutionUtils {
 		}
 	}
 	
-	// dest (of size N x KPQ) = input (of size N x KPQ) op bias (of size K x 1)
-	public static void binaryBiasOperations(MatrixBlock input, MatrixBlock bias, double [] dest, 
-			int K, int PQ, int rl, int ru, BinaryOperator op) throws DMLRuntimeException {
-		copy(input, dest, rl*K*PQ, K*PQ, rl, ru);
-		binaryBiasOperationInPlace(bias, dest, K, PQ, rl, ru, op);
-	}
-	
-	// dest (of size N x KPQ) op= bias (of size K x 1)
-	public static void binaryBiasOperationInPlace(MatrixBlock bias, double [] dest, 
-			int K, int PQ, int rl, int ru, BinaryOperator op) throws DMLRuntimeException {
-		// bias.getNumColumns() == 1 checked outside
-		if(!bias.isInSparseFormat()) {
-			double [] biasArr = bias.getDenseBlock();
-			int index = rl*K*PQ;
-			for(int n = rl; n < ru; n++) {
-				for(int k = 0; k < K; k++) {
-					for(int pq = 0; pq < PQ; pq++, index++) {
-						dest[index] = op.fn.execute(dest[index], biasArr[k]);
-					}
-				}
-			}
-		}
-		else {
-			for(int k = 0; k < K; k++) {
-				if( !bias.getSparseBlock().isEmpty(k) ) {
-					int apos = bias.getSparseBlock().pos(k);
-					double[] avals = bias.getSparseBlock().values(k);
-					double val = avals[apos];
-					for(int n = rl; n < ru; n++) {
-						int index = n*K*PQ + k*PQ;
-						for(int pq = 0; pq < PQ; pq++, index++) {
-							dest[index] = op.fn.execute(dest[index], val);
-						}
-					}
-				}
-			}
-		}
-	}
-	
 	public static void fillBias(MatrixBlock bias, double [] outputArray, int src_rl, int src_ru, int N, int K, int PQ) throws DMLRuntimeException {
 		// bias.getNumColumns() == 1 checked outside
 		if(bias.isInSparseFormat()) {

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/39a37ae4/src/main/java/org/apache/sysml/utils/EnvironmentHelper.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/utils/EnvironmentHelper.java b/src/main/java/org/apache/sysml/utils/EnvironmentHelper.java
new file mode 100644
index 0000000..c1b4c3e
--- /dev/null
+++ b/src/main/java/org/apache/sysml/utils/EnvironmentHelper.java
@@ -0,0 +1,27 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ * 
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.sysml.utils;
+
+
+/**
+ * This class sets environment variables required for loading the MKL library (invoked by NativeHelper).
+ */
+public class EnvironmentHelper {
+	public static native void setEnv(String key, String value);
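+	// Example (as used by NativeHelper): EnvironmentHelper.setEnv("MKL_THREADING_LAYER", "GNU");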
+}

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/39a37ae4/src/main/java/org/apache/sysml/utils/NativeHelper.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/utils/NativeHelper.java b/src/main/java/org/apache/sysml/utils/NativeHelper.java
new file mode 100644
index 0000000..2b997ed
--- /dev/null
+++ b/src/main/java/org/apache/sysml/utils/NativeHelper.java
@@ -0,0 +1,259 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ * 
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.sysml.utils;
+
+import java.io.IOException;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+
+import java.util.HashMap;
+import java.io.InputStream;
+import java.io.OutputStream;
+import java.io.File;
+
+import org.apache.commons.io.FileUtils;
+import org.apache.commons.io.IOUtils;
+import org.apache.commons.lang.SystemUtils;
+import org.apache.sysml.conf.ConfigurationManager;
+import org.apache.sysml.conf.DMLConfig;
+import org.apache.sysml.hops.OptimizerUtils;
+
+/**
+ * This class helps in loading native library.
+ * By default, it first tries to load Intel MKL, else tries to load OpenBLAS.
+ */
+public class NativeHelper {
+	private static boolean isSystemMLLoaded = false;
+	private static final Log LOG = LogFactory.getLog(NativeHelper.class.getName());
+	private static HashMap<String, String> supportedArchitectures = new HashMap<String, String>();
+	public static String blasType;
+	private static int maxNumThreads = -1;
+	private static boolean setMaxNumThreads = false;
+	static {
+		// Note: we only support 64-bit Java on x86_64 and amd64 machines
+		supportedArchitectures.put("x86_64", "x86_64");
+		supportedArchitectures.put("amd64", "x86_64");
+	}
+	
+	private static boolean attemptedLoading = false;
+	
+	// Loading the library in a method instead of a static block yields a detailed stack trace in case of fatal errors
+	private static void init() {
+		// Only Linux supported for BLAS
+		if(!SystemUtils.IS_OS_LINUX)
+			return;
+		
+		// The attemptedLoading flag ensures that we do not repeatedly try to load SystemML and its
+		// dependencies, especially in parfor (hence the double-checked locking with synchronized below).
+		if(!attemptedLoading) {
+			DMLConfig dmlConfig = ConfigurationManager.getDMLConfig();
+			String userSpecifiedBLAS = System.getenv("SYSTEMML_BLAS");
+			userSpecifiedBLAS = (userSpecifiedBLAS == null) ? "" :  userSpecifiedBLAS.trim().toLowerCase();
+			// -------------------------------------------------------------------------------------
+			// BLAS can be enabled, disabled, or explicitly selected in one of two ways:
+			// 1. DML configuration: native.blas (boolean flag)
+			// 2. Environment variable: SYSTEMML_BLAS (can be set to mkl, openblas or none)
+			// Option 1 will be removed in later SystemML versions.
+			// Option 2 is useful for two reasons:
+			// - developer testing of different BLAS implementations
+			// - fine-grained control: certain machines could use mkl while others use openblas, etc.
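+			// Example: setting SYSTEMML_BLAS=openblas before launching SystemML forces OpenBLAS,
+			// while SYSTEMML_BLAS=none disables native BLAS entirely.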
+			boolean enabledViaConfig = (dmlConfig == null) ? true : dmlConfig.getBooleanValue(DMLConfig.NATIVE_BLAS);
+			boolean enabledViaEnvironmentVariable = userSpecifiedBLAS.equals("") || userSpecifiedBLAS.equals("mkl") || userSpecifiedBLAS.equals("openblas");
+			
+			if(enabledViaConfig && enabledViaEnvironmentVariable) {
+				long start = System.nanoTime();
+				if(!supportedArchitectures.containsKey(SystemUtils.OS_ARCH)) {
+					LOG.warn("Unsupported architecture for native BLAS:" + SystemUtils.OS_ARCH);
+					return;
+				}
+	    	synchronized(NativeHelper.class) {
+	    		if(!attemptedLoading) {
+	    			// =============================================================================
+	    			// By default, native.blas=true and we attempt to load MKL first.
+	    			// If MKL cannot be loaded, we try to load OpenBLAS.
+	    			// If neither MKL nor OpenBLAS is available, we fall back to the Java implementation.
+	    			if(userSpecifiedBLAS.equalsIgnoreCase("")) {
+	    				blasType = isMKLAvailable() ? "mkl" : isOpenBLASAvailable() ? "openblas" : null;
+	    				if(blasType == null)
+	    					LOG.warn("Unable to load either MKL or OpenBLAS");
+	    			}
+	    			else if(userSpecifiedBLAS.equalsIgnoreCase("mkl")) {
+	    				blasType = isMKLAvailable() ? "mkl" : null;
+	    				if(blasType == null)
+	    					LOG.warn("Unable to load MKL");
+	    			}
+	    			else if(userSpecifiedBLAS.equalsIgnoreCase("openblas")) {
+	    				blasType = isOpenBLASAvailable() ? "openblas" : null;
+	    				if(blasType == null)
+	    					LOG.warn("Unable to load OpenBLAS");
+	    			}
+	    			else {
+	    				LOG.warn("Unsupported BLAS:" + userSpecifiedBLAS);
+	    			}
+	    			// =============================================================================
+				    if(blasType != null && loadLibraryHelper("libsystemml_" + blasType + "-Linux-x86_64.so")) {
+							LOG.info("Using native blas: " + blasType);
+							isSystemMLLoaded = true;
+						}
+	    		}
+	    	}
+	    	double timeToLoadInMilliseconds = (System.nanoTime()-start)*1e-6;
+	    	if(timeToLoadInMilliseconds > 100) 
+	    		LOG.warn("Time to load native blas: " + timeToLoadInMilliseconds + " milliseconds.");
+			}
+			else {
+				if(enabledViaConfig)
+					LOG.warn("Using internal Java BLAS as native BLAS support is disabled by the environment variable 'SYSTEMML_BLAS=" + userSpecifiedBLAS + "'.");
+				else
+					LOG.warn("Using internal Java BLAS as native BLAS support is disabled by the configuration 'native.blas'.");
+			}
+			attemptedLoading = true;
+		}
+	}
+	
+	public static boolean isNativeLibraryLoaded() {
+		init();
+		if(maxNumThreads == -1)
+			maxNumThreads = OptimizerUtils.getConstrainedNumThreads(-1);
+		if(isSystemMLLoaded && !setMaxNumThreads && maxNumThreads != -1) {
+			// The native setMaxNumThreads method helps us decide whether to use GetPrimitiveArrayCritical
+			// or GetDoubleArrayElements in JNI, as each has different tradeoffs. In the current
+			// implementation, we always use GetPrimitiveArrayCritical as it has proven to be the fastest.
+			// We may revisit this decision later, hence we do not recommend removing this method.
+			setMaxNumThreads(maxNumThreads);
+			setMaxNumThreads = true;
+		}
+		return isSystemMLLoaded;
+	}
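+	
+	// Typical call-site guard (a sketch, following LibMatrixNative):
+	//   if(NativeHelper.isNativeLibraryLoaded() && !m1.isInSparseFormat() && !m2.isInSparseFormat())
+	//     ... invoke the native kernel, with a fallback to the Java implementation otherwise ...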
+	
+	public static int getMaxNumThreads() {
+		if(maxNumThreads == -1)
+			maxNumThreads = OptimizerUtils.getConstrainedNumThreads(-1);
+		return maxNumThreads;
+	}
+	
+	
+	private static boolean isMKLAvailable() {
+		// ------------------------------------------------------------
+		// Set environment variable MKL_THREADING_LAYER to GNU on Linux for performance
+		if(!loadLibraryHelper("libpreload_systemml-Linux-x86_64.so")) {
+			LOG.warn("Unable to load preload_systemml (required for loading MKL-enabled SystemML library)");
+			return false;
+		}
+		// In our investigation, the most reliable way to ensure that MKL runs smoothly with OpenMP
+		// (used by conv2d*) is to set the environment variable MKL_THREADING_LAYER to GNU.
+		EnvironmentHelper.setEnv("MKL_THREADING_LAYER", "GNU");
+		if(!loadBLAS("gomp", "gomp required for loading MKL-enabled SystemML library")) 
+			return false;
+		
+		// ------------------------------------------------------------
+		return loadBLAS("mkl_rt", null);
+	}
+	
+	private static boolean isOpenBLASAvailable() {
+		if(!loadBLAS("gomp", "gomp required for loading OpenBLAS-enabled SystemML library")) 
+			return false;
+		return loadBLAS("openblas", null);
+	}
+	
+	private static boolean loadBLAS(String blas, String optionalMsg) {
+		try {
+			System.loadLibrary(blas);
+			return true;
+		}
+		catch (UnsatisfiedLinkError e) {
+			if(optionalMsg != null)
+				LOG.warn("Unable to load " + blas + " (" + optionalMsg + "): " + e.getMessage());
+			else
+				LOG.warn("Unable to load " + blas + ": " + e.getMessage());
+			return false;
+		}
+	}
+
+	private static boolean loadLibraryHelper(String path)  {
+		InputStream in = null; OutputStream out = null;
+		try {
+			// This logic is needed because Java does not allow loading a native library directly from
+			// a jar resource; we therefore copy it to a temporary file and load it from there.
+			in = NativeHelper.class.getResourceAsStream("/lib/"+path);
+			if(in != null) {
+				File temp = File.createTempFile(path, "");
+				temp.deleteOnExit();
+				out = FileUtils.openOutputStream(temp);
+				IOUtils.copy(in, out);
+				in.close(); in = null;
+				out.close(); out = null;
+				System.load(temp.getAbsolutePath());
+				return true;
+			}
+			else
+				LOG.warn("No lib available in the jar:" + path);
+		} catch(IOException e) {
+			LOG.warn("Unable to load library " + path + " from resource:" + e.getMessage());
+		} finally {
+			if(out != null)
+				try {
+					out.close();
+				} catch (IOException e) {}
+			if(in != null)
+				try {
+					in.close();
+				} catch (IOException e) {}
+		}
+		return false;
+	}
+	
+	// TODO: Add pmm, wsloss, mmchain, etc.
+	public static native boolean matrixMultDenseDense(double [] m1, double [] m2, double [] ret, int m1rlen, int m1clen, int m2clen, int numThreads);
+	private static native boolean tsmm(double [] m1, double [] ret, int m1rlen, int m1clen, boolean isLeftTranspose, int numThreads);
+	
+	// ----------------------------------------------------------------------------------------------------------------
+	// LibMatrixDNN operations:
+	// N = number of images, C = number of channels, H = image height, W = image width
+	// K = number of filters, R = filter height, S = filter width
+	// TODO: unhandled case: sparse filters (executed in Java only). Since filters are relatively small, this is a low priority.
+	
+	// Called by ConvolutionCPInstruction if both input and filter are dense
+	public static native boolean conv2dDense(double [] input, double [] filter, double [] ret, int N, int C, int H, int W, 
+			int K, int R, int S, int stride_h, int stride_w, int pad_h, int pad_w, int P, int Q, int numThreads);
+	public static native boolean conv2dBiasAddDense(double [] input, double [] bias, double [] filter, double [] ret, int N, int C, int H, int W, 
+			int K, int R, int S, int stride_h, int stride_w, int pad_h, int pad_w, int P, int Q, int numThreads);
+	// Called by ConvolutionCPInstruction if both input and filter are dense
+	public static native boolean conv2dBackwardFilterDense(double [] input, double [] dout, double [] ret, int N, int C, int H, int W, 
+			int K, int R, int S, int stride_h, int stride_w, int pad_h, int pad_w, int P, int Q, int numThreads);
+	// Called by ConvolutionCPInstruction if both filter and dout are dense;
+	// otherwise called by LibMatrixDNN's threads when the filter is dense (dout[n] is converted to dense if sparse).
+	public static native boolean conv2dBackwardDataDense(double [] filter, double [] dout, double [] ret, int N, int C, int H, int W, 
+			int K, int R, int S, int stride_h, int stride_w, int pad_h, int pad_w, int P, int Q, int numThreads);
+	
+	// Currently only supported with numThreads = 1 and sparse input
+	// Called by LibMatrixDNN's thread if input is sparse. dout[n] is converted to dense if sparse.
+	public static native boolean conv2dBackwardFilterSparseDense(int apos, int alen, int[] aix, double[] avals, double [] rotatedDoutPtr, double [] ret, int N, int C, int H, int W, 
+			int K, int R, int S, int stride_h, int stride_w, int pad_h, int pad_w, int P, int Q, int numThreads);
+	// Called by LibMatrixDNN's thread if input is sparse and filter is dense
+	public static native boolean conv2dSparse(int apos, int alen, int[] aix, double[] avals, double [] filter, double [] ret, int N, int C, int H, int W, 
+			int K, int R, int S, int stride_h, int stride_w, int pad_h, int pad_w, int P, int Q, int numThreads);
+	// ----------------------------------------------------------------------------------------------------------------
+	
+	// This method helps us decide whether to use GetPrimitiveArrayCritical or GetDoubleArrayElements in JNI,
+	// as each has different tradeoffs. In the current implementation, we always use GetPrimitiveArrayCritical
+	// as it has proven to be the fastest. We may revisit this decision later, hence we do not recommend
+	// removing this method.
+	private static native void setMaxNumThreads(int numThreads);
+}

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/39a37ae4/src/main/java/org/apache/sysml/utils/Statistics.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/utils/Statistics.java b/src/main/java/org/apache/sysml/utils/Statistics.java
index 097f58e..97888cb 100644
--- a/src/main/java/org/apache/sysml/utils/Statistics.java
+++ b/src/main/java/org/apache/sysml/utils/Statistics.java
@@ -112,6 +112,17 @@ public class Statistics
 		return numExecutedMRJobs.longValue();
 	}
 	
+	private static LongAdder numNativeFailures = new LongAdder();
+	public static LongAdder numNativeLibMatrixMultCalls = new LongAdder();
+	public static LongAdder numNativeLibMatrixDNNCalls = new LongAdder();
+	public static void incrementNativeFailuresCounter() {
+		numNativeFailures.increment();
+		// This is very rare and may not be possible at all; our initial experiments never encountered this case.
+		// Note: all native calls have a fallback to Java, so users who prefer the fallback can recompile
+		// SystemML with this exception commented out and everything should work fine.
+		throw new RuntimeException("Unexpected ERROR: OOM caused during JNI transfer. Please disable native BLAS by setting the environment variable SYSTEMML_BLAS=none");
+	}
+	
 	public static void incrementNoOfExecutedMRJobs() {
 		numExecutedMRJobs.increment();
 	}
@@ -366,6 +377,9 @@ public class Statistics
 		resetCPHeavyHitters();
 
 		GPUStatistics.reset();
+		numNativeLibMatrixMultCalls.reset();
+		numNativeLibMatrixDNNCalls.reset();
+		numNativeFailures.reset();
 		LibMatrixDNN.resetStatistics();
 	}
 
@@ -621,6 +635,11 @@ public class Statistics
 		//show extended caching/compilation statistics
 		if( DMLScript.STATISTICS ) 
 		{
+			if(NativeHelper.blasType != null && (numNativeLibMatrixMultCalls.longValue() > 0 || 
+					numNativeLibMatrixDNNCalls.longValue() > 0)) {
+				String blas = NativeHelper.blasType; // guaranteed non-null by the enclosing check
+				sb.append("Native " + blas + " calls (LibMatrixMult/LibMatrixDNN):\t" + numNativeLibMatrixMultCalls.longValue()  + "/" + numNativeLibMatrixDNNCalls.longValue() + ".\n");
+			}
 			sb.append("Cache hits (Mem, WB, FS, HDFS):\t" + CacheStatistics.displayHits() + ".\n");
 			sb.append("Cache writes (WB, FS, HDFS):\t" + CacheStatistics.displayWrites() + ".\n");
 			sb.append("Cache times (ACQr/m, RLS, EXP):\t" + CacheStatistics.displayTime() + " sec.\n");