Posted to commits@systemml.apache.org by ni...@apache.org on 2017/04/30 18:16:56 UTC

[1/3] incubator-systemml git commit: [SYSTEMML-769] Adding support for native BLAS for Linux

Repository: incubator-systemml
Updated Branches:
  refs/heads/master 4b6d468bb -> 39a37ae40
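
At its core this change adds a runtime switch between the pure-Java kernels in LibMatrixDNN and BLAS-backed kernels in LibMatrixNative. Every call site in the diffs below gates on the same two conditions; a minimal sketch of that gate (the wrapper class is hypothetical, the imports and calls are the ones used in the diffs):

    import org.apache.sysml.conf.ConfigurationManager;
    import org.apache.sysml.conf.DMLConfig;
    import org.apache.sysml.utils.NativeHelper;

    // Hypothetical helper illustrating the gating pattern used in this commit:
    // native BLAS is used only if it is enabled in the DML config AND the
    // shared library was actually loaded at startup.
    public final class NativeGate {
        public static boolean useNativeBlas() {
            return ConfigurationManager.getDMLConfig()
                       .getBooleanValue(DMLConfig.NATIVE_BLAS)
                && NativeHelper.isNativeLibraryLoaded();
        }
    }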


http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/39a37ae4/src/test/java/org/apache/sysml/test/integration/AutomatedTestBase.java
----------------------------------------------------------------------
diff --git a/src/test/java/org/apache/sysml/test/integration/AutomatedTestBase.java b/src/test/java/org/apache/sysml/test/integration/AutomatedTestBase.java
index 9bf227d..48a5bb2 100644
--- a/src/test/java/org/apache/sysml/test/integration/AutomatedTestBase.java
+++ b/src/test/java/org/apache/sysml/test/integration/AutomatedTestBase.java
@@ -129,7 +129,10 @@ public abstract class AutomatedTestBase
 			}
 			else {
 				System.setProperty("java.library.path", cwd + File.separator
-					+ "\\src\\test\\config\\hadoop_bin_windows\\bin");
+					+ "\\src\\test\\config\\hadoop_bin_windows\\bin"
+					// For testing BLAS on Windows 
+					// + File.pathSeparator + "C:\\Program Files (x86)\\IntelSWTools\\compilers_and_libraries_2017.0.109\\windows\\redist\\intel64_win\\mkl"
+						);
 			}
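
One caveat worth noting on the hunk above: the JVM samples java.library.path at startup, so System.setProperty only helps code paths that re-read the property afterwards, which this test harness appears to rely on for the Hadoop binaries and, per the commented-out line, for MKL. A standalone sketch of appending a native-library directory (the MKL path below is a hypothetical placeholder):

    import java.io.File;

    public class NativeLibPathSetup {
        public static void main(String[] args) {
            String existing = System.getProperty("java.library.path", "");
            // Hypothetical example path; substitute the actual MKL redist directory.
            String mklDir = "C:\\path\\to\\mkl\\redist\\intel64_win";
            // Append rather than overwrite, so earlier entries (e.g. hadoop_bin_windows) survive.
            System.setProperty("java.library.path",
                    existing + File.pathSeparator + mklDir);
            System.out.println(System.getProperty("java.library.path"));
        }
    }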
 			
 

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/39a37ae4/src/test/java/org/apache/sysml/test/integration/functions/tensor/Conv2DTest.java
----------------------------------------------------------------------
diff --git a/src/test/java/org/apache/sysml/test/integration/functions/tensor/Conv2DTest.java b/src/test/java/org/apache/sysml/test/integration/functions/tensor/Conv2DTest.java
index e5528d2..0a6bbb1 100644
--- a/src/test/java/org/apache/sysml/test/integration/functions/tensor/Conv2DTest.java
+++ b/src/test/java/org/apache/sysml/test/integration/functions/tensor/Conv2DTest.java
@@ -96,37 +96,23 @@ public class Conv2DTest extends AutomatedTestBase
 	@Test
 	public void testConv2DSparse1() 
 	{
-		int numImg = 5; int imgSize = 3; int numChannels = 3; int numFilters = 6; int filterSize = 2; int stride = 1; int pad = 0;
-		runConv2DTest(ExecType.CP, imgSize, numImg, numChannels, numFilters, filterSize, stride, pad, true, false);
-	}
-	
-	@Test
-	public void testConv2DSparse2() 
-	{
 		int numImg = 1; int imgSize = 10; int numChannels = 4; int numFilters = 3; int filterSize = 4; int stride = 2; int pad = 0;
 		runConv2DTest(ExecType.CP, imgSize, numImg, numChannels, numFilters, filterSize, stride, pad, false, true);
 	}
 	
 	@Test
-	public void testConv2DSparse3() 
+	public void testConv2DSparse2() 
 	{
 		int numImg = 1; int imgSize = 10; int numChannels = 4; int numFilters = 3; int filterSize = 4; int stride = 2; int pad = 1;
 		runConv2DTest(ExecType.CP, imgSize, numImg, numChannels, numFilters, filterSize, stride, pad, true, false);
 	}
 	
-	public void testConv2DSparse4() 
+	public void testConv2DSparse3() 
 	{
 		int numImg = 3; int imgSize = 10; int numChannels = 1; int numFilters = 3; int filterSize = 2; int stride = 2; int pad = 1;
 		runConv2DTest(ExecType.CP, imgSize, numImg, numChannels, numFilters, filterSize, stride, pad, false, true);
 	}
 	
-	@Test
-	public void testConv2DSparse5() 
-	{
-		int numImg = 3; int imgSize = 8; int numChannels = 2; int numFilters = 3; int filterSize = 3; int stride = 1; int pad = 2;
-		runConv2DTest(ExecType.CP, imgSize, numImg, numChannels, numFilters, filterSize, stride, pad, true, false);
-	}
-	
 	// --------------------------------------------
 	
 
@@ -206,13 +192,6 @@ public class Conv2DTest extends AutomatedTestBase
 		runConv2DTest(ExecType.SPARK, imgSize, numImg, numChannels, numFilters, filterSize, stride, pad, true, true);
 	}
 	
-	@Test
-	public void testConv2DSparse5SP() 
-	{
-		int numImg = 3; int imgSize = 8; int numChannels = 2; int numFilters = 3; int filterSize = 3; int stride = 1; int pad = 2;
-		runConv2DTest(ExecType.SPARK, imgSize, numImg, numChannels, numFilters, filterSize, stride, pad, true, true);
-	}
-	
 	/**
 	 * 
 	 * @param et
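
For reference, the P and Q values these tests exercise are the output height and width of the convolution. Assuming the usual convolution arithmetic, which appears to be what ConvolutionUtils.getP/getQ compute in the runtime diffs below, a quick sketch:

    // Output height/width of conv2d or pooling under the usual definition:
    // P = floor((H + 2*pad - R) / stride) + 1, and analogously for Q.
    public class ConvDims {
        static int outDim(int in, int filter, int stride, int pad) {
            return (in + 2 * pad - filter) / stride + 1; // integer division = floor for non-negative values
        }
        public static void main(String[] args) {
            // e.g. testConv2DSparse1 above: imgSize=10, filterSize=4, stride=2, pad=0 -> P = Q = 4
            System.out.println(outDim(10, 4, 2, 0));
        }
    }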

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/39a37ae4/src/test/scripts/functions/tensor/Conv2DBackwardDataTest.R
----------------------------------------------------------------------
diff --git a/src/test/scripts/functions/tensor/Conv2DBackwardDataTest.R b/src/test/scripts/functions/tensor/Conv2DBackwardDataTest.R
index a251f7a..3a75b3f 100644
--- a/src/test/scripts/functions/tensor/Conv2DBackwardDataTest.R
+++ b/src/test/scripts/functions/tensor/Conv2DBackwardDataTest.R
@@ -37,13 +37,15 @@ dout=matrix(seq(1, numImg*numFilters*P*Q), numImg, numFilters*P*Q, byrow=TRUE)
 if(as.logical(args[11])) {
 	zero_mask = (w - mean(w)) > 0 
 	w = w * zero_mask
+} else {
+	w = w - mean(w)
 }
 if(as.logical(args[12])) {
 	zero_mask = (dout - mean(dout)) > 0 
 	dout = dout * zero_mask
+} else {
+	dout = dout - mean(dout)
 }
-w = w - mean(w)
-dout = dout - mean(dout)
 col2im <- function(img_cols, C, Hin, Win, Hf, Wf,
                   strideh, stridew, reduction) {
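
The pattern in this hunk is how the test scripts toggle between sparse and dense inputs: zero-masking everything at or below the mean leaves a mostly-zero, non-negative matrix, while the else branch mean-centers so the data stays dense with mixed signs. A rough standalone Java equivalent of the two branches, for illustration:

    public class SparseDenseToggle {
        // Sketch of the sparse/dense toggle used by the R/DML test scripts above.
        static double[] prepare(double[] x, boolean sparse) {
            double mean = 0;
            for (double v : x) mean += v;
            mean /= x.length;
            double[] out = new double[x.length];
            for (int i = 0; i < x.length; i++) {
                if (sparse)
                    out[i] = (x[i] - mean) > 0 ? x[i] : 0; // zero-mask at/below the mean -> roughly half zeros
                else
                    out[i] = x[i] - mean;                  // mean-center -> dense with mixed signs
            }
            return out;
        }
        public static void main(String[] args) {
            double[] w = {1, 2, 3, 4, 5, 6};
            System.out.println(java.util.Arrays.toString(prepare(w, true)));  // [0, 0, 0, 4, 5, 6]
            System.out.println(java.util.Arrays.toString(prepare(w, false))); // [-2.5, ..., 2.5]
        }
    }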
 

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/39a37ae4/src/test/scripts/functions/tensor/Conv2DBackwardDataTest.dml
----------------------------------------------------------------------
diff --git a/src/test/scripts/functions/tensor/Conv2DBackwardDataTest.dml b/src/test/scripts/functions/tensor/Conv2DBackwardDataTest.dml
index c10ac37..6630ca7 100644
--- a/src/test/scripts/functions/tensor/Conv2DBackwardDataTest.dml
+++ b/src/test/scripts/functions/tensor/Conv2DBackwardDataTest.dml
@@ -36,11 +36,15 @@ if($11) {
 	zero_mask = (w - mean(w)) > 0 
 	w = w * zero_mask
 }
+else {
+	w = w - mean(w)
+}
 if($12) {
 	zero_mask = (dout - mean(dout)) > 0 
 	dout = dout * zero_mask
 }
-w = w - mean(w)
-dout = dout - mean(dout)
+else {
+	dout = dout - mean(dout)
+}
 dx = conv2d_backward_data(w, dout, stride=[stride, stride], padding=[pad, pad], input_shape=[numImg, numChannels, imgSize, imgSize], filter_shape=[numFilters, numChannels, filterSize, filterSize])
 write(dx, $10, format="text")
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/39a37ae4/src/test/scripts/functions/tensor/Conv2DBackwardTest.R
----------------------------------------------------------------------
diff --git a/src/test/scripts/functions/tensor/Conv2DBackwardTest.R b/src/test/scripts/functions/tensor/Conv2DBackwardTest.R
index a6bbdca..7319da7 100644
--- a/src/test/scripts/functions/tensor/Conv2DBackwardTest.R
+++ b/src/test/scripts/functions/tensor/Conv2DBackwardTest.R
@@ -37,13 +37,15 @@ dout=matrix(seq(1, numImg*numFilters*P*Q), numImg, numFilters*P*Q, byrow=TRUE)
 if(as.logical(args[11])) {
 	zero_mask = (x - mean(x)) > 0 
 	x = x * zero_mask
+} else {
+	x = x - mean(x)
 }
 if(as.logical(args[12])) {
 	zero_mask = (dout - mean(dout)) > 0 
 	dout = dout * zero_mask
+} else {
+	dout = dout - mean(dout)
 }
-x = x - mean(x)
-dout = dout - mean(dout)
 pad_image <- function(img, Hin, Win, padh, padw){
   C = nrow(img)
   img_padded = matrix(0, C, (Hin+2*padh)*(Win+2*padw))  # zeros

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/39a37ae4/src/test/scripts/functions/tensor/Conv2DBackwardTest.dml
----------------------------------------------------------------------
diff --git a/src/test/scripts/functions/tensor/Conv2DBackwardTest.dml b/src/test/scripts/functions/tensor/Conv2DBackwardTest.dml
index c98e52b..fb14c1c 100644
--- a/src/test/scripts/functions/tensor/Conv2DBackwardTest.dml
+++ b/src/test/scripts/functions/tensor/Conv2DBackwardTest.dml
@@ -36,11 +36,15 @@ if($11) {
 	zero_mask = (x - mean(x)) > 0 
 	x = x * zero_mask
 }
+else {
+	x = x - mean(x)
+}
 if($12) {
 	zero_mask = (dout - mean(dout)) > 0 
 	dout = dout * zero_mask
 }
-x = x - mean(x)
-dout = dout - mean(dout)
+else {
+	dout = dout - mean(dout)
+}
 dw = conv2d_backward_filter(x, dout, stride=[stride, stride], padding=[pad, pad], input_shape=[numImg, numChannels, imgSize, imgSize], filter_shape=[numFilters, numChannels, filterSize, filterSize])
 write(dw, $10, format="text")
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/39a37ae4/src/test/scripts/functions/tensor/Conv2DTest.R
----------------------------------------------------------------------
diff --git a/src/test/scripts/functions/tensor/Conv2DTest.R b/src/test/scripts/functions/tensor/Conv2DTest.R
index bec1ed7..6c6641f 100644
--- a/src/test/scripts/functions/tensor/Conv2DTest.R
+++ b/src/test/scripts/functions/tensor/Conv2DTest.R
@@ -35,13 +35,15 @@ w=matrix(seq(1, numFilters*numChannels*filterSize*filterSize), numFilters, numCh
 if(as.logical(args[9])) {
 	zero_mask = (x - mean(x)) > 0 
 	x = x * zero_mask
+} else {
+	x = x - mean(x)
 }
 if(as.logical(args[10])) {
 	zero_mask = (w - mean(w)) > 0 
 	w = w * zero_mask
+} else {
+	w = w - mean(w)
 }
-x = x - mean(x)
-w = w - mean(w)
 pad_image <- function(img, Hin, Win, padh, padw){
   C = nrow(img)
   img_padded = matrix(0, C, (Hin+2*padh)*(Win+2*padw), byrow=TRUE)  # zeros

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/39a37ae4/src/test/scripts/functions/tensor/Conv2DTest.dml
----------------------------------------------------------------------
diff --git a/src/test/scripts/functions/tensor/Conv2DTest.dml b/src/test/scripts/functions/tensor/Conv2DTest.dml
index 792367f..2eedca8 100644
--- a/src/test/scripts/functions/tensor/Conv2DTest.dml
+++ b/src/test/scripts/functions/tensor/Conv2DTest.dml
@@ -35,12 +35,16 @@ if($9) {
 	zero_mask = (x - mean(x)) > 0 
 	x = x * zero_mask
 }
+else {
+	x = x - mean(x)
+}
 if($10) {
 	zero_mask = (w - mean(w)) > 0 
 	w = w * zero_mask
 }
-x = x - mean(x)
-w = w - mean(w)
+else {
+	w = w - mean(w)
+}
 output = conv2d(x, w, padding=[pad, pad], stride=[stride, stride], input_shape=[numImg, numChannels, imgSize, imgSize], filter_shape=[numFilters, numChannels, filterSize, filterSize], bias=b)
 output = bias_add(output, b) 
 write(output, $8, format="text")
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/39a37ae4/src/test/scripts/functions/tensor/PoolTest.R
----------------------------------------------------------------------
diff --git a/src/test/scripts/functions/tensor/PoolTest.R b/src/test/scripts/functions/tensor/PoolTest.R
index d9c8d0c..a34e0b0 100644
--- a/src/test/scripts/functions/tensor/PoolTest.R
+++ b/src/test/scripts/functions/tensor/PoolTest.R
@@ -34,8 +34,9 @@ x=matrix(seq(1, numImg*numChannels*imgSize*imgSize), numImg, numChannels*imgSize
 if(as.logical(args[9])) {
 	zero_mask = (x - mean(x)) > 0 
 	x = x * zero_mask
+} else {
+	x = x - mean(x)
 }
-x = x - mean(x)
 pad_image <- function(img, Hin, Win, padh, padw){
   C = nrow(img)
   img_padded = matrix(0, C, (Hin+2*padh)*(Win+2*padw))  # zeros

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/39a37ae4/src/test/scripts/functions/tensor/PoolTest.dml
----------------------------------------------------------------------
diff --git a/src/test/scripts/functions/tensor/PoolTest.dml b/src/test/scripts/functions/tensor/PoolTest.dml
index b701e71..cc8132f 100644
--- a/src/test/scripts/functions/tensor/PoolTest.dml
+++ b/src/test/scripts/functions/tensor/PoolTest.dml
@@ -33,7 +33,9 @@ if($10) {
 	zero_mask = (x - mean(x)) > 0 
 	x = x * zero_mask
 }
-x = x - mean(x)
+else {
+	x = x - mean(x)
+}
 if(poolMode == "max") {
 	output = max_pool(x, stride=[stride, stride], padding=[pad, pad], input_shape=[numImg, numChannels, imgSize, imgSize], pool_size=[poolSize1, poolSize2])
 }


[2/3] incubator-systemml git commit: [SYSTEMML-769] Adding support for native BLAS for Linux

Posted by ni...@apache.org.
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/39a37ae4/src/main/java/org/apache/sysml/runtime/instructions/cp/ConvolutionCPInstruction.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/instructions/cp/ConvolutionCPInstruction.java b/src/main/java/org/apache/sysml/runtime/instructions/cp/ConvolutionCPInstruction.java
index f839990..c794932 100644
--- a/src/main/java/org/apache/sysml/runtime/instructions/cp/ConvolutionCPInstruction.java
+++ b/src/main/java/org/apache/sysml/runtime/instructions/cp/ConvolutionCPInstruction.java
@@ -21,15 +21,20 @@ package org.apache.sysml.runtime.instructions.cp;
 
 import java.util.ArrayList;
 import java.util.Arrays;
+
+import org.apache.sysml.conf.ConfigurationManager;
+import org.apache.sysml.conf.DMLConfig;
 import org.apache.sysml.runtime.DMLRuntimeException;
 import org.apache.sysml.runtime.controlprogram.context.ExecutionContext;
 import org.apache.sysml.runtime.functionobjects.SwapIndex;
 import org.apache.sysml.runtime.instructions.InstructionUtils;
 import org.apache.sysml.runtime.matrix.data.ConvolutionParameters;
 import org.apache.sysml.runtime.matrix.data.LibMatrixDNN;
+import org.apache.sysml.runtime.matrix.data.LibMatrixNative;
 import org.apache.sysml.runtime.matrix.data.MatrixBlock;
 import org.apache.sysml.runtime.matrix.operators.ReorgOperator;
 import org.apache.sysml.runtime.util.ConvolutionUtils;
+import org.apache.sysml.utils.NativeHelper;
 
 public class ConvolutionCPInstruction extends UnaryCPInstruction 
 {	
@@ -131,7 +136,7 @@ public class ConvolutionCPInstruction extends UnaryCPInstruction
 			return new ConvolutionCPInstruction(in, out, opcode, str, stride,
 					padding, input_shape, filter_shape, k);
 		} 
-		else if (opcode.equalsIgnoreCase("maxpooling_backward")
+		else if (opcode.equalsIgnoreCase("maxpooling_backward") || opcode.equalsIgnoreCase("relu_maxpooling_backward")
 				|| opcode.equalsIgnoreCase("conv2d")
 				|| opcode.equalsIgnoreCase("conv2d_backward_filter")
 				|| opcode.equalsIgnoreCase("conv2d_backward_data")) {
@@ -288,6 +293,17 @@ public class ConvolutionCPInstruction extends UnaryCPInstruction
 		ec.setMatrixOutput(getOutputVariableName(), outputBlock);
 	}
 	
+	// Assumption: enableNative && NativeHelper.isNativeLibraryLoaded() is true
+	// This increases the number of native calls. For example: the cases where the filter is sparse but the input is dense
+	private boolean isFilterSparse(MatrixBlock filter) throws DMLRuntimeException {
+		long numElems = ((long) filter.getNumRows()) * filter.getNumColumns();
+		// if the filter is less than ~80 MB in dense format, i.e. fewer than 10 million elements (which handles almost all the cases).
+		// In fact, a threshold of 1 MB is still sufficient for common CNNs.
+		if(filter.isInSparseFormat() && numElems < 10e+6)
+			filter.sparseToDense(); 
+		return filter.isInSparseFormat();
+	}
+	
 	
 	@Override
 	public void processInstruction(ExecutionContext ec)
@@ -326,6 +342,7 @@ public class ConvolutionCPInstruction extends UnaryCPInstruction
 		int Q = (int) ConvolutionUtils.getQ(W, S, stride_w, pad_w);
 		
 		ConvolutionParameters params = new ConvolutionParameters(N, C, H, W, K, R, S, stride_h, stride_w, pad_h, pad_w, _numThreads);
+		params.enableNative = ConfigurationManager.getDMLConfig().getBooleanValue(DMLConfig.NATIVE_BLAS) && NativeHelper.isNativeLibraryLoaded();
 		if (instOpcode.equalsIgnoreCase("maxpooling") || instOpcode.equalsIgnoreCase("relu_maxpooling")) {
 			if(matBlock.isEmptyBlock()) {
 				outputBlock = new MatrixBlock(N, C*P*Q, true);
@@ -337,14 +354,17 @@ public class ConvolutionCPInstruction extends UnaryCPInstruction
 				LibMatrixDNN.maxpooling(matBlock, outputBlock, params);
 			}
 		}
-		else if (instOpcode.equalsIgnoreCase("maxpooling_backward")) {
+		else if (instOpcode.equalsIgnoreCase("maxpooling_backward") || instOpcode.equalsIgnoreCase("relu_maxpooling_backward")) {
 			MatrixBlock dout = ec.getMatrixInput(_in2.getName());
 			if(matBlock.isEmptyBlock() || dout.isEmptyBlock()) {
 				outputBlock = new MatrixBlock(N, C*H*W, true);
 			}
 			else {
 				outputBlock = getDenseOutputBlock(N, C*H*W);
-				LibMatrixDNN.maxpoolingBackward(matBlock, dout, outputBlock, params);
+				if(instOpcode.equalsIgnoreCase("maxpooling_backward"))
+					LibMatrixDNN.maxpoolingBackward(matBlock, dout, outputBlock, params, false);
+				else
+					LibMatrixDNN.maxpoolingBackward(matBlock, dout, outputBlock, params, true);
 			}
 			ec.releaseMatrixInput(_in2.getName());
 		}
@@ -355,7 +375,10 @@ public class ConvolutionCPInstruction extends UnaryCPInstruction
 			}
 			else {
 				outputBlock = getDenseOutputBlock(N, K*P*Q);
-				LibMatrixDNN.conv2d(matBlock, filter, outputBlock, params);
+				if(params.enableNative && !isFilterSparse(filter) && !matBlock.isInSparseFormat())
+					LibMatrixNative.conv2d(matBlock, filter, outputBlock, params);
+				else
+					LibMatrixDNN.conv2d(matBlock, filter, outputBlock, params);
 			}
 			ec.releaseMatrixInput(_in2.getName());
 		}
@@ -367,9 +390,13 @@ public class ConvolutionCPInstruction extends UnaryCPInstruction
 			}
 			else {
 				outputBlock = getDenseOutputBlock(N, K*P*Q);
-				if(!bias.isEmptyBlock())
+				if(!bias.isEmptyBlock()) {
 					params.bias = bias;
-				LibMatrixDNN.conv2d(matBlock, filter, outputBlock, params);
+				}
+				if(params.enableNative && !isFilterSparse(filter) && !matBlock.isInSparseFormat())
+					LibMatrixNative.conv2d(matBlock, filter, outputBlock, params);
+				else
+					LibMatrixDNN.conv2d(matBlock, filter, outputBlock, params);
 			}
 			ec.releaseMatrixInput(_in3.getName());
 			ec.releaseMatrixInput(_in2.getName());
@@ -381,7 +408,10 @@ public class ConvolutionCPInstruction extends UnaryCPInstruction
 			}
 			else {
 				outputBlock = getDenseOutputBlock(K, C*R*S);
-				LibMatrixDNN.conv2dBackwardFilter(matBlock, dout, outputBlock, params);
+				if(params.enableNative && !matBlock.isInSparseFormat() && !dout.isInSparseFormat())
+					LibMatrixNative.conv2dBackwardFilter(matBlock, dout, outputBlock, params);
+				else
+					LibMatrixDNN.conv2dBackwardFilter(matBlock, dout, outputBlock, params);
 			}
 			ec.releaseMatrixInput(_in2.getName());
 		}
@@ -392,7 +422,10 @@ public class ConvolutionCPInstruction extends UnaryCPInstruction
 			}
 			else {
 				outputBlock = getDenseOutputBlock(N, C * H * W);
-				LibMatrixDNN.conv2dBackwardData(matBlock, dout, outputBlock, params);
+				if(params.enableNative && !isFilterSparse(matBlock) && !dout.isInSparseFormat())
+					LibMatrixNative.conv2dBackwardData(matBlock, dout, outputBlock, params);
+				else
+					LibMatrixDNN.conv2dBackwardData(matBlock, dout, outputBlock, params);
 			}
 			ec.releaseMatrixInput(_in2.getName());
 		}
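
For a sense of scale on the 10-million-element cutoff in isFilterSparse above: typical CNN filters are tiny in dense form, so they are essentially always densified before the native call. A quick back-of-the-envelope check (the layer shape is an illustrative assumption):

    // Dense size of a conv filter with K filters, C channels, R x S kernel.
    public class FilterSize {
        public static void main(String[] args) {
            int K = 64, C = 3, R = 3, S = 3;      // a common CNN layer shape
            long numElems = (long) K * C * R * S; // 1,728 elements
            long bytes = numElems * 8;            // ~13.5 KB as dense doubles
            System.out.println(numElems + " elements, " + bytes + " bytes");
            System.out.println("under threshold: " + (numElems < 10_000_000));
        }
    }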

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/39a37ae4/src/main/java/org/apache/sysml/runtime/instructions/gpu/ConvolutionGPUInstruction.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/instructions/gpu/ConvolutionGPUInstruction.java b/src/main/java/org/apache/sysml/runtime/instructions/gpu/ConvolutionGPUInstruction.java
index 497b182..9d4cd1f 100644
--- a/src/main/java/org/apache/sysml/runtime/instructions/gpu/ConvolutionGPUInstruction.java
+++ b/src/main/java/org/apache/sysml/runtime/instructions/gpu/ConvolutionGPUInstruction.java
@@ -141,7 +141,7 @@ public class ConvolutionGPUInstruction extends GPUInstruction
 			return new ConvolutionGPUInstruction(in1, in2, in3, out, opcode, str, stride,
 					padding, input_shape, filter_shape);
 		}
-		else if (opcode.equalsIgnoreCase("maxpooling") || opcode.equalsIgnoreCase("relu_maxpooling")) {
+		else if (opcode.equalsIgnoreCase("maxpooling")) {
 			InstructionUtils.checkNumFields(parts, 14);
 			CPOperand in1 = new CPOperand(parts[1]);
 			CPOperand out = new CPOperand(parts[14]);
@@ -303,7 +303,7 @@ public class ConvolutionGPUInstruction extends GPUInstruction
 			LibMatrixCUDA.conv2dBackwardData(ec.getGPUContext(), getExtendedOpcode(), filter, dout, out, N, C, H, W,
 					K, R, S, pad_h, pad_w, stride_h, stride_w, P, Q);
 		}
-		else if (instOpcode.equalsIgnoreCase("maxpooling") || instOpcode.equalsIgnoreCase("relu_maxpooling")) {
+		else if (instOpcode.equalsIgnoreCase("maxpooling")) {
 			MatrixObject image = getMatrixInputForGPUInstruction(ec, _input1.getName());
 
 			if(image.getNumRows() != N || image.getNumColumns() != C*H*W) 
@@ -315,9 +315,6 @@ public class ConvolutionGPUInstruction extends GPUInstruction
 			if(instOpcode.equalsIgnoreCase("maxpooling"))
 				LibMatrixCUDA.maxpooling(ec.getGPUContext(), getExtendedOpcode(), image, out, N, C, H, W,
 					K, R, S, pad_h, pad_w, stride_h, stride_w, P, Q);
-			else
-				LibMatrixCUDA.reluMaxpooling(ec.getGPUContext(), getExtendedOpcode(), image, out, N, C, H, W,
-						K, R, S, pad_h, pad_w, stride_h, stride_w, P, Q);
 		}
 		else if (instOpcode.equalsIgnoreCase("maxpooling_backward")) {
 			MatrixObject image = getMatrixInputForGPUInstruction(ec, _input1.getName());
@@ -341,7 +338,7 @@ public class ConvolutionGPUInstruction extends GPUInstruction
 		// release inputs/outputs
 		ec.releaseMatrixInputForGPUInstruction(_input1.getName());
 
-		if (!( instOpcode.equalsIgnoreCase("maxpooling") || instOpcode.equalsIgnoreCase("relu_maxpooling")) )
+		if ( !instOpcode.equalsIgnoreCase("maxpooling") )
 			ec.releaseMatrixInputForGPUInstruction(_input2.getName());
 
 		if (instOpcode.equalsIgnoreCase("conv2d_bias_add"))
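
Dropping the fused relu_maxpooling GPU kernel is consistent with ReLU being monotone non-decreasing, so it commutes with max: max_i relu(x_i) = relu(max_i x_i). The same observation underlies the new CP backward path in LibMatrixDNN below, where getMaxIndex simply clamps negative candidates to zero while scanning for the max. A tiny self-contained check:

    // relu(max(xs)) == max(relu(xs)) because relu is monotone non-decreasing.
    public class ReluMaxEquivalence {
        static double relu(double v) { return Math.max(0, v); }
        public static void main(String[] args) {
            double[] window = {-3.0, -1.5, 2.0, 0.5};
            double max = Double.NEGATIVE_INFINITY, maxOfRelu = Double.NEGATIVE_INFINITY;
            for (double v : window) {
                max = Math.max(max, v);
                maxOfRelu = Math.max(maxOfRelu, relu(v)); // clamp during the scan
            }
            System.out.println(relu(max) + " == " + maxOfRelu); // 2.0 == 2.0
        }
    }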

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/39a37ae4/src/main/java/org/apache/sysml/runtime/instructions/spark/ConvolutionSPInstruction.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/instructions/spark/ConvolutionSPInstruction.java b/src/main/java/org/apache/sysml/runtime/instructions/spark/ConvolutionSPInstruction.java
index b485233..f687cc0 100644
--- a/src/main/java/org/apache/sysml/runtime/instructions/spark/ConvolutionSPInstruction.java
+++ b/src/main/java/org/apache/sysml/runtime/instructions/spark/ConvolutionSPInstruction.java
@@ -25,6 +25,8 @@ import java.util.Iterator;
 import org.apache.spark.api.java.JavaPairRDD;
 import org.apache.spark.api.java.function.PairFlatMapFunction;
 import org.apache.spark.broadcast.Broadcast;
+import org.apache.sysml.conf.ConfigurationManager;
+import org.apache.sysml.conf.DMLConfig;
 import org.apache.sysml.parser.Expression.DataType;
 import org.apache.sysml.parser.Expression.ValueType;
 import org.apache.sysml.runtime.DMLRuntimeException;
@@ -41,6 +43,7 @@ import org.apache.sysml.runtime.matrix.MatrixFormatMetaData;
 import org.apache.sysml.runtime.matrix.data.ConvolutionParameters;
 import org.apache.sysml.runtime.matrix.data.InputInfo;
 import org.apache.sysml.runtime.matrix.data.LibMatrixDNN;
+import org.apache.sysml.runtime.matrix.data.LibMatrixNative;
 import org.apache.sysml.runtime.matrix.data.MatrixBlock;
 import org.apache.sysml.runtime.matrix.data.MatrixIndexes;
 import org.apache.sysml.runtime.matrix.data.OutputInfo;
@@ -281,7 +284,8 @@ public class ConvolutionSPInstruction extends UnarySPInstruction {
 			int Q = (int) ConvolutionUtils.getQ(W, S, stride_w, pad_w);
 			
 			ConvolutionParameters params = new ConvolutionParameters(numRowsPerBlock, C, H, W, K, R, S, stride_h, stride_w, pad_h, pad_w, 1);
-			JavaPairRDD<MatrixIndexes,MatrixBlock> out = inputRDD.mapPartitionsToPair(new RDDConv2dMapMMFunction(filterBroadcast, params, instOpcode, biasBroadcast, mcRdd.getRows()), true);
+			boolean enableNativeBLAS = ConfigurationManager.getDMLConfig().getBooleanValue(DMLConfig.NATIVE_BLAS); 
+			JavaPairRDD<MatrixIndexes,MatrixBlock> out = inputRDD.mapPartitionsToPair(new RDDConv2dMapMMFunction(filterBroadcast, params, instOpcode, biasBroadcast, mcRdd.getRows(), enableNativeBLAS), true);
 			
 			//put output RDD handle into symbol table
 			sec.setRDDHandleForVariable(output.getName(), out);
@@ -316,15 +320,16 @@ public class ConvolutionSPInstruction extends UnarySPInstruction {
 		Broadcast<MatrixBlock> filterBroadcast = null;
 		Broadcast<MatrixBlock> biasBroadcast = null;
 		ConvolutionParameters params = null;
-		String instOpcode = null;
+		String instOpcode = null;
+		boolean enableNative;
 		long numRows = 0;
 		public RDDConv2dMapMMFunction(Broadcast<MatrixBlock> filterBroadcast, 
-				ConvolutionParameters params, String instOpcode, Broadcast<MatrixBlock> biasBroadcast, long numRows) {
+				ConvolutionParameters params, String instOpcode, Broadcast<MatrixBlock> biasBroadcast, long numRows, boolean enableNativeBLAS) {
 			this.filterBroadcast = filterBroadcast;
 			this.params = params;
 			this.instOpcode = instOpcode;
 			this.biasBroadcast = biasBroadcast;
 			this.numRows = numRows;
+			this.enableNative = enableNativeBLAS;
 		}
 		
 		private MatrixBlock processRectangularBlock(MatrixBlock matBlock) throws Exception {
@@ -336,7 +341,10 @@ public class ConvolutionSPInstruction extends UnarySPInstruction {
 				}
 				else {
 					outputBlock = getDenseOutputBlock(params.N, params.K*params.P*params.Q);
-					LibMatrixDNN.conv2d(matBlock, filter, outputBlock, params);
+					if(enableNative)
+						LibMatrixNative.conv2d(matBlock, filter, outputBlock, params);
+					else
+						LibMatrixDNN.conv2d(matBlock, filter, outputBlock, params);
 				}
 			}
 			else if (instOpcode.equalsIgnoreCase("conv2d_bias_add")) {
@@ -349,7 +357,10 @@ public class ConvolutionSPInstruction extends UnarySPInstruction {
 					outputBlock = getDenseOutputBlock(params.N, params.K*params.P*params.Q);
 					if(!bias.isEmptyBlock())
 						params.bias = bias;
-					LibMatrixDNN.conv2d(matBlock, filter, outputBlock, params);
+					if(enableNative)
+						LibMatrixNative.conv2d(matBlock, filter, outputBlock, params);
+					else
+						LibMatrixDNN.conv2d(matBlock, filter, outputBlock, params);
 				}
 			}
 			else if(instOpcode.equalsIgnoreCase("maxpooling") || instOpcode.equalsIgnoreCase("relu_maxpooling")) {
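
The NATIVE_BLAS config value is read once on the driver and shipped to the executors as a plain boolean field of the Spark function, presumably because executor JVMs cannot be assumed to share the driver's ConfigurationManager state; only the serialized closure travels with the task. A stripped-down sketch of that capture pattern (the class below is a simplified stand-in for RDDConv2dMapMMFunction):

    import java.io.Serializable;

    // Sketch: evaluate a driver-side config once and ship the result to
    // executors as serialized state, rather than re-reading config there.
    public class Conv2dMapFunction implements Serializable {
        private static final long serialVersionUID = 1L;
        private final boolean enableNative; // decided on the driver

        public Conv2dMapFunction(boolean enableNativeBLAS) {
            this.enableNative = enableNativeBLAS;
        }

        public String dispatch() {
            return enableNative ? "LibMatrixNative.conv2d" : "LibMatrixDNN.conv2d";
        }
    }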

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/39a37ae4/src/main/java/org/apache/sysml/runtime/matrix/data/ConvolutionParameters.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/matrix/data/ConvolutionParameters.java b/src/main/java/org/apache/sysml/runtime/matrix/data/ConvolutionParameters.java
index 3f0437f..11e74ca 100644
--- a/src/main/java/org/apache/sysml/runtime/matrix/data/ConvolutionParameters.java
+++ b/src/main/java/org/apache/sysml/runtime/matrix/data/ConvolutionParameters.java
@@ -34,7 +34,7 @@ public class ConvolutionParameters implements Serializable {
 	public int K; public int R; public int S; public int stride_h; public int stride_w; public int pad_h; public int pad_w;
 	public int P; public int Q; public int numThreads;
 	
-	
+	public boolean enableNative = false;
 	public MatrixBlock input1; public MatrixBlock input2; public MatrixBlock output;
 	
 	public MatrixBlock bias;

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/39a37ae4/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCUDA.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCUDA.java b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCUDA.java
index 9ddd87a..56360f8 100644
--- a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCUDA.java
+++ b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCUDA.java
@@ -1043,41 +1043,6 @@ public class LibMatrixCUDA {
 	}
 	
 	/**
-	 * performs relu followed by maxpooling on GPU by exploiting cudnnPoolingForward(...)
-	 * @param gCtx   a valid {@link GPUContext}
-	 * @param instName the invoking instruction's name for record {@link Statistics}.
-	 * @param image image as matrix object
-	 * @param outputBlock output matrix
-	 * @param N				batch size
-	 * @param C				number of channels
-	 * @param H				height of image
-	 * @param W				width of image
-	 * @param K				number of filters
-	 * @param R				height of filter
-	 * @param S				width of filter
-	 * @param pad_h			vertical padding
-	 * @param pad_w			horizontal padding
-	 * @param stride_h		horizontal stride
-	 * @param stride_w		vertical stride
-	 * @param P				(H - R + 1 + 2*pad_h)/stride_h
-	 * @param Q				(W - S + 1 + 2*pad_w)/stride_w
-	 * @throws DMLRuntimeException if DMLRuntimeException occurs
-	 */
-	public static void reluMaxpooling(GPUContext gCtx, String instName, MatrixObject image,
-			MatrixObject outputBlock, int N, int C, int H, int W, int K, int R,
-			int S, int pad_h, int pad_w, int stride_h, int stride_w, int P,
-			int Q) throws DMLRuntimeException {
-        LOG.trace("GPU : reluMaxpooling" + ", GPUContext=" + gCtx);
-        cudnnTensorDescriptor srcTensorDesc = allocateTensorDescriptor(gCtx, image, N, C, H, W);
-		long size  = image.getNumRows() * image.getNumColumns() * Sizeof.DOUBLE;
-		Pointer tmp = gCtx.allocate(size);
-		performCuDNNReLU(gCtx, instName, image, tmp, srcTensorDesc);
-		//cudaDeviceSynchronize; // It seemed like the cudnn operation in performCuDNNReLU was being done asynchronously, this adds the necessary sync
-		performMaxpooling(gCtx, instName, tmp, srcTensorDesc, outputBlock, N, C, H, W, K, R, S, pad_h, pad_w, stride_h, stride_w, P, Q);
-		gCtx.cudaFreeHelper(tmp);
-	}
-
-	/**
 	 * Performs maxpoolingBackward on GPU by exploiting cudnnPoolingBackward(...)
 	 * This method computes the backpropagation errors for the previous layer of the maxpooling operation
 	 * @param gCtx   a valid {@link GPUContext}

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/39a37ae4/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNN.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNN.java b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNN.java
index cd88b95..6a43917 100644
--- a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNN.java
+++ b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNN.java
@@ -36,6 +36,8 @@ import org.apache.sysml.runtime.DMLRuntimeException;
 import org.apache.sysml.runtime.instructions.InstructionUtils;
 import org.apache.sysml.runtime.matrix.operators.BinaryOperator;
 import org.apache.sysml.runtime.util.ConvolutionUtils;
+import org.apache.sysml.utils.NativeHelper;
+import org.apache.sysml.utils.Statistics;
 
 /**
  * This class allows users to invoke deep learning related operations 
@@ -57,10 +59,10 @@ public class LibMatrixDNN {
 	public static boolean DISPLAY_STATISTICS = false; //conv2d summaries in stats output
 
 	private enum TaskType {
-		MaxPooling_Forward, MaxPooling_Backward, 
+		MaxPooling_Forward, MaxPooling_Backward, MaxPooling_Relu_Backward,
 		// Alternate approaches that we tried but whose performance was unsatisfactory include: direct, non-looped im2col
 		LoopedIm2ColConv2d, LoopedIm2ColConv2dBwdFilter, LoopedIm2ColConv2dBwdData,
-		BiasAdd, ReluBackward, BiasMultiply
+		ReluBackward
 	}
 	
 	// ------------------------------------------------------------------------------------------------
@@ -141,6 +143,30 @@ public class LibMatrixDNN {
 	// ------------------------------------------------------------------------------------------------
 	
 	/**
+	 * This method performs the convolution (i.e. cross-correlation) operation on the input
+	 * 
+	 * @param input input batch 
+	 * @param filter filter
+	 * @param outputBlock output of convolution
+	 * @param params convolution parameters
+	 * @throws DMLRuntimeException if DMLRuntimeException occurs
+	 */
+	public static void conv2d(MatrixBlock input, MatrixBlock filter, MatrixBlock outputBlock, ConvolutionParameters params) throws DMLRuntimeException {
+		LibMatrixDNN.checkInputsConv2d(input, filter, outputBlock, params);
+		
+		if(params.bias != null && params.bias.isInSparseFormat())
+			params.bias.sparseToDense(); // since the bias is an extremely small array
+		
+		if(isEligibleForConv2dSparse(params))
+			Statistics.numNativeLibMatrixDNNCalls.increment();
+		
+		runConvTask(TaskType.LoopedIm2ColConv2d, params);
+		
+		//post-processing: maintain nnz
+		outputBlock.recomputeNonZeros();
+	}
+	
+	/**
 	 * This method computes the backpropagation errors for the previous layer of the convolution operation
 	 * 
 	 * @param filter filter used in conv2d 
@@ -150,25 +176,10 @@ public class LibMatrixDNN {
 	 * @throws DMLRuntimeException if DMLRuntimeException occurs
 	 */
 	public static void conv2dBackwardData(MatrixBlock filter, MatrixBlock dout, MatrixBlock outputBlock, ConvolutionParameters params) throws DMLRuntimeException {
-		params.input1 = filter;
-		params.input2 = dout;
-		params.output = outputBlock;
-		if(filter.getNumRows() != params.K || filter.getNumColumns() != params.C*params.R*params.S || 
-				dout.getNumRows() != params.N || dout.getNumColumns() != params.K*params.P*params.Q) {
-			throw new DMLRuntimeException("Incorrect input to conv2d_backward_filter");
-		}
-		if(params.stride_h <= 0 || params.stride_w <= 0) {
-			throw new DMLRuntimeException("Only positive strides supported");
-		}
+		checkInputsConv2dBackwardData(filter, dout, outputBlock, params);
 		
-		if(DMLScript.STATISTICS && DISPLAY_STATISTICS) {
-			if(filter.isInSparseFormat() || dout.isInSparseFormat()) {
-				conv2dBwdDataSparseCount.addAndGet(1);
-			}
-			else {
-				conv2dBwdDataDenseCount.addAndGet(1);
-			}
-		}
+		if(isEligibleForConv2dBackwardDataDense(params))
+			Statistics.numNativeLibMatrixDNNCalls.increment();
 		
 		runConvTask(TaskType.LoopedIm2ColConv2dBwdData, params);
 		
@@ -186,17 +197,71 @@ public class LibMatrixDNN {
 	 * @throws DMLRuntimeException if DMLRuntimeException occurs
 	 */
 	public static void conv2dBackwardFilter(MatrixBlock input, MatrixBlock dout, MatrixBlock outputBlock, ConvolutionParameters params) throws DMLRuntimeException {
-		params.input1 = input;
+		checkInputsConv2dBackwardFilter(input, dout, outputBlock, params);
+		
+		if(isEligibleForConv2dBackwardFilterSparseDense(params))
+			Statistics.numNativeLibMatrixDNNCalls.increment();
+		
+		runConvTask(TaskType.LoopedIm2ColConv2dBwdFilter, params);
+		
+		//post-processing: maintain nnz
+		outputBlock.recomputeNonZeros();
+	}
+	
+	
+	private static void checkOrThrowException(String msg, long lhs, long rhs) throws DMLRuntimeException {
+		if(lhs != rhs)
+			throw new DMLRuntimeException(msg + ":" + lhs + " != " + rhs);
+	}
+	private static void checkOrThrowException(String msg, long lhs, long rhs1, long rhs2, long rhs3) throws DMLRuntimeException {
+		if(lhs != (rhs1*rhs2*rhs3))
+			throw new DMLRuntimeException(msg + ":" + lhs + " != (" + rhs1 + " * " + rhs2 + " * " + rhs3 + ")");
+	}
+	
+	static void checkInputsConv2dBackwardData(MatrixBlock filter, MatrixBlock dout, MatrixBlock outputBlock, ConvolutionParameters params)  throws DMLRuntimeException {
+		params.input1 = filter;
 		params.input2 = dout;
 		params.output = outputBlock;
-		if(input.getNumRows() != params.N || input.getNumColumns() != params.C*params.H*params.W || 
-				dout.getNumRows() != params.N || dout.getNumColumns() != params.K*params.P*params.Q) {
-			throw new DMLRuntimeException("Incorrect input to conv2d_backward_filter");
-		}
-		if(params.stride_h <= 0 || params.stride_w <= 0) {
-			throw new DMLRuntimeException("Only positive strides supported");
+		checkOrThrowException("Incorrect input to conv2d_backward_data: Number of rows of input filter != "
+				+ "number of filters in filter_shape", filter.getNumRows(), params.K);
+		checkOrThrowException("Incorrect input to conv2d_backward_data: Number of columns of input filter != "
+				+ "channels*filter_height*filter_height in filter_shape", filter.getNumColumns(), params.C, params.R, params.S);
+		checkOrThrowException("Incorrect input to conv2d_backward_data: Number of rows of input errors != "
+				+ "batch size in input_shape", dout.getNumRows(), params.N);
+		checkOrThrowException("Incorrect input to conv2d_backward_data: Number of columns of input errors != "
+				+ "expected input error channels*height*width", dout.getNumColumns(), params.K, params.P, params.Q);
+		if(params.stride_h <= 0 || params.stride_w <= 0) 
+			throw new DMLRuntimeException("Only positive strides supported:" + params.stride_h + ", " + params.stride_w);
+		
+		if(DMLScript.STATISTICS && DISPLAY_STATISTICS) {
+			if(filter.isInSparseFormat() || dout.isInSparseFormat()) {
+				conv2dBwdDataSparseCount.addAndGet(1);
+			}
+			else {
+				conv2dBwdDataDenseCount.addAndGet(1);
+			}
 		}
 		
+		int constrainedNumThreads = OptimizerUtils.getConstrainedNumThreads(params.numThreads);
+		if (!(ALLOW_MULTI_THREADED_OPS && params.isOutputThreadSafe() && constrainedNumThreads > 1))
+			params.numThreads = 1;
+	}
+	
+	static void checkInputsConv2dBackwardFilter(MatrixBlock input, MatrixBlock dout, MatrixBlock outputBlock, ConvolutionParameters params)  throws DMLRuntimeException {
+		params.input1 = input;
+		params.input2 = dout;
+		params.output = outputBlock;
+		checkOrThrowException("Incorrect input to conv2d_backward_filter: Number of rows of input data != "
+				+ "batch size in input_shape", input.getNumRows(), params.N);
+		checkOrThrowException("Incorrect input to conv2d_backward_filter: Number of columns of input data != "
+				+ "channels*input_height*input_height in input_shape", input.getNumColumns(), params.C, params.H, params.W);
+		checkOrThrowException("Incorrect input to conv2d_backward_filter: Number of rows of input errors != "
+				+ "batch size in input_shape", dout.getNumRows(), params.N);
+		checkOrThrowException("Incorrect input to conv2d_backward_filter: Number of columns of input errors != "
+				+ "expected input error channels*height*width", dout.getNumColumns(), params.K, params.P, params.Q);
+		if(params.stride_h <= 0 || params.stride_w <= 0) 
+			throw new DMLRuntimeException("Only positive strides supported:" + params.stride_h + ", " + params.stride_w);
+		
 		if(DMLScript.STATISTICS && DISPLAY_STATISTICS) {
 			if(input.isInSparseFormat() || dout.isInSparseFormat()) {
 				conv2dBwdFilterSparseCount.addAndGet(1);
@@ -205,11 +270,6 @@ public class LibMatrixDNN {
 				conv2dBwdFilterDenseCount.addAndGet(1);
 			}
 		}
-		
-		runConvTask(TaskType.LoopedIm2ColConv2dBwdFilter, params);
-		
-		//post-processing: maintain nnz
-		outputBlock.recomputeNonZeros();
 	}
 	
 	/**
@@ -252,11 +312,10 @@ public class LibMatrixDNN {
 		MatrixBlock filter = params.input1;
 		MatrixBlock dout = params.input2;
 		doRotate180(n, 0, dout, dout_reshaped.denseBlock, params, true);
-		dout_reshaped.recomputeNonZeros();
 		
 		MatrixBlock temp = new MatrixBlock(params.P*params.Q, params.C*params.R*params.S, false);
 		long t1 = DMLScript.STATISTICS && DISPLAY_STATISTICS ? System.nanoTime() : 0;
-		LibMatrixMult.matrixMult(dout_reshaped, filter, temp, false);
+		singleThreadedMatMult(dout_reshaped, filter, temp, true, false, params);
 		long t2 = DMLScript.STATISTICS && DISPLAY_STATISTICS ? System.nanoTime() : 0 ;
 		doCol2imOverSingleImage(n, temp, params);
 		long t3 = DMLScript.STATISTICS && DISPLAY_STATISTICS ? System.nanoTime() : 0 ;
@@ -270,15 +329,13 @@ public class LibMatrixDNN {
 			MatrixBlock im2ColOutBlock, MatrixBlock dout_reshaped, MatrixBlock partialRetBlock, ConvolutionParameters params, double []  tempIm2ColArr) throws DMLRuntimeException {
 		long t1 = DMLScript.STATISTICS && DISPLAY_STATISTICS ? System.nanoTime() : 0;
 		doIm2col(n, im2ColOutBlock, params, tempIm2ColArr);
-		im2ColOutBlock.recomputeNonZeros();
 		long t2 = DMLScript.STATISTICS && DISPLAY_STATISTICS ? System.nanoTime() : 0 ;
 		
 		doRotate180(n, 0, params.input2, dout_reshaped.denseBlock, params, true);
-		dout_reshaped.recomputeNonZeros();
 		
 		MatrixBlock temp = new MatrixBlock(params.C*params.R*params.S, params.K, false);
 		long t3 = DMLScript.STATISTICS && DISPLAY_STATISTICS ? System.nanoTime() : 0 ;
-		LibMatrixMult.matrixMult(im2ColOutBlock, dout_reshaped, temp, false);
+		singleThreadedMatMult(im2ColOutBlock, dout_reshaped, temp, true, true, params);
 		long t4 = DMLScript.STATISTICS && DISPLAY_STATISTICS ? System.nanoTime() : 0 ;
 		if(DMLScript.STATISTICS && DISPLAY_STATISTICS) {
 			loopedConvBwdFilterMatMultTime.addAndGet(t4-t3);
@@ -298,15 +355,21 @@ public class LibMatrixDNN {
 		ret[2] = j % W;
 	}
 	
-	public static void conv2d(MatrixBlock input, MatrixBlock filter, MatrixBlock outputBlock, ConvolutionParameters params) throws DMLRuntimeException {
+	static void checkInputsConv2d(MatrixBlock input, MatrixBlock filter, MatrixBlock outputBlock, ConvolutionParameters params) throws DMLRuntimeException {
 		params.input1 = input;
 		params.input2 = filter;
 		params.output = outputBlock;
 		
-		if(input.getNumRows() != params.N || input.getNumColumns() != params.C*params.H*params.W || 
-				filter.getNumRows() != params.K || filter.getNumColumns() != params.C*params.R*params.S) {
-			throw new DMLRuntimeException("Incorrect input to conv2d: " + input.getNumRows());
-		}
+		checkOrThrowException("Incorrect input to conv2d: Number of rows of input filter != "
+				+ "number of filters in filter_shape", filter.getNumRows(), params.K);
+		checkOrThrowException("Incorrect input to conv2d: Number of columns of input filter != "
+				+ "channels*filter_height*filter_height in filter_shape", filter.getNumColumns(), params.C, params.R, params.S);
+		checkOrThrowException("Incorrect input to conv2d: Number of rows of input data != "
+				+ "batch size in input_shape", input.getNumRows(), params.N);
+		checkOrThrowException("Incorrect input to conv2d: Number of columns of input data != "
+				+ "channels*input_height*input_height in input_shape", input.getNumColumns(), params.C, params.H, params.W);
+		if(params.stride_h <= 0 || params.stride_w <= 0) 
+			throw new DMLRuntimeException("Only positive strides supported:" + params.stride_h + ", " + params.stride_w);
 		
 		if(DMLScript.STATISTICS && DISPLAY_STATISTICS) {
 			if(input.isInSparseFormat() || filter.isInSparseFormat()) {
@@ -316,21 +379,36 @@ public class LibMatrixDNN {
 				conv2dDenseCount.addAndGet(1);
 			}
 		}
-		
-		runConvTask(TaskType.LoopedIm2ColConv2d, params);
-		
-		//post-processing: maintain nnz
-		outputBlock.recomputeNonZeros();
+	}
+	
+	// Single-threaded matrix multiplication
+	private static void singleThreadedMatMult(MatrixBlock m1, MatrixBlock m2, MatrixBlock ret, 
+			boolean recomputeNNZM1, boolean recomputeNNZM2, ConvolutionParameters params) throws DMLRuntimeException {
+		if(!params.enableNative || m1.isInSparseFormat() || m2.isInSparseFormat()) {
+			if(recomputeNNZM1)
+				m1.recomputeNonZeros();
+			if(recomputeNNZM2)
+				m2.recomputeNonZeros();
+			LibMatrixMult.matrixMult(m1, m2, ret, false);
+		}
+		else {
+			ret.sparse = false;
+			if(ret.getDenseBlock() == null)
+				ret.allocateDenseBlock();
+			NativeHelper.matrixMultDenseDense(m1.denseBlock, m2.denseBlock, 
+					ret.denseBlock, m1.getNumRows(), m1.getNumColumns(), m2.getNumColumns(), 1);
+			ret.recomputeNonZeros();
+		}
 	}
 	
 	private static void doLoopedIm2ColConv2d(int n, MatrixBlock im2ColOutBlock, ConvolutionParameters params, double []  temp) throws DMLRuntimeException {
 		long t1 = DMLScript.STATISTICS && DISPLAY_STATISTICS ? System.nanoTime() : 0;
 		doIm2col(n, im2ColOutBlock, params, temp);
-		im2ColOutBlock.recomputeNonZeros();
 		long t2 = DMLScript.STATISTICS && DISPLAY_STATISTICS ? System.nanoTime() : 0;
 		
 		MatrixBlock matMultOutBlock = new MatrixBlock(params.K, params.P*params.Q, false);
-		LibMatrixMult.matrixMult(params.input2, im2ColOutBlock, matMultOutBlock, false);
+		singleThreadedMatMult(params.input2, im2ColOutBlock, matMultOutBlock, false, true, params);
+		
 		long t3 = DMLScript.STATISTICS && DISPLAY_STATISTICS ? System.nanoTime() : 0;
 		
 		if(DMLScript.STATISTICS && DISPLAY_STATISTICS) {
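
Note that singleThreadedMatMult above deliberately stays single-threaded (the final argument 1 to NativeHelper.matrixMultDenseDense): runConvTask already parallelizes across images, so nesting another level of threading inside each per-image multiply would oversubscribe cores. A simplified sketch of that outer row-block parallelism (the real per-image work lives in ConvTask):

    import java.util.ArrayList;
    import java.util.List;
    import java.util.concurrent.Callable;
    import java.util.concurrent.ExecutorService;
    import java.util.concurrent.Executors;

    public class RowBlockParallelism {
        public static void main(String[] args) throws Exception {
            int N = 8, poolSize = 4; // N images, fixed-size thread pool
            ExecutorService pool = Executors.newFixedThreadPool(poolSize);
            List<Callable<Object>> tasks = new ArrayList<>();
            int blk = (int) Math.ceil((double) N / poolSize);
            for (int rl = 0; rl < N; rl += blk) {
                final int lo = rl, hi = Math.min(N, rl + blk);
                tasks.add(() -> {
                    for (int n = lo; n < hi; n++) {
                        // per-image work: im2col + single-threaded matmult
                    }
                    return null;
                });
            }
            pool.invokeAll(tasks); // blocks until all row blocks are done
            pool.shutdown();
        }
    }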
@@ -380,9 +458,11 @@ public class LibMatrixDNN {
 	 * @param dout dout matrix
 	 * @param outputBlock output matrix
 	 * @param params convolution parameters
+	 * @param performReluBackward perform ReLU backward
 	 * @throws DMLRuntimeException if DMLRuntimeException occurs
 	 */
-	public static void maxpoolingBackward(MatrixBlock input, MatrixBlock dout, MatrixBlock outputBlock, ConvolutionParameters params) throws DMLRuntimeException {
+	public static void maxpoolingBackward(MatrixBlock input, MatrixBlock dout, MatrixBlock outputBlock, 
+			ConvolutionParameters params, boolean performReluBackward) throws DMLRuntimeException {
 		params.input1 = input;
 		params.input2 = dout;
 		params.output = outputBlock;
@@ -407,7 +487,10 @@ public class LibMatrixDNN {
 			throw new DMLRuntimeException("Sparse maxpooling_backward is not supported");
 
 		fillIndexesArray(params);
-		runConvTask(TaskType.MaxPooling_Backward, params);
+		if(performReluBackward)
+			runConvTask(TaskType.MaxPooling_Relu_Backward, params);
+		else
+			runConvTask(TaskType.MaxPooling_Backward, params);
 		
 		//post-processing: maintain nnz 
 		outputBlock.recomputeNonZeros();
@@ -440,7 +523,7 @@ public class LibMatrixDNN {
 		}
 	}
 	
-	private static void doPoolingBackward(int n, ConvolutionParameters params) throws DMLRuntimeException {
+	private static void doPoolingBackward(int n, ConvolutionParameters params, boolean performReluBackward) throws DMLRuntimeException {
 		double [] inputArray = null;
 		if (!params.input1.isInSparseFormat())
 			inputArray = params.input1.getDenseBlock();
@@ -455,19 +538,19 @@ public class LibMatrixDNN {
 			
 		if(inputArray != null) {
 			if(doutArray != null)
-				doPoolingBackwardDenseDense(n, inputArray, doutArray, outputArray, params);
+				doPoolingBackwardDenseDense(n, inputArray, doutArray, outputArray, params, performReluBackward);
 			else
-				doPoolingBackwardDenseSparse(n, inputArray, params.input2, outputArray, params);
+				doPoolingBackwardDenseSparse(n, inputArray, params.input2, outputArray, params, performReluBackward);
 		}
 		else {
 			if(doutArray != null)
-				doPoolingBackwardSparseDense(n, doutArray, outputArray, params);
+				doPoolingBackwardSparseDense(n, doutArray, outputArray, params, performReluBackward);
 			else
-				doPoolingBackwardSparseSparse(n, outputArray, params);
+				doPoolingBackwardSparseSparse(n, outputArray, params, performReluBackward);
 		}
 	}
 	
-	private static void doPoolingBackwardSparseDense(int n, double [] doutArray,  double [] outputArray, ConvolutionParameters params) throws DMLRuntimeException {
+	private static void doPoolingBackwardSparseDense(int n, double [] doutArray,  double [] outputArray, ConvolutionParameters params, boolean performReluBackward) throws DMLRuntimeException {
 		if (!params.input1.isInSparseFormat())
 			throw new DMLRuntimeException("Incorrect usage: Call optimized versions");
 		
@@ -477,7 +560,7 @@ public class LibMatrixDNN {
 					double inVal = doutArray[n*params.C*params.P*params.Q + c*params.P*params.Q +  p * params.Q + q];
 					if(inVal != 0) {
 						final int inputOffset = n*params.C*params.H*params.W + c*params.H*params.W;
-						int maxIndex = getMaxIndexSparse(p, q, inputOffset, n, c, params.input1, params);
+						int maxIndex = getMaxIndexSparse(p, q, inputOffset, n, c, params.input1, params, performReluBackward);
 						if(maxIndex != -1)
 							outputArray[maxIndex] += inVal;
 					}
@@ -486,7 +569,7 @@ public class LibMatrixDNN {
 		}
 	}
 	
-	private static void doPoolingBackwardSparseSparse(int n, double [] outputArray, ConvolutionParameters params) throws DMLRuntimeException {
+	private static void doPoolingBackwardSparseSparse(int n, double [] outputArray, ConvolutionParameters params, boolean performReluBackward) throws DMLRuntimeException {
 		if (!params.input1.isInSparseFormat())
 			throw new DMLRuntimeException("Incorrect usage: Call optimized versions");
 		
@@ -502,7 +585,7 @@ public class LibMatrixDNN {
 				int p = tensorIndexes[1];
 				int q = tensorIndexes[2];
 				final int inputOffset = n*params.C*params.H*params.W + c*params.H*params.W;
-				int maxIndex = getMaxIndexSparse(p, q, inputOffset, n, c, params.input1, params);
+				int maxIndex = getMaxIndexSparse(p, q, inputOffset, n, c, params.input1, params, performReluBackward);
 				if(maxIndex != -1)
 					outputArray[maxIndex] += avals[j];
 			}
@@ -510,7 +593,7 @@ public class LibMatrixDNN {
 	}
 	
 	private static void doPoolingBackwardDenseSparse(int n, double [] inputArray, 
-			MatrixBlock dout, double [] outputArray, ConvolutionParameters params) throws DMLRuntimeException {
+			MatrixBlock dout, double [] outputArray, ConvolutionParameters params, boolean performReluBackward) throws DMLRuntimeException {
 		if( !dout.sparseBlock.isEmpty(n) ) {
 			int [] tensorIndexes = new int[3];
 			int apos = dout.sparseBlock.pos(n);
@@ -523,7 +606,7 @@ public class LibMatrixDNN {
 				int p = tensorIndexes[1];
 				int q = tensorIndexes[2];
 				final int inputOffset = n*params.C*params.H*params.W + c*params.H*params.W;
-				int maxIndex = getMaxIndex(p, q, inputOffset, inputArray, params);
+				int maxIndex = getMaxIndex(p, q, inputOffset, inputArray, params, performReluBackward);
 				if(maxIndex != -1)
 					outputArray[maxIndex] += avals[j];
 			}
@@ -531,14 +614,14 @@ public class LibMatrixDNN {
 	}
 	
 	private static void doPoolingBackwardDenseDense(int n, double [] inputArray, double [] doutArray, 
-			double [] outputArray, ConvolutionParameters params) {
+			double [] outputArray, ConvolutionParameters params, boolean performReluBackward) {
 		for (int c = 0; c < params.C; c++) {
 			final int inputOffset = n*params.C*params.H*params.W + c*params.H*params.W;
 			final int outputOffset = n*params.C*params.P*params.Q + c*params.P*params.Q;
 			
 			for (int p = 0; p < params.P; p++) {
 				for (int q = 0; q < params.Q; q++) {
-					int maxIndex = getMaxIndex(p, q, inputOffset, inputArray, params);
+					int maxIndex = getMaxIndex(p, q, inputOffset, inputArray, params, performReluBackward);
 					if(maxIndex != -1)
 						outputArray[maxIndex] += doutArray[outputOffset +  p * params.Q + q];
 				}
@@ -556,10 +639,11 @@ public class LibMatrixDNN {
 	 * @param c number of channels 
 	 * @param input input matrix
 	 * @param params convolution parameters
+	 * @param performReluBackward whether to apply ReLU to the input values when searching for the max
 	 * @return index of the cell with maximum value
 	 * @throws DMLRuntimeException if error occurs
 	 */
-	private static int getMaxIndexSparse(int p, int q, int inputOffset, int n, int c, MatrixBlock input, ConvolutionParameters params) throws DMLRuntimeException {
+	private static int getMaxIndexSparse(int p, int q, int inputOffset, int n, int c, MatrixBlock input, ConvolutionParameters params, boolean performReluBackward) throws DMLRuntimeException {
 		if(!input.isInSparseFormat())
 			throw new DMLRuntimeException("Incorrect usage: Only sparse format supported");
 		
@@ -591,9 +675,10 @@ public class LibMatrixDNN {
 				int h = tensorIndexes[1];
 				int w = tensorIndexes[2];
 				if(h >= start_index_h && h < end_index_h && w >= start_index_w && w < end_index_w) {
-					if(maxVal < avals[j]) {
+					double val = performReluBackward && avals[j] < 0 ? 0 : avals[j]; 
+					if(maxVal < val) {
 						maxIndex = inputOffset +  h*params.W + w;
-						maxVal = avals[j];
+						maxVal = val;
 					}
 				}
 			}
@@ -612,9 +697,10 @@ public class LibMatrixDNN {
 	 * @param inputOffset offset to be used for input index
 	 * @param inputArray input array
 	 * @param params convolution parameters
+	 * @param performReluBackward perform ReLU backward
 	 * @return index of cell with maximum value
 	 */
-	private static int getMaxIndex(int p, int q, int inputOffset, double [] inputArray, ConvolutionParameters params) {
+	private static int getMaxIndex(int p, int q, int inputOffset, double [] inputArray, ConvolutionParameters params, boolean performReluBackward) {
 		int start_index_h = params.start_indexes_h[p];
 		int end_index_h = params.end_indexes_h[p];
 		int start_index_w = params.start_indexes_w[q];
@@ -632,6 +718,7 @@ public class LibMatrixDNN {
 		for (int h = start_index_h; h < end_index_h; h++) {
 			for (int w = start_index_w; w < end_index_w; w++) {
 				currDoutVal = inputArray[inputOffset +  h*params.W + w];
+				currDoutVal = performReluBackward && currDoutVal < 0 ? 0 : currDoutVal;
 				if(maxVal < currDoutVal) {
 					maxIndex = inputOffset +  h*params.W + w;
 					maxVal = currDoutVal;
@@ -894,11 +981,15 @@ public class LibMatrixDNN {
 	private static void addMatrixBlocks(int poolSize, TaskType type, ConvolutionParameters params, 
 			ConcurrentLinkedQueue<MatrixBlock> im2ColOutBlocks, ConcurrentLinkedQueue<MatrixBlock> doutReshapedBlocks,
 			ConcurrentLinkedQueue<MatrixBlock> partialRetBlocks) {
+		boolean isEligibleForConv2dSparse = (type == TaskType.LoopedIm2ColConv2d) && isEligibleForConv2dSparse(params);
+		boolean isEligibleForConv2dBackwardFilterSparseDense = (type == TaskType.LoopedIm2ColConv2dBwdFilter) && isEligibleForConv2dBackwardFilterSparseDense(params);
 		for(int i = 0; i < poolSize; i++) {
 			if(type == TaskType.LoopedIm2ColConv2d || type == TaskType.LoopedIm2ColConv2dBwdFilter) {
-				MatrixBlock im2ColOutBlock = new MatrixBlock(params.C*params.R*params.S, params.P*params.Q, false);
-				im2ColOutBlock.allocateDenseBlock();
-				im2ColOutBlocks.add(im2ColOutBlock);
+				if(!isEligibleForConv2dSparse && !isEligibleForConv2dBackwardFilterSparseDense) {
+					MatrixBlock im2ColOutBlock = new MatrixBlock(params.C*params.R*params.S, params.P*params.Q, false);
+					im2ColOutBlock.allocateDenseBlock();
+					im2ColOutBlocks.add(im2ColOutBlock);
+				}
 			}
 			
 			if(type == TaskType.LoopedIm2ColConv2dBwdFilter) {
@@ -962,6 +1053,21 @@ public class LibMatrixDNN {
 	}
 	// ----------------------------------------------------------------------------------------------------------------
 	
+	private static boolean isEligibleForConv2dBackwardFilterSparseDense(ConvolutionParameters params) {
+		// NativeHelper.conv2dBackwardFilterSparseDense only if the input (input1) is sparse.
+		// dout converted to dense if sparse.
+		return params.enableNative && params.input1.isInSparseFormat();
+	}
+	private static boolean isEligibleForConv2dSparse(ConvolutionParameters params) {
+		// NativeHelper.conv2dSparse only if filter is dense and input is sparse
+		return params.enableNative && params.input1.isInSparseFormat() && !params.input2.isInSparseFormat();
+	}
+	private static boolean isEligibleForConv2dBackwardDataDense(ConvolutionParameters params) {
+		// NativeHelper.conv2dBackwardDataDense only if filter is dense. 
+		// dout converted to dense if sparse.
+		return params.enableNative && !params.input1.isInSparseFormat();
+	}
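
Taken together, these predicates form a small dispatch table over operand sparsity (input1/input2 as assigned by the corresponding checkInputs* methods). The all-dense cases never reach them, since ConvolutionCPInstruction routes those to LibMatrixNative before calling into LibMatrixDNN:

    operation (input1, input2)            native kernel when enableNative
    ------------------------------------  ---------------------------------------------
    conv2d (input, filter)                conv2dSparse: input sparse, filter dense
    conv2d_backward_filter (input, dout)  conv2dBackwardFilterSparseDense: input sparse
    conv2d_backward_data (filter, dout)   conv2dBackwardDataDense: filter dense

    (dout is densified where needed before the native call)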
+	
 	/**
 	 * The ConvTask allows the convolution operations (such as conv2d, conv2d_backward, maxpooling, etc.)
 	 * to be executed in a multi-threaded manner.
@@ -1001,55 +1107,108 @@ public class LibMatrixDNN {
 					break;
 				case MaxPooling_Backward:
 					for(int n = _rl; n < _ru; n++) 
-						doPoolingBackward(n, _params);
-					break;
-				case BiasAdd:
-				{
-					double [] dest = _params.output.getDenseBlock();
-					ConvolutionUtils.binaryBiasOperations(_params.input1, _params.bias, dest, _params.K, _params.P*_params.Q, 
-							_rl, _ru, _binaryElementWiseAddition);
+						doPoolingBackward(n, _params, false);
 					break;
-				}
-				case BiasMultiply:
-				{
-					double [] dest = _params.output.getDenseBlock();
-					ConvolutionUtils.binaryBiasOperations(_params.input1, _params.bias, dest, _params.K, _params.P*_params.Q, 
-							_rl, _ru, _binaryElementWiseMultiplication);
+				case MaxPooling_Relu_Backward:
+					for(int n = _rl; n < _ru; n++) 
+						doPoolingBackward(n, _params, true);
 					break;
-				}
 				case ReluBackward:
 					lnnz = doReluBackward(_params, _rl, _ru);
 					break;
 				case LoopedIm2ColConv2d:
 				{	
-					MatrixBlock im2ColOutBlock = _im2ColOutBlocks.remove();
-					double [] temp = _params.input1.isInSparseFormat() ? new double[_params.input1.getNumColumns()] : null;
-					for(int n = _rl; n < _ru; n++) 
-						doLoopedIm2ColConv2d(n, im2ColOutBlock, _params, temp);
-					_im2ColOutBlocks.add(im2ColOutBlock);
-					if(_params.bias != null)
-						ConvolutionUtils.binaryBiasOperationInPlace(_params.bias, _params.output.getDenseBlock(), _params.K, 
-								_params.P*_params.Q, _rl, _ru, _binaryElementWiseAddition);
+					if(isEligibleForConv2dSparse(_params)) {
+						// NativeHelper.conv2dSparse only if filter is dense and input is sparse
+						int KPQ = _params.K*_params.P*_params.Q;
+						double[] temp = new double[KPQ];
+						for(int n = _rl; n < _ru; n++)  {
+							if( !_params.input1.getSparseBlock().isEmpty(n) ) {
+								int apos = _params.input1.getSparseBlock().pos(n);
+								int alen = _params.input1.getSparseBlock().size(n);
+								int[] aix = _params.input1.getSparseBlock().indexes(n);
+								double[] avals = _params.input1.getSparseBlock().values(n);
+								NativeHelper.conv2dSparse(apos, alen, aix, avals, _params.input2.getDenseBlock(), temp, 
+										1, _params.C, _params.H, _params.W, _params.K, _params.R, _params.S, 
+										_params.stride_h, _params.stride_w, _params.pad_h, _params.pad_w, _params.P, _params.Q, 1);
+								System.arraycopy(temp, 0, _params.output.denseBlock, n*KPQ, KPQ);
+							}
+						}
+					}
+					else {
+						// In all other cases, perform im2col in Java + matmult (either native or Java).
+						MatrixBlock im2ColOutBlock = _im2ColOutBlocks.remove();
+						double [] temp = _params.input1.isInSparseFormat() ? new double[_params.input1.getNumColumns()] : null;
+						for(int n = _rl; n < _ru; n++) 
+							doLoopedIm2ColConv2d(n, im2ColOutBlock, _params, temp);
+						_im2ColOutBlocks.add(im2ColOutBlock);
+					}
+					if(_params.bias != null) {
+						// bias is always converted to dense format
+						double [] biasArr = _params.bias.getDenseBlock();
+						int PQ = _params.P*_params.Q;
+						int index = _rl*_params.K*PQ;
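+						// Output layout is row-major N x (K*P*Q): the bias value of filter k is
+						// broadcast across all P*Q spatial positions of each image row.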
+						for(int n = _rl; n < _ru; n++) {
+							for(int k = 0; k < _params.K; k++) {
+								for(int pq = 0; pq < PQ; pq++, index++) {
+									_params.output.denseBlock[index] += biasArr[k];
+								}
+							}
+						}
+					}
 					break;
 				}
 				case LoopedIm2ColConv2dBwdFilter:
 				{
-					MatrixBlock im2ColOutBlock = _im2ColOutBlocks.remove();
 					MatrixBlock partialRetBlock = _partialRetBlocks.remove();
 					MatrixBlock doutReshapedBlock = _doutReshapedBlocks.remove();
-					double [] temp = _params.input1.isInSparseFormat() ? new double[_params.input1.getNumColumns()] : null;
-					for(int n = _rl; n < _ru; n++) 
-						partialRetBlock = doLoopedIm2ColConv2dBwdFilter(n, im2ColOutBlock, doutReshapedBlock, partialRetBlock, _params, temp);
-					_im2ColOutBlocks.add(im2ColOutBlock);
-					_partialRetBlocks.add(partialRetBlock);
+					if(isEligibleForConv2dBackwardFilterSparseDense(_params)) {
+						double [] dout_n = doutReshapedBlock.getDenseBlock();
+						for(int n = _rl; n < _ru; n++) {
+							if( !_params.input1.getSparseBlock().isEmpty(n) ) {
+								doRotate180(n, 0, _params.input2, dout_n, _params, true);
+								int apos = _params.input1.getSparseBlock().pos(n);
+								int alen = _params.input1.getSparseBlock().size(n);
+								int[] aix = _params.input1.getSparseBlock().indexes(n);
+								double[] avals = _params.input1.getSparseBlock().values(n);
+								NativeHelper.conv2dBackwardFilterSparseDense(apos, alen, aix, avals, 
+										dout_n, partialRetBlock.getDenseBlock(), 1, _params.C, _params.H, _params.W, _params.K, 
+										_params.R, _params.S, _params.stride_h, _params.stride_w, _params.pad_h, _params.pad_w, _params.P, _params.Q, 1);
+							}
+						}
+					}
+					else {
+						MatrixBlock im2ColOutBlock = _im2ColOutBlocks.remove();
+						double [] temp = _params.input1.isInSparseFormat() ? new double[_params.input1.getNumColumns()] : null;
+						for(int n = _rl; n < _ru; n++) 
+							partialRetBlock = doLoopedIm2ColConv2dBwdFilter(n, im2ColOutBlock, doutReshapedBlock, partialRetBlock, _params, temp);
+						_im2ColOutBlocks.add(im2ColOutBlock);
+					}
 					_doutReshapedBlocks.add(doutReshapedBlock);
+					_partialRetBlocks.add(partialRetBlock);
 					break;
 				}
 				case LoopedIm2ColConv2dBwdData:
 				{
 					MatrixBlock doutReshapedBlock = _doutReshapedBlocks.remove();
-					for(int n = _rl; n < _ru; n++) 
-						doLoopedIm2ColConv2dBwdData(n, doutReshapedBlock, _params);
+					if(isEligibleForConv2dBackwardDataDense(_params)) {
+						int CHW = _params.C*_params.H*_params.W;
+						double [] ret = new double[CHW];
+						double [] filterArr = _params.input1.getDenseBlock();
+						for(int n = _rl; n < _ru; n++) {
+							double [] dout_n = getRowInDenseFormat(_params.input2, n, doutReshapedBlock.getDenseBlock());
+							if(n > _rl)
+								Arrays.fill(ret, 0);
+							NativeHelper.conv2dBackwardDataDense(filterArr, dout_n, ret, 1, 
+									_params.C, _params.H, _params.W, _params.K, 
+									_params.R, _params.S, _params.stride_h, _params.stride_w, _params.pad_h, _params.pad_w, _params.P, _params.Q, 1);
+							System.arraycopy(ret, 0, _params.output.getDenseBlock(), n*CHW, CHW);
+						}
+					}
+					else {
+						for(int n = _rl; n < _ru; n++) 
+							doLoopedIm2ColConv2dBwdData(n, doutReshapedBlock, _params);
+					}
 					_doutReshapedBlocks.add(doutReshapedBlock);
 					break;
 				}
@@ -1192,16 +1351,24 @@ public class LibMatrixDNN {
 	}
 	
 	// Returns the row of matrix in dense format
-	private static double [] getRowInDenseFormat(MatrixBlock input, int n, double []  temp) {
+	private static double [] getRowInDenseFormat(MatrixBlock input, int n, double []  temp) throws DMLRuntimeException {
+		if(input.getNumColumns() != temp.length) {
+			throw new DMLRuntimeException("Invalid parameters: temp buffer length " + temp.length
+					+ " does not match the number of columns " + input.getNumColumns());
+		}
 		// Use temporary array to avoid binary search
-		Arrays.fill(temp, 0);
-		if( !input.sparseBlock.isEmpty(n) ) {
-			int apos = input.sparseBlock.pos(n);
-			int alen = input.sparseBlock.size(n);
-			int[] aix = input.sparseBlock.indexes(n);
-			double[] avals = input.sparseBlock.values(n);
-			for(int j=apos; j<apos+alen; j++)
-				temp[ aix[j] ] = avals[j];
+		if(input.isInSparseFormat()) {
+			Arrays.fill(temp, 0);
+			if( !input.sparseBlock.isEmpty(n) ) {
+				int apos = input.sparseBlock.pos(n);
+				int alen = input.sparseBlock.size(n);
+				int[] aix = input.sparseBlock.indexes(n);
+				double[] avals = input.sparseBlock.values(n);
+				for(int j=apos; j<apos+alen; j++)
+					temp[ aix[j] ] = avals[j];
+			}
+		}
+		else {
+			System.arraycopy(input.getDenseBlock(), n*input.getNumColumns(), temp, 0, input.getNumColumns());
 		}
 		return temp;
 	}

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/39a37ae4/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixMult.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixMult.java b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixMult.java
index 5c7c2ef..983ce53 100644
--- a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixMult.java
+++ b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixMult.java
@@ -17,7 +17,6 @@
  * under the License.
  */
 
-
 package org.apache.sysml.runtime.matrix.data;
 
 import java.util.ArrayList;
@@ -123,7 +122,7 @@ public class LibMatrixMult
 	{
 		matrixMult(m1, m2, ret, rl, ru, true);
 	}
-
+	
 	public static void matrixMult(MatrixBlock m1, MatrixBlock m2, MatrixBlock ret, int rl, int ru, boolean examSparsity) 
 		throws DMLRuntimeException
 	{	
@@ -132,9 +131,9 @@ public class LibMatrixMult
 			ret.examSparsity(); //turn empty dense into sparse
 			return;
 		}
-		
+			
 		//Timing time = new Timing(true);
-		
+			
 		//pre-processing: output allocation
 		boolean tm2 = checkPrepMatrixMultRightInput(m1,m2);
 		m2 = prepMatrixMultRightInput(m1, m2);
@@ -166,6 +165,7 @@ public class LibMatrixMult
 		if(examSparsity)
 			ret.examSparsity();
 		
+		
 		//System.out.println("MM ("+m1.isInSparseFormat()+","+m1.getNumRows()+","+m1.getNumColumns()+","+m1.getNonZeros()+")x" +
 		//		              "("+m2.isInSparseFormat()+","+m2.getNumRows()+","+m2.getNumColumns()+","+m2.getNonZeros()+") in "+time.stop());
 	}
@@ -188,7 +188,7 @@ public class LibMatrixMult
 			ret.examSparsity(); //turn empty dense into sparse
 			return;
 		}
-		
+			
 		//check too high additional vector-matrix memory requirements (fallback to sequential)
 		//check too small workload in terms of flops (fallback to sequential too)
 		if( m1.rlen == 1 && (8L * m2.clen * k > MEM_OVERHEAD_THRESHOLD || !LOW_LEVEL_OPTIMIZATION || m2.clen==1 || m1.isUltraSparse() || m2.isUltraSparse()) 
@@ -247,6 +247,7 @@ public class LibMatrixMult
 			throw new DMLRuntimeException(ex);
 		}
 		
+		
 		//post-processing (nnz maintained in parallel)
 		ret.examSparsity();
 		
@@ -3960,4 +3961,4 @@ public class LibMatrixMult
 			return _ret.recomputeNonZeros(_rl, _ru-1, 0, _ret.getNumColumns()-1);
 		}
 	}
-}
\ No newline at end of file
+}

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/39a37ae4/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixNative.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixNative.java b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixNative.java
new file mode 100644
index 0000000..4b12596
--- /dev/null
+++ b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixNative.java
@@ -0,0 +1,200 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ * 
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.sysml.runtime.matrix.data;
+
+import org.apache.sysml.hops.OptimizerUtils;
+import org.apache.sysml.runtime.DMLRuntimeException;
+import org.apache.sysml.utils.NativeHelper;
+import org.apache.sysml.utils.Statistics;
+
+public class LibMatrixNative {
+	
+	// We could encapsulate additional heuristics in this function.
+	// For now, we only consider matrix-vector operations to be memory-bound.
+	private static boolean isMatMultMemoryBound(int m1Rlen, int m1Clen, int m2Clen) {
+		return m1Rlen == 1 || m1Clen == 1 || m2Clen == 1;
+	}
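+	
+	// Back-of-the-envelope sketch: a dense m x k times k x 1 matrix-vector multiply performs
+	// ~2*m*k flops while streaming ~8*m*k bytes of the matrix exactly once, i.e. roughly
+	// 0.25 flops/byte. At that arithmetic intensity, the JNI transfer overhead of a native
+	// call typically outweighs any BLAS speedup, hence such shapes stay in Java.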
+
+	/**
+	 * Performs matrix multiplication using a native library if BLAS is available, or else falls
+	 * back to the internal Java implementation.
+	 * 
+	 * @param m1 lhs matrix block
+	 * @param m2 rhs matrix block
+	 * @param ret output matrix block
+	 * @param k number of threads
+	 * @throws DMLRuntimeException if error occurs
+	 */
+	public static void matrixMult(MatrixBlock m1, MatrixBlock m2, MatrixBlock ret, int k) throws DMLRuntimeException {
+		matrixMult(m1, m2, ret, k, true);
+	}
+	
+	public static void matrixMult(MatrixBlock m1, MatrixBlock m2, MatrixBlock ret, int k, boolean examSparsity) throws DMLRuntimeException {
+		// Sanity check:
+		k = k <= 0 ? NativeHelper.getMaxNumThreads() : k;
+		
+		// check inputs / outputs
+		if (m1.isEmptyBlock() || m2.isEmptyBlock()) {
+			ret.setNonZeros(0);
+			if(examSparsity)
+				ret.examSparsity(); // turn empty dense into sparse
+			return;
+		}
+		if (NativeHelper.isNativeLibraryLoaded() && 
+				!isMatMultMemoryBound(m1.rlen, m1.clen, m2.clen) && !m1.isInSparseFormat() && !m2.isInSparseFormat()) {
+			ret.sparse = false;
+			ret.allocateDenseBlock();
+			if (NativeHelper.matrixMultDenseDense(m1.denseBlock, m2.denseBlock, 
+					ret.denseBlock, m1.getNumRows(), m1.getNumColumns(), m2.getNumColumns(), k)) {
+				Statistics.numNativeLibMatrixMultCalls.increment();
+				ret.recomputeNonZeros();
+				// post-processing (nnz maintained in parallel)
+				if(examSparsity)
+					ret.examSparsity();
+				return;
+			} else {
+				// Else fall back to Java
+				Statistics.incrementNativeFailuresCounter();
+			}
+		}
+		if (k == 1)
+			LibMatrixMult.matrixMult(m1, m2, ret, examSparsity);
+		else
+			LibMatrixMult.matrixMult(m1, m2, ret, k);
+	}
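+	
+	// Example usage (sketch): ret = m1 %*% m2 via native BLAS when both inputs are dense
+	// and the operation is compute-bound, with transparent fallback to Java otherwise:
+	//   MatrixBlock ret = new MatrixBlock(m1.getNumRows(), m2.getNumColumns(), false);
+	//   LibMatrixNative.matrixMult(m1, m2, ret, 8); // 8 = number of threads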
+	
+	/**
+	 * This method performs the convolution (i.e., cross-correlation) operation on the input.
+	 * 
+	 * @param input input batch 
+	 * @param filter filter
+	 * @param outputBlock output of convolution
+	 * @param params convolution parameters
+	 * @throws DMLRuntimeException if DMLRuntimeException occurs
+	 */
+	public static void conv2d(MatrixBlock input, MatrixBlock filter, MatrixBlock outputBlock, ConvolutionParameters params) throws DMLRuntimeException {
+		LibMatrixDNN.checkInputsConv2d(input, filter, outputBlock, params);
+		params.numThreads = params.numThreads <= 0 ? NativeHelper.getMaxNumThreads() : params.numThreads;
+		if(NativeHelper.isNativeLibraryLoaded() && !input.isInSparseFormat() && !filter.isInSparseFormat()) {
+			setNumThreads(params);
+			if(params.bias == null) {
+				if(NativeHelper.conv2dDense(input.denseBlock, filter.denseBlock, outputBlock.denseBlock, params.N, params.C, params.H, params.W, 
+						params.K, params.R, params.S, params.stride_h, params.stride_w, params.pad_h, params.pad_w, 
+						params.P, params.Q, params.numThreads)) {
+					Statistics.numNativeLibMatrixDNNCalls.increment();
+					// post-processing: maintain nnz
+					outputBlock.recomputeNonZeros();
+					return;
+				}
+				else {
+					// Fall back to Java on failure
+					Statistics.incrementNativeFailuresCounter();
+				}
+			}
+			else {
+				if(params.bias.isInSparseFormat())
+					params.bias.sparseToDense(); // Bias matrix is usually extremely small
+				if(NativeHelper.conv2dBiasAddDense(input.denseBlock, params.bias.denseBlock, filter.denseBlock, outputBlock.denseBlock, 
+						params.N, params.C, params.H, params.W, 
+						params.K, params.R, params.S, params.stride_h, params.stride_w, params.pad_h, params.pad_w, 
+						params.P, params.Q, params.numThreads)) {
+					Statistics.numNativeLibMatrixDNNCalls.increment();
+					// post-processing: maintain nnz
+					outputBlock.recomputeNonZeros();
+					return;
+				}
+				else {
+					// Fall back to Java on failure
+					Statistics.incrementNativeFailuresCounter();
+				}
+			}
+		}
+		
+		// Fall back to Java on failure or sparse inputs
+		LibMatrixDNN.conv2d(input, filter, outputBlock, params);
+	}
+	
+	private static void setNumThreads(ConvolutionParameters params) {
+		params.numThreads = OptimizerUtils.getConstrainedNumThreads(params.numThreads);
+		if (!(params.isOutputThreadSafe() && params.numThreads > 1))
+			params.numThreads = 1;
+	}
+	
+	/**
+	 * This method computes the backpropagation errors for the filter of the convolution operation.
+	 * 
+	 * @param input input image 
+	 * @param dout errors from next layer
+	 * @param outputBlock  output errors
+	 * @param params convolution parameters
+	 * @throws DMLRuntimeException if DMLRuntimeException occurs
+	 */
+	public static void conv2dBackwardFilter(MatrixBlock input, MatrixBlock dout, MatrixBlock outputBlock, ConvolutionParameters params) throws DMLRuntimeException {
+		LibMatrixDNN.checkInputsConv2dBackwardFilter(input, dout, outputBlock, params);
+		params.numThreads = params.numThreads <= 0 ? NativeHelper.getMaxNumThreads() : params.numThreads;
+		if(NativeHelper.isNativeLibraryLoaded() && !dout.isInSparseFormat() && !input.isInSparseFormat()) {
+			setNumThreads(params);
+			if(NativeHelper.conv2dBackwardFilterDense(input.denseBlock, dout.denseBlock, outputBlock.denseBlock, params.N, params.C, params.H, params.W, 
+						params.K, params.R, params.S, params.stride_h, params.stride_w, params.pad_h, params.pad_w, 
+						params.P, params.Q, params.numThreads)) {
+				Statistics.numNativeLibMatrixDNNCalls.increment();
+				// post-processing: maintain nnz
+				outputBlock.recomputeNonZeros();
+				return;
+			}
+			else {
+				// Fall back to Java on failure
+				Statistics.incrementNativeFailuresCounter();
+			}
+		}
+		// Fall back to Java on failure or sparse inputs
+		LibMatrixDNN.conv2dBackwardFilter(input, dout, outputBlock, params);
+	}
+	
+	/**
+	 * This method computes the backpropagation errors for the previous layer of the convolution operation.
+	 * 
+	 * @param filter filter used in conv2d 
+	 * @param dout errors from next layer
+	 * @param outputBlock  output errors
+	 * @param params convolution parameters
+	 * @throws DMLRuntimeException if DMLRuntimeException occurs
+	 */
+	public static void conv2dBackwardData(MatrixBlock filter, MatrixBlock dout, MatrixBlock outputBlock, ConvolutionParameters params) throws DMLRuntimeException {
+		LibMatrixDNN.checkInputsConv2dBackwardData(filter, dout, outputBlock, params);
+		params.numThreads = params.numThreads <= 0 ? NativeHelper.getMaxNumThreads() : params.numThreads;
+		if(NativeHelper.isNativeLibraryLoaded() && !dout.isInSparseFormat() && !filter.isInSparseFormat()) {
+			setNumThreads(params);
+			if(NativeHelper.conv2dBackwardDataDense(filter.denseBlock, dout.denseBlock, outputBlock.denseBlock, params.N, params.C, params.H, params.W, 
+						params.K, params.R, params.S, params.stride_h, params.stride_w, params.pad_h, params.pad_w, 
+						params.P, params.Q, params.numThreads)) {
+				Statistics.numNativeLibMatrixDNNCalls.increment();
+				// post-processing: maintain nnz
+				outputBlock.recomputeNonZeros();
+				return;
+			}
+			else {
+				// Fall back to Java on failure
+				Statistics.incrementNativeFailuresCounter();
+			}
+		}
+		// Fall back to Java on failure or sparse inputs
+		LibMatrixDNN.conv2dBackwardData(filter, dout, outputBlock, params);
+	}
+}

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/39a37ae4/src/main/java/org/apache/sysml/runtime/matrix/data/MatrixBlock.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/matrix/data/MatrixBlock.java b/src/main/java/org/apache/sysml/runtime/matrix/data/MatrixBlock.java
index c458b07..8fac947 100644
--- a/src/main/java/org/apache/sysml/runtime/matrix/data/MatrixBlock.java
+++ b/src/main/java/org/apache/sysml/runtime/matrix/data/MatrixBlock.java
@@ -35,6 +35,7 @@ import java.util.stream.LongStream;
 import org.apache.commons.math3.random.Well1024a;
 import org.apache.hadoop.io.DataInputBuffer;
 import org.apache.sysml.conf.ConfigurationManager;
+import org.apache.sysml.conf.DMLConfig;
 import org.apache.sysml.hops.Hop.OpOp2;
 import org.apache.sysml.hops.OptimizerUtils;
 import org.apache.sysml.lops.MMTSJ.MMTSJType;
@@ -4873,15 +4874,26 @@ public class MatrixBlock extends MatrixValue implements CacheBlock, Externalizab
 		
 		return sum_wt;
 	}
+	
+	public MatrixValue aggregateBinaryOperations(MatrixIndexes m1Index, MatrixValue m1Value, MatrixIndexes m2Index, MatrixValue m2Value, 
+			MatrixValue result, AggregateBinaryOperator op ) throws DMLRuntimeException
+	{
+		boolean enableNativeBLAS = ConfigurationManager.getDMLConfig().getBooleanValue(DMLConfig.NATIVE_BLAS);
+		return aggregateBinaryOperations(m1Value, m2Value, result, op, enableNativeBLAS);
+	}
 
 	public MatrixValue aggregateBinaryOperations(MatrixIndexes m1Index, MatrixValue m1Value, MatrixIndexes m2Index, MatrixValue m2Value, 
-			MatrixValue result, AggregateBinaryOperator op ) 
-		throws DMLRuntimeException
+			MatrixValue result, AggregateBinaryOperator op, boolean enableNativeBLAS ) throws DMLRuntimeException
 	{
-		return aggregateBinaryOperations(m1Value, m2Value, result, op);
+		return aggregateBinaryOperations(m1Value, m2Value, result, op, enableNativeBLAS);
+	}
+	
+	public MatrixValue aggregateBinaryOperations(MatrixValue m1Value, MatrixValue m2Value, MatrixValue result, AggregateBinaryOperator op) throws DMLRuntimeException {
+		boolean enableNativeBLAS = ConfigurationManager.getDMLConfig().getBooleanValue(DMLConfig.NATIVE_BLAS); 
+		return aggregateBinaryOperations(m1Value, m2Value, result, op, enableNativeBLAS);
 	}
 
-	public MatrixValue aggregateBinaryOperations(MatrixValue m1Value, MatrixValue m2Value, MatrixValue result, AggregateBinaryOperator op) 
+	public MatrixValue aggregateBinaryOperations(MatrixValue m1Value, MatrixValue m2Value, MatrixValue result, AggregateBinaryOperator op, boolean nativeMatMult) 
 		throws DMLRuntimeException
 	{
 		//check input types, dimensions, configuration
@@ -4907,7 +4919,9 @@ public class MatrixBlock extends MatrixValue implements CacheBlock, Externalizab
 			ret.reset(rl, cl, sp.sparse, sp.estimatedNonZeros);
 		
 		//compute matrix multiplication (only supported binary aggregate operation)
-		if( op.getNumThreads() > 1 )
+		if( nativeMatMult )
+			LibMatrixNative.matrixMult(m1, m2, ret, op.getNumThreads());
+		else if( op.getNumThreads() > 1 )
 			LibMatrixMult.matrixMult(m1, m2, ret, op.getNumThreads());
 		else
 			LibMatrixMult.matrixMult(m1, m2, ret);

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/39a37ae4/src/main/java/org/apache/sysml/runtime/util/ConvolutionUtils.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/util/ConvolutionUtils.java b/src/main/java/org/apache/sysml/runtime/util/ConvolutionUtils.java
index b988546..8e51405 100644
--- a/src/main/java/org/apache/sysml/runtime/util/ConvolutionUtils.java
+++ b/src/main/java/org/apache/sysml/runtime/util/ConvolutionUtils.java
@@ -22,6 +22,8 @@ package org.apache.sysml.runtime.util;
 import java.util.Arrays;
 
 import org.apache.sysml.runtime.DMLRuntimeException;
+import org.apache.sysml.runtime.functionobjects.Multiply;
+import org.apache.sysml.runtime.functionobjects.Plus;
 import org.apache.sysml.runtime.matrix.data.MatrixBlock;
 import org.apache.sysml.runtime.matrix.operators.BinaryOperator;
 import org.apache.sysml.runtime.matrix.operators.ScalarOperator;
@@ -58,12 +60,20 @@ public class ConvolutionUtils {
 		}
 		return ret;
 	}
+
 	
-	// Performs dest[destPos ...] <- src[src_rl:src_ru, ]
-	//Assumes that dest is zeroed-out before calling
-	public static void copy(MatrixBlock src, double [] dest, int destPos, int destNumCols, int src_rl, int src_ru) {
+	// Performs dest[destPos...] op= src[src_rl:src_ru,]
+	public static void binaryOperationInPlace(MatrixBlock src, double [] dest, 
+			int destPos, int destNumCols, int src_rl, int src_ru, BinaryOperator op) throws DMLRuntimeException {
 		if(src.isInSparseFormat()) {
-			if(!src.isEmptyBlock()) {
+			if(src.isEmptyBlock() && op.fn == Plus.getPlusFnObject()) {
+				// Do nothing: Inplace addition by zero
+			}
+			else if(src.isEmptyBlock() && op.fn == Multiply.getMultiplyFnObject()) {
+				// Inplace multiplication by zero
+				Arrays.fill(dest, destPos, destPos + (src_ru-src_rl)*destNumCols, 0);
+			}
+			else if(op.fn == Plus.getPlusFnObject()) {
 				for(int i = src_rl, cix = destPos; i < src_ru; i++, cix += destNumCols) {
 					if( !src.getSparseBlock().isEmpty(i) ) {
 						int apos = src.getSparseBlock().pos(i);
@@ -71,37 +81,54 @@ public class ConvolutionUtils {
 						int[] aix = src.getSparseBlock().indexes(i);
 						double[] avals = src.getSparseBlock().values(i);
 						for(int j = apos; j < apos+alen; j++) {
-							dest[ cix+aix[j] ] = avals[j];
+							dest[ cix+aix[j] ] += avals[j];
 						}
 					}
 				}
 			}
-		}
-		else {
-			System.arraycopy(src.getDenseBlock(), src_rl*src.getNumColumns(), dest, destPos, (src_ru-src_rl)*src.getNumColumns());
-		}
-	}
-	
-	// Performs dest[destPos...] op= thatValue[src_rl:src_ru,]
-	public static void binaryOperationInPlace(MatrixBlock src, double [] dest, 
-			int destPos, int destNumCols, int src_rl, int src_ru, BinaryOperator op) throws DMLRuntimeException {
-		if(src.isInSparseFormat()) {
-			for(int i = src_rl, cix = destPos; i < src_ru; i++, cix += destNumCols) {
-				if( !src.getSparseBlock().isEmpty(i) ) {
-					int apos = src.getSparseBlock().pos(i);
-					int alen = src.getSparseBlock().size(i);
-					int[] aix = src.getSparseBlock().indexes(i);
-					double[] avals = src.getSparseBlock().values(i);
-					for(int j = apos; j < apos+alen; j++) {
-						dest[ cix+aix[j] ] = op.fn.execute(dest[ cix+aix[j] ], avals[j]);
+			else if(op.fn == Multiply.getMultiplyFnObject()) {
+				// Unsafe operation: zeros not materialized in the sparse rows must still
+				// zero out dest, hence the fills over the gaps between stored indexes.
+				for(int i = src_rl, cix = destPos; i < src_ru; i++, cix += destNumCols) {
+					if( !src.getSparseBlock().isEmpty(i) ) {
+						int apos = src.getSparseBlock().pos(i);
+						int alen = src.getSparseBlock().size(i);
+						int[] aix = src.getSparseBlock().indexes(i);
+						double[] avals = src.getSparseBlock().values(i);
+						int prevDestIndex = 0;
+						for(int j = apos; j < apos+alen; j++) {
+							// Multiplication by zero. Assumption: aix is sorted.
+							Arrays.fill(dest, cix+prevDestIndex, cix+aix[j], 0);
+							prevDestIndex = aix[j]+1;
+							dest[ cix+aix[j] ] *= avals[j];
+						}
+						Arrays.fill(dest, cix+prevDestIndex, cix+destNumCols, 0);
+					}
+					else {
+						Arrays.fill(dest, cix, cix + destNumCols, 0);
 					}
 				}
 			}
+			else {
+				// Other operations could be safe or unsafe; unsupported sparse operations are caught at development time.
+				throw new DMLRuntimeException("Unimplemented sparse operation");
+			}
 		}
 		else {
 			double [] inputArr = src.getDenseBlock();
-			for(int i = destPos; i < src_ru*destNumCols; i++) {
-				dest[i] = op.fn.execute(dest[i], inputArr[i]);
+			if(op.fn == Plus.getPlusFnObject()) {
+				for(int i = destPos; i < src_ru*destNumCols; i++) {
+					dest[i] += inputArr[i];
+				}
+			}
+			else if(op.fn == Multiply.getMultiplyFnObject()) {
+				for(int i = destPos; i < src_ru*destNumCols; i++) {
+					dest[i] *= inputArr[i];
+				}
+			}
+			else {
+				for(int i = destPos; i < src_ru*destNumCols; i++) {
+					dest[i] = op.fn.execute(dest[i], inputArr[i]);
+				}
 			}
 		}
 	}
@@ -130,45 +157,6 @@ public class ConvolutionUtils {
 		}
 	}
 	
-	// dest (of size N x KPQ) = input (of size N x KPQ) op bias (of size K x 1)
-	public static void binaryBiasOperations(MatrixBlock input, MatrixBlock bias, double [] dest, 
-			int K, int PQ, int rl, int ru, BinaryOperator op) throws DMLRuntimeException {
-		copy(input, dest, rl*K*PQ, K*PQ, rl, ru);
-		binaryBiasOperationInPlace(bias, dest, K, PQ, rl, ru, op);
-	}
-	
-	// dest (of size N x KPQ) op= bias (of size K x 1)
-	public static void binaryBiasOperationInPlace(MatrixBlock bias, double [] dest, 
-			int K, int PQ, int rl, int ru, BinaryOperator op) throws DMLRuntimeException {
-		// bias.getNumColumns() == 1 checked outside
-		if(!bias.isInSparseFormat()) {
-			double [] biasArr = bias.getDenseBlock();
-			int index = rl*K*PQ;
-			for(int n = rl; n < ru; n++) {
-				for(int k = 0; k < K; k++) {
-					for(int pq = 0; pq < PQ; pq++, index++) {
-						dest[index] = op.fn.execute(dest[index], biasArr[k]);
-					}
-				}
-			}
-		}
-		else {
-			for(int k = 0; k < K; k++) {
-				if( !bias.getSparseBlock().isEmpty(k) ) {
-					int apos = bias.getSparseBlock().pos(k);
-					double[] avals = bias.getSparseBlock().values(k);
-					double val = avals[apos];
-					for(int n = rl; n < ru; n++) {
-						int index = n*K*PQ + k*PQ;
-						for(int pq = 0; pq < PQ; pq++, index++) {
-							dest[index] = op.fn.execute(dest[index], val);
-						}
-					}
-				}
-			}
-		}
-	}
-	
 	public static void fillBias(MatrixBlock bias, double [] outputArray, int src_rl, int src_ru, int N, int K, int PQ) throws DMLRuntimeException {
 		// bias.getNumColumns() == 1 checked outside
 		if(bias.isInSparseFormat()) {

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/39a37ae4/src/main/java/org/apache/sysml/utils/EnvironmentHelper.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/utils/EnvironmentHelper.java b/src/main/java/org/apache/sysml/utils/EnvironmentHelper.java
new file mode 100644
index 0000000..c1b4c3e
--- /dev/null
+++ b/src/main/java/org/apache/sysml/utils/EnvironmentHelper.java
@@ -0,0 +1,27 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ * 
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.sysml.utils;
+
+
+/**
+ * This class is used to set environment variables required for loading the MKL library (done by NativeHelper).
+ */
+public class EnvironmentHelper {
+	public static native void setEnv(String key, String value);
+}

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/39a37ae4/src/main/java/org/apache/sysml/utils/NativeHelper.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/utils/NativeHelper.java b/src/main/java/org/apache/sysml/utils/NativeHelper.java
new file mode 100644
index 0000000..2b997ed
--- /dev/null
+++ b/src/main/java/org/apache/sysml/utils/NativeHelper.java
@@ -0,0 +1,259 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ * 
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.sysml.utils;
+
+import java.io.IOException;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+
+import java.util.HashMap;
+import java.io.InputStream;
+import java.io.OutputStream;
+import java.io.File;
+
+import org.apache.commons.io.FileUtils;
+import org.apache.commons.io.IOUtils;
+import org.apache.commons.lang.SystemUtils;
+import org.apache.sysml.conf.ConfigurationManager;
+import org.apache.sysml.conf.DMLConfig;
+import org.apache.sysml.hops.OptimizerUtils;
+
+/**
+ * This class helps in loading the native library.
+ * By default, it first tries to load Intel MKL and, failing that, tries to load OpenBLAS.
+ */
+public class NativeHelper {
+	private static boolean isSystemMLLoaded = false;
+	private static final Log LOG = LogFactory.getLog(NativeHelper.class.getName());
+	private static HashMap<String, String> supportedArchitectures = new HashMap<String, String>();
+	public static String blasType;
+	private static int maxNumThreads = -1;
+	private static boolean setMaxNumThreads = false;
+	static {
+		// Note: we only support 64-bit Java on x86_64 and amd64 machines
+		supportedArchitectures.put("x86_64", "x86_64");
+		supportedArchitectures.put("amd64", "x86_64");
+	}
+	
+	private static boolean attemptedLoading = false;
+	
+	// Performing the loading in a method instead of a static block yields a detailed stack trace in case of fatal errors
+	private static void init() {
+		// Only Linux supported for BLAS
+		if(!SystemUtils.IS_OS_LINUX)
+			return;
+		
+		// The attemptedLoading variable ensures that we don't try to load SystemML and other
+		// dependencies again and again, especially in parfor (hence the double-checked locking with synchronized).
+		if(!attemptedLoading) {
+			DMLConfig dmlConfig = ConfigurationManager.getDMLConfig();
+			String userSpecifiedBLAS = System.getenv("SYSTEMML_BLAS");
+			userSpecifiedBLAS = (userSpecifiedBLAS == null) ? "" :  userSpecifiedBLAS.trim().toLowerCase();
+			// -------------------------------------------------------------------------------------
+			// We allow BLAS to be enabled or disabled or explicitly selected in one of the two ways:
+			// 1. DML Configuration: native.blas (boolean flag)
+			// 2. Environment variable: SYSTEMML_BLAS (can be set to mkl, openblas or none)
+			// The option 1 will be removed in later SystemML versions.
+			// The option 2 is useful for two reasons:
+			// - Developer testing of different BLAS 
+			// - Provides fine-grained control. Certain machines could use mkl while others use openblas, etc.
+			boolean enabledViaConfig = (dmlConfig == null) ? true : dmlConfig.getBooleanValue(DMLConfig.NATIVE_BLAS);
+			boolean enabledViaEnvironmentVariable = userSpecifiedBLAS.equals("") || userSpecifiedBLAS.equals("mkl") || userSpecifiedBLAS.equals("openblas");
+			
+			if(enabledViaConfig && enabledViaEnvironmentVariable) {
+				long start = System.nanoTime();
+				if(!supportedArchitectures.containsKey(SystemUtils.OS_ARCH)) {
+					LOG.warn("Unsupported architecture for native BLAS:" + SystemUtils.OS_ARCH);
+					return;
+				}
+	    	synchronized(NativeHelper.class) {
+	    		if(!attemptedLoading) {
+	    			// =============================================================================
+	    			// By default, native.blas=true and we attempt to load MKL first.
+	    			// If MKL is not available, we try to load OpenBLAS.
+	    			// If neither MKL nor OpenBLAS is available, we fall back to the internal Java library.
+	    			if(userSpecifiedBLAS.equalsIgnoreCase("")) {
+	    				blasType = isMKLAvailable() ? "mkl" : isOpenBLASAvailable() ? "openblas" : null;
+	    				if(blasType == null)
+	    					LOG.warn("Unable to load either MKL or OpenBLAS");
+	    			}
+	    			else if(userSpecifiedBLAS.equalsIgnoreCase("mkl")) {
+	    				blasType = isMKLAvailable() ? "mkl" : null;
+	    				if(blasType == null)
+	    					LOG.warn("Unable to load MKL");
+	    			}
+	    			else if(userSpecifiedBLAS.equalsIgnoreCase("openblas")) {
+	    				blasType = isOpenBLASAvailable() ? "openblas" : null;
+	    				if(blasType == null)
+	    					LOG.warn("Unable to load OpenBLAS");
+	    			}
+	    			else {
+	    				LOG.warn("Unsupported BLAS:" + userSpecifiedBLAS);
+	    			}
+	    			// =============================================================================
+				    if(blasType != null && loadLibraryHelper("libsystemml_" + blasType + "-Linux-x86_64.so")) {
+							LOG.info("Using native blas: " + blasType);
+							isSystemMLLoaded = true;
+						}
+	    		}
+	    	}
+	    	double timeToLoadInMilliseconds = (System.nanoTime()-start)*1e-6;
+	    	if(timeToLoadInMilliseconds > 100) 
+	    		LOG.warn("Time to load native blas: " + timeToLoadInMilliseconds + " milliseconds.");
+			}
+			else {
+				// Note: reaching here with enabledViaConfig==true means the environment variable disabled native BLAS.
+				if(!enabledViaConfig)
+					LOG.warn("Using internal Java BLAS as native BLAS support is disabled by the configuration 'native.blas'.");
+				else
+					LOG.warn("Using internal Java BLAS as native BLAS support is disabled by the environment variable 'SYSTEMML_BLAS=" + userSpecifiedBLAS + "'.");
+			}
+			attemptedLoading = true;
+		}
+	}
+	
+	public static boolean isNativeLibraryLoaded() {
+		init();
+		if(maxNumThreads == -1)
+			maxNumThreads = OptimizerUtils.getConstrainedNumThreads(-1);
+		if(isSystemMLLoaded && !setMaxNumThreads && maxNumThreads != -1) {
+			// Configure the native thread count once after loading; see the native
+			// setMaxNumThreads method below for the JNI trade-off discussion.
+			setMaxNumThreads(maxNumThreads);
+			setMaxNumThreads = true;
+		}
+		return isSystemMLLoaded;
+	}
+	
+	public static int getMaxNumThreads() {
+		if(maxNumThreads == -1)
+			maxNumThreads = OptimizerUtils.getConstrainedNumThreads(-1);
+		return maxNumThreads;
+	}
+	
+	
+	private static boolean isMKLAvailable() {
+		// ------------------------------------------------------------
+		// Set environment variable MKL_THREADING_LAYER to GNU on Linux for performance
+		if(!loadLibraryHelper("libpreload_systemml-Linux-x86_64.so")) {
+			LOG.warn("Unable to load preload_systemml (required for loading MKL-enabled SystemML library)");
+			return false;
+		}
+		// The most reliable way in my investigation to ensure that MKL runs smoothly with OpenMP (used by conv2d*)
+		// is setting the environment variable MKL_THREADING_LAYER to GNU
+		EnvironmentHelper.setEnv("MKL_THREADING_LAYER", "GNU");
+		if(!loadBLAS("gomp", "gomp required for loading MKL-enabled SystemML library")) 
+			return false;
+		
+		// ------------------------------------------------------------
+		return loadBLAS("mkl_rt", null);
+	}
+	
+	private static boolean isOpenBLASAvailable() {
+		if(!loadBLAS("gomp", "gomp required for loading OpenBLAS-enabled SystemML library")) 
+			return false;
+		return loadBLAS("openblas", null);
+	}
+	
+	private static boolean loadBLAS(String blas, String optionalMsg) {
+		try {
+			 System.loadLibrary(blas);
+			 return true;
+		}
+		catch (UnsatisfiedLinkError e) {
+			if(optionalMsg != null)
+				LOG.warn("Unable to load " + blas + "(" + optionalMsg + "):" + e.getMessage());
+			else
+				LOG.warn("Unable to load " + blas + ":" + e.getMessage());
+			return false;
+		}
+	}
+
+	private static boolean loadLibraryHelper(String path)  {
+		InputStream in = null; OutputStream out = null;
+		try {
+			// This logic is needed because Java does not allow loading a native library directly from a jar resource; we copy it to a temporary file first.
+			in = NativeHelper.class.getResourceAsStream("/lib/"+path);
+			if(in != null) {
+				File temp = File.createTempFile(path, "");
+				temp.deleteOnExit();
+				out = FileUtils.openOutputStream(temp);
+				IOUtils.copy(in, out);
+				in.close(); in = null;
+				out.close(); out = null;
+				System.load(temp.getAbsolutePath());
+				return true;
+			}
+			else
+				LOG.warn("No lib available in the jar:" + path);
+		} catch(IOException e) {
+			LOG.warn("Unable to load library " + path + " from resource:" + e.getMessage());
+		} finally {
+			if(out != null)
+				try {
+					out.close();
+				} catch (IOException e) {}
+			if(in != null)
+				try {
+					in.close();
+				} catch (IOException e) {}
+		}
+		return false;
+	}
+	
+	// TODO: Add pmm, wsloss, mmchain, etc.
+	public static native boolean matrixMultDenseDense(double [] m1, double [] m2, double [] ret, int m1rlen, int m1clen, int m2clen, int numThreads);
+	private static native boolean tsmm(double [] m1, double [] ret, int m1rlen, int m1clen, boolean isLeftTranspose, int numThreads);
+	
+	// ----------------------------------------------------------------------------------------------------------------
+	// LibMatrixDNN operations:
+	// N = number of images, C = number of channels, H = image height, W = image width
+	// K = number of filters, R = filter height, S = filter width
+	// TODO: case not handled: sparse filters (which will only be executed in Java). Since filters are relatively small, this is a low priority.
+	
+	// Called by ConvolutionCPInstruction if both input and filter are dense
+	public static native boolean conv2dDense(double [] input, double [] filter, double [] ret, int N, int C, int H, int W, 
+			int K, int R, int S, int stride_h, int stride_w, int pad_h, int pad_w, int P, int Q, int numThreads);
+	public static native boolean conv2dBiasAddDense(double [] input, double [] bias, double [] filter, double [] ret, int N, int C, int H, int W, 
+			int K, int R, int S, int stride_h, int stride_w, int pad_h, int pad_w, int P, int Q, int numThreads);
+	// Called by ConvolutionCPInstruction if both input and filter are dense
+	public static native boolean conv2dBackwardFilterDense(double [] input, double [] dout, double [] ret, int N, int C, int H, int W, 
+			int K, int R, int S, int stride_h, int stride_w, int pad_h, int pad_w, int P, int Q, int numThreads);
+	// If both filter and dout are dense, then called by ConvolutionCPInstruction
+	// Else, called by LibMatrixDNN's thread if filter is dense. dout[n] is converted to dense if sparse.
+	public static native boolean conv2dBackwardDataDense(double [] filter, double [] dout, double [] ret, int N, int C, int H, int W, 
+			int K, int R, int S, int stride_h, int stride_w, int pad_h, int pad_w, int P, int Q, int numThreads);
+	
+	// Currently only supported with numThreads = 1 and sparse input
+	// Called by LibMatrixDNN's thread if input is sparse. dout[n] is converted to dense if sparse.
+	public static native boolean conv2dBackwardFilterSparseDense(int apos, int alen, int[] aix, double[] avals, double [] rotatedDoutPtr, double [] ret, int N, int C, int H, int W, 
+			int K, int R, int S, int stride_h, int stride_w, int pad_h, int pad_w, int P, int Q, int numThreads);
+	// Called by LibMatrixDNN's thread if input is sparse and filter is dense
+	public static native boolean conv2dSparse(int apos, int alen, int[] aix, double[] avals, double [] filter, double [] ret, int N, int C, int H, int W, 
+			int K, int R, int S, int stride_h, int stride_w, int pad_h, int pad_w, int P, int Q, int numThreads);
+	// ----------------------------------------------------------------------------------------------------------------
+	
+	// This method helps us decide whether to use GetPrimitiveArrayCritical or GetDoubleArrayElements in JNI, as each has different trade-offs.
+	// In the current implementation, we always use GetPrimitiveArrayCritical as it has proven to be the fastest.
+	// We may revisit this decision later, hence we do not recommend removing this method.
+	private static native void setMaxNumThreads(int numThreads);
+}

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/39a37ae4/src/main/java/org/apache/sysml/utils/Statistics.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/utils/Statistics.java b/src/main/java/org/apache/sysml/utils/Statistics.java
index 097f58e..97888cb 100644
--- a/src/main/java/org/apache/sysml/utils/Statistics.java
+++ b/src/main/java/org/apache/sysml/utils/Statistics.java
@@ -112,6 +112,17 @@ public class Statistics
 		return numExecutedMRJobs.longValue();
 	}
 	
+	private static LongAdder numNativeFailures = new LongAdder();
+	public static LongAdder numNativeLibMatrixMultCalls = new LongAdder();
+	public static LongAdder numNativeLibMatrixDNNCalls = new LongAdder();
+	public static void incrementNativeFailuresCounter() {
+		numNativeFailures.increment();
+		// This is very rare and we are not sure it is possible at all. Our initial experiments never encountered this case.
+		// Note: all the native calls have a fallback to Java, so if desired one can recompile SystemML with
+		// this exception commented out and everything should work fine.
+		throw new RuntimeException("Unexpected ERROR: OOM caused during JNI transfer. Please disable native BLAS by setting environment variable: SYSTEMML_BLAS=none");
+	}
+	
 	public static void incrementNoOfExecutedMRJobs() {
 		numExecutedMRJobs.increment();
 	}
@@ -366,6 +377,9 @@ public class Statistics
 		resetCPHeavyHitters();
 
 		GPUStatistics.reset();
+		numNativeLibMatrixMultCalls.reset();
+		numNativeLibMatrixDNNCalls.reset();
+		numNativeFailures.reset();
 		LibMatrixDNN.resetStatistics();
 	}
 
@@ -621,6 +635,11 @@ public class Statistics
 		//show extended caching/compilation statistics
 		if( DMLScript.STATISTICS ) 
 		{
+			if(NativeHelper.blasType != null && (numNativeLibMatrixMultCalls.longValue() > 0 || 
+					numNativeLibMatrixDNNCalls.longValue() > 0)) {
+				String blas = NativeHelper.blasType; // non-null here, guarded by the enclosing check
+				sb.append("Native " + blas + " calls (LibMatrixMult/LibMatrixDNN):\t" + numNativeLibMatrixMultCalls.longValue()  + "/" + numNativeLibMatrixDNNCalls.longValue() + ".\n");
+			}
 			sb.append("Cache hits (Mem, WB, FS, HDFS):\t" + CacheStatistics.displayHits() + ".\n");
 			sb.append("Cache writes (WB, FS, HDFS):\t" + CacheStatistics.displayWrites() + ".\n");
 			sb.append("Cache times (ACQr/m, RLS, EXP):\t" + CacheStatistics.displayTime() + " sec.\n");



[3/3] incubator-systemml git commit: [SYSTEMML-769] Adding support for native BLAS for Linux

Posted by ni...@apache.org.
[SYSTEMML-769] Adding support for native BLAS for Linux

- Both MKL and OpenBLAS show 2-3x performance benefits on conv2d
  operators on Lenet.
- There are several OpenMP-related issues on Mac, hence we have explicitly
  disabled native support for Mac and Windows. Since we already have the
  cmake setup in place, we can support BLAS on Mac and Windows in the
  future once the OpenMP-related issues are resolved.

Closes #344.


Project: http://git-wip-us.apache.org/repos/asf/incubator-systemml/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-systemml/commit/39a37ae4
Tree: http://git-wip-us.apache.org/repos/asf/incubator-systemml/tree/39a37ae4
Diff: http://git-wip-us.apache.org/repos/asf/incubator-systemml/diff/39a37ae4

Branch: refs/heads/master
Commit: 39a37ae40c84d934236a7748252b196fb0a8b08d
Parents: 4b6d468
Author: Niketan Pansare <np...@us.ibm.com>
Authored: Sun Apr 30 10:16:23 2017 -0800
Committer: Niketan Pansare <np...@us.ibm.com>
Committed: Sun Apr 30 11:16:23 2017 -0700

----------------------------------------------------------------------
 conf/SystemML-config.xml.template               |   3 +
 docs/native-backend.md                          | 204 ++++++++++
 docs/troubleshooting-guide.md                   |   6 +-
 pom.xml                                         |   4 +
 src/assembly/source.xml                         |   1 +
 src/main/cpp/CMakeLists.txt                     |  76 ++++
 src/main/cpp/check-dependency-linux-x86_64.sh   |  45 +++
 src/main/cpp/cmake/FindMKL.cmake                | 132 +++++++
 src/main/cpp/cmake/FindOpenBLAS.cmake           |  79 ++++
 src/main/cpp/config.h.cmake                     |  30 ++
 .../cpp/lib/libpreload_systemml-Linux-x86_64.so | Bin 0 -> 7976 bytes
 .../cpp/lib/libsystemml_mkl-Linux-x86_64.so     | Bin 0 -> 27408 bytes
 .../lib/libsystemml_openblas-Linux-x86_64.so    | Bin 0 -> 27416 bytes
 src/main/cpp/libmatrixdnn.cpp                   | 333 ++++++++++++++++
 src/main/cpp/libmatrixdnn.h                     |  38 ++
 src/main/cpp/libmatrixmult.cpp                  |  63 +++
 src/main/cpp/libmatrixmult.h                    |  62 +++
 src/main/cpp/preload/preload_systemml.cpp       |  35 ++
 src/main/cpp/preload/preload_systemml.h         |  40 ++
 src/main/cpp/systemml.cpp                       | 225 +++++++++++
 src/main/cpp/systemml.h                         | 106 +++++
 .../java/org/apache/sysml/api/DMLScript.java    |   3 +-
 .../java/org/apache/sysml/conf/DMLConfig.java   |   4 +-
 .../org/apache/sysml/hops/ConvolutionOp.java    |   9 +-
 .../apache/sysml/lops/ConvolutionTransform.java |   5 +-
 .../instructions/CPInstructionParser.java       |   1 +
 .../instructions/GPUInstructionParser.java      |   1 -
 .../cp/AggregateBinaryCPInstruction.java        |  14 +-
 .../cp/ConvolutionCPInstruction.java            |  49 ++-
 .../gpu/ConvolutionGPUInstruction.java          |   9 +-
 .../spark/ConvolutionSPInstruction.java         |  21 +-
 .../matrix/data/ConvolutionParameters.java      |   2 +-
 .../runtime/matrix/data/LibMatrixCUDA.java      |  35 --
 .../sysml/runtime/matrix/data/LibMatrixDNN.java | 387 +++++++++++++------
 .../runtime/matrix/data/LibMatrixMult.java      |  13 +-
 .../runtime/matrix/data/LibMatrixNative.java    | 200 ++++++++++
 .../sysml/runtime/matrix/data/MatrixBlock.java  |  24 +-
 .../sysml/runtime/util/ConvolutionUtils.java    | 116 +++---
 .../apache/sysml/utils/EnvironmentHelper.java   |  27 ++
 .../org/apache/sysml/utils/NativeHelper.java    | 259 +++++++++++++
 .../java/org/apache/sysml/utils/Statistics.java |  19 +
 .../test/integration/AutomatedTestBase.java     |   5 +-
 .../functions/tensor/Conv2DTest.java            |  25 +-
 .../functions/tensor/Conv2DBackwardDataTest.R   |   6 +-
 .../functions/tensor/Conv2DBackwardDataTest.dml |   8 +-
 .../functions/tensor/Conv2DBackwardTest.R       |   6 +-
 .../functions/tensor/Conv2DBackwardTest.dml     |   8 +-
 src/test/scripts/functions/tensor/Conv2DTest.R  |   6 +-
 .../scripts/functions/tensor/Conv2DTest.dml     |   8 +-
 src/test/scripts/functions/tensor/PoolTest.R    |   3 +-
 src/test/scripts/functions/tensor/PoolTest.dml  |   4 +-
 51 files changed, 2469 insertions(+), 290 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/39a37ae4/conf/SystemML-config.xml.template
----------------------------------------------------------------------
diff --git a/conf/SystemML-config.xml.template b/conf/SystemML-config.xml.template
index fe4437f..0ccf7da 100644
--- a/conf/SystemML-config.xml.template
+++ b/conf/SystemML-config.xml.template
@@ -65,6 +65,9 @@
    
    <!-- if codegen.enabled, compile literals as constants: 1..heuristic, 2..always -->
    <codegen.literals>1</codegen.literals>
+   
+   <!-- enables native blas for matrix multiplication and convolution, experimental feature -->
+   <native.blas>true</native.blas>
 
    <!-- prints extra statistics information for GPU -->
    <systemml.stats.extraGPU>false</systemml.stats.extraGPU>

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/39a37ae4/docs/native-backend.md
----------------------------------------------------------------------
diff --git a/docs/native-backend.md b/docs/native-backend.md
new file mode 100644
index 0000000..86a1340
--- /dev/null
+++ b/docs/native-backend.md
@@ -0,0 +1,204 @@
+<!--
+{% comment %}
+Licensed to the Apache Software Foundation (ASF) under one or more
+contributor license agreements.  See the NOTICE file distributed with
+this work for additional information regarding copyright ownership.
+The ASF licenses this file to you under the Apache License, Version 2.0
+(the "License"); you may not use this file except in compliance with
+the License.  You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+{% endcomment %}
+-->
+
+# User Guide
+
+By default, SystemML implements all its matrix operations in Java.
+This simplifies deployment especially in a distributed environment.
+
+In some cases (such as deep learning), the user might want to use native BLAS
+rather than SystemML's internal Java library for performing single-node
+operations such as matrix multiplication, convolution, etc.
+By default, SystemML will first attempt to use Intel MKL (if installed)
+and then OpenBLAS (if installed).
+If both Intel MKL and OpenBLAS are not available, SystemML
+falls back to its internal Java library.
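+
+Internally, the selection order reduces to the following (a simplified sketch of
+`NativeHelper`'s load-time logic, not the exact implementation):
+
+```java
+// Simplified sketch of the default BLAS selection order at load time.
+String blasType = isMKLAvailable() ? "mkl"
+	: isOpenBLASAvailable() ? "openblas"
+	: null; // null: fall back to the internal Java library
+```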
+
+To force SystemML to use the internal Java library rather than native BLAS,
+please set the configuration property `native.blas` to `false`.
+
+The current version of SystemML only supports BLAS on Linux machines.
+
+
+## Step 1: Install BLAS
+
+### Option 1: Install Intel MKL (recommended)
+
+Download and install the [community version of Intel MKL](https://software.intel.com/sites/campaigns/nest/).
+Intel requires you to register your email address first and then sends the download link
+and license key to that address.
+
+* Linux users will have to extract the downloaded `.tgz` file, execute `install.sh` and follow the guided setup.
+
+### Option 2: Install OpenBLAS  
+
+```bash
+# The default OpenBLAS (via yum/apt-get) uses its internal threading rather than OpenMP, 
+# which can lead to performance degradation when using SystemML. Instead, we recommend that you
+# compile OpenBLAS from source.
+# RedHat / CentOS: sudo yum install openblas
+# Ubuntu: sudo apt-get install openblas
+git clone https://github.com/xianyi/OpenBLAS.git
+cd OpenBLAS/
+make clean
+make USE_OPENMP=1
+sudo make install
+# After installation, you may also want to add `/opt/OpenBLAS/lib` to your LD_LIBRARY_PATH or `java.library.path`.
+```
+
+You can check whether the OpenBLAS on your system is compiled with OpenMP using the following commands:
+
+```bash
+$ ldconfig -p | grep libopenblas.so
+libopenblas.so (libc6,x86-64) => /opt/OpenBLAS/lib/libopenblas.so
+$ ldd /opt/OpenBLAS/lib/libopenblas.so | grep libgomp
+libgomp.so.1 => /lib64/libgomp.so.1
+```
+
+If you don't see any output after the second command, then the OpenBLAS installed on your system is using its internal threading.
+In this case, we highly recommend that you reinstall OpenBLAS using the above commands.
+
+
+## Step 2: Install other dependencies
+
+```bash
+# Centos/RedHat
+sudo yum install gcc-c++
+# Ubuntu
+sudo apt-get install g++ 
+```
+
+We also depend on GNU OpenMP (gomp), which is installed along with GCC.
+To find the location of `gomp` on your system, please use the command `ldconfig -p | grep libgomp`.
+If gomp is only available as `/lib64/libgomp.so.1` instead of `/lib64/libgomp.so`,
+please add a symlink to it:
+
+```bash
+sudo ln -s /lib64/libgomp.so.1 /lib64/libgomp.so
+```
+	
+## Step 3: Provide the location of the native libraries
+
+1. Add the location of the native libraries (i.e. BLAS and other dependencies) 
+to the environment variable `LD_LIBRARY_PATH` (on Linux). 
+If you want to use SystemML with Spark, please add the following line to `spark-env.sh`:
+
+	```bash
+	export LD_LIBRARY_PATH=/path/to/blas-n-other-dependencies
+	# Or export SPARK_LIBRARY_PATH=/path/to/blas-n-other-dependencies
+	```
+
+2. Alternatively, you can pass the location of the native libraries using command-line options:
+
+- Java: `-Djava.library.path=/path/to/blas-n-other-dependencies`
+- [Spark](http://spark.apache.org/docs/latest/configuration.html): `--driver-library-path`
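+
+To verify which directories the JVM actually searches for native libraries, you can print
+`java.library.path` with a short standalone snippet (a diagnostic sketch, not part of SystemML):
+
+```java
+public class CheckLibPath {
+	public static void main(String[] args) {
+		// Prints the native library search path as seen by this JVM.
+		System.out.println(System.getProperty("java.library.path"));
+	}
+}
+```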
+
+## Common issues on Linux
+
+1. Unable to load `gomp`
+
+First, check whether gomp is available on your system.
+
+	```bash
+	ldconfig -p | grep  libgomp
+	```
+
+If the above command returns no results, then you may have to install `gcc`.
+On the other hand, if the above command only returns libgomp with a major-version suffix (such as `.so.1`),
+then please execute the below command:
+
+	```bash
+	sudo ln -s /lib64/libgomp.so.1 /usr/lib64/libgomp.so
+	```
+
+2. Unable to load `mkl_rt`
+
+By default, Intel MKL libraries will be installed in the location `/opt/intel/mkl/lib/intel64/`.
+Make sure that this path is accessible to Java as per instructions provided in the above section.
+
+3. Unable to load `openblas`
+
+By default, OpenBLAS libraries will be installed in the location `/opt/OpenBLAS/lib/`.
+Make sure that this path is accessible to Java as per instructions provided in the above section.
+
+# Developer Guide
+
+This section describes how to compile shared libraries in the folder `src/main/cpp/lib`. 
+This is required when the developer makes changes to the cpp directory or while validating the source package during the release process.
+
+To force SystemML to use OpenBLAS instead of Intel MKL if both are installed,
+please set the environment variable `SYSTEMML_BLAS` to `openblas`.
+This environment variable is used internally for testing and is not required for users.
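+
+For reference, the variable is consumed at load time roughly as follows (a sketch mirroring
+`NativeHelper`; an unset variable means auto-detection):
+
+```java
+// Sketch: read and normalize the SYSTEMML_BLAS environment variable.
+String userSpecifiedBLAS = System.getenv("SYSTEMML_BLAS");
+userSpecifiedBLAS = (userSpecifiedBLAS == null) ? "" : userSpecifiedBLAS.trim().toLowerCase();
+// "": auto-detect (MKL, then OpenBLAS); "mkl"/"openblas": force that library;
+// any other value (e.g. "none"): disable native BLAS entirely.
+```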
+
+## Intro to CMake
+If you are familiar with cmake, skip this section.
+
+In a regular project with a Makefile, the compiled object files are placed in the same directory as the source.
+Sometimes we don't want to pollute the source tree. We might also want different binaries for different configurations, for instance to link a binary against different libraries.
+CMake supports out-of-source-tree builds. As an illustration, you can create a directory called "BUILD" and invoke cmake from it like so: `cmake <path/to/source>`. The makefile and other config files are placed in this "BUILD" directory. You can now run `make`, and the compiled objects and binary files are created in this directory. You can then create another "BUILD2" directory and repeat the process.
+You can pass options to cmake as well. In this instance, it might be to specify whether to build with Intel MKL or OpenBLAS. This can be done from the command line with a "-D" prefix, but more interestingly, it can also be done from an ncurses GUI, which is invoked as `ccmake <path/to/source>`. (You may need to install this separately.)
+Also, the C and C++ compilers and their flags are picked up by cmake when set in the standard environment variables `CC`, `CXX`, `CFLAGS` and `CXXFLAGS`. As an example, they may be specified as:
+
+	CXX=g++-6 cmake ..
+
+For this project, I typically make a directory in the `cpp` folder (this folder) and name it after the config I use: for instance, `INTEL` for Intel MKL and `OPENBLAS` for OpenBLAS.
+
+1. Install `g++`, OpenBLAS and MKL using the above instructions
+
+2. Set `JAVA_HOME` to the JDK installation.
+
+	export JAVA_HOME=<path to JDK 1.8>
+
+3. Install cmake
+
+	```bash
+	# Centos/RedHat
+	sudo yum install cmake3
+	# Ubuntu
+	sudo apt-get install cmake
+	```
+
+4. Compile the libraries using the script below.
+
+	```bash
+	mkdir INTEL && cd INTEL
+	cmake -DUSE_INTEL_MKL=ON -DCMAKE_BUILD_TYPE=Release -DCMAKE_C_COMPILER=gcc -DCMAKE_CXX_COMPILER=g++ -DCMAKE_CXX_FLAGS="-DUSE_GNU_THREADING -m64" ..
+	make install
+	cd ..
+	mkdir OPENBLAS && cd OPENBLAS
+	cmake -DUSE_OPEN_BLAS=ON -DCMAKE_BUILD_TYPE=Release -DCMAKE_C_COMPILER=gcc -DCMAKE_CXX_COMPILER=g++ -DCMAKE_CXX_FLAGS="-m64" ..
+	make install
+	cd ..
+	# The following script helps maintain this document and guards against accidental inclusion of non-standard dependencies.
+	./check-dependency-linux-x86_64.sh
+	```
+
+
+The generated library files are placed in `src/main/cpp/lib`. This location can be changed in the `CMakeLists.txt` file.
+
+The above script also checks whether additional dependencies were added during compilation and warns the developer.
+The current set of dependencies, other than MKL and OpenBLAS, is as follows:
+
+- GNU Standard C++ Library: `libstdc++.so.6`
+- GCC version 4.8 shared support library: `libgcc_s.so.1`
+- The GNU libc libraries: `libm.so.6, libdl.so.2, libc.so.6, libpthread.so.0`
+- GCC OpenMP v3.0 shared support library: `libgomp.so.1`
+- Additional OpenBLAS dependencies: Fortran runtime (`libgfortran.so.3`) and GCC `__float128` shared support library (`libquadmath.so.0`)
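+
+To inspect these dependencies manually, you can run `ldd` against one of the generated libraries, as the above script does internally:
+
+	```bash
+	ldd lib/libsystemml_mkl-Linux-x86_64.so
+	```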
+
+If CMake cannot detect your OpenBLAS installation, set the `OpenBLAS_HOME` environment variable to the OpenBLAS installation directory.
\ No newline at end of file
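+
+For example, assuming OpenBLAS is installed in its default location:
+
+	```bash
+	export OpenBLAS_HOME=/opt/OpenBLAS
+	```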

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/39a37ae4/docs/troubleshooting-guide.md
----------------------------------------------------------------------
diff --git a/docs/troubleshooting-guide.md b/docs/troubleshooting-guide.md
index 4731f51..e80cad4 100644
--- a/docs/troubleshooting-guide.md
+++ b/docs/troubleshooting-guide.md
@@ -135,4 +135,8 @@ by setting the configuration `systemml.stats.extraGPU` and `systemml.stats.extra
 
 Out-Of-Memory on executors is often caused due to side-effects of lazy evaluation and in-memory input data of Spark for large-scale problems. 
 Though we are constantly improving our optimizer to address this scenario, a quick hack to resolve this is reducing the number of cores allocated to the executor.
-We would highly appreciate if you file a bug report on our [issue tracker](https://issues.apache.org/jira/browse/SYSTEMML) if and when you encounter OOM.
\ No newline at end of file
+We would highly appreciate it if you file a bug report on our [issue tracker](https://issues.apache.org/jira/browse/SYSTEMML) if and when you encounter an OOM error.
+
+## Native BLAS errors
+
+Please see the [native backend user guide](http://apache.github.io/incubator-systemml/native-backend).
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/39a37ae4/pom.xml
----------------------------------------------------------------------
diff --git a/pom.xml b/pom.xml
index f4f6016..9a33853 100644
--- a/pom.xml
+++ b/pom.xml
@@ -111,6 +111,10 @@
 			</excludes>
 			<targetPath>kernels</targetPath>
 		</resource>
+		<resource>
+			<directory>src/main/cpp/lib</directory>
+			<targetPath>lib</targetPath>
+		</resource>
 	</resources>
 
 		<plugins>

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/39a37ae4/src/assembly/source.xml
----------------------------------------------------------------------
diff --git a/src/assembly/source.xml b/src/assembly/source.xml
index 8831dd9..4acb5b3 100644
--- a/src/assembly/source.xml
+++ b/src/assembly/source.xml
@@ -51,6 +51,7 @@
 				<exclude>**/temp/**/*</exclude>
 				<!--SYSTEML-479 -->
 				<exclude>./src/test/config/hadoop_bin_windows/**</exclude>
+				<exclude>./src/main/cpp/lib/**</exclude>
 			</excludes>
 		</fileSet>
 

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/39a37ae4/src/main/cpp/CMakeLists.txt
----------------------------------------------------------------------
diff --git a/src/main/cpp/CMakeLists.txt b/src/main/cpp/CMakeLists.txt
new file mode 100644
index 0000000..c492959
--- /dev/null
+++ b/src/main/cpp/CMakeLists.txt
@@ -0,0 +1,76 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+
+cmake_minimum_required(VERSION 2.8)
+
+project (systemml)
+
+# All custom find modules
+set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${CMAKE_SOURCE_DIR}/cmake/")
+
+# OpenMP is required
+find_package(OpenMP REQUIRED)
+
+# Options to Use OpenBLAS or Intel MKL
+option(USE_OPEN_BLAS "Whether to use OpenBLAS (Defaults to compiling with Intel MKL, if both set, MKL has priority)" OFF)
+option(USE_INTEL_MKL "Whether to use Intel MKL (Defaults to compiling with Intel MKL)" ON)
+
+# Build shared libraries
+add_library(systemml SHARED libmatrixdnn.cpp  libmatrixmult.cpp  systemml.cpp)
+add_library(preload SHARED preload/preload_systemml.cpp)
+
+set(MATH_LIBRARIES "")
+
+# sets the installation path to src/main/cpp/lib
+set(CMAKE_INSTALL_PREFIX ${CMAKE_SOURCE_DIR})
+install(TARGETS systemml preload LIBRARY DESTINATION lib)
+
+set(CMAKE_BUILD_TYPE Release)
+
+set_target_properties(preload PROPERTIES OUTPUT_NAME "preload_systemml-${CMAKE_SYSTEM_NAME}-${CMAKE_SYSTEM_PROCESSOR}")
+
+if (USE_OPEN_BLAS)
+  find_package(OpenBLAS REQUIRED)
+  # sets the name of the output to include the os and the architecture
+  set_target_properties(systemml PROPERTIES OUTPUT_NAME "systemml_openblas-${CMAKE_SYSTEM_NAME}-${CMAKE_SYSTEM_PROCESSOR}")
+  include_directories(${OpenBLAS_INCLUDE_DIR})
+  set(MATH_LIBRARIES "${OpenBLAS_LIB}")
+elseif(USE_INTEL_MKL)
+  find_package(MKL REQUIRED)
+  # sets the name of the output to include the os and the architecture
+  set_target_properties(systemml PROPERTIES OUTPUT_NAME "systemml_mkl-${CMAKE_SYSTEM_NAME}-${CMAKE_SYSTEM_PROCESSOR}")
+  include_directories(${MKL_INCLUDE_DIR})
+  set(MATH_LIBRARIES "${MKL_LIBRARIES}")
+endif()
+
+# Option written to a config.h file
+configure_file(${CMAKE_SOURCE_DIR}/config.h.cmake ${CMAKE_BINARY_DIR}/config.h)
+
+# Include directories. (added for Linux & Darwin, fix later for windows)
+# include paths can be spurious
+include_directories($ENV{JAVA_HOME}/include/)
+include_directories($ENV{JAVA_HOME}/include/darwin)
+include_directories($ENV{JAVA_HOME}/include/linux)
+include_directories($ENV{JAVA_HOME}/include/win32)
+
+# Include the binary dir which contains "config.h"
+include_directories(${CMAKE_BINARY_DIR})
+
+
+# Setting CXX compiler flags
+set_target_properties(systemml PROPERTIES LINK_FLAGS "${OpenMP_CXX_FLAGS} ${MATH_LIBRARIES}")

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/39a37ae4/src/main/cpp/check-dependency-linux-x86_64.sh
----------------------------------------------------------------------
diff --git a/src/main/cpp/check-dependency-linux-x86_64.sh b/src/main/cpp/check-dependency-linux-x86_64.sh
new file mode 100755
index 0000000..40f0bb0
--- /dev/null
+++ b/src/main/cpp/check-dependency-linux-x86_64.sh
@@ -0,0 +1,45 @@
+#!/bin/bash
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+# 
+#   http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+# This shell script checks the dependencies of the compiled shared libraries for 64-bit Linux on x86-64 machines
+
+# yum whatprovides libgcc_s.so.1
+# GNU Standard C++ Library: libstdc++.so.6
+# GCC version 4.8 shared support library: libgcc_s.so.1
+# The GNU libc libraries: libm.so.6, libdl.so.2, libc.so.6, libpthread.so.0
+# GCC OpenMP v3.0 shared support library: libgomp.so.1
+gcc_toolkit="libgcc_s.so\|libm.so\|libstdc++\|libc.so\|libdl.so\|libgomp.so\|libpthread.so"
+linux_loader="linux-vdso.so\|ld-linux-x86-64.so"
+intel_mkl="libmkl_rt.so"
+
+# Fortran runtime: libgfortran.so.3
+# GCC __float128 shared support library: libquadmath.so.0
+openblas="libopenblas.so\|libgfortran.so\|libquadmath.so"
+
+echo "-----------------------------------------------------------------------"
+echo "Check for unexpected dependencies added after code change or new setup:"
+echo "Non-standard dependencies for libpreload_systemml-Linux-x86_64.so"
+ldd lib/libpreload_systemml-Linux-x86_64.so | grep -v $gcc_toolkit"\|"$linux_loader
+echo "Non-standard dependencies for libsystemml_mkl-Linux-x86_64.so"
+ldd lib/libsystemml_mkl-Linux-x86_64.so | grep -v $gcc_toolkit"\|"$linux_loader"\|"$intel_mkl
+echo "Non-standard dependencies for libsystemml_openblas-Linux-x86_64.so"
+ldd lib/libsystemml_openblas-Linux-x86_64.so | grep -v $gcc_toolkit"\|"$linux_loader"\|"$openblas
+echo "-----------------------------------------------------------------------"

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/39a37ae4/src/main/cpp/cmake/FindMKL.cmake
----------------------------------------------------------------------
diff --git a/src/main/cpp/cmake/FindMKL.cmake b/src/main/cpp/cmake/FindMKL.cmake
new file mode 100644
index 0000000..f2a50eb
--- /dev/null
+++ b/src/main/cpp/cmake/FindMKL.cmake
@@ -0,0 +1,132 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+
+# Find the MKL libraries
+#
+# Options:
+#
+#   MKL_USE_SINGLE_DYNAMIC_LIBRARY  : use single dynamic library interface
+#   MKL_USE_STATIC_LIBS             : use static libraries
+#   MKL_MULTI_THREADED              : use multi-threading
+#
+# This module defines the following variables:
+#
+#   MKL_FOUND            : True if MKL is found
+#   MKL_INCLUDE_DIR      : include directory
+#   MKL_LIBRARIES        : the libraries to link against.
+
+
+# ---[ Options
+option(MKL_USE_SINGLE_DYNAMIC_LIBRARY "Use single dynamic library interface" ON)
+option(MKL_USE_STATIC_LIBS "Use static libraries" OFF)
+option(MKL_MULTI_THREADED  "Use multi-threading"   ON)
+
+# ---[ Root folders
+set(INTEL_ROOT "/opt/intel" CACHE PATH "Folder containing Intel libraries")
+find_path(MKL_ROOT include/mkl.h PATHS $ENV{MKLROOT} ${INTEL_ROOT}/mkl
+                                   DOC "Folder containing MKL")
+
+# ---[ Find include dir
+find_path(MKL_INCLUDE_DIR mkl.h PATHS ${MKL_ROOT} PATH_SUFFIXES include)
+set(__looked_for MKL_INCLUDE_DIR)
+
+# ---[ Find libraries
+if(CMAKE_SIZEOF_VOID_P EQUAL 4)
+  set(__path_suffixes lib lib/ia32)
+else()
+  set(__path_suffixes lib lib/intel64)
+endif()
+
+set(__mkl_libs "")
+if(MKL_USE_SINGLE_DYNAMIC_LIBRARY)
+  list(APPEND __mkl_libs rt)
+else()
+  if(CMAKE_SIZEOF_VOID_P EQUAL 4)
+    if(WIN32)
+      list(APPEND __mkl_libs intel_c)
+    else()
+      list(APPEND __mkl_libs intel gf)
+    endif()
+  else()
+    list(APPEND __mkl_libs intel_lp64 gf_lp64)
+  endif()
+
+  if(MKL_MULTI_THREADED)
+    list(APPEND __mkl_libs intel_thread)
+  else()
+     list(APPEND __mkl_libs sequential)
+  endif()
+
+  list(APPEND __mkl_libs core cdft_core)
+endif()
+
+
+foreach (__lib ${__mkl_libs})
+  set(__mkl_lib "mkl_${__lib}")
+  string(TOUPPER ${__mkl_lib} __mkl_lib_upper)
+
+  if(MKL_USE_STATIC_LIBS)
+    set(__mkl_lib "lib${__mkl_lib}.a")
+  endif()
+
+  find_library(${__mkl_lib_upper}_LIBRARY
+        NAMES ${__mkl_lib}
+        PATHS ${MKL_ROOT} "${MKL_INCLUDE_DIR}/.."
+        PATH_SUFFIXES ${__path_suffixes}
+        DOC "The path to Intel(R) MKL ${__mkl_lib} library")
+  mark_as_advanced(${__mkl_lib_upper}_LIBRARY)
+
+  list(APPEND __looked_for ${__mkl_lib_upper}_LIBRARY)
+  list(APPEND MKL_LIBRARIES ${${__mkl_lib_upper}_LIBRARY})
+endforeach()
+
+
+if(NOT MKL_USE_SINGLE_DYNAMIC_LIBRARY)
+  if (MKL_USE_STATIC_LIBS)
+    set(__iomp5_libs iomp5 libiomp5mt.lib)
+  else()
+    set(__iomp5_libs iomp5 libiomp5md.lib)
+  endif()
+
+  if(WIN32)
+    find_path(INTEL_INCLUDE_DIR omp.h PATHS ${INTEL_ROOT} PATH_SUFFIXES include)
+    list(APPEND __looked_for INTEL_INCLUDE_DIR)
+  endif()
+
+  find_library(MKL_RTL_LIBRARY ${__iomp5_libs}
+     PATHS ${INTEL_RTL_ROOT} ${INTEL_ROOT}/compiler ${MKL_ROOT}/.. ${MKL_ROOT}/../compiler
+     PATH_SUFFIXES ${__path_suffixes}
+     DOC "Path to OpenMP runtime library")
+
+  list(APPEND __looked_for MKL_RTL_LIBRARY)
+  list(APPEND MKL_LIBRARIES ${MKL_RTL_LIBRARY})
+endif()
+
+
+include(FindPackageHandleStandardArgs)
+find_package_handle_standard_args(MKL DEFAULT_MSG ${__looked_for})
+
+if(MKL_FOUND)
+  message(STATUS "Found MKL (include: ${MKL_INCLUDE_DIR}, lib: ${MKL_LIBRARIES})")
+endif()
+
+unset(__looked_for)
+unset(__mkl_libs)
+unset(__path_suffixes)
+unset(__lib_suffix)
+unset(__iomp5_libs)

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/39a37ae4/src/main/cpp/cmake/FindOpenBLAS.cmake
----------------------------------------------------------------------
diff --git a/src/main/cpp/cmake/FindOpenBLAS.cmake b/src/main/cpp/cmake/FindOpenBLAS.cmake
new file mode 100644
index 0000000..bd6f2aa
--- /dev/null
+++ b/src/main/cpp/cmake/FindOpenBLAS.cmake
@@ -0,0 +1,79 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+
+SET(Open_BLAS_INCLUDE_SEARCH_PATHS
+  /usr/include
+  /usr/include/openblas
+  /usr/include/openblas-base
+  /usr/local/include
+  /usr/local/include/openblas
+  /usr/local/include/openblas-base
+  /opt/OpenBLAS/include
+  $ENV{OpenBLAS_HOME}
+  $ENV{OpenBLAS_HOME}/include
+)
+
+SET(Open_BLAS_LIB_SEARCH_PATHS
+        /lib/
+        /lib/openblas-base
+        /lib64/
+        /usr/lib
+        /usr/lib/openblas-base
+        /usr/lib64
+        /usr/local/lib
+        /usr/local/lib64
+        /opt/OpenBLAS/lib
+        $ENV{OpenBLAS}
+        $ENV{OpenBLAS}/lib
+        $ENV{OpenBLAS_HOME}
+        $ENV{OpenBLAS_HOME}/lib
+ )
+
+FIND_PATH(OpenBLAS_INCLUDE_DIR NAMES cblas.h PATHS ${Open_BLAS_INCLUDE_SEARCH_PATHS})
+FIND_LIBRARY(OpenBLAS_LIB NAMES openblas PATHS ${Open_BLAS_LIB_SEARCH_PATHS})
+
+SET(OpenBLAS_FOUND ON)
+
+#    Check include files
+IF(NOT OpenBLAS_INCLUDE_DIR)
+    SET(OpenBLAS_FOUND OFF)
+    MESSAGE(STATUS "Could not find OpenBLAS include. Turning OpenBLAS_FOUND off")
+ENDIF()
+
+#    Check libraries
+IF(NOT OpenBLAS_LIB)
+    SET(OpenBLAS_FOUND OFF)
+    MESSAGE(STATUS "Could not find OpenBLAS lib. Turning OpenBLAS_FOUND off")
+ENDIF()
+
+IF (OpenBLAS_FOUND)
+  IF (NOT OpenBLAS_FIND_QUIETLY)
+    MESSAGE(STATUS "Found OpenBLAS libraries: ${OpenBLAS_LIB}")
+    MESSAGE(STATUS "Found OpenBLAS include: ${OpenBLAS_INCLUDE_DIR}")
+  ENDIF (NOT OpenBLAS_FIND_QUIETLY)
+ELSE (OpenBLAS_FOUND)
+  IF (OpenBLAS_FIND_REQUIRED)
+    MESSAGE(FATAL_ERROR "Could not find OpenBLAS")
+  ENDIF (OpenBLAS_FIND_REQUIRED)
+ENDIF (OpenBLAS_FOUND)
+
+MARK_AS_ADVANCED(
+    OpenBLAS_INCLUDE_DIR
+    OpenBLAS_LIB
+    OpenBLAS
+)

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/39a37ae4/src/main/cpp/config.h.cmake
----------------------------------------------------------------------
diff --git a/src/main/cpp/config.h.cmake b/src/main/cpp/config.h.cmake
new file mode 100644
index 0000000..e3f9407
--- /dev/null
+++ b/src/main/cpp/config.h.cmake
@@ -0,0 +1,30 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#ifndef CONFIG_H
+#define CONFIG_H
+
+/* Whether to use OpenBLAS */
+#cmakedefine USE_OPEN_BLAS
+
+/* Whether to use Intel MKL */
+#cmakedefine USE_INTEL_MKL
+
+
+#endif // CONFIG_H

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/39a37ae4/src/main/cpp/lib/libpreload_systemml-Linux-x86_64.so
----------------------------------------------------------------------
diff --git a/src/main/cpp/lib/libpreload_systemml-Linux-x86_64.so b/src/main/cpp/lib/libpreload_systemml-Linux-x86_64.so
new file mode 100755
index 0000000..07e89be
Binary files /dev/null and b/src/main/cpp/lib/libpreload_systemml-Linux-x86_64.so differ

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/39a37ae4/src/main/cpp/lib/libsystemml_mkl-Linux-x86_64.so
----------------------------------------------------------------------
diff --git a/src/main/cpp/lib/libsystemml_mkl-Linux-x86_64.so b/src/main/cpp/lib/libsystemml_mkl-Linux-x86_64.so
new file mode 100755
index 0000000..0a6427a
Binary files /dev/null and b/src/main/cpp/lib/libsystemml_mkl-Linux-x86_64.so differ

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/39a37ae4/src/main/cpp/lib/libsystemml_openblas-Linux-x86_64.so
----------------------------------------------------------------------
diff --git a/src/main/cpp/lib/libsystemml_openblas-Linux-x86_64.so b/src/main/cpp/lib/libsystemml_openblas-Linux-x86_64.so
new file mode 100755
index 0000000..ffdcd5a
Binary files /dev/null and b/src/main/cpp/lib/libsystemml_openblas-Linux-x86_64.so differ

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/39a37ae4/src/main/cpp/libmatrixdnn.cpp
----------------------------------------------------------------------
diff --git a/src/main/cpp/libmatrixdnn.cpp b/src/main/cpp/libmatrixdnn.cpp
new file mode 100644
index 0000000..a521804
--- /dev/null
+++ b/src/main/cpp/libmatrixdnn.cpp
@@ -0,0 +1,333 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#include "config.h"
+#include "libmatrixmult.h"
+#include "libmatrixdnn.h"
+#include <cstdlib>
+#include <iostream>
+#include <cstdio>
+#include <cmath>
+#include <cstring>
+#include "omp.h"
+
+void rotate180(double* inputArray, double* outputArray, int N, int C, int H, int W,
+            int K, int R, int S, int stride_h, int stride_w, int pad_h,
+            int pad_w, int P, int Q) {
+    int PQ = P*Q;
+    int KQ = K*Q;
+	for (int k = 0; k < K; k++) {
+		for (int p = 0; p < P; p++) {
+			for (int q = 0; q < Q; q++) {
+				outputArray[p*KQ + q*K + k] = inputArray[k*PQ + p*Q + q];
+			}
+		}
+	}
+}
+
+void col2im(double* inputArray, double* outputArray, int N, int C, int H, int W,
+            int K, int R, int S, int stride_h, int stride_w, int pad_h,
+            int pad_w, int P, int Q) {
+	for (int p = 0; p < P; p++) {
+		// h = p*stride_h + r - pad_h
+		//   = r + hOffset
+		// Based on restrictions: h >= 0 and r >= 0 and h < H and r < R, we get
+		// max(0, - hOffset) <= r < min(R, H - hOffset)
+		int hOffset = p*stride_h - pad_h;
+		int rStart = MAX(0, - hOffset);
+		int rEnd = MIN(R, H - hOffset);
+		for (int q = 0; q < Q; q++) {
+			// Using the same logic as above on following:
+			// w = q*stride_w + s - pad_w
+			int wOffset = q*stride_w - pad_w;
+			int sStart = MAX(0, - wOffset);
+			int sEnd = MIN(S, W - wOffset);
+			int tempOffset = (p*Q + q)*C*R*S;
+			for (int c = 0; c < C; c++) {
+				int outOffset = c*H*W;
+				int inputOffset = tempOffset + c*R*S;
+				for (int r = rStart; r < rEnd; r++) {
+					for (int s = sStart; s < sEnd; s++) {
+						int inputIndex = inputOffset + r*S + s;
+						int outIndex = outOffset + (hOffset + r)*W + wOffset + s;
+						outputArray[outIndex] += inputArray[inputIndex];
+					}
+				}
+			}
+		}
+	}
+}
+
+void im2col(double* inputArray, double* outputArray, int N, int C, int H, int W,
+            int K, int R, int S, int stride_h, int stride_w, int pad_h,
+            int pad_w, int P, int Q) {
+  int CRS = C * R * S;
+  std::size_t size = Q * sizeof(double);
+  if (stride_h == 1 && stride_w == 1 && pad_h == 0 && pad_w == 0) {
+    for (int c = 0; c < CRS; ++c) {
+      int wOffset = c % S;
+      int hOffset = (c / S) % R;
+      int cInput = c / R / S;
+      for (int h = 0; h < P; ++h) {
+        int hPadded = h + hOffset;
+        int outOffset = (c * P + h) * Q;
+        int inputOffset = (cInput * H + hPadded) * W;
+        std::memcpy(outputArray + outOffset, inputArray + inputOffset + wOffset,
+                    size);
+        int w = Q - 1;
+        int wPadded = w + wOffset;
+        if (hPadded < H && wPadded < W)
+          outputArray[outOffset + w] = inputArray[inputOffset + wPadded];
+        else
+          outputArray[outOffset + w] = 0;
+      }
+    }
+  } else {
+    for (int c = 0; c < CRS; ++c) {
+      int wOffset = c % S;
+      int hOffset = (c / S) % R;
+      int cInput = c / R / S;
+      for (int h = 0; h < P; ++h) {
+        int outOffset = (c * P + h) * Q;
+        int hPadded = h * stride_h - pad_h + hOffset;
+        int inputOffset = (cInput * H + hPadded) * W;
+        if (hPadded < 0 || hPadded >= H) {
+          std::memset(outputArray + outOffset, 0, size);
+        } else {
+          for (int w = 0; w < Q; ++w) {
+            int wPadded = w * stride_w - pad_w + wOffset;
+            if (wPadded >= 0 && wPadded < W)
+              outputArray[outOffset + w] = inputArray[inputOffset + wPadded];
+            else
+              outputArray[outOffset + w] = 0;
+          }
+        }
+      }
+    }
+  }
+} 
+
+
+void conv2dBackwardFilterDense(double* inputPtr, double* doutPtr, double* retPtr, int N, int C, int H, int W, int K, int R, int S,
+    int stride_h, int stride_w, int pad_h, int pad_w, int P, int Q, int numThreads) {
+  // First step: Avoids oversubscription and other openmp/internal blas threading issues
+  setNumThreadsForBLAS(1);
+  
+  int CHW = C * H * W;
+  int CRS = C*R*S;
+  int PQ = P*Q;
+  int KPQ = K*PQ;
+  int numRotatedElem = KPQ;
+  int numIm2ColElem = CRS * PQ;
+  int numTempElem = CRS * K;
+  
+  int m1 = CRS;
+  int n1 = K;
+  int k1 = PQ;
+  
+  // Allocate temporary data structures used in parallel for
+  int numOpenMPThreads = MIN(numThreads, N);
+  double* temp = new double[numTempElem*numOpenMPThreads];
+  std::memset(temp, 0, numTempElem*numOpenMPThreads*sizeof(double));
+  double* rotatedDoutPtrArrays = new double[numRotatedElem*numOpenMPThreads];
+  double* loweredMatArrays = new double[numIm2ColElem*numOpenMPThreads];
+
+#pragma omp parallel for num_threads(numOpenMPThreads)
+  for (int n = 0; n < N; n++) {
+  	double* loweredMat = loweredMatArrays + numIm2ColElem*omp_get_thread_num();
+
+    // Step 1: Perform im2col
+    im2col(inputPtr + n * CHW, loweredMat, 1, C, H, W, K,
+           R, S, stride_h, stride_w, pad_h, pad_w,
+           P, Q);
+           
+    // Step 2: Rotate dout
+    double* rotatedDoutPtr = rotatedDoutPtrArrays + numRotatedElem*omp_get_thread_num();
+    rotate180(doutPtr + n * KPQ, rotatedDoutPtr, 1, C, H, W, K,
+           R, S, stride_h, stride_w, pad_h, pad_w,
+           P, Q);
+    
+    // Multiply to get CRS X K
+    double* temp1 = temp + numTempElem*omp_get_thread_num();
+    // Step 3: loweredMat (CRS X PQ) %*% rotated_dout (PQ X K) 
+    matmult(loweredMat, rotatedDoutPtr, temp1, C * R * S, P * Q, K, 1);
+              
+  } // end omp parallel for
+  
+  // Inplace transpose addition
+  int numRow = CRS;
+  for(int t = 0; t < numOpenMPThreads; t++) {
+  	int iter = 0;
+  	double* temp1 = temp + numTempElem*t;
+	for(int i = 0; i < CRS; i++) {
+		for(int j = 0; j < K; j++, iter++) {
+			int index = j*numRow+i;
+			retPtr[index] += temp1[iter];
+		}
+	}
+  }
+  
+  delete [] temp;
+  delete [] loweredMatArrays;
+  delete [] rotatedDoutPtrArrays;
+}
+
+void conv2dBackwardDataDense(double* filterPtr, double* doutPtr, double* retPtr, int N, int C, int H, int W, int K, int R, int S,
+    int stride_h, int stride_w, int pad_h, int pad_w, int P, int Q, int numThreads) {
+   // First step: Avoids oversubscription and other openmp/internal blas threading issues
+  setNumThreadsForBLAS(1);
+  
+  int CRS = C * R * S;
+  int CHW = C * H * W;
+  int PQ = P * Q;
+  int KPQ = K * PQ;
+  int numRotatedElem = PQ * K;
+  int numCol2ImElem = PQ * CRS;
+
+  // Allocate temporary data structures used in parallel for
+  int numOpenMPThreads = MIN(numThreads, N);
+  double* rotatedDoutPtrArrays = new double[numRotatedElem*numOpenMPThreads];
+  double* col2imInputArrays = new double[numCol2ImElem*numOpenMPThreads];
+
+#pragma omp parallel for num_threads(numOpenMPThreads)
+  for (int n = 0; n < N; n++) {
+    // Step 1: Rotate dout
+    double* rotatedDoutPtr = rotatedDoutPtrArrays + numRotatedElem*omp_get_thread_num();
+    rotate180(doutPtr + n * KPQ, rotatedDoutPtr, 1, C, H, W, K,
+           R, S, stride_h, stride_w, pad_h, pad_w,
+           P, Q);
+
+    // Step 2: t(rotatedDout (PQ X K) %*% filter (K X CRS))
+    double* col2imInput = col2imInputArrays + numCol2ImElem*omp_get_thread_num();
+    matmult(rotatedDoutPtr, filterPtr, col2imInput,
+            PQ, K, CRS, 1);
+
+    // Step 3: Perform col2im
+    col2im(col2imInput, retPtr + n * CHW, 1, C, H, W, K,
+           R, S, stride_h, stride_w, pad_h, pad_w,
+           P, Q);
+
+  } // end omp parallel for
+  
+  delete [] rotatedDoutPtrArrays;
+  delete [] col2imInputArrays;
+    
+}
+
+void conv2dSparse(int apos, int alen, int* aix, double* avals, double* filterPtr, double* retPtr, int N, int C, int H, int W, 
+			int K, int R, int S, int stride_h, int stride_w, int pad_h, int pad_w, int P, int Q, int numThreads) {
+	setNumThreadsForBLAS(1);
+	double* loweredMat = new double[C * R * S * P * Q];
+	
+	// Step 1: Perform im2col
+	double* temp = new double[C * H * W];
+	std::size_t size = C * H * W * sizeof(double);
+	std::memset(temp, 0, size);
+	for(int j=apos; j<apos+alen; j++)
+		temp[ aix[j] ] = avals[j];
+	im2col(temp, loweredMat, 1, C, H, W, K,
+       R, S, stride_h, stride_w, pad_h, pad_w,
+       P, Q);	
+	delete [] temp;
+	
+	// Step 2: filter (K X CRS) %*% loweredMat (CRS X PQ)
+    matmult(filterPtr, loweredMat, retPtr, K, C * R * S, P * Q, 1);
+    
+	delete [] loweredMat;
+}
+
+void conv2dBackwardFilterSparseDense(int apos, int alen, int* aix, double* avals, double* rotatedDoutPtr, double* retPtr, int N, int C, int H, int W, 
+			int K, int R, int S, int stride_h, int stride_w, int pad_h, int pad_w, int P, int Q, int numThreads) {
+	setNumThreadsForBLAS(1);
+	int CHW = C * H * W;
+	int CRS = C*R*S;
+	int PQ = P*Q;
+	int KPQ = K*PQ;
+	int m1 = CRS;
+	int n1 = K;
+	int k1 = PQ;
+	
+	double* loweredMat = new double[CRS * PQ];
+	
+	// Step 1: Perform im2col
+	double* temp = new double[C * H * W];
+	std::size_t size = C * H * W * sizeof(double);
+	std::memset(temp, 0, size);
+	for(int j=apos; j<apos+alen; j++)
+		temp[ aix[j] ] = avals[j];
+	im2col(temp, loweredMat, 1, C, H, W, K,
+       R, S, stride_h, stride_w, pad_h, pad_w,
+       P, Q);
+    delete [] temp;
+	
+	// Multiply to get CRS X K
+	double* temp1 = new double[CRS * K];
+	// Step 3: loweredMat (CRS X PQ) %*% rotatedDoutPtr (PQ X K) 
+    matmult(loweredMat, rotatedDoutPtr, temp1, C * R * S, P * Q, K, 1);
+    delete [] loweredMat;
+     
+    // Inplace addition
+    for(int iter = 0; iter<K*CRS; iter++) {
+    	retPtr[iter] += temp1[iter];
+    }
+    
+	delete [] temp1;
+}
+
+void conv2dBiasAddDense(double* inputPtr, double* biasPtr, double* filterPtr, double* retPtr, int N, int C, int H, int W, int K, int R, int S,
+    int stride_h, int stride_w, int pad_h, int pad_w, int P, int Q, bool addBias, int numThreads) {
+  // First step:  Avoids oversubscription and other openmp/internal blas threading issues
+  setNumThreadsForBLAS(1);
+  
+  int CHW = C * H * W;
+  int KPQ = K * P * Q;
+  int PQ = P * Q;
+  int numIm2ColElem = C * R * S * P * Q;
+  
+  // Allocate temporary data structures used in parallel for
+  int numOpenMPThreads = MIN(numThreads, N);
+  double* loweredMatArrays = new double[numIm2ColElem*numOpenMPThreads];
+  
+#pragma omp parallel for num_threads(numOpenMPThreads)
+  for (int n = 0; n < N; n++) {
+    double* loweredMat = loweredMatArrays + numIm2ColElem*omp_get_thread_num();
+
+    // Step 1: Perform im2col
+    im2col(inputPtr + n * CHW, loweredMat, 1, C, H, W, K,
+           R, S, stride_h, stride_w, pad_h, pad_w,
+           P, Q);
+
+    // Step 2: filter (K X CRS) %*% loweredMat (CRS X PQ)
+    matmult(filterPtr, loweredMat, retPtr + n * KPQ, K,
+            C * R * S, P * Q, 1);
+    
+    // Step 3: Add bias
+    if(addBias) {
+	    double* outputArr = retPtr + n*KPQ;
+	    int index = 0;
+		for(int k = 0; k < K; k++) {
+			for(int pq = 0; pq < PQ; pq++, index++) {
+				outputArr[index] += biasPtr[k];
+			}
+		}
+    }
+  } // end omp parallel for
+  
+  delete [] loweredMatArrays;
+}

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/39a37ae4/src/main/cpp/libmatrixdnn.h
----------------------------------------------------------------------
diff --git a/src/main/cpp/libmatrixdnn.h b/src/main/cpp/libmatrixdnn.h
new file mode 100644
index 0000000..bf6c113
--- /dev/null
+++ b/src/main/cpp/libmatrixdnn.h
@@ -0,0 +1,38 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+ 
+#ifndef _libmatrixdnn_h
+#define _libmatrixdnn_h
+
+void conv2dBackwardFilterDense(double* inputPtr, double* doutPtr, double* retPtr, int N, int C, int H, int W, int K, int R, int S,
+    int stride_h, int stride_w, int pad_h, int pad_w, int P, int Q, int numThreads);
+
+void conv2dBackwardDataDense(double* filterPtr, double* doutPtr, double* retPtr, int N, int C, int H, int W, int K, int R, int S,
+    int stride_h, int stride_w, int pad_h, int pad_w, int P, int Q, int numThreads);
+    
+void conv2dBiasAddDense(double* inputPtr, double* biasPtr, double* filterPtr, double* retPtr, int N, int C, int H, int W, int K, int R, int S,
+    int stride_h, int stride_w, int pad_h, int pad_w, int P, int Q, bool addBias, int numThreads);
+    
+void conv2dSparse(int apos, int alen, int* aix, double* avals, double* filter, double* ret, int N, int C, int H, int W, 
+			int K, int R, int S, int stride_h, int stride_w, int pad_h, int pad_w, int P, int Q, int numThreads);
+
+void conv2dBackwardFilterSparseDense(int apos, int alen, int* aix, double* avals, double* rotatedDoutPtr, double* ret, int N, int C, int H, int W, 
+			int K, int R, int S, int stride_h, int stride_w, int pad_h, int pad_w, int P, int Q, int numThreads);
+			
+#endif
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/39a37ae4/src/main/cpp/libmatrixmult.cpp
----------------------------------------------------------------------
diff --git a/src/main/cpp/libmatrixmult.cpp b/src/main/cpp/libmatrixmult.cpp
new file mode 100644
index 0000000..6844c2a
--- /dev/null
+++ b/src/main/cpp/libmatrixmult.cpp
@@ -0,0 +1,63 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#include "config.h"
+#include "libmatrixmult.h"
+#include <cstdlib>
+#include "omp.h"
+#include <cmath>
+
+int SYSML_CURRENT_NUM_THREADS = -1;
+void setNumThreadsForBLAS(int numThreads) {
+	if(SYSML_CURRENT_NUM_THREADS != numThreads) {
+#ifdef USE_OPEN_BLAS
+		openblas_set_num_threads(numThreads);
+#else
+		mkl_set_num_threads(numThreads);
+#endif
+	    SYSML_CURRENT_NUM_THREADS = numThreads;
+	}
+}
+ 
+// Multiplies two matrices m1Ptr and m2Ptr in row-major format of shape
+// (m1rlen, m1clen) and (m1clen, m2clen)
+void matmult(double* m1Ptr, double* m2Ptr, double* retPtr, int m1rlen,
+             int m1clen, int m2clen, int numThreads) {
+  int m = m1rlen;
+  int n = m2clen;
+  int k = m1clen;
+  
+  setNumThreadsForBLAS(numThreads);
+  
+  // if(m2clen == 1)
+  // 	cblas_dgemv(CblasRowMajor, CblasNoTrans, m1rlen, m1clen, 1, m1Ptr, m1clen, m2Ptr, 1, 0, retPtr, 1);
+  // else 
+  	cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, m, n, k, 1, m1Ptr, k, m2Ptr, n, 0, retPtr, n);
+}
+
+void tsmm(double* m1Ptr, double* retPtr, int m1rlen, int m1clen, bool isLeftTranspose, int numThreads) {
+  int m = isLeftTranspose ? m1clen : m1rlen;
+  int n = isLeftTranspose ? m1clen : m1rlen;
+  int k = isLeftTranspose ? m1rlen : m1clen;
+  
+  setNumThreadsForBLAS(numThreads);
+  
+  cblas_dgemm(CblasRowMajor, isLeftTranspose ? CblasTrans : CblasNoTrans, isLeftTranspose ? CblasNoTrans : CblasTrans, m, n, k, 1, m1Ptr, k, m1Ptr, n, 0, retPtr, n);
+}
+ 

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/39a37ae4/src/main/cpp/libmatrixmult.h
----------------------------------------------------------------------
diff --git a/src/main/cpp/libmatrixmult.h b/src/main/cpp/libmatrixmult.h
new file mode 100644
index 0000000..d39242e
--- /dev/null
+++ b/src/main/cpp/libmatrixmult.h
@@ -0,0 +1,62 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+ 
+#ifndef _libmatrixmult_h
+#define _libmatrixmult_h
+
+#define MAX(x, y) (((x) > (y)) ? (x) : (y))
+#define MIN(x, y) (((x) < (y)) ? (x) : (y))
+
+// *****************************************************************
+// We support Intel MKL (recommended) or OpenBLAS.
+// These flags are used for conditional compilation with mkl and openblas
+// #define USE_INTEL_MKL
+// #define USE_GNU_THREADING
+// #define USE_OPEN_BLAS
+// *****************************************************************
+
+//#ifdef __cplusplus
+//extern "C" {
+//#endif
+//#ifdef __cplusplus
+//}
+//#endif
+
+// Since we call cblas_dgemm in openmp for loop,
+// we call "extension" APIs for setting number of threads of the given API.
+// For example: for OpenBLAS we use openblas_set_num_threads and  
+// for MKL we use mkl_set_num_threads. This avoids performance degradation due to overprovisioning.
+#ifdef USE_OPEN_BLAS
+#include <cblas.h>
+// extern "C" void openblas_set_num_threads(int numThreads);
+#elif defined USE_INTEL_MKL
+#include <mkl.h>
+#include <mkl_service.h>
+#endif
+
+void setNumThreadsForBLAS(int numThreads);
+
+// Multiplies two matrices m1Ptr and m2Ptr in row-major format of shape
+// (m1rlen, m1clen) and (m1clen, m2clen)
+void matmult(double* m1Ptr, double* m2Ptr, double* retPtr, int m1rlen,
+             int m1clen, int m2clen, int numThreads);
+             
+void tsmm(double* m1Ptr, double* retPtr, int m1rlen, int m1clen, bool isLeftTranspose,  int numThreads);
+             
+#endif

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/39a37ae4/src/main/cpp/preload/preload_systemml.cpp
----------------------------------------------------------------------
diff --git a/src/main/cpp/preload/preload_systemml.cpp b/src/main/cpp/preload/preload_systemml.cpp
new file mode 100644
index 0000000..6ee20e0
--- /dev/null
+++ b/src/main/cpp/preload/preload_systemml.cpp
@@ -0,0 +1,35 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+#include "preload_systemml.h" 
+#include <cstdlib>
+ 
+//  g++ -o libpreload_systemml-linux-x86_64.so preload_systemml.cpp  -I$JAVA_HOME/include -I$JAVA_HOME/include/linux -lm -ldl -O3 -shared -fPIC
+JNIEXPORT void JNICALL Java_org_apache_sysml_utils_EnvironmentHelper_setEnv(JNIEnv * env, jclass c, jstring jname, jstring jvalue) {
+	const char* name = (env)->GetStringUTFChars(jname, NULL);
+    	const char* value = (env)->GetStringUTFChars(jvalue,NULL);
+#if defined _WIN32 || defined _WIN64 
+	_putenv_s(name, value);
+#else 
+	setenv(name, value, 1);
+#endif
+	(env)->ReleaseStringUTFChars(jname, name); 
+    	(env)->ReleaseStringUTFChars(jvalue, value);
+}
+ 
+ 

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/39a37ae4/src/main/cpp/preload/preload_systemml.h
----------------------------------------------------------------------
diff --git a/src/main/cpp/preload/preload_systemml.h b/src/main/cpp/preload/preload_systemml.h
new file mode 100644
index 0000000..79d58f8
--- /dev/null
+++ b/src/main/cpp/preload/preload_systemml.h
@@ -0,0 +1,40 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/* DO NOT EDIT THIS FILE - it is machine generated */
+#include <jni.h>
+/* Header for class org_apache_sysml_utils_EnvironmentHelper */
+
+#ifndef _Included_org_apache_sysml_utils_EnvironmentHelper
+#define _Included_org_apache_sysml_utils_EnvironmentHelper
+#ifdef __cplusplus
+extern "C" {
+#endif
+/*
+ * Class:     org_apache_sysml_utils_EnvironmentHelper
+ * Method:    setEnv
+ * Signature: (Ljava/lang/String;Ljava/lang/String;)V
+ */
+JNIEXPORT void JNICALL Java_org_apache_sysml_utils_EnvironmentHelper_setEnv
+  (JNIEnv *, jclass, jstring, jstring);
+
+#ifdef __cplusplus
+}
+#endif
+#endif
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/39a37ae4/src/main/cpp/systemml.cpp
----------------------------------------------------------------------
diff --git a/src/main/cpp/systemml.cpp b/src/main/cpp/systemml.cpp
new file mode 100644
index 0000000..41ce0bc
--- /dev/null
+++ b/src/main/cpp/systemml.cpp
@@ -0,0 +1,225 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+ 
+#include "config.h"
+#include "systemml.h"
+#include "libmatrixmult.h"
+#include "libmatrixdnn.h"
+
+// Linux:
+// g++ -o lib/libsystemml_mkl-Linux-x86_64.so *.cpp  -I$JAVA_HOME/include -I$MKLROOT/include -I$JAVA_HOME/include/linux -lmkl_rt -lpthread  -lm -ldl -DUSE_INTEL_MKL -DUSE_GNU_THREADING -L$MKLROOT/lib/intel64 -m64 -fopenmp -O3 -shared -fPIC
+// g++ -o lib/libsystemml_openblas-Linux-x86_64.so *.cpp  -I$JAVA_HOME/include  -I$JAVA_HOME/include/linux -lopenblas -lpthread -lm -ldl -DUSE_OPEN_BLAS -I/opt/OpenBLAS/include/ -L/opt/OpenBLAS/lib/ -fopenmp -O3 -shared -fPIC
+
+// Mac OSX:
+// g++ -o libsystemml_mkl-linux-x86_64.dylib *.cpp  -I$JAVA_HOME/include -I$MKLROOT/include -I$JAVA_HOME/include/linux -lmkl_rt -lpthread  -lm -ldl -DUSE_INTEL_MKL -DUSE_GNU_THREADING -L$MKLROOT/lib/intel64 -m64 -fopenmp -O3 -dynamiclib -fPIC -undefined dynamic_lookup
+// g++ -o libsystemml_openblas-linux-x86_64.dylib *.cpp  -I$JAVA_HOME/include  -I$JAVA_HOME/include/linux -lopenblas -lpthread -lm -ldl -DUSE_OPEN_BLAS -L/opt/OpenBLAS/lib/ -fopenmp -O3 -dynamiclib -fPIC -undefined dynamic_lookup
+
+// Windows MKL: 
+// "C:\\Program Files (x86)\\Microsoft Visual Studio 12.0\\VC\\vcvarsall.bat" amd64
+// "%MKLROOT%"\bin\mklvars.bat intel64
+// set JAVA_HOME=C:\Program Files\Java\jdk1.8.0_25
+// cl *.cpp -I. -I"%MKLROOT%"\include -I"%JAVA_HOME%"\include -I"%JAVA_HOME%"\include\win32 -DUSE_INTEL_MKL -Fesystemml_mkl-windows-x86_64.dll -MD -LD  "%MKLROOT%"\lib\intel64_win\mkl_intel_thread_dll.lib "%MKLROOT%"\lib\intel64_win\mkl_core_dll.lib "%MKLROOT%"\lib\intel64_win\mkl_intel_lp64_dll.lib
+// Windows OpenBLAS:
+// "C:\\Program Files (x86)\\Microsoft Visual Studio 12.0\\VC\\vcvarsall.bat" amd64
+// set JAVA_HOME=C:\Program Files\Java\jdk1.8.0_25
+// cl *.cpp -I. -I"%OPENBLASROOT%"\include -I"%JAVA_HOME%"\include -I"%JAVA_HOME%"\include\win32 -DUSE_OPEN_BLAS -Fesystemml_openblas-windows-x86_64.dll -MD -LD "%OPENBLASROOT%"\lib\libopenblas.dll.a
+
+// Results from Matrix-vector/vector-matrix 1M x 1K, dense show that GetDoubleArrayElements creates a copy on OpenJDK.
+
+// Logic:
+// 1. We chose GetDoubleArrayElements over GetPrimitiveArrayCritical in a multi-threaded scenario. This avoids any potential OOM related to GC halts.
+// 2. For input array, we don't copy back the array using JNI_ABORT.
+
+// JNI Methods to get/release double* 
+#define GET_DOUBLE_ARRAY(env, input, numThreads) \
+	((double*)env->GetPrimitiveArrayCritical(input, NULL))
+// ( maxThreads != -1 && ((int)numThreads) == maxThreads ? ((double*)env->GetPrimitiveArrayCritical(input, NULL)) :  env->GetDoubleArrayElements(input,NULL) )
+ 
+// ------------------------------------------------------------------- 
+// From: https://developer.android.com/training/articles/perf-jni.html
+// 0
+// Actual: the array object is un-pinned.
+// Copy: data is copied back. The buffer with the copy is freed.
+// JNI_ABORT
+// Actual: the array object is un-pinned. Earlier writes are not aborted.
+// Copy: the buffer with the copy is freed; any changes to it are lost.
+#define RELEASE_INPUT_DOUBLE_ARRAY(env, input, inputPtr, numThreads) \
+	env->ReleasePrimitiveArrayCritical(input, inputPtr, JNI_ABORT)
+// ( maxThreads != -1 && ((int)numThreads) == maxThreads ? env->ReleasePrimitiveArrayCritical(input, inputPtr, JNI_ABORT) : env->ReleaseDoubleArrayElements(input, inputPtr, JNI_ABORT) )
+
+#define RELEASE_DOUBLE_ARRAY(env, input, inputPtr, numThreads) \
+	env->ReleasePrimitiveArrayCritical(input, inputPtr, 0)
+// ( maxThreads != -1 && ((int)numThreads) == maxThreads ? env->ReleasePrimitiveArrayCritical(input, inputPtr, 0) :  env->ReleaseDoubleArrayElements(input, inputPtr, 0) )
+  
+// -------------------------------------------------------------------
+
+int maxThreads = -1;
+JNIEXPORT void JNICALL Java_org_apache_sysml_utils_NativeHelper_setMaxNumThreads
+  (JNIEnv *, jclass, jint jmaxThreads) {
+  maxThreads = (int) jmaxThreads;
+}
+
+JNIEXPORT jboolean JNICALL Java_org_apache_sysml_utils_NativeHelper_matrixMultDenseDense(
+    JNIEnv* env, jclass cls, jdoubleArray m1, jdoubleArray m2, jdoubleArray ret,
+    jint m1rlen, jint m1clen, jint m2clen, jint numThreads) {
+  double* m1Ptr = GET_DOUBLE_ARRAY(env, m1, numThreads);
+  double* m2Ptr = GET_DOUBLE_ARRAY(env, m2, numThreads);
+  double* retPtr = GET_DOUBLE_ARRAY(env, ret, numThreads);
+  if(m1Ptr == NULL || m2Ptr == NULL || retPtr == NULL)
+  	return (jboolean) false;
+
+  matmult(m1Ptr, m2Ptr, retPtr, (int)m1rlen, (int)m1clen, (int)m2clen, (int)numThreads);
+
+  RELEASE_INPUT_DOUBLE_ARRAY(env, m1, m1Ptr, numThreads);
+  RELEASE_INPUT_DOUBLE_ARRAY(env, m2, m2Ptr, numThreads);
+  RELEASE_DOUBLE_ARRAY(env, ret, retPtr, numThreads); 
+  return (jboolean) true;
+}
+
+JNIEXPORT jboolean JNICALL Java_org_apache_sysml_utils_NativeHelper_tsmm
+  (JNIEnv * env, jclass cls, jdoubleArray m1, jdoubleArray ret, jint m1rlen, jint m1clen, jboolean isLeftTranspose, jint numThreads) {
+  double* m1Ptr = GET_DOUBLE_ARRAY(env, m1, numThreads);
+  double* retPtr = GET_DOUBLE_ARRAY(env, ret, numThreads);
+  if(m1Ptr == NULL || retPtr == NULL)
+  	return (jboolean) false;
+
+  tsmm(m1Ptr, retPtr, (int) m1rlen, (int) m1clen, (bool) isLeftTranspose, (int) numThreads);
+  
+  RELEASE_INPUT_DOUBLE_ARRAY(env, m1, m1Ptr, numThreads);
+  RELEASE_DOUBLE_ARRAY(env, ret, retPtr, numThreads);
+  return (jboolean) true;
+}
+
+JNIEXPORT jboolean JNICALL Java_org_apache_sysml_utils_NativeHelper_conv2dSparse
+  (JNIEnv * env, jclass, jint apos, jint alen, jintArray aix, jdoubleArray avals, jdoubleArray filter,
+    jdoubleArray ret, jint N, jint C, jint H, jint W, jint K, jint R, jint S,
+    jint stride_h, jint stride_w, jint pad_h, jint pad_w, jint P, jint Q, jint numThreads) {
+  int* aixPtr = ((int*)env->GetPrimitiveArrayCritical(aix, NULL));
+  double* avalsPtr = GET_DOUBLE_ARRAY(env, avals, numThreads);
+  double* filterPtr = GET_DOUBLE_ARRAY(env, filter, numThreads);
+  double* retPtr = GET_DOUBLE_ARRAY(env, ret, numThreads);
+  
+  conv2dSparse((int)apos, (int)alen, aixPtr, avalsPtr, filterPtr, retPtr, (int)N, (int)C, (int)H, (int)W, 
+			(int)K, (int)R, (int)S, (int)stride_h, (int)stride_w, (int)pad_h, (int)pad_w, (int)P, (int)Q, (int)numThreads);
+  
+  RELEASE_INPUT_DOUBLE_ARRAY(env, avals, avalsPtr, numThreads);
+  RELEASE_INPUT_DOUBLE_ARRAY(env, filter, filterPtr, numThreads);
+  env->ReleasePrimitiveArrayCritical(aix, aixPtr, JNI_ABORT);
+  RELEASE_DOUBLE_ARRAY(env, ret, retPtr, numThreads); 
+  return (jboolean) true;
+}
+
+JNIEXPORT jboolean JNICALL Java_org_apache_sysml_utils_NativeHelper_conv2dBackwardFilterSparseDense
+  (JNIEnv * env, jclass, jint apos, jint alen, jintArray aix, jdoubleArray avals, jdoubleArray dout,  
+  	jdoubleArray ret, jint N, jint C, jint H, jint W, jint K, jint R, jint S,
+    jint stride_h, jint stride_w, jint pad_h, jint pad_w, jint P, jint Q, jint numThreads) {
+  int* aixPtr = ((int*)env->GetPrimitiveArrayCritical(aix, NULL));
+  double* avalsPtr = GET_DOUBLE_ARRAY(env, avals, numThreads);
+  double* doutPtr = GET_DOUBLE_ARRAY(env, dout, numThreads);
+  double* retPtr = GET_DOUBLE_ARRAY(env, ret, numThreads);
+  
+  conv2dBackwardFilterSparseDense((int)apos, (int)alen, aixPtr, avalsPtr, doutPtr, retPtr, (int)N, (int)C, (int)H, (int)W, 
+			(int)K, (int)R, (int)S, (int)stride_h, (int)stride_w, (int)pad_h, (int)pad_w, (int)P, (int)Q, (int)numThreads);
+  
+  RELEASE_INPUT_DOUBLE_ARRAY(env, avals, avalsPtr, numThreads);
+  RELEASE_INPUT_DOUBLE_ARRAY(env, dout, doutPtr, numThreads);
+  env->ReleasePrimitiveArrayCritical(aix, aixPtr, JNI_ABORT);
+  RELEASE_DOUBLE_ARRAY(env, ret, retPtr, numThreads); 
+  return (jboolean) true;
+}
+
+JNIEXPORT jboolean JNICALL Java_org_apache_sysml_utils_NativeHelper_conv2dDense(
+	JNIEnv* env, jclass, jdoubleArray input, jdoubleArray filter,
+    jdoubleArray ret, jint N, jint C, jint H, jint W, jint K, jint R, jint S,
+    jint stride_h, jint stride_w, jint pad_h, jint pad_w, jint P, jint Q, jint numThreads) {
+  double* inputPtr = GET_DOUBLE_ARRAY(env, input, numThreads);
+  double* filterPtr = GET_DOUBLE_ARRAY(env, filter, numThreads);
+  double* retPtr = GET_DOUBLE_ARRAY(env, ret, numThreads);
+  if(inputPtr == NULL || filterPtr == NULL || retPtr == NULL)
+  	return (jboolean) false;
+  
+  conv2dBiasAddDense(inputPtr, 0, filterPtr, retPtr, (int) N, (int) C, (int) H, (int) W, (int) K, (int) R, (int) S,
+    (int) stride_h, (int) stride_w, (int) pad_h, (int) pad_w, (int) P, (int) Q, false, (int) numThreads);
+    
+  RELEASE_INPUT_DOUBLE_ARRAY(env, input, inputPtr, numThreads);
+  RELEASE_INPUT_DOUBLE_ARRAY(env, filter, filterPtr, numThreads);
+  RELEASE_DOUBLE_ARRAY(env, ret, retPtr, numThreads); 
+  return (jboolean) true;
+}
+
+JNIEXPORT jboolean JNICALL Java_org_apache_sysml_utils_NativeHelper_conv2dBiasAddDense(
+	JNIEnv* env, jclass, jdoubleArray input, jdoubleArray bias, jdoubleArray filter,
+    jdoubleArray ret, jint N, jint C, jint H, jint W, jint K, jint R, jint S,
+    jint stride_h, jint stride_w, jint pad_h, jint pad_w, jint P, jint Q, jint numThreads) {
+    
+  double* inputPtr = GET_DOUBLE_ARRAY(env, input, numThreads);
+  double* biasPtr = GET_DOUBLE_ARRAY(env, bias, numThreads);
+  double* filterPtr = GET_DOUBLE_ARRAY(env, filter, numThreads);
+  double* retPtr = GET_DOUBLE_ARRAY(env, ret, numThreads);
+  if(inputPtr == NULL || biasPtr == NULL || filterPtr == NULL || retPtr == NULL)
+  	return (jboolean) false;
+  
+  conv2dBiasAddDense(inputPtr, biasPtr, filterPtr, retPtr, (int) N, (int) C, (int) H, (int) W, (int) K, (int) R, (int) S,
+    (int) stride_h, (int) stride_w, (int) pad_h, (int) pad_w, (int) P, (int) Q, true, (int) numThreads);
+    
+  RELEASE_INPUT_DOUBLE_ARRAY(env, input, inputPtr, numThreads);
+  RELEASE_INPUT_DOUBLE_ARRAY(env, bias, biasPtr, numThreads);
+  RELEASE_INPUT_DOUBLE_ARRAY(env, filter, filterPtr, numThreads);
+  RELEASE_DOUBLE_ARRAY(env, ret, retPtr, numThreads); 
+  return (jboolean) true;
+}
+
+JNIEXPORT jboolean JNICALL Java_org_apache_sysml_utils_NativeHelper_conv2dBackwardDataDense(
+	JNIEnv* env, jclass, jdoubleArray filter, jdoubleArray dout,
+    jdoubleArray ret, jint N, jint C, jint H, jint W, jint K, jint R, jint S,
+    jint stride_h, jint stride_w, jint pad_h, jint pad_w, jint P, jint Q, jint numThreads) {
+  
+  double* filterPtr = GET_DOUBLE_ARRAY(env, filter, numThreads);
+  double* doutPtr = GET_DOUBLE_ARRAY(env, dout, numThreads);
+  double* retPtr = GET_DOUBLE_ARRAY(env, ret, numThreads);
+  if(doutPtr == NULL || filterPtr == NULL || retPtr == NULL)
+  	return (jboolean) false;
+  
+  conv2dBackwardDataDense(filterPtr, doutPtr, retPtr, (int) N, (int) C, (int) H, (int) W, (int) K, (int) R, (int) S,
+    (int) stride_h, (int) stride_w, (int) pad_h, (int) pad_w, (int) P, (int) Q, (int) numThreads);
+  
+  RELEASE_INPUT_DOUBLE_ARRAY(env, filter, filterPtr, numThreads);
+  RELEASE_INPUT_DOUBLE_ARRAY(env, dout, doutPtr, numThreads);
+  RELEASE_DOUBLE_ARRAY(env, ret, retPtr, numThreads);
+  return (jboolean) true;
+}
+
+JNIEXPORT jboolean JNICALL Java_org_apache_sysml_utils_NativeHelper_conv2dBackwardFilterDense(
+	JNIEnv* env, jclass, jdoubleArray input, jdoubleArray dout,
+    jdoubleArray ret, jint N, jint C, jint H, jint W, jint K, jint R, jint S,
+    jint stride_h, jint stride_w, jint pad_h, jint pad_w, jint P, jint Q, jint numThreads) {
+  double* inputPtr = GET_DOUBLE_ARRAY(env, input, numThreads);
+  double* doutPtr = GET_DOUBLE_ARRAY(env, dout, numThreads);
+  double* retPtr = GET_DOUBLE_ARRAY(env, ret, numThreads);
+  if(doutPtr == NULL || inputPtr == NULL || retPtr == NULL)
+  	return (jboolean) false;
+  
+  conv2dBackwardFilterDense(inputPtr, doutPtr, retPtr, (int) N, (int) C, (int) H, (int) W, (int) K, (int) R, (int) S,
+    (int) stride_h, (int) stride_w, (int) pad_h, (int) pad_w, (int) P, (int) Q, (int) numThreads);
+  
+  RELEASE_INPUT_DOUBLE_ARRAY(env, input, inputPtr, numThreads);
+  RELEASE_INPUT_DOUBLE_ARRAY(env, dout, doutPtr, numThreads);
+  RELEASE_DOUBLE_ARRAY(env, ret, retPtr, numThreads);
+  return (jboolean) true;
+}
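
Each of these JNI entry points follows the same shape: pin the Java double[] buffers, run the native kernel, release the inputs (presumably without copy-back, given the RELEASE_INPUT_DOUBLE_ARRAY name) and the output with copy-back, and return false when any pinning fails so the caller can fall back to the pure-Java kernels. A hypothetical Java-side call could look like the sketch below; the method name mirrors the signatures above, while the variable names and the fallback branch are illustrative assumptions:

    // hedged sketch: conv2d output has shape N x (K*P*Q), stored row-major
    double[] ret = new double[N * K * P * Q];
    boolean ok = NativeHelper.conv2dDense(input, filter, ret,
        N, C, H, W, K, R, S, strideH, strideW, padH, padW, P, Q, numThreads);
    if (!ok) {
        // pinning failed; recompute with the existing Java implementation
    }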

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/39a37ae4/src/main/cpp/systemml.h
----------------------------------------------------------------------
diff --git a/src/main/cpp/systemml.h b/src/main/cpp/systemml.h
new file mode 100644
index 0000000..ac36495
--- /dev/null
+++ b/src/main/cpp/systemml.h
@@ -0,0 +1,106 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+ 
+/* DO NOT EDIT THIS FILE - it is machine generated */
+#include <jni.h>
+/* Header for class org_apache_sysml_utils_NativeHelper */
+
+#ifndef _Included_org_apache_sysml_utils_NativeHelper
+#define _Included_org_apache_sysml_utils_NativeHelper
+#ifdef __cplusplus
+extern "C" {
+#endif
+/*
+ * Class:     org_apache_sysml_utils_NativeHelper
+ * Method:    matrixMultDenseDense
+ * Signature: ([D[D[DIIII)Z
+ */
+JNIEXPORT jboolean JNICALL Java_org_apache_sysml_utils_NativeHelper_matrixMultDenseDense
+  (JNIEnv *, jclass, jdoubleArray, jdoubleArray, jdoubleArray, jint, jint, jint, jint);
+
+/*
+ * Class:     org_apache_sysml_utils_NativeHelper
+ * Method:    tsmm
+ * Signature: ([D[DIIZI)Z
+ */
+JNIEXPORT jboolean JNICALL Java_org_apache_sysml_utils_NativeHelper_tsmm
+  (JNIEnv *, jclass, jdoubleArray, jdoubleArray, jint, jint, jboolean, jint);
+
+/*
+ * Class:     org_apache_sysml_utils_NativeHelper
+ * Method:    conv2dDense
+ * Signature: ([D[D[DIIIIIIIIIIIIII)Z
+ */
+JNIEXPORT jboolean JNICALL Java_org_apache_sysml_utils_NativeHelper_conv2dDense
+  (JNIEnv *, jclass, jdoubleArray, jdoubleArray, jdoubleArray, jint, jint, jint, jint, jint, jint, jint, jint, jint, jint, jint, jint, jint, jint);
+
+/*
+ * Class:     org_apache_sysml_utils_NativeHelper
+ * Method:    conv2dBiasAddDense
+ * Signature: ([D[D[D[DIIIIIIIIIIIIII)Z
+ */
+JNIEXPORT jboolean JNICALL Java_org_apache_sysml_utils_NativeHelper_conv2dBiasAddDense
+  (JNIEnv *, jclass, jdoubleArray, jdoubleArray, jdoubleArray, jdoubleArray, jint, jint, jint, jint, jint, jint, jint, jint, jint, jint, jint, jint, jint, jint);
+
+/*
+ * Class:     org_apache_sysml_utils_NativeHelper
+ * Method:    conv2dBackwardDataDense
+ * Signature: ([D[D[DIIIIIIIIIIIIII)Z
+ */
+JNIEXPORT jboolean JNICALL Java_org_apache_sysml_utils_NativeHelper_conv2dBackwardDataDense
+  (JNIEnv *, jclass, jdoubleArray, jdoubleArray, jdoubleArray, jint, jint, jint, jint, jint, jint, jint, jint, jint, jint, jint, jint, jint, jint);
+
+/*
+ * Class:     org_apache_sysml_utils_NativeHelper
+ * Method:    conv2dBackwardFilterDense
+ * Signature: ([D[D[DIIIIIIIIIIIIII)Z
+ */
+JNIEXPORT jboolean JNICALL Java_org_apache_sysml_utils_NativeHelper_conv2dBackwardFilterDense
+  (JNIEnv *, jclass, jdoubleArray, jdoubleArray, jdoubleArray, jint, jint, jint, jint, jint, jint, jint, jint, jint, jint, jint, jint, jint, jint);
+
+/*
+ * Class:     org_apache_sysml_utils_NativeHelper
+ * Method:    conv2dSparse
+ * Signature: (II[I[D[D[DIIIIIIIIIIIIII)Z
+ */
+JNIEXPORT jboolean JNICALL Java_org_apache_sysml_utils_NativeHelper_conv2dSparse
+  (JNIEnv *, jclass, jint, jint, jintArray, jdoubleArray, jdoubleArray, jdoubleArray, jint, jint, jint, jint, jint, jint, jint, jint, jint, jint, jint, jint, jint, jint);
+
+/*
+ * Class:     org_apache_sysml_utils_NativeHelper
+ * Method:    conv2dBackwardFilterSparseDense
+ * Signature: (II[I[D[D[DIIIIIIIIIIIIII)Z
+ */
+JNIEXPORT jboolean JNICALL Java_org_apache_sysml_utils_NativeHelper_conv2dBackwardFilterSparseDense
+  (JNIEnv *, jclass, jint, jint, jintArray, jdoubleArray, jdoubleArray, jdoubleArray, jint, jint, jint, jint, jint, jint, jint, jint, jint, jint, jint, jint, jint, jint);
+
+/*
+ * Class:     org_apache_sysml_utils_NativeHelper
+ * Method:    setMaxNumThreads
+ * Signature: (I)V
+ */
+JNIEXPORT void JNICALL Java_org_apache_sysml_utils_NativeHelper_setMaxNumThreads
+  (JNIEnv *, jclass, jint);
+
+#ifdef __cplusplus
+}
+#endif
+#endif
+
+ 
\ No newline at end of file
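
The mangled names in this generated header map one-to-one onto static native methods of org.apache.sysml.utils.NativeHelper. Reconstructed from the JNI signature strings above, the Java-side declarations would look roughly like this sketch (parameter names are illustrative, not taken from the source):

    // hedged reconstruction of the Java declarations implied by the header
    public class NativeHelper {
      public static native boolean matrixMultDenseDense(double[] m1, double[] m2,
          double[] ret, int m1Rows, int m1Cols, int m2Cols, int numThreads);
      public static native boolean conv2dDense(double[] input, double[] filter,
          double[] ret, int N, int C, int H, int W, int K, int R, int S,
          int strideH, int strideW, int padH, int padW, int P, int Q, int numThreads);
      public static native void setMaxNumThreads(int numThreads);
    }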

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/39a37ae4/src/main/java/org/apache/sysml/api/DMLScript.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/api/DMLScript.java b/src/main/java/org/apache/sysml/api/DMLScript.java
index ab059a0..bfac48d 100644
--- a/src/main/java/org/apache/sysml/api/DMLScript.java
+++ b/src/main/java/org/apache/sysml/api/DMLScript.java
@@ -706,7 +706,6 @@ public class DMLScript
 		}
 	}
 	
-	
 	///////////////////////////////
 	// private internal interface 
 	// (core compilation and execute)
@@ -1117,4 +1116,4 @@ public class DMLScript
 			throw new DMLException("Failed to run SystemML workspace cleanup.", ex);
 		}
 	}
-}  
\ No newline at end of file
+}  

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/39a37ae4/src/main/java/org/apache/sysml/conf/DMLConfig.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/conf/DMLConfig.java b/src/main/java/org/apache/sysml/conf/DMLConfig.java
index e974a71..84bea42 100644
--- a/src/main/java/org/apache/sysml/conf/DMLConfig.java
+++ b/src/main/java/org/apache/sysml/conf/DMLConfig.java
@@ -71,6 +71,7 @@ public class DMLConfig
 	public static final String CP_PARALLEL_MATRIXMULT = "cp.parallel.matrixmult";
 	public static final String CP_PARALLEL_TEXTIO   = "cp.parallel.textio";
 	public static final String COMPRESSED_LINALG    = "compressed.linalg";
+	public static final String NATIVE_BLAS          = "native.blas";
 	public static final String CODEGEN              = "codegen.enabled"; //boolean
 	public static final String CODEGEN_PLANCACHE    = "codegen.plancache"; //boolean
 	public static final String CODEGEN_LITERALS     = "codegen.literals"; //1..heuristic, 2..always
@@ -115,6 +116,7 @@ public class DMLConfig
 		_defaultVals.put(CODEGEN,                "false" );
 		_defaultVals.put(CODEGEN_PLANCACHE,      "true" );
 		_defaultVals.put(CODEGEN_LITERALS,       "1" );
+		_defaultVals.put(NATIVE_BLAS,            "true" );
 
 		_defaultVals.put(EXTRA_GPU_STATS,       "false" );
 		_defaultVals.put(EXTRA_DNN_STATS,       "false" );
@@ -404,7 +406,7 @@ public class DMLConfig
 				LOCAL_TMP_DIR,SCRATCH_SPACE,OPTIMIZATION_LEVEL,
 				NUM_REDUCERS, DEFAULT_BLOCK_SIZE,
 				YARN_APPMASTER, YARN_APPMASTERMEM, YARN_MAPREDUCEMEM, 
-				CP_PARALLEL_MATRIXMULT, CP_PARALLEL_TEXTIO,
+				CP_PARALLEL_MATRIXMULT, CP_PARALLEL_TEXTIO, NATIVE_BLAS,
 				COMPRESSED_LINALG, CODEGEN, CODEGEN_LITERALS, CODEGEN_PLANCACHE,
 				EXTRA_GPU_STATS, EXTRA_DNN_STATS
 		}; 
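
The new native.blas key makes BLAS acceleration switchable through the standard DMLConfig mechanism, defaulting to true. Runtime code reads it like any other boolean option; the snippet below mirrors the call introduced in AggregateBinaryCPInstruction later in this commit:

    // query the native.blas flag from the active DML configuration
    boolean enableNative = ConfigurationManager.getDMLConfig()
        .getBooleanValue(DMLConfig.NATIVE_BLAS);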

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/39a37ae4/src/main/java/org/apache/sysml/hops/ConvolutionOp.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/hops/ConvolutionOp.java b/src/main/java/org/apache/sysml/hops/ConvolutionOp.java
index a097d19..a18aada 100644
--- a/src/main/java/org/apache/sysml/hops/ConvolutionOp.java
+++ b/src/main/java/org/apache/sysml/hops/ConvolutionOp.java
@@ -150,10 +150,17 @@ public class ConvolutionOp extends Hop  implements MultiThreadedHop
 		int k = OptimizerUtils.getConstrainedNumThreads(_maxNumThreads);
 		OperationTypes lopOp = HopsConv2Lops.get(op);
 
-		if(op == ConvOp.MAX_POOLING && isInputReLU(inputs.get(0))) {
+		// RELU_MAX_POOLING and RELU_MAX_POOLING_BACKWARD are extremely useful for the CP backend
+		// because they avoid unnecessary sparse-to-dense-to-sparse conversions.
+		// For other backends, these fused operators are not necessary; they merely save one extra relu operator.
+		if(et == ExecType.CP && op == ConvOp.MAX_POOLING && isInputReLU(inputs.get(0))) {
 			in = inputs.get(0).getInput().get(0).constructLops();
 			lopOp = OperationTypes.RELU_MAX_POOLING;
 		}
+		else if(et == ExecType.CP && op == ConvOp.MAX_POOLING_BACKWARD && isInputReLU(inputs.get(0))) {
+			in = inputs.get(0).getInput().get(0).constructLops();
+			lopOp = OperationTypes.RELU_MAX_POOLING_BACKWARD;
+		}
 		else if(op == ConvOp.BIAS_ADD && isInputConv2d(inputs.get(0))) {
 			lopOp = OperationTypes.DIRECT_CONV2D_BIAS_ADD;
 			

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/39a37ae4/src/main/java/org/apache/sysml/lops/ConvolutionTransform.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/lops/ConvolutionTransform.java b/src/main/java/org/apache/sysml/lops/ConvolutionTransform.java
index 6d8885a..8784956 100644
--- a/src/main/java/org/apache/sysml/lops/ConvolutionTransform.java
+++ b/src/main/java/org/apache/sysml/lops/ConvolutionTransform.java
@@ -30,7 +30,7 @@ public class ConvolutionTransform extends Lop
 
 	
 	public enum OperationTypes {
-		MAX_POOLING, MAX_POOLING_BACKWARD, RELU_MAX_POOLING, RELU_BACKWARD,
+		MAX_POOLING, MAX_POOLING_BACKWARD, RELU_MAX_POOLING, RELU_BACKWARD, RELU_MAX_POOLING_BACKWARD,
 		DIRECT_CONV2D, DIRECT_CONV2D_BACKWARD_FILTER, DIRECT_CONV2D_BACKWARD_DATA,
 		BIAS_ADD, DIRECT_CONV2D_BIAS_ADD, BIAS_MULTIPLY
 	};
@@ -112,6 +112,9 @@ public class ConvolutionTransform extends Lop
 		case RELU_MAX_POOLING:
 			return "relu_maxpooling";
 			
+		case RELU_MAX_POOLING_BACKWARD:
+			return "relu_maxpooling_backward";
+			
 		case RELU_BACKWARD:
 			return "relu_backward";
 			

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/39a37ae4/src/main/java/org/apache/sysml/runtime/instructions/CPInstructionParser.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/instructions/CPInstructionParser.java b/src/main/java/org/apache/sysml/runtime/instructions/CPInstructionParser.java
index bb5e01a..52c11e6 100644
--- a/src/main/java/org/apache/sysml/runtime/instructions/CPInstructionParser.java
+++ b/src/main/java/org/apache/sysml/runtime/instructions/CPInstructionParser.java
@@ -225,6 +225,7 @@ public class CPInstructionParser extends InstructionParser
 		// Opcodes related to convolutions
 		String2CPInstructionType.put( "relu_backward"      , CPINSTRUCTION_TYPE.Convolution);
 		String2CPInstructionType.put( "relu_maxpooling"      , CPINSTRUCTION_TYPE.Convolution);
+		String2CPInstructionType.put( "relu_maxpooling_backward"      , CPINSTRUCTION_TYPE.Convolution);
 		String2CPInstructionType.put( "maxpooling"      , CPINSTRUCTION_TYPE.Convolution);
 		String2CPInstructionType.put( "maxpooling_backward"      , CPINSTRUCTION_TYPE.Convolution);
 		String2CPInstructionType.put( "conv2d"      , CPINSTRUCTION_TYPE.Convolution);

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/39a37ae4/src/main/java/org/apache/sysml/runtime/instructions/GPUInstructionParser.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/instructions/GPUInstructionParser.java b/src/main/java/org/apache/sysml/runtime/instructions/GPUInstructionParser.java
index 4051d6a..e5b3326 100644
--- a/src/main/java/org/apache/sysml/runtime/instructions/GPUInstructionParser.java
+++ b/src/main/java/org/apache/sysml/runtime/instructions/GPUInstructionParser.java
@@ -41,7 +41,6 @@ public class GPUInstructionParser  extends InstructionParser
 		// Neural Network Operators
 		String2GPUInstructionType.put( "relu_backward",          GPUINSTRUCTION_TYPE.Convolution);
 		String2GPUInstructionType.put( "conv2d",                 GPUINSTRUCTION_TYPE.Convolution);
-		String2GPUInstructionType.put( "relu_maxpooling",          GPUINSTRUCTION_TYPE.Convolution);
 		String2GPUInstructionType.put( "conv2d_bias_add",                 GPUINSTRUCTION_TYPE.Convolution);
 		String2GPUInstructionType.put( "conv2d_backward_filter", GPUINSTRUCTION_TYPE.Convolution);
 		String2GPUInstructionType.put( "conv2d_backward_data",   GPUINSTRUCTION_TYPE.Convolution);

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/39a37ae4/src/main/java/org/apache/sysml/runtime/instructions/cp/AggregateBinaryCPInstruction.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/instructions/cp/AggregateBinaryCPInstruction.java b/src/main/java/org/apache/sysml/runtime/instructions/cp/AggregateBinaryCPInstruction.java
index 702e4f3..59da4ac 100644
--- a/src/main/java/org/apache/sysml/runtime/instructions/cp/AggregateBinaryCPInstruction.java
+++ b/src/main/java/org/apache/sysml/runtime/instructions/cp/AggregateBinaryCPInstruction.java
@@ -19,6 +19,8 @@
 
 package org.apache.sysml.runtime.instructions.cp;
 
+import org.apache.sysml.conf.ConfigurationManager;
+import org.apache.sysml.conf.DMLConfig;
 import org.apache.sysml.parser.Expression.DataType;
 import org.apache.sysml.parser.Expression.ValueType;
 import org.apache.sysml.runtime.DMLRuntimeException;
@@ -71,15 +73,17 @@ public class AggregateBinaryCPInstruction extends BinaryCPInstruction
 	{	
 		//get inputs
 		MatrixBlock matBlock1 = ec.getMatrixInput(input1.getName());
-        MatrixBlock matBlock2 = ec.getMatrixInput(input2.getName());
+		MatrixBlock matBlock2 = ec.getMatrixInput(input2.getName());
 		
-        //compute matrix multiplication
-        AggregateBinaryOperator ab_op = (AggregateBinaryOperator) _optr;
+		//compute matrix multiplication
+		AggregateBinaryOperator ab_op = (AggregateBinaryOperator) _optr;
 		MatrixBlock soresBlock = null;
 		if( matBlock2 instanceof CompressedMatrixBlock )
 			soresBlock = (MatrixBlock) (matBlock2.aggregateBinaryOperations(matBlock1, matBlock2, new MatrixBlock(), ab_op));
-		else 
-			soresBlock = (MatrixBlock) (matBlock1.aggregateBinaryOperations(matBlock1, matBlock2, new MatrixBlock(), ab_op));
+		else {
+			boolean enableNative = ConfigurationManager.getDMLConfig().getBooleanValue(DMLConfig.NATIVE_BLAS);
+			soresBlock = (MatrixBlock) (matBlock1.aggregateBinaryOperations(matBlock1, matBlock2, new MatrixBlock(), ab_op, enableNative));
+		}
 			
 		//release inputs/outputs
 		ec.releaseMatrixInput(input1.getName());