You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@systemml.apache.org by na...@apache.org on 2017/06/01 04:54:42 UTC

[1/2] incubator-systemml git commit: [SYSTEMML-1625] GPU Unit Tests (and GPU row/col variance bug fix)

Repository: incubator-systemml
Updated Branches:
  refs/heads/master ceeec4bbf -> 772fb5883


http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/772fb588/src/test/java/org/apache/sysml/test/gpu/MatrixMatrixElementWiseOpTests.java
----------------------------------------------------------------------
diff --git a/src/test/java/org/apache/sysml/test/gpu/MatrixMatrixElementWiseOpTests.java b/src/test/java/org/apache/sysml/test/gpu/MatrixMatrixElementWiseOpTests.java
new file mode 100644
index 0000000..4052fef
--- /dev/null
+++ b/src/test/java/org/apache/sysml/test/gpu/MatrixMatrixElementWiseOpTests.java
@@ -0,0 +1,271 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.sysml.test.gpu;
+
+import java.util.Arrays;
+import java.util.HashMap;
+import java.util.List;
+
+import org.apache.sysml.api.mlcontext.Matrix;
+import org.apache.sysml.test.utils.TestUtils;
+import org.junit.Ignore;
+import org.junit.Test;
+
+/**
+ * Test Elementwise operations on the GPU
+ */
+public class MatrixMatrixElementWiseOpTests extends GPUTests {
+	private final static String TEST_NAME = "MatrixMatrixElementWiseOpTests";
+
+	// Shape/sparsity grids chosen to cover scalar (1), power-of-two, and odd/non-aligned sizes
+	private final int[] rowSizes = new int[] { 1, 64, 130, 1024, 2049 };
+	private final int[] columnSizes = new int[] { 1, 64, 130, 1024, 2049 };
+	private final double[] sparsities = new double[] { 0.0, 0.03, 0.3, 0.9 };
+	private final double[] scalars = new double[] { 0.0, 0.5, 2.0 };
+	private final int seed = 42;
+
+	@Override
+	public void setUp() {
+		TestUtils.clearAssertionInformation();
+		addTestConfiguration(TEST_DIR, TEST_NAME);
+		getAndLoadTestConfiguration(TEST_NAME);
+	}
+
+	@Test
+	public void testAxpy() {
+		// "a*X + Y" maps to the fused plus-multiply operator, opcode "+*" (was "gpu_-*", swapped with testAxmy)
+		runAxpyTest("O = a*X + Y", "X", "Y", "a", "O", "gpu_+*");
+	}
+
+	@Test
+	public void testAxmy() {
+		// "X - a*Y" maps to the fused minus-multiply operator, opcode "-*" (was "gpu_+*", swapped with testAxpy)
+		runAxpyTest("O = X - a*Y", "X", "Y", "a", "O", "gpu_-*");
+	}
+
+	@Test
+	public void testAdd() {
+		runMatrixMatrixElementwiseTest("O = X + Y", "X", "Y", "O", "gpu_+");
+	}
+
+	@Test
+	public void testMatrixColumnVectorAdd() {
+		runMatrixColumnVectorTest("O = X + Y", "X", "Y", "O", "gpu_+");
+	}
+
+	@Test
+	public void testMatrixRowVectorAdd() {
+		runMatrixRowVectorTest("O = X + Y", "X", "Y", "O", "gpu_+");
+	}
+
+	@Test
+	public void testSubtract() {
+		runMatrixMatrixElementwiseTest("O = X - Y", "X", "Y", "O", "gpu_-");
+	}
+
+	@Test
+	public void testMatrixColumnVectorSubtract() {
+		runMatrixColumnVectorTest("O = X - Y", "X", "Y", "O", "gpu_-");
+	}
+
+	@Test
+	public void testMatrixRowVectorSubtract() {
+		runMatrixRowVectorTest("O = X - Y", "X", "Y", "O", "gpu_-");
+	}
+
+	@Test
+	public void testMultiply() {
+		runMatrixMatrixElementwiseTest("O = X * Y", "X", "Y", "O", "gpu_*");
+	}
+
+	@Test
+	public void testMatrixColumnVectorMultiply() {
+		runMatrixColumnVectorTest("O = X * Y", "X", "Y", "O", "gpu_*");
+	}
+
+	@Test
+	public void testMatrixRowVectorMultiply() {
+		runMatrixRowVectorTest("O = X * Y", "X", "Y", "O", "gpu_*");
+	}
+
+	@Test
+	public void testDivide() {
+		runMatrixMatrixElementwiseTest("O = X / Y", "X", "Y", "O", "gpu_/");
+	}
+
+	@Test
+	public void testMatrixColumnVectorDivide() {
+		runMatrixColumnVectorTest("O = X / Y", "X", "Y", "O", "gpu_/");
+	}
+
+	@Test
+	public void testMatrixRowVectorDivide() {
+		runMatrixRowVectorTest("O = X / Y", "X", "Y", "O", "gpu_/");
+	}
+
+	// ****************************************************************
+	// ************************ IGNORED TEST **************************
+	// FIXME : There is a bug in CPU "^" when A ^ B is executed where A & B are all zeroes
+	@Ignore
+	@Test
+	public void testPower() {
+		// The power operator's opcode is "^" (was incorrectly "gpu_%")
+		runMatrixMatrixElementwiseTest("O = X ^ Y", "X", "Y", "O", "gpu_^");
+	}
+
+	/**
+	 * Runs a simple matrix-matrix elementwise op test
+	 *
+	 * @param scriptStr         the script string
+	 * @param input1            name of the first input variable in the script string
+	 * @param input2            name of the second input variable in the script string
+	 * @param output            name of the output variable in the script string
+	 * @param heavyHitterOpcode the string printed for the unary op heavy hitter when executed on gpu
+	 */
+	private void runMatrixMatrixElementwiseTest(String scriptStr, String input1, String input2, String output,
+			String heavyHitterOpcode) {
+		for (int i = 0; i < rowSizes.length; i++) {
+			for (int j = 0; j < columnSizes.length; j++) {
+				for (int k = 0; k < sparsities.length; k++) {
+					int m = rowSizes[i];
+					int n = columnSizes[j];
+					double sparsity = sparsities[k];
+					Matrix X = generateInputMatrix(spark, m, n, sparsity, seed);
+					Matrix Y = generateInputMatrix(spark, m, n, sparsity, seed);
+					HashMap<String, Object> inputs = new HashMap<>();
+					inputs.put(input1, X);
+					inputs.put(input2, Y);
+					List<Object> cpuOut = runOnCPU(spark, scriptStr, inputs, Arrays.asList(output));
+					List<Object> gpuOut = runOnGPU(spark, scriptStr, inputs, Arrays.asList(output));
+					//assertHeavyHitterPresent(heavyHitterOpcode);
+					assertEqualObjects(cpuOut.get(0), gpuOut.get(0));
+				}
+			}
+		}
+	}
+
+	/**
+	 * Run O = aX +/- Y type operations test
+	 *
+	 * @param scriptStr         the script string
+	 * @param input1            name of the first matrix input variable in the script string
+	 * @param input2            name of the second matrix input variable in the script string
+	 * @param scalarInput       name of the scalar which is multiplied with the first or second matrix
+	 * @param output            name of the output variable in the script string
+	 * @param heavyHitterOpcode the string printed for the unary op heavy hitter when executed on gpu
+	 */
+	private void runAxpyTest(String scriptStr, String input1, String input2, String scalarInput, String output,
+			String heavyHitterOpcode) {
+		for (int i = 0; i < rowSizes.length; i++) {
+			for (int j = 0; j < columnSizes.length; j++) {
+				for (int k = 0; k < sparsities.length; k++) {
+					for (int l = 0; l < scalars.length; l++) {
+						int m = rowSizes[i];
+						int n = columnSizes[j];
+						double scalar = scalars[l];
+						double sparsity = sparsities[k];
+						Matrix X = generateInputMatrix(spark, m, n, sparsity, seed);
+						Matrix Y = generateInputMatrix(spark, m, n, sparsity, seed);
+						HashMap<String, Object> inputs = new HashMap<>();
+						inputs.put(input1, X);
+						inputs.put(input2, Y);
+						inputs.put(scalarInput, scalar);
+
+						// Test O = aX + Y
+						List<Object> cpuOut = runOnCPU(spark, scriptStr, inputs, Arrays.asList(output));
+						List<Object> gpuOut = runOnGPU(spark, scriptStr, inputs, Arrays.asList(output));
+						//assertHeavyHitterPresent(heavyHitterOpcode);
+						assertEqualObjects(cpuOut.get(0), gpuOut.get(0));
+					}
+				}
+			}
+		}
+	}
+
+	/**
+	 * Run O = X op Y where X is a matrix, Y is a column vector
+	 *
+	 * @param scriptStr         the script string
+	 * @param matrixInput       name of the matrix input variable in the script string
+	 * @param vectorInput       name of the vector input variable in the script string
+	 * @param output            name of the output variable in the script string
+	 * @param heavyHitterOpcode the string printed for the unary op heavy hitter when executed on gpu
+	 */
+	private void runMatrixColumnVectorTest(String scriptStr, String matrixInput, String vectorInput, String output,
+			String heavyHitterOpcode) {
+		int[] rows = new int[] { 64, 130, 1024, 2049 };
+		int[] cols = new int[] { 64, 130, 1024, 2049 };
+
+		for (int i = 0; i < rows.length; i++) {
+			for (int j = 0; j < cols.length; j++) {
+				for (int k = 0; k < sparsities.length; k++) {
+					int m = rows[i];
+					int n = cols[j];
+					double sparsity = sparsities[k];
+					Matrix X = generateInputMatrix(spark, m, n, sparsity, seed);
+					Matrix Y = generateInputMatrix(spark, m, 1, sparsity, seed);
+					HashMap<String, Object> inputs = new HashMap<>();
+					inputs.put(matrixInput, X);
+					inputs.put(vectorInput, Y);
+
+					// Fixed log message: X is the [m,n] matrix, Y is the [m,1] column vector
+					System.out.println("Matrix[" + m + ", " + n + "] op ColumnVector[" + m + ", 1], sparsity = " + sparsity);
+					List<Object> cpuOut = runOnCPU(spark, scriptStr, inputs, Arrays.asList(output));
+					List<Object> gpuOut = runOnGPU(spark, scriptStr, inputs, Arrays.asList(output));
+					//assertHeavyHitterPresent(heavyHitterOpcode);
+					assertEqualObjects(cpuOut.get(0), gpuOut.get(0));
+
+				}
+			}
+		}
+	}
+
+	/**
+	 * Run O = X op Y where X is a matrix, Y is a row vector
+	 *
+	 * @param scriptStr         the script string
+	 * @param matrixInput       name of the matrix input variable in the script string
+	 * @param vectorInput       name of the vector input variable in the script string
+	 * @param output            name of the output variable in the script string
+	 * @param heavyHitterOpcode the string printed for the unary op heavy hitter when executed on gpu
+	 */
+	private void runMatrixRowVectorTest(String scriptStr, String matrixInput, String vectorInput, String output,
+			String heavyHitterOpcode) {
+		int[] rows = new int[] { 64, 130, 1024, 2049 };
+		int[] cols = new int[] { 64, 130, 1024, 2049 };
+
+		for (int i = 0; i < rows.length; i++) {
+			for (int j = 0; j < cols.length; j++) {
+				for (int k = 0; k < sparsities.length; k++) {
+					int m = rows[i];
+					int n = cols[j];
+					double sparsity = sparsities[k];
+					Matrix X = generateInputMatrix(spark, m, n, sparsity, seed);
+					Matrix Y = generateInputMatrix(spark, 1, n, sparsity, seed);
+					HashMap<String, Object> inputs = new HashMap<>();
+					inputs.put(matrixInput, X);
+					inputs.put(vectorInput, Y);
+
+					// Fixed log message: Y is a [1,n] ROW vector here (old message described a column vector)
+					System.out.println("Matrix[" + m + ", " + n + "] op RowVector[1, " + n + "], sparsity = " + sparsity);
+					List<Object> cpuOut = runOnCPU(spark, scriptStr, inputs, Arrays.asList(output));
+					List<Object> gpuOut = runOnGPU(spark, scriptStr, inputs, Arrays.asList(output));
+					//assertHeavyHitterPresent(heavyHitterOpcode);
+					assertEqualObjects(cpuOut.get(0), gpuOut.get(0));
+				}
+			}
+		}
+	}
+
+}

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/772fb588/src/test/java/org/apache/sysml/test/gpu/MatrixMultiplicationOpTest.java
----------------------------------------------------------------------
diff --git a/src/test/java/org/apache/sysml/test/gpu/MatrixMultiplicationOpTest.java b/src/test/java/org/apache/sysml/test/gpu/MatrixMultiplicationOpTest.java
new file mode 100644
index 0000000..f7c7851
--- /dev/null
+++ b/src/test/java/org/apache/sysml/test/gpu/MatrixMultiplicationOpTest.java
@@ -0,0 +1,190 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.sysml.test.gpu;
+
+import java.util.Arrays;
+import java.util.HashMap;
+import java.util.List;
+
+import org.apache.sysml.api.mlcontext.Matrix;
+import org.apache.sysml.test.utils.TestUtils;
+import org.junit.Test;
+
+/**
+ * Tests matrix multiplication on the GPU
+ */
+public class MatrixMultiplicationOpTest extends GPUTests {
+	private final static String TEST_NAME = "MatrixMultiplicationOpTest";
+	private final int seed = 42;
+
+	@Override
+	public void setUp() {
+		TestUtils.clearAssertionInformation();
+		addTestConfiguration(TEST_DIR, TEST_NAME);
+		getAndLoadTestConfiguration(TEST_NAME);
+	}
+
+	@Override
+	public double getTHRESHOLD() {
+		return 1e-5;
+	}
+
+	@Test
+	public void matrixMatrixTest1() {
+		String scriptStr = "O = X %*% Y";
+
+		int[] X1 = { 1, 128, 513, 1024 };
+		int[] X2 = { 128, 512, 1024 };
+		int[] Y2 = { 1, 128, 513, 1024 };
+		double[] SX = { 0.0, 0.03, 0.3, 0.9 };
+		double[] SY = { 0.0, 0.03, 0.3, 0.9 };
+
+		for (int x1 = 0; x1 < X1.length; x1++) {
+			for (int x2 = 0; x2 < X2.length; x2++) {
+				int y1 = x2; // inner dimension: rows(Y) must equal cols(X)
+				for (int y2 = 0; y2 < Y2.length; y2++) {
+					for (int sx = 0; sx < SX.length; sx++) {
+						for (int sy = 0; sy < SY.length; sy++) {
+							assertMatrixMultiplication(scriptStr, X1[x1], X2[x2], X2[y1], Y2[y2], SX[sx], SY[sy]);
+						}
+					}
+				}
+			}
+		}
+	}
+
+	@Test
+	public void matrixMatrixTest2() {
+		String scriptStr = "O = X %*% t(Y)";
+
+		int[] X1 = { 1, 128, 513, 1024 };
+		int[] X2 = { 128, 512, 1024 };
+		int[] Y1 = { 1, 128, 513, 1024 };
+		double[] SX = { 0.0, 0.03, 0.3, 0.9 };
+		double[] SY = { 0.0, 0.03, 0.3, 0.9 };
+
+		for (int x1 = 0; x1 < X1.length; x1++) {
+			for (int x2 = 0; x2 < X2.length; x2++) {
+				int y2 = x2; // inner dimension: cols(Y) must equal cols(X) since t(Y) is used
+				for (int y1 = 0; y1 < Y1.length; y1++) {
+					for (int sx = 0; sx < SX.length; sx++) {
+						for (int sy = 0; sy < SY.length; sy++) {
+							// Fixed: rows(Y) must vary with the y1 loop (was Y1[x2], which ignored y1
+							// and repeated the same shape Y1.length times)
+							assertMatrixMultiplication(scriptStr, X1[x1], X2[x2], Y1[y1], X2[y2], SX[sx], SY[sy]);
+						}
+					}
+				}
+			}
+		}
+	}
+
+	@Test
+	public void matrixMatrixTest3() {
+		String scriptStr = "O = t(X) %*% Y";
+
+		int[] X1 = { 1, 128, 513, 1024 };
+		int[] X2 = { 128, 512, 1024 };
+		int[] Y2 = { 1, 128, 513, 1024 };
+		double[] SX = { 0.0, 0.03, 0.3, 0.9 };
+		double[] SY = { 0.0, 0.03, 0.3, 0.9 };
+
+		for (int x1 = 0; x1 < X1.length; x1++) {
+			int y1 = x1; // inner dimension: rows(Y) must equal rows(X) since t(X) is used
+			for (int x2 = 0; x2 < X2.length; x2++) {
+				for (int y2 = 0; y2 < Y2.length; y2++) {
+					for (int sx = 0; sx < SX.length; sx++) {
+						for (int sy = 0; sy < SY.length; sy++) {
+							assertMatrixMultiplication(scriptStr, X1[x1], X2[x2], X1[y1], Y2[y2], SX[sx], SY[sy]);
+						}
+					}
+				}
+			}
+		}
+	}
+
+	@Test
+	public void matrixMatrixTest4() {
+		String scriptStr = "O = t(X) %*% t(Y)";
+
+		int[] X1 = { 1, 128, 513, 1024 };
+		int[] X2 = { 128, 512, 1024 };
+		int[] Y1 = { 1, 128, 513, 1024 };
+		double[] SX = { 0.0, 0.03, 0.3, 0.9 };
+		double[] SY = { 0.0, 0.03, 0.3, 0.9 };
+
+		for (int x1 = 0; x1 < X1.length; x1++) {
+			int y2 = x1; // inner dimension: cols(Y) must equal rows(X) since both are transposed
+			for (int x2 = 0; x2 < X2.length; x2++) {
+				for (int y1 = 0; y1 < Y1.length; y1++) {
+					for (int sx = 0; sx < SX.length; sx++) {
+						for (int sy = 0; sy < SY.length; sy++) {
+							assertMatrixMultiplication(scriptStr, X1[x1], X2[x2], Y1[y1], X1[y2], SX[sx], SY[sy]);
+						}
+					}
+				}
+			}
+		}
+	}
+
+	@Test
+	public void transposeSelfMatrixMultiply() {
+		String scriptStr = "O = t(X) %*% X";
+
+		int[] sizes = { 1, 128, 512, 1024, 2049 };
+		double[] sparsities = { 0.0, 0.03, 0.3, 0.9 };
+
+		for (int i = 0; i < sizes.length; i++) {
+			for (int j = 0; j < sparsities.length; j++) {
+				int side = sizes[i];
+				double sparsity = sparsities[j];
+				Matrix X = generateInputMatrix(spark, side, side, sparsity, seed);
+				HashMap<String, Object> inputs = new HashMap<>();
+				inputs.put("X", X);
+				List<Object> cpuOuts = runOnCPU(spark, scriptStr, inputs, Arrays.asList("O"));
+				List<Object> gpuOuts = runOnGPU(spark, scriptStr, inputs, Arrays.asList("O"));
+				//assertHeavyHitterPresent("gpu_tsmm");
+				assertEqualObjects(cpuOuts.get(0), gpuOuts.get(0));
+			}
+		}
+	}
+
+	/**
+	 * Assert that matrix multiplication is the same on gpu and cpu
+	 *
+	 * @param scriptStr script string that has matrix multiplication (eg : O = X %*% Y)
+	 * @param rows1     rows in X
+	 * @param cols1     cols in X
+	 * @param rows2     rows in Y
+	 * @param cols2     cols in Y
+	 * @param sparsity1 sparsity for X
+	 * @param sparsity2 sparsity for Y
+	 */
+	private void assertMatrixMultiplication(String scriptStr, int rows1, int cols1, int rows2, int cols2,
+			double sparsity1, double sparsity2) {
+		HashMap<String, Object> inputs = new HashMap<>();
+		Matrix X = generateInputMatrix(spark, rows1, cols1, sparsity1, seed);
+		Matrix Y = generateInputMatrix(spark, rows2, cols2, sparsity2, seed);
+		inputs.put("X", X);
+		inputs.put("Y", Y);
+		List<Object> cpuOuts = runOnCPU(spark, scriptStr, inputs, Arrays.asList("O"));
+		List<Object> gpuOuts = runOnGPU(spark, scriptStr, inputs, Arrays.asList("O"));
+		//assertHeavyHitterPresent("gpu_ba+*");
+		assertEqualObjects(cpuOuts.get(0), gpuOuts.get(0));
+	}
+}

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/772fb588/src/test/java/org/apache/sysml/test/gpu/NeuralNetworkOpTests.java
----------------------------------------------------------------------
diff --git a/src/test/java/org/apache/sysml/test/gpu/NeuralNetworkOpTests.java b/src/test/java/org/apache/sysml/test/gpu/NeuralNetworkOpTests.java
new file mode 100644
index 0000000..f1f1ea5
--- /dev/null
+++ b/src/test/java/org/apache/sysml/test/gpu/NeuralNetworkOpTests.java
@@ -0,0 +1,508 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.sysml.test.gpu;
+
+import java.util.Arrays;
+import java.util.HashMap;
+import java.util.List;
+
+import org.apache.sysml.api.mlcontext.Matrix;
+import org.apache.sysml.runtime.DMLRuntimeException;
+import org.apache.sysml.runtime.instructions.gpu.context.GPUContext;
+import org.apache.sysml.runtime.instructions.gpu.context.GPUContextPool;
+import org.apache.sysml.runtime.util.ConvolutionUtils;
+import org.apache.sysml.test.utils.TestUtils;
+import org.junit.Test;
+
+/**
+ * Test neural network operations on the GPU
+ */
+public class NeuralNetworkOpTests extends GPUTests {
+
+	private final static String TEST_NAME = "NeuralNetworkOpTests";
+	private final int seed = 42;
+
+	// The MAX_OP_SIZE is to take into consideration the memory available on the GPU as well as
+	// limits set by cudnn (operands need to be less than 2GB)
+	private static final double MAX_OP_SIZE;
+
+	static {
+		double MAX = 0.5 * 1024 * 1024 * 1024; // 0.5 GB (this HAS to be less than 2GB)
+		try {
+			// Cap the maximum allowed operand size to 1/3rd of the usable GPU memory or MAX, whichever is lesser
+			GPUContext gCtx = GPUContextPool.getFromPool();
+			long availableMemory = gCtx.getAvailableMemory();
+			double averageMemoryPerOperand = availableMemory / 3.0;
+			MAX_OP_SIZE = Math.min(averageMemoryPerOperand, MAX);
+			// Return the context so the tests themselves can acquire it later
+			GPUContextPool.returnToPool(gCtx);
+		} catch (DMLRuntimeException e) {
+			// No GPU context means none of these tests can run; fail class initialization fast
+			throw new RuntimeException(e);
+		}
+
+	}
+
+	// More comprehensive but time consuming tests
+	/*
+	private final List<Integer> Nlst = Arrays.asList(128, 64, 32);
+    private final List<Integer> Clst = Arrays.asList(30, 20, 3);
+    private final List<Integer> Hlst = Arrays.asList(400, 128, 32);
+    private final List<Integer> Wlst = Arrays.asList(400, 128, 32);
+    private final List<Integer> Klst = Arrays.asList(30, 20, 10);
+    private final List<Integer> Rlst = Arrays.asList(128, 63, 4);
+    private final List<Integer> Slst = Arrays.asList(128, 63, 4);
+    private final List<Integer> strideHeightLst = Arrays.asList(9, 3);
+    private final List<Integer> strideWidthLst = Arrays.asList(9, 3);
+    private final List<Integer> padHeightLst = Arrays.asList(3, 1);
+    private final List<Integer> padWidthLst = Arrays.asList(3, 1);
+    private final List<Double> sparsitylst = Arrays.asList(1.0);    // Only test for dense
+    */
+
+	// Parameter grids swept by the tests below.
+	// N = batch size, C = input channels, H/W = image height/width,
+	// K = number of filters, R/S = filter height/width.
+	private final List<Integer> Nlst = Arrays.asList(128, 64);
+	private final List<Integer> Clst = Arrays.asList(30, 3);
+	private final List<Integer> Hlst = Arrays.asList(256, 64);
+	private final List<Integer> Wlst = Arrays.asList(256, 64);
+	private final List<Integer> Klst = Arrays.asList(30, 20);
+	private final List<Integer> Rlst = Arrays.asList(128, 3, 1);
+	private final List<Integer> Slst = Arrays.asList(128, 3, 1);
+	private final List<Integer> strideHeightLst = Arrays.asList(9, 1);
+	private final List<Integer> strideWidthLst = Arrays.asList(9, 1);
+	private final List<Integer> padHeightLst = Arrays.asList(3, 1);
+	private final List<Integer> padWidthLst = Arrays.asList(3, 1);
+	private final List<Double> sparsitylst = Arrays.asList(1.0);    // Only test for dense
+
+	// Registers and loads the test configuration for this test class
+	@Override
+	public void setUp() {
+		TestUtils.clearAssertionInformation();
+		addTestConfiguration(TEST_DIR, TEST_NAME);
+		getAndLoadTestConfiguration(TEST_NAME);
+	}
+
+	// Relaxed CPU-vs-GPU comparison tolerance for this class.
+	// NOTE(review): 1e-5 appears to be chosen empirically — confirm against GPUTests' default.
+	@Override
+	public double getTHRESHOLD() {
+		return 1e-5;
+	}
+
+	/** Compares conv2d results between CPU and GPU over the parameter grids, skipping shapes that exceed cudnn/GPU memory limits. */
+	@Test
+	public void testConv2d() {
+		String scriptStr = "O = conv2d(image, filter, padding=[padH, padW], stride=[strideH, strideW], input_shape=[N,C,H,W], filter_shape=[K,C,R,S])";
+
+		for (long N : Nlst) {
+			for (long C : Clst) {
+				for (long H : Hlst) {
+					for (long W : Wlst) {
+						for (long K : Klst) {
+							for (long R : Rlst) {
+								for (long S : Slst) {
+									for (long strideH : strideHeightLst) {
+										for (long strideW : strideWidthLst) {
+											for (long padH : padHeightLst) {
+												for (long padW : padWidthLst) {
+													for (double sparsity : sparsitylst) {
+
+														// Make sure ops fit in GPU memory and within constraints of cudnn
+														long imageSize = N * C * H * W * 8l;
+														if (imageSize > MAX_OP_SIZE)  // image size
+															continue;
+														long filterSize = K * C * R * S * 8l;
+														if (filterSize > MAX_OP_SIZE)  // filter size
+															continue;
+														// filter is smaller than image + padding
+														if (R > (H + padH) || S > (W + padW))
+															continue;
+
+														int P = (int) ConvolutionUtils.getP(H, R, strideH, padH);
+														int Q = (int) ConvolutionUtils.getQ(W, S, strideW, padW);
+
+														long doutSize = N * K * P * Q * 8l;
+														if (doutSize > MAX_OP_SIZE) // dout/output size
+															continue;
+
+														double imageSizeInMB = imageSize / (1024.0 * 1024.0);
+														double filterSizeInMB = filterSize / (1024.0 * 1024.0);
+														double doutSizeInMB = doutSize / (1024.0 * 1024.0);
+														// Fixed log message: filter dims are [K,C,R,S] (args were N,C,R,S),
+														// "MB" unit added for filter size, and %n appended so lines don't run together
+														System.out
+																.format("conv2d, image[%d,%d,%d,%d](%.1fMB), filter[%d,%d,%d,%d](%.1fMB), dout[%d,%d,%d,%d](%.1fMB), stride[%d,%d], padding[%d,%d]%n",
+																		N, C, H, W, imageSizeInMB, K, C, R, S,
+																		filterSizeInMB, N, K, P, Q, doutSizeInMB,
+																		strideH, strideW, padH, padW);
+														Matrix image = generateInputMatrix(spark, (int) N,
+																(int) (C * H * W), sparsity, seed);
+														Matrix filter = generateInputMatrix(spark, (int) K,
+																(int) (C * R * S), sparsity, seed);
+														HashMap<String, Object> inputs = new HashMap<>();
+														inputs.put("N", N);
+														inputs.put("C", C);
+														inputs.put("H", H);
+														inputs.put("W", W);
+														inputs.put("K", K);
+														inputs.put("R", R);
+														inputs.put("S", S);
+														inputs.put("strideH", strideH);
+														inputs.put("strideW", strideW);
+														inputs.put("padH", padH);
+														inputs.put("padW", padW);
+														inputs.put("image", image);
+														inputs.put("filter", filter);
+														List<Object> outCPU = runOnCPU(spark, scriptStr, inputs,
+																Arrays.asList("O"));
+														List<Object> outGPU = runOnGPU(spark, scriptStr, inputs,
+																Arrays.asList("O"));
+														assertHeavyHitterPresent("gpu_conv2d");
+														assertEqualObjects(outCPU.get(0), outGPU.get(0));
+														clearGPUMemory();
+													}
+												}
+											}
+										}
+									}
+								}
+							}
+						}
+					}
+				}
+			}
+		}
+	}
+
+	/** Compares conv2d_backward_filter results between CPU and GPU over the parameter grids, skipping infeasible shapes. */
+	@Test
+	public void testConv2dBackwardFilter() {
+		String scriptStr = "O = conv2d_backward_filter(image, dout, padding=[padH, padW], stride=[strideH, strideW], input_shape=[N,C,H,W], filter_shape=[K,C,R,S])";
+
+		for (long N : Nlst) {
+			for (long C : Clst) {
+				for (long H : Hlst) {
+					for (long W : Wlst) {
+						for (long K : Klst) {
+							for (long R : Rlst) {
+								for (long S : Slst) {
+									for (long strideH : strideHeightLst) {
+										for (long strideW : strideWidthLst) {
+											for (long padH : padHeightLst) {
+												for (long padW : padWidthLst) {
+													for (double sparsity : sparsitylst) {
+
+														// filter is smaller than image + padding
+														if (R > (H + padH) || S > (W + padW))
+															continue;
+
+														// Make sure ops fit in GPU memory and within constraints of cudnn
+														long imageSize = N * C * H * W * 8l;
+														if (imageSize > MAX_OP_SIZE)  // image size
+															continue;
+														long filterSize = K * C * R * S * 8l;
+														if (filterSize > MAX_OP_SIZE)  // filter size
+															continue;
+
+														int P = (int) ConvolutionUtils.getP(H, R, strideH, padH);
+														int Q = (int) ConvolutionUtils.getQ(W, S, strideW, padW);
+
+														long doutSize = N * K * P * Q * 8l;
+														if (doutSize > MAX_OP_SIZE) // dout/output size
+															continue;
+
+														double imageSizeInMB = imageSize / (1024.0 * 1024.0);
+														double filterSizeInMB = filterSize / (1024.0 * 1024.0);
+														double doutSizeInMB = doutSize / (1024.0 * 1024.0);
+														// Fixed log message: filter dims are [K,C,R,S] (args were N,C,R,S),
+														// "MB" unit added for filter size, and %n appended so lines don't run together
+														System.out
+																.format("conv2d_backward_filter, image[%d,%d,%d,%d](%.1fMB), filter[%d,%d,%d,%d](%.1fMB), dout[%d,%d,%d,%d](%.1fMB), stride[%d,%d], padding[%d,%d]%n",
+																		N, C, H, W, imageSizeInMB, K, C, R, S,
+																		filterSizeInMB, N, K, P, Q, doutSizeInMB,
+																		strideH, strideW, padH, padW);
+														Matrix image = generateInputMatrix(spark, (int) N,
+																(int) (C * H * W), sparsity, seed);
+														Matrix dout = generateInputMatrix(spark, (int) N,
+																(int) (K * P * Q), sparsity, seed);
+														HashMap<String, Object> inputs = new HashMap<>();
+														inputs.put("N", N);
+														inputs.put("C", C);
+														inputs.put("H", H);
+														inputs.put("W", W);
+														inputs.put("K", K);
+														inputs.put("R", R);
+														inputs.put("S", S);
+														inputs.put("strideH", strideH);
+														inputs.put("strideW", strideW);
+														inputs.put("padH", padH);
+														inputs.put("padW", padW);
+														inputs.put("image", image);
+														inputs.put("dout", dout);
+														List<Object> outCPU = runOnCPU(spark, scriptStr, inputs,
+																Arrays.asList("O"));
+														List<Object> outGPU = runOnGPU(spark, scriptStr, inputs,
+																Arrays.asList("O"));
+														assertHeavyHitterPresent("gpu_conv2d_backward_filter");
+														assertEqualObjects(outCPU.get(0), outGPU.get(0));
+														clearGPUMemory();
+													}
+												}
+											}
+										}
+									}
+								}
+							}
+						}
+					}
+				}
+			}
+		}
+	}
+
+	@Test
+	public void testConv2dBackwardData() {
+		String scriptStr = "O = conv2d_backward_data(filter, dout, padding=[padH, padW], stride=[strideH, strideW], input_shape=[N,C,H,W], filter_shape=[K,C,R,S])";
+
+		for (long N : Nlst) {
+			for (long C : Clst) {
+				for (long H : Hlst) {
+					for (long W : Wlst) {
+						for (long K : Klst) {
+							for (long R : Rlst) {
+								for (long S : Slst) {
+									for (long strideH : strideHeightLst) {
+										for (long strideW : strideWidthLst) {
+											for (long padH : padHeightLst) {
+												for (long padW : padWidthLst) {
+													for (double sparsity : sparsitylst) {
+
+														// filter is smaller than image + padding
+														if (R > (H + padH) || S > (W + padW))
+															continue;
+
+														// Make sure ops fit in GPU memory and within constraints of cudnn
+														long imageSize = N * C * H * W * 8l;
+														if (imageSize > MAX_OP_SIZE)  // image size
+															continue;
+														long filterSize = K * C * R * S * 8l;
+														if (filterSize > MAX_OP_SIZE)  // filter size
+															continue;
+
+														int P = (int) ConvolutionUtils.getP(H, R, strideH, padH);
+														int Q = (int) ConvolutionUtils.getQ(W, S, strideW, padW);
+
+														long doutSize = N * K * P * Q * 8l;
+														if (doutSize > MAX_OP_SIZE) // dout/output size
+															continue;
+
+														double imageSizeInMB = imageSize / (1024.0 * 1024.0);
+														double filterSizeInMB = filterSize / (1024.0 * 1024.0);
+														double doutSizeInMB = doutSize / (1024.0 * 1024.0);
+														System.out
+																.format("conv2d_backward_data, image[%d,%d,%d,%d](%.1fMB), filter[%d,%d,%d,%d](%.1f), dout[%d,%d,%d,%d](%.1fMB), stride[%d,%d], padding[%d,%d]",
+																		N, C, H, W, imageSizeInMB, N, C, R, S,
+																		filterSizeInMB, N, K, P, Q, doutSizeInMB,
+																		strideH, strideW, padH, padW);
+
+														Matrix filter = generateInputMatrix(spark, (int) K,
+																(int) (C * R * S), sparsity, seed);
+														Matrix dout = generateInputMatrix(spark, (int) N,
+																(int) (K * P * Q), sparsity, seed);
+														HashMap<String, Object> inputs = new HashMap<>();
+														inputs.put("N", N);
+														inputs.put("C", C);
+														inputs.put("H", H);
+														inputs.put("W", W);
+														inputs.put("K", K);
+														inputs.put("R", R);
+														inputs.put("S", S);
+														inputs.put("strideH", strideH);
+														inputs.put("strideW", strideW);
+														inputs.put("padH", padH);
+														inputs.put("padW", padW);
+														inputs.put("filter", filter);
+														inputs.put("dout", dout);
+														List<Object> outCPU = runOnCPU(spark, scriptStr, inputs,
+																Arrays.asList("O"));
+														List<Object> outGPU = runOnGPU(spark, scriptStr, inputs,
+																Arrays.asList("O"));
+														assertHeavyHitterPresent("gpu_conv2d_backward_data");
+														assertEqualObjects(outCPU.get(0), outGPU.get(0));
+														clearGPUMemory();
+													}
+												}
+											}
+										}
+									}
+								}
+							}
+						}
+					}
+				}
+			}
+		}
+	}
+
+	/**
+	 * Compares the GPU max_pool result against the CPU result over a grid of
+	 * image shapes (N,C,H,W), pool sizes (R,S), strides, paddings and sparsities.
+	 * Combinations where the pool window exceeds the padded image, or where the
+	 * image/pool/output buffers exceed MAX_OP_SIZE bytes (GPU memory / cudnn
+	 * constraints), are skipped.
+	 */
+	@Test
+	public void testMaxPool() {
+		String scriptStr = "O = max_pool(image, padding=[padH, padW], stride=[strideH, strideW], input_shape=[N,C,H,W], pool_size=[R,S])";
+
+		for (long N : Nlst) {
+			for (long C : Clst) {
+				for (long H : Hlst) {
+					for (long W : Wlst) {
+						for (long R : Rlst) {
+							for (long S : Slst) {
+								for (long strideH : strideHeightLst) {
+									for (long strideW : strideWidthLst) {
+										for (long padH : padHeightLst) {
+											for (long padW : padWidthLst) {
+												for (double sparsity : sparsitylst) {
+
+													// pool is smaller than image + padding
+													if (R > (H + padH) || S > (W + padW))
+														continue;
+
+													// Make sure ops fit in GPU memory and within constraints of cudnn
+													// (sizes below are in bytes: doubles, hence the factor 8)
+													long imageSize = N * C * H * W * 8l;
+													if (imageSize > MAX_OP_SIZE)  // image size
+														continue;
+													long poolSize = R * S * 8l;
+													if (poolSize > MAX_OP_SIZE)  // filter size
+														continue;
+
+													// P, Q: output height/width of the pooling op
+													int P = (int) ConvolutionUtils.getP(H, R, strideH, padH);
+													int Q = (int) ConvolutionUtils.getQ(W, S, strideW, padW);
+
+													long doutSize = N * C * P * Q * 8l;
+													if (doutSize > MAX_OP_SIZE) // dout/output size
+														continue;
+
+													double imageSizeInMB = imageSize / (1024.0 * 1024.0);
+													double poolSizeInMB = poolSize / (1024.0 * 1024.0);
+													double doutSizeInMB = doutSize / (1024.0 * 1024.0);
+													System.out
+															.format("max_pool, image[%d,%d,%d,%d](%.1fMB), pool[%d,%d](%.1f), dout[%d,%d,%d,%d](%.1fMB), stride[%d,%d], padding[%d,%d]",
+																	N, C, H, W, imageSizeInMB, R, S, poolSizeInMB, N, C,
+																	P, Q, doutSizeInMB, strideH, strideW, padH, padW);
+
+													// image is generated as a 2D matrix of shape N x (C*H*W)
+													Matrix image = generateInputMatrix(spark, (int) N,
+															(int) (C * H * W), sparsity, seed);
+													HashMap<String, Object> inputs = new HashMap<>();
+													inputs.put("N", N);
+													inputs.put("C", C);
+													inputs.put("H", H);
+													inputs.put("W", W);
+													inputs.put("R", R);
+													inputs.put("S", S);
+													inputs.put("strideH", strideH);
+													inputs.put("strideW", strideW);
+													inputs.put("padH", padH);
+													inputs.put("padW", padW);
+													inputs.put("image", image);
+													List<Object> outCPU = runOnCPU(spark, scriptStr, inputs,
+															Arrays.asList("O"));
+													List<Object> outGPU = runOnGPU(spark, scriptStr, inputs,
+															Arrays.asList("O"));
+													assertHeavyHitterPresent("gpu_maxpooling");
+													assertEqualObjects(outCPU.get(0), outGPU.get(0));
+													clearGPUMemory();
+												}
+											}
+										}
+									}
+								}
+							}
+						}
+					}
+				}
+			}
+		}
+	}
+
+	/**
+	 * Compares the GPU max_pool_backward result against the CPU result over a
+	 * grid of image shapes (N,C,H,W), pool sizes (R,S), strides, paddings and
+	 * sparsities. Combinations where the pool window exceeds the padded image,
+	 * or where the image/pool/dout buffers exceed MAX_OP_SIZE bytes (GPU memory
+	 * / cudnn constraints), are skipped.
+	 */
+	@Test
+	public void testMaxPoolBackward() {
+		String scriptStr = "O = max_pool_backward(image, dout, padding=[padH, padW], stride=[strideH, strideW], input_shape=[N,C,H,W], pool_size=[R,S])";
+
+		for (long N : Nlst) {
+			for (long C : Clst) {
+				for (long H : Hlst) {
+					for (long W : Wlst) {
+						for (long R : Rlst) {
+							for (long S : Slst) {
+								for (long strideH : strideHeightLst) {
+									for (long strideW : strideWidthLst) {
+										for (long padH : padHeightLst) {
+											for (long padW : padWidthLst) {
+												for (double sparsity : sparsitylst) {
+
+													// pool is smaller than image + padding
+													if (R > (H + padH) || S > (W + padW))
+														continue;
+
+													// Make sure ops fit in GPU memory and within constraints of cudnn
+													// (sizes below are in bytes: doubles, hence the factor 8)
+													long imageSize = N * C * H * W * 8l;
+													if (imageSize > MAX_OP_SIZE)  // image size
+														continue;
+													long poolSize = R * S * 8l;
+													if (poolSize > MAX_OP_SIZE)  // filter size
+														continue;
+
+													// P, Q: output height/width of the forward pooling op;
+													// dout (the incoming gradient) has this shape
+													int P = (int) ConvolutionUtils.getP(H, R, strideH, padH);
+													int Q = (int) ConvolutionUtils.getQ(W, S, strideW, padW);
+
+													long doutSize = N * C * P * Q * 8l;
+													if (doutSize > MAX_OP_SIZE) // dout/output size
+														continue;
+
+													double imageSizeInMB = imageSize / (1024.0 * 1024.0);
+													double poolSizeInMB = poolSize / (1024.0 * 1024.0);
+													double doutSizeInMB = doutSize / (1024.0 * 1024.0);
+													System.out
+															.format("max_pool_backward, image[%d,%d,%d,%d](%.1fMB), pool[%d,%d](%.1f), dout[%d,%d,%d,%d](%.1fMB), stride[%d,%d], padding[%d,%d]",
+																	N, C, H, W, imageSizeInMB, R, S, poolSizeInMB, N, C,
+																	P, Q, doutSizeInMB, strideH, strideW, padH, padW);
+
+													// image: N x (C*H*W), dout: N x (C*P*Q), both row-flattened
+													Matrix image = generateInputMatrix(spark, (int) N,
+															(int) (C * H * W), sparsity, seed);
+													Matrix dout = generateInputMatrix(spark, (int) N, (int) (C * P * Q),
+															sparsity, seed);
+													HashMap<String, Object> inputs = new HashMap<>();
+													inputs.put("N", N);
+													inputs.put("C", C);
+													inputs.put("H", H);
+													inputs.put("W", W);
+													inputs.put("R", R);
+													inputs.put("S", S);
+													inputs.put("strideH", strideH);
+													inputs.put("strideW", strideW);
+													inputs.put("padH", padH);
+													inputs.put("padW", padW);
+													inputs.put("image", image);
+													inputs.put("dout", dout);
+													List<Object> outCPU = runOnCPU(spark, scriptStr, inputs,
+															Arrays.asList("O"));
+													List<Object> outGPU = runOnGPU(spark, scriptStr, inputs,
+															Arrays.asList("O"));
+													assertHeavyHitterPresent("gpu_maxpooling_backward");
+													assertEqualObjects(outCPU.get(0), outGPU.get(0));
+													clearGPUMemory();
+												}
+											}
+										}
+									}
+								}
+							}
+						}
+					}
+				}
+			}
+		}
+	}
+
+}

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/772fb588/src/test/java/org/apache/sysml/test/gpu/ReorgOpTests.java
----------------------------------------------------------------------
diff --git a/src/test/java/org/apache/sysml/test/gpu/ReorgOpTests.java b/src/test/java/org/apache/sysml/test/gpu/ReorgOpTests.java
new file mode 100644
index 0000000..b5b71f8
--- /dev/null
+++ b/src/test/java/org/apache/sysml/test/gpu/ReorgOpTests.java
@@ -0,0 +1,70 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.sysml.test.gpu;
+
+import java.util.Arrays;
+import java.util.HashMap;
+import java.util.List;
+
+import org.apache.sysml.api.mlcontext.Matrix;
+import org.apache.sysml.test.utils.TestUtils;
+import org.junit.Test;
+
+/**
+ * Tests for GPU transpose
+ */
+public class ReorgOpTests extends GPUTests {
+
+	private final static String TEST_NAME = "ReorgOpTests";
+	private final int[] rowSizes = new int[] { 1, 64, 130, 1024, 2049 };
+	private final int[] columnSizes = new int[] { 1, 64, 130, 1024, 2049 };
+	private final double[] sparsities = new double[] { 0.0, 0.03, 0.3, 0.9 };
+	private final int seed = 42;
+
+	@Override
+	public void setUp() {
+		TestUtils.clearAssertionInformation();
+		addTestConfiguration(TEST_DIR, TEST_NAME);
+		getAndLoadTestConfiguration(TEST_NAME);
+	}
+
+	/**
+	 * Runs t(in1) on both CPU and GPU for every combination of row size,
+	 * column size and sparsity, and asserts the two outputs are equal.
+	 */
+	@Test
+	public void transposeTest() {
+		String scriptStr = "out = t(in1)";
+
+		for (int rows : rowSizes) {
+			for (int cols : columnSizes) {
+				for (double sparsity : sparsities) {
+					Matrix in1 = generateInputMatrix(spark, rows, cols, sparsity, seed);
+					HashMap<String, Object> inputs = new HashMap<>();
+					inputs.put("in1", in1);
+					List<Object> cpuOuts = runOnCPU(spark, scriptStr, inputs, Arrays.asList("out"));
+					List<Object> gpuOuts = runOnGPU(spark, scriptStr, inputs, Arrays.asList("out"));
+					//assertHeavyHitterPresent("gpu_r'");
+					assertEqualObjects(cpuOuts.get(0), gpuOuts.get(0));
+				}
+			}
+		}
+	}
+}

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/772fb588/src/test/java/org/apache/sysml/test/gpu/ScalarMatrixElementwiseOpTests.java
----------------------------------------------------------------------
diff --git a/src/test/java/org/apache/sysml/test/gpu/ScalarMatrixElementwiseOpTests.java b/src/test/java/org/apache/sysml/test/gpu/ScalarMatrixElementwiseOpTests.java
new file mode 100644
index 0000000..65e6365
--- /dev/null
+++ b/src/test/java/org/apache/sysml/test/gpu/ScalarMatrixElementwiseOpTests.java
@@ -0,0 +1,131 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.sysml.test.gpu;
+
+import java.util.Arrays;
+import java.util.HashMap;
+import java.util.List;
+
+import org.apache.sysml.api.mlcontext.Matrix;
+import org.apache.sysml.test.utils.TestUtils;
+import org.junit.Ignore;
+import org.junit.Test;
+
+/**
+ * Tests scalar-matrix element wise operations on the GPU
+ */
+public class ScalarMatrixElementwiseOpTests extends GPUTests {
+
+	private final static String TEST_NAME = "ScalarMatrixElementwiseOpTests";
+
+	private final int[] rowSizes = new int[] { 1, 64, 130, 2049 };
+	private final int[] columnSizes = new int[] { 1, 64, 130, 2049 };
+	private final double[] sparsities = new double[] { 0.0, 0.03, 0.3, 0.9 };
+	private final int seed = 42;
+
+	@Override
+	public void setUp() {
+		TestUtils.clearAssertionInformation();
+		addTestConfiguration(TEST_DIR, TEST_NAME);
+		getAndLoadTestConfiguration(TEST_NAME);
+	}
+
+	@Test
+	public void testPlusRightScalar() {
+		runScalarMatrixElementWiseTests("O = X + scalar", "X", "scalar", "O", new double[] { 0.0, 0.5, 20.0 }, "gpu_+");
+	}
+
+	@Test
+	public void testPlusLeftScalar() {
+		runScalarMatrixElementWiseTests("O = scalar + X", "X", "scalar", "O", new double[] { 0.0, 0.5, 20.0 }, "gpu_+");
+	}
+
+	@Test
+	public void testMinusRightScalar() {
+		runScalarMatrixElementWiseTests("O = X - scalar", "X", "scalar", "O", new double[] { 0.0, 0.5, 1.0 }, "gpu_-");
+	}
+
+	@Test
+	public void testMinusLeftScalar() {
+		runScalarMatrixElementWiseTests("O = scalar - X", "X", "scalar", "O", new double[] { 0.0, 0.5, 1.0 }, "gpu_-");
+	}
+
+	@Test
+	public void testMultRightScalar() {
+		runScalarMatrixElementWiseTests("O = X * scalar", "X", "scalar", "O", new double[] { 0.0, 0.5, 2.0 }, "gpu_*");
+	}
+
+	@Test
+	public void testMultLeftScalar() {
+		runScalarMatrixElementWiseTests("O = scalar * X", "X", "scalar", "O", new double[] { 0.0, 0.5, 2.0 }, "gpu_*");
+	}
+
+	@Test
+	public void testDivide() {
+		runScalarMatrixElementWiseTests("O = X / scalar", "X", "scalar", "O", new double[] { 0.0, 0.5, 5.0 }, "gpu_/");
+	}
+
+	// ****************************************************************
+	// ************************ IGNORED TEST **************************
+	// FIXME : There is a bug in CPU "^" when a A ^ B is executed where A & B are all zeroes
+	@Ignore
+	@Test
+	public void testPow() {
+		runScalarMatrixElementWiseTests("O = X ^ scalar", "X", "scalar", "O", new double[] { 0.0, 2.0, 10.0 }, "gpu_^");
+	}
+
+	/**
+	 * Runs a simple scalar-matrix elementwise op test
+	 *
+	 * @param scriptStr         the script string
+	 * @param inputMatrix       name of the matrix input in the script string
+	 * @param inputScalar       name of the scalar input in the script string
+	 * @param output            name of the output variable in the script string
+	 * @param scalars           array of scalars for which to run this test
+	 * @param heavyHitterOpCode the string printed for the unary op heavy hitter when executed on gpu
+	 */
+	private void runScalarMatrixElementWiseTests(String scriptStr, String inputMatrix, String inputScalar,
+			String output, double[] scalars, String heavyHitterOpCode) {
+		for (int i = 0; i < rowSizes.length; i++) {
+			for (int j = 0; j < columnSizes.length; j++) {
+				for (int k = 0; k < sparsities.length; k++) {
+					for (int l = 0; l < scalars.length; l++) {
+						int m = rowSizes[i];
+						int n = columnSizes[j];
+						double sparsity = sparsities[k];
+						double scalar = scalars[l];
+						System.out.println(
+								"Matrix is of size [" + m + ", " + n + "], sparsity = " + sparsity + ", scalar = "
+										+ scalar);
+						Matrix X = generateInputMatrix(spark, m, n, sparsity, seed);
+						HashMap<String, Object> inputs = new HashMap<>();
+						inputs.put(inputMatrix, X);
+						inputs.put(inputScalar, scalar);
+						List<Object> cpuOut = runOnCPU(spark, scriptStr, inputs, Arrays.asList(output));
+						List<Object> gpuOut = runOnGPU(spark, scriptStr, inputs, Arrays.asList(output));
+						//assertHeavyHitterPresent(heavyHitterOpCode);
+						assertEqualObjects(cpuOut.get(0), gpuOut.get(0));
+					}
+				}
+			}
+		}
+	}
+
+}

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/772fb588/src/test/java/org/apache/sysml/test/gpu/UnaryOpTests.java
----------------------------------------------------------------------
diff --git a/src/test/java/org/apache/sysml/test/gpu/UnaryOpTests.java b/src/test/java/org/apache/sysml/test/gpu/UnaryOpTests.java
new file mode 100644
index 0000000..84b1f73
--- /dev/null
+++ b/src/test/java/org/apache/sysml/test/gpu/UnaryOpTests.java
@@ -0,0 +1,113 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.sysml.test.gpu;
+
+import org.apache.sysml.test.utils.TestUtils;
+import org.junit.Test;
+
+/**
+ * Unit tests for Unary ops on GPU
+ */
+public class UnaryOpTests extends UnaryOpTestsBase {
+
+	private final static String TEST_NAME = "UnaryOpTests";
+
+	@Override
+	public void setUp() {
+		TestUtils.clearAssertionInformation();
+		addTestConfiguration(TEST_DIR, TEST_NAME);
+		getAndLoadTestConfiguration(TEST_NAME);
+	}
+
+	@Test
+	public void testSin() throws Exception {
+		testSimpleUnaryOpMatrixOutput("sin", "gpu_sin");
+	}
+
+	@Test
+	public void testCos() throws Exception {
+		testSimpleUnaryOpMatrixOutput("cos", "gpu_cos");
+	}
+
+	@Test
+	public void testTan() throws Exception {
+		testSimpleUnaryOpMatrixOutput("tan", "gpu_tan");
+	}
+
+	@Test
+	public void testAsin() throws Exception {
+		testSimpleUnaryOpMatrixOutput("asin", "gpu_asin");
+	}
+
+	@Test
+	public void testAcos() throws Exception {
+		testSimpleUnaryOpMatrixOutput("acos", "gpu_acos");
+	}
+
+	@Test
+	public void testAtan() throws Exception {
+		testSimpleUnaryOpMatrixOutput("atan", "gpu_atan");
+	}
+
+	@Test
+	public void testExp() throws Exception {
+		testSimpleUnaryOpMatrixOutput("exp", "gpu_exp");
+	}
+
+	@Test
+	public void testLog() throws Exception {
+		testSimpleUnaryOpMatrixOutput("log", "gpu_log");
+	}
+
+	@Test
+	public void testSqrt() throws Exception {
+		testSimpleUnaryOpMatrixOutput("sqrt", "gpu_sqrt");
+	}
+
+	@Test
+	public void testAbs() throws Exception {
+		testSimpleUnaryOpMatrixOutput("abs", "gpu_abs");
+	}
+
+	@Test
+	public void testRound() throws Exception {
+		testSimpleUnaryOpMatrixOutput("round", "gpu_round");
+	}
+
+	@Test
+	public void testFloor() throws Exception {
+		// Bug fix: this previously invoked "sqrt" instead of "floor",
+		// so the floor builtin was never actually exercised on the GPU.
+		testSimpleUnaryOpMatrixOutput("floor", "gpu_floor");
+	}
+
+	@Test
+	public void testCeil() throws Exception {
+		testSimpleUnaryOpMatrixOutput("ceil", "gpu_ceil");
+	}
+
+	@Test
+	public void testSign() throws Exception {
+		testSimpleUnaryOpMatrixOutput("sign", "gpu_sign");
+	}
+
+	@Test
+	public void testSelp() throws Exception {
+		// selp (select-positive) has no dml builtin name; it is expressed as max(in1, 0)
+		testUnaryOpMatrixOutput("out = max(in1, 0)", "gpu_selp", "in1", "out");
+	}
+}
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/772fb588/src/test/java/org/apache/sysml/test/gpu/UnaryOpTestsBase.java
----------------------------------------------------------------------
diff --git a/src/test/java/org/apache/sysml/test/gpu/UnaryOpTestsBase.java b/src/test/java/org/apache/sysml/test/gpu/UnaryOpTestsBase.java
new file mode 100644
index 0000000..0051dd4
--- /dev/null
+++ b/src/test/java/org/apache/sysml/test/gpu/UnaryOpTestsBase.java
@@ -0,0 +1,106 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.sysml.test.gpu;
+
+import java.util.Arrays;
+import java.util.HashMap;
+import java.util.List;
+
+import org.apache.sysml.api.mlcontext.Matrix;
+
+/**
+ * Abstract class for all Unary Op tests
+ */
+public abstract class UnaryOpTestsBase extends GPUTests {
+
+	// Row/column sizes and sparsities over which every unary op is exercised
+	private final int[] rowSizes = new int[] { 2049, 1024, 140, 64, 1 };
+	private final int[] columnSizes = new int[] { 2049, 1024, 140, 64, 1 };
+	private final double[] sparsities = new double[] { 0.9, 0.3, 0.03, 0.0 };
+	private final int seed = 42;
+
+	/**
+	 * Tests a builtin unary op with a variety of matrix shapes and sparsities.
+	 * The 1x1 case is skipped.
+	 *
+	 * @param function          name of the dml builtin unary op
+	 * @param heavyHitterOpCode the string printed for the unary op heavy hitter when executed on gpu
+	 */
+	protected void testSimpleUnaryOpMatrixOutput(String function, String heavyHitterOpCode) {
+		testUnaryOpMatrixOutput("out = " + function + "(in1)", heavyHitterOpCode, "in1", "out");
+	}
+
+	/**
+	 * Tests an arbitrary unary-op script with a variety of matrix shapes and
+	 * sparsities. The 1x1 case is skipped.
+	 *
+	 * @param scriptStr         script string
+	 * @param heavyHitterOpCode the string printed for the unary op heavy hitter when executed on gpu
+	 * @param inStr             name of input variable in provided script string
+	 * @param outStr            name of output variable in script string
+	 */
+	protected void testUnaryOpMatrixOutput(String scriptStr, String heavyHitterOpCode, String inStr, String outStr) {
+		for (int row : rowSizes) {
+			for (int column : columnSizes) {
+				for (double sparsity : sparsities) {
+					// A 1x1 input would be a scalar op rather than a matrix unary op
+					if (row == 1 && column == 1)
+						continue;
+					testUnaryOpMatrixOutput(scriptStr, heavyHitterOpCode, inStr, outStr, seed, row, column, sparsity);
+				}
+			}
+		}
+	}
+
+	/**
+	 * Runs a single unary op on CPU and GPU for one input configuration and
+	 * asserts that the two outputs are equal.
+	 *
+	 * @param scriptStr         script string
+	 * @param heavyHitterOpCode the string printed for the unary op heavy hitter when executed on gpu
+	 * @param inStr             name of input variable in provided script string
+	 * @param outStr            name of output variable in script string
+	 * @param seed              seed for the random number generator for the random input matrix
+	 * @param row               number of rows of input matrix
+	 * @param column            number of columns of input matrix
+	 * @param sparsity          sparsity of the input matrix
+	 */
+	public void testUnaryOpMatrixOutput(String scriptStr, String heavyHitterOpCode, String inStr, String outStr,
+			int seed, int row, int column, double sparsity) {
+		System.out.println("Matrix of size [" + row + ", " + column + "], sparsity = " + sparsity);
+		Matrix in1 = generateInputMatrix(spark, row, column, sparsity, seed);
+		HashMap<String, Object> inputs = new HashMap<>();
+		inputs.put(inStr, in1);
+		List<Object> outCPU = runOnCPU(spark, scriptStr, inputs, Arrays.asList(outStr));
+		List<Object> outGPU = runOnGPU(spark, scriptStr, inputs, Arrays.asList(outStr));
+		//assertHeavyHitterPresent(heavyHitterOpCode);
+		assertEqualObjects(outCPU.get(0), outGPU.get(0));
+	}
+
+}

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/772fb588/src/test_suites/java/org/apache/sysml/test/integration/gpu/ZPackageSuite.java
----------------------------------------------------------------------
diff --git a/src/test_suites/java/org/apache/sysml/test/integration/gpu/ZPackageSuite.java b/src/test_suites/java/org/apache/sysml/test/integration/gpu/ZPackageSuite.java
new file mode 100644
index 0000000..d5e3bc0
--- /dev/null
+++ b/src/test_suites/java/org/apache/sysml/test/integration/gpu/ZPackageSuite.java
@@ -0,0 +1,46 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.sysml.test.integration.gpu;
+
+import org.apache.sysml.test.gpu.AggregateUnaryOpTests;
+import org.apache.sysml.test.gpu.BinaryOpTests;
+import org.apache.sysml.test.gpu.MatrixMatrixElementWiseOpTests;
+import org.apache.sysml.test.gpu.MatrixMultiplicationOpTest;
+import org.apache.sysml.test.gpu.NeuralNetworkOpTests;
+import org.apache.sysml.test.gpu.ReorgOpTests;
+import org.apache.sysml.test.gpu.ScalarMatrixElementwiseOpTests;
+import org.apache.sysml.test.gpu.UnaryOpTests;
+import org.junit.runner.RunWith;
+import org.junit.runners.Suite;
+import org.junit.runners.Suite.SuiteClasses;
+
+@RunWith(Suite.class)
+@SuiteClasses({
+	BinaryOpTests.class,
+	ScalarMatrixElementwiseOpTests.class,
+	MatrixMatrixElementWiseOpTests.class,
+	ReorgOpTests.class,
+	AggregateUnaryOpTests.class,
+	UnaryOpTests.class,
+	MatrixMultiplicationOpTest.class,
+	NeuralNetworkOpTests.class })
+public class ZPackageSuite {
+	// Aggregates all GPU test classes; enabled via the gpuTests maven profile
+}


[2/2] incubator-systemml git commit: [SYSTEMML-1625] GPU Unit Tests (and GPU row/col variance bug fix)

Posted by na...@apache.org.
[SYSTEMML-1625] GPU Unit Tests (and GPU row/col variance bug fix)

- Documented random matrix generation
- GPU unit test using MLContext. Compares CPU output to GPU
- Pseudo-unit tests for GPU implementations of
  unary ops, unary aggregate ops, transpose, elementwise ops,
  matrix multiplication ops, builtin ops & NN ops
- Fixed crucial bug in col/row var
- gpuTests profile for GPU tests (mvn verify -PgpuTests)
- Updated intellij style for import order

Closes #513


Project: http://git-wip-us.apache.org/repos/asf/incubator-systemml/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-systemml/commit/772fb588
Tree: http://git-wip-us.apache.org/repos/asf/incubator-systemml/tree/772fb588
Diff: http://git-wip-us.apache.org/repos/asf/incubator-systemml/diff/772fb588

Branch: refs/heads/master
Commit: 772fb588324916e4225bb6e1970ca6a8f87eb414
Parents: ceeec4b
Author: Nakul Jindal <na...@gmail.com>
Authored: Wed May 31 21:54:13 2017 -0700
Committer: Nakul Jindal <na...@gmail.com>
Committed: Wed May 31 21:54:13 2017 -0700

----------------------------------------------------------------------
 dev/code-style/systemml-style-intellij.xml      |  18 +
 pom.xml                                         |  10 +
 .../apache/sysml/api/ScriptExecutorUtils.java   |   1 +
 .../context/ExecutionContext.java               |   3 +
 .../instructions/GPUInstructionParser.java      | 120 +++--
 .../instructions/gpu/context/GPUContext.java    | 118 +++--
 .../instructions/gpu/context/GPUObject.java     |  55 +-
 .../instructions/gpu/context/JCudaKernels.java  |   3 +-
 .../runtime/matrix/data/LibMatrixCUDA.java      | 112 ++--
 .../runtime/matrix/data/LibMatrixDatagen.java   |  78 +--
 .../matrix/data/RandomMatrixGenerator.java      | 123 ++++-
 .../sysml/test/gpu/AggregateUnaryOpTests.java   | 133 +++++
 .../apache/sysml/test/gpu/BinaryOpTests.java    |  85 ++++
 .../org/apache/sysml/test/gpu/GPUTests.java     | 250 +++++++++
 .../gpu/MatrixMatrixElementWiseOpTests.java     | 271 ++++++++++
 .../test/gpu/MatrixMultiplicationOpTest.java    | 190 +++++++
 .../sysml/test/gpu/NeuralNetworkOpTests.java    | 508 +++++++++++++++++++
 .../org/apache/sysml/test/gpu/ReorgOpTests.java |  70 +++
 .../gpu/ScalarMatrixElementwiseOpTests.java     | 131 +++++
 .../org/apache/sysml/test/gpu/UnaryOpTests.java | 113 +++++
 .../apache/sysml/test/gpu/UnaryOpTestsBase.java | 106 ++++
 .../test/integration/gpu/ZPackageSuite.java     |  46 ++
 22 files changed, 2308 insertions(+), 236 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/772fb588/dev/code-style/systemml-style-intellij.xml
----------------------------------------------------------------------
diff --git a/dev/code-style/systemml-style-intellij.xml b/dev/code-style/systemml-style-intellij.xml
index 248c600..1ad3209 100644
--- a/dev/code-style/systemml-style-intellij.xml
+++ b/dev/code-style/systemml-style-intellij.xml
@@ -16,7 +16,25 @@
  * specific language governing permissions and limitations
  * under the License.
 -->
+
 <code_scheme name="SystemML Format">
+  <option name="CLASS_COUNT_TO_USE_IMPORT_ON_DEMAND" value="999" />
+  <option name="NAMES_COUNT_TO_USE_IMPORT_ON_DEMAND" value="999" />
+  <option name="IMPORT_LAYOUT_TABLE">
+    <value>
+      <package name="" withSubpackages="true" static="false" />
+      <emptyLine />
+      <package name="java" withSubpackages="true" static="false" />
+      <emptyLine />
+      <package name="javax" withSubpackages="true" static="false" />
+      <emptyLine />
+      <package name="org" withSubpackages="true" static="false" />
+      <emptyLine />
+      <package name="com" withSubpackages="true" static="false" />
+      <emptyLine />
+      <package name="" withSubpackages="true" static="true" />
+    </value>
+  </option>
   <codeStyleSettings language="JAVA">
     <option name="KEEP_LINE_BREAKS" value="false" />
     <option name="KEEP_FIRST_COLUMN_COMMENT" value="false" />

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/772fb588/pom.xml
----------------------------------------------------------------------
diff --git a/pom.xml b/pom.xml
index 5ce5576..99e2dec 100644
--- a/pom.xml
+++ b/pom.xml
@@ -436,6 +436,7 @@
 						<exclude>**/slowtest/**</exclude>
 						<exclude>**/integration/**</exclude>
 						<exclude>**/test/unit/**</exclude>
+						<exclude>**/test/gpu/**</exclude>
 					</excludes>
 
 				</configuration>
@@ -478,6 +479,7 @@
 						-Djava.awt.headless=true</argLine>
 
 					<includes>
+						<include>${gpuTestsPath}</include> <!-- Path for GPU integration tests, enabled for gpuTests profile -->
 						<include>**/integration/applications/**/*Suite.java</include>
 						<include>**/integration/conversion/*Suite.java</include>
 						<include>**/integration/functions/data/*Suite.java</include>
@@ -896,6 +898,14 @@
 			</build>
 		</profile>
 
+		<!-- profile to enable running tests on the GPU -->
+		<profile>
+			<id>gpuTests</id>
+			<properties>
+				<gpuTestsPath>**/integration/gpu/**/*Suite.java</gpuTestsPath>
+			</properties>
+		</profile>
+
 		<profile>
 			<!-- Can be used to ignore doclint javadoc issues -->
 			<id>ignore-doclint</id>

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/772fb588/src/main/java/org/apache/sysml/api/ScriptExecutorUtils.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/api/ScriptExecutorUtils.java b/src/main/java/org/apache/sysml/api/ScriptExecutorUtils.java
index f582c36..674a011 100644
--- a/src/main/java/org/apache/sysml/api/ScriptExecutorUtils.java
+++ b/src/main/java/org/apache/sysml/api/ScriptExecutorUtils.java
@@ -94,6 +94,7 @@ public class ScriptExecutorUtils {
 			rtprog.execute(ec);
 		} finally { // ensure cleanup/shutdown
 			if (DMLScript.USE_ACCELERATOR && ec.getGPUContext() != null) {
+				ec.getGPUContext().clearTemporaryMemory();
 				GPUContextPool.returnToPool(ec.getGPUContext());
 			}
 			if (dmlconf.getBooleanValue(DMLConfig.CODEGEN))

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/772fb588/src/main/java/org/apache/sysml/runtime/controlprogram/context/ExecutionContext.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/controlprogram/context/ExecutionContext.java b/src/main/java/org/apache/sysml/runtime/controlprogram/context/ExecutionContext.java
index 35b4cd1..735f394 100644
--- a/src/main/java/org/apache/sysml/runtime/controlprogram/context/ExecutionContext.java
+++ b/src/main/java/org/apache/sysml/runtime/controlprogram/context/ExecutionContext.java
@@ -271,6 +271,9 @@ public class ExecutionContext {
 		MatrixObject mo = getMatrixObject(varName);
 		if( mo.getGPUObject(getGPUContext()) == null ) {
 			GPUObject newGObj = getGPUContext().createGPUObject(mo);
+			// The lock is added here for an output block
+			// so that any block currently in use is not deallocated by eviction on the GPU
+			newGObj.addLock();
 			mo.setGPUObject(getGPUContext(), newGObj);
 		}
 		return mo;

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/772fb588/src/main/java/org/apache/sysml/runtime/instructions/GPUInstructionParser.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/instructions/GPUInstructionParser.java b/src/main/java/org/apache/sysml/runtime/instructions/GPUInstructionParser.java
index 443d0eb..e0bcd1b 100644
--- a/src/main/java/org/apache/sysml/runtime/instructions/GPUInstructionParser.java
+++ b/src/main/java/org/apache/sysml/runtime/instructions/GPUInstructionParser.java
@@ -42,81 +42,79 @@ public class GPUInstructionParser  extends InstructionParser
 		// Neural Network Operators
 		String2GPUInstructionType.put( "relu_backward",          GPUINSTRUCTION_TYPE.Convolution);
 		String2GPUInstructionType.put( "conv2d",                 GPUINSTRUCTION_TYPE.Convolution);
-		String2GPUInstructionType.put( "conv2d_bias_add",                 GPUINSTRUCTION_TYPE.Convolution);
+		String2GPUInstructionType.put( "conv2d_bias_add",        GPUINSTRUCTION_TYPE.Convolution);
 		String2GPUInstructionType.put( "conv2d_backward_filter", GPUINSTRUCTION_TYPE.Convolution);
 		String2GPUInstructionType.put( "conv2d_backward_data",   GPUINSTRUCTION_TYPE.Convolution);
 		String2GPUInstructionType.put( "maxpooling",             GPUINSTRUCTION_TYPE.Convolution);
 		String2GPUInstructionType.put( "maxpooling_backward",    GPUINSTRUCTION_TYPE.Convolution);
-		String2GPUInstructionType.put( "bias_add",    			 GPUINSTRUCTION_TYPE.Convolution);
-		String2GPUInstructionType.put( "bias_multiply",    			 GPUINSTRUCTION_TYPE.Convolution);
+		String2GPUInstructionType.put( "bias_add",               GPUINSTRUCTION_TYPE.Convolution);
+		String2GPUInstructionType.put( "bias_multiply",          GPUINSTRUCTION_TYPE.Convolution);
 
 		// Matrix Multiply Operators
-		String2GPUInstructionType.put( "ba+*",                   GPUINSTRUCTION_TYPE.AggregateBinary);
-		String2GPUInstructionType.put( "tsmm",                   GPUINSTRUCTION_TYPE.MMTSJ);
+		String2GPUInstructionType.put( "ba+*", GPUINSTRUCTION_TYPE.AggregateBinary);
+		String2GPUInstructionType.put( "tsmm", GPUINSTRUCTION_TYPE.MMTSJ);
 
 		// Reorg/Transpose
-		String2GPUInstructionType.put( "r'",                   	 GPUINSTRUCTION_TYPE.Reorg);
+		String2GPUInstructionType.put( "r'",   GPUINSTRUCTION_TYPE.Reorg);
 	
 		// Binary Cellwise
-		String2GPUInstructionType.put( "+"    , GPUINSTRUCTION_TYPE.ArithmeticBinary);
-		String2GPUInstructionType.put( "-"    , GPUINSTRUCTION_TYPE.ArithmeticBinary);
-		String2GPUInstructionType.put( "*"    , GPUINSTRUCTION_TYPE.ArithmeticBinary);
-		String2GPUInstructionType.put( "/"    , GPUINSTRUCTION_TYPE.ArithmeticBinary);
-		String2GPUInstructionType.put( "%%"   , GPUINSTRUCTION_TYPE.ArithmeticBinary);
-		String2GPUInstructionType.put( "%/%"  , GPUINSTRUCTION_TYPE.ArithmeticBinary);
-		String2GPUInstructionType.put( "^"    , GPUINSTRUCTION_TYPE.ArithmeticBinary);
-		String2GPUInstructionType.put( "1-*"  , GPUINSTRUCTION_TYPE.ArithmeticBinary); //special * case
-		String2GPUInstructionType.put( "^2"   , GPUINSTRUCTION_TYPE.ArithmeticBinary); //special ^ case
-		String2GPUInstructionType.put( "*2"   , GPUINSTRUCTION_TYPE.ArithmeticBinary); //special * case
-		String2GPUInstructionType.put( "-nz"  , GPUINSTRUCTION_TYPE.ArithmeticBinary); //special - case
-		String2GPUInstructionType.put( "+*"   , GPUINSTRUCTION_TYPE.ArithmeticBinary);
-		String2GPUInstructionType.put( "-*"   , GPUINSTRUCTION_TYPE.ArithmeticBinary);
+		String2GPUInstructionType.put( "+",    GPUINSTRUCTION_TYPE.ArithmeticBinary);
+		String2GPUInstructionType.put( "-",    GPUINSTRUCTION_TYPE.ArithmeticBinary);
+		String2GPUInstructionType.put( "*",    GPUINSTRUCTION_TYPE.ArithmeticBinary);
+		String2GPUInstructionType.put( "/",    GPUINSTRUCTION_TYPE.ArithmeticBinary);
+		//String2GPUInstructionType.put( "%%",   GPUINSTRUCTION_TYPE.ArithmeticBinary);
+		//String2GPUInstructionType.put( "%/%",  GPUINSTRUCTION_TYPE.ArithmeticBinary);
+		String2GPUInstructionType.put( "^",    GPUINSTRUCTION_TYPE.ArithmeticBinary);
+		String2GPUInstructionType.put( "1-*",  GPUINSTRUCTION_TYPE.ArithmeticBinary); //special * case
+		String2GPUInstructionType.put( "^2",   GPUINSTRUCTION_TYPE.ArithmeticBinary); //special ^ case
+		String2GPUInstructionType.put( "*2",   GPUINSTRUCTION_TYPE.ArithmeticBinary); //special * case
+		String2GPUInstructionType.put( "-nz",  GPUINSTRUCTION_TYPE.ArithmeticBinary); //special - case
+		String2GPUInstructionType.put( "+*",   GPUINSTRUCTION_TYPE.ArithmeticBinary);
+		String2GPUInstructionType.put( "-*",   GPUINSTRUCTION_TYPE.ArithmeticBinary);
 		
-		// Builtin functions
-		String2GPUInstructionType.put( "sel+"  	, GPUINSTRUCTION_TYPE.BuiltinUnary);
-		String2GPUInstructionType.put( "exp"  	, GPUINSTRUCTION_TYPE.BuiltinUnary);
-		String2GPUInstructionType.put( "log"  	, GPUINSTRUCTION_TYPE.BuiltinUnary);
-		String2GPUInstructionType.put( "abs"  	, GPUINSTRUCTION_TYPE.BuiltinUnary);
-		String2GPUInstructionType.put( "sqrt"  	, GPUINSTRUCTION_TYPE.BuiltinUnary);
-		String2GPUInstructionType.put( "round"  , GPUINSTRUCTION_TYPE.BuiltinUnary);
-		String2GPUInstructionType.put( "floor"  , GPUINSTRUCTION_TYPE.BuiltinUnary);
-		String2GPUInstructionType.put( "ceil"  	, GPUINSTRUCTION_TYPE.BuiltinUnary);
-		String2GPUInstructionType.put( "sin"  	, GPUINSTRUCTION_TYPE.BuiltinUnary);
-		String2GPUInstructionType.put( "cos"  	, GPUINSTRUCTION_TYPE.BuiltinUnary);
-		String2GPUInstructionType.put( "tan"  	, GPUINSTRUCTION_TYPE.BuiltinUnary);
-		String2GPUInstructionType.put( "asin"  	, GPUINSTRUCTION_TYPE.BuiltinUnary);
-		String2GPUInstructionType.put( "acos"  	, GPUINSTRUCTION_TYPE.BuiltinUnary);
-		String2GPUInstructionType.put( "atan"  	, GPUINSTRUCTION_TYPE.BuiltinUnary);
-		String2GPUInstructionType.put( "sign"  	, GPUINSTRUCTION_TYPE.BuiltinUnary);
-
-
-
-		String2GPUInstructionType.put( "solve"  , GPUINSTRUCTION_TYPE.BuiltinBinary);
+		// Unary Builtin functions
+		String2GPUInstructionType.put( "sel+",  GPUINSTRUCTION_TYPE.BuiltinUnary);
+		String2GPUInstructionType.put( "exp",   GPUINSTRUCTION_TYPE.BuiltinUnary);
+		String2GPUInstructionType.put( "log",   GPUINSTRUCTION_TYPE.BuiltinUnary);
+		String2GPUInstructionType.put( "abs",   GPUINSTRUCTION_TYPE.BuiltinUnary);
+		String2GPUInstructionType.put( "sqrt",  GPUINSTRUCTION_TYPE.BuiltinUnary);
+		String2GPUInstructionType.put( "round", GPUINSTRUCTION_TYPE.BuiltinUnary);
+		String2GPUInstructionType.put( "floor", GPUINSTRUCTION_TYPE.BuiltinUnary);
+		String2GPUInstructionType.put( "ceil",  GPUINSTRUCTION_TYPE.BuiltinUnary);
+		String2GPUInstructionType.put( "sin",   GPUINSTRUCTION_TYPE.BuiltinUnary);
+		String2GPUInstructionType.put( "cos",   GPUINSTRUCTION_TYPE.BuiltinUnary);
+		String2GPUInstructionType.put( "tan",   GPUINSTRUCTION_TYPE.BuiltinUnary);
+		String2GPUInstructionType.put( "asin",  GPUINSTRUCTION_TYPE.BuiltinUnary);
+		String2GPUInstructionType.put( "acos",  GPUINSTRUCTION_TYPE.BuiltinUnary);
+		String2GPUInstructionType.put( "atan",  GPUINSTRUCTION_TYPE.BuiltinUnary);
+		String2GPUInstructionType.put( "sign",  GPUINSTRUCTION_TYPE.BuiltinUnary);
 
+		// Binary Builtin functions
+		String2GPUInstructionType.put( "solve", GPUINSTRUCTION_TYPE.BuiltinBinary);
 
 		// Aggregate Unary
-		String2GPUInstructionType.put( "ua+"	 	 , GPUINSTRUCTION_TYPE.AggregateUnary);	// Sum
-		String2GPUInstructionType.put( "uak+"	   , GPUINSTRUCTION_TYPE.AggregateUnary);	// Sum
-		String2GPUInstructionType.put( "uar+"		 , GPUINSTRUCTION_TYPE.AggregateUnary);	// Row Sum
-		String2GPUInstructionType.put( "uark+"	 , GPUINSTRUCTION_TYPE.AggregateUnary);	// Row Sum
-		String2GPUInstructionType.put( "uac+"	 	 , GPUINSTRUCTION_TYPE.AggregateUnary);	// Col Sum
-		String2GPUInstructionType.put( "uack+"	 , GPUINSTRUCTION_TYPE.AggregateUnary);	// Col Sum
-		String2GPUInstructionType.put( "ua*"	 	 , GPUINSTRUCTION_TYPE.AggregateUnary);	// Multiplication
-		String2GPUInstructionType.put( "uamean"	 , GPUINSTRUCTION_TYPE.AggregateUnary);	// Mean
-		String2GPUInstructionType.put( "uarmean" , GPUINSTRUCTION_TYPE.AggregateUnary);	// Row Mean
-		String2GPUInstructionType.put( "uacmean" , GPUINSTRUCTION_TYPE.AggregateUnary);	// Col Mean
-		String2GPUInstructionType.put( "uamax"	 , GPUINSTRUCTION_TYPE.AggregateUnary);	// Max
-		String2GPUInstructionType.put( "uarmax"	 , GPUINSTRUCTION_TYPE.AggregateUnary);	// Row Max
-		String2GPUInstructionType.put( "uacmax"	 , GPUINSTRUCTION_TYPE.AggregateUnary);	// Col Max
-		String2GPUInstructionType.put( "uamin"	 , GPUINSTRUCTION_TYPE.AggregateUnary);	// Min
-		String2GPUInstructionType.put( "uarmin"	 , GPUINSTRUCTION_TYPE.AggregateUnary);	// Row Min
-		String2GPUInstructionType.put( "uacmin"	 , GPUINSTRUCTION_TYPE.AggregateUnary);	// Col Min
-		String2GPUInstructionType.put( "uasqk+"	 , GPUINSTRUCTION_TYPE.AggregateUnary);	// Sum of Squares
-		String2GPUInstructionType.put( "uarsqk+" , GPUINSTRUCTION_TYPE.AggregateUnary);	// Row Sum of Squares
-		String2GPUInstructionType.put( "uacsqk+" , GPUINSTRUCTION_TYPE.AggregateUnary);	// Col Sum of Squares
-		String2GPUInstructionType.put( "uavar" 	 , GPUINSTRUCTION_TYPE.AggregateUnary);	// Variance
-		String2GPUInstructionType.put( "uarvar"  , GPUINSTRUCTION_TYPE.AggregateUnary);	// Row Variance
-		String2GPUInstructionType.put( "uacvar"  , GPUINSTRUCTION_TYPE.AggregateUnary);	// Col Variance
+		String2GPUInstructionType.put( "ua+"     , GPUINSTRUCTION_TYPE.AggregateUnary); // Sum
+		String2GPUInstructionType.put( "uak+"    , GPUINSTRUCTION_TYPE.AggregateUnary); // Sum
+		String2GPUInstructionType.put( "uar+"    , GPUINSTRUCTION_TYPE.AggregateUnary); // Row Sum
+		String2GPUInstructionType.put( "uark+"   , GPUINSTRUCTION_TYPE.AggregateUnary); // Row Sum
+		String2GPUInstructionType.put( "uac+"    , GPUINSTRUCTION_TYPE.AggregateUnary); // Col Sum
+		String2GPUInstructionType.put( "uack+"   , GPUINSTRUCTION_TYPE.AggregateUnary); // Col Sum
+		String2GPUInstructionType.put( "ua*"     , GPUINSTRUCTION_TYPE.AggregateUnary); // Multiplication
+		String2GPUInstructionType.put( "uamean"  , GPUINSTRUCTION_TYPE.AggregateUnary); // Mean
+		String2GPUInstructionType.put( "uarmean" , GPUINSTRUCTION_TYPE.AggregateUnary); // Row Mean
+		String2GPUInstructionType.put( "uacmean" , GPUINSTRUCTION_TYPE.AggregateUnary); // Col Mean
+		String2GPUInstructionType.put( "uamax"   , GPUINSTRUCTION_TYPE.AggregateUnary); // Max
+		String2GPUInstructionType.put( "uarmax"  , GPUINSTRUCTION_TYPE.AggregateUnary); // Row Max
+		String2GPUInstructionType.put( "uacmax"  , GPUINSTRUCTION_TYPE.AggregateUnary); // Col Max
+		String2GPUInstructionType.put( "uamin"   , GPUINSTRUCTION_TYPE.AggregateUnary); // Min
+		String2GPUInstructionType.put( "uarmin"  , GPUINSTRUCTION_TYPE.AggregateUnary); // Row Min
+		String2GPUInstructionType.put( "uacmin"  , GPUINSTRUCTION_TYPE.AggregateUnary); // Col Min
+		String2GPUInstructionType.put( "uasqk+"  , GPUINSTRUCTION_TYPE.AggregateUnary); // Sum of Squares
+		String2GPUInstructionType.put( "uarsqk+" , GPUINSTRUCTION_TYPE.AggregateUnary); // Row Sum of Squares
+		String2GPUInstructionType.put( "uacsqk+" , GPUINSTRUCTION_TYPE.AggregateUnary); // Col Sum of Squares
+		String2GPUInstructionType.put( "uavar"   , GPUINSTRUCTION_TYPE.AggregateUnary); // Variance
+		String2GPUInstructionType.put( "uarvar"  , GPUINSTRUCTION_TYPE.AggregateUnary); // Row Variance
+		String2GPUInstructionType.put( "uacvar"  , GPUINSTRUCTION_TYPE.AggregateUnary); // Col Variance
 	}
 	
 	public static GPUInstruction parseSingleInstruction (String str ) 

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/772fb588/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUContext.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUContext.java b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUContext.java
index 673601f..89a2b67 100644
--- a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUContext.java
+++ b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUContext.java
@@ -18,25 +18,14 @@
  */
 package org.apache.sysml.runtime.instructions.gpu.context;
 
-import static jcuda.jcublas.JCublas2.cublasCreate;
-import static jcuda.jcublas.JCublas2.cublasDestroy;
-import static jcuda.jcudnn.JCudnn.cudnnCreate;
-import static jcuda.jcudnn.JCudnn.cudnnDestroy;
-import static jcuda.jcusolver.JCusolverDn.cusolverDnDestroy;
-import static jcuda.jcusolver.JCusolverSp.cusolverSpDestroy;
-import static jcuda.jcusparse.JCusparse.cusparseCreate;
-import static jcuda.jcusparse.JCusparse.cusparseDestroy;
-import static jcuda.jcusolver.JCusolverDn.cusolverDnCreate;
-import static jcuda.jcusolver.JCusolverSp.cusolverSpCreate;
-
-import static jcuda.runtime.JCuda.cudaDeviceScheduleBlockingSync;
-import static jcuda.runtime.JCuda.cudaFree;
-import static jcuda.runtime.JCuda.cudaGetDeviceCount;
-import static jcuda.runtime.JCuda.cudaMalloc;
-import static jcuda.runtime.JCuda.cudaMemGetInfo;
-import static jcuda.runtime.JCuda.cudaMemset;
-import static jcuda.runtime.JCuda.cudaSetDevice;
-import static jcuda.runtime.JCuda.cudaSetDeviceFlags;
+import jcuda.Pointer;
+import jcuda.jcublas.cublasHandle;
+import jcuda.jcudnn.cudnnHandle;
+import jcuda.jcusolver.cusolverDnHandle;
+import jcuda.jcusolver.cusolverSpHandle;
+import jcuda.jcusparse.cusparseHandle;
+import jcuda.runtime.JCuda;
+import jcuda.runtime.cudaDeviceProp;
 
 import java.util.ArrayList;
 import java.util.Collections;
@@ -56,14 +45,24 @@ import org.apache.sysml.runtime.instructions.gpu.GPUInstruction;
 import org.apache.sysml.utils.GPUStatistics;
 import org.apache.sysml.utils.LRUCacheMap;
 
-import jcuda.Pointer;
-import jcuda.jcublas.cublasHandle;
-import jcuda.jcudnn.cudnnHandle;
-import jcuda.jcusolver.cusolverDnHandle;
-import jcuda.jcusolver.cusolverSpHandle;
-import jcuda.jcusparse.cusparseHandle;
-import jcuda.runtime.JCuda;
-import jcuda.runtime.cudaDeviceProp;
+import static jcuda.jcublas.JCublas2.cublasCreate;
+import static jcuda.jcublas.JCublas2.cublasDestroy;
+import static jcuda.jcudnn.JCudnn.cudnnCreate;
+import static jcuda.jcudnn.JCudnn.cudnnDestroy;
+import static jcuda.jcusolver.JCusolverDn.cusolverDnCreate;
+import static jcuda.jcusolver.JCusolverDn.cusolverDnDestroy;
+import static jcuda.jcusolver.JCusolverSp.cusolverSpCreate;
+import static jcuda.jcusolver.JCusolverSp.cusolverSpDestroy;
+import static jcuda.jcusparse.JCusparse.cusparseCreate;
+import static jcuda.jcusparse.JCusparse.cusparseDestroy;
+import static jcuda.runtime.JCuda.cudaDeviceScheduleBlockingSync;
+import static jcuda.runtime.JCuda.cudaFree;
+import static jcuda.runtime.JCuda.cudaGetDeviceCount;
+import static jcuda.runtime.JCuda.cudaMalloc;
+import static jcuda.runtime.JCuda.cudaMemGetInfo;
+import static jcuda.runtime.JCuda.cudaMemset;
+import static jcuda.runtime.JCuda.cudaSetDevice;
+import static jcuda.runtime.JCuda.cudaSetDeviceFlags;
 
 /**
  * Represents a context per GPU accessible through the same JVM
@@ -159,6 +158,7 @@ public class GPUContext {
 
   }
 
+  @SuppressWarnings("unused")
   public int getDeviceNum() {
     return deviceNum;
   }
@@ -174,6 +174,7 @@ public class GPUContext {
     cudaSetDevice(deviceNum);
   }
 
+  @SuppressWarnings("unused")
   public static int cudaGetDevice() {
     int[] device = new int[1];
     JCuda.cudaGetDevice(device);
@@ -288,6 +289,9 @@ public class GPUContext {
    * @param eager           true if to be done eagerly
    */
   public void cudaFreeHelper(String instructionName, final Pointer toFree, boolean eager) {
+  	Pointer dummy = new Pointer();
+  	if (toFree == dummy) // trying to free a null pointer
+  		return;
     long t0 = 0;
     assert cudaBlockSizeMap.containsKey(toFree) : "ERROR : Internal state corrupted, cache block size map is not aware of a block it trying to free up";
     long size = cudaBlockSizeMap.get(toFree);
@@ -382,14 +386,14 @@ public class GPUContext {
       return;
 
     if (allocatedGPUObjects.size() == 0) {
-      throw new DMLRuntimeException("There is not enough memory on device for this matrix!");
+      throw new DMLRuntimeException("There is not enough memory on device for this matrix, request (" + neededSize + ")");
     }
 
     Collections.sort(allocatedGPUObjects, new Comparator<GPUObject>() {
       @Override
       public int compare(GPUObject p1, GPUObject p2) {
-        long p1Val = p1.readLocks.get();
-        long p2Val = p2.readLocks.get();
+        long p1Val = p1.locks.get();
+        long p2Val = p2.locks.get();
 
         if (p1Val > 0 && p2Val > 0) {
           // Both are locked, so don't sort
@@ -426,8 +430,8 @@ public class GPUContext {
 
     while (neededSize > getAvailableMemory() && allocatedGPUObjects.size() > 0) {
       GPUObject toBeRemoved = allocatedGPUObjects.get(allocatedGPUObjects.size() - 1);
-      if (toBeRemoved.readLocks.get() > 0) {
-        throw new DMLRuntimeException("There is not enough memory on device for this matrix!");
+      if (toBeRemoved.locks.get() > 0) {
+        throw new DMLRuntimeException("There is not enough memory on device for this matrix, request (" + neededSize + ")");
       }
       if (toBeRemoved.dirty) {
         toBeRemoved.copyFromDeviceToHost();
@@ -546,6 +550,7 @@ public class GPUContext {
    * @return the shared memory per block
    * @throws DMLRuntimeException ?
    */
+  @SuppressWarnings("unused")
   public long getMaxSharedMemory() throws DMLRuntimeException {
     cudaDeviceProp deviceProp = getGPUProperties();
     return deviceProp.sharedMemPerBlock;
@@ -588,10 +593,10 @@ public class GPUContext {
 
   /**
    * Destroys this GPUContext object
-   * This method MUST BE called so that the GPU is available to be used again
    *
    * @throws DMLRuntimeException if error
    */
+  @SuppressWarnings("unused")
   public void destroy() throws DMLRuntimeException {
     LOG.trace("GPU : this context was destroyed, this = " + this.toString());
     clearMemory();
@@ -608,14 +613,51 @@ public class GPUContext {
 
   /**
    * Clears all memory used by this {@link GPUContext}
-   * Be careful to ensure that no memory is currently being used before invoking this
+   * Be careful to ensure that no memory is currently being used in the temporary memory before invoking this
+   * If memory is being used between MLContext invocations, they are pointed to by a {@link GPUObject} instance
+   * which would be part of the {@link MatrixObject}. The cleanup of that {@link MatrixObject} instance will
+   * cause the memory associated with that block on the GPU to be freed up.
    * @throws DMLRuntimeException ?
    */
   public void clearMemory() throws DMLRuntimeException {
-    while (allocatedGPUObjects.isEmpty()) {
+    clearTemporaryMemory();
+    while (!allocatedGPUObjects.isEmpty()) {
       GPUObject o = allocatedGPUObjects.get(0);
-      o.clearData();
+      if (o.isDirty()){
+        LOG.warn("Attempted to free GPU Memory when a block[" + o + "] is still on GPU memory, copying it back to host.");
+        o.acquireHostRead();
+      }
+      o.clearData(true);
     }
+    allocatedGPUObjects.clear();
+  }
+
+  /**
+   * Clears up the memory used to optimize cudaMalloc/cudaFree calls
+   */
+  public void clearTemporaryMemory() {
+    // To record the cuda block sizes needed by allocatedGPUObjects, others are cleared up.
+    HashMap<Pointer, Long> tmpCudaBlockSizeMap = new HashMap<>();
+	  for (GPUObject o : allocatedGPUObjects) {
+		  if (o.isSparse()) {
+			  CSRPointer p = o.getSparseMatrixCudaPointer();
+			  if (p.rowPtr != null && cudaBlockSizeMap.containsKey(p.rowPtr)) {
+				  tmpCudaBlockSizeMap.put(p.rowPtr, cudaBlockSizeMap.get(p.rowPtr));
+			  }
+			  if (p.colInd != null && cudaBlockSizeMap.containsKey(p.colInd)) {
+				  tmpCudaBlockSizeMap.put(p.colInd, cudaBlockSizeMap.get(p.colInd));
+			  }
+			  if (p.val != null && cudaBlockSizeMap.containsKey(p.val)) {
+				  tmpCudaBlockSizeMap.put(p.val, cudaBlockSizeMap.get(p.val));
+			  }
+
+		  } else {
+			  Pointer p = o.getJcudaDenseMatrixPtr();
+			  tmpCudaBlockSizeMap.put(p, cudaBlockSizeMap.get(p));
+		  }
+	  }
+
+    // garbage collect all temporarily allocated spaces
     for (LinkedList<Pointer> l : freeCUDASpaceMap.values()) {
       for (Pointer p : l) {
         cudaFreeHelper(p, true);
@@ -623,7 +665,9 @@ public class GPUContext {
     }
     cudaBlockSizeMap.clear();
     freeCUDASpaceMap.clear();
-    allocatedGPUObjects.clear();
+
+    // Restore only those entries for which there are still blocks on the GPU
+    cudaBlockSizeMap.putAll(tmpCudaBlockSizeMap);
   }
 
   @Override

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/772fb588/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUObject.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUObject.java b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUObject.java
index be3cc09..0ed34c5 100644
--- a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUObject.java
+++ b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUObject.java
@@ -86,8 +86,8 @@ public class GPUObject {
 	/** whether the block attached to this {@link GPUContext} is dirty on the device and needs to be copied back to host */
 	protected boolean dirty = false;
 
-	/** number of read locks on this object */
-	protected AtomicInteger readLocks = new AtomicInteger(0);
+	/** number of read/write locks on this object (this GPUObject is being used in a current instruction) */
+	protected AtomicInteger locks = new AtomicInteger(0);
 
 	/** Timestamp, needed by {@link GPUContext#evict(long)} */
 	AtomicLong timestamp = new AtomicLong(0);
@@ -112,7 +112,7 @@ public class GPUObject {
             that.allocateTensorDescriptor(me.tensorShape[0], me.tensorShape[1], me.tensorShape[2], me.tensorShape[3]);
         }
 		that.dirty = me.dirty;
-		that.readLocks = new AtomicInteger(me.readLocks.get());
+		that.locks = new AtomicInteger(me.locks.get());
 		that.timestamp = new AtomicLong(me.timestamp.get());
 		that.isSparse = me.isSparse;
 
@@ -126,7 +126,7 @@ public class GPUObject {
 			cudaMemcpy(that.jcudaDenseMatrixPtr, me.jcudaDenseMatrixPtr, size, cudaMemcpyDeviceToDevice);
 		}
 
-		if (me.jcudaSparseMatrixPtr != null){
+		if (me.getJcudaSparseMatrixPtr() != null){
 			long rows = mat.getNumRows();
 			that.jcudaSparseMatrixPtr = me.jcudaSparseMatrixPtr.clone((int)rows);
 		}
@@ -265,7 +265,6 @@ public class GPUObject {
 
 	/**
 	 * Convenience method to directly set the sparse matrix on GPU
-	 * Make sure to call {@link #addReadLock()} after this to set appropriate state, if you are not sure what you are doing.
 	 * Needed for operations like {@link JCusparse#cusparseDcsrgemm(cusparseHandle, int, int, int, int, int, cusparseMatDescr, int, Pointer, Pointer, Pointer, cusparseMatDescr, int, Pointer, Pointer, Pointer, cusparseMatDescr, Pointer, Pointer, Pointer)}
 	 * @param sparseMatrixPtr CSR (compressed sparse row) pointer
 	 * 
@@ -278,11 +277,11 @@ public class GPUObject {
 			cudaFreeHelper(getJcudaDenseMatrixPtr());
 			jcudaDenseMatrixPtr = null;
 		}
+		getGPUContext().recordBlockUsage(this);
 	}
 
 	/**
 	 * Convenience method to directly set the dense matrix pointer on GPU
-	 * Make sure to call {@link #addReadLock()} after this to set appropriate state, if you are not sure what you are doing.
 	 *
 	 * @param densePtr dense pointer
 	 * @throws DMLRuntimeException ?
@@ -294,6 +293,7 @@ public class GPUObject {
 			getJcudaSparseMatrixPtr().deallocate();
 			jcudaSparseMatrixPtr = null;
 		}
+		getGPUContext().recordBlockUsage(this);
 	}
 
 	/**
@@ -491,7 +491,6 @@ public class GPUObject {
 	public void allocateSparseAndEmpty() throws DMLRuntimeException{
 		LOG.trace("GPU : allocate sparse and empty block on " + this + ", GPUContext=" + getGPUContext());
 		setSparseMatrixCudaPointer(CSRPointer.allocateEmpty(getGPUContext(), 0, mat.getNumRows()));
-		addReadLock();
 	}
 
 	/**
@@ -508,7 +507,6 @@ public class GPUObject {
 		int numElems = toIntExact(rows * cols);
 		long size = getDoubleSizeOf(numElems);
 		setDenseMatrixCudaPointer(allocate(size));
-		addReadLock();
 		// The "fill" kernel is called which treats the matrix "jcudaDensePtr" like a vector and fills it with value "v"
 		// If the fill value is 0, no need to call the special kernel, the allocate memsets the allocated region to 0
 		if (v != 0)
@@ -535,9 +533,8 @@ public class GPUObject {
 			LOG.trace("GPU : in acquireDeviceRead, data is not allocated, copying from host, on " + this + ", GPUContext=" + getGPUContext());
 			copyFromHostToDevice();
 			transferred = true;
-		} else {
-			addReadLock();
 		}
+		addLock();
 		if(!isAllocated())
 			throw new DMLRuntimeException("Expected device data to be allocated");
 		return transferred;
@@ -552,7 +549,6 @@ public class GPUObject {
 			// Dense block, size = numRows * numCols
 			allocateDenseMatrixOnDevice();
 			allocated = true;
-			getGPUContext().recordBlockUsage(this);
 		}
 		dirty = true;
 		if(!isAllocated())
@@ -569,8 +565,6 @@ public class GPUObject {
 			mat.setDirty(true);
 			allocateSparseMatrixOnDevice();
 			allocated = true;
-			getGPUContext().recordBlockUsage(this);
-
 		}
 		dirty = true;
 		if(!isAllocated())
@@ -578,8 +572,8 @@ public class GPUObject {
 		return allocated;
 	}
 
-	public void addReadLock() {
-		readLocks.addAndGet(1);
+	public void addLock() {
+		locks.addAndGet(1);
 	}
 
 	/**
@@ -606,11 +600,13 @@ public class GPUObject {
 	 * Updates the locks depending on the eviction policy selected
 	 * @throws DMLRuntimeException if there is no locked GPU Object or if could not obtain a {@link GPUContext}
 	 */
-	private void updateReleaseLocks() throws DMLRuntimeException {
-		if (readLocks.addAndGet(-1) < 0) {
-			throw new CacheException("Redundant release of GPU object");
+	private void updateReleaseLocks(int l) throws DMLRuntimeException {
+		int newLocks = locks.addAndGet(l);
+		if (newLocks < 0) {
+			throw new CacheException("Internal state error : Invalid number of locks on a GPUObject");
 		}
-		LOG.trace("GPU : updateReleaseLocks, new number of read locks is " + readLocks.get() + ", on " + this + ", GPUContext=" + getGPUContext());
+
+		LOG.trace("GPU : updateReleaseLocks, new number of locks is " + locks.get() + ", on " + this + ", GPUContext=" + getGPUContext());
 		GPUContext.EvictionPolicy evictionPolicy = getGPUContext().evictionPolicy;
 		switch (evictionPolicy){
 			case LRU : timestamp.set(System.nanoTime()); break;
@@ -625,7 +621,8 @@ public class GPUObject {
 	 * @throws DMLRuntimeException if data is not allocated or if there is no locked GPU Object or if could not obtain a {@link GPUContext}
 	 */
 	public void releaseInput() throws DMLRuntimeException {
-		updateReleaseLocks();
+		// A read lock is a positive quantity, therefor when the lock is freed, a negative 1 is added
+		updateReleaseLocks(-1);
 		if(!isAllocated())
 			throw new CacheException("Attempting to release an input before allocating it");
 	}
@@ -635,7 +632,8 @@ public class GPUObject {
 	 * @throws DMLRuntimeException if data is not allocated or if there is no locked GPU Object or if could not obtain a {@link GPUContext}
 	 */
 	public void releaseOutput() throws DMLRuntimeException {
-		updateReleaseLocks();
+		// A write lock is a negative quantity, therefore when the lock is freed, a positive number is added
+		updateReleaseLocks(1);
 		dirty = true;
 		if(!isAllocated())
 			throw new CacheException("Attempting to release an output before allocating it");
@@ -651,7 +649,6 @@ public class GPUObject {
 		long size = getDoubleSizeOf(rows * cols);
 		Pointer tmp = allocate(size);
 		setDenseMatrixCudaPointer(tmp);
-		addReadLock();
 	}
 
 	void allocateSparseMatrixOnDevice() throws DMLRuntimeException {
@@ -660,10 +657,9 @@ public class GPUObject {
 		long rows = mat.getNumRows();
 		long nnz = mat.getNnz();
 		assert rows > 0 : "Internal error - invalid number of rows when allocating a sparse matrix";
-		assert nnz > 0 : "Internal error - invalid number of non zeroes when allocating a sparse matrix";
+		assert nnz >= 0 : "Internal error - invalid number of non zeroes when allocating a sparse matrix";
 		CSRPointer tmp = CSRPointer.allocateEmpty(getGPUContext(), nnz, rows);
 		setSparseMatrixCudaPointer(tmp);
-		addReadLock();
 	}
 
 	void deallocateMemoryOnDevice(boolean eager) throws DMLRuntimeException {
@@ -680,7 +676,8 @@ public class GPUObject {
 			cudnnDestroyTensorDescriptor(tensorDescriptor);
 			tensorDescriptor = null;
 		}
-		readLocks.set(0);
+		locks.set(0);
+		getGPUContext().removeRecordedUsage(this);
 	}
 
 	protected long getSizeOnDevice() throws DMLRuntimeException {
@@ -754,8 +751,8 @@ public class GPUObject {
 				colInd = csrBlock.indexes();
 				values = csrBlock.values();
 			}
+
 			allocateSparseMatrixOnDevice();
-			getGPUContext().recordBlockUsage(this);
 
 			if(copyToDevice) {
 				CSRPointer.copyToDevice(getJcudaSparseMatrixPtr(), tmp.getNumRows(), tmp.getNonZeros(), rowPtr, colInd, values);
@@ -773,7 +770,6 @@ public class GPUObject {
 
 			// Copy dense block
 			allocateDenseMatrixOnDevice();
-			getGPUContext().recordBlockUsage(this);
 
 			cudaMemcpy(getJcudaDenseMatrixPtr(), Pointer.to(data), getDoubleSizeOf(mat.getNumRows()*mat.getNumColumns()), cudaMemcpyHostToDevice);
 		}
@@ -862,9 +858,8 @@ public class GPUObject {
 	 * @throws CacheException ?
 	 */
 	public void clearData(boolean eager) throws DMLRuntimeException {
-		getGPUContext().removeRecordedUsage(this);
 		deallocateMemoryOnDevice(eager);
-
+		getGPUContext().removeRecordedUsage(this);
 	}
 
 	/** 
@@ -896,7 +891,7 @@ public class GPUObject {
 		final StringBuilder sb = new StringBuilder("GPUObject{");
 		sb.append(", tensorShape=").append(Arrays.toString(tensorShape));
 		sb.append(", dirty=").append(dirty);
-		sb.append(", readLocks=").append(readLocks);
+		sb.append(", locks=").append(locks);
 		sb.append(", sparse? ").append(isSparse);
 		sb.append(", dims=[").append(mat.getNumRows()).append(",").append(mat.getNumColumns()).append("]");
 		sb.append('}');

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/772fb588/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/JCudaKernels.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/JCudaKernels.java b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/JCudaKernels.java
index ac11df9..4d06831 100644
--- a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/JCudaKernels.java
+++ b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/JCudaKernels.java
@@ -34,6 +34,7 @@ import jcuda.Pointer;
 import jcuda.driver.CUfunction;
 import jcuda.driver.CUmodule;
 import jcuda.driver.CUresult;
+import jcuda.runtime.JCuda;
 
 /**
  * Utility class that allows LibMatrixCUDA as well as GPUObject to invoke custom CUDA kernels.
@@ -111,7 +112,7 @@ public class JCudaKernels {
 				config.gridDimX, config.gridDimY, config.gridDimZ, 
 				config.blockDimX, config.blockDimY, config.blockDimZ, 
 				config.sharedMemBytes, config.stream, Pointer.to(kernelParams), null));
-		//JCuda.cudaDeviceSynchronize();
+        JCuda.cudaDeviceSynchronize();
 	}
 	
     public static void checkResult(int cuResult) throws DMLRuntimeException {

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/772fb588/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCUDA.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCUDA.java b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCUDA.java
index b023159..7990fef 100644
--- a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCUDA.java
+++ b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCUDA.java
@@ -235,8 +235,8 @@ public class LibMatrixCUDA {
 
 
 	private static int CONVOLUTION_PREFERENCE = cudnnConvolutionFwdPreference.CUDNN_CONVOLUTION_FWD_NO_WORKSPACE;
-	
-	private static Pointer _one; 
+
+	private static Pointer _one;
 	private static Pointer _zero;
 	/**
 	 * Convenience method to get a pointer to value '1.0' on device. Instead of allocating and deallocating it for every kernel invocation.
@@ -258,7 +258,7 @@ public class LibMatrixCUDA {
 		}
 		return _zero;
 	}
-	
+
 	/**
 	 * Convenience method to get tensor descriptor from underlying GPUObject
 	 * @param gCtx   a valid {@link GPUContext}
@@ -272,7 +272,7 @@ public class LibMatrixCUDA {
 	 */
 	private static cudnnTensorDescriptor allocateTensorDescriptor(GPUContext gCtx, MatrixObject mat, int N, int C, int H, int W) throws DMLRuntimeException {
 		if(mat.getNumRows() != N || mat.getNumColumns() != C*H*W) {
-			throw new DMLRuntimeException("Mismatch descriptor-matrix dimensions:" + mat.getNumRows() + " != " + N 
+			throw new DMLRuntimeException("Mismatch descriptor-matrix dimensions:" + mat.getNumRows() + " != " + N
 					+ " || " + mat.getNumColumns() + " != " + (C*H*W));
 		}
 		return mat.getGPUObject(gCtx).allocateTensorDescriptor(N, C, H, W);
@@ -293,7 +293,7 @@ public class LibMatrixCUDA {
 		cudnnSetTensor4dDescriptor(tensorDescriptor, CUDNN_TENSOR_NCHW, CUDNN_DATA_DOUBLE, N, C, H, W);
 		return tensorDescriptor;
 	}
-	
+
 	/**
 	 * Convenience method to get jcudaDenseMatrixPtr. This method explicitly converts sparse to dense format, so use it judiciously.
 	 * @param gCtx a valid {@link GPUContext}
@@ -308,7 +308,7 @@ public class LibMatrixCUDA {
 		}
 		return getDensePointer(gCtx, image, instName);
 	}
-	
+
 	/**
 	 * Convenience method to get jcudaDenseMatrixPtr. This method explicitly converts sparse to dense format, so use it judiciously.
 	 * @param gCtx a valid {@link GPUContext}
@@ -339,15 +339,15 @@ public class LibMatrixCUDA {
 		}
 		return input.getGPUObject(gCtx).getJcudaSparseMatrixPtr();
 	}
-	
+
 	/**
 	 * Convenience method for checking the status of CuDNN kernel.
-	 * 
+	 *
 	 * @param status status returned by CuDNN
 	 * @throws DMLRuntimeException if status is not CUDNN_STATUS_SUCCESS
 	 */
 	private static void checkStatus(int status) throws DMLRuntimeException {
-		if(status != cudnnStatus.CUDNN_STATUS_SUCCESS) 
+		if(status != cudnnStatus.CUDNN_STATUS_SUCCESS)
 			throw new DMLRuntimeException("Error status returned by CuDNN:" + jcuda.jcudnn.cudnnStatus.stringFor(status));
 	}
 
@@ -383,7 +383,7 @@ public class LibMatrixCUDA {
 		//cudaDeviceSynchronize;
 		biasAdd(gCtx, instName, outputBlock, bias, outputBlock);
 	}
-	
+
 	public static void conv2d(GPUContext gCtx, String instName, MatrixObject image, MatrixObject filter, MatrixObject outputBlock, int N, int C, int H, int W,
 														int K, int R, int S, int pad_h, int pad_w, int stride_h, int stride_w, int P, int Q)
 					throws DMLRuntimeException {
@@ -530,7 +530,7 @@ public class LibMatrixCUDA {
 		if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_BIAS_ADD_LIB, System.nanoTime() - t1);
 
 	}
-	
+
 	/**
 	 * Performs the operation corresponding to the DML script:
 	 * ones = matrix(1, rows=1, cols=Hout*Wout)
@@ -635,7 +635,7 @@ public class LibMatrixCUDA {
 			throw new DMLRuntimeException("Incorrect dimensions for running variance");
 		}
 	}
-	
+
 	/**
 	 * Performs the forward BatchNormalization layer computation for inference
 	 * @param gCtx   a valid {@link GPUContext}
@@ -650,21 +650,21 @@ public class LibMatrixCUDA {
 	 * @throws DMLRuntimeException if error occurs
 	 */
 	public static void batchNormalizationForwardInference(GPUContext gCtx, String instName, MatrixObject image,
-			MatrixObject scale, MatrixObject bias, MatrixObject runningMean, MatrixObject runningVar, 
+			MatrixObject scale, MatrixObject bias, MatrixObject runningMean, MatrixObject runningVar,
 			MatrixObject ret, double epsilon) throws DMLRuntimeException {
         LOG.trace("GPU : batchNormalizationForwardInference" + ", GPUContext=" + gCtx);
         int mode = cudnnBatchNormMode.CUDNN_BATCHNORM_SPATIAL;
-		
+
 		int N = (int) image.getNumRows();
 		int C = (int) scale.getNumColumns();
 		long CHW = image.getNumColumns();
 		validateBatchNormalizationDimensions(scale, bias, runningMean, runningVar, C);
-		
+
 		// Allocate descriptors
 		cudnnTensorDescriptor nCHWDescriptor = allocateNCHWDescriptors(gCtx, N, C, CHW,
 				new MatrixObject[] {image},  new MatrixObject[] {ret});
 		cudnnTensorDescriptor scaleTensorDesc = allocateTensorDescriptor(gCtx, scale, 1, C, 1, 1);
-		
+
 		// Get underlying dense pointer
 		Pointer imagePtr = getDensePointer(gCtx, image, true, instName);
 		Pointer retPtr = getDensePointer(gCtx, ret, true, instName);
@@ -672,13 +672,13 @@ public class LibMatrixCUDA {
 		Pointer scalePtr = getDensePointer(gCtx, scale, true, instName);
 		Pointer runningMeanPtr = getDensePointer(gCtx, runningMean, true, instName);
 		Pointer runningVarPtr = getDensePointer(gCtx, runningVar, true, instName);
-		
+
 		checkStatus(cudnnBatchNormalizationForwardInference(getCudnnHandle(gCtx), mode, one(), zero(),
 				nCHWDescriptor, imagePtr, nCHWDescriptor, retPtr,
 			scaleTensorDesc, scalePtr, biasPtr,
 			runningMeanPtr, runningVarPtr, epsilon));
 	}
-	
+
 	/**
 	 * Performs the forward BatchNormalization layer computation for training
 	 * @param gCtx   a valid {@link GPUContext}
@@ -696,21 +696,21 @@ public class LibMatrixCUDA {
 	 * @throws DMLRuntimeException if error occurs
 	 */
 	public static void batchNormalizationForwardTraining(GPUContext gCtx, String instName, MatrixObject image,
-			MatrixObject scale,  MatrixObject bias, MatrixObject runningMean, MatrixObject runningVar, 
+			MatrixObject scale,  MatrixObject bias, MatrixObject runningMean, MatrixObject runningVar,
 			MatrixObject ret, MatrixObject retRunningMean, MatrixObject retRunningVar, double epsilon, double exponentialAverageFactor) throws DMLRuntimeException {
         LOG.trace("GPU : batchNormalizationForwardTraining" + ", GPUContext=" + gCtx);
         int mode = cudnnBatchNormMode.CUDNN_BATCHNORM_SPATIAL;
-		
+
 		int N = (int) image.getNumRows();
 		int C = (int) scale.getNumColumns();
 		long CHW = image.getNumColumns();
 		validateBatchNormalizationDimensions(scale, bias, runningMean, runningVar, C);
-		
+
 		// Allocate descriptors
 		cudnnTensorDescriptor nCHWDescriptor = allocateNCHWDescriptors(gCtx, N, C, CHW,
 				new MatrixObject[] {image},  new MatrixObject[] {ret});
 		cudnnTensorDescriptor scaleTensorDesc = allocateTensorDescriptor(gCtx, scale, 1, C, 1, 1);
-		
+
 		// Get underlying dense pointer
 		Pointer imagePtr = getDensePointer(gCtx, image, true, instName);
 		Pointer retPtr = getDensePointer(gCtx, ret, true, instName);
@@ -718,20 +718,20 @@ public class LibMatrixCUDA {
 		Pointer scalePtr = getDensePointer(gCtx, scale, true, instName);
 		Pointer runningMeanPtr = getDensePointer(gCtx, runningMean, true, instName);
 		Pointer runningVarPtr = getDensePointer(gCtx, runningVar, true, instName);
-		
+
 		// To allow for copy-on-write
 		Pointer retRunningMeanPtr = getDensePointer(gCtx, retRunningMean, true, instName);
 		Pointer retRunningVarPtr = getDensePointer(gCtx, retRunningVar, true, instName);
 		cudaMemcpy(retRunningMeanPtr, runningMeanPtr, C * Sizeof.DOUBLE, cudaMemcpyDeviceToDevice);
 		cudaMemcpy(retRunningVarPtr, runningVarPtr, C * Sizeof.DOUBLE, cudaMemcpyDeviceToDevice);
-		
+
 		// ignoring resultSaveMean and resultSaveVariance as it requires state management
 		checkStatus(cudnnBatchNormalizationForwardTraining(getCudnnHandle(gCtx), mode, one(), zero(),
 				nCHWDescriptor, imagePtr, nCHWDescriptor, retPtr,
 			scaleTensorDesc, scalePtr, biasPtr, exponentialAverageFactor,
 			retRunningMeanPtr, retRunningVarPtr, epsilon, new Pointer(), new Pointer()));
 	}
-	
+
 	/**
 	 * Convenient utility for batch normalization that returns a NCHW descriptor
 	 * @param gCtx a valid {@link GPUContext}
@@ -776,8 +776,8 @@ public class LibMatrixCUDA {
 			H = HW; W = 1; // If not known
 			double potentialH = Math.sqrt(HW);
 			if(potentialH == ((int) potentialH)) {
-				H = (int) potentialH; 
-				W = H; 
+				H = (int) potentialH;
+				W = H;
 			}
 			// We are not sure about H and W, hence don't allocate them.
 			ret = new cudnnTensorDescriptor();
@@ -786,7 +786,7 @@ public class LibMatrixCUDA {
 		}
 		return ret;
 	}
-	
+
 	/**
 	 * This method computes the backpropagation errors for image, scale and bias of batch normalization layer
 	 * @param gCtx   a valid {@link GPUContext}
@@ -805,16 +805,16 @@ public class LibMatrixCUDA {
 			double epsilon) throws DMLRuntimeException {
         LOG.trace("GPU : batchNormalizationBackward" + ", GPUContext=" + gCtx);
         int mode = cudnnBatchNormMode.CUDNN_BATCHNORM_SPATIAL;
-		
+
 		int N = (int) image.getNumRows();
 		int C = (int) scale.getNumColumns();
 		long CHW = image.getNumColumns();
-		
+
 		// Allocate descriptors
 		cudnnTensorDescriptor nCHWDescriptor = allocateNCHWDescriptors(gCtx, N, C, CHW,
 				new MatrixObject[] {image, dout},  new MatrixObject[] {ret});
 		cudnnTensorDescriptor scaleTensorDesc = allocateTensorDescriptor(gCtx, scale, 1, C, 1, 1);
-		
+
 		// Get underlying dense pointer
 		Pointer imagePtr = getDensePointer(gCtx, image, true, instName);
 		Pointer doutPtr = getDensePointer(gCtx, dout, true, instName);
@@ -822,7 +822,7 @@ public class LibMatrixCUDA {
 		Pointer retPtr = getDensePointer(gCtx, ret, true, instName);
 		Pointer retScalePtr = getDensePointer(gCtx, retScale, true, instName);
 		Pointer retBiasPtr = getDensePointer(gCtx, retBias, true, instName);
-		
+
 		// ignoring resultSaveMean and resultSaveVariance as it requires state management
 		checkStatus(cudnnBatchNormalizationBackward(getCudnnHandle(gCtx), mode,  one(), zero(), one(), zero(),
 				nCHWDescriptor,  imagePtr, nCHWDescriptor, doutPtr, nCHWDescriptor, retPtr,
@@ -915,7 +915,7 @@ public class LibMatrixCUDA {
 		}
 	}
 
-	private static long numDoublesIn2GB = 125000000;
+	private static long numDoublesIn2GB = 268435456;
 
 	/**
 	 * This method computes the backpropogation errors for previous layer of convolution operation
@@ -961,7 +961,7 @@ public class LibMatrixCUDA {
 			Pointer w = getDensePointer(gCtx, filter, true, instName);
 			Pointer dy = getDensePointer(gCtx, dout, true, instName);
 			Pointer dx = getDensePointer(gCtx, output, true, instName);
-			
+
 			int padding [] = { pad_h, pad_w };
 			int strides [] = { stride_h, stride_w };
 			convDesc = allocateConvolutionDescriptor(padding, strides);
@@ -999,7 +999,7 @@ public class LibMatrixCUDA {
 			if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_CUDNN_CLEANUP, System.nanoTime() - t3);
 		}
 	}
-	
+
 	/**
 	 * performs maxpooling on GPU by exploiting cudnnPoolingForward(...)
 	 * @param gCtx   a valid {@link GPUContext}
@@ -1029,7 +1029,7 @@ public class LibMatrixCUDA {
 		cudnnTensorDescriptor xDesc = allocateTensorDescriptor(gCtx, image, N, C, H, W);
 		performMaxpooling(gCtx, instName, x, xDesc, outputBlock, N, C, H, W, K, R, S, pad_h, pad_w, stride_h, stride_w, P, Q);
 	}
-	
+
 	public static void performMaxpooling(GPUContext gCtx, String instName, Pointer x, cudnnTensorDescriptor xDesc,
 			MatrixObject outputBlock, int N, int C, int H, int W, int K, int R,
 			int S, int pad_h, int pad_w, int stride_h, int stride_w, int P,
@@ -1064,7 +1064,7 @@ public class LibMatrixCUDA {
 			if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_CUDNN_CLEANUP, System.nanoTime() - t3);
 		}
 	}
-	
+
 	/**
 	 * Performs maxpoolingBackward on GPU by exploiting cudnnPoolingBackward(...)
 	 * This method computes the backpropogation errors for previous layer of maxpooling operation
@@ -1149,13 +1149,13 @@ public class LibMatrixCUDA {
 			if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_CUDNN_CLEANUP, System.nanoTime() - t4);
 		}
 	}
-	
+
 	private static void performCuDNNReLU(GPUContext gCtx, String instName, MatrixObject in, Pointer dstData, cudnnTensorDescriptor srcTensorDesc) throws DMLRuntimeException {
 		long t0=0;
 		try {
             LOG.trace("GPU : performCuDNNReLU" + ", GPUContext=" + gCtx);
             cudnnTensorDescriptor dstTensorDesc = srcTensorDesc;
-			
+
 			Pointer srcData = getDensePointer(gCtx, in, true, instName);
 			cudnnActivationDescriptor activationDescriptor = new cudnnActivationDescriptor();
 			cudnnCreateActivationDescriptor(activationDescriptor);
@@ -1175,7 +1175,7 @@ public class LibMatrixCUDA {
 			if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_CUDNN_CLEANUP, System.nanoTime() - t1);
 		}
 	}
-	
+
 
 	/**
 	 * Performs the relu operation on the GPU.
@@ -1580,7 +1580,6 @@ public class LibMatrixCUDA {
 		if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_SPARSE_MATRIX_DENSE_VECTOR_LIB, System.nanoTime() - t1);
 
 		output.getGPUObject(gCtx).setDenseMatrixCudaPointer(C_dense);
-		output.getGPUObject(gCtx).addReadLock();
 	}
 
 	/**
@@ -1671,7 +1670,6 @@ public class LibMatrixCUDA {
 		if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_SPARSE_ALLOCATE_LIB, System.nanoTime() - t0);
 
 		output.getGPUObject(gCtx).setSparseMatrixCudaPointer(C);
-		output.getGPUObject(gCtx).addReadLock();
 
 		if (GPUStatistics.DISPLAY_STATISTICS) t1 = System.nanoTime();
 		cusparseDcsrgemm(getCusparseHandle(gCtx), transA, transB, m, n, k,
@@ -2079,7 +2077,7 @@ public class LibMatrixCUDA {
 						reduceRow(gCtx, instName, "reduce_row_sum", tmp2, tmpRow, rlen, clen);
 
 						ScalarOperator divideOp = new RightScalarOperator(Divide.getDivideFnObject(), clen - 1);
-						matrixScalarOp(gCtx, instName, tmpRow, clen - 1, rlen, clen, out, divideOp);
+						matrixScalarOp(gCtx, instName, tmpRow, clen - 1, rlen, 1, out, divideOp);
 
 						gCtx.cudaFreeHelper(instName, tmpRow);
 
@@ -2097,7 +2095,7 @@ public class LibMatrixCUDA {
 						reduceCol(gCtx, instName, "reduce_col_sum", tmp2, tmpCol, rlen, clen);
 
 						ScalarOperator divideOp = new RightScalarOperator(Divide.getDivideFnObject(), rlen - 1);
-						matrixScalarOp(gCtx, instName, tmpCol, rlen - 1, rlen, clen, out, divideOp);
+						matrixScalarOp(gCtx, instName, tmpCol, rlen - 1, 1, clen, out, divideOp);
 
 						gCtx.cudaFreeHelper(instName, tmpCol);
 
@@ -2348,14 +2346,21 @@ public class LibMatrixCUDA {
 				else if(op.fn instanceof Power) {
 					setOutputToConstant(ec, gCtx, instName, 1.0, outputName);
 				}
-				else if(op.fn instanceof Divide && isSparseAndEmpty(gCtx, in)) {
-					setOutputToConstant(ec, gCtx, instName, Double.NaN, outputName);
-				}
-				else if(op.fn instanceof Divide) {
-					//For division, IEEE 754 defines x/0.0 as INFINITY and 0.0/0.0 as NaN.
-					compareAndSet(ec, gCtx, instName, in, outputName, 0.0, 1e-6, Double.NaN, Double.POSITIVE_INFINITY, Double.POSITIVE_INFINITY);
-				}
-				else {
+                // TODO:
+                // x/0.0 is either +Infinity or -Infinity according to Java.
+                // In the context of a matrix, different elements of the matrix
+                // could have different values.
+                // If the IEEE 754 standard defines otherwise, this logic needs
+                // to be re-enabled and the Java computation logic for divide by zero
+                // needs to be revisited
+                //else if(op.fn instanceof Divide && isSparseAndEmpty(gCtx, in)) {
+                //	setOutputToConstant(ec, gCtx, instName, Double.NaN, outputName);
+                //}
+                //else if(op.fn instanceof Divide) {
+                //	//For division, IEEE 754 defines x/0.0 as INFINITY and 0.0/0.0 as NaN.
+                //	compareAndSet(ec, gCtx, instName, in, outputName, 0.0, 1e-6, Double.NaN, Double.POSITIVE_INFINITY, Double.POSITIVE_INFINITY);
+                //}
+                else {
 					// TODO: Potential to optimize
 					matrixScalarOp(ec, gCtx, instName, in, outputName, isInputTransposed, op);
 				}
@@ -2790,7 +2795,6 @@ public class LibMatrixCUDA {
 			CSRPointer B = in2.getGPUObject(gCtx).getJcudaSparseMatrixPtr();
 
 			ec.allocateGPUMatrixObject(outputName);
-			out.getGPUObject(gCtx).addReadLock();
 
 			if (in1 == in2 && isLeftTransposed == true && isLeftTransposed == isRightTransposed) {
 				// Special case for transpose
@@ -3160,7 +3164,7 @@ public class LibMatrixCUDA {
 		MatrixObject out = ec.getMatrixObject(outputName);
 		getDenseMatrixOutputForGPUInstruction(ec, instName, outputName);	// Allocated the dense output matrix
 		Pointer C = getDensePointer(gCtx, out, instName);
-		
+
 		long t1=0, t2=0;
 		if(in1.getNumRows() == in2.getNumRows() && in1.getNumColumns() == in2.getNumColumns()) {
             LOG.trace("GPU : cublasDaxpy" + ", GPUContext=" + gCtx);

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/772fb588/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDatagen.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDatagen.java b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDatagen.java
index fb62c41..bb0a9a8 100644
--- a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDatagen.java
+++ b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDatagen.java
@@ -45,9 +45,6 @@ public class LibMatrixDatagen
 {
 	private static final Log LOG = LogFactory.getLog(LibMatrixDatagen.class.getName());
 	private static final long PAR_NUMCELL_THRESHOLD = 512*1024; //Min 500k elements
-	public static final String RAND_PDF_UNIFORM = "uniform";
-	public static final String RAND_PDF_NORMAL = "normal";
-	public static final String RAND_PDF_POISSON = "poisson";
 	
 	private static IDSequence _seqRandInput = new IDSequence(); 
 	
@@ -55,9 +52,9 @@ public class LibMatrixDatagen
 		//prevent instantiation via private constructor
 	}
 
-	public static boolean isShortcutRandOperation( double min, double max, double sparsity, String pdf )
+	public static boolean isShortcutRandOperation( double min, double max, double sparsity, RandomMatrixGenerator.PDF pdf )
 	{
-		return pdf.equalsIgnoreCase(RAND_PDF_UNIFORM)
+		return pdf == RandomMatrixGenerator.PDF.UNIFORM
 			   && (  ( min == 0.0 && max == 0.0 ) //all zeros
 				   ||( sparsity==1.0d && min == max )); //equal values
 	}
@@ -137,27 +134,31 @@ public class LibMatrixDatagen
 		}
 	}
 
-    public static RandomMatrixGenerator createRandomMatrixGenerator(String pdf, int r, int c, int rpb, int cpb, double sp, double min, double max, String distParams) 
+    public static RandomMatrixGenerator createRandomMatrixGenerator(String pdfStr, int r, int c, int rpb, int cpb, double sp, double min, double max, String distParams)
     	throws DMLRuntimeException
     {
-    	RandomMatrixGenerator rgen = null;
-    	
-    	if ( pdf.equalsIgnoreCase(RAND_PDF_UNIFORM))
-    		rgen = new RandomMatrixGenerator(pdf, r, c, rpb, cpb, sp, min, max);
-    	else if ( pdf.equalsIgnoreCase(RAND_PDF_NORMAL))
-    		rgen = new RandomMatrixGenerator(pdf, r, c, rpb, cpb, sp);
-    	else if ( pdf.equalsIgnoreCase(RAND_PDF_POISSON))
-    	{
-    		double mean = Double.NaN;
-    		try {
-    			mean = Double.parseDouble(distParams);
-    		} catch(NumberFormatException e) {
-    			throw new DMLRuntimeException("Failed to parse Poisson distribution parameter: " + distParams);
-    		}
-    		rgen = new RandomMatrixGenerator(pdf, r, c, rpb, cpb, sp, min, max, mean);
-    	}
-    	else
-    		throw new DMLRuntimeException("Unsupported probability distribution \"" + pdf + "\" in rand() -- it must be one of \"uniform\", \"normal\", or \"poisson\"");
+		RandomMatrixGenerator.PDF pdf = RandomMatrixGenerator.PDF.valueOf(pdfStr.toUpperCase());
+		RandomMatrixGenerator rgen = null;
+    	switch (pdf) {
+		case UNIFORM:
+			rgen = new RandomMatrixGenerator(pdf, r, c, rpb, cpb, sp, min, max);
+			break;
+		case NORMAL:
+			rgen = new RandomMatrixGenerator(pdf, r, c, rpb, cpb, sp);
+			break;
+		case POISSON:
+			double mean = Double.NaN;
+			try {
+				mean = Double.parseDouble(distParams);
+			} catch (NumberFormatException e) {
+				throw new DMLRuntimeException("Failed to parse Poisson distribution parameter: " + distParams);
+			}
+			rgen = new RandomMatrixGenerator(pdf, r, c, rpb, cpb, sp, min, max, mean);
+			break;
+		default:
+			throw new DMLRuntimeException("Unsupported probability distribution \"" + pdf + "\" in rand() -- it must be one of \"uniform\", \"normal\", or \"poisson\"");
+
+		}
     	return rgen;
     }
 	
@@ -202,11 +203,11 @@ public class LibMatrixDatagen
 		 * (max-min)*prng.nextDouble() is still valid. This is done primarily to
 		 * share the same code across different distributions.
 		 */
-		double min = rgen._pdf.equalsIgnoreCase(RAND_PDF_UNIFORM) ? rgen._min : 0;
-		double max = rgen._pdf.equalsIgnoreCase(RAND_PDF_UNIFORM) ? rgen._max : 1;
+		double min = rgen._pdf == RandomMatrixGenerator.PDF.UNIFORM ? rgen._min : 0;
+		double max = rgen._pdf == RandomMatrixGenerator.PDF.UNIFORM ? rgen._max : 1;
 		
 		// Special case shortcuts for efficiency
-		if ( rgen._pdf.equalsIgnoreCase(RAND_PDF_UNIFORM)) {
+		if ( rgen._pdf == RandomMatrixGenerator.PDF.UNIFORM) {
 			if ( min == 0.0 && max == 0.0 ) { //all zeros
 				out.reset(rows, cols, true);
 				return;
@@ -288,8 +289,8 @@ public class LibMatrixDatagen
 		 * (max-min)*prng.nextDouble() is still valid. This is done primarily to
 		 * share the same code across different distributions.
 		 */
-		double min = rgen._pdf.equalsIgnoreCase(RAND_PDF_UNIFORM) ? rgen._min : 0;
-		double max = rgen._pdf.equalsIgnoreCase(RAND_PDF_UNIFORM) ? rgen._max : 1;
+		double min = rgen._pdf == RandomMatrixGenerator.PDF.UNIFORM ? rgen._min : 0;
+		double max = rgen._pdf == RandomMatrixGenerator.PDF.UNIFORM ? rgen._max : 1;
 		
 		//determine the sparsity of output matrix (multi-threaded always invoked from CP):
 		//estimated NNZ is for entire matrix (nnz=0, if 0 initialized)
@@ -304,7 +305,7 @@ public class LibMatrixDatagen
 		}
 
 		//special case shortcuts for efficiency
-		if ( rgen._pdf.equalsIgnoreCase(RAND_PDF_UNIFORM)) {
+		if ( rgen._pdf == RandomMatrixGenerator.PDF.UNIFORM) {
 			if ( min == 0.0 && max == 0.0 ) { //all zeros
 				out.reset(rows, cols, false);
 				return;
@@ -497,8 +498,8 @@ public class LibMatrixDatagen
 		int cpb = rgen._colsPerBlock;
 		double sparsity = rgen._sparsity;
 		PRNGenerator valuePRNG = rgen._valuePRNG;
-		double min = rgen._pdf.equalsIgnoreCase(RAND_PDF_UNIFORM) ? rgen._min : 0;
-		double max = rgen._pdf.equalsIgnoreCase(RAND_PDF_UNIFORM) ? rgen._max : 1;
+		double min = rgen._pdf == RandomMatrixGenerator.PDF.UNIFORM ? rgen._min : 0;
+		double max = rgen._pdf == RandomMatrixGenerator.PDF.UNIFORM ? rgen._max : 1;
 		double range = max - min;
 		int clen = out.clen;
 		int estimatedNNzsPerRow = out.estimatedNNzsPerRow;
@@ -510,14 +511,19 @@ public class LibMatrixDatagen
 
 		// Setup Pseudo Random Number Generator for cell values based on 'pdf'.
 		if (valuePRNG == null) {
-			if ( rgen._pdf.equalsIgnoreCase(RAND_PDF_UNIFORM)) 
+			switch (rgen._pdf) {
+			case UNIFORM:
 				valuePRNG = new UniformPRNGenerator();
-			else if ( rgen._pdf.equalsIgnoreCase(RAND_PDF_NORMAL))
+				break;
+			case NORMAL:
 				valuePRNG = new NormalPRNGenerator();
-			else if ( rgen._pdf.equalsIgnoreCase(RAND_PDF_POISSON))
+				break;
+			case POISSON:
 				valuePRNG = new PoissonPRNGenerator();
-			else
+				break;
+			default:
 				throw new DMLRuntimeException("Unsupported distribution function for Rand: " + rgen._pdf);
+			}
 		}
 		
 		// loop through row-block indices

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/772fb588/src/main/java/org/apache/sysml/runtime/matrix/data/RandomMatrixGenerator.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/matrix/data/RandomMatrixGenerator.java b/src/main/java/org/apache/sysml/runtime/matrix/data/RandomMatrixGenerator.java
index 1dc818c..b4c6c95 100644
--- a/src/main/java/org/apache/sysml/runtime/matrix/data/RandomMatrixGenerator.java
+++ b/src/main/java/org/apache/sysml/runtime/matrix/data/RandomMatrixGenerator.java
@@ -26,8 +26,15 @@ import org.apache.sysml.runtime.util.PoissonPRNGenerator;
 import org.apache.sysml.runtime.util.UniformPRNGenerator;
 
 public class RandomMatrixGenerator {
-	
-	String _pdf;
+
+	/**
+	 * Types of Probability density functions
+	 */
+	enum PDF {
+		NORMAL, UNIFORM, POISSON
+	}
+
+	PDF _pdf;
 	int _rows, _cols, _rowsPerBlock, _colsPerBlock;
 	double _sparsity, _mean; 
 	double _min, _max; 
@@ -36,25 +43,76 @@ public class RandomMatrixGenerator {
 
 	public RandomMatrixGenerator() 
 	{
-		_pdf = "";
+		_pdf = PDF.UNIFORM;
 		_rows = _cols = _rowsPerBlock = _colsPerBlock = -1;
 		_sparsity = 0.0;
 		_min = _max = Double.NaN;
 		_valuePRNG = null;
 		_mean = 1.0;
 	}
-	
-	public RandomMatrixGenerator(String pdf, int r, int c, int rpb, int cpb, double sp) throws DMLRuntimeException 
+
+	/**
+	 * Instantiates a Random number generator
+	 * @param pdf    probability density function
+	 * @param r      number of rows
+	 * @param c      number of columns
+	 * @param rpb    rows per block
+	 * @param cpb    columns per block
+	 * @param sp     sparsity (0 = completely sparse, 1 = completely dense)
+	 * @throws DMLRuntimeException if error
+	 */
+	public RandomMatrixGenerator(PDF pdf, int r, int c, int rpb, int cpb, double sp) throws DMLRuntimeException
 	{
 		this(pdf, r, c, rpb, cpb, sp, Double.NaN, Double.NaN);
 	}
-	
-	public RandomMatrixGenerator(String pdf, int r, int c, int rpb, int cpb, double sp, double min, double max) throws DMLRuntimeException 
+
+	/**
+	 * Instantiates a Random number generator
+	 * @param pdfStr probability density function
+	 * @param r      number of rows
+	 * @param c      number of columns
+	 * @param rpb    rows per block
+	 * @param cpb    columns per block
+	 * @param sp     sparsity (0 = completely sparse, 1 = completely dense)
+	 * @param min    minimum of range of random numbers
+	 * @param max    maximum of range of random numbers
+	 * @throws DMLRuntimeException if error
+	 */
+	public RandomMatrixGenerator(String pdfStr, int r, int c, int rpb, int cpb, double sp, double min, double max) throws DMLRuntimeException
+	{
+		init(PDF.valueOf(pdfStr.toUpperCase()), r, c, rpb, cpb, sp, min, max);
+	}
+
+	/**
+	 * Instantiates a Random number generator
+	 * @param pdf    probability density function
+	 * @param r      number of rows
+	 * @param c      number of columns
+	 * @param rpb    rows per block
+	 * @param cpb    columns per block
+	 * @param sp     sparsity (0 = completely sparse, 1 = completely dense)
+	 * @param min    minimum of range of random numbers
+	 * @param max    maximum of range of random numbers
+	 * @throws DMLRuntimeException if error
+	 */
+	public RandomMatrixGenerator(PDF pdf, int r, int c, int rpb, int cpb, double sp, double min, double max) throws DMLRuntimeException
 	{
 		init(pdf, r, c, rpb, cpb, sp, min, max);
 	}
-	
-	public void init(String pdf, int r, int c, int rpb, int cpb, double sp, double min, double max) throws DMLRuntimeException 
+
+	/**
+	 * Initializes internal data structures. Called by Constructor
+	 * @param pdf    probability density function
+	 * @param r      number of rows
+	 * @param c      number of columns
+	 * @param rpb    rows per block
+	 * @param cpb    columns per block
+	 * @param sp     sparsity (0 = completely sparse, 1 = completely dense)
+	 * @param min    minimum of range of random numbers
+	 * @param max    maximum of range of random numbers
+	 * @throws DMLRuntimeException if error
+	 */
+	public void init(PDF pdf, int r, int c, int rpb, int cpb, double sp, double min, double max) throws DMLRuntimeException
 	{
 		_pdf = pdf;
 		_rows = r;
@@ -67,13 +125,39 @@ public class RandomMatrixGenerator {
 		
 		setupValuePRNG();
 	}
-	
-	public RandomMatrixGenerator(String pdf, int r, int c, int rpb, int cpb, double sp, double min, double max, double mean) throws DMLRuntimeException 
+
+	/**
+	 * Instantiates a Random number generator with a specific poisson mean
+	 * @param pdf    probability density function
+	 * @param r      number of rows
+	 * @param c      number of columns
+	 * @param rpb    rows per block
+	 * @param cpb    columns per block
+	 * @param sp     sparsity (0 = completely sparse, 1 = completely dense)
+	 * @param min    minimum of range of random numbers
+	 * @param max    maximum of range of random numbers
+	 * @param mean   the poisson mean
+	 * @throws DMLRuntimeException if error
+	 */
+	public RandomMatrixGenerator(PDF pdf, int r, int c, int rpb, int cpb, double sp, double min, double max, double mean) throws DMLRuntimeException
 	{
 		init(pdf, r, c, rpb, cpb, sp, min, max, mean);
 	}
-	
-	public void init(String pdf, int r, int c, int rpb, int cpb, double sp, double min, double max, double mean) throws DMLRuntimeException 
+
+	/**
+	 * Instantiates a Random number generator with a specific poisson mean
+	 * @param pdf    probability density function
+	 * @param r      number of rows
+	 * @param c      number of columns
+	 * @param rpb    rows per block
+	 * @param cpb    columns per block
+	 * @param sp     sparsity (0 = completely sparse, 1 = completely dense)
+	 * @param min    minimum of range of random numbers
+	 * @param max    maximum of range of random numbers
+	 * @param mean   the poisson mean
+	 * @throws DMLRuntimeException if error
+	 */
+	public void init(PDF pdf, int r, int c, int rpb, int cpb, double sp, double min, double max, double mean) throws DMLRuntimeException
 	{
 		_pdf = pdf;
 		_rows = r;
@@ -89,15 +173,20 @@ public class RandomMatrixGenerator {
 	
 	protected void setupValuePRNG() throws DMLRuntimeException 
 	{
-		if ( _pdf.equalsIgnoreCase(LibMatrixDatagen.RAND_PDF_NORMAL) ) 
+		switch (_pdf) {
+		case NORMAL:
 			_valuePRNG = new NormalPRNGenerator();
-		else if ( _pdf.equalsIgnoreCase(LibMatrixDatagen.RAND_PDF_UNIFORM) ) 
+			break;
+		case UNIFORM:
 			_valuePRNG = new UniformPRNGenerator();
-		else if ( _pdf.equalsIgnoreCase(LibMatrixDatagen.RAND_PDF_POISSON) ) 
-		{
+			break;
+		case POISSON:
 			if(_mean <= 0)
 				throw new DMLRuntimeException("Invalid parameter (" + _mean + ") for Poisson distribution.");
 			_valuePRNG = new PoissonPRNGenerator(_mean);
+			break;
+		default:
+			throw new DMLRuntimeException("Unsupported probability density function");
 		}
 	}
 }

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/772fb588/src/test/java/org/apache/sysml/test/gpu/AggregateUnaryOpTests.java
----------------------------------------------------------------------
diff --git a/src/test/java/org/apache/sysml/test/gpu/AggregateUnaryOpTests.java b/src/test/java/org/apache/sysml/test/gpu/AggregateUnaryOpTests.java
new file mode 100644
index 0000000..0b229f0
--- /dev/null
+++ b/src/test/java/org/apache/sysml/test/gpu/AggregateUnaryOpTests.java
@@ -0,0 +1,133 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.sysml.test.gpu;
+
+import org.apache.sysml.test.utils.TestUtils;
+import org.junit.Test;
+
+/**
+ * Tests Aggregate Unary ops
+ */
+public class AggregateUnaryOpTests extends UnaryOpTestsBase {
+
+	private final static String TEST_NAME = "AggregateUnaryOpTests";
+
+	@Override
+	public void setUp() {
+		TestUtils.clearAssertionInformation();
+		addTestConfiguration(TEST_DIR, TEST_NAME);
+		getAndLoadTestConfiguration(TEST_NAME);
+	}
+
+	@Test
+	public void sum() {
+		testSimpleUnaryOpMatrixOutput("sum", "gpu_uak+");
+	}
+
+	@Test
+	public void colSums() {
+		testSimpleUnaryOpMatrixOutput("colSums", "gpu_uack+");
+	}
+
+	@Test
+	public void rowSums() {
+		testSimpleUnaryOpMatrixOutput("rowSums", "gpu_uark+");
+	}
+
+	@Test
+	public void mult() {
+		testSimpleUnaryOpMatrixOutput("prod", "gpu_ua*");
+	}
+
+	@Test
+	public void mean() {
+		testSimpleUnaryOpMatrixOutput("mean", "gpu_uamean");
+	}
+
+	@Test
+	public void colMeans() {
+		testSimpleUnaryOpMatrixOutput("colMeans", "gpu_uacmean");
+	}
+
+	@Test
+	public void rowMeans() {
+		testSimpleUnaryOpMatrixOutput("rowMeans", "gpu_uarmean");
+	}
+
+	@Test
+	public void max() {
+		testSimpleUnaryOpMatrixOutput("max", "gpu_uamax");
+	}
+
+	@Test
+	public void rowMaxs() {
+		testSimpleUnaryOpMatrixOutput("rowMaxs", "gpu_uarmax");
+	}
+
+	@Test
+	public void colMaxs() {
+		testSimpleUnaryOpMatrixOutput("colMaxs", "gpu_uacmax");
+	}
+
+	@Test
+	public void min() {
+		testSimpleUnaryOpMatrixOutput("min", "gpu_uamin");
+	}
+
+	@Test
+	public void rowMins() {
+		testSimpleUnaryOpMatrixOutput("rowMins", "gpu_uarmin");
+	}
+
+	@Test
+	public void colMins() {
+		testSimpleUnaryOpMatrixOutput("colMins", "gpu_uacmin");
+	}
+
+	@Test
+	public void var() {
+		testSimpleUnaryOpMatrixOutput("var", "gpu_uavar");
+	}
+
+	@Test
+	public void colVars() {
+		testSimpleUnaryOpMatrixOutput("colVars", "gpu_uacvar");
+	}
+
+	@Test
+	public void rowVars() {
+		testSimpleUnaryOpMatrixOutput("rowVars", "gpu_uarvar");
+	}
+
+	@Test
+	public void sumsq() {
+		testUnaryOpMatrixOutput("out = sum(in1*in1)", "gpu_uasqk+", "in1", "out");
+	}
+
+	@Test
+	public void rowSumsqs() {
+		testUnaryOpMatrixOutput("out = rowSums(in1*in1)", "gpu_uarsqk+", "in1", "out");
+	}
+
+	@Test
+	public void colSumsqs() {
+		testUnaryOpMatrixOutput("out = colSums(in1*in1)", "gpu_uacsqk+", "in1", "out");
+	}
+}

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/772fb588/src/test/java/org/apache/sysml/test/gpu/BinaryOpTests.java
----------------------------------------------------------------------
diff --git a/src/test/java/org/apache/sysml/test/gpu/BinaryOpTests.java b/src/test/java/org/apache/sysml/test/gpu/BinaryOpTests.java
new file mode 100644
index 0000000..f3d2b21
--- /dev/null
+++ b/src/test/java/org/apache/sysml/test/gpu/BinaryOpTests.java
@@ -0,0 +1,85 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.sysml.test.gpu;
+
+import java.util.Arrays;
+import java.util.HashMap;
+import java.util.List;
+
+import org.apache.sysml.api.mlcontext.Matrix;
+import org.apache.sysml.test.utils.TestUtils;
+import org.junit.Test;
+
+/**
+ * Tests builtin binary ops on GPU
+ */
+public class BinaryOpTests extends GPUTests {
+
+	private final static String TEST_NAME = "BinaryOpTests";
+	private final int seed = 42;
+
+	@Override
+	public void setUp() {
+		TestUtils.clearAssertionInformation();
+		addTestConfiguration(TEST_DIR, TEST_NAME);
+		getAndLoadTestConfiguration(TEST_NAME);
+	}
+
+	@Test
+	public void testSolve() {
+		// Test Ax = b
+		// Dimensions of A = m * n
+		// Dimensions of x = n * 1
+		// Dimensions of b = m * 1
+
+		double sparsity = 1.0; // Only dense matrices supported by "solve"
+		final int[] sides = { 32, 33, 128, 256, 513, 2049 };
+		for (int i = 0; i < sides.length; i++) {
+			for (int j = i; j < sides.length; j++) {
+				int m = sides[j];
+				int n = sides[i];
+				runSolveTest(sparsity, m, n);
+			}
+		}
+
+	}
+
+	/**
+	 * Runs the test for solve (Ax = b) for input with given dimensions and sparsities
+	 * A can be overdetermined (rows in A > columns in A)
+	 *
+	 * @param sparsity sparsity for the block A and b
+	 * @param m        rows in A
+	 * @param n        columns in A
+	 */
+	protected void runSolveTest(double sparsity, int m, int n) {
+		String scriptStr = "x = solve(A, b)";
+		System.out.println("In solve, A[" + m + ", " + n + "], b[" + m + ", 1]");
+		Matrix A = generateInputMatrix(spark, m, n, sparsity, seed);
+		Matrix b = generateInputMatrix(spark, m, 1, sparsity, seed);
+		HashMap<String, Object> inputs = new HashMap<>();
+		inputs.put("A", A);
+		inputs.put("b", b);
+		List<Object> outCPU = runOnCPU(spark, scriptStr, inputs, Arrays.asList("x"));
+		List<Object> outGPU = runOnGPU(spark, scriptStr, inputs, Arrays.asList("x"));
+		assertHeavyHitterPresent("gpu_solve");
+		assertEqualObjects(outCPU.get(0), outGPU.get(0));
+	}
+}

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/772fb588/src/test/java/org/apache/sysml/test/gpu/GPUTests.java
----------------------------------------------------------------------
diff --git a/src/test/java/org/apache/sysml/test/gpu/GPUTests.java b/src/test/java/org/apache/sysml/test/gpu/GPUTests.java
new file mode 100644
index 0000000..8af9104
--- /dev/null
+++ b/src/test/java/org/apache/sysml/test/gpu/GPUTests.java
@@ -0,0 +1,250 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.sysml.test.gpu;
+
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+
+import org.apache.spark.sql.SparkSession;
+import org.apache.sysml.api.mlcontext.MLContext;
+import org.apache.sysml.api.mlcontext.Matrix;
+import org.apache.sysml.api.mlcontext.Script;
+import org.apache.sysml.api.mlcontext.ScriptFactory;
+import org.apache.sysml.runtime.DMLRuntimeException;
+import org.apache.sysml.runtime.instructions.gpu.context.GPUContext;
+import org.apache.sysml.runtime.instructions.gpu.context.GPUContextPool;
+import org.apache.sysml.runtime.matrix.data.MatrixBlock;
+import org.apache.sysml.test.integration.AutomatedTestBase;
+import org.apache.sysml.utils.Statistics;
+import org.junit.After;
+import org.junit.AfterClass;
+import org.junit.Assert;
+import org.junit.BeforeClass;
+
+/**
+ * Parent class for all GPU tests
+ */
+public abstract class GPUTests extends AutomatedTestBase {
+
+	protected final static String TEST_DIR = "org/apache/sysml/api/mlcontext";
+	protected static SparkSession spark;
+	protected final double THRESHOLD = 1e-9;    // for relative error
+
+	@BeforeClass
+	public static void beforeClass() {
+		spark = createSystemMLSparkSession("GPUTests", "local");
+	}
+
+	@AfterClass
+	public static void afterClass() {
+		spark.close();
+	}
+
+	/**
+	 * Gets threshold for relative error in tests
+	 *
+	 * @return a valid threshold
+	 */
+	protected double getTHRESHOLD() {
+		return THRESHOLD;
+	}
+
+	@After
+	public void tearDown() {
+		clearGPUMemory();
+		super.tearDown();
+	}
+
+	/**
+	 * Clear out the memory on all GPUs
+	 */
+	protected void clearGPUMemory() {
+		try {
+			int count = GPUContextPool.getDeviceCount();
+			int freeCount = GPUContextPool.getAvailableCount();
+			Assert.assertTrue("All GPUContexts have not been returned to the GPUContextPool", count == freeCount);
+			ArrayList<GPUContext> gpuContexts = new ArrayList<>();
+			for (int i = 0; i < count; i++) {
+				GPUContext gCtx = GPUContextPool.getFromPool();
+				gCtx.initializeThread();
+				gCtx.clearMemory();
+				gpuContexts.add(gCtx);
+			}
+			for (GPUContext gCtx : gpuContexts) {
+				GPUContextPool.returnToPool(gCtx);
+			}
+
+		} catch (DMLRuntimeException e) {
+			// Ignore
+		}
+	}
+
+	/**
+	 * Generates a random input matrix with a given size and sparsity
+	 *
+	 * @param spark    valid instance of {@link SparkSession}
+	 * @param m        number of rows
+	 * @param n        number of columns
+	 * @param sparsity sparsity (1 = completely dense, 0 = completely sparse)
+	 * @return a random matrix with given size and sparsity
+	 */
+	protected Matrix generateInputMatrix(SparkSession spark, int m, int n, double sparsity, int seed) {
+		// Generate a random matrix of size m * n
+		MLContext genMLC = new MLContext(spark);
+		String scriptStr;
+		if (sparsity == 0.0) {
+			scriptStr = "in1 = matrix(0, rows=" + m + ", cols=" + n + ")";
+		} else {
+			scriptStr = "in1 = rand(rows=" + m + ", cols=" + n + ", sparsity = " + sparsity + ", seed= " + seed
+					+ ", min=-1.0, max=1.0)";
+		}
+		Script generateScript = ScriptFactory.dmlFromString(scriptStr).out("in1");
+		Matrix in1 = genMLC.execute(generateScript).getMatrix("in1");
+		genMLC.close();
+		return in1;
+	}
+
+	/**
+	 * Asserts that the values in two matrices are in {@link UnaryOpTests#THRESHOLD} of each other
+	 *
+	 * @param expected expected matrix
+	 * @param actual   actual matrix
+	 */
+	private void assertEqualMatrices(Matrix expected, Matrix actual) {
+		try {
+			MatrixBlock expectedMB = expected.toMatrixObject().acquireRead();
+			MatrixBlock actualMB = actual.toMatrixObject().acquireRead();
+
+			long rows = expectedMB.getNumRows();
+			long cols = expectedMB.getNumColumns();
+			Assert.assertEquals(rows, actualMB.getNumRows());
+			Assert.assertEquals(cols, actualMB.getNumColumns());
+
+			for (int i = 0; i < rows; i++) {
+				for (int j = 0; j < cols; j++) {
+					double expectedDouble = expectedMB.quickGetValue(i, j);
+					double actualDouble = actualMB.quickGetValue(i, j);
+					if (expectedDouble != 0.0 && !Double.isNaN(expectedDouble) && Double.isFinite(expectedDouble)) {
+						double relativeError = Math.abs((expectedDouble - actualDouble) / expectedDouble);
+						Assert.assertTrue("Comparing floating point numbers, relative error(" + relativeError
+								+ ") is more than threshold (" + getTHRESHOLD() + ")", relativeError < getTHRESHOLD());
+					} else {
+						Assert.assertEquals(expectedDouble, actualDouble, getTHRESHOLD());
+					}
+				}
+			}
+			expected.toMatrixObject().release();
+			actual.toMatrixObject().release();
+		} catch (DMLRuntimeException e) {
+			throw new RuntimeException(e);
+		}
+	}
+
+	/**
+	 * asserts that the expected op was executed
+	 *
+	 * @param heavyHitterOpCode opcode of the heavy hitter for the unary op
+	 */
+	protected void assertHeavyHitterPresent(String heavyHitterOpCode) {
+		Set<String> heavyHitterOpCodes = Statistics.getCPHeavyHitterOpCodes();
+		Assert.assertTrue(heavyHitterOpCodes.contains(heavyHitterOpCode));
+	}
+
+	/**
+	 * Runs a program on the CPU
+	 *
+	 * @param spark     a valid {@link SparkSession}
+	 * @param scriptStr the script to run (as a string)
+	 * @param inputs    map of input variables names in the scriptStr (of variable_name -> object)
+	 * @param outStrs   list of variable names needed as output from the scriptStr
+	 * @return list of output objects in order of outStrs
+	 */
+	protected List<Object> runOnCPU(SparkSession spark, String scriptStr, Map<String, Object> inputs,
+			List<String> outStrs) {
+		MLContext cpuMLC = new MLContext(spark);
+		List<Object> outputs = new ArrayList<>();
+		Script script = ScriptFactory.dmlFromString(scriptStr).in(inputs).out(outStrs);
+		for (String outStr : outStrs) {
+			Object output = cpuMLC.execute(script).get(outStr);
+			outputs.add(output);
+		}
+		cpuMLC.close();
+		return outputs;
+	}
+
+	/**
+	 * Runs a program on the GPU
+	 *
+	 * @param spark     a valid {@link SparkSession}
+	 * @param scriptStr the script to run (as a string)
+	 * @param inputs    map of input variables names in the scriptStr (of variable_name -> object)
+	 * @param outStrs   list of variable names needed as output from the scriptStr
+	 * @return list of output objects in order of outStrs
+	 */
+	protected List<Object> runOnGPU(SparkSession spark, String scriptStr, Map<String, Object> inputs,
+			List<String> outStrs) {
+		MLContext gpuMLC = new MLContext(spark);
+		gpuMLC.setGPU(true);
+		gpuMLC.setForceGPU(true);
+		gpuMLC.setStatistics(true);
+		List<Object> outputs = new ArrayList<>();
+		Script script = ScriptFactory.dmlFromString(scriptStr).in(inputs).out(outStrs);
+		for (String outStr : outStrs) {
+			Object output = gpuMLC.execute(script).get(outStr);
+			outputs.add(output);
+		}
+		gpuMLC.close();
+		return outputs;
+	}
+
+	/**
+	 * Assert that the two objects are equal. Supported types are Boolean, Integer, String, Double and Matrix
+	 *
+	 * @param expected
+	 * @param actual
+	 */
+	protected void assertEqualObjects(Object expected, Object actual) {
+		Assert.assertEquals(expected.getClass(), actual.getClass());
+
+		if (expected instanceof Boolean) {
+			Assert.assertEquals(((Boolean) expected).booleanValue(), ((Boolean) actual).booleanValue());
+		} else if (expected instanceof Double) {
+			double expectedDouble = ((Double) expected).doubleValue();
+			double actualDouble = ((Double) actual).doubleValue();
+			if (expectedDouble != 0.0 && !Double.isNaN(expectedDouble) && Double.isFinite(expectedDouble)) {
+				double relativeError = Math.abs((expectedDouble - actualDouble) / expectedDouble);
+				Assert.assertTrue("Comparing floating point numbers, relative error(" + relativeError
+						+ ") is more than threshold (" + getTHRESHOLD() + ")", relativeError < getTHRESHOLD());
+			} else {
+				Assert.assertEquals(expectedDouble, actualDouble, getTHRESHOLD());
+			}
+		} else if (expected instanceof String) {
+			Assert.assertEquals(expected.toString(), actual.toString());
+		} else if (expected instanceof Integer) {
+			Assert.assertEquals(((Integer) expected).intValue(), ((Integer) actual).intValue());
+		} else if (expected instanceof Matrix)
+			assertEqualMatrices((Matrix) expected, (Matrix) actual);
+		else {
+			Assert.fail("Invalid types for comparison");
+		}
+	}
+}