Posted to commits@systemml.apache.org by na...@apache.org on 2017/06/01 04:54:42 UTC
[1/2] incubator-systemml git commit: [SYSTEMML-1625] GPU Unit Tests
(and GPU row/col variance bug fix)
Repository: incubator-systemml
Updated Branches:
refs/heads/master ceeec4bbf -> 772fb5883
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/772fb588/src/test/java/org/apache/sysml/test/gpu/MatrixMatrixElementWiseOpTests.java
----------------------------------------------------------------------
diff --git a/src/test/java/org/apache/sysml/test/gpu/MatrixMatrixElementWiseOpTests.java b/src/test/java/org/apache/sysml/test/gpu/MatrixMatrixElementWiseOpTests.java
new file mode 100644
index 0000000..4052fef
--- /dev/null
+++ b/src/test/java/org/apache/sysml/test/gpu/MatrixMatrixElementWiseOpTests.java
@@ -0,0 +1,271 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.sysml.test.gpu;
+
+import java.util.Arrays;
+import java.util.HashMap;
+import java.util.List;
+
+import org.apache.sysml.api.mlcontext.Matrix;
+import org.apache.sysml.test.utils.TestUtils;
+import org.junit.Ignore;
+import org.junit.Test;
+
+/**
+ * Test Elementwise operations on the GPU
+ */
+public class MatrixMatrixElementWiseOpTests extends GPUTests {
+ private final static String TEST_NAME = "MatrixMatrixElementWiseOpTests";
+
+ private final int[] rowSizes = new int[] { 1, 64, 130, 1024, 2049 };
+ private final int[] columnSizes = new int[] { 1, 64, 130, 1024, 2049 };
+ private final double[] sparsities = new double[] { 0.0, 0.03, 0.3, 0.9 };
+ private final double[] scalars = new double[] { 0.0, 0.5, 2.0 };
+ private final int seed = 42;
+
+ @Override
+ public void setUp() {
+ TestUtils.clearAssertionInformation();
+ addTestConfiguration(TEST_DIR, TEST_NAME);
+ getAndLoadTestConfiguration(TEST_NAME);
+ }
+
+ @Test
+ public void testAxpy() {
+ runAxpyTest("O = a*X + Y", "X", "Y", "a", "O", "gpu_-*");
+ }
+
+ @Test
+ public void testAxmy() {
+ runAxpyTest("O = X - a*Y", "X", "Y", "a", "O", "gpu_+*");
+ }
+
+ @Test
+ public void testAdd() {
+ runMatrixMatrixElementwiseTest("O = X + Y", "X", "Y", "O", "gpu_+");
+ }
+
+ @Test
+ public void testMatrixColumnVectorAdd() {
+ runMatrixColumnVectorTest("O = X + Y", "X", "Y", "O", "gpu_+");
+ }
+
+ @Test
+ public void testMatrixRowVectorAdd() {
+ runMatrixRowVectorTest("O = X + Y", "X", "Y", "O", "gpu_+");
+ }
+
+ @Test
+ public void testSubtract() {
+ runMatrixMatrixElementwiseTest("O = X - Y", "X", "Y", "O", "gpu_-");
+ }
+
+ @Test
+ public void testMatrixColumnVectorSubtract() {
+ runMatrixColumnVectorTest("O = X - Y", "X", "Y", "O", "gpu_-");
+ }
+
+ @Test
+ public void testMatrixRowVectorSubtract() {
+ runMatrixRowVectorTest("O = X - Y", "X", "Y", "O", "gpu_-");
+ }
+
+ @Test
+ public void testMultiply() {
+ runMatrixMatrixElementwiseTest("O = X * Y", "X", "Y", "O", "gpu_*");
+ }
+
+ @Test
+ public void testMatrixColumnVectorMultiply() {
+ runMatrixColumnVectorTest("O = X * Y", "X", "Y", "O", "gpu_*");
+ }
+
+ @Test
+ public void testMatrixRowVectorMultiply() {
+ runMatrixRowVectorTest("O = X * Y", "X", "Y", "O", "gpu_*");
+ }
+
+ @Test
+ public void testDivide() {
+ runMatrixMatrixElementwiseTest("O = X / Y", "X", "Y", "O", "gpu_/");
+ }
+
+ @Test
+ public void testMatrixColumnVectorDivide() {
+ runMatrixColumnVectorTest("O = X / Y", "X", "Y", "O", "gpu_/");
+ }
+
+ @Test
+ public void testMatrixRowVectorDivide() {
+ runMatrixRowVectorTest("O = X / Y", "X", "Y", "O", "gpu_/");
+ }
+
+ // ****************************************************************
+ // ************************ IGNORED TEST **************************
+ // FIXME: There is a bug in CPU "^" when A ^ B is executed and both A and B are all zeroes
+ @Ignore
+ @Test
+ public void testPower() {
+ runMatrixMatrixElementwiseTest("O = X ^ Y", "X", "Y", "O", "gpu_%");
+ }
+
+ /**
+ * Runs a simple matrix-matrix elementwise op test
+ *
+ * @param scriptStr the script string
+ * @param input1 name of the first input variable in the script string
+ * @param input2 name of the second input variable in the script string
+ * @param output name of the output variable in the script string
+ * @param heavyHitterOpcode the string printed for the op heavy hitter when executed on GPU
+ */
+ private void runMatrixMatrixElementwiseTest(String scriptStr, String input1, String input2, String output,
+ String heavyHitterOpcode) {
+ for (int i = 0; i < rowSizes.length; i++) {
+ for (int j = 0; j < columnSizes.length; j++) {
+ for (int k = 0; k < sparsities.length; k++) {
+ int m = rowSizes[i];
+ int n = columnSizes[j];
+ double sparsity = sparsities[k];
+ Matrix X = generateInputMatrix(spark, m, n, sparsity, seed);
+ Matrix Y = generateInputMatrix(spark, m, n, sparsity, seed);
+ HashMap<String, Object> inputs = new HashMap<>();
+ inputs.put(input1, X);
+ inputs.put(input2, Y);
+ List<Object> cpuOut = runOnCPU(spark, scriptStr, inputs, Arrays.asList(output));
+ List<Object> gpuOut = runOnGPU(spark, scriptStr, inputs, Arrays.asList(output));
+ //assertHeavyHitterPresent(heavyHitterOpcode);
+ assertEqualObjects(cpuOut.get(0), gpuOut.get(0));
+ }
+ }
+ }
+ }
+
+ /**
+ * Runs tests for operations of the form O = a*X + Y or O = X - a*Y
+ *
+ * @param scriptStr the script string
+ * @param input1 name of the first matrix input variable in the script string
+ * @param input2 name of the second matrix input variable in the script string
+ * @param scalarInput name of the scalar which is multiplied with the first or second matrix
+ * @param output name of the output variable in the script string
+ * @param heavyHitterOpcode the string printed for the op heavy hitter when executed on GPU
+ */
+ private void runAxpyTest(String scriptStr, String input1, String input2, String scalarInput, String output,
+ String heavyHitterOpcode) {
+ for (int i = 0; i < rowSizes.length; i++) {
+ for (int j = 0; j < columnSizes.length; j++) {
+ for (int k = 0; k < sparsities.length; k++) {
+ for (int l = 0; l < scalars.length; l++) {
+ int m = rowSizes[i];
+ int n = columnSizes[j];
+ double scalar = scalars[l];
+ double sparsity = sparsities[k];
+ Matrix X = generateInputMatrix(spark, m, n, sparsity, seed);
+ Matrix Y = generateInputMatrix(spark, m, n, sparsity, seed);
+ HashMap<String, Object> inputs = new HashMap<>();
+ inputs.put(input1, X);
+ inputs.put(input2, Y);
+ inputs.put(scalarInput, scalar);
+
+ // Test O = aX + Y
+ List<Object> cpuOut = runOnCPU(spark, scriptStr, inputs, Arrays.asList(output));
+ List<Object> gpuOut = runOnGPU(spark, scriptStr, inputs, Arrays.asList(output));
+ //assertHeavyHitterPresent(heavyHitterOpcode);
+ assertEqualObjects(cpuOut.get(0), gpuOut.get(0));
+ }
+ }
+ }
+ }
+ }
+
+ /**
+ * Runs O = X op Y, where X is a matrix and Y is a column vector
+ *
+ * @param scriptStr the script string
+ * @param matrixInput name of the matrix input variable in the script string
+ * @param vectorInput name of the vector input variable in the script string
+ * @param output name of the output variable in the script string
+ * @param heavyHitterOpcode the string printed for the op heavy hitter when executed on GPU
+ */
+ private void runMatrixColumnVectorTest(String scriptStr, String matrixInput, String vectorInput, String output,
+ String heavyHitterOpcode) {
+ int[] rows = new int[] { 64, 130, 1024, 2049 };
+ int[] cols = new int[] { 64, 130, 1024, 2049 };
+
+ for (int i = 0; i < rows.length; i++) {
+ for (int j = 0; j < cols.length; j++) {
+ for (int k = 0; k < sparsities.length; k++) {
+ int m = rows[i];
+ int n = cols[j];
+ double sparsity = sparsities[k];
+ Matrix X = generateInputMatrix(spark, m, n, sparsity, seed);
+ Matrix Y = generateInputMatrix(spark, m, 1, sparsity, seed);
+ HashMap<String, Object> inputs = new HashMap<>();
+ inputs.put(matrixInput, X);
+ inputs.put(vectorInput, Y);
+
+ System.out.println("Vector[" + m + ", 1] op Matrix[" + m + ", " + n + "], sparsity = " + sparsity);
+ List<Object> cpuOut = runOnCPU(spark, scriptStr, inputs, Arrays.asList(output));
+ List<Object> gpuOut = runOnGPU(spark, scriptStr, inputs, Arrays.asList(output));
+ //assertHeavyHitterPresent(heavyHitterOpcode);
+ assertEqualObjects(cpuOut.get(0), gpuOut.get(0));
+
+ }
+ }
+ }
+ }
+
+ /**
+ * Runs O = X op Y, where X is a matrix and Y is a row vector
+ *
+ * @param scriptStr the script string
+ * @param matrixInput name of the matrix input variable in the script string
+ * @param vectorInput name of the vector input variable in the script string
+ * @param output name of the output variable in the script string
+ * @param heavyHitterOpcode the string printed for the op heavy hitter when executed on GPU
+ */
+ private void runMatrixRowVectorTest(String scriptStr, String matrixInput, String vectorInput, String output,
+ String heavyHitterOpcode) {
+ int[] rows = new int[] { 64, 130, 1024, 2049 };
+ int[] cols = new int[] { 64, 130, 1024, 2049 };
+
+ for (int i = 0; i < rows.length; i++) {
+ for (int j = 0; j < cols.length; j++) {
+ for (int k = 0; k < sparsities.length; k++) {
+ int m = rows[i];
+ int n = cols[j];
+ double sparsity = sparsities[k];
+ Matrix X = generateInputMatrix(spark, m, n, sparsity, seed);
+ Matrix Y = generateInputMatrix(spark, 1, n, sparsity, seed);
+ HashMap<String, Object> inputs = new HashMap<>();
+ inputs.put(matrixInput, X);
+ inputs.put(vectorInput, Y);
+
+ System.out.println("Vector[" + m + ", 1] op Matrix[" + m + ", " + n + "], sparsity = " + sparsity);
+ List<Object> cpuOut = runOnCPU(spark, scriptStr, inputs, Arrays.asList(output));
+ List<Object> gpuOut = runOnGPU(spark, scriptStr, inputs, Arrays.asList(output));
+ //assertHeavyHitterPresent(heavyHitterOpcode);
+ assertEqualObjects(cpuOut.get(0), gpuOut.get(0));
+ }
+ }
+ }
+ }
+
+}
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/772fb588/src/test/java/org/apache/sysml/test/gpu/MatrixMultiplicationOpTest.java
----------------------------------------------------------------------
diff --git a/src/test/java/org/apache/sysml/test/gpu/MatrixMultiplicationOpTest.java b/src/test/java/org/apache/sysml/test/gpu/MatrixMultiplicationOpTest.java
new file mode 100644
index 0000000..f7c7851
--- /dev/null
+++ b/src/test/java/org/apache/sysml/test/gpu/MatrixMultiplicationOpTest.java
@@ -0,0 +1,190 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.sysml.test.gpu;
+
+import java.util.Arrays;
+import java.util.HashMap;
+import java.util.List;
+
+import org.apache.sysml.api.mlcontext.Matrix;
+import org.apache.sysml.test.utils.TestUtils;
+import org.junit.Test;
+
+/**
+ * Tests matrix multiplication on the GPU
+ */
+public class MatrixMultiplicationOpTest extends GPUTests {
+ private final static String TEST_NAME = "MatrixMultiplicationOpTest";
+ private final int seed = 42;
+
+ @Override
+ public void setUp() {
+ TestUtils.clearAssertionInformation();
+ addTestConfiguration(TEST_DIR, TEST_NAME);
+ getAndLoadTestConfiguration(TEST_NAME);
+ }
+
+ @Override
+ public double getTHRESHOLD() {
+ return 1e-5;
+ }
+
+ @Test
+ public void matrixMatrixTest1() {
+ String scriptStr = "O = X %*% Y";
+
+ int[] X1 = { 1, 128, 513, 1024 };
+ int[] X2 = { 128, 512, 1024 };
+ int[] Y2 = { 1, 128, 513, 1024 };
+ double[] SX = { 0.0, 0.03, 0.3, 0.9 };
+ double[] SY = { 0.0, 0.03, 0.3, 0.9 };
+
+ for (int x1 = 0; x1 < X1.length; x1++) {
+ for (int x2 = 0; x2 < X2.length; x2++) {
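+ // tie the inner dimensions: X %*% Y needs rows(Y) == cols(X)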
+ int y1 = x2;
+ for (int y2 = 0; y2 < Y2.length; y2++) {
+ for (int sx = 0; sx < SX.length; sx++) {
+ for (int sy = 0; sy < SY.length; sy++) {
+ assertMatrixMultiplication(scriptStr, X1[x1], X2[x2], X2[y1], Y2[y2], SX[sx], SY[sy]);
+ }
+ }
+ }
+ }
+ }
+ }
+
+ @Test
+ public void matrixMatrixTest2() {
+ String scriptStr = "O = X %*% t(Y)";
+
+ int[] X1 = { 1, 128, 513, 1024 };
+ int[] X2 = { 128, 512, 1024 };
+ int[] Y1 = { 1, 128, 513, 1024 };
+ double[] SX = { 0.0, 0.03, 0.3, 0.9 };
+ double[] SY = { 0.0, 0.03, 0.3, 0.9 };
+
+ for (int x1 = 0; x1 < X1.length; x1++) {
+ for (int x2 = 0; x2 < X2.length; x2++) {
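+ // tie the inner dimensions: X %*% t(Y) needs cols(Y) == cols(X)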
+ int y2 = x2;
+ for (int y1 = 0; y1 < Y1.length; y1++) {
+ for (int sx = 0; sx < SX.length; sx++) {
+ for (int sy = 0; sy < SY.length; sy++) {
+ assertMatrixMultiplication(scriptStr, X1[x1], X2[x2], Y1[y1], X2[y2], SX[sx], SY[sy]);
+ }
+ }
+ }
+ }
+ }
+ }
+
+ @Test
+ public void matrixMatrixTest3() {
+ String scriptStr = "O = t(X) %*% Y";
+
+ int[] X1 = { 1, 128, 513, 1024 };
+ int[] X2 = { 128, 512, 1024 };
+ int[] Y2 = { 1, 128, 513, 1024 };
+ double[] SX = { 0.0, 0.03, 0.3, 0.9 };
+ double[] SY = { 0.0, 0.03, 0.3, 0.9 };
+
+ for (int x1 = 0; x1 < X1.length; x1++) {
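+ // tie the inner dimensions: t(X) %*% Y needs rows(Y) == rows(X)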
+ int y1 = x1;
+ for (int x2 = 0; x2 < X2.length; x2++) {
+ for (int y2 = 0; y2 < Y2.length; y2++) {
+ for (int sx = 0; sx < SX.length; sx++) {
+ for (int sy = 0; sy < SY.length; sy++) {
+ assertMatrixMultiplication(scriptStr, X1[x1], X2[x2], X1[y1], Y2[y2], SX[sx], SY[sy]);
+ }
+ }
+ }
+ }
+ }
+ }
+
+ @Test
+ public void matrixMatrixTest4() {
+ String scriptStr = "O = t(X) %*% t(Y)";
+
+ int[] X1 = { 1, 128, 513, 1024 };
+ int[] X2 = { 128, 512, 1024 };
+ int[] Y1 = { 1, 128, 513, 1024 };
+ double[] SX = { 0.0, 0.03, 0.3, 0.9 };
+ double[] SY = { 0.0, 0.03, 0.3, 0.9 };
+
+ for (int x1 = 0; x1 < X1.length; x1++) {
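+ // tie the inner dimensions: t(X) %*% t(Y) needs cols(Y) == rows(X)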
+ int y2 = x1;
+ for (int x2 = 0; x2 < X2.length; x2++) {
+ for (int y1 = 0; y1 < Y1.length; y1++) {
+ for (int sx = 0; sx < SX.length; sx++) {
+ for (int sy = 0; sy < SY.length; sy++) {
+ assertMatrixMultiplication(scriptStr, X1[x1], X2[x2], Y1[y1], X1[y2], SX[sx], SY[sy]);
+ }
+ }
+ }
+ }
+ }
+ }
+
+ @Test
+ public void transposeSelfMatrixMultiply() {
+ String scriptStr = "O = t(X) %*% X";
+
+ int[] sizes = { 1, 128, 512, 1024, 2049 };
+ double[] sparsities = { 0.0, 0.03, 0.3, 0.9 };
+
+ for (int i = 0; i < sizes.length; i++) {
+ for (int j = 0; j < sparsities.length; j++) {
+ int side = sizes[i];
+ double sparsity = sparsities[j];
+ Matrix X = generateInputMatrix(spark, side, side, sparsity, seed);
+ HashMap<String, Object> inputs = new HashMap<>();
+ inputs.put("X", X);
+ List<Object> cpuOuts = runOnCPU(spark, scriptStr, inputs, Arrays.asList("O"));
+ List<Object> gpuOuts = runOnGPU(spark, scriptStr, inputs, Arrays.asList("O"));
+ //assertHeavyHitterPresent("gpu_tsmm'");
+ assertEqualObjects(cpuOuts.get(0), gpuOuts.get(0));
+ }
+ }
+ }
+
+ /**
+ * Asserts that matrix multiplication produces the same result on GPU and CPU
+ *
+ * @param scriptStr script string containing a matrix multiplication (e.g., O = X %*% Y)
+ * @param rows1 rows in X
+ * @param cols1 cols in X
+ * @param rows2 rows in Y
+ * @param cols2 cols in Y
+ * @param sparsity1 sparsity for X
+ * @param sparsity2 sparsity for Y
+ */
+ private void assertMatrixMultiplication(String scriptStr, int rows1, int cols1, int rows2, int cols2,
+ double sparsity1, double sparsity2) {
+ HashMap<String, Object> inputs = new HashMap<>();
+ Matrix X = generateInputMatrix(spark, rows1, cols1, sparsity1, seed);
+ Matrix Y = generateInputMatrix(spark, rows2, cols2, sparsity2, seed);
+ inputs.put("X", X);
+ inputs.put("Y", Y);
+ List<Object> cpuOuts = runOnCPU(spark, scriptStr, inputs, Arrays.asList("O"));
+ List<Object> gpuOuts = runOnGPU(spark, scriptStr, inputs, Arrays.asList("O"));
+ //assertHeavyHitterPresent("gpu_ba+*'");
+ assertEqualObjects(cpuOuts.get(0), gpuOuts.get(0));
+ }
+}
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/772fb588/src/test/java/org/apache/sysml/test/gpu/NeuralNetworkOpTests.java
----------------------------------------------------------------------
diff --git a/src/test/java/org/apache/sysml/test/gpu/NeuralNetworkOpTests.java b/src/test/java/org/apache/sysml/test/gpu/NeuralNetworkOpTests.java
new file mode 100644
index 0000000..f1f1ea5
--- /dev/null
+++ b/src/test/java/org/apache/sysml/test/gpu/NeuralNetworkOpTests.java
@@ -0,0 +1,508 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.sysml.test.gpu;
+
+import java.util.Arrays;
+import java.util.HashMap;
+import java.util.List;
+
+import org.apache.sysml.api.mlcontext.Matrix;
+import org.apache.sysml.runtime.DMLRuntimeException;
+import org.apache.sysml.runtime.instructions.gpu.context.GPUContext;
+import org.apache.sysml.runtime.instructions.gpu.context.GPUContextPool;
+import org.apache.sysml.runtime.util.ConvolutionUtils;
+import org.apache.sysml.test.utils.TestUtils;
+import org.junit.Test;
+
+/**
+ * Test neural network operations on the GPU
+ */
+public class NeuralNetworkOpTests extends GPUTests {
+
+ private final static String TEST_NAME = "NeuralNetworkOpTests";
+ private final int seed = 42;
+
+ // MAX_OP_SIZE caps the size of a single operand, accounting for both the memory
+ // available on the GPU and the limits imposed by cuDNN (operands must be smaller than 2GB)
+ private static final double MAX_OP_SIZE;
+
+ static {
+ double MAX = 0.5 * 1024 * 1024 * 1024; // 0.5 GB (this HAS to be less than 2GB)
+ try {
+ // Cap the maximum allowed operand size to 1/3rd of the usable GPU memory or MAX, whichever is smaller
+ GPUContext gCtx = GPUContextPool.getFromPool();
+ long availableMemory = gCtx.getAvailableMemory();
+ double averageMemoryPerOperand = availableMemory / 3.0;
+ MAX_OP_SIZE = Math.min(averageMemoryPerOperand, MAX);
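+ // e.g., with ~4.5 GB free on the device, availableMemory / 3 = 1.5 GB exceeds MAX, so MAX_OP_SIZE stays at 0.5 GB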
+ GPUContextPool.returnToPool(gCtx);
+ } catch (DMLRuntimeException e) {
+ throw new RuntimeException(e);
+ }
+
+ }
+
+ // More comprehensive but time-consuming tests
+ /*
+ private final List<Integer> Nlst = Arrays.asList(128, 64, 32);
+ private final List<Integer> Clst = Arrays.asList(30, 20, 3);
+ private final List<Integer> Hlst = Arrays.asList(400, 128, 32);
+ private final List<Integer> Wlst = Arrays.asList(400, 128, 32);
+ private final List<Integer> Klst = Arrays.asList(30, 20, 10);
+ private final List<Integer> Rlst = Arrays.asList(128, 63, 4);
+ private final List<Integer> Slst = Arrays.asList(128, 63, 4);
+ private final List<Integer> strideHeightLst = Arrays.asList(9, 3);
+ private final List<Integer> strideWidthLst = Arrays.asList(9, 3);
+ private final List<Integer> padHeightLst = Arrays.asList(3, 1);
+ private final List<Integer> padWidthLst = Arrays.asList(3, 1);
+ private final List<Double> sparsitylst = Arrays.asList(1.0); // Only test for dense
+ */
+
+ private final List<Integer> Nlst = Arrays.asList(128, 64);
+ private final List<Integer> Clst = Arrays.asList(30, 3);
+ private final List<Integer> Hlst = Arrays.asList(256, 64);
+ private final List<Integer> Wlst = Arrays.asList(256, 64);
+ private final List<Integer> Klst = Arrays.asList(30, 20);
+ private final List<Integer> Rlst = Arrays.asList(128, 3, 1);
+ private final List<Integer> Slst = Arrays.asList(128, 3, 1);
+ private final List<Integer> strideHeightLst = Arrays.asList(9, 1);
+ private final List<Integer> strideWidthLst = Arrays.asList(9, 1);
+ private final List<Integer> padHeightLst = Arrays.asList(3, 1);
+ private final List<Integer> padWidthLst = Arrays.asList(3, 1);
+ private final List<Double> sparsitylst = Arrays.asList(1.0); // Only test for dense
+
+ @Override
+ public void setUp() {
+ TestUtils.clearAssertionInformation();
+ addTestConfiguration(TEST_DIR, TEST_NAME);
+ getAndLoadTestConfiguration(TEST_NAME);
+ }
+
+ @Override
+ public double getTHRESHOLD() {
+ return 1e-5;
+ }
+
+ @Test
+ public void testConv2d() {
+ String scriptStr = "O = conv2d(image, filter, padding=[padH, padW], stride=[strideH, strideW], input_shape=[N,C,H,W], filter_shape=[K,C,R,S])";
+
+ for (long N : Nlst) {
+ for (long C : Clst) {
+ for (long H : Hlst) {
+ for (long W : Wlst) {
+ for (long K : Klst) {
+ for (long R : Rlst) {
+ for (long S : Slst) {
+ for (long strideH : strideHeightLst) {
+ for (long strideW : strideWidthLst) {
+ for (long padH : padHeightLst) {
+ for (long padW : padWidthLst) {
+ for (double sparsity : sparsitylst) {
+
+ // Make sure ops fit in GPU memory and within constraints of cudnn
+ long imageSize = N * C * H * W * 8L;
+ if (imageSize > MAX_OP_SIZE) // image size
+ continue;
+ long filterSize = K * C * R * S * 8L;
+ if (filterSize > MAX_OP_SIZE) // filter size
+ continue;
+ // filter is smaller than image + padding
+ if (R > (H + padH) || S > (W + padW))
+ continue;
+
+ int P = (int) ConvolutionUtils.getP(H, R, strideH, padH);
+ int Q = (int) ConvolutionUtils.getQ(W, S, strideW, padW);
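+ // P and Q above are the convolution output height/width; getP/getQ presumably
+ // implement the standard formula P = (H + 2*padH - R) / strideH + 1 (and likewise for Q)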
+
+ long doutSize = N * K * P * Q * 8L;
+ if (doutSize > MAX_OP_SIZE) // dout/output size
+ continue;
+
+ double imageSizeInMB = imageSize / (1024.0 * 1024.0);
+ double filterSizeInMB = filterSize / (1024.0 * 1024.0);
+ double doutSizeInMB = doutSize / (1024.0 * 1024.0);
+ System.out
+ .format("conv2d, image[%d,%d,%d,%d](%.1fMB), filter[%d,%d,%d,%d](%.1f), dout[%d,%d,%d,%d](%.1fMB), stride[%d,%d], padding[%d,%d]",
+ N, C, H, W, imageSizeInMB, N, C, R, S,
+ filterSizeInMB, N, K, P, Q, doutSizeInMB,
+ strideH, strideW, padH, padW);
+ Matrix image = generateInputMatrix(spark, (int) N,
+ (int) (C * H * W), sparsity, seed);
+ Matrix filter = generateInputMatrix(spark, (int) K,
+ (int) (C * R * S), sparsity, seed);
+ HashMap<String, Object> inputs = new HashMap<>();
+ inputs.put("N", N);
+ inputs.put("C", C);
+ inputs.put("H", H);
+ inputs.put("W", W);
+ inputs.put("K", K);
+ inputs.put("R", R);
+ inputs.put("S", S);
+ inputs.put("strideH", strideH);
+ inputs.put("strideW", strideW);
+ inputs.put("padH", padH);
+ inputs.put("padW", padW);
+ inputs.put("image", image);
+ inputs.put("filter", filter);
+ List<Object> outCPU = runOnCPU(spark, scriptStr, inputs,
+ Arrays.asList("O"));
+ List<Object> outGPU = runOnGPU(spark, scriptStr, inputs,
+ Arrays.asList("O"));
+ assertHeavyHitterPresent("gpu_conv2d");
+ assertEqualObjects(outCPU.get(0), outGPU.get(0));
+ clearGPUMemory();
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+
+ @Test
+ public void testConv2dBackwardFilter() {
+ String scriptStr = "O = conv2d_backward_filter(image, dout, padding=[padH, padW], stride=[strideH, strideW], input_shape=[N,C,H,W], filter_shape=[K,C,R,S])";
+
+ for (long N : Nlst) {
+ for (long C : Clst) {
+ for (long H : Hlst) {
+ for (long W : Wlst) {
+ for (long K : Klst) {
+ for (long R : Rlst) {
+ for (long S : Slst) {
+ for (long strideH : strideHeightLst) {
+ for (long strideW : strideWidthLst) {
+ for (long padH : padHeightLst) {
+ for (long padW : padWidthLst) {
+ for (double sparsity : sparsitylst) {
+
+ // filter is smaller than image + padding
+ if (R > (H + padH) || S > (W + padW))
+ continue;
+
+ // Make sure ops fit in GPU memory and within constraints of cudnn
+ long imageSize = N * C * H * W * 8L;
+ if (imageSize > MAX_OP_SIZE) // image size
+ continue;
+ long filterSize = K * C * R * S * 8L;
+ if (filterSize > MAX_OP_SIZE) // filter size
+ continue;
+
+ int P = (int) ConvolutionUtils.getP(H, R, strideH, padH);
+ int Q = (int) ConvolutionUtils.getQ(W, S, strideW, padW);
+
+ long doutSize = N * K * P * Q * 8L;
+ if (doutSize > MAX_OP_SIZE) // dout/output size
+ continue;
+
+ double imageSizeInMB = imageSize / (1024.0 * 1024.0);
+ double filterSizeInMB = filterSize / (1024.0 * 1024.0);
+ double doutSizeInMB = doutSize / (1024.0 * 1024.0);
+ System.out
+ .format("conv2d_backward_filter, image[%d,%d,%d,%d](%.1fMB), filter[%d,%d,%d,%d](%.1f), dout[%d,%d,%d,%d](%.1fMB), stride[%d,%d], padding[%d,%d]",
+ N, C, H, W, imageSizeInMB, N, C, R, S,
+ filterSizeInMB, N, K, P, Q, doutSizeInMB,
+ strideH, strideW, padH, padW);
+ Matrix image = generateInputMatrix(spark, (int) N,
+ (int) (C * H * W), sparsity, seed);
+ Matrix dout = generateInputMatrix(spark, (int) N,
+ (int) (K * P * Q), sparsity, seed);
+ HashMap<String, Object> inputs = new HashMap<>();
+ inputs.put("N", N);
+ inputs.put("C", C);
+ inputs.put("H", H);
+ inputs.put("W", W);
+ inputs.put("K", K);
+ inputs.put("R", R);
+ inputs.put("S", S);
+ inputs.put("strideH", strideH);
+ inputs.put("strideW", strideW);
+ inputs.put("padH", padH);
+ inputs.put("padW", padW);
+ inputs.put("image", image);
+ inputs.put("dout", dout);
+ List<Object> outCPU = runOnCPU(spark, scriptStr, inputs,
+ Arrays.asList("O"));
+ List<Object> outGPU = runOnGPU(spark, scriptStr, inputs,
+ Arrays.asList("O"));
+ assertHeavyHitterPresent("gpu_conv2d_backward_filter");
+ assertEqualObjects(outCPU.get(0), outGPU.get(0));
+ clearGPUMemory();
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+
+ @Test
+ public void testConv2dBackwardData() {
+ String scriptStr = "O = conv2d_backward_data(filter, dout, padding=[padH, padW], stride=[strideH, strideW], input_shape=[N,C,H,W], filter_shape=[K,C,R,S])";
+
+ for (long N : Nlst) {
+ for (long C : Clst) {
+ for (long H : Hlst) {
+ for (long W : Wlst) {
+ for (long K : Klst) {
+ for (long R : Rlst) {
+ for (long S : Slst) {
+ for (long strideH : strideHeightLst) {
+ for (long strideW : strideWidthLst) {
+ for (long padH : padHeightLst) {
+ for (long padW : padWidthLst) {
+ for (double sparsity : sparsitylst) {
+
+ // filter is smaller than image + padding
+ if (R > (H + padH) || S > (W + padW))
+ continue;
+
+ // Make sure ops fit in GPU memory and within constraints of cudnn
+ long imageSize = N * C * H * W * 8L;
+ if (imageSize > MAX_OP_SIZE) // image size
+ continue;
+ long filterSize = K * C * R * S * 8L;
+ if (filterSize > MAX_OP_SIZE) // filter size
+ continue;
+
+ int P = (int) ConvolutionUtils.getP(H, R, strideH, padH);
+ int Q = (int) ConvolutionUtils.getQ(W, S, strideW, padW);
+
+ long doutSize = N * K * P * Q * 8L;
+ if (doutSize > MAX_OP_SIZE) // dout/output size
+ continue;
+
+ double imageSizeInMB = imageSize / (1024.0 * 1024.0);
+ double filterSizeInMB = filterSize / (1024.0 * 1024.0);
+ double doutSizeInMB = doutSize / (1024.0 * 1024.0);
+ System.out
+ .format("conv2d_backward_data, image[%d,%d,%d,%d](%.1fMB), filter[%d,%d,%d,%d](%.1f), dout[%d,%d,%d,%d](%.1fMB), stride[%d,%d], padding[%d,%d]",
+ N, C, H, W, imageSizeInMB, N, C, R, S,
+ filterSizeInMB, N, K, P, Q, doutSizeInMB,
+ strideH, strideW, padH, padW);
+
+ Matrix filter = generateInputMatrix(spark, (int) K,
+ (int) (C * R * S), sparsity, seed);
+ Matrix dout = generateInputMatrix(spark, (int) N,
+ (int) (K * P * Q), sparsity, seed);
+ HashMap<String, Object> inputs = new HashMap<>();
+ inputs.put("N", N);
+ inputs.put("C", C);
+ inputs.put("H", H);
+ inputs.put("W", W);
+ inputs.put("K", K);
+ inputs.put("R", R);
+ inputs.put("S", S);
+ inputs.put("strideH", strideH);
+ inputs.put("strideW", strideW);
+ inputs.put("padH", padH);
+ inputs.put("padW", padW);
+ inputs.put("filter", filter);
+ inputs.put("dout", dout);
+ List<Object> outCPU = runOnCPU(spark, scriptStr, inputs,
+ Arrays.asList("O"));
+ List<Object> outGPU = runOnGPU(spark, scriptStr, inputs,
+ Arrays.asList("O"));
+ assertHeavyHitterPresent("gpu_conv2d_backward_data");
+ assertEqualObjects(outCPU.get(0), outGPU.get(0));
+ clearGPUMemory();
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+
+ @Test
+ public void testMaxPool() {
+ String scriptStr = "O = max_pool(image, padding=[padH, padW], stride=[strideH, strideW], input_shape=[N,C,H,W], pool_size=[R,S])";
+
+ for (long N : Nlst) {
+ for (long C : Clst) {
+ for (long H : Hlst) {
+ for (long W : Wlst) {
+ for (long R : Rlst) {
+ for (long S : Slst) {
+ for (long strideH : strideHeightLst) {
+ for (long strideW : strideWidthLst) {
+ for (long padH : padHeightLst) {
+ for (long padW : padWidthLst) {
+ for (double sparsity : sparsitylst) {
+
+ // pool is smaller than image + padding
+ if (R > (H + padH) || S > (W + padW))
+ continue;
+
+ // Make sure ops fit in GPU memory and within constraints of cudnn
+ long imageSize = N * C * H * W * 8L;
+ if (imageSize > MAX_OP_SIZE) // image size
+ continue;
+ long poolSize = R * S * 8L;
+ if (poolSize > MAX_OP_SIZE) // pool size
+ continue;
+
+ int P = (int) ConvolutionUtils.getP(H, R, strideH, padH);
+ int Q = (int) ConvolutionUtils.getQ(W, S, strideW, padW);
+
+ long doutSize = N * C * P * Q * 8L;
+ if (doutSize > MAX_OP_SIZE) // dout/output size
+ continue;
+
+ double imageSizeInMB = imageSize / (1024.0 * 1024.0);
+ double poolSizeInMB = poolSize / (1024.0 * 1024.0);
+ double doutSizeInMB = doutSize / (1024.0 * 1024.0);
+ System.out
+ .format("max_pool, image[%d,%d,%d,%d](%.1fMB), pool[%d,%d](%.1f), dout[%d,%d,%d,%d](%.1fMB), stride[%d,%d], padding[%d,%d]",
+ N, C, H, W, imageSizeInMB, R, S, poolSizeInMB, N, C,
+ P, Q, doutSizeInMB, strideH, strideW, padH, padW);
+
+ Matrix image = generateInputMatrix(spark, (int) N,
+ (int) (C * H * W), sparsity, seed);
+ HashMap<String, Object> inputs = new HashMap<>();
+ inputs.put("N", N);
+ inputs.put("C", C);
+ inputs.put("H", H);
+ inputs.put("W", W);
+ inputs.put("R", R);
+ inputs.put("S", S);
+ inputs.put("strideH", strideH);
+ inputs.put("strideW", strideW);
+ inputs.put("padH", padH);
+ inputs.put("padW", padW);
+ inputs.put("image", image);
+ List<Object> outCPU = runOnCPU(spark, scriptStr, inputs,
+ Arrays.asList("O"));
+ List<Object> outGPU = runOnGPU(spark, scriptStr, inputs,
+ Arrays.asList("O"));
+ assertHeavyHitterPresent("gpu_maxpooling");
+ assertEqualObjects(outCPU.get(0), outGPU.get(0));
+ clearGPUMemory();
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+
+ @Test
+ public void testMaxPoolBackward() {
+ String scriptStr = "O = max_pool_backward(image, dout, padding=[padH, padW], stride=[strideH, strideW], input_shape=[N,C,H,W], pool_size=[R,S])";
+
+ for (long N : Nlst) {
+ for (long C : Clst) {
+ for (long H : Hlst) {
+ for (long W : Wlst) {
+ for (long R : Rlst) {
+ for (long S : Slst) {
+ for (long strideH : strideHeightLst) {
+ for (long strideW : strideWidthLst) {
+ for (long padH : padHeightLst) {
+ for (long padW : padWidthLst) {
+ for (double sparsity : sparsitylst) {
+
+ // pool is smaller than image + padding
+ if (R > (H + padH) || S > (W + padW))
+ continue;
+
+ // Make sure ops fit in GPU memory and within constraints of cudnn
+ long imageSize = N * C * H * W * 8L;
+ if (imageSize > MAX_OP_SIZE) // image size
+ continue;
+ long poolSize = R * S * 8L;
+ if (poolSize > MAX_OP_SIZE) // pool size
+ continue;
+
+ int P = (int) ConvolutionUtils.getP(H, R, strideH, padH);
+ int Q = (int) ConvolutionUtils.getQ(W, S, strideW, padW);
+
+ long doutSize = N * C * P * Q * 8L;
+ if (doutSize > MAX_OP_SIZE) // dout/output size
+ continue;
+
+ double imageSizeInMB = imageSize / (1024.0 * 1024.0);
+ double poolSizeInMB = poolSize / (1024.0 * 1024.0);
+ double doutSizeInMB = doutSize / (1024.0 * 1024.0);
+ System.out
+ .format("max_pool_backward, image[%d,%d,%d,%d](%.1fMB), pool[%d,%d](%.1f), dout[%d,%d,%d,%d](%.1fMB), stride[%d,%d], padding[%d,%d]",
+ N, C, H, W, imageSizeInMB, R, S, poolSizeInMB, N, C,
+ P, Q, doutSizeInMB, strideH, strideW, padH, padW);
+
+ Matrix image = generateInputMatrix(spark, (int) N,
+ (int) (C * H * W), sparsity, seed);
+ Matrix dout = generateInputMatrix(spark, (int) N, (int) (C * P * Q),
+ sparsity, seed);
+ HashMap<String, Object> inputs = new HashMap<>();
+ inputs.put("N", N);
+ inputs.put("C", C);
+ inputs.put("H", H);
+ inputs.put("W", W);
+ inputs.put("R", R);
+ inputs.put("S", S);
+ inputs.put("strideH", strideH);
+ inputs.put("strideW", strideW);
+ inputs.put("padH", padH);
+ inputs.put("padW", padW);
+ inputs.put("image", image);
+ inputs.put("dout", dout);
+ List<Object> outCPU = runOnCPU(spark, scriptStr, inputs,
+ Arrays.asList("O"));
+ List<Object> outGPU = runOnGPU(spark, scriptStr, inputs,
+ Arrays.asList("O"));
+ assertHeavyHitterPresent("gpu_maxpooling_backward");
+ assertEqualObjects(outCPU.get(0), outGPU.get(0));
+ clearGPUMemory();
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+
+}
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/772fb588/src/test/java/org/apache/sysml/test/gpu/ReorgOpTests.java
----------------------------------------------------------------------
diff --git a/src/test/java/org/apache/sysml/test/gpu/ReorgOpTests.java b/src/test/java/org/apache/sysml/test/gpu/ReorgOpTests.java
new file mode 100644
index 0000000..b5b71f8
--- /dev/null
+++ b/src/test/java/org/apache/sysml/test/gpu/ReorgOpTests.java
@@ -0,0 +1,70 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.sysml.test.gpu;
+
+import java.util.Arrays;
+import java.util.HashMap;
+import java.util.List;
+
+import org.apache.sysml.api.mlcontext.Matrix;
+import org.apache.sysml.test.utils.TestUtils;
+import org.junit.Test;
+
+/**
+ * Tests for GPU transpose
+ */
+public class ReorgOpTests extends GPUTests {
+
+ private final static String TEST_NAME = "ReorgOpTests";
+ private final int[] rowSizes = new int[] { 1, 64, 130, 1024, 2049 };
+ private final int[] columnSizes = new int[] { 1, 64, 130, 1024, 2049 };
+ private final double[] sparsities = new double[] { 0.0, 0.03, 0.3, 0.9 };
+ private final int seed = 42;
+
+ @Override
+ public void setUp() {
+ TestUtils.clearAssertionInformation();
+ addTestConfiguration(TEST_DIR, TEST_NAME);
+ getAndLoadTestConfiguration(TEST_NAME);
+ }
+
+ @Test
+ public void transposeTest() {
+ String scriptStr = "out = t(in1)";
+
+ for (int i = 0; i < rowSizes.length; i++) {
+ for (int j = 0; j < columnSizes.length; j++) {
+ for (int k = 0; k < sparsities.length; k++) {
+ int m = rowSizes[i];
+ int n = columnSizes[j];
+ double sparsity = sparsities[k];
+ HashMap<String, Object> inputs = new HashMap<>();
+ Matrix in1 = generateInputMatrix(spark, m, n, sparsity, seed);
+ inputs.put("in1", in1);
+ List<Object> cpuOuts = runOnCPU(spark, scriptStr, inputs, Arrays.asList("out"));
+ List<Object> gpuOuts = runOnGPU(spark, scriptStr, inputs, Arrays.asList("out"));
+ //assertHeavyHitterPresent("gpu_r'");
+ assertEqualObjects(cpuOuts.get(0), gpuOuts.get(0));
+ }
+ }
+ }
+
+ }
+}
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/772fb588/src/test/java/org/apache/sysml/test/gpu/ScalarMatrixElementwiseOpTests.java
----------------------------------------------------------------------
diff --git a/src/test/java/org/apache/sysml/test/gpu/ScalarMatrixElementwiseOpTests.java b/src/test/java/org/apache/sysml/test/gpu/ScalarMatrixElementwiseOpTests.java
new file mode 100644
index 0000000..65e6365
--- /dev/null
+++ b/src/test/java/org/apache/sysml/test/gpu/ScalarMatrixElementwiseOpTests.java
@@ -0,0 +1,131 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.sysml.test.gpu;
+
+import java.util.Arrays;
+import java.util.HashMap;
+import java.util.List;
+
+import org.apache.sysml.api.mlcontext.Matrix;
+import org.apache.sysml.test.utils.TestUtils;
+import org.junit.Ignore;
+import org.junit.Test;
+
+/**
+ * Tests scalar-matrix element-wise operations on the GPU
+ */
+public class ScalarMatrixElementwiseOpTests extends GPUTests {
+
+ private final static String TEST_NAME = "ScalarMatrixElementwiseOpTests";
+
+ private final int[] rowSizes = new int[] { 1, 64, 130, 2049 };
+ private final int[] columnSizes = new int[] { 1, 64, 130, 2049 };
+ private final double[] sparsities = new double[] { 0.0, 0.03, 0.3, 0.9 };
+ private final int seed = 42;
+
+ @Override
+ public void setUp() {
+ TestUtils.clearAssertionInformation();
+ addTestConfiguration(TEST_DIR, TEST_NAME);
+ getAndLoadTestConfiguration(TEST_NAME);
+ }
+
+ @Test
+ public void testPlusRightScalar() {
+ runScalarMatrixElementWiseTests("O = X + scalar", "X", "scalar", "O", new double[] { 0.0, 0.5, 20.0 }, "gpu_+");
+ }
+
+ @Test
+ public void testPlusLeftScalar() {
+ runScalarMatrixElementWiseTests("O = scalar + X", "X", "scalar", "O", new double[] { 0.0, 0.5, 20.0 }, "gpu_+");
+ }
+
+ @Test
+ public void testMinusRightScalar() {
+ runScalarMatrixElementWiseTests("O = X - scalar", "X", "scalar", "O", new double[] { 0.0, 0.5, 1.0 }, "gpu_-");
+ }
+
+ @Test
+ public void testMinusLeftScalar() {
+ runScalarMatrixElementWiseTests("O = scalar - X", "X", "scalar", "O", new double[] { 0.0, 0.5, 1.0 }, "gpu_-");
+ }
+
+ @Test
+ public void testMultRightScalar() {
+ runScalarMatrixElementWiseTests("O = X * scalar", "X", "scalar", "O", new double[] { 0.0, 0.5, 2.0 }, "gpu_*");
+ }
+
+ @Test
+ public void testMultLeftScalar() {
+ runScalarMatrixElementWiseTests("O = scalar * X", "X", "scalar", "O", new double[] { 0.0, 0.5, 2.0 }, "gpu_*");
+ }
+
+ @Test
+ public void testDivide() {
+ runScalarMatrixElementWiseTests("O = X / scalar", "X", "scalar", "O", new double[] { 0.0, 0.5, 5.0 }, "gpu_/");
+ }
+
+ // ****************************************************************
+ // ************************ IGNORED TEST **************************
+ // FIXME: There is a bug in CPU "^" when A ^ B is executed and both A and B are all zeroes
+ @Ignore
+ @Test
+ public void testPow() {
+ runScalarMatrixElementWiseTests("O = X ^ scalar", "X", "scalar", "O", new double[] { 0.0, 2.0, 10.0 }, "gpu_^");
+ }
+
+ /**
+ * Runs a simple scalar-matrix elementwise op test
+ *
+ * @param scriptStr the script string
+ * @param inputMatrix name of the matrix input in the script string
+ * @param inputScalar name of the scalar input in the script string
+ * @param output name of the output variable in the script string
+ * @param scalars array of scalars for which to run this test
+ * @param heavyHitterOpCode the string printed for the op heavy hitter when executed on GPU
+ */
+ private void runScalarMatrixElementWiseTests(String scriptStr, String inputMatrix, String inputScalar,
+ String output, double[] scalars, String heavyHitterOpCode) {
+ for (int i = 0; i < rowSizes.length; i++) {
+ for (int j = 0; j < columnSizes.length; j++) {
+ for (int k = 0; k < sparsities.length; k++) {
+ for (int l = 0; l < scalars.length; l++) {
+ int m = rowSizes[i];
+ int n = columnSizes[j];
+ double sparsity = sparsities[k];
+ double scalar = scalars[l];
+ System.out.println(
+ "Matrix is of size [" + m + ", " + n + "], sparsity = " + sparsity + ", scalar = "
+ + scalar);
+ Matrix X = generateInputMatrix(spark, m, n, sparsity, seed);
+ HashMap<String, Object> inputs = new HashMap<>();
+ inputs.put(inputMatrix, X);
+ inputs.put(inputScalar, scalar);
+ List<Object> cpuOut = runOnCPU(spark, scriptStr, inputs, Arrays.asList(output));
+ List<Object> gpuOut = runOnGPU(spark, scriptStr, inputs, Arrays.asList(output));
+ //assertHeavyHitterPresent(heavyHitterOpCode);
+ assertEqualObjects(cpuOut.get(0), gpuOut.get(0));
+ }
+ }
+ }
+ }
+ }
+
+}
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/772fb588/src/test/java/org/apache/sysml/test/gpu/UnaryOpTests.java
----------------------------------------------------------------------
diff --git a/src/test/java/org/apache/sysml/test/gpu/UnaryOpTests.java b/src/test/java/org/apache/sysml/test/gpu/UnaryOpTests.java
new file mode 100644
index 0000000..84b1f73
--- /dev/null
+++ b/src/test/java/org/apache/sysml/test/gpu/UnaryOpTests.java
@@ -0,0 +1,113 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.sysml.test.gpu;
+
+import org.apache.sysml.test.utils.TestUtils;
+import org.junit.Test;
+
+/**
+ * Unit tests for Unary ops on GPU
+ */
+public class UnaryOpTests extends UnaryOpTestsBase {
+
+ private final static String TEST_NAME = "UnaryOpTests";
+
+ @Override
+ public void setUp() {
+ TestUtils.clearAssertionInformation();
+ addTestConfiguration(TEST_DIR, TEST_NAME);
+ getAndLoadTestConfiguration(TEST_NAME);
+ }
+
+ @Test
+ public void testSin() throws Exception {
+ testSimpleUnaryOpMatrixOutput("sin", "gpu_sin");
+ }
+
+ @Test
+ public void testCos() throws Exception {
+ testSimpleUnaryOpMatrixOutput("cos", "gpu_cos");
+ }
+
+ @Test
+ public void testTan() throws Exception {
+ testSimpleUnaryOpMatrixOutput("tan", "gpu_tan");
+ }
+
+ @Test
+ public void testAsin() throws Exception {
+ testSimpleUnaryOpMatrixOutput("asin", "gpu_asin");
+ }
+
+ @Test
+ public void testAcos() throws Exception {
+ testSimpleUnaryOpMatrixOutput("acos", "gpu_acos");
+ }
+
+ @Test
+ public void testAtan() throws Exception {
+ testSimpleUnaryOpMatrixOutput("atan", "gpu_atan");
+ }
+
+ @Test
+ public void testExp() throws Exception {
+ testSimpleUnaryOpMatrixOutput("exp", "gpu_exp");
+ }
+
+ @Test
+ public void testLog() throws Exception {
+ testSimpleUnaryOpMatrixOutput("log", "gpu_log");
+ }
+
+ @Test
+ public void testSqrt() throws Exception {
+ testSimpleUnaryOpMatrixOutput("sqrt", "gpu_sqrt");
+ }
+
+ @Test
+ public void testAbs() throws Exception {
+ testSimpleUnaryOpMatrixOutput("abs", "gpu_abs");
+ }
+
+ @Test
+ public void testRound() throws Exception {
+ testSimpleUnaryOpMatrixOutput("round", "gpu_round");
+ }
+
+ @Test
+ public void testFloor() throws Exception {
+ testSimpleUnaryOpMatrixOutput("sqrt", "gpu_floor");
+ }
+
+ @Test
+ public void testCeil() throws Exception {
+ testSimpleUnaryOpMatrixOutput("ceil", "gpu_ceil");
+ }
+
+ @Test
+ public void testSign() throws Exception {
+ testSimpleUnaryOpMatrixOutput("sign", "gpu_sign");
+ }
+
+ @Test
+ public void testSelp() throws Exception {
+ testUnaryOpMatrixOutput("out = max(in1, 0)", "gpu_selp", "in1", "out");
+ }
+}
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/772fb588/src/test/java/org/apache/sysml/test/gpu/UnaryOpTestsBase.java
----------------------------------------------------------------------
diff --git a/src/test/java/org/apache/sysml/test/gpu/UnaryOpTestsBase.java b/src/test/java/org/apache/sysml/test/gpu/UnaryOpTestsBase.java
new file mode 100644
index 0000000..0051dd4
--- /dev/null
+++ b/src/test/java/org/apache/sysml/test/gpu/UnaryOpTestsBase.java
@@ -0,0 +1,106 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.sysml.test.gpu;
+
+import java.util.Arrays;
+import java.util.HashMap;
+import java.util.List;
+
+import org.apache.sysml.api.mlcontext.Matrix;
+
+/**
+ * Abstract class for all Unary Op tests
+ */
+public abstract class UnaryOpTestsBase extends GPUTests {
+
+ // Sets of row and column sizes and sparsities used to test unary ops
+ private final int[] rowSizes = new int[] { 2049, 1024, 140, 64, 1 };
+ private final int[] columnSizes = new int[] { 2049, 1024, 140, 64, 1 };
+ private final double[] sparsities = new double[] { 0.9, 0.3, 0.03, 0.0 };
+ private final int seed = 42;
+
+ /**
+ * Tests unary ops with a variety of matrix shapes and sparsities.
+ * Test is skipped for blocks of size 1x1.
+ *
+ * @param function name of the dml builtin unary op
+ * @param heavyHitterOpCode the string printed for the unary op heavy hitter when executed on gpu
+ */
+ protected void testSimpleUnaryOpMatrixOutput(String function, String heavyHitterOpCode) {
+ String scriptStr = "out = " + function + "(in1)";
+ testUnaryOpMatrixOutput(scriptStr, heavyHitterOpCode, "in1", "out");
+ }
+
+ /**
+ * Tests slightly more involved unary ops with a variety of matrix shapes and sparsities.
+ * Test is skipped for blocks of size 1x1.
+ *
+ * @param scriptStr script string
+ * @param heavyHitterOpCode the string printed for the unary op heavy hitter when executed on gpu
+ * @param inStr name of input variable in provided script string
+ * @param outStr name of output variable in script string
+ */
+ protected void testUnaryOpMatrixOutput(String scriptStr, String heavyHitterOpCode, String inStr, String outStr) {
+ int[] rows = rowSizes;
+ int[] columns = columnSizes;
+ double[] sparsities = this.sparsities;
+ int seed = this.seed;
+
+ for (int i = 0; i < rows.length; i++) {
+ for (int j = 0; j < columns.length; j++) {
+ for (int k = 0; k < sparsities.length; k++) {
+ int row = rows[i];
+ int column = columns[j];
+ double sparsity = sparsities[k];
+ // Skip the case of a scalar unary op
+ if (row == 1 && column == 1)
+ continue;
+
+ testUnaryOpMatrixOutput(scriptStr, heavyHitterOpCode, inStr, outStr, seed, row, column, sparsity);
+ }
+ }
+ }
+ }
+
+ /**
+ * Tests a single unary op with inputs and outputs of the specified size and sparsity
+ *
+ * @param scriptStr script string
+ * @param heavyHitterOpCode the string printed for the unary op heavy hitter when executed on gpu
+ * @param inStr name of input variable in provided script string
+ * @param outStr name of output variable in script string
+ * @param seed seed for the random number generator for the random input matrix
+ * @param row number of rows of input matrix
+ * @param column number of columns of input matrix
+ * @param sparsity sparsity of the input matrix
+ */
+ public void testUnaryOpMatrixOutput(String scriptStr, String heavyHitterOpCode, String inStr, String outStr,
+ int seed, int row, int column, double sparsity) {
+ System.out.println("Matrix of size [" + row + ", " + column + "], sparsity = " + sparsity);
+ Matrix in1 = generateInputMatrix(spark, row, column, sparsity, seed);
+ HashMap<String, Object> inputs = new HashMap<>();
+ inputs.put(inStr, in1);
+ List<Object> outCPU = runOnCPU(spark, scriptStr, inputs, Arrays.asList(outStr));
+ List<Object> outGPU = runOnGPU(spark, scriptStr, inputs, Arrays.asList(outStr));
+ //assertHeavyHitterPresent(heavyHitterOpCode);
+ assertEqualObjects(outCPU.get(0), outGPU.get(0));
+ }
+
+}
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/772fb588/src/test_suites/java/org/apache/sysml/test/integration/gpu/ZPackageSuite.java
----------------------------------------------------------------------
diff --git a/src/test_suites/java/org/apache/sysml/test/integration/gpu/ZPackageSuite.java b/src/test_suites/java/org/apache/sysml/test/integration/gpu/ZPackageSuite.java
new file mode 100644
index 0000000..d5e3bc0
--- /dev/null
+++ b/src/test_suites/java/org/apache/sysml/test/integration/gpu/ZPackageSuite.java
@@ -0,0 +1,46 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.sysml.test.integration.gpu;
+
+import org.apache.sysml.test.gpu.AggregateUnaryOpTests;
+import org.apache.sysml.test.gpu.BinaryOpTests;
+import org.apache.sysml.test.gpu.MatrixMatrixElementWiseOpTests;
+import org.apache.sysml.test.gpu.MatrixMultiplicationOpTest;
+import org.apache.sysml.test.gpu.NeuralNetworkOpTests;
+import org.apache.sysml.test.gpu.ReorgOpTests;
+import org.apache.sysml.test.gpu.ScalarMatrixElementwiseOpTests;
+import org.apache.sysml.test.gpu.UnaryOpTests;
+import org.junit.runner.RunWith;
+import org.junit.runners.Suite;
+import org.junit.runners.Suite.SuiteClasses;
+
+@RunWith(Suite.class) @SuiteClasses({
+ BinaryOpTests.class,
+ ScalarMatrixElementwiseOpTests.class,
+ MatrixMatrixElementWiseOpTests.class,
+ ReorgOpTests.class,
+ AggregateUnaryOpTests.class,
+ UnaryOpTests.class,
+ MatrixMultiplicationOpTest.class,
+ NeuralNetworkOpTests.class,
+})
+public class ZPackageSuite {
+
+}
[2/2] incubator-systemml git commit: [SYSTEMML-1625] GPU Unit Tests
(and GPU row/col variance bug fix)
Posted by na...@apache.org.
[SYSTEMML-1625] GPU Unit Tests (and GPU row/col variance bug fix)
- Documented random matrix generation
- GPU unit tests using MLContext; each compares CPU output to GPU output (see the sketch below)
- Pseudo-unit tests for GPU implementations of
unary ops, unary aggregate ops, transpose, elementwise ops,
matrix multiplication ops, builtin ops & NN ops
- Fixed crucial bug in col/row var
- gpuTests profile for GPU tests (mvn verify -PgpuTests)
- Updated IntelliJ style for import order
Closes #513
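
For reference, the pseudo-unit tests in this commit all follow the same pattern
via the GPUTests base class: generate a random input matrix, run the identical
DML script through MLContext once on CPU and once on GPU, and require the
outputs to match. A minimal sketch of that pattern, assuming only the helper
signatures visible in this diff (generateInputMatrix, runOnCPU, runOnGPU,
assertEqualObjects); the test name and script here are hypothetical, not part
of this commit:

    @Test
    public void testReluLikeOp() {
        String scriptStr = "out = max(in1, 0)";
        // 130x64 input, 30% dense, fixed seed for reproducibility
        Matrix in1 = generateInputMatrix(spark, 130, 64, 0.3, 42);
        HashMap<String, Object> inputs = new HashMap<>();
        inputs.put("in1", in1);
        List<Object> outCPU = runOnCPU(spark, scriptStr, inputs, Arrays.asList("out"));
        List<Object> outGPU = runOnGPU(spark, scriptStr, inputs, Arrays.asList("out"));
        // CPU and GPU results must agree within the tolerance from getTHRESHOLD()
        assertEqualObjects(outCPU.get(0), outGPU.get(0));
    }

The GPU suites are only picked up by the build when the gpuTests profile is
active (mvn verify -PgpuTests), so machines without a CUDA-capable GPU skip them.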
Project: http://git-wip-us.apache.org/repos/asf/incubator-systemml/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-systemml/commit/772fb588
Tree: http://git-wip-us.apache.org/repos/asf/incubator-systemml/tree/772fb588
Diff: http://git-wip-us.apache.org/repos/asf/incubator-systemml/diff/772fb588
Branch: refs/heads/master
Commit: 772fb588324916e4225bb6e1970ca6a8f87eb414
Parents: ceeec4b
Author: Nakul Jindal <na...@gmail.com>
Authored: Wed May 31 21:54:13 2017 -0700
Committer: Nakul Jindal <na...@gmail.com>
Committed: Wed May 31 21:54:13 2017 -0700
----------------------------------------------------------------------
dev/code-style/systemml-style-intellij.xml | 18 +
pom.xml | 10 +
.../apache/sysml/api/ScriptExecutorUtils.java | 1 +
.../context/ExecutionContext.java | 3 +
.../instructions/GPUInstructionParser.java | 120 +++--
.../instructions/gpu/context/GPUContext.java | 118 +++--
.../instructions/gpu/context/GPUObject.java | 55 +-
.../instructions/gpu/context/JCudaKernels.java | 3 +-
.../runtime/matrix/data/LibMatrixCUDA.java | 112 ++--
.../runtime/matrix/data/LibMatrixDatagen.java | 78 +--
.../matrix/data/RandomMatrixGenerator.java | 123 ++++-
.../sysml/test/gpu/AggregateUnaryOpTests.java | 133 +++++
.../apache/sysml/test/gpu/BinaryOpTests.java | 85 ++++
.../org/apache/sysml/test/gpu/GPUTests.java | 250 +++++++++
.../gpu/MatrixMatrixElementWiseOpTests.java | 271 ++++++++++
.../test/gpu/MatrixMultiplicationOpTest.java | 190 +++++++
.../sysml/test/gpu/NeuralNetworkOpTests.java | 508 +++++++++++++++++++
.../org/apache/sysml/test/gpu/ReorgOpTests.java | 70 +++
.../gpu/ScalarMatrixElementwiseOpTests.java | 131 +++++
.../org/apache/sysml/test/gpu/UnaryOpTests.java | 113 +++++
.../apache/sysml/test/gpu/UnaryOpTestsBase.java | 106 ++++
.../test/integration/gpu/ZPackageSuite.java | 46 ++
22 files changed, 2308 insertions(+), 236 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/772fb588/dev/code-style/systemml-style-intellij.xml
----------------------------------------------------------------------
diff --git a/dev/code-style/systemml-style-intellij.xml b/dev/code-style/systemml-style-intellij.xml
index 248c600..1ad3209 100644
--- a/dev/code-style/systemml-style-intellij.xml
+++ b/dev/code-style/systemml-style-intellij.xml
@@ -16,7 +16,25 @@
* specific language governing permissions and limitations
* under the License.
-->
+
<code_scheme name="SystemML Format">
+ <option name="CLASS_COUNT_TO_USE_IMPORT_ON_DEMAND" value="999" />
+ <option name="NAMES_COUNT_TO_USE_IMPORT_ON_DEMAND" value="999" />
+ <option name="IMPORT_LAYOUT_TABLE">
+ <value>
+ <package name="" withSubpackages="true" static="false" />
+ <emptyLine />
+ <package name="java" withSubpackages="true" static="false" />
+ <emptyLine />
+ <package name="javax" withSubpackages="true" static="false" />
+ <emptyLine />
+ <package name="org" withSubpackages="true" static="false" />
+ <emptyLine />
+ <package name="com" withSubpackages="true" static="false" />
+ <emptyLine />
+ <package name="" withSubpackages="true" static="true" />
+ </value>
+ </option>
<codeStyleSettings language="JAVA">
<option name="KEEP_LINE_BREAKS" value="false" />
<option name="KEEP_FIRST_COLUMN_COMMENT" value="false" />
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/772fb588/pom.xml
----------------------------------------------------------------------
diff --git a/pom.xml b/pom.xml
index 5ce5576..99e2dec 100644
--- a/pom.xml
+++ b/pom.xml
@@ -436,6 +436,7 @@
<exclude>**/slowtest/**</exclude>
<exclude>**/integration/**</exclude>
<exclude>**/test/unit/**</exclude>
+ <exclude>**/test/gpu/**</exclude>
</excludes>
</configuration>
@@ -478,6 +479,7 @@
-Djava.awt.headless=true</argLine>
<includes>
+ <include>${gpuTestsPath}</include> <!-- Path for GPU integration tests, enabled for gpuTests profile -->
<include>**/integration/applications/**/*Suite.java</include>
<include>**/integration/conversion/*Suite.java</include>
<include>**/integration/functions/data/*Suite.java</include>
@@ -896,6 +898,14 @@
</build>
</profile>
+ <!-- profile to enable running tests on the GPU -->
+ <profile>
+ <id>gpuTests</id>
+ <properties>
+ <gpuTestsPath>**/integration/gpu/**/*Suite.java</gpuTestsPath>
+ </properties>
+ </profile>
+
<profile>
<!-- Can be used to ignore doclint javadoc issues -->
<id>ignore-doclint</id>
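A note on the gpuTests profile added above: the new **/test/gpu/** exclusion keeps the GPU tests out of the default unit-test run, and the ${gpuTestsPath} include only resolves to **/integration/gpu/**/*Suite.java (i.e., the new ZPackageSuite) when the profile is activated explicitly, e.g. by passing -PgpuTests to Maven on a machine with a CUDA-capable GPU.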
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/772fb588/src/main/java/org/apache/sysml/api/ScriptExecutorUtils.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/api/ScriptExecutorUtils.java b/src/main/java/org/apache/sysml/api/ScriptExecutorUtils.java
index f582c36..674a011 100644
--- a/src/main/java/org/apache/sysml/api/ScriptExecutorUtils.java
+++ b/src/main/java/org/apache/sysml/api/ScriptExecutorUtils.java
@@ -94,6 +94,7 @@ public class ScriptExecutorUtils {
rtprog.execute(ec);
} finally { // ensure cleanup/shutdown
if (DMLScript.USE_ACCELERATOR && ec.getGPUContext() != null) {
+ ec.getGPUContext().clearTemporaryMemory();
GPUContextPool.returnToPool(ec.getGPUContext());
}
if (dmlconf.getBooleanValue(DMLConfig.CODEGEN))
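The ordering in the finally-block above appears deliberate: clearTemporaryMemory() is invoked on the GPUContext before it is returned to the pool, presumably so that cached scratch allocations do not accumulate across script executions and the next borrower of the context starts from a clean slate.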
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/772fb588/src/main/java/org/apache/sysml/runtime/controlprogram/context/ExecutionContext.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/controlprogram/context/ExecutionContext.java b/src/main/java/org/apache/sysml/runtime/controlprogram/context/ExecutionContext.java
index 35b4cd1..735f394 100644
--- a/src/main/java/org/apache/sysml/runtime/controlprogram/context/ExecutionContext.java
+++ b/src/main/java/org/apache/sysml/runtime/controlprogram/context/ExecutionContext.java
@@ -271,6 +271,9 @@ public class ExecutionContext {
MatrixObject mo = getMatrixObject(varName);
if( mo.getGPUObject(getGPUContext()) == null ) {
GPUObject newGObj = getGPUContext().createGPUObject(mo);
+ // A lock is added here for the output block so that,
+ // while it is in use, it cannot be deallocated by eviction on the GPU
+ newGObj.addLock();
mo.setGPUObject(getGPUContext(), newGObj);
}
return mo;
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/772fb588/src/main/java/org/apache/sysml/runtime/instructions/GPUInstructionParser.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/instructions/GPUInstructionParser.java b/src/main/java/org/apache/sysml/runtime/instructions/GPUInstructionParser.java
index 443d0eb..e0bcd1b 100644
--- a/src/main/java/org/apache/sysml/runtime/instructions/GPUInstructionParser.java
+++ b/src/main/java/org/apache/sysml/runtime/instructions/GPUInstructionParser.java
@@ -42,81 +42,79 @@ public class GPUInstructionParser extends InstructionParser
// Neural Network Operators
String2GPUInstructionType.put( "relu_backward", GPUINSTRUCTION_TYPE.Convolution);
String2GPUInstructionType.put( "conv2d", GPUINSTRUCTION_TYPE.Convolution);
- String2GPUInstructionType.put( "conv2d_bias_add", GPUINSTRUCTION_TYPE.Convolution);
+ String2GPUInstructionType.put( "conv2d_bias_add", GPUINSTRUCTION_TYPE.Convolution);
String2GPUInstructionType.put( "conv2d_backward_filter", GPUINSTRUCTION_TYPE.Convolution);
String2GPUInstructionType.put( "conv2d_backward_data", GPUINSTRUCTION_TYPE.Convolution);
String2GPUInstructionType.put( "maxpooling", GPUINSTRUCTION_TYPE.Convolution);
String2GPUInstructionType.put( "maxpooling_backward", GPUINSTRUCTION_TYPE.Convolution);
- String2GPUInstructionType.put( "bias_add", GPUINSTRUCTION_TYPE.Convolution);
- String2GPUInstructionType.put( "bias_multiply", GPUINSTRUCTION_TYPE.Convolution);
+ String2GPUInstructionType.put( "bias_add", GPUINSTRUCTION_TYPE.Convolution);
+ String2GPUInstructionType.put( "bias_multiply", GPUINSTRUCTION_TYPE.Convolution);
// Matrix Multiply Operators
- String2GPUInstructionType.put( "ba+*", GPUINSTRUCTION_TYPE.AggregateBinary);
- String2GPUInstructionType.put( "tsmm", GPUINSTRUCTION_TYPE.MMTSJ);
+ String2GPUInstructionType.put( "ba+*", GPUINSTRUCTION_TYPE.AggregateBinary);
+ String2GPUInstructionType.put( "tsmm", GPUINSTRUCTION_TYPE.MMTSJ);
// Reorg/Transpose
- String2GPUInstructionType.put( "r'", GPUINSTRUCTION_TYPE.Reorg);
+ String2GPUInstructionType.put( "r'", GPUINSTRUCTION_TYPE.Reorg);
// Binary Cellwise
- String2GPUInstructionType.put( "+" , GPUINSTRUCTION_TYPE.ArithmeticBinary);
- String2GPUInstructionType.put( "-" , GPUINSTRUCTION_TYPE.ArithmeticBinary);
- String2GPUInstructionType.put( "*" , GPUINSTRUCTION_TYPE.ArithmeticBinary);
- String2GPUInstructionType.put( "/" , GPUINSTRUCTION_TYPE.ArithmeticBinary);
- String2GPUInstructionType.put( "%%" , GPUINSTRUCTION_TYPE.ArithmeticBinary);
- String2GPUInstructionType.put( "%/%" , GPUINSTRUCTION_TYPE.ArithmeticBinary);
- String2GPUInstructionType.put( "^" , GPUINSTRUCTION_TYPE.ArithmeticBinary);
- String2GPUInstructionType.put( "1-*" , GPUINSTRUCTION_TYPE.ArithmeticBinary); //special * case
- String2GPUInstructionType.put( "^2" , GPUINSTRUCTION_TYPE.ArithmeticBinary); //special ^ case
- String2GPUInstructionType.put( "*2" , GPUINSTRUCTION_TYPE.ArithmeticBinary); //special * case
- String2GPUInstructionType.put( "-nz" , GPUINSTRUCTION_TYPE.ArithmeticBinary); //special - case
- String2GPUInstructionType.put( "+*" , GPUINSTRUCTION_TYPE.ArithmeticBinary);
- String2GPUInstructionType.put( "-*" , GPUINSTRUCTION_TYPE.ArithmeticBinary);
+ String2GPUInstructionType.put( "+", GPUINSTRUCTION_TYPE.ArithmeticBinary);
+ String2GPUInstructionType.put( "-", GPUINSTRUCTION_TYPE.ArithmeticBinary);
+ String2GPUInstructionType.put( "*", GPUINSTRUCTION_TYPE.ArithmeticBinary);
+ String2GPUInstructionType.put( "/", GPUINSTRUCTION_TYPE.ArithmeticBinary);
+ //String2GPUInstructionType.put( "%%", GPUINSTRUCTION_TYPE.ArithmeticBinary);
+ //String2GPUInstructionType.put( "%/%", GPUINSTRUCTION_TYPE.ArithmeticBinary);
+ String2GPUInstructionType.put( "^", GPUINSTRUCTION_TYPE.ArithmeticBinary);
+ String2GPUInstructionType.put( "1-*", GPUINSTRUCTION_TYPE.ArithmeticBinary); //special * case
+ String2GPUInstructionType.put( "^2", GPUINSTRUCTION_TYPE.ArithmeticBinary); //special ^ case
+ String2GPUInstructionType.put( "*2", GPUINSTRUCTION_TYPE.ArithmeticBinary); //special * case
+ String2GPUInstructionType.put( "-nz", GPUINSTRUCTION_TYPE.ArithmeticBinary); //special - case
+ String2GPUInstructionType.put( "+*", GPUINSTRUCTION_TYPE.ArithmeticBinary);
+ String2GPUInstructionType.put( "-*", GPUINSTRUCTION_TYPE.ArithmeticBinary);
- // Builtin functions
- String2GPUInstructionType.put( "sel+" , GPUINSTRUCTION_TYPE.BuiltinUnary);
- String2GPUInstructionType.put( "exp" , GPUINSTRUCTION_TYPE.BuiltinUnary);
- String2GPUInstructionType.put( "log" , GPUINSTRUCTION_TYPE.BuiltinUnary);
- String2GPUInstructionType.put( "abs" , GPUINSTRUCTION_TYPE.BuiltinUnary);
- String2GPUInstructionType.put( "sqrt" , GPUINSTRUCTION_TYPE.BuiltinUnary);
- String2GPUInstructionType.put( "round" , GPUINSTRUCTION_TYPE.BuiltinUnary);
- String2GPUInstructionType.put( "floor" , GPUINSTRUCTION_TYPE.BuiltinUnary);
- String2GPUInstructionType.put( "ceil" , GPUINSTRUCTION_TYPE.BuiltinUnary);
- String2GPUInstructionType.put( "sin" , GPUINSTRUCTION_TYPE.BuiltinUnary);
- String2GPUInstructionType.put( "cos" , GPUINSTRUCTION_TYPE.BuiltinUnary);
- String2GPUInstructionType.put( "tan" , GPUINSTRUCTION_TYPE.BuiltinUnary);
- String2GPUInstructionType.put( "asin" , GPUINSTRUCTION_TYPE.BuiltinUnary);
- String2GPUInstructionType.put( "acos" , GPUINSTRUCTION_TYPE.BuiltinUnary);
- String2GPUInstructionType.put( "atan" , GPUINSTRUCTION_TYPE.BuiltinUnary);
- String2GPUInstructionType.put( "sign" , GPUINSTRUCTION_TYPE.BuiltinUnary);
-
-
-
- String2GPUInstructionType.put( "solve" , GPUINSTRUCTION_TYPE.BuiltinBinary);
+ // Unary Builtin functions
+ String2GPUInstructionType.put( "sel+", GPUINSTRUCTION_TYPE.BuiltinUnary);
+ String2GPUInstructionType.put( "exp", GPUINSTRUCTION_TYPE.BuiltinUnary);
+ String2GPUInstructionType.put( "log", GPUINSTRUCTION_TYPE.BuiltinUnary);
+ String2GPUInstructionType.put( "abs", GPUINSTRUCTION_TYPE.BuiltinUnary);
+ String2GPUInstructionType.put( "sqrt", GPUINSTRUCTION_TYPE.BuiltinUnary);
+ String2GPUInstructionType.put( "round", GPUINSTRUCTION_TYPE.BuiltinUnary);
+ String2GPUInstructionType.put( "floor", GPUINSTRUCTION_TYPE.BuiltinUnary);
+ String2GPUInstructionType.put( "ceil", GPUINSTRUCTION_TYPE.BuiltinUnary);
+ String2GPUInstructionType.put( "sin", GPUINSTRUCTION_TYPE.BuiltinUnary);
+ String2GPUInstructionType.put( "cos", GPUINSTRUCTION_TYPE.BuiltinUnary);
+ String2GPUInstructionType.put( "tan", GPUINSTRUCTION_TYPE.BuiltinUnary);
+ String2GPUInstructionType.put( "asin", GPUINSTRUCTION_TYPE.BuiltinUnary);
+ String2GPUInstructionType.put( "acos", GPUINSTRUCTION_TYPE.BuiltinUnary);
+ String2GPUInstructionType.put( "atan", GPUINSTRUCTION_TYPE.BuiltinUnary);
+ String2GPUInstructionType.put( "sign", GPUINSTRUCTION_TYPE.BuiltinUnary);
+ // Binary Builtin functions
+ String2GPUInstructionType.put( "solve", GPUINSTRUCTION_TYPE.BuiltinBinary);
// Aggregate Unary
- String2GPUInstructionType.put( "ua+" , GPUINSTRUCTION_TYPE.AggregateUnary); // Sum
- String2GPUInstructionType.put( "uak+" , GPUINSTRUCTION_TYPE.AggregateUnary); // Sum
- String2GPUInstructionType.put( "uar+" , GPUINSTRUCTION_TYPE.AggregateUnary); // Row Sum
- String2GPUInstructionType.put( "uark+" , GPUINSTRUCTION_TYPE.AggregateUnary); // Row Sum
- String2GPUInstructionType.put( "uac+" , GPUINSTRUCTION_TYPE.AggregateUnary); // Col Sum
- String2GPUInstructionType.put( "uack+" , GPUINSTRUCTION_TYPE.AggregateUnary); // Col Sum
- String2GPUInstructionType.put( "ua*" , GPUINSTRUCTION_TYPE.AggregateUnary); // Multiplication
- String2GPUInstructionType.put( "uamean" , GPUINSTRUCTION_TYPE.AggregateUnary); // Mean
- String2GPUInstructionType.put( "uarmean" , GPUINSTRUCTION_TYPE.AggregateUnary); // Row Mean
- String2GPUInstructionType.put( "uacmean" , GPUINSTRUCTION_TYPE.AggregateUnary); // Col Mean
- String2GPUInstructionType.put( "uamax" , GPUINSTRUCTION_TYPE.AggregateUnary); // Max
- String2GPUInstructionType.put( "uarmax" , GPUINSTRUCTION_TYPE.AggregateUnary); // Row Max
- String2GPUInstructionType.put( "uacmax" , GPUINSTRUCTION_TYPE.AggregateUnary); // Col Max
- String2GPUInstructionType.put( "uamin" , GPUINSTRUCTION_TYPE.AggregateUnary); // Min
- String2GPUInstructionType.put( "uarmin" , GPUINSTRUCTION_TYPE.AggregateUnary); // Row Min
- String2GPUInstructionType.put( "uacmin" , GPUINSTRUCTION_TYPE.AggregateUnary); // Col Min
- String2GPUInstructionType.put( "uasqk+" , GPUINSTRUCTION_TYPE.AggregateUnary); // Sum of Squares
- String2GPUInstructionType.put( "uarsqk+" , GPUINSTRUCTION_TYPE.AggregateUnary); // Row Sum of Squares
- String2GPUInstructionType.put( "uacsqk+" , GPUINSTRUCTION_TYPE.AggregateUnary); // Col Sum of Squares
- String2GPUInstructionType.put( "uavar" , GPUINSTRUCTION_TYPE.AggregateUnary); // Variance
- String2GPUInstructionType.put( "uarvar" , GPUINSTRUCTION_TYPE.AggregateUnary); // Row Variance
- String2GPUInstructionType.put( "uacvar" , GPUINSTRUCTION_TYPE.AggregateUnary); // Col Variance
+ String2GPUInstructionType.put( "ua+" , GPUINSTRUCTION_TYPE.AggregateUnary); // Sum
+ String2GPUInstructionType.put( "uak+" , GPUINSTRUCTION_TYPE.AggregateUnary); // Sum
+ String2GPUInstructionType.put( "uar+" , GPUINSTRUCTION_TYPE.AggregateUnary); // Row Sum
+ String2GPUInstructionType.put( "uark+" , GPUINSTRUCTION_TYPE.AggregateUnary); // Row Sum
+ String2GPUInstructionType.put( "uac+" , GPUINSTRUCTION_TYPE.AggregateUnary); // Col Sum
+ String2GPUInstructionType.put( "uack+" , GPUINSTRUCTION_TYPE.AggregateUnary); // Col Sum
+ String2GPUInstructionType.put( "ua*" , GPUINSTRUCTION_TYPE.AggregateUnary); // Multiplication
+ String2GPUInstructionType.put( "uamean" , GPUINSTRUCTION_TYPE.AggregateUnary); // Mean
+ String2GPUInstructionType.put( "uarmean" , GPUINSTRUCTION_TYPE.AggregateUnary); // Row Mean
+ String2GPUInstructionType.put( "uacmean" , GPUINSTRUCTION_TYPE.AggregateUnary); // Col Mean
+ String2GPUInstructionType.put( "uamax" , GPUINSTRUCTION_TYPE.AggregateUnary); // Max
+ String2GPUInstructionType.put( "uarmax" , GPUINSTRUCTION_TYPE.AggregateUnary); // Row Max
+ String2GPUInstructionType.put( "uacmax" , GPUINSTRUCTION_TYPE.AggregateUnary); // Col Max
+ String2GPUInstructionType.put( "uamin" , GPUINSTRUCTION_TYPE.AggregateUnary); // Min
+ String2GPUInstructionType.put( "uarmin" , GPUINSTRUCTION_TYPE.AggregateUnary); // Row Min
+ String2GPUInstructionType.put( "uacmin" , GPUINSTRUCTION_TYPE.AggregateUnary); // Col Min
+ String2GPUInstructionType.put( "uasqk+" , GPUINSTRUCTION_TYPE.AggregateUnary); // Sum of Squares
+ String2GPUInstructionType.put( "uarsqk+" , GPUINSTRUCTION_TYPE.AggregateUnary); // Row Sum of Squares
+ String2GPUInstructionType.put( "uacsqk+" , GPUINSTRUCTION_TYPE.AggregateUnary); // Col Sum of Squares
+ String2GPUInstructionType.put( "uavar" , GPUINSTRUCTION_TYPE.AggregateUnary); // Variance
+ String2GPUInstructionType.put( "uarvar" , GPUINSTRUCTION_TYPE.AggregateUnary); // Row Variance
+ String2GPUInstructionType.put( "uacvar" , GPUINSTRUCTION_TYPE.AggregateUnary); // Col Variance
}
public static GPUInstruction parseSingleInstruction (String str )
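For readers following the parser change: the table above is a plain opcode-to-type dispatch. A minimal sketch of how such a lookup is typically consumed (String2GPUInstructionType and GPUINSTRUCTION_TYPE are the names from the table; the surrounding dispatch logic here is illustrative, not SystemML's actual parser):

    // Illustrative sketch only: a hypothetical consumer of the opcode table.
    GPUINSTRUCTION_TYPE gputype = String2GPUInstructionType.get(opcode);
    if (gputype == null)
        throw new DMLRuntimeException("Unsupported GPU instruction opcode: " + opcode);
    switch (gputype) {
        case AggregateUnary:   /* build an aggregate-unary GPU instruction */   break;
        case ArithmeticBinary: /* build an arithmetic-binary GPU instruction */ break;
        default:               /* remaining categories */                       break;
    }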
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/772fb588/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUContext.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUContext.java b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUContext.java
index 673601f..89a2b67 100644
--- a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUContext.java
+++ b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUContext.java
@@ -18,25 +18,14 @@
*/
package org.apache.sysml.runtime.instructions.gpu.context;
-import static jcuda.jcublas.JCublas2.cublasCreate;
-import static jcuda.jcublas.JCublas2.cublasDestroy;
-import static jcuda.jcudnn.JCudnn.cudnnCreate;
-import static jcuda.jcudnn.JCudnn.cudnnDestroy;
-import static jcuda.jcusolver.JCusolverDn.cusolverDnDestroy;
-import static jcuda.jcusolver.JCusolverSp.cusolverSpDestroy;
-import static jcuda.jcusparse.JCusparse.cusparseCreate;
-import static jcuda.jcusparse.JCusparse.cusparseDestroy;
-import static jcuda.jcusolver.JCusolverDn.cusolverDnCreate;
-import static jcuda.jcusolver.JCusolverSp.cusolverSpCreate;
-
-import static jcuda.runtime.JCuda.cudaDeviceScheduleBlockingSync;
-import static jcuda.runtime.JCuda.cudaFree;
-import static jcuda.runtime.JCuda.cudaGetDeviceCount;
-import static jcuda.runtime.JCuda.cudaMalloc;
-import static jcuda.runtime.JCuda.cudaMemGetInfo;
-import static jcuda.runtime.JCuda.cudaMemset;
-import static jcuda.runtime.JCuda.cudaSetDevice;
-import static jcuda.runtime.JCuda.cudaSetDeviceFlags;
+import jcuda.Pointer;
+import jcuda.jcublas.cublasHandle;
+import jcuda.jcudnn.cudnnHandle;
+import jcuda.jcusolver.cusolverDnHandle;
+import jcuda.jcusolver.cusolverSpHandle;
+import jcuda.jcusparse.cusparseHandle;
+import jcuda.runtime.JCuda;
+import jcuda.runtime.cudaDeviceProp;
import java.util.ArrayList;
import java.util.Collections;
@@ -56,14 +45,24 @@ import org.apache.sysml.runtime.instructions.gpu.GPUInstruction;
import org.apache.sysml.utils.GPUStatistics;
import org.apache.sysml.utils.LRUCacheMap;
-import jcuda.Pointer;
-import jcuda.jcublas.cublasHandle;
-import jcuda.jcudnn.cudnnHandle;
-import jcuda.jcusolver.cusolverDnHandle;
-import jcuda.jcusolver.cusolverSpHandle;
-import jcuda.jcusparse.cusparseHandle;
-import jcuda.runtime.JCuda;
-import jcuda.runtime.cudaDeviceProp;
+import static jcuda.jcublas.JCublas2.cublasCreate;
+import static jcuda.jcublas.JCublas2.cublasDestroy;
+import static jcuda.jcudnn.JCudnn.cudnnCreate;
+import static jcuda.jcudnn.JCudnn.cudnnDestroy;
+import static jcuda.jcusolver.JCusolverDn.cusolverDnCreate;
+import static jcuda.jcusolver.JCusolverDn.cusolverDnDestroy;
+import static jcuda.jcusolver.JCusolverSp.cusolverSpCreate;
+import static jcuda.jcusolver.JCusolverSp.cusolverSpDestroy;
+import static jcuda.jcusparse.JCusparse.cusparseCreate;
+import static jcuda.jcusparse.JCusparse.cusparseDestroy;
+import static jcuda.runtime.JCuda.cudaDeviceScheduleBlockingSync;
+import static jcuda.runtime.JCuda.cudaFree;
+import static jcuda.runtime.JCuda.cudaGetDeviceCount;
+import static jcuda.runtime.JCuda.cudaMalloc;
+import static jcuda.runtime.JCuda.cudaMemGetInfo;
+import static jcuda.runtime.JCuda.cudaMemset;
+import static jcuda.runtime.JCuda.cudaSetDevice;
+import static jcuda.runtime.JCuda.cudaSetDeviceFlags;
/**
* Represents a context per GPU accessible through the same JVM
@@ -159,6 +158,7 @@ public class GPUContext {
}
+ @SuppressWarnings("unused")
public int getDeviceNum() {
return deviceNum;
}
@@ -174,6 +174,7 @@ public class GPUContext {
cudaSetDevice(deviceNum);
}
+ @SuppressWarnings("unused")
public static int cudaGetDevice() {
int[] device = new int[1];
JCuda.cudaGetDevice(device);
@@ -288,6 +289,9 @@ public class GPUContext {
* @param eager true if to be done eagerly
*/
public void cudaFreeHelper(String instructionName, final Pointer toFree, boolean eager) {
+ Pointer dummy = new Pointer();
+ if (toFree == dummy) // trying to free a null pointer
+ return;
long t0 = 0;
assert cudaBlockSizeMap.containsKey(toFree) : "ERROR : Internal state corrupted, cache block size map is not aware of a block it is trying to free";
long size = cudaBlockSizeMap.get(toFree);
@@ -382,14 +386,14 @@ public class GPUContext {
return;
if (allocatedGPUObjects.size() == 0) {
- throw new DMLRuntimeException("There is not enough memory on device for this matrix!");
+ throw new DMLRuntimeException("There is not enough memory on device for this matrix, request (" + neededSize + ")");
}
Collections.sort(allocatedGPUObjects, new Comparator<GPUObject>() {
@Override
public int compare(GPUObject p1, GPUObject p2) {
- long p1Val = p1.readLocks.get();
- long p2Val = p2.readLocks.get();
+ long p1Val = p1.locks.get();
+ long p2Val = p2.locks.get();
if (p1Val > 0 && p2Val > 0) {
// Both are locked, so don't sort
@@ -426,8 +430,8 @@ public class GPUContext {
while (neededSize > getAvailableMemory() && allocatedGPUObjects.size() > 0) {
GPUObject toBeRemoved = allocatedGPUObjects.get(allocatedGPUObjects.size() - 1);
- if (toBeRemoved.readLocks.get() > 0) {
- throw new DMLRuntimeException("There is not enough memory on device for this matrix!");
+ if (toBeRemoved.locks.get() > 0) {
+ throw new DMLRuntimeException("There is not enough memory on device for this matrix, request (" + neededSize + ")");
}
if (toBeRemoved.dirty) {
toBeRemoved.copyFromDeviceToHost();
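The comparator and the loop above define the eviction order: locked objects sort to the front (and can never be evicted, hence the exception), while among unlocked objects the least-recently-used block ends up at the tail, where the loop evicts from. A self-contained sketch of the same idea, assuming a simplified Block class with made-up names (this is not the GPUObject class itself):

    import java.util.Comparator;
    import java.util.List;
    import java.util.concurrent.atomic.AtomicInteger;
    import java.util.concurrent.atomic.AtomicLong;

    class Block {
        final AtomicInteger locks = new AtomicInteger(0); // >0 means in use by an instruction
        final AtomicLong timestamp = new AtomicLong(0);   // last-use time, for LRU
    }

    class EvictionSketch {
        // Sort so that the least-recently-used, unlocked block lands at the
        // end of the list, mirroring "evict from the tail" in the loop above.
        static void sortForEviction(List<Block> blocks) {
            blocks.sort(new Comparator<Block>() {
                @Override public int compare(Block p1, Block p2) {
                    long l1 = p1.locks.get(), l2 = p2.locks.get();
                    if (l1 > 0 && l2 > 0) return 0;  // both locked: order irrelevant
                    if (l1 > 0) return -1;           // locked blocks sort to the front
                    if (l2 > 0) return 1;
                    // both unlocked: newer timestamps first, oldest at the tail
                    return Long.compare(p2.timestamp.get(), p1.timestamp.get());
                }
            });
        }
    }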
@@ -546,6 +550,7 @@ public class GPUContext {
* @return the shared memory per block
* @throws DMLRuntimeException ?
*/
+ @SuppressWarnings("unused")
public long getMaxSharedMemory() throws DMLRuntimeException {
cudaDeviceProp deviceProp = getGPUProperties();
return deviceProp.sharedMemPerBlock;
@@ -588,10 +593,10 @@ public class GPUContext {
/**
* Destroys this GPUContext object
- * This method MUST BE called so that the GPU is available to be used again
*
* @throws DMLRuntimeException if error
*/
+ @SuppressWarnings("unused")
public void destroy() throws DMLRuntimeException {
LOG.trace("GPU : this context was destroyed, this = " + this.toString());
clearMemory();
@@ -608,14 +613,51 @@ public class GPUContext {
/**
* Clears all memory used by this {@link GPUContext}
- * Be careful to ensure that no memory is currently being used before invoking this
+ * Be careful to ensure that no temporary memory is currently in use before invoking this.
+ * Memory that persists between MLContext invocations is pointed to by a {@link GPUObject} instance
+ * that is part of the owning {@link MatrixObject}. The cleanup of that {@link MatrixObject} instance
+ * frees the memory associated with that block on the GPU.
* @throws DMLRuntimeException ?
*/
public void clearMemory() throws DMLRuntimeException {
- while (allocatedGPUObjects.isEmpty()) {
+ clearTemporaryMemory();
+ while (!allocatedGPUObjects.isEmpty()) {
GPUObject o = allocatedGPUObjects.get(0);
- o.clearData();
+ if (o.isDirty()){
+ LOG.warn("Attempted to free GPU memory while block [" + o + "] is still on the GPU; copying it back to host.");
+ o.acquireHostRead();
+ }
+ o.clearData(true);
}
+ allocatedGPUObjects.clear();
+ }
+
+ /**
+ * Clears up the memory used to optimize cudaMalloc/cudaFree calls
+ */
+ public void clearTemporaryMemory() {
+ // Record the CUDA block sizes still needed by allocatedGPUObjects; all other entries are cleared.
+ HashMap<Pointer, Long> tmpCudaBlockSizeMap = new HashMap<>();
+ for (GPUObject o : allocatedGPUObjects) {
+ if (o.isSparse()) {
+ CSRPointer p = o.getSparseMatrixCudaPointer();
+ if (p.rowPtr != null && cudaBlockSizeMap.containsKey(p.rowPtr)) {
+ tmpCudaBlockSizeMap.put(p.rowPtr, cudaBlockSizeMap.get(p.rowPtr));
+ }
+ if (p.colInd != null && cudaBlockSizeMap.containsKey(p.colInd)) {
+ tmpCudaBlockSizeMap.put(p.colInd, cudaBlockSizeMap.get(p.colInd));
+ }
+ if (p.val != null && cudaBlockSizeMap.containsKey(p.val)) {
+ tmpCudaBlockSizeMap.put(p.val, cudaBlockSizeMap.get(p.val));
+ }
+
+ } else {
+ Pointer p = o.getJcudaDenseMatrixPtr();
+ tmpCudaBlockSizeMap.put(p, cudaBlockSizeMap.get(p));
+ }
+ }
+
+ // garbage collect all temporarily allocated spaces
for (LinkedList<Pointer> l : freeCUDASpaceMap.values()) {
for (Pointer p : l) {
cudaFreeHelper(p, true);
@@ -623,7 +665,9 @@ public class GPUContext {
}
cudaBlockSizeMap.clear();
freeCUDASpaceMap.clear();
- allocatedGPUObjects.clear();
+
+ // Restore only those entries for which there are still blocks on the GPU
+ cudaBlockSizeMap.putAll(tmpCudaBlockSizeMap);
}
@Override
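clearTemporaryMemory() above is essentially a "retain only live keys" pass over cudaBlockSizeMap: sizes for pointers still owned by allocated GPU objects are saved aside, the cached free lists are released, the maps are cleared, and the saved entries are restored. The same pattern in isolation, with hypothetical names:

    import java.util.HashMap;
    import java.util.Map;
    import java.util.Set;

    class RetainLiveSketch {
        // Keep only the entries of sizeMap whose keys are still live;
        // every other entry is dropped in one pass.
        static <K> void retainLive(Map<K, Long> sizeMap, Set<K> liveKeys) {
            Map<K, Long> retained = new HashMap<>();
            for (K k : liveKeys) {
                Long size = sizeMap.get(k);
                if (size != null)
                    retained.put(k, size);
            }
            sizeMap.clear();          // drop entries for temporary allocations
            sizeMap.putAll(retained); // restore entries for blocks still on the GPU
        }
    }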
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/772fb588/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUObject.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUObject.java b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUObject.java
index be3cc09..0ed34c5 100644
--- a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUObject.java
+++ b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUObject.java
@@ -86,8 +86,8 @@ public class GPUObject {
/** whether the block attached to this {@link GPUContext} is dirty on the device and needs to be copied back to host */
protected boolean dirty = false;
- /** number of read locks on this object */
- protected AtomicInteger readLocks = new AtomicInteger(0);
+ /** number of read/write locks on this object (this GPUObject is being used in a current instruction) */
+ protected AtomicInteger locks = new AtomicInteger(0);
/** Timestamp, needed by {@link GPUContext#evict(long)} */
AtomicLong timestamp = new AtomicLong(0);
@@ -112,7 +112,7 @@ public class GPUObject {
that.allocateTensorDescriptor(me.tensorShape[0], me.tensorShape[1], me.tensorShape[2], me.tensorShape[3]);
}
that.dirty = me.dirty;
- that.readLocks = new AtomicInteger(me.readLocks.get());
+ that.locks = new AtomicInteger(me.locks.get());
that.timestamp = new AtomicLong(me.timestamp.get());
that.isSparse = me.isSparse;
@@ -126,7 +126,7 @@ public class GPUObject {
cudaMemcpy(that.jcudaDenseMatrixPtr, me.jcudaDenseMatrixPtr, size, cudaMemcpyDeviceToDevice);
}
- if (me.jcudaSparseMatrixPtr != null){
+ if (me.getJcudaSparseMatrixPtr() != null){
long rows = mat.getNumRows();
that.jcudaSparseMatrixPtr = me.jcudaSparseMatrixPtr.clone((int)rows);
}
@@ -265,7 +265,6 @@ public class GPUObject {
/**
* Convenience method to directly set the sparse matrix on GPU
- * Make sure to call {@link #addReadLock()} after this to set appropriate state, if you are not sure what you are doing.
* Needed for operations like {@link JCusparse#cusparseDcsrgemm(cusparseHandle, int, int, int, int, int, cusparseMatDescr, int, Pointer, Pointer, Pointer, cusparseMatDescr, int, Pointer, Pointer, Pointer, cusparseMatDescr, Pointer, Pointer, Pointer)}
* @param sparseMatrixPtr CSR (compressed sparse row) pointer
*
@@ -278,11 +277,11 @@ public class GPUObject {
cudaFreeHelper(getJcudaDenseMatrixPtr());
jcudaDenseMatrixPtr = null;
}
+ getGPUContext().recordBlockUsage(this);
}
/**
* Convenience method to directly set the dense matrix pointer on GPU
- * Make sure to call {@link #addReadLock()} after this to set appropriate state, if you are not sure what you are doing.
*
* @param densePtr dense pointer
* @throws DMLRuntimeException ?
@@ -294,6 +293,7 @@ public class GPUObject {
getJcudaSparseMatrixPtr().deallocate();
jcudaSparseMatrixPtr = null;
}
+ getGPUContext().recordBlockUsage(this);
}
/**
@@ -491,7 +491,6 @@ public class GPUObject {
public void allocateSparseAndEmpty() throws DMLRuntimeException{
LOG.trace("GPU : allocate sparse and empty block on " + this + ", GPUContext=" + getGPUContext());
setSparseMatrixCudaPointer(CSRPointer.allocateEmpty(getGPUContext(), 0, mat.getNumRows()));
- addReadLock();
}
/**
@@ -508,7 +507,6 @@ public class GPUObject {
int numElems = toIntExact(rows * cols);
long size = getDoubleSizeOf(numElems);
setDenseMatrixCudaPointer(allocate(size));
- addReadLock();
// The "fill" kernel is called which treats the matrix "jcudaDensePtr" like a vector and fills it with value "v"
// If the fill value is 0, no need to call the special kernel, the allocate memsets the allocated region to 0
if (v != 0)
@@ -535,9 +533,8 @@ public class GPUObject {
LOG.trace("GPU : in acquireDeviceRead, data is not allocated, copying from host, on " + this + ", GPUContext=" + getGPUContext());
copyFromHostToDevice();
transferred = true;
- } else {
- addReadLock();
}
+ addLock();
if(!isAllocated())
throw new DMLRuntimeException("Expected device data to be allocated");
return transferred;
@@ -552,7 +549,6 @@ public class GPUObject {
// Dense block, size = numRows * numCols
allocateDenseMatrixOnDevice();
allocated = true;
- getGPUContext().recordBlockUsage(this);
}
dirty = true;
if(!isAllocated())
@@ -569,8 +565,6 @@ public class GPUObject {
mat.setDirty(true);
allocateSparseMatrixOnDevice();
allocated = true;
- getGPUContext().recordBlockUsage(this);
-
}
dirty = true;
if(!isAllocated())
@@ -578,8 +572,8 @@ public class GPUObject {
return allocated;
}
- public void addReadLock() {
- readLocks.addAndGet(1);
+ public void addLock() {
+ locks.addAndGet(1);
}
/**
@@ -606,11 +600,13 @@ public class GPUObject {
* Updates the locks depending on the eviction policy selected
* @throws DMLRuntimeException if there is no locked GPU Object or if a {@link GPUContext} could not be obtained
*/
- private void updateReleaseLocks() throws DMLRuntimeException {
- if (readLocks.addAndGet(-1) < 0) {
- throw new CacheException("Redundant release of GPU object");
+ private void updateReleaseLocks(int l) throws DMLRuntimeException {
+ int newLocks = locks.addAndGet(l);
+ if (newLocks < 0) {
+ throw new CacheException("Internal state error : Invalid number of locks on a GPUObject");
}
- LOG.trace("GPU : updateReleaseLocks, new number of read locks is " + readLocks.get() + ", on " + this + ", GPUContext=" + getGPUContext());
+
+ LOG.trace("GPU : updateReleaseLocks, new number of locks is " + locks.get() + ", on " + this + ", GPUContext=" + getGPUContext());
GPUContext.EvictionPolicy evictionPolicy = getGPUContext().evictionPolicy;
switch (evictionPolicy){
case LRU : timestamp.set(System.nanoTime()); break;
@@ -625,7 +621,8 @@ public class GPUObject {
* @throws DMLRuntimeException if data is not allocated, if there is no locked GPU Object, or if a {@link GPUContext} could not be obtained
*/
public void releaseInput() throws DMLRuntimeException {
- updateReleaseLocks();
+ // A read lock is a positive quantity; therefore, releasing it adds -1
+ updateReleaseLocks(-1);
if(!isAllocated())
throw new CacheException("Attempting to release an input before allocating it");
}
@@ -635,7 +632,8 @@ public class GPUObject {
* @throws DMLRuntimeException if data is not allocated, if there is no locked GPU Object, or if a {@link GPUContext} could not be obtained
*/
public void releaseOutput() throws DMLRuntimeException {
- updateReleaseLocks();
+ // A write lock is a negative quantity; therefore, releasing it adds +1
+ updateReleaseLocks(1);
dirty = true;
if(!isAllocated())
throw new CacheException("Attempting to release an output before allocating it");
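Taken together with the removal of addReadLock() from the allocation paths earlier in this diff, the convention is now: a lock is taken when a block is acquired for an instruction (and, per the ExecutionContext change above, when an output block is created), and dropped via releaseInput()/releaseOutput(). A standalone sketch of the counter convention described in the comments above, not the GPUObject class itself:

    import java.util.concurrent.atomic.AtomicInteger;

    class LockCounterSketch {
        private final AtomicInteger locks = new AtomicInteger(0);

        void addLock() { locks.addAndGet(1); } // taken on acquire

        // Shared release path: delta is -1 for inputs (read locks count upward)
        // and +1 for outputs (write locks count downward, per the comments above).
        void release(int delta) {
            if (locks.addAndGet(delta) < 0)
                throw new IllegalStateException("Invalid number of locks");
        }

        boolean isEvictable() { return locks.get() == 0; }
    }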
@@ -651,7 +649,6 @@ public class GPUObject {
long size = getDoubleSizeOf(rows * cols);
Pointer tmp = allocate(size);
setDenseMatrixCudaPointer(tmp);
- addReadLock();
}
void allocateSparseMatrixOnDevice() throws DMLRuntimeException {
@@ -660,10 +657,9 @@ public class GPUObject {
long rows = mat.getNumRows();
long nnz = mat.getNnz();
assert rows > 0 : "Internal error - invalid number of rows when allocating a sparse matrix";
- assert nnz > 0 : "Internal error - invalid number of non zeroes when allocating a sparse matrix";
+ assert nnz >= 0 : "Internal error - invalid number of non zeroes when allocating a sparse matrix";
CSRPointer tmp = CSRPointer.allocateEmpty(getGPUContext(), nnz, rows);
setSparseMatrixCudaPointer(tmp);
- addReadLock();
}
void deallocateMemoryOnDevice(boolean eager) throws DMLRuntimeException {
@@ -680,7 +676,8 @@ public class GPUObject {
cudnnDestroyTensorDescriptor(tensorDescriptor);
tensorDescriptor = null;
}
- readLocks.set(0);
+ locks.set(0);
+ getGPUContext().removeRecordedUsage(this);
}
protected long getSizeOnDevice() throws DMLRuntimeException {
@@ -754,8 +751,8 @@ public class GPUObject {
colInd = csrBlock.indexes();
values = csrBlock.values();
}
+
allocateSparseMatrixOnDevice();
- getGPUContext().recordBlockUsage(this);
if(copyToDevice) {
CSRPointer.copyToDevice(getJcudaSparseMatrixPtr(), tmp.getNumRows(), tmp.getNonZeros(), rowPtr, colInd, values);
@@ -773,7 +770,6 @@ public class GPUObject {
// Copy dense block
allocateDenseMatrixOnDevice();
- getGPUContext().recordBlockUsage(this);
cudaMemcpy(getJcudaDenseMatrixPtr(), Pointer.to(data), getDoubleSizeOf(mat.getNumRows()*mat.getNumColumns()), cudaMemcpyHostToDevice);
}
@@ -862,9 +858,8 @@ public class GPUObject {
* @throws CacheException ?
*/
public void clearData(boolean eager) throws DMLRuntimeException {
- getGPUContext().removeRecordedUsage(this);
deallocateMemoryOnDevice(eager);
-
+ getGPUContext().removeRecordedUsage(this);
}
/**
@@ -896,7 +891,7 @@ public class GPUObject {
final StringBuilder sb = new StringBuilder("GPUObject{");
sb.append(", tensorShape=").append(Arrays.toString(tensorShape));
sb.append(", dirty=").append(dirty);
- sb.append(", readLocks=").append(readLocks);
+ sb.append(", locks=").append(locks);
sb.append(", sparse? ").append(isSparse);
sb.append(", dims=[").append(mat.getNumRows()).append(",").append(mat.getNumColumns()).append("]");
sb.append('}');
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/772fb588/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/JCudaKernels.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/JCudaKernels.java b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/JCudaKernels.java
index ac11df9..4d06831 100644
--- a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/JCudaKernels.java
+++ b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/JCudaKernels.java
@@ -34,6 +34,7 @@ import jcuda.Pointer;
import jcuda.driver.CUfunction;
import jcuda.driver.CUmodule;
import jcuda.driver.CUresult;
+import jcuda.runtime.JCuda;
/**
* Utility class that allows LibMatrixCUDA as well as GPUObject to invoke custom CUDA kernels.
@@ -111,7 +112,7 @@ public class JCudaKernels {
config.gridDimX, config.gridDimY, config.gridDimZ,
config.blockDimX, config.blockDimY, config.blockDimZ,
config.sharedMemBytes, config.stream, Pointer.to(kernelParams), null));
- //JCuda.cudaDeviceSynchronize();
+ JCuda.cudaDeviceSynchronize();
}
public static void checkResult(int cuResult) throws DMLRuntimeException {
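A note on the change above: cuLaunchKernel is asynchronous, so without the cudaDeviceSynchronize() call a kernel failure would only surface later, at some unrelated API call. Enabling the synchronize makes errors attributable to the launch site, at the cost of losing host/device overlap across kernel launches.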
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/772fb588/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCUDA.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCUDA.java b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCUDA.java
index b023159..7990fef 100644
--- a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCUDA.java
+++ b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCUDA.java
@@ -235,8 +235,8 @@ public class LibMatrixCUDA {
private static int CONVOLUTION_PREFERENCE = cudnnConvolutionFwdPreference.CUDNN_CONVOLUTION_FWD_NO_WORKSPACE;
-
- private static Pointer _one;
+
+ private static Pointer _one;
private static Pointer _zero;
/**
* Convenience method to get a pointer to value '1.0' on device. Instead of allocating and deallocating it for every kernel invocation.
@@ -258,7 +258,7 @@ public class LibMatrixCUDA {
}
return _zero;
}
-
+
/**
* Convenience method to get tensor descriptor from underlying GPUObject
* @param gCtx a valid {@link GPUContext}
@@ -272,7 +272,7 @@ public class LibMatrixCUDA {
*/
private static cudnnTensorDescriptor allocateTensorDescriptor(GPUContext gCtx, MatrixObject mat, int N, int C, int H, int W) throws DMLRuntimeException {
if(mat.getNumRows() != N || mat.getNumColumns() != C*H*W) {
- throw new DMLRuntimeException("Mismatch descriptor-matrix dimensions:" + mat.getNumRows() + " != " + N
+ throw new DMLRuntimeException("Mismatch descriptor-matrix dimensions:" + mat.getNumRows() + " != " + N
+ " || " + mat.getNumColumns() + " != " + (C*H*W));
}
return mat.getGPUObject(gCtx).allocateTensorDescriptor(N, C, H, W);
@@ -293,7 +293,7 @@ public class LibMatrixCUDA {
cudnnSetTensor4dDescriptor(tensorDescriptor, CUDNN_TENSOR_NCHW, CUDNN_DATA_DOUBLE, N, C, H, W);
return tensorDescriptor;
}
-
+
/**
* Convenience method to get jcudaDenseMatrixPtr. This method explicitly converts sparse to dense format, so use it judiciously.
* @param gCtx a valid {@link GPUContext}
@@ -308,7 +308,7 @@ public class LibMatrixCUDA {
}
return getDensePointer(gCtx, image, instName);
}
-
+
/**
* Convenience method to get jcudaDenseMatrixPtr. This method explicitly converts sparse to dense format, so use it judiciously.
* @param gCtx a valid {@link GPUContext}
@@ -339,15 +339,15 @@ public class LibMatrixCUDA {
}
return input.getGPUObject(gCtx).getJcudaSparseMatrixPtr();
}
-
+
/**
* Convenience method for checking the status of CuDNN kernel.
- *
+ *
* @param status status returned by CuDNN
* @throws DMLRuntimeException if status is not CUDNN_STATUS_SUCCESS
*/
private static void checkStatus(int status) throws DMLRuntimeException {
- if(status != cudnnStatus.CUDNN_STATUS_SUCCESS)
+ if(status != cudnnStatus.CUDNN_STATUS_SUCCESS)
throw new DMLRuntimeException("Error status returned by CuDNN:" + jcuda.jcudnn.cudnnStatus.stringFor(status));
}
@@ -383,7 +383,7 @@ public class LibMatrixCUDA {
//cudaDeviceSynchronize;
biasAdd(gCtx, instName, outputBlock, bias, outputBlock);
}
-
+
public static void conv2d(GPUContext gCtx, String instName, MatrixObject image, MatrixObject filter, MatrixObject outputBlock, int N, int C, int H, int W,
int K, int R, int S, int pad_h, int pad_w, int stride_h, int stride_w, int P, int Q)
throws DMLRuntimeException {
@@ -530,7 +530,7 @@ public class LibMatrixCUDA {
if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_BIAS_ADD_LIB, System.nanoTime() - t1);
}
-
+
/**
* Performs the operation corresponding to the DML script:
* ones = matrix(1, rows=1, cols=Hout*Wout)
@@ -635,7 +635,7 @@ public class LibMatrixCUDA {
throw new DMLRuntimeException("Incorrect dimensions for running variance");
}
}
-
+
/**
* Performs the forward BatchNormalization layer computation for inference
* @param gCtx a valid {@link GPUContext}
@@ -650,21 +650,21 @@ public class LibMatrixCUDA {
* @throws DMLRuntimeException if error occurs
*/
public static void batchNormalizationForwardInference(GPUContext gCtx, String instName, MatrixObject image,
- MatrixObject scale, MatrixObject bias, MatrixObject runningMean, MatrixObject runningVar,
+ MatrixObject scale, MatrixObject bias, MatrixObject runningMean, MatrixObject runningVar,
MatrixObject ret, double epsilon) throws DMLRuntimeException {
LOG.trace("GPU : batchNormalizationForwardInference" + ", GPUContext=" + gCtx);
int mode = cudnnBatchNormMode.CUDNN_BATCHNORM_SPATIAL;
-
+
int N = (int) image.getNumRows();
int C = (int) scale.getNumColumns();
long CHW = image.getNumColumns();
validateBatchNormalizationDimensions(scale, bias, runningMean, runningVar, C);
-
+
// Allocate descriptors
cudnnTensorDescriptor nCHWDescriptor = allocateNCHWDescriptors(gCtx, N, C, CHW,
new MatrixObject[] {image}, new MatrixObject[] {ret});
cudnnTensorDescriptor scaleTensorDesc = allocateTensorDescriptor(gCtx, scale, 1, C, 1, 1);
-
+
// Get underlying dense pointer
Pointer imagePtr = getDensePointer(gCtx, image, true, instName);
Pointer retPtr = getDensePointer(gCtx, ret, true, instName);
@@ -672,13 +672,13 @@ public class LibMatrixCUDA {
Pointer scalePtr = getDensePointer(gCtx, scale, true, instName);
Pointer runningMeanPtr = getDensePointer(gCtx, runningMean, true, instName);
Pointer runningVarPtr = getDensePointer(gCtx, runningVar, true, instName);
-
+
checkStatus(cudnnBatchNormalizationForwardInference(getCudnnHandle(gCtx), mode, one(), zero(),
nCHWDescriptor, imagePtr, nCHWDescriptor, retPtr,
scaleTensorDesc, scalePtr, biasPtr,
runningMeanPtr, runningVarPtr, epsilon));
}
-
+
/**
* Performs the forward BatchNormalization layer computation for training
* @param gCtx a valid {@link GPUContext}
@@ -696,21 +696,21 @@ public class LibMatrixCUDA {
* @throws DMLRuntimeException if error occurs
*/
public static void batchNormalizationForwardTraining(GPUContext gCtx, String instName, MatrixObject image,
- MatrixObject scale, MatrixObject bias, MatrixObject runningMean, MatrixObject runningVar,
+ MatrixObject scale, MatrixObject bias, MatrixObject runningMean, MatrixObject runningVar,
MatrixObject ret, MatrixObject retRunningMean, MatrixObject retRunningVar, double epsilon, double exponentialAverageFactor) throws DMLRuntimeException {
LOG.trace("GPU : batchNormalizationForwardTraining" + ", GPUContext=" + gCtx);
int mode = cudnnBatchNormMode.CUDNN_BATCHNORM_SPATIAL;
-
+
int N = (int) image.getNumRows();
int C = (int) scale.getNumColumns();
long CHW = image.getNumColumns();
validateBatchNormalizationDimensions(scale, bias, runningMean, runningVar, C);
-
+
// Allocate descriptors
cudnnTensorDescriptor nCHWDescriptor = allocateNCHWDescriptors(gCtx, N, C, CHW,
new MatrixObject[] {image}, new MatrixObject[] {ret});
cudnnTensorDescriptor scaleTensorDesc = allocateTensorDescriptor(gCtx, scale, 1, C, 1, 1);
-
+
// Get underlying dense pointer
Pointer imagePtr = getDensePointer(gCtx, image, true, instName);
Pointer retPtr = getDensePointer(gCtx, ret, true, instName);
@@ -718,20 +718,20 @@ public class LibMatrixCUDA {
Pointer scalePtr = getDensePointer(gCtx, scale, true, instName);
Pointer runningMeanPtr = getDensePointer(gCtx, runningMean, true, instName);
Pointer runningVarPtr = getDensePointer(gCtx, runningVar, true, instName);
-
+
// To allow for copy-on-write
Pointer retRunningMeanPtr = getDensePointer(gCtx, retRunningMean, true, instName);
Pointer retRunningVarPtr = getDensePointer(gCtx, retRunningVar, true, instName);
cudaMemcpy(retRunningMeanPtr, runningMeanPtr, C * Sizeof.DOUBLE, cudaMemcpyDeviceToDevice);
cudaMemcpy(retRunningVarPtr, runningVarPtr, C * Sizeof.DOUBLE, cudaMemcpyDeviceToDevice);
-
+
// ignoring resultSaveMean and resultSaveVariance as it requires state management
checkStatus(cudnnBatchNormalizationForwardTraining(getCudnnHandle(gCtx), mode, one(), zero(),
nCHWDescriptor, imagePtr, nCHWDescriptor, retPtr,
scaleTensorDesc, scalePtr, biasPtr, exponentialAverageFactor,
retRunningMeanPtr, retRunningVarPtr, epsilon, new Pointer(), new Pointer()));
}
-
+
/**
* Convenient utility for batch normalization that returns a NCHW descriptor
* @param gCtx a valid {@link GPUContext}
@@ -776,8 +776,8 @@ public class LibMatrixCUDA {
H = HW; W = 1; // If not known
double potentialH = Math.sqrt(HW);
if(potentialH == ((int) potentialH)) {
- H = (int) potentialH;
- W = H;
+ H = (int) potentialH;
+ W = H;
}
// We are not sure about H and W, hence don't allocate them.
ret = new cudnnTensorDescriptor();
@@ -786,7 +786,7 @@ public class LibMatrixCUDA {
}
return ret;
}
-
+
/**
* This method computes the backpropagation errors for image, scale and bias of batch normalization layer
* @param gCtx a valid {@link GPUContext}
@@ -805,16 +805,16 @@ public class LibMatrixCUDA {
double epsilon) throws DMLRuntimeException {
LOG.trace("GPU : batchNormalizationBackward" + ", GPUContext=" + gCtx);
int mode = cudnnBatchNormMode.CUDNN_BATCHNORM_SPATIAL;
-
+
int N = (int) image.getNumRows();
int C = (int) scale.getNumColumns();
long CHW = image.getNumColumns();
-
+
// Allocate descriptors
cudnnTensorDescriptor nCHWDescriptor = allocateNCHWDescriptors(gCtx, N, C, CHW,
new MatrixObject[] {image, dout}, new MatrixObject[] {ret});
cudnnTensorDescriptor scaleTensorDesc = allocateTensorDescriptor(gCtx, scale, 1, C, 1, 1);
-
+
// Get underlying dense pointer
Pointer imagePtr = getDensePointer(gCtx, image, true, instName);
Pointer doutPtr = getDensePointer(gCtx, dout, true, instName);
@@ -822,7 +822,7 @@ public class LibMatrixCUDA {
Pointer retPtr = getDensePointer(gCtx, ret, true, instName);
Pointer retScalePtr = getDensePointer(gCtx, retScale, true, instName);
Pointer retBiasPtr = getDensePointer(gCtx, retBias, true, instName);
-
+
// ignoring resultSaveMean and resultSaveVariance as it requires state management
checkStatus(cudnnBatchNormalizationBackward(getCudnnHandle(gCtx), mode, one(), zero(), one(), zero(),
nCHWDescriptor, imagePtr, nCHWDescriptor, doutPtr, nCHWDescriptor, retPtr,
@@ -915,7 +915,7 @@ public class LibMatrixCUDA {
}
}
- private static long numDoublesIn2GB = 125000000;
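+	// 2 GiB / 8 bytes per double = 2^28 = 268435456 doubles (the previous value, 125000000, corresponded to ~1 GB)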
+ private static long numDoublesIn2GB = 268435456;
/**
* This method computes the backpropagation errors for the previous layer of the convolution operation
@@ -961,7 +961,7 @@ public class LibMatrixCUDA {
Pointer w = getDensePointer(gCtx, filter, true, instName);
Pointer dy = getDensePointer(gCtx, dout, true, instName);
Pointer dx = getDensePointer(gCtx, output, true, instName);
-
+
int padding [] = { pad_h, pad_w };
int strides [] = { stride_h, stride_w };
convDesc = allocateConvolutionDescriptor(padding, strides);
@@ -999,7 +999,7 @@ public class LibMatrixCUDA {
if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_CUDNN_CLEANUP, System.nanoTime() - t3);
}
}
-
+
/**
* performs maxpooling on GPU by exploiting cudnnPoolingForward(...)
* @param gCtx a valid {@link GPUContext}
@@ -1029,7 +1029,7 @@ public class LibMatrixCUDA {
cudnnTensorDescriptor xDesc = allocateTensorDescriptor(gCtx, image, N, C, H, W);
performMaxpooling(gCtx, instName, x, xDesc, outputBlock, N, C, H, W, K, R, S, pad_h, pad_w, stride_h, stride_w, P, Q);
}
-
+
public static void performMaxpooling(GPUContext gCtx, String instName, Pointer x, cudnnTensorDescriptor xDesc,
MatrixObject outputBlock, int N, int C, int H, int W, int K, int R,
int S, int pad_h, int pad_w, int stride_h, int stride_w, int P,
@@ -1064,7 +1064,7 @@ public class LibMatrixCUDA {
if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_CUDNN_CLEANUP, System.nanoTime() - t3);
}
}
-
+
/**
* Performs maxpoolingBackward on GPU by exploiting cudnnPoolingBackward(...)
* This method computes the backpropagation errors for the previous layer of the maxpooling operation
@@ -1149,13 +1149,13 @@ public class LibMatrixCUDA {
if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_CUDNN_CLEANUP, System.nanoTime() - t4);
}
}
-
+
private static void performCuDNNReLU(GPUContext gCtx, String instName, MatrixObject in, Pointer dstData, cudnnTensorDescriptor srcTensorDesc) throws DMLRuntimeException {
long t0=0;
try {
LOG.trace("GPU : performCuDNNReLU" + ", GPUContext=" + gCtx);
cudnnTensorDescriptor dstTensorDesc = srcTensorDesc;
-
+
Pointer srcData = getDensePointer(gCtx, in, true, instName);
cudnnActivationDescriptor activationDescriptor = new cudnnActivationDescriptor();
cudnnCreateActivationDescriptor(activationDescriptor);
@@ -1175,7 +1175,7 @@ public class LibMatrixCUDA {
if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_CUDNN_CLEANUP, System.nanoTime() - t1);
}
}
-
+
/**
* Performs the relu operation on the GPU.
@@ -1580,7 +1580,6 @@ public class LibMatrixCUDA {
if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_SPARSE_MATRIX_DENSE_VECTOR_LIB, System.nanoTime() - t1);
output.getGPUObject(gCtx).setDenseMatrixCudaPointer(C_dense);
- output.getGPUObject(gCtx).addReadLock();
}
/**
@@ -1671,7 +1670,6 @@ public class LibMatrixCUDA {
if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_SPARSE_ALLOCATE_LIB, System.nanoTime() - t0);
output.getGPUObject(gCtx).setSparseMatrixCudaPointer(C);
- output.getGPUObject(gCtx).addReadLock();
if (GPUStatistics.DISPLAY_STATISTICS) t1 = System.nanoTime();
cusparseDcsrgemm(getCusparseHandle(gCtx), transA, transB, m, n, k,
@@ -2079,7 +2077,7 @@ public class LibMatrixCUDA {
reduceRow(gCtx, instName, "reduce_row_sum", tmp2, tmpRow, rlen, clen);
ScalarOperator divideOp = new RightScalarOperator(Divide.getDivideFnObject(), clen - 1);
- matrixScalarOp(gCtx, instName, tmpRow, clen - 1, rlen, clen, out, divideOp);
+ matrixScalarOp(gCtx, instName, tmpRow, clen - 1, rlen, 1, out, divideOp);
gCtx.cudaFreeHelper(instName, tmpRow);
@@ -2097,7 +2095,7 @@ public class LibMatrixCUDA {
reduceCol(gCtx, instName, "reduce_col_sum", tmp2, tmpCol, rlen, clen);
ScalarOperator divideOp = new RightScalarOperator(Divide.getDivideFnObject(), rlen - 1);
- matrixScalarOp(gCtx, instName, tmpCol, rlen - 1, rlen, clen, out, divideOp);
+ matrixScalarOp(gCtx, instName, tmpCol, rlen - 1, 1, clen, out, divideOp);
gCtx.cudaFreeHelper(instName, tmpCol);
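These two small changes are the row/column variance fix from the commit title: tmpRow is an rlen x 1 column vector of row sums and tmpCol a 1 x clen row vector of column sums, so the scalar division by (n - 1) must be launched over the vector's own dimensions, (rlen, 1) and (1, clen) respectively, rather than over the full rlen x clen matrix as before.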
@@ -2348,14 +2346,21 @@ public class LibMatrixCUDA {
else if(op.fn instanceof Power) {
setOutputToConstant(ec, gCtx, instName, 1.0, outputName);
}
- else if(op.fn instanceof Divide && isSparseAndEmpty(gCtx, in)) {
- setOutputToConstant(ec, gCtx, instName, Double.NaN, outputName);
- }
- else if(op.fn instanceof Divide) {
- //For division, IEEE 754 defines x/0.0 as INFINITY and 0.0/0.0 as NaN.
- compareAndSet(ec, gCtx, instName, in, outputName, 0.0, 1e-6, Double.NaN, Double.POSITIVE_INFINITY, Double.POSITIVE_INFINITY);
- }
- else {
+ // TODO:
+ // In Java, x/0.0 is +Infinity or -Infinity depending on the sign of x,
+ // and 0.0/0.0 is NaN, so different elements of a matrix can map to
+ // different values. The disabled branches below assumed a single
+ // constant result; if a uniform treatment is required, they need to be
+ // re-enabled and the Java divide-by-zero handling revisited.
+ //else if(op.fn instanceof Divide && isSparseAndEmpty(gCtx, in)) {
+ // setOutputToConstant(ec, gCtx, instName, Double.NaN, outputName);
+ //}
+ //else if(op.fn instanceof Divide) {
+ // //For division, IEEE 754 defines x/0.0 as INFINITY and 0.0/0.0 as NaN.
+ // compareAndSet(ec, gCtx, instName, in, outputName, 0.0, 1e-6, Double.NaN, Double.POSITIVE_INFINITY, Double.POSITIVE_INFINITY);
+ //}
+ else {
// TODO: Potential to optimize
matrixScalarOp(ec, gCtx, instName, in, outputName, isInputTransposed, op);
}
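For reference, the Java behaviour that motivated disabling the branch (plain Java, runnable as-is):

    public class DivBySigns {
        public static void main(String[] args) {
            System.out.println( 1.0 / 0.0);  // Infinity
            System.out.println(-1.0 / 0.0);  // -Infinity
            System.out.println( 0.0 / 0.0);  // NaN
        }
    }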
@@ -2790,7 +2795,6 @@ public class LibMatrixCUDA {
CSRPointer B = in2.getGPUObject(gCtx).getJcudaSparseMatrixPtr();
ec.allocateGPUMatrixObject(outputName);
- out.getGPUObject(gCtx).addReadLock();
if (in1 == in2 && isLeftTransposed == true && isLeftTransposed == isRightTransposed) {
// Special case for transpose
@@ -3160,7 +3164,7 @@ public class LibMatrixCUDA {
MatrixObject out = ec.getMatrixObject(outputName);
getDenseMatrixOutputForGPUInstruction(ec, instName, outputName); // Allocated the dense output matrix
Pointer C = getDensePointer(gCtx, out, instName);
-
+
long t1=0, t2=0;
if(in1.getNumRows() == in2.getNumRows() && in1.getNumColumns() == in2.getNumColumns()) {
LOG.trace("GPU : cublasDaxpy" + ", GPUContext=" + gCtx);
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/772fb588/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDatagen.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDatagen.java b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDatagen.java
index fb62c41..bb0a9a8 100644
--- a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDatagen.java
+++ b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDatagen.java
@@ -45,9 +45,6 @@ public class LibMatrixDatagen
{
private static final Log LOG = LogFactory.getLog(LibMatrixDatagen.class.getName());
private static final long PAR_NUMCELL_THRESHOLD = 512*1024; //Min 500k elements
- public static final String RAND_PDF_UNIFORM = "uniform";
- public static final String RAND_PDF_NORMAL = "normal";
- public static final String RAND_PDF_POISSON = "poisson";
private static IDSequence _seqRandInput = new IDSequence();
@@ -55,9 +52,9 @@ public class LibMatrixDatagen
//prevent instantiation via private constructor
}
- public static boolean isShortcutRandOperation( double min, double max, double sparsity, String pdf )
+ public static boolean isShortcutRandOperation( double min, double max, double sparsity, RandomMatrixGenerator.PDF pdf )
{
- return pdf.equalsIgnoreCase(RAND_PDF_UNIFORM)
+ return pdf == RandomMatrixGenerator.PDF.UNIFORM
&& ( ( min == 0.0 && max == 0.0 ) //all zeros
||( sparsity==1.0d && min == max )); //equal values
}
@@ -137,27 +134,31 @@ public class LibMatrixDatagen
}
}
- public static RandomMatrixGenerator createRandomMatrixGenerator(String pdf, int r, int c, int rpb, int cpb, double sp, double min, double max, String distParams)
+ public static RandomMatrixGenerator createRandomMatrixGenerator(String pdfStr, int r, int c, int rpb, int cpb, double sp, double min, double max, String distParams)
throws DMLRuntimeException
{
- RandomMatrixGenerator rgen = null;
-
- if ( pdf.equalsIgnoreCase(RAND_PDF_UNIFORM))
- rgen = new RandomMatrixGenerator(pdf, r, c, rpb, cpb, sp, min, max);
- else if ( pdf.equalsIgnoreCase(RAND_PDF_NORMAL))
- rgen = new RandomMatrixGenerator(pdf, r, c, rpb, cpb, sp);
- else if ( pdf.equalsIgnoreCase(RAND_PDF_POISSON))
- {
- double mean = Double.NaN;
- try {
- mean = Double.parseDouble(distParams);
- } catch(NumberFormatException e) {
- throw new DMLRuntimeException("Failed to parse Poisson distribution parameter: " + distParams);
- }
- rgen = new RandomMatrixGenerator(pdf, r, c, rpb, cpb, sp, min, max, mean);
- }
- else
- throw new DMLRuntimeException("Unsupported probability distribution \"" + pdf + "\" in rand() -- it must be one of \"uniform\", \"normal\", or \"poisson\"");
+ RandomMatrixGenerator.PDF pdf = RandomMatrixGenerator.PDF.valueOf(pdfStr.toUpperCase());
+ RandomMatrixGenerator rgen = null;
+ switch (pdf) {
+ case UNIFORM:
+ rgen = new RandomMatrixGenerator(pdf, r, c, rpb, cpb, sp, min, max);
+ break;
+ case NORMAL:
+ rgen = new RandomMatrixGenerator(pdf, r, c, rpb, cpb, sp);
+ break;
+ case POISSON:
+ double mean = Double.NaN;
+ try {
+ mean = Double.parseDouble(distParams);
+ } catch (NumberFormatException e) {
+ throw new DMLRuntimeException("Failed to parse Poisson distribution parameter: " + distParams);
+ }
+ rgen = new RandomMatrixGenerator(pdf, r, c, rpb, cpb, sp, min, max, mean);
+ break;
+ default:
+ throw new DMLRuntimeException("Unsupported probability distribution \"" + pdf + "\" in rand() -- it must be one of \"uniform\", \"normal\", or \"poisson\"");
+
+ }
return rgen;
}
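One caveat worth noting: PDF.valueOf(pdfStr.toUpperCase()) throws IllegalArgumentException for an unrecognized name, so the default branch's friendlier DMLRuntimeException is only reachable if a new enum constant is added without a matching case. A hypothetical call against the String-based entry point (argument values invented for illustration):

    // Hypothetical usage sketch; values are illustrative only.
    RandomMatrixGenerator rgen = LibMatrixDatagen.createRandomMatrixGenerator(
            "uniform",  // pdfStr, case-insensitive
            1000, 1000, // rows, cols
            1000, 1000, // rows/cols per block
            0.9,        // sparsity
            0.0, 1.0,   // min, max
            null);      // distParams, only parsed for "poisson"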
@@ -202,11 +203,11 @@ public class LibMatrixDatagen
* (max-min)*prng.nextDouble() is still valid. This is done primarily to
* share the same code across different distributions.
*/
- double min = rgen._pdf.equalsIgnoreCase(RAND_PDF_UNIFORM) ? rgen._min : 0;
- double max = rgen._pdf.equalsIgnoreCase(RAND_PDF_UNIFORM) ? rgen._max : 1;
+ double min = rgen._pdf == RandomMatrixGenerator.PDF.UNIFORM ? rgen._min : 0;
+ double max = rgen._pdf == RandomMatrixGenerator.PDF.UNIFORM ? rgen._max : 1;
// Special case shortcuts for efficiency
- if ( rgen._pdf.equalsIgnoreCase(RAND_PDF_UNIFORM)) {
+ if ( rgen._pdf == RandomMatrixGenerator.PDF.UNIFORM) {
if ( min == 0.0 && max == 0.0 ) { //all zeros
out.reset(rows, cols, true);
return;
@@ -288,8 +289,8 @@ public class LibMatrixDatagen
* (max-min)*prng.nextDouble() is still valid. This is done primarily to
* share the same code across different distributions.
*/
- double min = rgen._pdf.equalsIgnoreCase(RAND_PDF_UNIFORM) ? rgen._min : 0;
- double max = rgen._pdf.equalsIgnoreCase(RAND_PDF_UNIFORM) ? rgen._max : 1;
+ double min = rgen._pdf == RandomMatrixGenerator.PDF.UNIFORM ? rgen._min : 0;
+ double max = rgen._pdf == RandomMatrixGenerator.PDF.UNIFORM ? rgen._max : 1;
//determine the sparsity of output matrix (multi-threaded always invoked from CP):
//estimated NNZ is for entire matrix (nnz=0, if 0 initialized)
@@ -304,7 +305,7 @@ public class LibMatrixDatagen
}
//special case shortcuts for efficiency
- if ( rgen._pdf.equalsIgnoreCase(RAND_PDF_UNIFORM)) {
+ if ( rgen._pdf == RandomMatrixGenerator.PDF.UNIFORM) {
if ( min == 0.0 && max == 0.0 ) { //all zeros
out.reset(rows, cols, false);
return;
@@ -497,8 +498,8 @@ public class LibMatrixDatagen
int cpb = rgen._colsPerBlock;
double sparsity = rgen._sparsity;
PRNGenerator valuePRNG = rgen._valuePRNG;
- double min = rgen._pdf.equalsIgnoreCase(RAND_PDF_UNIFORM) ? rgen._min : 0;
- double max = rgen._pdf.equalsIgnoreCase(RAND_PDF_UNIFORM) ? rgen._max : 1;
+ double min = rgen._pdf == RandomMatrixGenerator.PDF.UNIFORM ? rgen._min : 0;
+ double max = rgen._pdf == RandomMatrixGenerator.PDF.UNIFORM ? rgen._max : 1;
double range = max - min;
int clen = out.clen;
int estimatedNNzsPerRow = out.estimatedNNzsPerRow;
@@ -510,14 +511,19 @@ public class LibMatrixDatagen
// Setup Pseudo Random Number Generator for cell values based on 'pdf'.
if (valuePRNG == null) {
- if ( rgen._pdf.equalsIgnoreCase(RAND_PDF_UNIFORM))
+ switch (rgen._pdf) {
+ case UNIFORM:
valuePRNG = new UniformPRNGenerator();
- else if ( rgen._pdf.equalsIgnoreCase(RAND_PDF_NORMAL))
+ break;
+ case NORMAL:
valuePRNG = new NormalPRNGenerator();
- else if ( rgen._pdf.equalsIgnoreCase(RAND_PDF_POISSON))
+ break;
+ case POISSON:
valuePRNG = new PoissonPRNGenerator();
- else
+ break;
+ default:
throw new DMLRuntimeException("Unsupported distribution function for Rand: " + rgen._pdf);
+ }
}
// loop through row-block indices
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/772fb588/src/main/java/org/apache/sysml/runtime/matrix/data/RandomMatrixGenerator.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/matrix/data/RandomMatrixGenerator.java b/src/main/java/org/apache/sysml/runtime/matrix/data/RandomMatrixGenerator.java
index 1dc818c..b4c6c95 100644
--- a/src/main/java/org/apache/sysml/runtime/matrix/data/RandomMatrixGenerator.java
+++ b/src/main/java/org/apache/sysml/runtime/matrix/data/RandomMatrixGenerator.java
@@ -26,8 +26,15 @@ import org.apache.sysml.runtime.util.PoissonPRNGenerator;
import org.apache.sysml.runtime.util.UniformPRNGenerator;
public class RandomMatrixGenerator {
-
- String _pdf;
+
+ /**
+ * Types of Probability density functions
+ */
+ enum PDF {
+ NORMAL, UNIFORM, POISSON
+ }
+
+ PDF _pdf;
int _rows, _cols, _rowsPerBlock, _colsPerBlock;
double _sparsity, _mean;
double _min, _max;
@@ -36,25 +43,76 @@ public class RandomMatrixGenerator {
public RandomMatrixGenerator()
{
- _pdf = "";
+ _pdf = PDF.UNIFORM;
_rows = _cols = _rowsPerBlock = _colsPerBlock = -1;
_sparsity = 0.0;
_min = _max = Double.NaN;
_valuePRNG = null;
_mean = 1.0;
}
-
- public RandomMatrixGenerator(String pdf, int r, int c, int rpb, int cpb, double sp) throws DMLRuntimeException
+
+ /**
+ * Instantiates a random matrix generator
+ * @param pdf probability density function
+ * @param r number of rows
+ * @param c number of columns
+ * @param rpb rows per block
+ * @param cpb columns per block
+ * @param sp sparsity (0 = completely sparse, 1 = completely dense)
+ * @throws DMLRuntimeException if error
+ */
+ public RandomMatrixGenerator(PDF pdf, int r, int c, int rpb, int cpb, double sp) throws DMLRuntimeException
{
this(pdf, r, c, rpb, cpb, sp, Double.NaN, Double.NaN);
}
-
- public RandomMatrixGenerator(String pdf, int r, int c, int rpb, int cpb, double sp, double min, double max) throws DMLRuntimeException
+
+ /**
+ * Instantiates a random number generator
+ * @param pdfStr name of the probability density function (case-insensitive)
+ * @param r number of rows
+ * @param c number of columns
+ * @param rpb rows per block
+ * @param cpb columns per block
+ * @param sp sparsity (0 = completely sparse, 1 = completely dense)
+ * @param min minimum of range of random numbers
+ * @param max maximum of range of random numbers
+ * @throws DMLRuntimeException if an error occurs
+ */
+ public RandomMatrixGenerator(String pdfStr, int r, int c, int rpb, int cpb, double sp, double min, double max) throws DMLRuntimeException
+ {
+ init(PDF.valueOf(pdfStr.toUpperCase()), r, c, rpb, cpb, sp, min, max);
+ }
+
+ /**
+ * Instantiates a random number generator
+ * @param pdf probability density function
+ * @param r number of rows
+ * @param c number of columns
+ * @param rpb rows per block
+ * @param cpb columns per block
+ * @param sp sparsity (0 = completely sparse, 1 = completely dense)
+ * @param min minimum of range of random numbers
+ * @param max maximum of range of random numbers
+ * @throws DMLRuntimeException if an error occurs
+ */
+ public RandomMatrixGenerator(PDF pdf, int r, int c, int rpb, int cpb, double sp, double min, double max) throws DMLRuntimeException
{
init(pdf, r, c, rpb, cpb, sp, min, max);
}
-
- public void init(String pdf, int r, int c, int rpb, int cpb, double sp, double min, double max) throws DMLRuntimeException
+
+ /**
+ * Initializes internal data structures. Called by the constructor.
+ * @param pdf probability density function
+ * @param r number of rows
+ * @param c number of columns
+ * @param rpb rows per block
+ * @param cpb columns per block
+ * @param sp sparsity (0 = completely sparse, 1 = completely dense)
+ * @param min minimum of range of random numbers
+ * @param max maximum of range of random numbers
+ * @throws DMLRuntimeException if an error occurs
+ */
+ public void init(PDF pdf, int r, int c, int rpb, int cpb, double sp, double min, double max) throws DMLRuntimeException
{
_pdf = pdf;
_rows = r;
@@ -67,13 +125,39 @@ public class RandomMatrixGenerator {
setupValuePRNG();
}
-
- public RandomMatrixGenerator(String pdf, int r, int c, int rpb, int cpb, double sp, double min, double max, double mean) throws DMLRuntimeException
+
+ /**
+ * Instantiates a random number generator with a specific Poisson mean
+ * @param pdf probability density function
+ * @param r number of rows
+ * @param c number of columns
+ * @param rpb rows per block
+ * @param cpb columns per block
+ * @param sp sparsity (0 = completely sparse, 1 = completely dense)
+ * @param min minimum of range of random numbers
+ * @param max maximum of range of random numbers
+ * @param mean the Poisson mean
+ * @throws DMLRuntimeException if an error occurs
+ */
+ public RandomMatrixGenerator(PDF pdf, int r, int c, int rpb, int cpb, double sp, double min, double max, double mean) throws DMLRuntimeException
{
init(pdf, r, c, rpb, cpb, sp, min, max, mean);
}
-
- public void init(String pdf, int r, int c, int rpb, int cpb, double sp, double min, double max, double mean) throws DMLRuntimeException
+
+ /**
+ * Initializes internal data structures with a specific Poisson mean. Called by the constructor.
+ * @param pdf probability density function
+ * @param r number of rows
+ * @param c number of columns
+ * @param rpb rows per block
+ * @param cpb columns per block
+ * @param sp sparsity (0 = completely sparse, 1 = completely dense)
+ * @param min minimum of range of random numbers
+ * @param max maximum of range of random numbers
+ * @param mean the Poisson mean
+ * @throws DMLRuntimeException if an error occurs
+ */
+ public void init(PDF pdf, int r, int c, int rpb, int cpb, double sp, double min, double max, double mean) throws DMLRuntimeException
{
_pdf = pdf;
_rows = r;
@@ -89,15 +173,20 @@ public class RandomMatrixGenerator {
protected void setupValuePRNG() throws DMLRuntimeException
{
- if ( _pdf.equalsIgnoreCase(LibMatrixDatagen.RAND_PDF_NORMAL) )
+ switch (_pdf) {
+ case NORMAL:
_valuePRNG = new NormalPRNGenerator();
- else if ( _pdf.equalsIgnoreCase(LibMatrixDatagen.RAND_PDF_UNIFORM) )
+ break;
+ case UNIFORM:
_valuePRNG = new UniformPRNGenerator();
- else if ( _pdf.equalsIgnoreCase(LibMatrixDatagen.RAND_PDF_POISSON) )
- {
+ break;
+ case POISSON:
if(_mean <= 0)
throw new DMLRuntimeException("Invalid parameter (" + _mean + ") for Poisson distribution.");
_valuePRNG = new PoissonPRNGenerator(_mean);
+ break;
+ default:
+ throw new DMLRuntimeException("Unsupported probability density function");
}
}
}
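With the enum in place, new call sites can construct the generator type-safely while legacy string call sites continue to work. A hedged usage sketch (arbitrary illustration values; assumes a caller in the same package, since PDF is declared package-private, and both constructors declare throws DMLRuntimeException):

    // Enum-based construction (constructor added in this diff):
    RandomMatrixGenerator rgen = new RandomMatrixGenerator(
        RandomMatrixGenerator.PDF.UNIFORM, // distribution
        1000, 1000,                        // rows, cols
        1000, 1000,                        // rows/cols per block
        0.3,                               // sparsity
        -1.0, 1.0);                        // min, max (used only for UNIFORM)

    // Legacy string-based construction: the name is upper-cased and resolved
    // via PDF.valueOf, so "uniform", "Uniform" and "UNIFORM" are equivalent;
    // an unknown name now throws IllegalArgumentException from valueOf.
    RandomMatrixGenerator legacy = new RandomMatrixGenerator(
        "uniform", 1000, 1000, 1000, 1000, 0.3, -1.0, 1.0);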
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/772fb588/src/test/java/org/apache/sysml/test/gpu/AggregateUnaryOpTests.java
----------------------------------------------------------------------
diff --git a/src/test/java/org/apache/sysml/test/gpu/AggregateUnaryOpTests.java b/src/test/java/org/apache/sysml/test/gpu/AggregateUnaryOpTests.java
new file mode 100644
index 0000000..0b229f0
--- /dev/null
+++ b/src/test/java/org/apache/sysml/test/gpu/AggregateUnaryOpTests.java
@@ -0,0 +1,133 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.sysml.test.gpu;
+
+import org.apache.sysml.test.utils.TestUtils;
+import org.junit.Test;
+
+/**
+ * Tests Aggregate Unary ops
+ */
+public class AggregateUnaryOpTests extends UnaryOpTestsBase {
+
+ private final static String TEST_NAME = "AggregateUnaryOpTests";
+
+ @Override
+ public void setUp() {
+ TestUtils.clearAssertionInformation();
+ addTestConfiguration(TEST_DIR, TEST_NAME);
+ getAndLoadTestConfiguration(TEST_NAME);
+ }
+
+ @Test
+ public void sum() {
+ testSimpleUnaryOpMatrixOutput("sum", "gpu_uak+");
+ }
+
+ @Test
+ public void colSums() {
+ testSimpleUnaryOpMatrixOutput("colSums", "gpu_uack+");
+ }
+
+ @Test
+ public void rowSums() {
+ testSimpleUnaryOpMatrixOutput("rowSums", "gpu_uark+");
+ }
+
+ @Test
+ public void mult() {
+ testSimpleUnaryOpMatrixOutput("prod", "gpu_ua*");
+ }
+
+ @Test
+ public void mean() {
+ testSimpleUnaryOpMatrixOutput("mean", "gpu_uamean");
+ }
+
+ @Test
+ public void colMeans() {
+ testSimpleUnaryOpMatrixOutput("colMeans", "gpu_uacmean");
+ }
+
+ @Test
+ public void rowMeans() {
+ testSimpleUnaryOpMatrixOutput("rowMeans", "gpu_uarmean");
+ }
+
+ @Test
+ public void max() {
+ testSimpleUnaryOpMatrixOutput("max", "gpu_uamax");
+ }
+
+ @Test
+ public void rowMaxs() {
+ testSimpleUnaryOpMatrixOutput("rowMaxs", "gpu_uarmax");
+ }
+
+ @Test
+ public void colMaxs() {
+ testSimpleUnaryOpMatrixOutput("colMaxs", "gpu_uacmax");
+ }
+
+ @Test
+ public void min() {
+ testSimpleUnaryOpMatrixOutput("min", "gpu_uamin");
+ }
+
+ @Test
+ public void rowMins() {
+ testSimpleUnaryOpMatrixOutput("rowMins", "gpu_uarmin");
+ }
+
+ @Test
+ public void colMins() {
+ testSimpleUnaryOpMatrixOutput("colMins", "gpu_uacmin");
+ }
+
+ @Test
+ public void var() {
+ testSimpleUnaryOpMatrixOutput("var", "gpu_uavar");
+ }
+
+ @Test
+ public void colVars() {
+ testSimpleUnaryOpMatrixOutput("colVars", "gpu_uacvar");
+ }
+
+ @Test
+ public void rowVars() {
+ testSimpleUnaryOpMatrixOutput("rowVars", "gpu_uarvar");
+ }
+
+ @Test
+ public void sumsq() {
+ testUnaryOpMatrixOutput("out = sum(in1*in1)", "gpu_uasqk+", "in1", "out");
+ }
+
+ @Test
+ public void rowSumsqs() {
+ testUnaryOpMatrixOutput("out = rowSums(in1*in1)", "gpu_uarsqk+", "in1", "out");
+ }
+
+ @Test
+ public void colSumsqs() {
+ testUnaryOpMatrixOutput("out = colSums(in1*in1)", "gpu_uacsqk+", "in1", "out");
+ }
+}
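A note on the opcode naming used above (inferred from SystemML's instruction conventions, not stated in this patch): the gpu_ prefix marks the GPU variant, "ua" stands for unary aggregate, an optional "r"/"c" restricts the aggregation to rows/columns, "sq" squares the inputs first, and the trailing token names the aggregate itself ("k+" for Kahan-compensated sum, "*" for product, "mean", "max", "min", "var").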
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/772fb588/src/test/java/org/apache/sysml/test/gpu/BinaryOpTests.java
----------------------------------------------------------------------
diff --git a/src/test/java/org/apache/sysml/test/gpu/BinaryOpTests.java b/src/test/java/org/apache/sysml/test/gpu/BinaryOpTests.java
new file mode 100644
index 0000000..f3d2b21
--- /dev/null
+++ b/src/test/java/org/apache/sysml/test/gpu/BinaryOpTests.java
@@ -0,0 +1,85 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.sysml.test.gpu;
+
+import java.util.Arrays;
+import java.util.HashMap;
+import java.util.List;
+
+import org.apache.sysml.api.mlcontext.Matrix;
+import org.apache.sysml.test.utils.TestUtils;
+import org.junit.Test;
+
+/**
+ * Tests builtin binary ops on GPU
+ */
+public class BinaryOpTests extends GPUTests {
+
+ private final static String TEST_NAME = "BinaryOpTests";
+ private final int seed = 42;
+
+ @Override
+ public void setUp() {
+ TestUtils.clearAssertionInformation();
+ addTestConfiguration(TEST_DIR, TEST_NAME);
+ getAndLoadTestConfiguration(TEST_NAME);
+ }
+
+ @Test
+ public void testSolve() {
+ // Test Ax = b
+ // Dimensions of A = m * n
+ // Dimensions of x = n * 1
+ // Dimensions of b = m * 1
+
+ double sparsity = 1.0; // Only dense matrices supported by "solve"
+ final int[] sides = { 32, 33, 128, 256, 513, 2049 };
+ for (int i = 0; i < sides.length; i++) {
+ for (int j = i; j < sides.length; j++) {
+ int m = sides[j];
+ int n = sides[i];
+ runSolveTest(sparsity, m, n);
+ }
+ }
+
+ }
+
+ /**
+ * Runs the test for solve (Ax = b) for inputs with the given dimensions and sparsity.
+ * A can be overdetermined (rows in A > columns in A).
+ *
+ * @param sparsity sparsity for the blocks A and b
+ * @param m rows in A
+ * @param n columns in A
+ */
+ protected void runSolveTest(double sparsity, int m, int n) {
+ String scriptStr = "x = solve(A, b)";
+ System.out.println("In solve, A[" + m + ", " + n + "], b[" + m + ", 1]");
+ Matrix A = generateInputMatrix(spark, m, n, sparsity, seed);
+ Matrix b = generateInputMatrix(spark, m, 1, sparsity, seed);
+ HashMap<String, Object> inputs = new HashMap<>();
+ inputs.put("A", A);
+ inputs.put("b", b);
+ List<Object> outCPU = runOnCPU(spark, scriptStr, inputs, Arrays.asList("x"));
+ List<Object> outGPU = runOnGPU(spark, scriptStr, inputs, Arrays.asList("x"));
+ assertHeavyHitterPresent("gpu_solve");
+ assertEqualObjects(outCPU.get(0), outGPU.get(0));
+ }
+}
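For intuition about what the CPU/GPU comparison validates: a solve on an overdetermined system can also be sanity-checked through its residual norm ||Ax - b||. A self-contained sketch in plain Java (hypothetical dense arrays, independent of SystemML's Matrix type; not part of the test suite):

    /** Euclidean norm of the residual A*x - b; a small value indicates a
     *  good (least-squares) solution for an overdetermined system. */
    static double residualNorm(double[][] A, double[] x, double[] b) {
        double sumSq = 0.0;
        for (int i = 0; i < A.length; i++) {
            double r = -b[i];
            for (int j = 0; j < A[i].length; j++)
                r += A[i][j] * x[j];
            sumSq += r * r;
        }
        return Math.sqrt(sumSq);
    }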
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/772fb588/src/test/java/org/apache/sysml/test/gpu/GPUTests.java
----------------------------------------------------------------------
diff --git a/src/test/java/org/apache/sysml/test/gpu/GPUTests.java b/src/test/java/org/apache/sysml/test/gpu/GPUTests.java
new file mode 100644
index 0000000..8af9104
--- /dev/null
+++ b/src/test/java/org/apache/sysml/test/gpu/GPUTests.java
@@ -0,0 +1,250 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.sysml.test.gpu;
+
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+
+import org.apache.spark.sql.SparkSession;
+import org.apache.sysml.api.mlcontext.MLContext;
+import org.apache.sysml.api.mlcontext.Matrix;
+import org.apache.sysml.api.mlcontext.Script;
+import org.apache.sysml.api.mlcontext.ScriptFactory;
+import org.apache.sysml.runtime.DMLRuntimeException;
+import org.apache.sysml.runtime.instructions.gpu.context.GPUContext;
+import org.apache.sysml.runtime.instructions.gpu.context.GPUContextPool;
+import org.apache.sysml.runtime.matrix.data.MatrixBlock;
+import org.apache.sysml.test.integration.AutomatedTestBase;
+import org.apache.sysml.utils.Statistics;
+import org.junit.After;
+import org.junit.AfterClass;
+import org.junit.Assert;
+import org.junit.BeforeClass;
+
+/**
+ * Parent class for all GPU tests
+ */
+public abstract class GPUTests extends AutomatedTestBase {
+
+ protected final static String TEST_DIR = "org/apache/sysml/api/mlcontext";
+ protected static SparkSession spark;
+ protected final double THRESHOLD = 1e-9; // for relative error
+
+ @BeforeClass
+ public static void beforeClass() {
+ spark = createSystemMLSparkSession("GPUTests", "local");
+ }
+
+ @AfterClass
+ public static void afterClass() {
+ spark.close();
+ }
+
+ /**
+ * Gets threshold for relative error in tests
+ *
+ * @return the relative-error threshold
+ */
+ protected double getTHRESHOLD() {
+ return THRESHOLD;
+ }
+
+ @After
+ public void tearDown() {
+ clearGPUMemory();
+ super.tearDown();
+ }
+
+ /**
+ * Clear out the memory on all GPUs
+ */
+ protected void clearGPUMemory() {
+ try {
+ int count = GPUContextPool.getDeviceCount();
+ int freeCount = GPUContextPool.getAvailableCount();
+ Assert.assertTrue("All GPUContexts have not been returned to the GPUContextPool", count == freeCount);
+ ArrayList<GPUContext> gpuContexts = new ArrayList<>();
+ for (int i = 0; i < count; i++) {
+ GPUContext gCtx = GPUContextPool.getFromPool();
+ gCtx.initializeThread();
+ gCtx.clearMemory();
+ gpuContexts.add(gCtx);
+ }
+ for (GPUContext gCtx : gpuContexts) {
+ GPUContextPool.returnToPool(gCtx);
+ }
+
+ } catch (DMLRuntimeException e) {
+ // Ignore
+ }
+ }
+
+ /**
+ * Generates a random input matrix with a given size and sparsity
+ *
+ * @param spark valid instance of {@link SparkSession}
+ * @param m number of rows
+ * @param n number of columns
+ * @param sparsity sparsity (1 = completely dense, 0 = completely sparse)
+ * @param seed seed for the random number generator
+ * @return a random matrix with given size and sparsity
+ */
+ protected Matrix generateInputMatrix(SparkSession spark, int m, int n, double sparsity, int seed) {
+ // Generate a random matrix of size m * n
+ MLContext genMLC = new MLContext(spark);
+ String scriptStr;
+ if (sparsity == 0.0) {
+ scriptStr = "in1 = matrix(0, rows=" + m + ", cols=" + n + ")";
+ } else {
+ scriptStr = "in1 = rand(rows=" + m + ", cols=" + n + ", sparsity = " + sparsity + ", seed= " + seed
+ + ", min=-1.0, max=1.0)";
+ }
+ Script generateScript = ScriptFactory.dmlFromString(scriptStr).out("in1");
+ Matrix in1 = genMLC.execute(generateScript).getMatrix("in1");
+ genMLC.close();
+ return in1;
+ }
+
+ /**
+ * Asserts that the values in two matrices are within {@link GPUTests#THRESHOLD} (relative error) of each other
+ *
+ * @param expected expected matrix
+ * @param actual actual matrix
+ */
+ private void assertEqualMatrices(Matrix expected, Matrix actual) {
+ try {
+ MatrixBlock expectedMB = expected.toMatrixObject().acquireRead();
+ MatrixBlock actualMB = actual.toMatrixObject().acquireRead();
+
+ long rows = expectedMB.getNumRows();
+ long cols = expectedMB.getNumColumns();
+ Assert.assertEquals(rows, actualMB.getNumRows());
+ Assert.assertEquals(cols, actualMB.getNumColumns());
+
+ for (int i = 0; i < rows; i++) {
+ for (int j = 0; j < cols; j++) {
+ double expectedDouble = expectedMB.quickGetValue(i, j);
+ double actualDouble = actualMB.quickGetValue(i, j);
+ if (expectedDouble != 0.0 && !Double.isNaN(expectedDouble) && Double.isFinite(expectedDouble)) {
+ double relativeError = Math.abs((expectedDouble - actualDouble) / expectedDouble);
+ Assert.assertTrue("Comparing floating point numbers, relative error(" + relativeError
+ + ") is more than threshold (" + getTHRESHOLD() + ")", relativeError < getTHRESHOLD());
+ } else {
+ Assert.assertEquals(expectedDouble, actualDouble, getTHRESHOLD());
+ }
+ }
+ }
+ expected.toMatrixObject().release();
+ actual.toMatrixObject().release();
+ } catch (DMLRuntimeException e) {
+ throw new RuntimeException(e);
+ }
+ }
+
+ /**
+ * Asserts that the expected op was executed
+ *
+ * @param heavyHitterOpCode opcode of the expected heavy hitter
+ */
+ protected void assertHeavyHitterPresent(String heavyHitterOpCode) {
+ Set<String> heavyHitterOpCodes = Statistics.getCPHeavyHitterOpCodes();
+ Assert.assertTrue(heavyHitterOpCodes.contains(heavyHitterOpCode));
+ }
+
+ /**
+ * Runs a program on the CPU
+ *
+ * @param spark a valid {@link SparkSession}
+ * @param scriptStr the script to run (as a string)
+ * @param inputs map of input variable names in scriptStr to their values (variable name -> object)
+ * @param outStrs list of variable names needed as output from scriptStr
+ * @return list of output objects in order of outStrs
+ */
+ protected List<Object> runOnCPU(SparkSession spark, String scriptStr, Map<String, Object> inputs,
+ List<String> outStrs) {
+ MLContext cpuMLC = new MLContext(spark);
+ List<Object> outputs = new ArrayList<>();
+ Script script = ScriptFactory.dmlFromString(scriptStr).in(inputs).out(outStrs);
+ for (String outStr : outStrs) {
+ Object output = cpuMLC.execute(script).get(outStr);
+ outputs.add(output);
+ }
+ cpuMLC.close();
+ return outputs;
+ }
+
+ /**
+ * Runs a program on the GPU
+ *
+ * @param spark a valid {@link SparkSession}
+ * @param scriptStr the script to run (as a string)
+ * @param inputs map of input variable names in scriptStr to their values (variable name -> object)
+ * @param outStrs list of variable names needed as output from scriptStr
+ * @return list of output objects in order of outStrs
+ */
+ protected List<Object> runOnGPU(SparkSession spark, String scriptStr, Map<String, Object> inputs,
+ List<String> outStrs) {
+ MLContext gpuMLC = new MLContext(spark);
+ gpuMLC.setGPU(true);
+ gpuMLC.setForceGPU(true);
+ gpuMLC.setStatistics(true);
+ List<Object> outputs = new ArrayList<>();
+ Script script = ScriptFactory.dmlFromString(scriptStr).in(inputs).out(outStrs);
+ for (String outStr : outStrs) {
+ Object output = gpuMLC.execute(script).get(outStr);
+ outputs.add(output);
+ }
+ gpuMLC.close();
+ return outputs;
+ }
+
+ /**
+ * Asserts that the two objects are equal. Supported types are Boolean, Integer, String, Double and Matrix.
+ *
+ * @param expected expected value
+ * @param actual actual value
+ */
+ protected void assertEqualObjects(Object expected, Object actual) {
+ Assert.assertEquals(expected.getClass(), actual.getClass());
+
+ if (expected instanceof Boolean) {
+ Assert.assertEquals(((Boolean) expected).booleanValue(), ((Boolean) actual).booleanValue());
+ } else if (expected instanceof Double) {
+ double expectedDouble = ((Double) expected).doubleValue();
+ double actualDouble = ((Double) actual).doubleValue();
+ if (expectedDouble != 0.0 && !Double.isNaN(expectedDouble) && Double.isFinite(expectedDouble)) {
+ double relativeError = Math.abs((expectedDouble - actualDouble) / expectedDouble);
+ Assert.assertTrue("Comparing floating point numbers, relative error(" + relativeError
+ + ") is more than threshold (" + getTHRESHOLD() + ")", relativeError < getTHRESHOLD());
+ } else {
+ Assert.assertEquals(expectedDouble, actualDouble, getTHRESHOLD());
+ }
+ } else if (expected instanceof String) {
+ Assert.assertEquals(expected.toString(), actual.toString());
+ } else if (expected instanceof Integer) {
+ Assert.assertEquals(((Integer) expected).intValue(), ((Integer) actual).intValue());
+ } else if (expected instanceof Matrix)
+ assertEqualMatrices((Matrix) expected, (Matrix) actual);
+ else {
+ Assert.fail("Invalid types for comparison");
+ }
+ }
+}
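The cell-wise comparison in assertEqualMatrices and the scalar path in assertEqualObjects reduce to a single predicate: relative error against THRESHOLD for finite, non-zero expected values, and absolute tolerance otherwise. A condensed restatement of that logic (a sketch mirroring the code above, not an additional API):

    /** Mirrors the comparison above: relative error for finite, non-zero
     *  expected values; absolute tolerance (as in assertEquals(e, a, eps))
     *  otherwise. */
    static boolean closeEnough(double expected, double actual, double eps) {
        if (expected != 0.0 && !Double.isNaN(expected) && Double.isFinite(expected))
            return Math.abs((expected - actual) / expected) < eps;
        return Math.abs(expected - actual) <= eps;
    }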