You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@systemds.apache.org by mb...@apache.org on 2020/07/28 12:07:00 UTC

[systemds] branch master updated: [SYSTEMDS-2592] New built-in function cor (correlation matrix)

This is an automated email from the ASF dual-hosted git repository.

mboehm7 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/systemds.git


The following commit(s) were added to refs/heads/master by this push:
     new c14430a  [SYSTEMDS-2592] New built-in function cor (correlation matrix)
c14430a is described below

commit c14430a1895855422c4ced0b81089eb16eed2d00
Author: Olga Ovcharenko <ol...@student.tugraz.at>
AuthorDate: Tue Jul 28 14:06:05 2020 +0200

    [SYSTEMDS-2592] New built-in function cor (correlation matrix)
    
    Closes #1002.
---
 docs/site/run_issues.md                            |  6 ++
 pom.xml                                            | 13 ++--
 scripts/builtin/cor.dml                            | 25 +++++++
 .../java/org/apache/sysds/common/Builtins.java     | 43 +++++------
 .../builtin/BuiltinCorrelationMatrixTest.java      | 87 ++++++++++++++++++++++
 .../scripts/functions/builtin/correlationMatrix.R  | 27 +++++++
 .../functions/builtin/correlationMatrix.dml        | 24 ++++++
 7 files changed, 198 insertions(+), 27 deletions(-)

diff --git a/docs/site/run_issues.md b/docs/site/run_issues.md
new file mode 100644
index 0000000..6cc931b
--- /dev/null
+++ b/docs/site/run_issues.md
@@ -0,0 +1,6 @@
+Error: Could not find or load main class org.apache.sysds.api.DMLScript 
+
+Solution for macOS: Install `realpath` (provided by the GNU `coreutils` package) with Homebrew:
+```bash
+brew install coreutils 
+```
diff --git a/pom.xml b/pom.xml
index 063e532..a5e5f92 100644
--- a/pom.xml
+++ b/pom.xml
@@ -176,8 +176,8 @@
 					</execution>
 				</executions>
 				<configuration>
-					<!-- Include signature files so that recent versions of Java will run 
-						the resulting jar without complaining about "Invalid signature file digest 
+					<!-- Include signature files so that recent versions of Java will run
+						the resulting jar without complaining about "Invalid signature file digest
 						for Manifest main attributes".
 						Furthermore, the excluded notice and license files will be explicitly
 						added by the resource transformers above -->
@@ -250,7 +250,7 @@
 				</executions>
 			</plugin>
 
-			
+
 			<plugin> <!-- unit tests -->
 				<groupId>org.apache.maven.plugins</groupId>
 				<artifactId>maven-surefire-plugin</artifactId>
@@ -521,6 +521,7 @@
 								<exclude>src/main/python/docs/build/**/*</exclude>
 								<exclude>docs/api/**/*</exclude>
 								<exclude>docs/_site/**/*</exclude>
+								<exclude>docs/site/run_issues.md</exclude>
 								<exclude>docs/.jekyll-cache/**/*</exclude>
 								<exclude>docs/css/bootstrap.min.css</exclude>
 								<exclude>docs/css/pygments-default.css</exclude>
@@ -573,7 +574,7 @@
 		</profile>
 
 		<profile>
-			<!-- Profile to create binary distributions. Execute with `mvn clean package 
+			<!-- Profile to create binary distributions. Execute with `mvn clean package
 				-P distribution` -->
 			<id>distribution</id>
 			<build>
@@ -710,7 +711,7 @@
 			</build>
 		</profile>
 	</profiles>
-	
+
 	<dependencies>
 		<dependency>
 			<groupId>org.jcuda</groupId>
@@ -970,7 +971,7 @@
 			<version>0.10</version>
 			<scope>test</scope>
 		</dependency>
-		
+
 		<dependency>
 			<!--Used for annotations in tests to execute tests in thread safe manner-->
 			<groupId>com.github.stephenc.jcip</groupId>
diff --git a/scripts/builtin/cor.dml b/scripts/builtin/cor.dml
new file mode 100644
index 0000000..ea7cf53
--- /dev/null
+++ b/scripts/builtin/cor.dml
@@ -0,0 +1,25 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+
+m_cor = function(Matrix[Double] X) return (Matrix[Double] Y) {
+  # compute the correlation matrix Y between the columns of X in vectorized form (Y has ncol(X) rows and columns)
+  Xc = X - colMeans(X); # center every column by subtracting its column mean
+  Y = ((t(Xc) %*% Xc)/(nrow(X)-1)) / (t(colSds(X)) %*% colSds(X)); # cross-product of centered columns over (n-1), divided cell-wise by the outer product of the colSds(X) vector with itself
+}
diff --git a/src/main/java/org/apache/sysds/common/Builtins.java b/src/main/java/org/apache/sysds/common/Builtins.java
index 53fc39e..b6733d2 100644
--- a/src/main/java/org/apache/sysds/common/Builtins.java
+++ b/src/main/java/org/apache/sysds/common/Builtins.java
@@ -31,7 +31,7 @@ import org.apache.sysds.common.Types.ReturnType;
  * case of DML script, these functions are loaded during parsing. As
  * always, user-defined DML-bodied functions take precedence over all
  * builtin functions.
- * 
+ *
  * To add a new builtin script function, simply add the definition here
  * as well as a dml file in scripts/builtin with a matching name. On 
  * building SystemDS, these scripts are packaged into the jar as well.
@@ -82,6 +82,7 @@ public enum Builtins {
 	CUMSUM("cumsum", false),
 	CUMSUMPROD("cumsumprod", false),
 	CONFUSIONMATRIX("confusionMatrix", true),
+	COR("cor", true),
 	DETECTSCHEMA("detectSchema", false),
 	DIAG("diag", false),
 	DISCOVER_FD("discoverFD", true),
@@ -186,7 +187,7 @@ public enum Builtins {
 	VAR("var", false),
 	XOR("xor", false),
 	WINSORIZE("winsorize", true, false), //TODO parameterize w/ prob, min/max val
-	
+
 	//parameterized builtin functions
 	CDF("cdf", false, true),
 	GROUPEDAGG("aggregate", "groupedAggregate", false, true),
@@ -217,27 +218,27 @@ public enum Builtins {
 	TRANSFORMENCODE("transformencode", false, true),
 	TRANSFORMMETA("transformmeta", false, true),
 	UPPER_TRI("upper.tri", false, true);
-	
+
 	Builtins(String name, boolean script) {
 		this(name, null, script, false, ReturnType.SINGLE_RETURN);
 	}
-	
+
 	Builtins(String name, boolean script, ReturnType retType) {
 		this(name, null, script, false, retType);
 	}
-	
+
 	Builtins(String name, boolean script, boolean parameterized) {
 		this(name, null, script, parameterized, ReturnType.SINGLE_RETURN);
 	}
-	
+
 	Builtins(String name, String alias, boolean script) {
 		this(name, alias, script, false, ReturnType.SINGLE_RETURN);
 	}
-	
+
 	Builtins(String name, String alias, boolean script, boolean parameterized) {
 		this(name, alias, script, parameterized, ReturnType.SINGLE_RETURN);
 	}
-	
+
 	Builtins(String name, String alias, boolean script, boolean parameterized, ReturnType retType) {
 		_name = name;
 		_alias = alias;
@@ -245,10 +246,10 @@ public enum Builtins {
 		_parameterized = parameterized;
 		_retType = retType;
 	}
-	
+
 	private final static String BUILTIN_DIR = "scripts/builtin/";
 	private final static HashMap<String, Builtins> _map = new HashMap<>();
-	
+
 	static {
 		//materialize lookup map for all builtin names
 		for( Builtins b : EnumSet.allOf(Builtins.class) ) {
@@ -257,52 +258,52 @@ public enum Builtins {
 				_map.put(b.getAlias(), b);
 		}
 	}
-	
+
 	private final String _name;
 	private final String _alias;
 	private final boolean _script;
 	private final boolean _parameterized;
 	private final ReturnType _retType;
-	
+
 	public String getName() {
 		return _name;
 	}
-	
+
 	public String getAlias() {
 		return _alias;
 	}
-	
+
 	public boolean isScript() {
 		return _script;
 	}
-	
+
 	public boolean isParameterized() {
 		return _parameterized;
 	}
-	
+
 	public boolean isMultiReturn() {
 		return _retType == ReturnType.MULTI_RETURN;
 	}
-	
+
 	public static boolean contains(String name, boolean script, boolean parameterized) {
 		Builtins tmp = get(name);
 		return tmp != null && script == tmp.isScript()
 			&& parameterized == tmp.isParameterized();
 	}
-	
+
 	public static Builtins get(String name) {
 		if( name.equals("list") )
 			return LIST; //unparameterized
 		return _map.get(name);
 	}
-	
+
 	public static Builtins get(String name, boolean params) {
 		if( name.equals("list") )
 			return params ? LISTNV : LIST;
 		Builtins tmp = get(name);
 		return tmp != null && (params == tmp.isParameterized()) ? tmp : null;
 	}
-	
+
 	public static String getFilePath(String name) {
 		StringBuilder sb = new StringBuilder();
 		sb.append(BUILTIN_DIR);
@@ -310,7 +311,7 @@ public enum Builtins {
 		sb.append(".dml");
 		return sb.toString();
 	}
-	
+
 	public static String getInternalFName(String name, DataType dt) {
 		return (dt.isMatrix() ? "m_" : "s_") + name;
 	}
diff --git a/src/test/java/org/apache/sysds/test/functions/builtin/BuiltinCorrelationMatrixTest.java b/src/test/java/org/apache/sysds/test/functions/builtin/BuiltinCorrelationMatrixTest.java
new file mode 100644
index 0000000..26e3b02
--- /dev/null
+++ b/src/test/java/org/apache/sysds/test/functions/builtin/BuiltinCorrelationMatrixTest.java
@@ -0,0 +1,87 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.sysds.test.functions.builtin;
+
+import org.apache.sysds.common.Types.ExecMode;
+import org.apache.sysds.lops.LopProperties.ExecType;
+import org.apache.sysds.runtime.matrix.data.MatrixValue.CellIndex;
+import org.apache.sysds.test.AutomatedTestBase;
+import org.apache.sysds.test.TestConfiguration;
+import org.apache.sysds.test.TestUtils;
+import org.junit.Test;
+
+import java.util.HashMap;
+
+public class BuiltinCorrelationMatrixTest extends AutomatedTestBase //runs correlationMatrix.dml and correlationMatrix.R on the same input and compares their outputs
+{
+	private final static String TEST_NAME = "correlationMatrix"; //base name of the .dml and .R test scripts
+	private final static String TEST_DIR = "functions/builtin/";
+	private static final String TEST_CLASS_DIR = TEST_DIR + BuiltinCorrelationMatrixTest.class.getSimpleName() + "/";
+	
+	private final static double eps = 1e-3; //tolerance passed to TestUtils.compareMatrices
+	private final static int rows = 1765; //number of rows of the generated input matrix
+	private final static double spDense = 0.99; //presumably the sparsity of the generated input - TODO confirm against getRandomMatrix
+	
+	@Override
+	public void setUp() {
+		addTestConfiguration(TEST_NAME,new TestConfiguration(TEST_CLASS_DIR, TEST_NAME,new String[]{"B"})); //"B" is the result name read back in runCorrelationMatrix
+	}
+
+	@Test
+	public void testCorrelationMatrixDefaultCP() {
+		runCorrelationMatrix(true, ExecType.CP);
+	}
+	
+	@Test
+	public void testCorrelationMatrixDefaultSP() {
+		runCorrelationMatrix(true, ExecType.SPARK);
+	}
+
+	private void runCorrelationMatrix(boolean defaultProb, ExecType instType) //NOTE(review): defaultProb is never read in this method
+	{
+		ExecMode platformOld = setExecMode(instType); //keep the returned mode so it can be restored in finally
+		
+		try
+		{
+			loadTestConfiguration(getTestConfiguration(TEST_NAME));
+			
+			String HOME = SCRIPT_DIR + TEST_DIR;
+			fullDMLScriptName = HOME + TEST_NAME + ".dml";
+			programArgs = new String[]{"-args", input("A"), output("B") }; //$1 = input A, $2 = output B in the DML script
+			fullRScriptName = HOME + TEST_NAME + ".R";
+			rCmd = "Rscript" + " " + fullRScriptName + " " + inputDir() + " " + expectedDir(); //R script receives the input and expected-output directories
+			
+			//generate the input: rows x 10 random matrix (-1, 1 presumably the value range and 7 the seed - TODO confirm)
+			double[][] A = getRandomMatrix(rows, 10, -1, 1, spDense, 7);
+			writeInputMatrixWithMTD("A", A, true);
+			
+			runTest(true, false, null, -1); //execute the DML script
+			runRScript(true); //execute the R reference script
+			
+			//compare the DML result "B" with the R result "B" cell by cell within eps
+			HashMap<CellIndex, Double> dmlfile = readDMLMatrixFromHDFS("B");
+			HashMap<CellIndex, Double> rfile  = readRMatrixFromFS("B");
+			TestUtils.compareMatrices(dmlfile, rfile, eps, "Stat-DML", "Stat-R");
+		}
+		finally {
+			rtplatform = platformOld; //restore the execution mode captured before the test
+		}
+	}
+}
diff --git a/src/test/scripts/functions/builtin/correlationMatrix.R b/src/test/scripts/functions/builtin/correlationMatrix.R
new file mode 100644
index 0000000..57bc080
--- /dev/null
+++ b/src/test/scripts/functions/builtin/correlationMatrix.R
@@ -0,0 +1,27 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+args<-commandArgs(TRUE) # args[1]: input directory prefix, args[2]: output directory prefix
+options(digits=22)
+library("Matrix") # provides readMM / writeMM used below
+
+X = as.matrix(readMM(paste(args[1], "A.mtx", sep=""))) # read input file "A.mtx" and convert it with as.matrix
+R = cor(X); # reference result: R's own correlation matrix of X
+writeMM(as(R, "CsparseMatrix"), paste(args[2], "B", sep="")); # write the reference result as file "B"
\ No newline at end of file
diff --git a/src/test/scripts/functions/builtin/correlationMatrix.dml b/src/test/scripts/functions/builtin/correlationMatrix.dml
new file mode 100644
index 0000000..d3a9b48
--- /dev/null
+++ b/src/test/scripts/functions/builtin/correlationMatrix.dml
@@ -0,0 +1,24 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+X = read($1);  # first script argument: location of the input matrix
+Y = cor(X);    # call the cor builtin under test
+write(Y, $2);  # second script argument: location to write the result to