You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@systemds.apache.org by mb...@apache.org on 2020/07/28 12:07:00 UTC
[systemds] branch master updated: [SYSTEMDS-2592] New built-in
function cor (correlation matrix)
This is an automated email from the ASF dual-hosted git repository.
mboehm7 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/systemds.git
The following commit(s) were added to refs/heads/master by this push:
new c14430a [SYSTEMDS-2592] New built-in function cor (correlation matrix)
c14430a is described below
commit c14430a1895855422c4ced0b81089eb16eed2d00
Author: Olga Ovcharenko <ol...@student.tugraz.at>
AuthorDate: Tue Jul 28 14:06:05 2020 +0200
[SYSTEMDS-2592] New built-in function cor (correlation matrix)
Closes #1002.
---
docs/site/run_issues.md | 6 ++
pom.xml | 13 ++--
scripts/builtin/cor.dml | 25 +++++++
.../java/org/apache/sysds/common/Builtins.java | 43 +++++------
.../builtin/BuiltinCorrelationMatrixTest.java | 87 ++++++++++++++++++++++
.../scripts/functions/builtin/correlationMatrix.R | 27 +++++++
.../functions/builtin/correlationMatrix.dml | 24 ++++++
7 files changed, 198 insertions(+), 27 deletions(-)
diff --git a/docs/site/run_issues.md b/docs/site/run_issues.md
new file mode 100644
index 0000000..6cc931b
--- /dev/null
+++ b/docs/site/run_issues.md
@@ -0,0 +1,6 @@
+Error: Could not find or load main class org.apache.sysds.api.DMLScript
+
+Solution for macOS: Install `realpath` (part of GNU coreutils) with Homebrew
+```bash
+brew install coreutils
+```
diff --git a/pom.xml b/pom.xml
index 063e532..a5e5f92 100644
--- a/pom.xml
+++ b/pom.xml
@@ -176,8 +176,8 @@
</execution>
</executions>
<configuration>
- <!-- Include signature files so that recent versions of Java will run
- the resulting jar without complaining about "Invalid signature file digest
+ <!-- Include signature files so that recent versions of Java will run
+ the resulting jar without complaining about "Invalid signature file digest
for Manifest main attributes".
Furthermore, the excluded notice and license files will be explicitly
added by the resource transformers above -->
@@ -250,7 +250,7 @@
</executions>
</plugin>
-
+
<plugin> <!-- unit tests -->
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-surefire-plugin</artifactId>
@@ -521,6 +521,7 @@
<exclude>src/main/python/docs/build/**/*</exclude>
<exclude>docs/api/**/*</exclude>
<exclude>docs/_site/**/*</exclude>
+ <exclude>docs/site/run_issues.md</exclude>
<exclude>docs/.jekyll-cache/**/*</exclude>
<exclude>docs/css/bootstrap.min.css</exclude>
<exclude>docs/css/pygments-default.css</exclude>
@@ -573,7 +574,7 @@
</profile>
<profile>
- <!-- Profile to create binary distributions. Execute with `mvn clean package
+ <!-- Profile to create binary distributions. Execute with `mvn clean package
-P distribution` -->
<id>distribution</id>
<build>
@@ -710,7 +711,7 @@
</build>
</profile>
</profiles>
-
+
<dependencies>
<dependency>
<groupId>org.jcuda</groupId>
@@ -970,7 +971,7 @@
<version>0.10</version>
<scope>test</scope>
</dependency>
-
+
<dependency>
<!--Used for annotations in tests to execute tests in thread safe manner-->
<groupId>com.github.stephenc.jcip</groupId>
diff --git a/scripts/builtin/cor.dml b/scripts/builtin/cor.dml
new file mode 100644
index 0000000..ea7cf53
--- /dev/null
+++ b/scripts/builtin/cor.dml
@@ -0,0 +1,25 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+
+m_cor = function(Matrix[Double] X) return (Matrix[Double] Y) {
+ # compute the correlation matrix of the columns of X in vectorized form: covariance matrix divided cell-wise by the outer product of the column standard deviations
+ Xc = X - colMeans(X); # center every column by subtracting its column mean
+ Y = ((t(Xc) %*% Xc)/(nrow(X)-1)) / (t(colSds(X)) %*% colSds(X)); # covariance with (n-1) denominator over sd_i * sd_j; NOTE(review): a constant column has sd 0 and divides by zero here - confirm intended handling
+}
diff --git a/src/main/java/org/apache/sysds/common/Builtins.java b/src/main/java/org/apache/sysds/common/Builtins.java
index 53fc39e..b6733d2 100644
--- a/src/main/java/org/apache/sysds/common/Builtins.java
+++ b/src/main/java/org/apache/sysds/common/Builtins.java
@@ -31,7 +31,7 @@ import org.apache.sysds.common.Types.ReturnType;
* case of DML script, these functions are loaded during parsing. As
* always, user-defined DML-bodied functions take precedence over all
* builtin functions.
- *
+ *
* To add a new builtin script function, simply add the definition here
* as well as a dml file in scripts/builtin with a matching name. On
* building SystemDS, these scripts are packaged into the jar as well.
@@ -82,6 +82,7 @@ public enum Builtins {
CUMSUM("cumsum", false),
CUMSUMPROD("cumsumprod", false),
CONFUSIONMATRIX("confusionMatrix", true),
+ COR("cor", true),
DETECTSCHEMA("detectSchema", false),
DIAG("diag", false),
DISCOVER_FD("discoverFD", true),
@@ -186,7 +187,7 @@ public enum Builtins {
VAR("var", false),
XOR("xor", false),
WINSORIZE("winsorize", true, false), //TODO parameterize w/ prob, min/max val
-
+
//parameterized builtin functions
CDF("cdf", false, true),
GROUPEDAGG("aggregate", "groupedAggregate", false, true),
@@ -217,27 +218,27 @@ public enum Builtins {
TRANSFORMENCODE("transformencode", false, true),
TRANSFORMMETA("transformmeta", false, true),
UPPER_TRI("upper.tri", false, true);
-
+
Builtins(String name, boolean script) {
this(name, null, script, false, ReturnType.SINGLE_RETURN); // no alias, not parameterized, single return
}
-
+
Builtins(String name, boolean script, ReturnType retType) {
this(name, null, script, false, retType); // no alias, not parameterized, caller-chosen return type
}
-
+
Builtins(String name, boolean script, boolean parameterized) {
this(name, null, script, parameterized, ReturnType.SINGLE_RETURN); // no alias, single return
}
-
+
Builtins(String name, String alias, boolean script) {
this(name, alias, script, false, ReturnType.SINGLE_RETURN); // with alias, not parameterized, single return
}
-
+
Builtins(String name, String alias, boolean script, boolean parameterized) {
this(name, alias, script, parameterized, ReturnType.SINGLE_RETURN); // with alias, single return
}
-
+
Builtins(String name, String alias, boolean script, boolean parameterized, ReturnType retType) {
_name = name;
_alias = alias;
@@ -245,10 +246,10 @@ public enum Builtins {
_parameterized = parameterized;
_retType = retType;
}
-
+
private final static String BUILTIN_DIR = "scripts/builtin/";
private final static HashMap<String, Builtins> _map = new HashMap<>();
-
+
static {
//materialize lookup map for all builtin names
for( Builtins b : EnumSet.allOf(Builtins.class) ) {
@@ -257,52 +258,52 @@ public enum Builtins {
_map.put(b.getAlias(), b);
}
}
-
+
private final String _name;
private final String _alias;
private final boolean _script;
private final boolean _parameterized;
private final ReturnType _retType;
-
+
public String getName() {
return _name; // primary name as passed to the constructor
}
-
+
public String getAlias() {
return _alias; // null for builtins declared without an alias
}
-
+
public boolean isScript() {
return _script; // presumably the constructor's script flag (the assignment is not visible in this hunk)
}
-
+
public boolean isParameterized() {
return _parameterized; // parameterized flag as passed to the constructor
}
-
+
public boolean isMultiReturn() {
return _retType == ReturnType.MULTI_RETURN; // true only when declared with ReturnType.MULTI_RETURN
}
-
+
public static boolean contains(String name, boolean script, boolean parameterized) { // true if `name` resolves to a builtin whose script and parameterized flags both equal the given values
Builtins tmp = get(name); // null when nothing is registered under this name
return tmp != null && script == tmp.isScript()
&& parameterized == tmp.isParameterized();
}
-
+
public static Builtins get(String name) { // resolves a name via the static lookup map; returns null if the map has no entry
if( name.equals("list") ) // "list" is answered directly instead of going through the map
return LIST; // the unparameterized variant
return _map.get(name);
}
-
+
public static Builtins get(String name, boolean params) { // like get(name), but the result must have the requested parameterized flag
if( name.equals("list") )
return params ? LISTNV : LIST; // "list" maps to LISTNV when parameterized, LIST otherwise
Builtins tmp = get(name);
return tmp != null && (params == tmp.isParameterized()) ? tmp : null; // null if unknown or if the parameterized flag differs
}
-
+
public static String getFilePath(String name) {
StringBuilder sb = new StringBuilder();
sb.append(BUILTIN_DIR);
@@ -310,7 +311,7 @@ public enum Builtins {
sb.append(".dml");
return sb.toString();
}
-
+
public static String getInternalFName(String name, DataType dt) { // prefixes `name` depending on the data type, e.g. "cor" with a matrix type gives "m_cor"
return (dt.isMatrix() ? "m_" : "s_") + name; // "m_" when dt.isMatrix() is true, "s_" otherwise
}
diff --git a/src/test/java/org/apache/sysds/test/functions/builtin/BuiltinCorrelationMatrixTest.java b/src/test/java/org/apache/sysds/test/functions/builtin/BuiltinCorrelationMatrixTest.java
new file mode 100644
index 0000000..26e3b02
--- /dev/null
+++ b/src/test/java/org/apache/sysds/test/functions/builtin/BuiltinCorrelationMatrixTest.java
@@ -0,0 +1,87 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.sysds.test.functions.builtin;
+
+import org.apache.sysds.common.Types.ExecMode;
+import org.apache.sysds.lops.LopProperties.ExecType;
+import org.apache.sysds.runtime.matrix.data.MatrixValue.CellIndex;
+import org.apache.sysds.test.AutomatedTestBase;
+import org.apache.sysds.test.TestConfiguration;
+import org.apache.sysds.test.TestUtils;
+import org.junit.Test;
+
+import java.util.HashMap;
+
+public class BuiltinCorrelationMatrixTest extends AutomatedTestBase // compares the output of the DML test script (which calls cor) with the R reference script on a generated matrix
+{
+ private final static String TEST_NAME = "correlationMatrix"; // base name of the .dml and .R test scripts
+ private final static String TEST_DIR = "functions/builtin/"; // script directory relative to SCRIPT_DIR
+ private static final String TEST_CLASS_DIR = TEST_DIR + BuiltinCorrelationMatrixTest.class.getSimpleName() + "/";
+
+ private final static double eps = 1e-3; // tolerance passed to TestUtils.compareMatrices
+ private final static int rows = 1765; // row count of the generated input matrix
+ private final static double spDense = 0.99; // passed to getRandomMatrix; presumably the fraction of non-zero cells - confirm
+
+ @Override
+ public void setUp() {
+ addTestConfiguration(TEST_NAME,new TestConfiguration(TEST_CLASS_DIR, TEST_NAME,new String[]{"B"})); // "B" is the output name that is read back and compared below
+ }
+
+ @Test
+ public void testCorrelationMatrixDefaultCP() {
+ runCorrelationMatrix(true, ExecType.CP);
+ }
+
+ @Test
+ public void testCorrelationMatrixDefaultSP() {
+ runCorrelationMatrix(true, ExecType.SPARK);
+ }
+
+ private void runCorrelationMatrix(boolean defaultProb, ExecType instType) // NOTE(review): defaultProb is never read in this method
+ {
+ ExecMode platformOld = setExecMode(instType); // keep the previous mode so the finally block can restore it
+
+ try
+ {
+ loadTestConfiguration(getTestConfiguration(TEST_NAME));
+
+ String HOME = SCRIPT_DIR + TEST_DIR;
+ fullDMLScriptName = HOME + TEST_NAME + ".dml";
+ programArgs = new String[]{"-args", input("A"), output("B") }; // the DML script reads $1 (A) and writes $2 (B)
+ fullRScriptName = HOME + TEST_NAME + ".R";
+ rCmd = "Rscript" + " " + fullRScriptName + " " + inputDir() + " " + expectedDir(); // the R script receives the input dir and the expected-output dir
+
+ //generate actual dataset
+ double[][] A = getRandomMatrix(rows, 10, -1, 1, spDense, 7); // rows x 10 matrix; -1/1 look like the value range and 7 like the seed - confirm
+ writeInputMatrixWithMTD("A", A, true);
+
+ runTest(true, false, null, -1);
+ runRScript(true);
+
+ //compare matrices
+ HashMap<CellIndex, Double> dmlfile = readDMLMatrixFromHDFS("B");
+ HashMap<CellIndex, Double> rfile = readRMatrixFromFS("B");
+ TestUtils.compareMatrices(dmlfile, rfile, eps, "Stat-DML", "Stat-R"); // DML result vs. R result within eps
+ }
+ finally {
+ rtplatform = platformOld; // restore the mode saved before the test ran
+ }
+ }
+}
diff --git a/src/test/scripts/functions/builtin/correlationMatrix.R b/src/test/scripts/functions/builtin/correlationMatrix.R
new file mode 100644
index 0000000..57bc080
--- /dev/null
+++ b/src/test/scripts/functions/builtin/correlationMatrix.R
@@ -0,0 +1,27 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+args<-commandArgs(TRUE) # args[1]: prefix for the input file, args[2]: prefix for the output file
+options(digits=22)
+library("Matrix") # provides readMM / writeMM
+
+X = as.matrix(readMM(paste(args[1], "A.mtx", sep=""))) # load A.mtx and convert it to a dense matrix
+R = cor(X); # reference correlation matrix computed by R
+writeMM(as(R, "CsparseMatrix"), paste(args[2], "B", sep="")); # store the reference result as B
\ No newline at end of file
diff --git a/src/test/scripts/functions/builtin/correlationMatrix.dml b/src/test/scripts/functions/builtin/correlationMatrix.dml
new file mode 100644
index 0000000..d3a9b48
--- /dev/null
+++ b/src/test/scripts/functions/builtin/correlationMatrix.dml
@@ -0,0 +1,24 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+X = read($1); # first script argument: location of the input matrix
+Y = cor(X); # call the builtin function under test
+write(Y, $2); # second script argument: location of the result