You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@systemds.apache.org by mb...@apache.org on 2020/10/10 17:43:36 UTC
[systemds] branch master updated: [SYSTEMDS-2682/3] New Lasso and
PPCA built-in functions
This is an automated email from the ASF dual-hosted git repository.
mboehm7 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/systemds.git
The following commit(s) were added to refs/heads/master by this push:
new e929457 [SYSTEMDS-2682/3] New Lasso and PPCA built-in functions
e929457 is described below
commit e929457e2f1850e6f1c2b5c490523f9526e51be5
Author: Sven Celin <sv...@gmail.com>
AuthorDate: Sat Oct 10 19:38:10 2020 +0200
[SYSTEMDS-2682/3] New Lasso and PPCA built-in functions
AMLS project SS2020.
Closes #1071.
---
scripts/builtin/lasso.dml | 126 +++++++++++++++++
scripts/builtin/ppca.dml | 149 +++++++++++++++++++++
.../java/org/apache/sysds/common/Builtins.java | 2 +
.../test/functions/builtin/BuiltinLassoTest.java | 70 ++++++++++
.../test/functions/builtin/BuiltinPPCATest.java | 74 ++++++++++
src/test/scripts/functions/builtin/lasso.dml | 25 ++++
src/test/scripts/functions/builtin/ppca.dml | 26 ++++
7 files changed, 472 insertions(+)
diff --git a/scripts/builtin/lasso.dml b/scripts/builtin/lasso.dml
new file mode 100644
index 0000000..7b6b464
--- /dev/null
+++ b/scripts/builtin/lasso.dml
@@ -0,0 +1,126 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+# Builtin function for the SpaRSA algorithm to perform lasso regression
+# (SpaRSA .. Sparse Reconstruction by Separable Approximation)
+#
+# INPUTS:
+# ---------------------------------------------------------------------------------------------
+# NAME TYPE DEFAULT MEANING
+# ---------------------------------------------------------------------------------------------
+# X Double --- input feature matrix
+# y Double --- matrix Y columns of the design matrix
+# tol Double 1e-15 target convergence tolerance
+# M Integer 5 history length
+# tau Double 1 regularization component
+# maxi Integer 100 maximum number of iterations until convergence
+# ---------------------------------------------------------------------------------------------
+# OUTPUTS
+# w Double --- model matrix
+
+
+m_lasso = function(Matrix[Double] X, Matrix[Double] y, Double tol = 10^(-15),
+ Integer M = 5, Double tau = 1, Integer maxi = 100, Boolean verbose = TRUE)
+ return(Matrix[Double] w)
+{
+ n = nrow(X)
+ m = ncol(X)
+
+ #constants
+ eta = 2
+ sigma = 0.01
+ alpha_min = 10^(-30)
+ alpha_max = 10^(30)
+
+ #init
+ alpha = 1
+ w = Rand(rows=m, cols=1, min=0, max=1, pdf="uniform")
+ history = -1e30 * matrix(1, rows=M, cols=1)
+
+ r = X %*% w - y
+ g = t(X) %*% r
+ obj = 0.5 * sum(r*r) + tau*sum(abs(w))
+
+ if( verbose )
+ print("Initial OBJ=" + obj)
+
+ history[M,1] = obj
+
+ inactive_set = matrix(1, rows=m, cols=1)
+ iter = 0
+ continue = TRUE
+ while(iter < maxiter & continue) {
+ dw = matrix(0, rows=m, cols=1)
+ dg = matrix(0, rows=m, cols=1)
+ relChangeObj = -1.0
+
+ inner_iter = 0
+ inner_continue = TRUE
+ inner_maxiter = 100
+ while(inner_iter < inner_maxiter & inner_continue) {
+ u = w - g/alpha
+ lambda = tau/alpha
+
+ wnew = sign(u) * (abs(u) - lambda) * ((abs(u) - lambda) > 0)
+ dw = wnew - w
+ dw2 = sum(dw*dw)
+
+ r = X %*% wnew - y
+ gnew = t(X) %*% r
+ objnew = 0.5 * sum(r*r) + tau*sum(abs(wnew))
+ obj_threshold = max(history) - 0.5*sigma*alpha*dw2
+
+ if(objnew <= obj_threshold) {
+ w = wnew
+ dg = gnew - g
+ g = gnew
+ inner_continue = FALSE
+
+ history[1:(M-1),] = history[2:M,]
+ history[M,1] = objnew
+ relChangeObj = abs(objnew - obj)/obj
+ obj = objnew
+ }
+ else
+ alpha = eta*alpha
+
+ inner_iter = inner_iter + 1
+ }
+
+ if(inner_continue)
+ print("Inner loop did not converge")
+
+ alphanew = sum(dw*dg)/sum(dw*dw)
+ alpha = max(alpha_min, min(alpha_max, alphanew))
+
+ old_inactive_set = inactive_set
+ inactive_set = w != 0
+ diff = sum(abs(old_inactive_set - inactive_set))
+
+ if(diff == 0 & relChangeObj < tol)
+ continue = FALSE
+
+ num_inactive = sum(w != 0)
+ if( verbose )
+ print("ITER=" + iter + " OBJ=" + obj + " relative change=" + relChangeObj + " num_inactive=" + num_inactive)
+ iter = iter + 1
+ }
+}
diff --git a/scripts/builtin/ppca.dml b/scripts/builtin/ppca.dml
new file mode 100644
index 0000000..2683f90
--- /dev/null
+++ b/scripts/builtin/ppca.dml
@@ -0,0 +1,149 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+# This script performs Probabilistic Principal Component Analysis (PCA) on the given input data.
+# It is based on paper: sPCA: Scalable Principal Component Analysis for Big Data on Distributed
+# Platforms. Tarek Elgamal et.al.
+
+# INPUT PARAMETERS:
+# ---------------------------------------------------------------------------------------------
+# NAME TYPE DEFAULT MEANING
+# ---------------------------------------------------------------------------------------------
+# X Matrix --- n x m input feature matrix
+# k Integer --- indicates dimension of the new vector space constructed from eigen vectors
+# maxi Integer --- maximum number of iterations until convergence
+# tolobj Double 0.00001 objective function tolerance value to stop ppca algorithm
+# tolrecerr Double 0.02 reconstruction error tolerance value to stop the algorithm
+# verbose Boolen TRUE verbose debug output
+# ---------------------------------------------------------------------------------------------
+# OUTPUT PARAMETERS:
+# ---------------------------------------------------------------------------------------------
+# NAME TYPE DEFAULT MEANING
+# ---------------------------------------------------------------------------------------------
+# Xout Matrix --- Output feature matrix with K columns
+# Mout Matrix --- Output dominant eigen vectors (can be used for projections)
+
+
+m_ppca = function(Matrix[Double] X, Integer K=2, Integer maxi = 10,
+ Double tolobj = 0.00001, Double tolrecerr = 0.02, Boolean verbose = TRUE)
+ return(Matrix[Double] Xout, Matrix[Double] Mout)
+{
+ n = nrow(X);
+ m = ncol(X);
+
+ #initializing principal components matrix
+ C = rand(rows=m, cols=K, pdf="normal");
+ ss = rand(rows=1, cols=1, pdf="normal");
+ ss = as.scalar(ss);
+ ssPrev = ss;
+
+ # best selected principle components - with the lowest reconstruction error
+ PC = C;
+
+ # initilizing reconstruction error
+ RE = tolrecerr+1;
+ REBest = RE;
+
+ Z = matrix(0,rows=1,cols=1);
+
+ #Objective function value
+ ObjRelChng = tolobj+1;
+
+ # mean centered input matrix - dim -> [n,m]
+ Xm = X - colMeans(X);
+
+ #I -> k x k
+ ITMP = matrix(1,rows=K,cols=1);
+ I = diag(ITMP);
+
+ i = 0;
+ while (i < maxi & ObjRelChng > tolobj & RE > tolrecerr){
+ #Estimation step - Covariance matrix
+ #M -> k x k
+ M = t(C) %*% C + I*ss;
+
+ #Auxilary matrix with n latent variables
+ # Z -> n x k
+ Z = Xm %*% (C %*% inv(M));
+
+ #ZtZ -> k x k
+ ZtZ = t(Z) %*% Z + inv(M)*ss;
+
+ #XtZ -> m x k
+ XtZ = t(Xm) %*% Z;
+
+ #Maximization step
+ #C -> m x k
+ ZtZ_sum = sum(ZtZ); #+n*inv(M));
+ C = XtZ/ZtZ_sum;
+
+ #ss2 -> 1 x 1
+ ss2 = trace(ZtZ * (t(C) %*% C));
+
+ #ss3 -> 1 x 1
+ ss3 = sum((Z %*% t(C)) %*% t(Xm));
+
+ #Frobenius norm of reconstruction error -> Euclidean norm
+ #Fn -> 1 x 1
+ Fn = sum(Xm*Xm);
+
+ #ss -> 1 x 1
+ ss = (Fn + ss2 - 2*ss3)/(n*m);
+
+ #calculating objective function relative change
+ ObjRelChng = abs(1 - ss/ssPrev);
+ #print("Objective Relative Change: " + ObjRelChng + ", Objective: " + ss);
+
+ #Reconstruction error
+ R = ((Z %*% t(C)) - Xm);
+
+ #calculate the error
+ #TODO rethink calculation of reconstruction error ....
+ #1-Norm of reconstruction error - a big dense matrix
+ #RE -> n x m
+ RE = abs(sum(R)/sum(Xm));
+ if (RE < REBest){
+ PC = C;
+ REBest = RE;
+ }
+ #print("ss: " + ss +" = Fn( "+ Fn +" ) + ss2( " + ss2 +" ) - 2*ss3( " + ss3 + " ), Reconstruction Error: " + RE);
+
+ ssPrev = ss;
+ i = i+1;
+ }
+ if( verbose )
+ print("Objective Relative Change: " + ObjRelChng);
+ if( verbose )
+ print ("Number of iterations: " + i + ", Reconstruction Err: " + REBest);
+
+ # reconstructs data
+ # RD -> n x k
+ Xout = X %*% PC;
+
+ # calculate eigenvalues - principle component variance
+ RDMean = colMeans(Xout);
+ V = t(colMeans(Xout^2) - (RDMean^2));
+
+ # sorting eigenvalues and eigenvectors in decreasing order
+ V_decr_idx = order(target=V,by=1,decreasing=TRUE,index.return=TRUE);
+ VF_decr = table(seq(1,nrow(V)),V_decr_idx);
+ Mout = PC %*% VF_decr; # vectors (values via VF_decr %*% V)
+}
diff --git a/src/main/java/org/apache/sysds/common/Builtins.java b/src/main/java/org/apache/sysds/common/Builtins.java
index 92f6e48..157a411 100644
--- a/src/main/java/org/apache/sysds/common/Builtins.java
+++ b/src/main/java/org/apache/sysds/common/Builtins.java
@@ -120,6 +120,7 @@ public enum Builtins {
ISINF("is.infinite", false),
KMEANS("kmeans", true),
L2SVM("l2svm", true),
+ LASSO("lasso", true),
LENGTH("length", false),
LINEAGE("lineage", false),
LIST("list", false), //note: builtin and parbuiltin
@@ -155,6 +156,7 @@ public enum Builtins {
OUTLIER_IQR("outlierByIQR", true),
PCA("pca", true),
PNMF("pnmf", true),
+ PPCA("ppca", true),
PPRED("ppred", false),
PROD("prod", false),
QR("qr", false, ReturnType.MULTI_RETURN),
diff --git a/src/test/java/org/apache/sysds/test/functions/builtin/BuiltinLassoTest.java b/src/test/java/org/apache/sysds/test/functions/builtin/BuiltinLassoTest.java
new file mode 100644
index 0000000..db94b04
--- /dev/null
+++ b/src/test/java/org/apache/sysds/test/functions/builtin/BuiltinLassoTest.java
@@ -0,0 +1,70 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.sysds.test.functions.builtin;
+
+import org.apache.sysds.runtime.meta.MatrixCharacteristics;
+import org.apache.sysds.test.AutomatedTestBase;
+import org.apache.sysds.test.TestConfiguration;
+import org.junit.Assert;
+import org.junit.Test;
+
+import java.util.ArrayList;
+import java.util.List;
+
+public class BuiltinLassoTest extends AutomatedTestBase {
+ private final static String TEST_NAME = "lasso";
+ private final static String TEST_DIR = "functions/builtin/";
+ private final static String TEST_CLASS_DIR = TEST_DIR + BuiltinLassoTest.class.getSimpleName() + "/";
+
+ private final static int rows = 100;
+ private final static int cols = 10;
+
+ @Override
+ public void setUp() {
+ addTestConfiguration(TEST_NAME, new TestConfiguration(TEST_CLASS_DIR, TEST_NAME, new String[]{"B"}));
+ }
+
+ @Test
+ public void testLasso() {
+ runLassoTest();
+ }
+
+ private void runLassoTest() {
+ loadTestConfiguration(getTestConfiguration(TEST_NAME));
+ String HOME = SCRIPT_DIR + TEST_DIR;
+ fullDMLScriptName = HOME + TEST_NAME + ".dml";
+ List<String> proArgs = new ArrayList<>();
+
+ proArgs.add("-args");
+ proArgs.add(input("X"));
+ proArgs.add(input("y"));
+ proArgs.add(output("w"));
+ programArgs = proArgs.toArray(new String[proArgs.size()]);
+ double[][] X = getRandomMatrix(rows, cols, 0, 1, 0.8, -1);
+ double[][] y = getRandomMatrix(rows, 1, 0, 1, 0.8, -1);
+ writeInputMatrixWithMTD("X", X, true);
+ writeInputMatrixWithMTD("y", y, true);
+
+ runTest(true, EXCEPTION_NOT_EXPECTED, null, -1);
+ MatrixCharacteristics mc = readDMLMetaDataFile("w");
+ Assert.assertEquals(cols, mc.getRows());
+ Assert.assertEquals(1, mc.getCols());
+ }
+}
diff --git a/src/test/java/org/apache/sysds/test/functions/builtin/BuiltinPPCATest.java b/src/test/java/org/apache/sysds/test/functions/builtin/BuiltinPPCATest.java
new file mode 100644
index 0000000..c1b96cd
--- /dev/null
+++ b/src/test/java/org/apache/sysds/test/functions/builtin/BuiltinPPCATest.java
@@ -0,0 +1,74 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.sysds.test.functions.builtin;
+
+import org.apache.sysds.runtime.meta.MatrixCharacteristics;
+import org.apache.sysds.test.AutomatedTestBase;
+import org.apache.sysds.test.TestConfiguration;
+import org.junit.Assert;
+import org.junit.Test;
+
+import java.util.ArrayList;
+import java.util.List;
+
+public class BuiltinPPCATest extends AutomatedTestBase {
+ private final static String TEST_NAME = "ppca";
+ private final static String TEST_DIR = "functions/builtin/";
+ private final static String TEST_CLASS_DIR = TEST_DIR + BuiltinPPCATest.class.getSimpleName() + "/";
+
+ private final static int rows = 200;
+ private final static int cols = 20;
+
+ @Override
+ public void setUp() {
+ addTestConfiguration(TEST_NAME, new TestConfiguration(TEST_CLASS_DIR, TEST_NAME, new String[]{"PC","V"}));
+ }
+
+ @Test
+ public void testPpca4() {
+ runPPCATest(4);
+ }
+
+ @Test
+ public void testPpca16() {
+ runPPCATest(16);
+ }
+
+ private void runPPCATest(int k) {
+ loadTestConfiguration(getTestConfiguration(TEST_NAME));
+ String HOME = SCRIPT_DIR + TEST_DIR;
+ fullDMLScriptName = HOME + TEST_NAME + ".dml";
+ List<String> proArgs = new ArrayList<>();
+
+ proArgs.add("-args");
+ proArgs.add(input("X"));
+ proArgs.add(String.valueOf(k));
+ proArgs.add(output("PC"));
+ proArgs.add(output("V"));
+ programArgs = proArgs.toArray(new String[proArgs.size()]);
+ double[][] X = getRandomMatrix(rows, cols, 0, 1, 0.8, -1);
+ writeInputMatrixWithMTD("X", X, true);
+
+ runTest(true, EXCEPTION_NOT_EXPECTED, null, -1);
+ MatrixCharacteristics mc = readDMLMetaDataFile("PC");
+ Assert.assertEquals(rows, mc.getRows());
+ Assert.assertEquals(k, mc.getCols());
+ }
+}
diff --git a/src/test/scripts/functions/builtin/lasso.dml b/src/test/scripts/functions/builtin/lasso.dml
new file mode 100644
index 0000000..0a2e3e9
--- /dev/null
+++ b/src/test/scripts/functions/builtin/lasso.dml
@@ -0,0 +1,25 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+X = read($1)
+y = read($2)
+w = lasso(X = X, y = y)
+write(w, $3)
\ No newline at end of file
diff --git a/src/test/scripts/functions/builtin/ppca.dml b/src/test/scripts/functions/builtin/ppca.dml
new file mode 100644
index 0000000..2699ccc
--- /dev/null
+++ b/src/test/scripts/functions/builtin/ppca.dml
@@ -0,0 +1,26 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+X = read($1)
+k = $2;
+[PC, V] = ppca(X=X, K=k)
+write(PC, $3)
+write(V, $4)