You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@systemds.apache.org by mb...@apache.org on 2020/10/10 17:43:36 UTC

[systemds] branch master updated: [SYSTEMDS-2682/3] New Lasso and PPCA built-in functions

This is an automated email from the ASF dual-hosted git repository.

mboehm7 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/systemds.git


The following commit(s) were added to refs/heads/master by this push:
     new e929457  [SYSTEMDS-2682/3] New Lasso and PPCA built-in functions
e929457 is described below

commit e929457e2f1850e6f1c2b5c490523f9526e51be5
Author: Sven Celin <sv...@gmail.com>
AuthorDate: Sat Oct 10 19:38:10 2020 +0200

    [SYSTEMDS-2682/3] New Lasso and PPCA built-in functions
    
    AMLS project SS2020.
    Closes #1071.
---
 scripts/builtin/lasso.dml                          | 126 +++++++++++++++++
 scripts/builtin/ppca.dml                           | 149 +++++++++++++++++++++
 .../java/org/apache/sysds/common/Builtins.java     |   2 +
 .../test/functions/builtin/BuiltinLassoTest.java   |  70 ++++++++++
 .../test/functions/builtin/BuiltinPPCATest.java    |  74 ++++++++++
 src/test/scripts/functions/builtin/lasso.dml       |  25 ++++
 src/test/scripts/functions/builtin/ppca.dml        |  26 ++++
 7 files changed, 472 insertions(+)

diff --git a/scripts/builtin/lasso.dml b/scripts/builtin/lasso.dml
new file mode 100644
index 0000000..7b6b464
--- /dev/null
+++ b/scripts/builtin/lasso.dml
@@ -0,0 +1,126 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+# Builtin function for the SpaRSA algorithm to perform lasso regression
+# (SpaRSA .. Sparse Reconstruction by Separable Approximation)
+#
+# INPUTS:
+# ---------------------------------------------------------------------------------------------
+# NAME            TYPE     DEFAULT     MEANING
+# ---------------------------------------------------------------------------------------------
+# X               Double   ---         input feature matrix
+# y               Double   ---         matrix Y columns of the design matrix
+# tol             Double   1e-15       target convergence tolerance
+# M               Integer  5           history length
+# tau             Double   1           regularization component
+# maxi            Integer  100         maximum number of iterations until convergence
+# ---------------------------------------------------------------------------------------------
+# OUTPUTS
+# w               Double   ---        model matrix
+
+
+m_lasso = function(Matrix[Double] X, Matrix[Double] y, Double tol = 10^(-15), 
+  Integer M = 5, Double tau = 1, Integer maxi = 100, Boolean verbose = TRUE)
+  return(Matrix[Double] w)
+{
+  n = nrow(X)
+  m = ncol(X)
+
+  #constants
+  eta = 2
+  sigma = 0.01
+  alpha_min = 10^(-30)
+  alpha_max = 10^(30)
+
+  #init
+  alpha = 1
+  w = Rand(rows=m, cols=1, min=0, max=1, pdf="uniform")
+  history = -1e30 * matrix(1, rows=M, cols=1)
+
+  r = X %*% w - y
+  g = t(X) %*% r
+  obj = 0.5 * sum(r*r) + tau*sum(abs(w))
+
+  if( verbose )
+    print("Initial OBJ=" + obj)
+
+  history[M,1] = obj
+
+  inactive_set = matrix(1, rows=m, cols=1)
+  iter = 0
+  continue = TRUE
+  while(iter < maxiter & continue) {
+    dw = matrix(0, rows=m, cols=1)
+    dg = matrix(0, rows=m, cols=1)
+    relChangeObj = -1.0
+
+    inner_iter = 0
+    inner_continue = TRUE
+    inner_maxiter = 100
+    while(inner_iter < inner_maxiter & inner_continue) {
+      u = w - g/alpha
+      lambda = tau/alpha
+
+      wnew = sign(u) * (abs(u) - lambda) * ((abs(u) - lambda) > 0)
+      dw = wnew - w
+      dw2 = sum(dw*dw)
+
+      r = X %*% wnew - y
+      gnew = t(X) %*% r
+      objnew = 0.5 * sum(r*r) + tau*sum(abs(wnew))
+      obj_threshold = max(history) - 0.5*sigma*alpha*dw2
+
+      if(objnew <= obj_threshold) {
+        w = wnew
+        dg = gnew - g
+        g = gnew
+        inner_continue = FALSE
+
+        history[1:(M-1),] = history[2:M,]
+        history[M,1] = objnew
+        relChangeObj = abs(objnew - obj)/obj
+        obj = objnew
+      }
+      else
+        alpha = eta*alpha
+
+      inner_iter = inner_iter + 1
+    }
+
+    if(inner_continue)
+      print("Inner loop did not converge")
+
+    alphanew = sum(dw*dg)/sum(dw*dw)
+    alpha = max(alpha_min, min(alpha_max, alphanew))
+
+    old_inactive_set = inactive_set
+    inactive_set = w != 0
+    diff = sum(abs(old_inactive_set - inactive_set))
+
+    if(diff == 0 & relChangeObj < tol)
+      continue = FALSE
+
+    num_inactive = sum(w != 0)
+    if( verbose )
+      print("ITER=" + iter + " OBJ=" + obj + " relative change=" + relChangeObj + " num_inactive=" + num_inactive)
+    iter = iter + 1
+  }
+}
diff --git a/scripts/builtin/ppca.dml b/scripts/builtin/ppca.dml
new file mode 100644
index 0000000..2683f90
--- /dev/null
+++ b/scripts/builtin/ppca.dml
@@ -0,0 +1,149 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+# This script performs Probabilistic Principal Component Analysis (PCA) on the given input data.
+# It is based on paper: sPCA: Scalable Principal Component Analysis for Big Data on Distributed
+# Platforms. Tarek Elgamal et.al.
+
+# INPUT PARAMETERS:
+# ---------------------------------------------------------------------------------------------
+# NAME      TYPE    DEFAULT  MEANING
+# ---------------------------------------------------------------------------------------------
+# X         Matrix  ---      n x m input feature matrix
+# k         Integer ---      indicates dimension of the new vector space constructed from eigen vectors
+# maxi      Integer ---      maximum number of iterations until convergence
+# tolobj    Double  0.00001  objective function tolerance value to stop ppca algorithm
+# tolrecerr Double  0.02     reconstruction error tolerance value to stop the algorithm
+# verbose   Boolen  TRUE     verbose debug output
+# ---------------------------------------------------------------------------------------------
+# OUTPUT PARAMETERS:
+# ---------------------------------------------------------------------------------------------
+# NAME   TYPE   DEFAULT  MEANING
+# ---------------------------------------------------------------------------------------------
+# Xout   Matrix  ---      Output feature matrix with K columns
+# Mout   Matrix  ---      Output dominant eigen vectors (can be used for projections)
+
+
+m_ppca = function(Matrix[Double] X, Integer K=2, Integer maxi = 10, 
+  Double tolobj = 0.00001, Double tolrecerr = 0.02, Boolean verbose = TRUE)
+  return(Matrix[Double] Xout, Matrix[Double] Mout)
+{
+  n = nrow(X);
+  m = ncol(X);
+
+  #initializing principal components matrix
+  C =  rand(rows=m, cols=K, pdf="normal");
+  ss = rand(rows=1, cols=1, pdf="normal");
+  ss = as.scalar(ss);
+  ssPrev = ss;
+
+  # best selected principle components - with the lowest reconstruction error
+  PC = C;
+
+  # initilizing reconstruction error
+  RE = tolrecerr+1;
+  REBest = RE;
+
+  Z = matrix(0,rows=1,cols=1);
+
+  #Objective function value
+  ObjRelChng = tolobj+1;
+
+  # mean centered input matrix - dim -> [n,m]
+  Xm = X - colMeans(X);
+
+  #I -> k x k
+  ITMP = matrix(1,rows=K,cols=1);
+  I = diag(ITMP);
+
+  i = 0;
+  while (i < maxi & ObjRelChng > tolobj & RE > tolrecerr){
+    #Estimation step - Covariance matrix
+    #M -> k x k
+    M = t(C) %*% C + I*ss;
+
+    #Auxilary matrix with n latent variables
+    # Z -> n x k
+    Z = Xm %*% (C %*% inv(M));
+
+    #ZtZ -> k x k
+    ZtZ = t(Z) %*% Z + inv(M)*ss;
+
+    #XtZ -> m x k
+    XtZ = t(Xm) %*% Z;
+
+    #Maximization step
+    #C ->  m x k
+    ZtZ_sum = sum(ZtZ); #+n*inv(M));
+    C = XtZ/ZtZ_sum;
+
+    #ss2 -> 1 x 1
+    ss2 = trace(ZtZ * (t(C) %*% C));
+
+    #ss3 -> 1 x 1
+    ss3 = sum((Z %*% t(C)) %*% t(Xm));
+
+    #Frobenius norm of reconstruction error -> Euclidean norm
+    #Fn -> 1 x 1
+    Fn = sum(Xm*Xm);
+
+    #ss -> 1 x 1
+    ss = (Fn + ss2 - 2*ss3)/(n*m);
+
+    #calculating objective function relative change
+    ObjRelChng = abs(1 - ss/ssPrev);
+    #print("Objective Relative Change: " + ObjRelChng + ", Objective: " + ss);
+
+    #Reconstruction error
+    R = ((Z %*% t(C)) -  Xm);
+
+    #calculate the error
+    #TODO rethink calculation of reconstruction error ....
+    #1-Norm of reconstruction error - a big dense matrix
+    #RE -> n x m
+    RE = abs(sum(R)/sum(Xm));
+    if (RE < REBest){
+      PC = C;
+      REBest = RE;
+    }
+    #print("ss: " + ss +" = Fn( "+ Fn +" ) + ss2( " + ss2  +" ) - 2*ss3( " + ss3 + " ), Reconstruction Error: " + RE);
+
+    ssPrev = ss;
+    i = i+1;
+  }
+  if( verbose )
+    print("Objective Relative Change: " + ObjRelChng);
+  if( verbose )
+    print ("Number of iterations: " + i + ", Reconstruction Err: " + REBest);
+
+  # reconstructs data
+  # RD -> n x k
+  Xout = X %*% PC;
+
+  # calculate eigenvalues - principle component variance
+  RDMean = colMeans(Xout);
+  V = t(colMeans(Xout^2) - (RDMean^2));
+
+  # sorting eigenvalues and eigenvectors in decreasing order
+  V_decr_idx = order(target=V,by=1,decreasing=TRUE,index.return=TRUE);
+  VF_decr = table(seq(1,nrow(V)),V_decr_idx);
+  Mout = PC %*% VF_decr;  # vectors (values via VF_decr %*% V)
+}
diff --git a/src/main/java/org/apache/sysds/common/Builtins.java b/src/main/java/org/apache/sysds/common/Builtins.java
index 92f6e48..157a411 100644
--- a/src/main/java/org/apache/sysds/common/Builtins.java
+++ b/src/main/java/org/apache/sysds/common/Builtins.java
@@ -120,6 +120,7 @@ public enum Builtins {
 	ISINF("is.infinite", false),
 	KMEANS("kmeans", true),
 	L2SVM("l2svm", true),
+	LASSO("lasso", true),
 	LENGTH("length", false),
 	LINEAGE("lineage", false),
 	LIST("list", false),  //note: builtin and parbuiltin
@@ -155,6 +156,7 @@ public enum Builtins {
 	OUTLIER_IQR("outlierByIQR", true),
 	PCA("pca", true),
 	PNMF("pnmf", true),
+	PPCA("ppca", true),
 	PPRED("ppred", false),
 	PROD("prod", false),
 	QR("qr", false, ReturnType.MULTI_RETURN),
diff --git a/src/test/java/org/apache/sysds/test/functions/builtin/BuiltinLassoTest.java b/src/test/java/org/apache/sysds/test/functions/builtin/BuiltinLassoTest.java
new file mode 100644
index 0000000..db94b04
--- /dev/null
+++ b/src/test/java/org/apache/sysds/test/functions/builtin/BuiltinLassoTest.java
@@ -0,0 +1,70 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.sysds.test.functions.builtin;
+
+import org.apache.sysds.runtime.meta.MatrixCharacteristics;
+import org.apache.sysds.test.AutomatedTestBase;
+import org.apache.sysds.test.TestConfiguration;
+import org.junit.Assert;
+import org.junit.Test;
+
+import java.util.ArrayList;
+import java.util.List;
+
+public class BuiltinLassoTest extends AutomatedTestBase {
+	private final static String TEST_NAME = "lasso";
+	private final static String TEST_DIR = "functions/builtin/";
+	private final static String TEST_CLASS_DIR = TEST_DIR + BuiltinLassoTest.class.getSimpleName() + "/";
+
+	private final static int rows = 100;
+	private final static int cols = 10;
+
+	@Override
+	public void setUp() {
+		addTestConfiguration(TEST_NAME, new TestConfiguration(TEST_CLASS_DIR, TEST_NAME, new String[]{"B"}));
+	}
+
+	@Test
+	public void testLasso() { 
+		runLassoTest();
+	}
+
+	private void runLassoTest() {
+		loadTestConfiguration(getTestConfiguration(TEST_NAME));
+		String HOME = SCRIPT_DIR + TEST_DIR;
+		fullDMLScriptName = HOME + TEST_NAME + ".dml";
+		List<String> proArgs = new ArrayList<>();
+
+		proArgs.add("-args");
+		proArgs.add(input("X"));
+		proArgs.add(input("y"));
+		proArgs.add(output("w"));
+		programArgs = proArgs.toArray(new String[proArgs.size()]);
+		double[][] X = getRandomMatrix(rows, cols, 0, 1, 0.8, -1);
+		double[][] y = getRandomMatrix(rows, 1, 0, 1, 0.8, -1);
+		writeInputMatrixWithMTD("X", X, true);
+		writeInputMatrixWithMTD("y", y, true);
+
+		runTest(true, EXCEPTION_NOT_EXPECTED, null, -1);
+		MatrixCharacteristics mc = readDMLMetaDataFile("w");
+		Assert.assertEquals(cols, mc.getRows());
+		Assert.assertEquals(1, mc.getCols());
+	}
+}
diff --git a/src/test/java/org/apache/sysds/test/functions/builtin/BuiltinPPCATest.java b/src/test/java/org/apache/sysds/test/functions/builtin/BuiltinPPCATest.java
new file mode 100644
index 0000000..c1b96cd
--- /dev/null
+++ b/src/test/java/org/apache/sysds/test/functions/builtin/BuiltinPPCATest.java
@@ -0,0 +1,74 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.sysds.test.functions.builtin;
+
+import org.apache.sysds.runtime.meta.MatrixCharacteristics;
+import org.apache.sysds.test.AutomatedTestBase;
+import org.apache.sysds.test.TestConfiguration;
+import org.junit.Assert;
+import org.junit.Test;
+
+import java.util.ArrayList;
+import java.util.List;
+
+public class BuiltinPPCATest extends AutomatedTestBase {
+	private final static String TEST_NAME = "ppca";
+	private final static String TEST_DIR = "functions/builtin/";
+	private final static String TEST_CLASS_DIR = TEST_DIR + BuiltinPPCATest.class.getSimpleName() + "/";
+
+	private final static int rows = 200;
+	private final static int cols = 20;
+
+	@Override
+	public void setUp() {
+		addTestConfiguration(TEST_NAME, new TestConfiguration(TEST_CLASS_DIR, TEST_NAME, new String[]{"PC","V"}));
+	}
+
+	@Test
+	public void testPpca4() {
+		runPPCATest(4);
+	}
+	
+	@Test
+	public void testPpca16() {
+		runPPCATest(16);
+	}
+
+	private void runPPCATest(int k) {
+		loadTestConfiguration(getTestConfiguration(TEST_NAME));
+		String HOME = SCRIPT_DIR + TEST_DIR;
+		fullDMLScriptName = HOME + TEST_NAME + ".dml";
+		List<String> proArgs = new ArrayList<>();
+
+		proArgs.add("-args");
+		proArgs.add(input("X"));
+		proArgs.add(String.valueOf(k));
+		proArgs.add(output("PC"));
+		proArgs.add(output("V"));
+		programArgs = proArgs.toArray(new String[proArgs.size()]);
+		double[][] X = getRandomMatrix(rows, cols, 0, 1, 0.8, -1);
+		writeInputMatrixWithMTD("X", X, true);
+
+		runTest(true, EXCEPTION_NOT_EXPECTED, null, -1);
+		MatrixCharacteristics mc = readDMLMetaDataFile("PC");
+		Assert.assertEquals(rows, mc.getRows());
+		Assert.assertEquals(k, mc.getCols());
+	}
+}
diff --git a/src/test/scripts/functions/builtin/lasso.dml b/src/test/scripts/functions/builtin/lasso.dml
new file mode 100644
index 0000000..0a2e3e9
--- /dev/null
+++ b/src/test/scripts/functions/builtin/lasso.dml
@@ -0,0 +1,25 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+X = read($1)
+y = read($2)
+w = lasso(X = X, y = y)
+write(w, $3)
\ No newline at end of file
diff --git a/src/test/scripts/functions/builtin/ppca.dml b/src/test/scripts/functions/builtin/ppca.dml
new file mode 100644
index 0000000..2699ccc
--- /dev/null
+++ b/src/test/scripts/functions/builtin/ppca.dml
@@ -0,0 +1,26 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+X = read($1)
+k = $2;
+[PC, V] = ppca(X=X, K=k)
+write(PC, $3)
+write(V, $4)