You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@systemds.apache.org by mb...@apache.org on 2020/08/15 12:40:12 UTC

[systemds] 01/02: [SYSTEMDS-2619] New pca builtin function (principal component analysis)

This is an automated email from the ASF dual-hosted git repository.

mboehm7 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/systemds.git

commit 41685600ba8370646301ef0997f170c300101cea
Author: Matthias Boehm <mb...@gmail.com>
AuthorDate: Sat Aug 15 13:20:08 2020 +0200

    [SYSTEMDS-2619] New pca builtin function (principal component analysis)
---
 scripts/builtin/pca.dml                            | 64 ++++++++++++++++++++++
 .../java/org/apache/sysds/common/Builtins.java     |  1 +
 2 files changed, 65 insertions(+)

diff --git a/scripts/builtin/pca.dml b/scripts/builtin/pca.dml
new file mode 100644
index 0000000..b968162
--- /dev/null
+++ b/scripts/builtin/pca.dml
@@ -0,0 +1,64 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+# Principal Component Analysis (PCA) for dimensionality reduction
+# ---------------------------------------------------------------------------------------------
+# NAME   TYPE    DEFAULT  MEANING
+# ---------------------------------------------------------------------------------------------
+# X      Matrix  ---      Input feature matrix
+# K      Int     2        Number of reduced dimensions (i.e., columns)
+# center Boolean TRUE     Indicates whether or not to center the feature matrix
+# scale  Boolean TRUE     Indicates whether or not to scale the feature matrix
+# ---------------------------------------------------------------------------------------------
+# Xout   Matrix  ---      Output feature matrix with K columns
+# Mout   Matrix  ---      Output dominant eigen vectors (can be used for projections)
+# ---------------------------------------------------------------------------------------------
+
+m_pca = function(Matrix[Double] X, Integer K=2, Boolean center=TRUE, Boolean scale=TRUE)
+  return (Matrix[Double] Xout, Matrix[Double] Mout)
+{
+  # Projects X onto the K eigenvectors of its co-variance matrix with the largest eigenvalues.
+  N = nrow(X);
+  D = ncol(X);
+
+  # perform z-scoring (centering and/or scaling, as requested by the two flags)
+  X = scale(X, center, scale);
+
+  # co-variance matrix; the mu term keeps the formula valid when X was not centered
+  mu = colSums(X)/N;
+  C = (t(X) %*% X)/(N-1) - (N/(N-1))*t(mu) %*% mu;
+
+  # compute eigen vectors and values
+  [evalues, evectors] = eigen(C);
+
+  # permutation matrix P with P[i,idx[i]] = 1, where idx[i] is the position of the
+  # i-th largest eigenvalue in evalues
+  decreasing_Idx = order(target=evalues,by=1,decreasing=TRUE,index.return=TRUE);
+  diagmat = table(seq(1,D),decreasing_Idx);
+  # sort eigenvectors column-wise by decreasing eigenvalue: column i of the result must be
+  # column idx[i] of evectors, which needs t(P) (P itself applies the inverse permutation)
+  evectors = evectors %*% t(diagmat);
+  evec_dominant = evectors[,1:K];
+
+  # Construct new data set by treating computed dominant eigenvectors as the basis vectors
+  Xout = X %*% evec_dominant;
+  Mout = evec_dominant;
+}
diff --git a/src/main/java/org/apache/sysds/common/Builtins.java b/src/main/java/org/apache/sysds/common/Builtins.java
index cc5b12b..1cd430c 100644
--- a/src/main/java/org/apache/sysds/common/Builtins.java
+++ b/src/main/java/org/apache/sysds/common/Builtins.java
@@ -145,6 +145,7 @@ public enum Builtins {
 	OUTLIER("outlier", true, false), //TODO parameterize opposite
 	OUTLIER_SD("outlierBySd", true),
 	OUTLIER_IQR("outlierByIQR", true),
+	PCA("pca", true),
 	PNMF("pnmf", true),
 	PPRED("ppred", false),
 	PROD("prod", false),