You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@systemml.apache.org by de...@apache.org on 2017/09/22 17:53:37 UTC
systemml git commit: [SYSTEMML-493] Functionalize PCA
Repository: systemml
Updated Branches:
refs/heads/master 900d8c926 -> 317f2189c
[SYSTEMML-493] Functionalize PCA
Changes include wrapping PCA computation to a function call.
Improve docs and add default values in genRandData4PCA.dml.
Closes #653.
Project: http://git-wip-us.apache.org/repos/asf/systemml/repo
Commit: http://git-wip-us.apache.org/repos/asf/systemml/commit/317f2189
Tree: http://git-wip-us.apache.org/repos/asf/systemml/tree/317f2189
Diff: http://git-wip-us.apache.org/repos/asf/systemml/diff/317f2189
Branch: refs/heads/master
Commit: 317f2189c585e2dd37d847f5e2e32d97f8b31a60
Parents: 900d8c9
Author: krishnakalyan3 <kr...@gmail.com>
Authored: Fri Sep 22 10:51:44 2017 -0700
Committer: Deron Eriksson <de...@apache.org>
Committed: Fri Sep 22 10:51:44 2017 -0700
----------------------------------------------------------------------
scripts/datagen/genRandData4PCA.dml | 43 ++++++------
scripts/staging/PCA.dml | 117 +++++++++++++++++++++++++++++++
2 files changed, 138 insertions(+), 22 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/systemml/blob/317f2189/scripts/datagen/genRandData4PCA.dml
----------------------------------------------------------------------
diff --git a/scripts/datagen/genRandData4PCA.dml b/scripts/datagen/genRandData4PCA.dml
index 382a362..058facb 100644
--- a/scripts/datagen/genRandData4PCA.dml
+++ b/scripts/datagen/genRandData4PCA.dml
@@ -19,28 +19,27 @@
#
#-------------------------------------------------------------
-/*
-Synthetic data generator for PCA.
--> 3 hidden dimensions (V1, V2, V3)
--> generates only "dense" data
-
----------------------------------
- Parameters
----------------------------------
-$R = #rows
-$C = #columns
-$OUT = output file path on HDFS
-$FMT = output format
----------------------------------
-hadoop jar SystemML.jar -f genRandData4PCA.dml -nvargs R=1000000 C=1000 DATA=/user/biuser/pcaData.mtx FMT=csv
----------------------------------
-*/
-
-FMT = ifdef($FMT,"binary"); # default output format
-
-# number of categorical attributes.. numC <= C
-R = $R;
-C = $C;
+#
+# Synthetic data generator for PCA
+# 3 hidden dimensions (V1, V2, V3)
+# generates only "dense" data
+#
+# INPUT PARAMETERS:
+# --------------------------------------------------------------------------------------------
+# NAME TYPE DEFAULT MEANING
+# --------------------------------------------------------------------------------------------
+# R Int 10000 Number of rows
+# C Int 1000 Number of columns
+# OUT String --- Location (on HDFS) to store the generated dataset
+# FMT String "csv" Matrix output format, usually "text", "csv" or "binary"
+# --------------------------------------------------------------------------------------------
+#
+# Example:
+# hadoop jar SystemML.jar -f genRandData4PCA.dml -nvargs R=1000000 C=1000 OUT=/user/biuser/pcaData.mtx FMT=csv
+
+# Command-line arguments; each falls back to the default given as second argument.
+R   = ifdef($R, 10000);
+C   = ifdef($C, 1000);
+FMT = ifdef($FMT, "csv");
# Modified version of the procedure from Zou et al., "Sparse Principal Component Analysis", 2006.
http://git-wip-us.apache.org/repos/asf/systemml/blob/317f2189/scripts/staging/PCA.dml
----------------------------------------------------------------------
diff --git a/scripts/staging/PCA.dml b/scripts/staging/PCA.dml
new file mode 100644
index 0000000..d73c2e4
--- /dev/null
+++ b/scripts/staging/PCA.dml
@@ -0,0 +1,117 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+#
+# This script performs Principal Component Analysis (PCA) on the given input data.
+#
+# INPUT PARAMETERS:
+# ---------------------------------------------------------------------------------------------
+# NAME TYPE DEFAULT MEANING
+# ---------------------------------------------------------------------------------------------
+# INPUT String --- Location to read the matrix A of feature vectors
+# K Int ncol(A) Indicates dimension of the new vector space constructed from eigen vectors
+# CENTER Int 0 Indicates whether or not to center data (1 = center)
+# SCALE Int 0 Indicates whether or not to scale data (1 = scale)
+# OFMT String "CSV" Output data format
+# PROJDATA Int 0 This argument indicates if the data should be projected or not (1 = project)
+# MODEL String "" Location of an already existing model: eigenvectors and eigenvalues
+# OUTPUT String / Location to write output matrices (dominant eigenvalues, their standard deviations,
+# dominant eigenvectors, and data projected onto the dominant eigenvectors)
+# Example: hadoop jar SystemML.jar -f PCA.dml -nvargs INPUT=INPUT_DIR/pca-1000x1000
+# OUTPUT=OUTPUT_DIR/pca-1000x1000-model PROJDATA=1 CENTER=1 SCALE=1
+# ---------------------------------------------------------------------------------------------
+
+# Input matrix A of feature vectors.
+A           = read($INPUT);
+
+# Optional arguments; the second argument of ifdef is the value used when the
+# corresponding -nvargs entry is not supplied.
+K           = ifdef($K, ncol(A));
+center      = ifdef($CENTER, 0);
+scale       = ifdef($SCALE, 0);
+projectData = ifdef($PROJDATA, 0);
+ofmt        = ifdef($OFMT, "CSV");
+output      = ifdef($OUTPUT, "/");
+model       = ifdef($MODEL, "");
+
+if (model == "") {
+  # no existing model was given: model files are kept under the output location
+  model = output;
+} else {
+  # reuse existing model to project data
+  evec_dominant = read(model+"/dominant.eigen.vectors");
+}
+
+# Computes the K dominant principal components of the matrix A.
+#
+# The columns of A are optionally centered and scaled, the covariance matrix of the
+# result is eigen-decomposed, and the eigenpairs are sorted by decreasing eigenvalue.
+#
+# Arguments:
+#   A        matrix of feature vectors (rows are counted as N, columns as D)
+#   K        number of dominant eigenpairs to return; must satisfy 1 <= K <= ncol(A)
+#   center   1 to subtract the column means from A; any other value skips centering
+#   scale    1 to rescale the columns of A; any other value skips scaling
+#   ofmt, projectData, model, output
+#            not used by the computation; kept so existing calls remain valid
+#
+# Returns:
+#   eval_dominant   the K largest eigenvalues, in decreasing order (K x 1)
+#   evec_dominant   the matching eigenvectors, one per column (ncol(A) x K)
+#
+# NOTE(review): centering/scaling is applied to the A seen inside this function;
+# presumably the caller's matrix is left unchanged - confirm against DML call semantics.
+PCA = function(matrix[double] A, integer K, string ofmt, integer projectData, string model, integer center, integer scale, string output)
+  return(matrix[double] eval_dominant, matrix[double] evec_dominant) {
+
+  N = nrow(A);
+  D = ncol(A);
+
+  # fail early with a clear message instead of an index error in the slicing below
+  if (K < 1 | K > D) {
+    stop("PCA: K must be between 1 and the number of columns of A (" + D + "), but was " + K);
+  }
+
+  # perform z-scoring (centering and scaling)
+  if (center == 1) {
+    cm = colMeans(A);
+    A = A - cm;
+  }
+  if (scale == 1) {
+    cvars = (colSums (A^2));
+    if (center == 1){
+      # sample variance per column, computed from the sum of squares and the column means
+      cm = colMeans(A);
+      cvars = (cvars - N*(cm^2))/(N-1);
+    }
+    # NOTE(review): without centering, columns are divided by the root of their raw
+    # sum of squares (no 1/(N-1) factor) - confirm this is intended.
+    A = A/sqrt(cvars);
+  }
+
+  # co-variance matrix
+  mu = colSums(A)/N;
+  C = (t(A) %*% A)/(N-1) - (N/(N-1))*t(mu) %*% mu;
+
+  # compute eigen vectors and values
+  [evalues, evectors] = eigen(C);
+
+  # diagmat[i, j] = 1 exactly when the i-th largest eigenvalue sits at position j
+  decreasing_Idx = order(target=evalues,by=1,decreasing=TRUE,index.return=TRUE);
+  diagmat = table(seq(1,D),decreasing_Idx);
+  # sorts eigenvalues by decreasing order
+  evalues = diagmat %*% evalues;
+  # sorts eigenvectors column-wise in the order of decreasing eigenvalues; the transpose
+  # is required so that column i becomes the eigenvector of the i-th largest eigenvalue
+  # (without it the inverse permutation is applied, which only agrees when the
+  # permutation is its own inverse, e.g. a plain reversal)
+  evectors = evectors %*% t(diagmat);
+
+  # select K dominant eigen vectors
+  eval_dominant = evalues[1:K, 1];
+  evec_dominant = evectors[,1:K];
+}
+
+# A model location passed via MODEL means "reuse that model". The flag is derived from
+# the raw argument because `model` falls back to the output location when MODEL is unset
+# and therefore can no longer distinguish the two cases.
+modelArg = ifdef($MODEL, "");
+reuseModel = (modelArg != "");
+
+if (reuseModel) {
+  # project onto the stored eigenvectors instead of recomputing and overwriting the model
+  evec_dominant = read(model+"/dominant.eigen.vectors");
+} else {
+  [eval_dominant, evec_dominant] = PCA(A, K, ofmt, projectData, model, center, scale, output);
+
+  # the square root of eigenvalues
+  eval_stdev_dominant = sqrt(eval_dominant);
+  write(eval_stdev_dominant, model+"/dominant.eigen.standard.deviations", format=ofmt);
+  write(eval_dominant, model+"/dominant.eigen.values", format=ofmt);
+  write(evec_dominant, model+"/dominant.eigen.vectors", format=ofmt);
+}
+
+# Construct new data set by treating the dominant eigenvectors as the basis vectors.
+# Projection happens only when requested via PROJDATA=1 or when an existing model is reused.
+# NOTE(review): A here is the matrix as read from INPUT; any centering/scaling done inside
+# PCA() is presumably not visible at this point - confirm whether A should be z-scored first.
+if (projectData == 1 | reuseModel){
+  newA = A %*% evec_dominant;
+  write(newA, output+"/projected.data", format=ofmt);
+}