You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@systemml.apache.org by de...@apache.org on 2017/09/22 17:53:37 UTC
systemml git commit: [SYSTEMML-493] Functionalize PCA

Repository: systemml
Updated Branches:
  refs/heads/master 900d8c926 -> 317f2189c


[SYSTEMML-493] Functionalize PCA

Changes include wrapping PCA computation to a function call.
Improve docs and add default values in genRandData4PCA.dml.

Closes #653.


Project: http://git-wip-us.apache.org/repos/asf/systemml/repo
Commit: http://git-wip-us.apache.org/repos/asf/systemml/commit/317f2189
Tree: http://git-wip-us.apache.org/repos/asf/systemml/tree/317f2189
Diff: http://git-wip-us.apache.org/repos/asf/systemml/diff/317f2189

Branch: refs/heads/master
Commit: 317f2189c585e2dd37d847f5e2e32d97f8b31a60
Parents: 900d8c9
Author: krishnakalyan3 <kr...@gmail.com>
Authored: Fri Sep 22 10:51:44 2017 -0700
Committer: Deron Eriksson <de...@apache.org>
Committed: Fri Sep 22 10:51:44 2017 -0700

----------------------------------------------------------------------
 scripts/datagen/genRandData4PCA.dml |  43 ++++++------
 scripts/staging/PCA.dml             | 117 +++++++++++++++++++++++++++++++
 2 files changed, 138 insertions(+), 22 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/systemml/blob/317f2189/scripts/datagen/genRandData4PCA.dml
----------------------------------------------------------------------
diff --git a/scripts/datagen/genRandData4PCA.dml b/scripts/datagen/genRandData4PCA.dml
index 382a362..058facb 100644
--- a/scripts/datagen/genRandData4PCA.dml
+++ b/scripts/datagen/genRandData4PCA.dml
@@ -19,28 +19,27 @@
 #
 #-------------------------------------------------------------
 
-/*
-Synthetic data generator for PCA.
--> 3 hidden dimensions (V1, V2, V3)
--> generates only "dense" data
-
----------------------------------
-  Parameters                     
----------------------------------
-$R    = #rows
-$C    = #columns
-$OUT  = output file path on HDFS
-$FMT  = output format
----------------------------------
-hadoop jar SystemML.jar -f genRandData4PCA.dml -nvargs R=1000000 C=1000 DATA=/user/biuser/pcaData.mtx FMT=csv
----------------------------------
-*/
-
-FMT = ifdef($FMT,"binary"); # default output format
-
-# number of categorical attributes.. numC <= C
-R = $R;
-C = $C;
+#
+# Synthetic data generator for PCA
+# 3 hidden dimensions (V1, V2, V3)
+# generates only "dense" data
+#
+# INPUT PARAMETERS:
+# --------------------------------------------------------------------------------------------
+# NAME   TYPE   DEFAULT  MEANING
+# --------------------------------------------------------------------------------------------
+# R      Int     10000   Number of rows
+# C      Int     1000    Number of columns
+# OUT    String  ---     Location (on HDFS) to store the generated dataset
+# FMT    String  "csv"   Matrix output format, usually "text", "csv" or "binary"
+# --------------------------------------------------------------------------------------------
+#
+# Example:
+# hadoop jar SystemML.jar -f genRandData4PCA.dml -nvargs R=1000000 C=1000 OUT=/user/biuser/pcaData.mtx FMT=csv
+
+R =   ifdef ($R, 10000)
+C   = ifdef ($C, 1000)
+FMT = ifdef ($FMT, "csv");
 
 # Modofied version of the procedure from Zou et.al., "Sparse Principal Component Analysis", 2006.
 

http://git-wip-us.apache.org/repos/asf/systemml/blob/317f2189/scripts/staging/PCA.dml
----------------------------------------------------------------------
diff --git a/scripts/staging/PCA.dml b/scripts/staging/PCA.dml
new file mode 100644
index 0000000..d73c2e4
--- /dev/null
+++ b/scripts/staging/PCA.dml
@@ -0,0 +1,117 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+#
+# This script performs Principal Component Analysis (PCA) on the given input data.
+#
+# INPUT PARAMETERS:
+# ---------------------------------------------------------------------------------------------
+# NAME   TYPE   DEFAULT  MEANING
+# ---------------------------------------------------------------------------------------------
+# INPUT  String ---      Location to read the matrix A of feature vectors
+# K      Int    ncol(A)  Indicates dimension of the new vector space constructed from eigen vectors
+# CENTER Int    0        Indicates whether or not to center data
+# SCALE  Int    0        Indicates whether or not to scale data
+# OFMT   String "CSV"    Output data format
+# PROJDATA Int  0        This argument indicates if the data should be projected or not
+# MODEL  String ---      Location to already existing model: eigenvectors and eigenvalues
+# OUTPUT String /        Location to write output matrices (covariance matrix, new basis vectors,
+#                           and data projected onto new basis vectors)
+# hadoop jar SystemML.jar -f PCA.dml -nvargs INPUT=INPUT_DIR/pca-1000x1000
+# OUTPUT=OUTPUT_DIR/pca-1000x1000-model PROJDATA=1 CENTER=1 SCALE=1
+# ---------------------------------------------------------------------------------------------
+
+A = read($INPUT);                   # input matrix of feature vectors
+K = ifdef($K, ncol(A));             # number of components to keep; defaults to all columns of A
+ofmt = ifdef($OFMT, "CSV");         # format passed to every write() in this script
+projectData = ifdef($PROJDATA,0);   # 1 = also write A projected onto the dominant eigenvectors
+model = ifdef($MODEL,"");           # directory of an existing model; "" = none supplied
+center = ifdef($CENTER,0);          # 1 = subtract the column means inside PCA()
+scale = ifdef($SCALE,0);            # 1 = divide each column by sqrt(cvars) inside PCA()
+output = ifdef($OUTPUT,"/");        # directory prefix for the output matrices
+# Resolve the model directory: keep $MODEL when given, otherwise fall back to the output directory.
+# reuse existing model to project data
+if (model != "") {
+    evec_dominant = read(model+"/dominant.eigen.vectors");  # NOTE(review): reassigned by the unconditional PCA() call further down, so this read looks dead - confirm intent
+    }else{
+	model = output;  # no model supplied: the model files are written under the output directory
+}
+
+PCA = function(matrix[double] A, integer K, string ofmt, integer projectData, string model, integer center, integer scale, string output)
+    return(matrix[double] eval_dominant, matrix[double] evec_dominant) {
+    # Returns the first K eigenvalues (sorted decreasingly) of the covariance matrix of A and K eigenvector columns.
+    evec_dominant = matrix(0,cols=1,rows=1);  # placeholder; replaced by the final assignment below
+    # NOTE: ofmt, projectData, model and output are accepted but never referenced in this body.
+    N = nrow(A);  # number of rows of A
+    D = ncol(A);  # number of columns of A
+    # center == 1 subtracts the column means; scale == 1 divides each column by sqrt(cvars).
+    # perform z-scoring (centering and scaling)
+    if (center == 1) {
+        cm = colMeans(A);
+        A = A - cm;
+    }
+    if (scale == 1) {
+        cvars = (colSums (A^2));  # per-column sum of squares; left as-is when center != 1
+        if (center == 1){
+        cm = colMeans(A);  # recomputed on the already-centered A
+            cvars = (cvars - N*(cm^2))/(N-1);  # sample-variance form: (sum of squares - N*mean^2)/(N-1)
+        }
+        Azscored = (A)/sqrt(cvars);
+            A = Azscored;
+    }
+    # C = (A'A)/(N-1) - (N/(N-1)) * mu'mu, with mu the row vector of column means of the current A.
+    # co-variance matrix
+    mu = colSums(A)/N;
+    C = (t(A) %*% A)/(N-1) - (N/(N-1))*t(mu) %*% mu;
+    # C is D x D, so eigen() is applied to a square matrix.
+    # evalues holds the eigenvalues as a column; evectors holds one eigenvector per column.
+    # compute eigen vectors and values
+    [evalues, evectors] = eigen(C);
+    # decreasing_Idx(i) is the position in evalues of the i-th largest eigenvalue; diagmat has a 1 at (i, decreasing_Idx(i)).
+    decreasing_Idx = order(target=evalues,by=1,decreasing=TRUE,index.return=TRUE);
+    diagmat = table(seq(1,D),decreasing_Idx);
+    # sorts eigenvalues by decreasing order
+    evalues = diagmat %*% evalues;
+    # sorts eigenvectors column-wise in the order of decreasing eigenvalues
+    evectors = evectors %*% diagmat;
+    # NOTE(review): right-multiplying by diagmat matches the eigenvalue order only if the permutation is its own
+    # inverse (e.g. a full reversal); otherwise t(diagmat) would be needed - confirm against eigen()'s output order.
+    # select K dominant eigen vectors
+    nvec = ncol(evectors);  # not used afterwards in this function
+    # Keep the first K sorted eigenvalues (K x 1) and the first K eigenvector columns.
+    eval_dominant = evalues[1:K, 1];
+    evec_dominant = evectors[,1:K];
+    # Both return variables are now assigned.
+}
+
+[eval_dominant, evec_dominant] = PCA(A, K, ofmt, projectData, model, center, scale, output)  # runs unconditionally, also when $MODEL was supplied
+# Persist the model (standard deviations, eigenvalues, eigenvectors) under the model directory.
+# the square root of eigenvalues
+eval_stdev_dominant = sqrt(eval_dominant);
+write(eval_stdev_dominant, model+"/dominant.eigen.standard.deviations", format=ofmt);
+write(eval_dominant, model+"/dominant.eigen.values", format=ofmt);
+write(evec_dominant, model+"/dominant.eigen.vectors", format=ofmt);
+# NOTE(review): model was set to output earlier when $MODEL is absent, so model != "" holds unless $OUTPUT is "" - confirm PROJDATA=0 is meant to skip the projection.
+# Construct new data set by treating computed dominant eigenvectors as the basis vectors
+if (projectData == 1 | model != ""){
+	newA = A %*% evec_dominant;  # NOTE(review): A is the matrix as read from $INPUT; PCA() does not return its centered/scaled copy - confirm
+	write(newA, output+"/projected.data", format=ofmt);
+}