You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@systemml.apache.org by mb...@apache.org on 2016/01/08 20:07:58 UTC
[3/4] incubator-systemml git commit: New probabilistic pca (ppca)
script, still in staging; by Narine
New probabilistic pca (ppca) script, still in staging; by Narine
Project: http://git-wip-us.apache.org/repos/asf/incubator-systemml/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-systemml/commit/83a5b42d
Tree: http://git-wip-us.apache.org/repos/asf/incubator-systemml/tree/83a5b42d
Diff: http://git-wip-us.apache.org/repos/asf/incubator-systemml/diff/83a5b42d
Branch: refs/heads/master
Commit: 83a5b42d8d0baf61b6a4eecd5c1c18d16dada9e3
Parents: a71bb12
Author: Matthias Boehm <mb...@us.ibm.com>
Authored: Thu Jan 7 12:49:22 2016 -0800
Committer: Matthias Boehm <mb...@us.ibm.com>
Committed: Thu Jan 7 12:49:22 2016 -0800
----------------------------------------------------------------------
scripts/staging/PPCA.dml | 160 ++++++++++++++++++++++++++++++++++++++++++
1 file changed, 160 insertions(+)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/83a5b42d/scripts/staging/PPCA.dml
----------------------------------------------------------------------
diff --git a/scripts/staging/PPCA.dml b/scripts/staging/PPCA.dml
new file mode 100644
index 0000000..667c709
--- /dev/null
+++ b/scripts/staging/PPCA.dml
@@ -0,0 +1,160 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+# This script performs Probabilistic Principal Component Analysis (PCA) on the given input data.
+# It is based on paper: sPCA: Scalable Principal Component Analysis for Big Data on Distributed
+# Platforms. Tarek Elgamal et.al.
+
+# INPUT PARAMETERS:
+# ---------------------------------------------------------------------------------------------
+# NAME TYPE DEFAULT MEANING
+# ---------------------------------------------------------------------------------------------
+# X String --- location to read the matrix X input matrix
+# k Int --- indicates dimension of the new vector space constructed from eigen vectors
+# tolobj Int 0.00001 objective function tolerance value to stop ppca algorithm
+# tolrecerr Int 0.02 reconstruction error tolerance value to stop the algorithm
+# iter Int 10 maximum number of iterations
+# fmt String 'text' output format of results PPCA such as "text" or "csv"
+# hadoop jar SystemML.jar -f PPCA.dml -nvargs X=/INPUT_DIR/X C=/OUTPUT_DIR/C V=/OUTPUT_DIR/V k=2 tol=0.2 iter=100
+# ---------------------------------------------------------------------------------------------
+# OUTPUT PARAMETERS:
+# ---------------------------------------------------------------------------------------------
+# NAME TYPE DEFAULT MEANING
+# ---------------------------------------------------------------------------------------------
+# C Matrix --- principal components
+# V Matrix --- eigenvalues / eigenvalues of principal components
+#
+
+X = read($X);
+
+fileC = $C;
+fileV = $V;
+
+k = ifdef($k, ncol(X));
+iter = ifdef($iter, 10);
+tolobj = ifdef($tolobj, 0.00001);
+tolrecerr = ifdef($tolrecerr, 0.02);
+fmt0 = ifdef($fmt, "text");
+
+n = nrow(X);
+m = ncol(X);
+
+#initializing principal components matrix
+C = rand(rows=m, cols=k, pdf="normal");
+ss = rand(rows=1, cols=1, pdf="normal");
+ss = as.scalar(ss);
+ssPrev = ss;
+
+# best selected principle components - with the lowest reconstruction error
+PC = C;
+
+# initilizing reconstruction error
+RE = tolrecerr+1;
+REBest = RE;
+
+Z = matrix(0,rows=1,cols=1);
+
+#Objective function value
+ObjRelChng = tolobj+1;
+
+# mean centered input matrix - dim -> [n,m]
+Xm = X - colMeans(X);
+
+#I -> k x k
+ITMP = matrix(1,rows=k,cols=1);
+I = diag(ITMP);
+
+i = 0;
+while (i < iter & ObjRelChng > tolobj & RE > tolrecerr){
+ #Estimation step - Covariance matrix
+ #M -> k x k
+ M = t(C) %*% C + I*ss;
+
+ #Auxilary matrix with n latent variables
+ # Z -> n x k
+ Z = Xm %*% (C %*% inv(M));
+
+ #ZtZ -> k x k
+ ZtZ = t(Z) %*% Z + inv(M)*ss;
+
+ #XtZ -> m x k
+ XtZ = t(Xm) %*% Z;
+
+ #Maximization step
+ #C -> m x k
+ ZtZ_sum = sum(ZtZ); #+n*inv(M));
+ C = XtZ/ZtZ_sum;
+
+ #ss2 -> 1 x 1
+ ss2 = trace(ZtZ * (t(C) %*% C));
+
+ #ss3 -> 1 x 1
+ ss3 = sum((Z %*% t(C)) %*% t(Xm));
+
+ #Frobenius norm of reconstruction error -> Euclidean norm
+ #Fn -> 1 x 1
+ Fn = sum(Xm*Xm);
+
+ #ss -> 1 x 1
+ ss = (Fn + ss2 - 2*ss3)/(n*m);
+
+ #calculating objective function relative change
+ ObjRelChng = abs(1 - ss/ssPrev);
+ #print("Objective Relative Change: " + ObjRelChng + ", Objective: " + ss);
+
+ #Reconstruction error
+ R = ((Z %*% t(C)) - Xm);
+
+ #calculate the error
+ #TODO rethink calculation of reconstruction error ....
+ #1-Norm of reconstruction error - a big dense matrix
+ #RE -> n x m
+ RE = abs(sum(R)/sum(Xm));
+ if (RE < REBest){
+ PC = C;
+ REBest = RE;
+ }
+ #print("ss: " + ss +" = Fn( "+ Fn +" ) + ss2( " + ss2 +" ) - 2*ss3( " + ss3 + " ), Reconstruction Error: " + RE);
+
+ ssPrev = ss;
+ i = i+1;
+}
+print("Objective Relative Change: " + ObjRelChng);
+print ("Number of iterations: " + i + ", Reconstruction Err: " + REBest);
+
+# reconstructs data
+# RD -> n x k
+RD = X %*% PC;
+
+# calculate eigenvalues - principle component variance
+RDMean = colMeans(RD);
+V = t(colMeans(RD*RD) - (RDMean*RDMean));
+
+# sorting eigenvalues and eigenvectors in decreasing order
+V_decr_idx = order(target=V,by=1,decreasing=TRUE,index.return=TRUE);
+VF_decr = table(seq(1,nrow(V)),V_decr_idx);
+V = VF_decr %*% V;
+PC = PC %*% VF_decr;
+
+# writing principal components
+write(PC, fileC, format=fmt0);
+# writing eigen values/pc variance
+write(V, fileV, format=fmt0);