You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@systemml.apache.org by mb...@apache.org on 2016/01/08 20:07:58 UTC
[3/4] incubator-systemml git commit: New probabilistic pca (ppca) script, still in staging; by Narine

New probabilistic pca (ppca) script, still in staging; by Narine

Project: http://git-wip-us.apache.org/repos/asf/incubator-systemml/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-systemml/commit/83a5b42d
Tree: http://git-wip-us.apache.org/repos/asf/incubator-systemml/tree/83a5b42d
Diff: http://git-wip-us.apache.org/repos/asf/incubator-systemml/diff/83a5b42d

Branch: refs/heads/master
Commit: 83a5b42d8d0baf61b6a4eecd5c1c18d16dada9e3
Parents: a71bb12
Author: Matthias Boehm <mb...@us.ibm.com>
Authored: Thu Jan 7 12:49:22 2016 -0800
Committer: Matthias Boehm <mb...@us.ibm.com>
Committed: Thu Jan 7 12:49:22 2016 -0800

----------------------------------------------------------------------
 scripts/staging/PPCA.dml | 160 ++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 160 insertions(+)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/83a5b42d/scripts/staging/PPCA.dml
----------------------------------------------------------------------
diff --git a/scripts/staging/PPCA.dml b/scripts/staging/PPCA.dml
new file mode 100644
index 0000000..667c709
--- /dev/null
+++ b/scripts/staging/PPCA.dml
@@ -0,0 +1,160 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+# 
+#   http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+ 
+# This script performs Probabilistic Principal Component Analysis (PCA) on the given input data. 
+# It is based on paper: sPCA: Scalable Principal Component Analysis for Big Data on Distributed 
+# Platforms. Tarek Elgamal et.al.
+
+# INPUT PARAMETERS:
+# ---------------------------------------------------------------------------------------------
+# NAME   	TYPE   DEFAULT  MEANING
+# ---------------------------------------------------------------------------------------------
+# X  	 	String ---      location to read the matrix X input matrix
+# k      	Int    ---      indicates dimension of the new vector space constructed from eigen vectors
+# tolobj 	Int    0.00001  objective function tolerance value to stop ppca algorithm
+# tolrecerr	Int    0.02     reconstruction error tolerance value to stop the algorithm	
+# iter   	Int    10       maximum number of iterations
+# fmt    	String 'text'   output format of results PPCA such as "text" or "csv"
+# hadoop jar SystemML.jar -f PPCA.dml -nvargs X=/INPUT_DIR/X  C=/OUTPUT_DIR/C V=/OUTPUT_DIR/V k=2 tol=0.2 iter=100
+# ---------------------------------------------------------------------------------------------
+# OUTPUT PARAMETERS: 
+# ---------------------------------------------------------------------------------------------
+# NAME   TYPE   DEFAULT  MEANING
+# ---------------------------------------------------------------------------------------------
+# C     	Matrix  ---     principal components
+# V      	Matrix  ---     eigenvalues / eigenvalues of principal components
+#
+
+X = read($X);
+
+fileC = $C;
+fileV = $V;
+
+k = ifdef($k, ncol(X));
+iter = ifdef($iter, 10);
+tolobj = ifdef($tolobj, 0.00001);
+tolrecerr = ifdef($tolrecerr, 0.02);
+fmt0 = ifdef($fmt, "text"); 
+
+n = nrow(X);
+m = ncol(X);
+
+#initializing principal components matrix
+C =  rand(rows=m, cols=k, pdf="normal");
+ss = rand(rows=1, cols=1, pdf="normal");
+ss = as.scalar(ss);
+ssPrev = ss;
+
+# best selected principle components - with the lowest reconstruction error 
+PC = C;
+
+# initilizing reconstruction error
+RE = tolrecerr+1;
+REBest = RE;
+
+Z = matrix(0,rows=1,cols=1);
+
+#Objective function value
+ObjRelChng = tolobj+1;
+
+# mean centered input matrix - dim -> [n,m]
+Xm = X - colMeans(X);
+
+#I -> k x k
+ITMP = matrix(1,rows=k,cols=1);
+I = diag(ITMP);
+
+i = 0;
+while (i < iter & ObjRelChng > tolobj & RE > tolrecerr){
+	#Estimation step - Covariance matrix 
+	#M -> k x k
+	M = t(C) %*% C + I*ss; 
+	
+	#Auxilary matrix with n latent variables 
+	# Z -> n x k		
+	Z = Xm %*% (C %*% inv(M)); 
+
+	#ZtZ -> k x k
+	ZtZ = t(Z) %*% Z + inv(M)*ss;
+	
+	#XtZ -> m x k
+	XtZ = t(Xm) %*% Z;
+	
+	#Maximization step
+	#C ->  m x k
+	ZtZ_sum = sum(ZtZ); #+n*inv(M)); 
+	C = XtZ/ZtZ_sum;
+
+	#ss2 -> 1 x 1
+	ss2 = trace(ZtZ * (t(C) %*% C));
+
+	#ss3 -> 1 x 1 
+	ss3 = sum((Z %*% t(C)) %*% t(Xm));
+	
+	#Frobenius norm of reconstruction error -> Euclidean norm 
+	#Fn -> 1 x 1	
+	Fn = sum(Xm*Xm);
+
+	#ss -> 1 x 1
+	ss = (Fn + ss2 - 2*ss3)/(n*m);
+
+   #calculating objective function relative change
+   ObjRelChng = abs(1 - ss/ssPrev);
+   #print("Objective Relative Change: " + ObjRelChng + ", Objective: " + ss);
+
+	#Reconstruction error
+	R = ((Z %*% t(C)) -  Xm);	
+
+	#calculate the error
+	#TODO rethink calculation of reconstruction error .... 
+	#1-Norm of reconstruction error - a big dense matrix 
+	#RE -> n x m
+	RE = abs(sum(R)/sum(Xm));	
+	if (RE < REBest){
+		PC = C;
+		REBest = RE;
+	}	
+	#print("ss: " + ss +" = Fn( "+ Fn +" ) + ss2( " + ss2  +" ) - 2*ss3( " + ss3 + " ), Reconstruction Error: " + RE);
+
+	ssPrev = ss;	
+	i = i+1;
+}
+print("Objective Relative Change: " + ObjRelChng);
+print ("Number of iterations: " + i + ", Reconstruction Err: " + REBest);
+
+# reconstructs data
+# RD -> n x k
+RD = X %*% PC;
+
+# calculate eigenvalues - principle component variance
+RDMean = colMeans(RD);
+V = t(colMeans(RD*RD) - (RDMean*RDMean));
+
+# sorting eigenvalues and eigenvectors in decreasing order
+V_decr_idx = order(target=V,by=1,decreasing=TRUE,index.return=TRUE);
+VF_decr = table(seq(1,nrow(V)),V_decr_idx);
+V = VF_decr %*% V;
+PC = PC %*% VF_decr;
+
+# writing principal components 
+write(PC, fileC, format=fmt0);
+# writing eigen values/pc variance
+write(V, fileV, format=fmt0);