You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@systemml.apache.org by du...@apache.org on 2016/01/22 17:34:21 UTC

[45/51] [partial] incubator-systemml git commit: [SYSTEMML-482] [SYSTEMML-480] Adding a Git attributes file to enforce Unix-styled line endings, and normalizing all of the line endings.

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/05d2c0a8/scripts/algorithms/MultiLogReg.dml
----------------------------------------------------------------------
diff --git a/scripts/algorithms/MultiLogReg.dml b/scripts/algorithms/MultiLogReg.dml
index 0b18b9d..716678d 100644
--- a/scripts/algorithms/MultiLogReg.dml
+++ b/scripts/algorithms/MultiLogReg.dml
@@ -1,365 +1,365 @@
-#-------------------------------------------------------------
-#
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-# 
-#   http://www.apache.org/licenses/LICENSE-2.0
-# 
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-#
-#-------------------------------------------------------------
-
-# Solves Multinomial Logistic Regression using Trust Region methods.
-# (See: Trust Region Newton Method for Logistic Regression, Lin, Weng and Keerthi, JMLR 9 (2008) 627-650)
-
-# INPUT PARAMETERS:
-# --------------------------------------------------------------------------------------------
-# NAME  TYPE   DEFAULT  MEANING
-# --------------------------------------------------------------------------------------------
-# X     String  ---     Location to read the matrix of feature vectors
-# Y     String  ---     Location to read the matrix with category labels
-# B     String  ---     Location to store estimated regression parameters (the betas)
-# Log   String  " "     Location to write per-iteration variables for log/debugging purposes
-# icpt  Int      0      Intercept presence, shifting and rescaling X columns:
-#                       0 = no intercept, no shifting, no rescaling;
-#                       1 = add intercept, but neither shift nor rescale X;
-#                       2 = add intercept, shift & rescale X columns to mean = 0, variance = 1
-# reg   Double  0.0     regularization parameter (lambda = 1/C); intercept is not regularized
-# tol   Double 0.000001 tolerance ("epsilon")
-# moi   Int     100     max. number of outer (Newton) iterations
-# mii   Int      0      max. number of inner (conjugate gradient) iterations, 0 = no max
-# fmt   String "text"   Matrix output format, usually "text" or "csv" (for matrices only)
-# --------------------------------------------------------------------------------------------
-# The largest label represents the baseline category; if label -1 or 0 is present, then it is
-# the baseline label (and it is converted to the largest label).
-#
-# The Log file, when requested, contains the following per-iteration variables in CSV format,
-# each line containing triple (NAME, ITERATION, VALUE) with ITERATION = 0 for initial values:
-#
-# NAME                  MEANING
-# -------------------------------------------------------------------------------------------
-# LINEAR_TERM_MIN       The minimum value of X %*% B, used to check for overflows
-# LINEAR_TERM_MAX       The maximum value of X %*% B, used to check for overflows
-# NUM_CG_ITERS          Number of inner (Conj.Gradient) iterations in this outer iteration
-# IS_TRUST_REACHED      1 = trust region boundary was reached, 0 = otherwise
-# POINT_STEP_NORM       L2-norm of iteration step from old point (i.e. matrix B) to new point
-# OBJECTIVE             The loss function we minimize (negative regularized log-likelihood)
-# OBJ_DROP_REAL         Reduction in the objective during this iteration, actual value
-# OBJ_DROP_PRED         Reduction in the objective predicted by a quadratic approximation
-# OBJ_DROP_RATIO        Actual-to-predicted reduction ratio, used to update the trust region
-# IS_POINT_UPDATED      1 = new point accepted; 0 = new point rejected, old point restored
-# GRADIENT_NORM         L2-norm of the loss function gradient (omitted if point is rejected)
-# TRUST_DELTA           Updated trust region size, the "delta"
-# -------------------------------------------------------------------------------------------
-#
-# Script invocation example:
-# hadoop jar SystemML.jar -f MultiLogReg.dml -nvargs icpt=2 reg=1.0 tol=0.000001 moi=100 mii=20
-#     X=INPUT_DIR/X123 Y=INPUT_DIR/Y123 B=OUTPUT_DIR/B123 fmt=csv Log=OUTPUT_DIR/log
-
-fileX = $X;
-fileY = $Y;
-fileB = $B;
-fileLog = ifdef ($Log, " ");
-fmtB = ifdef ($fmt, "text");
-
-intercept_status = ifdef ($icpt, 0); # $icpt = 0;
-regularization = ifdef ($reg, 0.0);  # $reg  = 0.0;
-tol = ifdef ($tol, 0.000001);        # $tol  = 0.000001;
-maxiter = ifdef ($moi, 100);         # $moi  = 100;
-maxinneriter = ifdef ($mii, 0);      # $mii  = 0;
-
-tol = as.double (tol); 
-
-print ("BEGIN MULTINOMIAL LOGISTIC REGRESSION SCRIPT");
-print ("Reading X...");
-X = read (fileX);
-print ("Reading Y...");
-Y_vec = read (fileY);
-
-eta0 = 0.0001;
-eta1 = 0.25;
-eta2 = 0.75;
-sigma1 = 0.25;
-sigma2 = 0.5;
-sigma3 = 4.0;
-psi = 0.1;
-
-N = nrow (X);
-D = ncol (X);
-
-# Introduce the intercept, shift and rescale the columns of X if needed
-if (intercept_status == 1 | intercept_status == 2)  # add the intercept column
-{
-    X = append (X, matrix (1, rows = N, cols = 1));
-    D = ncol (X);
-}
-
-scale_lambda = matrix (1, rows = D, cols = 1);
-if (intercept_status == 1 | intercept_status == 2)
-{
-    scale_lambda [D, 1] = 0;
-}
-
-if (intercept_status == 2)  # scale-&-shift X columns to mean 0, variance 1
-{                           # Important assumption: X [, D] = matrix (1, rows = N, cols = 1)
-    avg_X_cols = t(colSums(X)) / N;
-    var_X_cols = (t(colSums (X ^ 2)) - N * (avg_X_cols ^ 2)) / (N - 1);
-    is_unsafe = ppred (var_X_cols, 0.0, "<=");
-    scale_X = 1.0 / sqrt (var_X_cols * (1 - is_unsafe) + is_unsafe);
-    scale_X [D, 1] = 1;
-    shift_X = - avg_X_cols * scale_X;
-    shift_X [D, 1] = 0;
-    rowSums_X_sq = (X ^ 2) %*% (scale_X ^ 2) + X %*% (2 * scale_X * shift_X) + sum (shift_X ^ 2);
-} else {
-    scale_X = matrix (1, rows = D, cols = 1);
-    shift_X = matrix (0, rows = D, cols = 1);
-    rowSums_X_sq = rowSums (X ^ 2);
-}
-
-# Henceforth we replace "X" with "X %*% (SHIFT/SCALE TRANSFORM)" and rowSums(X ^ 2)
-# with "rowSums_X_sq" in order to preserve the sparsity of X under shift and scale.
-# The transform is then associatively applied to the other side of the expression,
-# and is rewritten via "scale_X" and "shift_X" as follows:
-#
-# ssX_A  = (SHIFT/SCALE TRANSFORM) %*% A    --- is rewritten as:
-# ssX_A  = diag (scale_X) %*% A;
-# ssX_A [D, ] = ssX_A [D, ] + t(shift_X) %*% A;
-#
-# tssX_A = t(SHIFT/SCALE TRANSFORM) %*% A   --- is rewritten as:
-# tssX_A = diag (scale_X) %*% A + shift_X %*% A [D, ];
-
-# Convert "Y_vec" into indicator matrice:
-if (min (Y_vec) <= 0) { 
-    # Category labels "0", "-1" etc. are converted into the largest label
-    max_y = max (Y_vec);
-    Y_vec  = Y_vec  + (- Y_vec  + max_y + 1) * ppred (Y_vec , 0.0, "<=");
-}
-Y = table (seq (1, N, 1), Y_vec);
-K = ncol (Y) - 1;   # The number of  non-baseline categories
-
-lambda = (scale_lambda %*% matrix (1, rows = 1, cols = K)) * regularization;
-delta = 0.5 * sqrt (D) / max (sqrt (rowSums_X_sq));
-
-B = matrix (0, rows = D, cols = K);     ### LT = X %*% (SHIFT/SCALE TRANSFORM) %*% B;
-                                        ### LT = append (LT, matrix (0, rows = N, cols = 1));
-                                        ### LT = LT - rowMaxs (LT) %*% matrix (1, rows = 1, cols = K+1);
-P = matrix (1, rows = N, cols = K+1);   ### exp_LT = exp (LT);
-P = P / (K + 1);                        ### P =  exp_LT / (rowSums (exp_LT) %*% matrix (1, rows = 1, cols = K+1));
-obj = N * log (K + 1);                  ### obj = - sum (Y * LT) + sum (log (rowSums (exp_LT))) + 0.5 * sum (lambda * (B_new ^ 2));
-
-Grad = t(X) %*% (P [, 1:K] - Y [, 1:K]);
-if (intercept_status == 2) {
-    Grad = diag (scale_X) %*% Grad + shift_X %*% Grad [D, ];
-}
-Grad = Grad + lambda * B;
-norm_Grad = sqrt (sum (Grad ^ 2));
-norm_Grad_initial = norm_Grad;
-
-if (maxinneriter == 0) {
-    maxinneriter = D * K;
-}
-iter = 1;
-
-# boolean for convergence check
-converge = (norm_Grad < tol) | (iter > maxiter);
-
-print ("-- Initially:  Objective = " + obj + ",  Gradient Norm = " + norm_Grad + ",  Trust Delta = " + delta);
-
-if (fileLog != " ") {
-    log_str = "OBJECTIVE,0," + obj;
-    log_str = append (log_str, "GRADIENT_NORM,0," + norm_Grad);
-    log_str = append (log_str, "TRUST_DELTA,0," + delta);
-} else {
-    log_str = " ";
-}
-
-while (! converge)
-{
-	# SOLVE TRUST REGION SUB-PROBLEM
-	S = matrix (0, rows = D, cols = K);
-	R = - Grad;
-	V = R;
-	delta2 = delta ^ 2;
-	inneriter = 1;
-	norm_R2 = sum (R ^ 2);
-	innerconverge = (sqrt (norm_R2) <= psi * norm_Grad);
-	is_trust_boundary_reached = 0;
-
-	while (! innerconverge)
-	{
-	    if (intercept_status == 2) {
-	        ssX_V = diag (scale_X) %*% V;
-	        ssX_V [D, ] = ssX_V [D, ] + t(shift_X) %*% V;
-	    } else {
-	        ssX_V = V;
-	    }
-        Q = P [, 1:K] * (X %*% ssX_V);
-        HV = t(X) %*% (Q - P [, 1:K] * (rowSums (Q) %*% matrix (1, rows = 1, cols = K)));
-        if (intercept_status == 2) {
-            HV = diag (scale_X) %*% HV + shift_X %*% HV [D, ];
-        }
-        HV = HV + lambda * V;
-		alpha = norm_R2 / sum (V * HV);
-		Snew = S + alpha * V;
-		norm_Snew2 = sum (Snew ^ 2);
-		if (norm_Snew2 <= delta2)
-		{
-			S = Snew;
-			R = R - alpha * HV;
-			old_norm_R2 = norm_R2 
-			norm_R2 = sum (R ^ 2);
-			V = R + (norm_R2 / old_norm_R2) * V;
-			innerconverge = (sqrt (norm_R2) <= psi * norm_Grad);
-		} else {
-	        is_trust_boundary_reached = 1;
-			sv = sum (S * V);
-			v2 = sum (V ^ 2);
-			s2 = sum (S ^ 2);
-			rad = sqrt (sv ^ 2 + v2 * (delta2 - s2));
-			if (sv >= 0) {
-				alpha = (delta2 - s2) / (sv + rad);
-			} else {
-				alpha = (rad - sv) / v2;
-			}
-			S = S + alpha * V;
-			R = R - alpha * HV;
-			innerconverge = TRUE;
-		}
-	    inneriter = inneriter + 1;
-	    innerconverge = innerconverge | (inneriter > maxinneriter);
-	}  
-	
-	# END TRUST REGION SUB-PROBLEM
-	
-	# compute rho, update B, obtain delta
-	gs = sum (S * Grad);
-	qk = - 0.5 * (gs - sum (S * R));
-	B_new = B + S;
-	if (intercept_status == 2) {
-	    ssX_B_new = diag (scale_X) %*% B_new;
-	    ssX_B_new [D, ] = ssX_B_new [D, ] + t(shift_X) %*% B_new;
-    } else {
-        ssX_B_new = B_new;
-    }
-    
-    LT = append ((X %*% ssX_B_new), matrix (0, rows = N, cols = 1));
-    if (fileLog != " ") {
-        log_str = append (log_str, "LINEAR_TERM_MIN,"  + iter + "," + min (LT));
-        log_str = append (log_str, "LINEAR_TERM_MAX,"  + iter + "," + max (LT));
-    }
-    LT = LT - rowMaxs (LT) %*% matrix (1, rows = 1, cols = K+1);
-    exp_LT = exp (LT);
-    P_new  = exp_LT / (rowSums (exp_LT) %*% matrix (1, rows = 1, cols = K+1));
-    obj_new = - sum (Y * LT) + sum (log (rowSums (exp_LT))) + 0.5 * sum (lambda * (B_new ^ 2));
-    	
-	# Consider updating LT in the inner loop
-	# Consider the big "obj" and "obj_new" rounding-off their small difference below:
-
-	actred = (obj - obj_new);
-	
-	rho = actred / qk;
-	is_rho_accepted = (rho > eta0);
-	snorm = sqrt (sum (S ^ 2));
-
-    if (fileLog != " ") {
-        log_str = append (log_str, "NUM_CG_ITERS,"     + iter + "," + (inneriter - 1));
-        log_str = append (log_str, "IS_TRUST_REACHED," + iter + "," + is_trust_boundary_reached);
-        log_str = append (log_str, "POINT_STEP_NORM,"  + iter + "," + snorm);
-        log_str = append (log_str, "OBJECTIVE,"        + iter + "," + obj_new);
-        log_str = append (log_str, "OBJ_DROP_REAL,"    + iter + "," + actred);
-        log_str = append (log_str, "OBJ_DROP_PRED,"    + iter + "," + qk);
-        log_str = append (log_str, "OBJ_DROP_RATIO,"   + iter + "," + rho);
-    }
-
-	if (iter == 1) {
-	   delta = min (delta, snorm);
-	}
-
-	alpha2 = obj_new - obj - gs;
-	if (alpha2 <= 0) {
-	   alpha = sigma3;
-	} 
-	else {
-	   alpha = max (sigma1, -0.5 * gs / alpha2);
-	}
-	
-	if (rho < eta0) {
-		delta = min (max (alpha, sigma1) * snorm, sigma2 * delta);
-	}
-	else {
-		if (rho < eta1) {
-			delta = max (sigma1 * delta, min (alpha * snorm, sigma2 * delta));
-		}
-		else { 
-			if (rho < eta2) {
-				delta = max (sigma1 * delta, min (alpha * snorm, sigma3 * delta));
-			}
-			else {
-				delta = max (delta, min (alpha * snorm, sigma3 * delta));
-			}
-		}
-	} 
-	
-	if (is_trust_boundary_reached == 1)
-	{
-	    print ("-- Outer Iteration " + iter + ": Had " + (inneriter - 1) + " CG iterations, trust bound REACHED");
-	} else {
-	    print ("-- Outer Iteration " + iter + ": Had " + (inneriter - 1) + " CG iterations");
-	}
-	print ("   -- Obj.Reduction:  Actual = " + actred + ",  Predicted = " + qk + 
-	       "  (A/P: " + (round (10000.0 * rho) / 10000.0) + "),  Trust Delta = " + delta);
-	       
-	if (is_rho_accepted)
-	{
-		B = B_new;
-		P = P_new;
-		Grad = t(X) %*% (P [, 1:K] - Y [, 1:K]);
-		if (intercept_status == 2) {
-		    Grad = diag (scale_X) %*% Grad + shift_X %*% Grad [D, ];
-		}
-		Grad = Grad + lambda * B;
-		norm_Grad = sqrt (sum (Grad ^ 2));
-		obj = obj_new;
-	    print ("   -- New Objective = " + obj + ",  Beta Change Norm = " + snorm + ",  Gradient Norm = " + norm_Grad);
-        if (fileLog != " ") {
-            log_str = append (log_str, "IS_POINT_UPDATED," + iter + ",1");
-            log_str = append (log_str, "GRADIENT_NORM,"    + iter + "," + norm_Grad);
-        }
-	} else {
-        if (fileLog != " ") {
-            log_str = append (log_str, "IS_POINT_UPDATED," + iter + ",0");
-        }
-    }	
-	
-    if (fileLog != " ") {
-        log_str = append (log_str, "TRUST_DELTA," + iter + "," + delta);
-    }	
-	
-	iter = iter + 1;
-	converge = ((norm_Grad < (tol * norm_Grad_initial)) | (iter > maxiter) |
-	    ((is_trust_boundary_reached == 0) & (abs (actred) < (abs (obj) + abs (obj_new)) * 0.00000000000001)));
-    if (converge) { print ("Termination / Convergence condition satisfied."); } else { print (" "); }
-} 
-
-if (intercept_status == 2) {
-    B_out = diag (scale_X) %*% B;
-    B_out [D, ] = B_out [D, ] + t(shift_X) %*% B;
-} else {
-    B_out = B;
-}
-write (B_out, fileB, format=fmtB);
-
-if (fileLog != " ") {
-    write (log_str, fileLog);
-}
-
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+# 
+#   http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+# Solves Multinomial Logistic Regression using Trust Region methods.
+# (See: Trust Region Newton Method for Logistic Regression, Lin, Weng and Keerthi, JMLR 9 (2008) 627-650)
+
+# INPUT PARAMETERS:
+# --------------------------------------------------------------------------------------------
+# NAME  TYPE   DEFAULT  MEANING
+# --------------------------------------------------------------------------------------------
+# X     String  ---     Location to read the matrix of feature vectors
+# Y     String  ---     Location to read the matrix with category labels
+# B     String  ---     Location to store estimated regression parameters (the betas)
+# Log   String  " "     Location to write per-iteration variables for log/debugging purposes
+# icpt  Int      0      Intercept presence, shifting and rescaling X columns:
+#                       0 = no intercept, no shifting, no rescaling;
+#                       1 = add intercept, but neither shift nor rescale X;
+#                       2 = add intercept, shift & rescale X columns to mean = 0, variance = 1
+# reg   Double  0.0     regularization parameter (lambda = 1/C); intercept is not regularized
+# tol   Double 0.000001 tolerance ("epsilon")
+# moi   Int     100     max. number of outer (Newton) iterations
+# mii   Int      0      max. number of inner (conjugate gradient) iterations, 0 = no max
+# fmt   String "text"   Matrix output format, usually "text" or "csv" (for matrices only)
+# --------------------------------------------------------------------------------------------
+# The largest label represents the baseline category; if label -1 or 0 is present, then it is
+# the baseline label (and it is converted to the largest label).
+#
+# The Log file, when requested, contains the following per-iteration variables in CSV format,
+# each line containing triple (NAME, ITERATION, VALUE) with ITERATION = 0 for initial values:
+#
+# NAME                  MEANING
+# -------------------------------------------------------------------------------------------
+# LINEAR_TERM_MIN       The minimum value of X %*% B, used to check for overflows
+# LINEAR_TERM_MAX       The maximum value of X %*% B, used to check for overflows
+# NUM_CG_ITERS          Number of inner (Conj.Gradient) iterations in this outer iteration
+# IS_TRUST_REACHED      1 = trust region boundary was reached, 0 = otherwise
+# POINT_STEP_NORM       L2-norm of iteration step from old point (i.e. matrix B) to new point
+# OBJECTIVE             The loss function we minimize (negative regularized log-likelihood)
+# OBJ_DROP_REAL         Reduction in the objective during this iteration, actual value
+# OBJ_DROP_PRED         Reduction in the objective predicted by a quadratic approximation
+# OBJ_DROP_RATIO        Actual-to-predicted reduction ratio, used to update the trust region
+# IS_POINT_UPDATED      1 = new point accepted; 0 = new point rejected, old point restored
+# GRADIENT_NORM         L2-norm of the loss function gradient (omitted if point is rejected)
+# TRUST_DELTA           Updated trust region size, the "delta"
+# -------------------------------------------------------------------------------------------
+#
+# Script invocation example:
+# hadoop jar SystemML.jar -f MultiLogReg.dml -nvargs icpt=2 reg=1.0 tol=0.000001 moi=100 mii=20
+#     X=INPUT_DIR/X123 Y=INPUT_DIR/Y123 B=OUTPUT_DIR/B123 fmt=csv Log=OUTPUT_DIR/log
+
+fileX = $X;
+fileY = $Y;
+fileB = $B;
+fileLog = ifdef ($Log, " ");
+fmtB = ifdef ($fmt, "text");
+
+intercept_status = ifdef ($icpt, 0); # $icpt = 0;
+regularization = ifdef ($reg, 0.0);  # $reg  = 0.0;
+tol = ifdef ($tol, 0.000001);        # $tol  = 0.000001;
+maxiter = ifdef ($moi, 100);         # $moi  = 100;
+maxinneriter = ifdef ($mii, 0);      # $mii  = 0;
+
+tol = as.double (tol); 
+
+print ("BEGIN MULTINOMIAL LOGISTIC REGRESSION SCRIPT");
+print ("Reading X...");
+X = read (fileX);
+print ("Reading Y...");
+Y_vec = read (fileY);
+
+eta0 = 0.0001;
+eta1 = 0.25;
+eta2 = 0.75;
+sigma1 = 0.25;
+sigma2 = 0.5;
+sigma3 = 4.0;
+psi = 0.1;
+
+N = nrow (X);
+D = ncol (X);
+
+# Introduce the intercept, shift and rescale the columns of X if needed
+if (intercept_status == 1 | intercept_status == 2)  # add the intercept column
+{
+    X = append (X, matrix (1, rows = N, cols = 1));
+    D = ncol (X);
+}
+
+scale_lambda = matrix (1, rows = D, cols = 1);
+if (intercept_status == 1 | intercept_status == 2)
+{
+    scale_lambda [D, 1] = 0;
+}
+
+if (intercept_status == 2)  # scale-&-shift X columns to mean 0, variance 1
+{                           # Important assumption: X [, D] = matrix (1, rows = N, cols = 1)
+    avg_X_cols = t(colSums(X)) / N;
+    var_X_cols = (t(colSums (X ^ 2)) - N * (avg_X_cols ^ 2)) / (N - 1);
+    is_unsafe = ppred (var_X_cols, 0.0, "<=");
+    scale_X = 1.0 / sqrt (var_X_cols * (1 - is_unsafe) + is_unsafe);
+    scale_X [D, 1] = 1;
+    shift_X = - avg_X_cols * scale_X;
+    shift_X [D, 1] = 0;
+    rowSums_X_sq = (X ^ 2) %*% (scale_X ^ 2) + X %*% (2 * scale_X * shift_X) + sum (shift_X ^ 2);
+} else {
+    scale_X = matrix (1, rows = D, cols = 1);
+    shift_X = matrix (0, rows = D, cols = 1);
+    rowSums_X_sq = rowSums (X ^ 2);
+}
+
+# Henceforth we replace "X" with "X %*% (SHIFT/SCALE TRANSFORM)" and rowSums(X ^ 2)
+# with "rowSums_X_sq" in order to preserve the sparsity of X under shift and scale.
+# The transform is then associatively applied to the other side of the expression,
+# and is rewritten via "scale_X" and "shift_X" as follows:
+#
+# ssX_A  = (SHIFT/SCALE TRANSFORM) %*% A    --- is rewritten as:
+# ssX_A  = diag (scale_X) %*% A;
+# ssX_A [D, ] = ssX_A [D, ] + t(shift_X) %*% A;
+#
+# tssX_A = t(SHIFT/SCALE TRANSFORM) %*% A   --- is rewritten as:
+# tssX_A = diag (scale_X) %*% A + shift_X %*% A [D, ];
+
+# Convert "Y_vec" into indicator matrice:
+if (min (Y_vec) <= 0) { 
+    # Category labels "0", "-1" etc. are converted into the largest label
+    max_y = max (Y_vec);
+    Y_vec  = Y_vec  + (- Y_vec  + max_y + 1) * ppred (Y_vec , 0.0, "<=");
+}
+Y = table (seq (1, N, 1), Y_vec);
+K = ncol (Y) - 1;   # The number of  non-baseline categories
+
+lambda = (scale_lambda %*% matrix (1, rows = 1, cols = K)) * regularization;
+delta = 0.5 * sqrt (D) / max (sqrt (rowSums_X_sq));
+
+B = matrix (0, rows = D, cols = K);     ### LT = X %*% (SHIFT/SCALE TRANSFORM) %*% B;
+                                        ### LT = append (LT, matrix (0, rows = N, cols = 1));
+                                        ### LT = LT - rowMaxs (LT) %*% matrix (1, rows = 1, cols = K+1);
+P = matrix (1, rows = N, cols = K+1);   ### exp_LT = exp (LT);
+P = P / (K + 1);                        ### P =  exp_LT / (rowSums (exp_LT) %*% matrix (1, rows = 1, cols = K+1));
+obj = N * log (K + 1);                  ### obj = - sum (Y * LT) + sum (log (rowSums (exp_LT))) + 0.5 * sum (lambda * (B_new ^ 2));
+
+Grad = t(X) %*% (P [, 1:K] - Y [, 1:K]);
+if (intercept_status == 2) {
+    Grad = diag (scale_X) %*% Grad + shift_X %*% Grad [D, ];
+}
+Grad = Grad + lambda * B;
+norm_Grad = sqrt (sum (Grad ^ 2));
+norm_Grad_initial = norm_Grad;
+
+if (maxinneriter == 0) {
+    maxinneriter = D * K;
+}
+iter = 1;
+
+# boolean for convergence check
+converge = (norm_Grad < tol) | (iter > maxiter);
+
+print ("-- Initially:  Objective = " + obj + ",  Gradient Norm = " + norm_Grad + ",  Trust Delta = " + delta);
+
+if (fileLog != " ") {
+    log_str = "OBJECTIVE,0," + obj;
+    log_str = append (log_str, "GRADIENT_NORM,0," + norm_Grad);
+    log_str = append (log_str, "TRUST_DELTA,0," + delta);
+} else {
+    log_str = " ";
+}
+
+while (! converge)
+{
+	# SOLVE TRUST REGION SUB-PROBLEM
+	S = matrix (0, rows = D, cols = K);
+	R = - Grad;
+	V = R;
+	delta2 = delta ^ 2;
+	inneriter = 1;
+	norm_R2 = sum (R ^ 2);
+	innerconverge = (sqrt (norm_R2) <= psi * norm_Grad);
+	is_trust_boundary_reached = 0;
+
+	while (! innerconverge)
+	{
+	    if (intercept_status == 2) {
+	        ssX_V = diag (scale_X) %*% V;
+	        ssX_V [D, ] = ssX_V [D, ] + t(shift_X) %*% V;
+	    } else {
+	        ssX_V = V;
+	    }
+        Q = P [, 1:K] * (X %*% ssX_V);
+        HV = t(X) %*% (Q - P [, 1:K] * (rowSums (Q) %*% matrix (1, rows = 1, cols = K)));
+        if (intercept_status == 2) {
+            HV = diag (scale_X) %*% HV + shift_X %*% HV [D, ];
+        }
+        HV = HV + lambda * V;
+		alpha = norm_R2 / sum (V * HV);
+		Snew = S + alpha * V;
+		norm_Snew2 = sum (Snew ^ 2);
+		if (norm_Snew2 <= delta2)
+		{
+			S = Snew;
+			R = R - alpha * HV;
+			old_norm_R2 = norm_R2 
+			norm_R2 = sum (R ^ 2);
+			V = R + (norm_R2 / old_norm_R2) * V;
+			innerconverge = (sqrt (norm_R2) <= psi * norm_Grad);
+		} else {
+	        is_trust_boundary_reached = 1;
+			sv = sum (S * V);
+			v2 = sum (V ^ 2);
+			s2 = sum (S ^ 2);
+			rad = sqrt (sv ^ 2 + v2 * (delta2 - s2));
+			if (sv >= 0) {
+				alpha = (delta2 - s2) / (sv + rad);
+			} else {
+				alpha = (rad - sv) / v2;
+			}
+			S = S + alpha * V;
+			R = R - alpha * HV;
+			innerconverge = TRUE;
+		}
+	    inneriter = inneriter + 1;
+	    innerconverge = innerconverge | (inneriter > maxinneriter);
+	}  
+	
+	# END TRUST REGION SUB-PROBLEM
+	
+	# compute rho, update B, obtain delta
+	gs = sum (S * Grad);
+	qk = - 0.5 * (gs - sum (S * R));
+	B_new = B + S;
+	if (intercept_status == 2) {
+	    ssX_B_new = diag (scale_X) %*% B_new;
+	    ssX_B_new [D, ] = ssX_B_new [D, ] + t(shift_X) %*% B_new;
+    } else {
+        ssX_B_new = B_new;
+    }
+    
+    LT = append ((X %*% ssX_B_new), matrix (0, rows = N, cols = 1));
+    if (fileLog != " ") {
+        log_str = append (log_str, "LINEAR_TERM_MIN,"  + iter + "," + min (LT));
+        log_str = append (log_str, "LINEAR_TERM_MAX,"  + iter + "," + max (LT));
+    }
+    LT = LT - rowMaxs (LT) %*% matrix (1, rows = 1, cols = K+1);
+    exp_LT = exp (LT);
+    P_new  = exp_LT / (rowSums (exp_LT) %*% matrix (1, rows = 1, cols = K+1));
+    obj_new = - sum (Y * LT) + sum (log (rowSums (exp_LT))) + 0.5 * sum (lambda * (B_new ^ 2));
+    	
+	# Consider updating LT in the inner loop
+	# Consider the big "obj" and "obj_new" rounding-off their small difference below:
+
+	actred = (obj - obj_new);
+	
+	rho = actred / qk;
+	is_rho_accepted = (rho > eta0);
+	snorm = sqrt (sum (S ^ 2));
+
+    if (fileLog != " ") {
+        log_str = append (log_str, "NUM_CG_ITERS,"     + iter + "," + (inneriter - 1));
+        log_str = append (log_str, "IS_TRUST_REACHED," + iter + "," + is_trust_boundary_reached);
+        log_str = append (log_str, "POINT_STEP_NORM,"  + iter + "," + snorm);
+        log_str = append (log_str, "OBJECTIVE,"        + iter + "," + obj_new);
+        log_str = append (log_str, "OBJ_DROP_REAL,"    + iter + "," + actred);
+        log_str = append (log_str, "OBJ_DROP_PRED,"    + iter + "," + qk);
+        log_str = append (log_str, "OBJ_DROP_RATIO,"   + iter + "," + rho);
+    }
+
+	if (iter == 1) {
+	   delta = min (delta, snorm);
+	}
+
+	alpha2 = obj_new - obj - gs;
+	if (alpha2 <= 0) {
+	   alpha = sigma3;
+	} 
+	else {
+	   alpha = max (sigma1, -0.5 * gs / alpha2);
+	}
+	
+	if (rho < eta0) {
+		delta = min (max (alpha, sigma1) * snorm, sigma2 * delta);
+	}
+	else {
+		if (rho < eta1) {
+			delta = max (sigma1 * delta, min (alpha * snorm, sigma2 * delta));
+		}
+		else { 
+			if (rho < eta2) {
+				delta = max (sigma1 * delta, min (alpha * snorm, sigma3 * delta));
+			}
+			else {
+				delta = max (delta, min (alpha * snorm, sigma3 * delta));
+			}
+		}
+	} 
+	
+	if (is_trust_boundary_reached == 1)
+	{
+	    print ("-- Outer Iteration " + iter + ": Had " + (inneriter - 1) + " CG iterations, trust bound REACHED");
+	} else {
+	    print ("-- Outer Iteration " + iter + ": Had " + (inneriter - 1) + " CG iterations");
+	}
+	print ("   -- Obj.Reduction:  Actual = " + actred + ",  Predicted = " + qk + 
+	       "  (A/P: " + (round (10000.0 * rho) / 10000.0) + "),  Trust Delta = " + delta);
+	       
+	if (is_rho_accepted)
+	{
+		B = B_new;
+		P = P_new;
+		Grad = t(X) %*% (P [, 1:K] - Y [, 1:K]);
+		if (intercept_status == 2) {
+		    Grad = diag (scale_X) %*% Grad + shift_X %*% Grad [D, ];
+		}
+		Grad = Grad + lambda * B;
+		norm_Grad = sqrt (sum (Grad ^ 2));
+		obj = obj_new;
+	    print ("   -- New Objective = " + obj + ",  Beta Change Norm = " + snorm + ",  Gradient Norm = " + norm_Grad);
+        if (fileLog != " ") {
+            log_str = append (log_str, "IS_POINT_UPDATED," + iter + ",1");
+            log_str = append (log_str, "GRADIENT_NORM,"    + iter + "," + norm_Grad);
+        }
+	} else {
+        if (fileLog != " ") {
+            log_str = append (log_str, "IS_POINT_UPDATED," + iter + ",0");
+        }
+    }	
+	
+    if (fileLog != " ") {
+        log_str = append (log_str, "TRUST_DELTA," + iter + "," + delta);
+    }	
+	
+	iter = iter + 1;
+	converge = ((norm_Grad < (tol * norm_Grad_initial)) | (iter > maxiter) |
+	    ((is_trust_boundary_reached == 0) & (abs (actred) < (abs (obj) + abs (obj_new)) * 0.00000000000001)));
+    if (converge) { print ("Termination / Convergence condition satisfied."); } else { print (" "); }
+} 
+
+if (intercept_status == 2) {
+    B_out = diag (scale_X) %*% B;
+    B_out [D, ] = B_out [D, ] + t(shift_X) %*% B;
+} else {
+    B_out = B;
+}
+write (B_out, fileB, format=fmtB);
+
+if (fileLog != " ") {
+    write (log_str, fileLog);
+}
+

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/05d2c0a8/scripts/algorithms/PCA.dml
----------------------------------------------------------------------
diff --git a/scripts/algorithms/PCA.dml b/scripts/algorithms/PCA.dml
index e8b4aec..65800b7 100644
--- a/scripts/algorithms/PCA.dml
+++ b/scripts/algorithms/PCA.dml
@@ -1,112 +1,112 @@
-#-------------------------------------------------------------
-#
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-# 
-#   http://www.apache.org/licenses/LICENSE-2.0
-# 
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-#
-#-------------------------------------------------------------
-
-# 
-# This script performs Principal Component Analysis (PCA) on the given input data.
-#
-# INPUT PARAMETERS:
-# ---------------------------------------------------------------------------------------------
-# NAME   TYPE   DEFAULT  MEANING
-# ---------------------------------------------------------------------------------------------
-# INPUT  String ---      Location to read the matrix A of feature vectors
-# K      Int    ---      Indicates dimension of the new vector space constructed from eigen vectors
-# CENTER Int    0        Indicates whether or not to center data 
-# SCALE  Int    0        Indicates whether or not to scale data 
-# OFMT   String ---      Output data format
-# PROJDATA Int  0	     This argument indicates if the data should be projected or not
-# MODEL  String ---      Location to already existing model: eigenvectors and eigenvalues 
-# OUTPUT String /        Location to write output matrices (covariance matrix, new basis vectors, 
-#                           and data projected onto new basis vectors)
-# hadoop jar SystemML.jar -f PCA.dml -nvargs INPUT=INPUT_DIR/pca-1000x1000 
-# OUTPUT=OUTPUT_DIR/pca-1000x1000-model PROJDATA=1 CENTER=1 SCALE=1
-# ---------------------------------------------------------------------------------------------
-
-A = read($INPUT);
-K = ifdef($K, ncol(A));
-ofmt = ifdef($OFMT, "CSV");
-projectData = ifdef($PROJDATA,0);
-model = ifdef($MODEL,"");
-center = ifdef($CENTER,0);
-scale = ifdef($SCALE,0);
-output = ifdef($OUTPUT,"/");
-
-evec_dominant = matrix(0,cols=1,rows=1);
-
-if (model != "") {
-	# reuse existing model to project data
-    evec_dominant = read(model+"/dominant.eigen.vectors");
-}else{
-	if (model == "" ){
-		model = output;	
-	}	
-
-	N = nrow(A);
-	D = ncol(A);
-
-	# perform z-scoring (centering and scaling)
-	if (center == 1) {
-	    cm = colMeans(A);
-	    A = A - cm;
-	}
-	if (scale == 1) {
-	    cvars = (colSums (A^2));	
-	    if (center == 1){
-		cm = colMeans(A);
-	    	cvars = (cvars - N*(cm^2))/(N-1);		    
- 	    }
-	    Azscored = (A)/sqrt(cvars);
-            A = Azscored;
-	}	
-
-	# co-variance matrix 
-	mu = colSums(A)/N;
-	C = (t(A) %*% A)/(N-1) - (N/(N-1))*t(mu) %*% mu;
-
-
-	# compute eigen vectors and values
-	[evalues, evectors] = eigen(C);
-
-	decreasing_Idx = order(target=evalues,by=1,decreasing=TRUE,index.return=TRUE);
-	diagmat = table(seq(1,D),decreasing_Idx);
-	# sorts eigenvalues by decreasing order
-	evalues = diagmat %*% evalues;
-	# sorts eigenvectors column-wise in the order of decreasing eigenvalues
-	evectors = evectors %*% diagmat;
-
-
-	# select K dominant eigen vectors 
-	nvec = ncol(evectors);
-
-	eval_dominant = evalues[1:K, 1];
-	evec_dominant = evectors[,1:K];
-	
-	# the square root of eigenvalues
-	eval_stdev_dominant = sqrt(eval_dominant);
-	
-	write(eval_stdev_dominant, model+"/dominant.eigen.standard.deviations", format=ofmt);
-	write(eval_dominant, model+"/dominant.eigen.values", format=ofmt);
-	write(evec_dominant, model+"/dominant.eigen.vectors", format=ofmt);
-}
-if (projectData == 1 | model != ""){
-	# Construct new data set by treating computed dominant eigenvectors as the basis vectors
-	newA = A %*% evec_dominant;
-	write(newA, output+"/projected.data", format=ofmt);
-}
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+# 
+#   http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+# 
+# This script performs Principal Component Analysis (PCA) on the given input data.
+#
+# INPUT PARAMETERS:
+# ---------------------------------------------------------------------------------------------
+# NAME   TYPE   DEFAULT  MEANING
+# ---------------------------------------------------------------------------------------------
+# INPUT  String ---      Location to read the matrix A of feature vectors
+# K      Int    ---      Indicates dimension of the new vector space constructed from eigen vectors
+# CENTER Int    0        Indicates whether or not to center data 
+# SCALE  Int    0        Indicates whether or not to scale data 
+# OFMT   String ---      Output data format
+# PROJDATA Int  0	     This argument indicates if the data should be projected or not
+# MODEL  String ---      Location to already existing model: eigenvectors and eigenvalues 
+# OUTPUT String /        Location to write output matrices (covariance matrix, new basis vectors, 
+#                           and data projected onto new basis vectors)
+# hadoop jar SystemML.jar -f PCA.dml -nvargs INPUT=INPUT_DIR/pca-1000x1000 
+# OUTPUT=OUTPUT_DIR/pca-1000x1000-model PROJDATA=1 CENTER=1 SCALE=1
+# ---------------------------------------------------------------------------------------------
+
+A = read($INPUT);
+K = ifdef($K, ncol(A));
+ofmt = ifdef($OFMT, "CSV");
+projectData = ifdef($PROJDATA,0);
+model = ifdef($MODEL,"");
+center = ifdef($CENTER,0);
+scale = ifdef($SCALE,0);
+output = ifdef($OUTPUT,"/");
+
+evec_dominant = matrix(0,cols=1,rows=1);
+
+if (model != "") {
+	# reuse existing model to project data
+    evec_dominant = read(model+"/dominant.eigen.vectors");
+}else{
+	if (model == "" ){
+		model = output;	
+	}	
+
+	N = nrow(A);
+	D = ncol(A);
+
+	# perform z-scoring (centering and scaling)
+	if (center == 1) {
+	    cm = colMeans(A);
+	    A = A - cm;
+	}
+	if (scale == 1) {
+	    cvars = (colSums (A^2));	
+	    if (center == 1){
+		cm = colMeans(A);
+	    	cvars = (cvars - N*(cm^2))/(N-1);		    
+ 	    }
+	    Azscored = (A)/sqrt(cvars);
+            A = Azscored;
+	}	
+
+	# co-variance matrix 
+	mu = colSums(A)/N;
+	C = (t(A) %*% A)/(N-1) - (N/(N-1))*t(mu) %*% mu;
+
+
+	# compute eigen vectors and values
+	[evalues, evectors] = eigen(C);
+
+	decreasing_Idx = order(target=evalues,by=1,decreasing=TRUE,index.return=TRUE);
+	diagmat = table(seq(1,D),decreasing_Idx);
+	# sorts eigenvalues by decreasing order
+	evalues = diagmat %*% evalues;
+	# sorts eigenvectors column-wise in the order of decreasing eigenvalues
+	evectors = evectors %*% diagmat;
+
+
+	# select K dominant eigen vectors 
+	nvec = ncol(evectors);
+
+	eval_dominant = evalues[1:K, 1];
+	evec_dominant = evectors[,1:K];
+	
+	# the square root of eigenvalues
+	eval_stdev_dominant = sqrt(eval_dominant);
+	
+	write(eval_stdev_dominant, model+"/dominant.eigen.standard.deviations", format=ofmt);
+	write(eval_dominant, model+"/dominant.eigen.values", format=ofmt);
+	write(evec_dominant, model+"/dominant.eigen.vectors", format=ofmt);
+}
+if (projectData == 1 | model != ""){
+	# Construct new data set by treating computed dominant eigenvectors as the basis vectors
+	newA = A %*% evec_dominant;
+	write(newA, output+"/projected.data", format=ofmt);
+}