You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@systemml.apache.org by du...@apache.org on 2016/01/22 17:34:21 UTC
[45/51] [partial] incubator-systemml git commit: [SYSTEMML-482]
[SYSTEMML-480] Adding a Git attributes file to enfore Unix-styled line
endings, and normalizing all of the line endings.
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/05d2c0a8/scripts/algorithms/MultiLogReg.dml
----------------------------------------------------------------------
diff --git a/scripts/algorithms/MultiLogReg.dml b/scripts/algorithms/MultiLogReg.dml
index 0b18b9d..716678d 100644
--- a/scripts/algorithms/MultiLogReg.dml
+++ b/scripts/algorithms/MultiLogReg.dml
@@ -1,365 +1,365 @@
-#-------------------------------------------------------------
-#
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-#
-#-------------------------------------------------------------
-
-# Solves Multinomial Logistic Regression using Trust Region methods.
-# (See: Trust Region Newton Method for Logistic Regression, Lin, Weng and Keerthi, JMLR 9 (2008) 627-650)
-
-# INPUT PARAMETERS:
-# --------------------------------------------------------------------------------------------
-# NAME TYPE DEFAULT MEANING
-# --------------------------------------------------------------------------------------------
-# X String --- Location to read the matrix of feature vectors
-# Y String --- Location to read the matrix with category labels
-# B String --- Location to store estimated regression parameters (the betas)
-# Log String " " Location to write per-iteration variables for log/debugging purposes
-# icpt Int 0 Intercept presence, shifting and rescaling X columns:
-# 0 = no intercept, no shifting, no rescaling;
-# 1 = add intercept, but neither shift nor rescale X;
-# 2 = add intercept, shift & rescale X columns to mean = 0, variance = 1
-# reg Double 0.0 regularization parameter (lambda = 1/C); intercept is not regularized
-# tol Double 0.000001 tolerance ("epsilon")
-# moi Int 100 max. number of outer (Newton) iterations
-# mii Int 0 max. number of inner (conjugate gradient) iterations, 0 = no max
-# fmt String "text" Matrix output format, usually "text" or "csv" (for matrices only)
-# --------------------------------------------------------------------------------------------
-# The largest label represents the baseline category; if label -1 or 0 is present, then it is
-# the baseline label (and it is converted to the largest label).
-#
-# The Log file, when requested, contains the following per-iteration variables in CSV format,
-# each line containing triple (NAME, ITERATION, VALUE) with ITERATION = 0 for initial values:
-#
-# NAME MEANING
-# -------------------------------------------------------------------------------------------
-# LINEAR_TERM_MIN The minimum value of X %*% B, used to check for overflows
-# LINEAR_TERM_MAX The maximum value of X %*% B, used to check for overflows
-# NUM_CG_ITERS Number of inner (Conj.Gradient) iterations in this outer iteration
-# IS_TRUST_REACHED 1 = trust region boundary was reached, 0 = otherwise
-# POINT_STEP_NORM L2-norm of iteration step from old point (i.e. matrix B) to new point
-# OBJECTIVE The loss function we minimize (negative regularized log-likelihood)
-# OBJ_DROP_REAL Reduction in the objective during this iteration, actual value
-# OBJ_DROP_PRED Reduction in the objective predicted by a quadratic approximation
-# OBJ_DROP_RATIO Actual-to-predicted reduction ratio, used to update the trust region
-# IS_POINT_UPDATED 1 = new point accepted; 0 = new point rejected, old point restored
-# GRADIENT_NORM L2-norm of the loss function gradient (omitted if point is rejected)
-# TRUST_DELTA Updated trust region size, the "delta"
-# -------------------------------------------------------------------------------------------
-#
-# Script invocation example:
-# hadoop jar SystemML.jar -f MultiLogReg.dml -nvargs icpt=2 reg=1.0 tol=0.000001 moi=100 mii=20
-# X=INPUT_DIR/X123 Y=INPUT_DIR/Y123 B=OUTPUT_DIR/B123 fmt=csv Log=OUTPUT_DIR/log
-
-fileX = $X;
-fileY = $Y;
-fileB = $B;
-fileLog = ifdef ($Log, " ");
-fmtB = ifdef ($fmt, "text");
-
-intercept_status = ifdef ($icpt, 0); # $icpt = 0;
-regularization = ifdef ($reg, 0.0); # $reg = 0.0;
-tol = ifdef ($tol, 0.000001); # $tol = 0.000001;
-maxiter = ifdef ($moi, 100); # $moi = 100;
-maxinneriter = ifdef ($mii, 0); # $mii = 0;
-
-tol = as.double (tol);
-
-print ("BEGIN MULTINOMIAL LOGISTIC REGRESSION SCRIPT");
-print ("Reading X...");
-X = read (fileX);
-print ("Reading Y...");
-Y_vec = read (fileY);
-
-eta0 = 0.0001;
-eta1 = 0.25;
-eta2 = 0.75;
-sigma1 = 0.25;
-sigma2 = 0.5;
-sigma3 = 4.0;
-psi = 0.1;
-
-N = nrow (X);
-D = ncol (X);
-
-# Introduce the intercept, shift and rescale the columns of X if needed
-if (intercept_status == 1 | intercept_status == 2) # add the intercept column
-{
- X = append (X, matrix (1, rows = N, cols = 1));
- D = ncol (X);
-}
-
-scale_lambda = matrix (1, rows = D, cols = 1);
-if (intercept_status == 1 | intercept_status == 2)
-{
- scale_lambda [D, 1] = 0;
-}
-
-if (intercept_status == 2) # scale-&-shift X columns to mean 0, variance 1
-{ # Important assumption: X [, D] = matrix (1, rows = N, cols = 1)
- avg_X_cols = t(colSums(X)) / N;
- var_X_cols = (t(colSums (X ^ 2)) - N * (avg_X_cols ^ 2)) / (N - 1);
- is_unsafe = ppred (var_X_cols, 0.0, "<=");
- scale_X = 1.0 / sqrt (var_X_cols * (1 - is_unsafe) + is_unsafe);
- scale_X [D, 1] = 1;
- shift_X = - avg_X_cols * scale_X;
- shift_X [D, 1] = 0;
- rowSums_X_sq = (X ^ 2) %*% (scale_X ^ 2) + X %*% (2 * scale_X * shift_X) + sum (shift_X ^ 2);
-} else {
- scale_X = matrix (1, rows = D, cols = 1);
- shift_X = matrix (0, rows = D, cols = 1);
- rowSums_X_sq = rowSums (X ^ 2);
-}
-
-# Henceforth we replace "X" with "X %*% (SHIFT/SCALE TRANSFORM)" and rowSums(X ^ 2)
-# with "rowSums_X_sq" in order to preserve the sparsity of X under shift and scale.
-# The transform is then associatively applied to the other side of the expression,
-# and is rewritten via "scale_X" and "shift_X" as follows:
-#
-# ssX_A = (SHIFT/SCALE TRANSFORM) %*% A --- is rewritten as:
-# ssX_A = diag (scale_X) %*% A;
-# ssX_A [D, ] = ssX_A [D, ] + t(shift_X) %*% A;
-#
-# tssX_A = t(SHIFT/SCALE TRANSFORM) %*% A --- is rewritten as:
-# tssX_A = diag (scale_X) %*% A + shift_X %*% A [D, ];
-
-# Convert "Y_vec" into indicator matrice:
-if (min (Y_vec) <= 0) {
- # Category labels "0", "-1" etc. are converted into the largest label
- max_y = max (Y_vec);
- Y_vec = Y_vec + (- Y_vec + max_y + 1) * ppred (Y_vec , 0.0, "<=");
-}
-Y = table (seq (1, N, 1), Y_vec);
-K = ncol (Y) - 1; # The number of non-baseline categories
-
-lambda = (scale_lambda %*% matrix (1, rows = 1, cols = K)) * regularization;
-delta = 0.5 * sqrt (D) / max (sqrt (rowSums_X_sq));
-
-B = matrix (0, rows = D, cols = K); ### LT = X %*% (SHIFT/SCALE TRANSFORM) %*% B;
- ### LT = append (LT, matrix (0, rows = N, cols = 1));
- ### LT = LT - rowMaxs (LT) %*% matrix (1, rows = 1, cols = K+1);
-P = matrix (1, rows = N, cols = K+1); ### exp_LT = exp (LT);
-P = P / (K + 1); ### P = exp_LT / (rowSums (exp_LT) %*% matrix (1, rows = 1, cols = K+1));
-obj = N * log (K + 1); ### obj = - sum (Y * LT) + sum (log (rowSums (exp_LT))) + 0.5 * sum (lambda * (B_new ^ 2));
-
-Grad = t(X) %*% (P [, 1:K] - Y [, 1:K]);
-if (intercept_status == 2) {
- Grad = diag (scale_X) %*% Grad + shift_X %*% Grad [D, ];
-}
-Grad = Grad + lambda * B;
-norm_Grad = sqrt (sum (Grad ^ 2));
-norm_Grad_initial = norm_Grad;
-
-if (maxinneriter == 0) {
- maxinneriter = D * K;
-}
-iter = 1;
-
-# boolean for convergence check
-converge = (norm_Grad < tol) | (iter > maxiter);
-
-print ("-- Initially: Objective = " + obj + ", Gradient Norm = " + norm_Grad + ", Trust Delta = " + delta);
-
-if (fileLog != " ") {
- log_str = "OBJECTIVE,0," + obj;
- log_str = append (log_str, "GRADIENT_NORM,0," + norm_Grad);
- log_str = append (log_str, "TRUST_DELTA,0," + delta);
-} else {
- log_str = " ";
-}
-
-while (! converge)
-{
- # SOLVE TRUST REGION SUB-PROBLEM
- S = matrix (0, rows = D, cols = K);
- R = - Grad;
- V = R;
- delta2 = delta ^ 2;
- inneriter = 1;
- norm_R2 = sum (R ^ 2);
- innerconverge = (sqrt (norm_R2) <= psi * norm_Grad);
- is_trust_boundary_reached = 0;
-
- while (! innerconverge)
- {
- if (intercept_status == 2) {
- ssX_V = diag (scale_X) %*% V;
- ssX_V [D, ] = ssX_V [D, ] + t(shift_X) %*% V;
- } else {
- ssX_V = V;
- }
- Q = P [, 1:K] * (X %*% ssX_V);
- HV = t(X) %*% (Q - P [, 1:K] * (rowSums (Q) %*% matrix (1, rows = 1, cols = K)));
- if (intercept_status == 2) {
- HV = diag (scale_X) %*% HV + shift_X %*% HV [D, ];
- }
- HV = HV + lambda * V;
- alpha = norm_R2 / sum (V * HV);
- Snew = S + alpha * V;
- norm_Snew2 = sum (Snew ^ 2);
- if (norm_Snew2 <= delta2)
- {
- S = Snew;
- R = R - alpha * HV;
- old_norm_R2 = norm_R2
- norm_R2 = sum (R ^ 2);
- V = R + (norm_R2 / old_norm_R2) * V;
- innerconverge = (sqrt (norm_R2) <= psi * norm_Grad);
- } else {
- is_trust_boundary_reached = 1;
- sv = sum (S * V);
- v2 = sum (V ^ 2);
- s2 = sum (S ^ 2);
- rad = sqrt (sv ^ 2 + v2 * (delta2 - s2));
- if (sv >= 0) {
- alpha = (delta2 - s2) / (sv + rad);
- } else {
- alpha = (rad - sv) / v2;
- }
- S = S + alpha * V;
- R = R - alpha * HV;
- innerconverge = TRUE;
- }
- inneriter = inneriter + 1;
- innerconverge = innerconverge | (inneriter > maxinneriter);
- }
-
- # END TRUST REGION SUB-PROBLEM
-
- # compute rho, update B, obtain delta
- gs = sum (S * Grad);
- qk = - 0.5 * (gs - sum (S * R));
- B_new = B + S;
- if (intercept_status == 2) {
- ssX_B_new = diag (scale_X) %*% B_new;
- ssX_B_new [D, ] = ssX_B_new [D, ] + t(shift_X) %*% B_new;
- } else {
- ssX_B_new = B_new;
- }
-
- LT = append ((X %*% ssX_B_new), matrix (0, rows = N, cols = 1));
- if (fileLog != " ") {
- log_str = append (log_str, "LINEAR_TERM_MIN," + iter + "," + min (LT));
- log_str = append (log_str, "LINEAR_TERM_MAX," + iter + "," + max (LT));
- }
- LT = LT - rowMaxs (LT) %*% matrix (1, rows = 1, cols = K+1);
- exp_LT = exp (LT);
- P_new = exp_LT / (rowSums (exp_LT) %*% matrix (1, rows = 1, cols = K+1));
- obj_new = - sum (Y * LT) + sum (log (rowSums (exp_LT))) + 0.5 * sum (lambda * (B_new ^ 2));
-
- # Consider updating LT in the inner loop
- # Consider the big "obj" and "obj_new" rounding-off their small difference below:
-
- actred = (obj - obj_new);
-
- rho = actred / qk;
- is_rho_accepted = (rho > eta0);
- snorm = sqrt (sum (S ^ 2));
-
- if (fileLog != " ") {
- log_str = append (log_str, "NUM_CG_ITERS," + iter + "," + (inneriter - 1));
- log_str = append (log_str, "IS_TRUST_REACHED," + iter + "," + is_trust_boundary_reached);
- log_str = append (log_str, "POINT_STEP_NORM," + iter + "," + snorm);
- log_str = append (log_str, "OBJECTIVE," + iter + "," + obj_new);
- log_str = append (log_str, "OBJ_DROP_REAL," + iter + "," + actred);
- log_str = append (log_str, "OBJ_DROP_PRED," + iter + "," + qk);
- log_str = append (log_str, "OBJ_DROP_RATIO," + iter + "," + rho);
- }
-
- if (iter == 1) {
- delta = min (delta, snorm);
- }
-
- alpha2 = obj_new - obj - gs;
- if (alpha2 <= 0) {
- alpha = sigma3;
- }
- else {
- alpha = max (sigma1, -0.5 * gs / alpha2);
- }
-
- if (rho < eta0) {
- delta = min (max (alpha, sigma1) * snorm, sigma2 * delta);
- }
- else {
- if (rho < eta1) {
- delta = max (sigma1 * delta, min (alpha * snorm, sigma2 * delta));
- }
- else {
- if (rho < eta2) {
- delta = max (sigma1 * delta, min (alpha * snorm, sigma3 * delta));
- }
- else {
- delta = max (delta, min (alpha * snorm, sigma3 * delta));
- }
- }
- }
-
- if (is_trust_boundary_reached == 1)
- {
- print ("-- Outer Iteration " + iter + ": Had " + (inneriter - 1) + " CG iterations, trust bound REACHED");
- } else {
- print ("-- Outer Iteration " + iter + ": Had " + (inneriter - 1) + " CG iterations");
- }
- print (" -- Obj.Reduction: Actual = " + actred + ", Predicted = " + qk +
- " (A/P: " + (round (10000.0 * rho) / 10000.0) + "), Trust Delta = " + delta);
-
- if (is_rho_accepted)
- {
- B = B_new;
- P = P_new;
- Grad = t(X) %*% (P [, 1:K] - Y [, 1:K]);
- if (intercept_status == 2) {
- Grad = diag (scale_X) %*% Grad + shift_X %*% Grad [D, ];
- }
- Grad = Grad + lambda * B;
- norm_Grad = sqrt (sum (Grad ^ 2));
- obj = obj_new;
- print (" -- New Objective = " + obj + ", Beta Change Norm = " + snorm + ", Gradient Norm = " + norm_Grad);
- if (fileLog != " ") {
- log_str = append (log_str, "IS_POINT_UPDATED," + iter + ",1");
- log_str = append (log_str, "GRADIENT_NORM," + iter + "," + norm_Grad);
- }
- } else {
- if (fileLog != " ") {
- log_str = append (log_str, "IS_POINT_UPDATED," + iter + ",0");
- }
- }
-
- if (fileLog != " ") {
- log_str = append (log_str, "TRUST_DELTA," + iter + "," + delta);
- }
-
- iter = iter + 1;
- converge = ((norm_Grad < (tol * norm_Grad_initial)) | (iter > maxiter) |
- ((is_trust_boundary_reached == 0) & (abs (actred) < (abs (obj) + abs (obj_new)) * 0.00000000000001)));
- if (converge) { print ("Termination / Convergence condition satisfied."); } else { print (" "); }
-}
-
-if (intercept_status == 2) {
- B_out = diag (scale_X) %*% B;
- B_out [D, ] = B_out [D, ] + t(shift_X) %*% B;
-} else {
- B_out = B;
-}
-write (B_out, fileB, format=fmtB);
-
-if (fileLog != " ") {
- write (log_str, fileLog);
-}
-
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+# Solves Multinomial Logistic Regression using Trust Region methods.
+# (See: Trust Region Newton Method for Logistic Regression, Lin, Weng and Keerthi, JMLR 9 (2008) 627-650)
+
+# INPUT PARAMETERS:
+# --------------------------------------------------------------------------------------------
+# NAME TYPE DEFAULT MEANING
+# --------------------------------------------------------------------------------------------
+# X String --- Location to read the matrix of feature vectors
+# Y String --- Location to read the matrix with category labels
+# B String --- Location to store estimated regression parameters (the betas)
+# Log String " " Location to write per-iteration variables for log/debugging purposes
+# icpt Int 0 Intercept presence, shifting and rescaling X columns:
+# 0 = no intercept, no shifting, no rescaling;
+# 1 = add intercept, but neither shift nor rescale X;
+# 2 = add intercept, shift & rescale X columns to mean = 0, variance = 1
+# reg Double 0.0 regularization parameter (lambda = 1/C); intercept is not regularized
+# tol Double 0.000001 tolerance ("epsilon")
+# moi Int 100 max. number of outer (Newton) iterations
+# mii Int 0 max. number of inner (conjugate gradient) iterations, 0 = no max
+# fmt String "text" Matrix output format, usually "text" or "csv" (for matrices only)
+# --------------------------------------------------------------------------------------------
+# The largest label represents the baseline category; if label -1 or 0 is present, then it is
+# the baseline label (and it is converted to the largest label).
+#
+# The Log file, when requested, contains the following per-iteration variables in CSV format,
+# each line containing triple (NAME, ITERATION, VALUE) with ITERATION = 0 for initial values:
+#
+# NAME MEANING
+# -------------------------------------------------------------------------------------------
+# LINEAR_TERM_MIN The minimum value of X %*% B, used to check for overflows
+# LINEAR_TERM_MAX The maximum value of X %*% B, used to check for overflows
+# NUM_CG_ITERS Number of inner (Conj.Gradient) iterations in this outer iteration
+# IS_TRUST_REACHED 1 = trust region boundary was reached, 0 = otherwise
+# POINT_STEP_NORM L2-norm of iteration step from old point (i.e. matrix B) to new point
+# OBJECTIVE The loss function we minimize (negative regularized log-likelihood)
+# OBJ_DROP_REAL Reduction in the objective during this iteration, actual value
+# OBJ_DROP_PRED Reduction in the objective predicted by a quadratic approximation
+# OBJ_DROP_RATIO Actual-to-predicted reduction ratio, used to update the trust region
+# IS_POINT_UPDATED 1 = new point accepted; 0 = new point rejected, old point restored
+# GRADIENT_NORM L2-norm of the loss function gradient (omitted if point is rejected)
+# TRUST_DELTA Updated trust region size, the "delta"
+# -------------------------------------------------------------------------------------------
+#
+# Script invocation example:
+# hadoop jar SystemML.jar -f MultiLogReg.dml -nvargs icpt=2 reg=1.0 tol=0.000001 moi=100 mii=20
+# X=INPUT_DIR/X123 Y=INPUT_DIR/Y123 B=OUTPUT_DIR/B123 fmt=csv Log=OUTPUT_DIR/log
+
+fileX = $X;
+fileY = $Y;
+fileB = $B;
+fileLog = ifdef ($Log, " ");
+fmtB = ifdef ($fmt, "text");
+
+intercept_status = ifdef ($icpt, 0); # $icpt = 0;
+regularization = ifdef ($reg, 0.0); # $reg = 0.0;
+tol = ifdef ($tol, 0.000001); # $tol = 0.000001;
+maxiter = ifdef ($moi, 100); # $moi = 100;
+maxinneriter = ifdef ($mii, 0); # $mii = 0;
+
+tol = as.double (tol);
+
+print ("BEGIN MULTINOMIAL LOGISTIC REGRESSION SCRIPT");
+print ("Reading X...");
+X = read (fileX);
+print ("Reading Y...");
+Y_vec = read (fileY);
+
+eta0 = 0.0001;
+eta1 = 0.25;
+eta2 = 0.75;
+sigma1 = 0.25;
+sigma2 = 0.5;
+sigma3 = 4.0;
+psi = 0.1;
+
+N = nrow (X);
+D = ncol (X);
+
+# Introduce the intercept, shift and rescale the columns of X if needed
+if (intercept_status == 1 | intercept_status == 2) # add the intercept column
+{
+ X = append (X, matrix (1, rows = N, cols = 1));
+ D = ncol (X);
+}
+
+scale_lambda = matrix (1, rows = D, cols = 1);
+if (intercept_status == 1 | intercept_status == 2)
+{
+ scale_lambda [D, 1] = 0;
+}
+
+if (intercept_status == 2) # scale-&-shift X columns to mean 0, variance 1
+{ # Important assumption: X [, D] = matrix (1, rows = N, cols = 1)
+ avg_X_cols = t(colSums(X)) / N;
+ var_X_cols = (t(colSums (X ^ 2)) - N * (avg_X_cols ^ 2)) / (N - 1);
+ is_unsafe = ppred (var_X_cols, 0.0, "<=");
+ scale_X = 1.0 / sqrt (var_X_cols * (1 - is_unsafe) + is_unsafe);
+ scale_X [D, 1] = 1;
+ shift_X = - avg_X_cols * scale_X;
+ shift_X [D, 1] = 0;
+ rowSums_X_sq = (X ^ 2) %*% (scale_X ^ 2) + X %*% (2 * scale_X * shift_X) + sum (shift_X ^ 2);
+} else {
+ scale_X = matrix (1, rows = D, cols = 1);
+ shift_X = matrix (0, rows = D, cols = 1);
+ rowSums_X_sq = rowSums (X ^ 2);
+}
+
+# Henceforth we replace "X" with "X %*% (SHIFT/SCALE TRANSFORM)" and rowSums(X ^ 2)
+# with "rowSums_X_sq" in order to preserve the sparsity of X under shift and scale.
+# The transform is then associatively applied to the other side of the expression,
+# and is rewritten via "scale_X" and "shift_X" as follows:
+#
+# ssX_A = (SHIFT/SCALE TRANSFORM) %*% A --- is rewritten as:
+# ssX_A = diag (scale_X) %*% A;
+# ssX_A [D, ] = ssX_A [D, ] + t(shift_X) %*% A;
+#
+# tssX_A = t(SHIFT/SCALE TRANSFORM) %*% A --- is rewritten as:
+# tssX_A = diag (scale_X) %*% A + shift_X %*% A [D, ];
+
+# Convert "Y_vec" into indicator matrice:
+if (min (Y_vec) <= 0) {
+ # Category labels "0", "-1" etc. are converted into the largest label
+ max_y = max (Y_vec);
+ Y_vec = Y_vec + (- Y_vec + max_y + 1) * ppred (Y_vec , 0.0, "<=");
+}
+Y = table (seq (1, N, 1), Y_vec);
+K = ncol (Y) - 1; # The number of non-baseline categories
+
+lambda = (scale_lambda %*% matrix (1, rows = 1, cols = K)) * regularization;
+delta = 0.5 * sqrt (D) / max (sqrt (rowSums_X_sq));
+
+B = matrix (0, rows = D, cols = K); ### LT = X %*% (SHIFT/SCALE TRANSFORM) %*% B;
+ ### LT = append (LT, matrix (0, rows = N, cols = 1));
+ ### LT = LT - rowMaxs (LT) %*% matrix (1, rows = 1, cols = K+1);
+P = matrix (1, rows = N, cols = K+1); ### exp_LT = exp (LT);
+P = P / (K + 1); ### P = exp_LT / (rowSums (exp_LT) %*% matrix (1, rows = 1, cols = K+1));
+obj = N * log (K + 1); ### obj = - sum (Y * LT) + sum (log (rowSums (exp_LT))) + 0.5 * sum (lambda * (B_new ^ 2));
+
+Grad = t(X) %*% (P [, 1:K] - Y [, 1:K]);
+if (intercept_status == 2) {
+ Grad = diag (scale_X) %*% Grad + shift_X %*% Grad [D, ];
+}
+Grad = Grad + lambda * B;
+norm_Grad = sqrt (sum (Grad ^ 2));
+norm_Grad_initial = norm_Grad;
+
+if (maxinneriter == 0) {
+ maxinneriter = D * K;
+}
+iter = 1;
+
+# boolean for convergence check
+converge = (norm_Grad < tol) | (iter > maxiter);
+
+print ("-- Initially: Objective = " + obj + ", Gradient Norm = " + norm_Grad + ", Trust Delta = " + delta);
+
+if (fileLog != " ") {
+ log_str = "OBJECTIVE,0," + obj;
+ log_str = append (log_str, "GRADIENT_NORM,0," + norm_Grad);
+ log_str = append (log_str, "TRUST_DELTA,0," + delta);
+} else {
+ log_str = " ";
+}
+
+while (! converge)
+{
+ # SOLVE TRUST REGION SUB-PROBLEM
+ S = matrix (0, rows = D, cols = K);
+ R = - Grad;
+ V = R;
+ delta2 = delta ^ 2;
+ inneriter = 1;
+ norm_R2 = sum (R ^ 2);
+ innerconverge = (sqrt (norm_R2) <= psi * norm_Grad);
+ is_trust_boundary_reached = 0;
+
+ while (! innerconverge)
+ {
+ if (intercept_status == 2) {
+ ssX_V = diag (scale_X) %*% V;
+ ssX_V [D, ] = ssX_V [D, ] + t(shift_X) %*% V;
+ } else {
+ ssX_V = V;
+ }
+ Q = P [, 1:K] * (X %*% ssX_V);
+ HV = t(X) %*% (Q - P [, 1:K] * (rowSums (Q) %*% matrix (1, rows = 1, cols = K)));
+ if (intercept_status == 2) {
+ HV = diag (scale_X) %*% HV + shift_X %*% HV [D, ];
+ }
+ HV = HV + lambda * V;
+ alpha = norm_R2 / sum (V * HV);
+ Snew = S + alpha * V;
+ norm_Snew2 = sum (Snew ^ 2);
+ if (norm_Snew2 <= delta2)
+ {
+ S = Snew;
+ R = R - alpha * HV;
+ old_norm_R2 = norm_R2
+ norm_R2 = sum (R ^ 2);
+ V = R + (norm_R2 / old_norm_R2) * V;
+ innerconverge = (sqrt (norm_R2) <= psi * norm_Grad);
+ } else {
+ is_trust_boundary_reached = 1;
+ sv = sum (S * V);
+ v2 = sum (V ^ 2);
+ s2 = sum (S ^ 2);
+ rad = sqrt (sv ^ 2 + v2 * (delta2 - s2));
+ if (sv >= 0) {
+ alpha = (delta2 - s2) / (sv + rad);
+ } else {
+ alpha = (rad - sv) / v2;
+ }
+ S = S + alpha * V;
+ R = R - alpha * HV;
+ innerconverge = TRUE;
+ }
+ inneriter = inneriter + 1;
+ innerconverge = innerconverge | (inneriter > maxinneriter);
+ }
+
+ # END TRUST REGION SUB-PROBLEM
+
+ # compute rho, update B, obtain delta
+ gs = sum (S * Grad);
+ qk = - 0.5 * (gs - sum (S * R));
+ B_new = B + S;
+ if (intercept_status == 2) {
+ ssX_B_new = diag (scale_X) %*% B_new;
+ ssX_B_new [D, ] = ssX_B_new [D, ] + t(shift_X) %*% B_new;
+ } else {
+ ssX_B_new = B_new;
+ }
+
+ LT = append ((X %*% ssX_B_new), matrix (0, rows = N, cols = 1));
+ if (fileLog != " ") {
+ log_str = append (log_str, "LINEAR_TERM_MIN," + iter + "," + min (LT));
+ log_str = append (log_str, "LINEAR_TERM_MAX," + iter + "," + max (LT));
+ }
+ LT = LT - rowMaxs (LT) %*% matrix (1, rows = 1, cols = K+1);
+ exp_LT = exp (LT);
+ P_new = exp_LT / (rowSums (exp_LT) %*% matrix (1, rows = 1, cols = K+1));
+ obj_new = - sum (Y * LT) + sum (log (rowSums (exp_LT))) + 0.5 * sum (lambda * (B_new ^ 2));
+
+ # Consider updating LT in the inner loop
+ # Consider the big "obj" and "obj_new" rounding-off their small difference below:
+
+ actred = (obj - obj_new);
+
+ rho = actred / qk;
+ is_rho_accepted = (rho > eta0);
+ snorm = sqrt (sum (S ^ 2));
+
+ if (fileLog != " ") {
+ log_str = append (log_str, "NUM_CG_ITERS," + iter + "," + (inneriter - 1));
+ log_str = append (log_str, "IS_TRUST_REACHED," + iter + "," + is_trust_boundary_reached);
+ log_str = append (log_str, "POINT_STEP_NORM," + iter + "," + snorm);
+ log_str = append (log_str, "OBJECTIVE," + iter + "," + obj_new);
+ log_str = append (log_str, "OBJ_DROP_REAL," + iter + "," + actred);
+ log_str = append (log_str, "OBJ_DROP_PRED," + iter + "," + qk);
+ log_str = append (log_str, "OBJ_DROP_RATIO," + iter + "," + rho);
+ }
+
+ if (iter == 1) {
+ delta = min (delta, snorm);
+ }
+
+ alpha2 = obj_new - obj - gs;
+ if (alpha2 <= 0) {
+ alpha = sigma3;
+ }
+ else {
+ alpha = max (sigma1, -0.5 * gs / alpha2);
+ }
+
+ if (rho < eta0) {
+ delta = min (max (alpha, sigma1) * snorm, sigma2 * delta);
+ }
+ else {
+ if (rho < eta1) {
+ delta = max (sigma1 * delta, min (alpha * snorm, sigma2 * delta));
+ }
+ else {
+ if (rho < eta2) {
+ delta = max (sigma1 * delta, min (alpha * snorm, sigma3 * delta));
+ }
+ else {
+ delta = max (delta, min (alpha * snorm, sigma3 * delta));
+ }
+ }
+ }
+
+ if (is_trust_boundary_reached == 1)
+ {
+ print ("-- Outer Iteration " + iter + ": Had " + (inneriter - 1) + " CG iterations, trust bound REACHED");
+ } else {
+ print ("-- Outer Iteration " + iter + ": Had " + (inneriter - 1) + " CG iterations");
+ }
+ print (" -- Obj.Reduction: Actual = " + actred + ", Predicted = " + qk +
+ " (A/P: " + (round (10000.0 * rho) / 10000.0) + "), Trust Delta = " + delta);
+
+ if (is_rho_accepted)
+ {
+ B = B_new;
+ P = P_new;
+ Grad = t(X) %*% (P [, 1:K] - Y [, 1:K]);
+ if (intercept_status == 2) {
+ Grad = diag (scale_X) %*% Grad + shift_X %*% Grad [D, ];
+ }
+ Grad = Grad + lambda * B;
+ norm_Grad = sqrt (sum (Grad ^ 2));
+ obj = obj_new;
+ print (" -- New Objective = " + obj + ", Beta Change Norm = " + snorm + ", Gradient Norm = " + norm_Grad);
+ if (fileLog != " ") {
+ log_str = append (log_str, "IS_POINT_UPDATED," + iter + ",1");
+ log_str = append (log_str, "GRADIENT_NORM," + iter + "," + norm_Grad);
+ }
+ } else {
+ if (fileLog != " ") {
+ log_str = append (log_str, "IS_POINT_UPDATED," + iter + ",0");
+ }
+ }
+
+ if (fileLog != " ") {
+ log_str = append (log_str, "TRUST_DELTA," + iter + "," + delta);
+ }
+
+ iter = iter + 1;
+ converge = ((norm_Grad < (tol * norm_Grad_initial)) | (iter > maxiter) |
+ ((is_trust_boundary_reached == 0) & (abs (actred) < (abs (obj) + abs (obj_new)) * 0.00000000000001)));
+ if (converge) { print ("Termination / Convergence condition satisfied."); } else { print (" "); }
+}
+
+if (intercept_status == 2) {
+ B_out = diag (scale_X) %*% B;
+ B_out [D, ] = B_out [D, ] + t(shift_X) %*% B;
+} else {
+ B_out = B;
+}
+write (B_out, fileB, format=fmtB);
+
+if (fileLog != " ") {
+ write (log_str, fileLog);
+}
+
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/05d2c0a8/scripts/algorithms/PCA.dml
----------------------------------------------------------------------
diff --git a/scripts/algorithms/PCA.dml b/scripts/algorithms/PCA.dml
index e8b4aec..65800b7 100644
--- a/scripts/algorithms/PCA.dml
+++ b/scripts/algorithms/PCA.dml
@@ -1,112 +1,112 @@
-#-------------------------------------------------------------
-#
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-#
-#-------------------------------------------------------------
-
-#
-# This script performs Principal Component Analysis (PCA) on the given input data.
-#
-# INPUT PARAMETERS:
-# ---------------------------------------------------------------------------------------------
-# NAME TYPE DEFAULT MEANING
-# ---------------------------------------------------------------------------------------------
-# INPUT String --- Location to read the matrix A of feature vectors
-# K Int --- Indicates dimension of the new vector space constructed from eigen vectors
-# CENTER Int 0 Indicates whether or not to center data
-# SCALE Int 0 Indicates whether or not to scale data
-# OFMT String --- Output data format
-# PROJDATA Int 0 This argument indicates if the data should be projected or not
-# MODEL String --- Location to already existing model: eigenvectors and eigenvalues
-# OUTPUT String / Location to write output matrices (covariance matrix, new basis vectors,
-# and data projected onto new basis vectors)
-# hadoop jar SystemML.jar -f PCA.dml -nvargs INPUT=INPUT_DIR/pca-1000x1000
-# OUTPUT=OUTPUT_DIR/pca-1000x1000-model PROJDATA=1 CENTER=1 SCALE=1
-# ---------------------------------------------------------------------------------------------
-
-A = read($INPUT);
-K = ifdef($K, ncol(A));
-ofmt = ifdef($OFMT, "CSV");
-projectData = ifdef($PROJDATA,0);
-model = ifdef($MODEL,"");
-center = ifdef($CENTER,0);
-scale = ifdef($SCALE,0);
-output = ifdef($OUTPUT,"/");
-
-evec_dominant = matrix(0,cols=1,rows=1);
-
-if (model != "") {
- # reuse existing model to project data
- evec_dominant = read(model+"/dominant.eigen.vectors");
-}else{
- if (model == "" ){
- model = output;
- }
-
- N = nrow(A);
- D = ncol(A);
-
- # perform z-scoring (centering and scaling)
- if (center == 1) {
- cm = colMeans(A);
- A = A - cm;
- }
- if (scale == 1) {
- cvars = (colSums (A^2));
- if (center == 1){
- cm = colMeans(A);
- cvars = (cvars - N*(cm^2))/(N-1);
- }
- Azscored = (A)/sqrt(cvars);
- A = Azscored;
- }
-
- # co-variance matrix
- mu = colSums(A)/N;
- C = (t(A) %*% A)/(N-1) - (N/(N-1))*t(mu) %*% mu;
-
-
- # compute eigen vectors and values
- [evalues, evectors] = eigen(C);
-
- decreasing_Idx = order(target=evalues,by=1,decreasing=TRUE,index.return=TRUE);
- diagmat = table(seq(1,D),decreasing_Idx);
- # sorts eigenvalues by decreasing order
- evalues = diagmat %*% evalues;
- # sorts eigenvectors column-wise in the order of decreasing eigenvalues
- evectors = evectors %*% diagmat;
-
-
- # select K dominant eigen vectors
- nvec = ncol(evectors);
-
- eval_dominant = evalues[1:K, 1];
- evec_dominant = evectors[,1:K];
-
- # the square root of eigenvalues
- eval_stdev_dominant = sqrt(eval_dominant);
-
- write(eval_stdev_dominant, model+"/dominant.eigen.standard.deviations", format=ofmt);
- write(eval_dominant, model+"/dominant.eigen.values", format=ofmt);
- write(evec_dominant, model+"/dominant.eigen.vectors", format=ofmt);
-}
-if (projectData == 1 | model != ""){
- # Construct new data set by treating computed dominant eigenvectors as the basis vectors
- newA = A %*% evec_dominant;
- write(newA, output+"/projected.data", format=ofmt);
-}
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+#
+# This script performs Principal Component Analysis (PCA) on the given input data.
+#
+# INPUT PARAMETERS:
+# ---------------------------------------------------------------------------------------------
+# NAME TYPE DEFAULT MEANING
+# ---------------------------------------------------------------------------------------------
+# INPUT String --- Location to read the matrix A of feature vectors
+# K Int --- Indicates dimension of the new vector space constructed from eigen vectors
+# CENTER Int 0 Indicates whether or not to center data
+# SCALE Int 0 Indicates whether or not to scale data
+# OFMT String --- Output data format
+# PROJDATA Int 0 This argument indicates if the data should be projected or not
+# MODEL String --- Location to already existing model: eigenvectors and eigenvalues
+# OUTPUT String / Location to write output matrices (covariance matrix, new basis vectors,
+# and data projected onto new basis vectors)
+# hadoop jar SystemML.jar -f PCA.dml -nvargs INPUT=INPUT_DIR/pca-1000x1000
+# OUTPUT=OUTPUT_DIR/pca-1000x1000-model PROJDATA=1 CENTER=1 SCALE=1
+# ---------------------------------------------------------------------------------------------
+
+A = read($INPUT);
+K = ifdef($K, ncol(A));
+ofmt = ifdef($OFMT, "CSV");
+projectData = ifdef($PROJDATA,0);
+model = ifdef($MODEL,"");
+center = ifdef($CENTER,0);
+scale = ifdef($SCALE,0);
+output = ifdef($OUTPUT,"/");
+
+evec_dominant = matrix(0,cols=1,rows=1);
+
+if (model != "") {
+ # reuse existing model to project data
+ evec_dominant = read(model+"/dominant.eigen.vectors");
+}else{
+ if (model == "" ){
+ model = output;
+ }
+
+ N = nrow(A);
+ D = ncol(A);
+
+ # perform z-scoring (centering and scaling)
+ if (center == 1) {
+ cm = colMeans(A);
+ A = A - cm;
+ }
+ if (scale == 1) {
+ cvars = (colSums (A^2));
+ if (center == 1){
+ cm = colMeans(A);
+ cvars = (cvars - N*(cm^2))/(N-1);
+ }
+ Azscored = (A)/sqrt(cvars);
+ A = Azscored;
+ }
+
+ # co-variance matrix
+ mu = colSums(A)/N;
+ C = (t(A) %*% A)/(N-1) - (N/(N-1))*t(mu) %*% mu;
+
+
+ # compute eigen vectors and values
+ [evalues, evectors] = eigen(C);
+
+ decreasing_Idx = order(target=evalues,by=1,decreasing=TRUE,index.return=TRUE);
+ diagmat = table(seq(1,D),decreasing_Idx);
+ # sorts eigenvalues by decreasing order
+ evalues = diagmat %*% evalues;
+ # sorts eigenvectors column-wise in the order of decreasing eigenvalues
+ evectors = evectors %*% diagmat;
+
+
+ # select K dominant eigen vectors
+ nvec = ncol(evectors);
+
+ eval_dominant = evalues[1:K, 1];
+ evec_dominant = evectors[,1:K];
+
+ # the square root of eigenvalues
+ eval_stdev_dominant = sqrt(eval_dominant);
+
+ write(eval_stdev_dominant, model+"/dominant.eigen.standard.deviations", format=ofmt);
+ write(eval_dominant, model+"/dominant.eigen.values", format=ofmt);
+ write(evec_dominant, model+"/dominant.eigen.vectors", format=ofmt);
+}
+if (projectData == 1 | model != ""){
+ # Construct new data set by treating computed dominant eigenvectors as the basis vectors
+ newA = A %*% evec_dominant;
+ write(newA, output+"/projected.data", format=ofmt);
+}