You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@systemml.apache.org by du...@apache.org on 2016/01/26 02:13:07 UTC
[43/55] [partial] incubator-systemml git commit: [SYSTEMML-482]
[SYSTEMML-480] Adding a Git attributes file to enfore Unix-styled line
endings, and normalizing all of the line endings.
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/816e2db8/scripts/algorithms/StepLinearRegDS.dml
----------------------------------------------------------------------
diff --git a/scripts/algorithms/StepLinearRegDS.dml b/scripts/algorithms/StepLinearRegDS.dml
index afd94ed..953402f 100644
--- a/scripts/algorithms/StepLinearRegDS.dml
+++ b/scripts/algorithms/StepLinearRegDS.dml
@@ -1,388 +1,388 @@
-#-------------------------------------------------------------
-#
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-#
-#-------------------------------------------------------------
-
-#
-# THIS SCRIPT CHOOSES A LINEAR MODEL IN A STEPWISE ALGIRITHM USING AIC
-# EACH LINEAR REGRESSION USES A DIRECT SOLVER FOR (X^T X) beta = X^T y
-#
-# INPUT PARAMETERS:
-# --------------------------------------------------------------------------------------------
-# NAME TYPE DEFAULT MEANING
-# --------------------------------------------------------------------------------------------
-# X String --- Location (on HDFS) to read the matrix X of feature vectors
-# Y String --- Location (on HDFS) to read the 1-column matrix Y of response values
-# B String --- Location to store estimated regression parameters (the betas)
-# S String --- Location to write the selected features ordered as computed by the algorithm
-# O String " " Location to write the printed statistics; by default is standard output
-# icpt Int 0 Intercept presence, shifting and rescaling the columns of X:
-# 0 = no intercept, no shifting, no rescaling;
-# 1 = add intercept, but neither shift nor rescale X;
-# 2 = add intercept, shift & rescale X columns to mean = 0, variance = 1
-# thr Double 0.01 Threshold to stop the algorithm: if the decrease in the value of AIC falls below thr
-# no further features are being checked and the algorithm stops
-# fmt String "text" Matrix output format for B (the betas) only, usually "text" or "csv"
-# --------------------------------------------------------------------------------------------
-# OUTPUT: Matrix of regression parameters (the betas) and its size depend on icpt input value:
-# OUTPUT SIZE: OUTPUT CONTENTS: HOW TO PREDICT Y FROM X AND B:
-# icpt=0: ncol(X) x 1 Betas for X only Y ~ X %*% B[1:ncol(X), 1], or just X %*% B
-# icpt=1: ncol(X)+1 x 1 Betas for X and intercept Y ~ X %*% B[1:ncol(X), 1] + B[ncol(X)+1, 1]
-# icpt=2: ncol(X)+1 x 2 Col.1: betas for X & intercept Y ~ X %*% B[1:ncol(X), 1] + B[ncol(X)+1, 1]
-# Col.2: betas for shifted/rescaled X and intercept
-#
-# In addition, in the last run of linear regression some statistics are provided in CSV format, one comma-separated
-# name-value pair per each line, as follows:
-#
-# NAME MEANING
-# -------------------------------------------------------------------------------------
-# AVG_TOT_Y Average of the response value Y
-# STDEV_TOT_Y Standard Deviation of the response value Y
-# AVG_RES_Y Average of the residual Y - pred(Y|X), i.e. residual bias
-# STDEV_RES_Y Standard Deviation of the residual Y - pred(Y|X)
-# DISPERSION GLM-style dispersion, i.e. residual sum of squares / # deg. fr.
-# PLAIN_R2 Plain R^2 of residual with bias included vs. total average
-# ADJUSTED_R2 Adjusted R^2 of residual with bias included vs. total average
-# PLAIN_R2_NOBIAS Plain R^2 of residual with bias subtracted vs. total average
-# ADJUSTED_R2_NOBIAS Adjusted R^2 of residual with bias subtracted vs. total average
-# PLAIN_R2_VS_0 * Plain R^2 of residual with bias included vs. zero constant
-# ADJUSTED_R2_VS_0 * Adjusted R^2 of residual with bias included vs. zero constant
-# -------------------------------------------------------------------------------------
-# * The last two statistics are only printed if there is no intercept (icpt=0)
-# If the best AIC is achieved without any features the matrix of selected features contains 0.
-# Moreover, in this case no further statistics will be produced
-#
-# HOW TO INVOKE THIS SCRIPT - EXAMPLE:
-# hadoop jar SystemML.jar -f StepLinearRegDS.dml -nvargs X=INPUT_DIR/X Y=INPUT_DIR/Y B=OUTPUT_DIR/betas
-# O=OUTPUT_DIR/stats S=OUTPUT_DIR/selected icpt=2 thr=0.01 fmt=csv
-
-fileX = $X;
-fileY = $Y;
-fileB = $B;
-fileS = $S;
-
-# currently only the forward selection strategy in supported: start from one feature and iteratively add
-# features until AIC improves
-dir = "forward";
-
-fmt = ifdef ($fmt, "text");
-intercept_status = ifdef ($icpt, 0);
-thr = ifdef ($thr, 0.01);
-
-print ("BEGIN STEPWISE LINEAR REGRESSION SCRIPT");
-print ("Reading X and Y...");
-X_orig = read (fileX);
-y = read (fileY);
-
-n = nrow (X_orig);
-m_orig = ncol (X_orig);
-
-# BEGIN STEPWISE LINEAR REGRESSION
-
-if (dir == "forward") {
-
- continue = TRUE;
- columns_fixed = matrix (0, rows = 1, cols = m_orig);
- columns_fixed_ordered = matrix (0, rows = 1, cols = 1);
-
- # X_global stores the best model found at each step
- X_global = matrix (0, rows = n, cols = 1);
-
- if (intercept_status == 1 | intercept_status == 2) {
- beta = mean (y);
- AIC_best = 2 + n * log(sum((beta - y)^2) / n);
- } else {
- beta = 0;
- AIC_best = n * log(sum(y^2) / n);
- }
- AICs = matrix (AIC_best, rows = 1, cols = m_orig);
- print ("Best AIC without any features: " + AIC_best);
-
- # First pass to examine single features
- parfor (i in 1:m_orig) {
- [AIC_1] = linear_regression (X_orig[,i], y, m_orig, columns_fixed_ordered, " ");
- AICs[1,i] = AIC_1;
- }
-
- # Determine the best AIC
- column_best = 0;
- for (k in 1:m_orig) {
- AIC_cur = as.scalar (AICs[1,k]);
- if ( (AIC_cur < AIC_best) & ((AIC_best - AIC_cur) > abs (thr * AIC_best)) ) {
- column_best = k;
- AIC_best = as.scalar(AICs[1,k]);
- }
- }
-
- if (column_best == 0) {
- print ("AIC of an empty model is " + AIC_best + " and adding no feature achieves more than " + (thr * 100) + "% decrease in AIC!");
- S = matrix (0, rows=1, cols=1);
- if (intercept_status == 0) {
- B = matrix (beta, rows = m_orig, cols = 1);
- } else {
- B_tmp = matrix (0, rows = m_orig + 1, cols = 1);
- B_tmp[m_orig + 1,] = beta;
- B = B_tmp;
- }
- write (S, fileS, format=fmt);
- write (B, fileB, format=fmt);
- stop ("");
- }
- print ("Best AIC " + AIC_best + " achieved with feature: " + column_best);
- columns_fixed[1,column_best] = 1;
- columns_fixed_ordered[1,1] = column_best;
- X_global = X_orig[,column_best];
-
- while (continue) {
- # Subsequent passes over the features
- parfor (i in 1:m_orig) {
- if (as.scalar(columns_fixed[1,i]) == 0) {
-
- # Construct the feature matrix
- X = append (X_global, X_orig[,i]);
-
- [AIC_2] = linear_regression (X, y, m_orig, columns_fixed_ordered, " ");
- AICs[1,i] = AIC_2;
- }
- }
-
- # Determine the best AIC
- for (k in 1:m_orig) {
- AIC_cur = as.scalar (AICs[1,k]);
- if ( (AIC_cur < AIC_best) & ((AIC_best - AIC_cur) > abs (thr * AIC_best)) & (as.scalar(columns_fixed[1,k]) == 0) ) {
- column_best = k;
- AIC_best = as.scalar(AICs[1,k]);
- }
- }
-
- # Append best found features (i.e., columns) to X_global
- if (as.scalar(columns_fixed[1,column_best]) == 0) { # new best feature found
- print ("Best AIC " + AIC_best + " achieved with feature: " + column_best);
- columns_fixed[1,column_best] = 1;
- columns_fixed_ordered = append (columns_fixed_ordered, as.matrix(column_best));
- if (ncol(columns_fixed_ordered) == m_orig) { # all features examined
- X_global = append (X_global, X_orig[,column_best]);
- continue = FALSE;
- } else {
- X_global = append (X_global, X_orig[,column_best]);
- }
- } else {
- continue = FALSE;
- }
- }
-
- # run linear regression with selected set of features
- print ("Running linear regression with selected features...");
- [AIC] = linear_regression (X_global, y, m_orig, columns_fixed_ordered, fileB);
-
-} else {
- stop ("Currently only forward selection strategy is supported!");
-}
-
-
-/*
-* Computes linear regression using a direct solver for (X^T X) beta = X^T y.
-* It also outputs the AIC of the computed model.
-*/
-linear_regression = function (Matrix[Double] X, Matrix[Double] y, Double m_orig, Matrix[Double] Selected, String fileB) return (Double AIC) {
-
- intercept_status = ifdef ($icpt, 0);
- n = nrow (X);
- m = ncol (X);
-
- # Introduce the intercept, shift and rescale the columns of X if needed
- if (intercept_status == 1 | intercept_status == 2) { # add the intercept column
- ones_n = matrix (1, rows = n, cols = 1);
- X = append (X, ones_n);
- m = m - 1;
- }
- m_ext = ncol(X);
-
- if (intercept_status == 2) { # scale-&-shift X columns to mean 0, variance 1
- # Important assumption: X [, m_ext] = ones_n
- avg_X_cols = t(colSums(X)) / n;
- var_X_cols = (t(colSums (X ^ 2)) - n * (avg_X_cols ^ 2)) / (n - 1);
- is_unsafe = ppred (var_X_cols, 0.0, "<=");
- scale_X = 1.0 / sqrt (var_X_cols * (1 - is_unsafe) + is_unsafe);
- scale_X [m_ext, 1] = 1;
- shift_X = - avg_X_cols * scale_X;
- shift_X [m_ext, 1] = 0;
- } else {
- scale_X = matrix (1, rows = m_ext, cols = 1);
- shift_X = matrix (0, rows = m_ext, cols = 1);
- }
-
- # BEGIN THE DIRECT SOLVE ALGORITHM (EXTERNAL CALL)
-
- A = t(X) %*% X;
- b = t(X) %*% y;
- if (intercept_status == 2) {
- A = t(diag (scale_X) %*% A + shift_X %*% A [m_ext, ]);
- A = diag (scale_X) %*% A + shift_X %*% A [m_ext, ];
- b = diag (scale_X) %*% b + shift_X %*% b [m_ext, ];
- }
-
- beta_unscaled = solve (A, b);
-
- # END THE DIRECT SOLVE ALGORITHM
-
- if (intercept_status == 2) {
- beta = scale_X * beta_unscaled;
- beta [m_ext, ] = beta [m_ext, ] + t(shift_X) %*% beta_unscaled;
- } else {
- beta = beta_unscaled;
- }
-
- # COMPUTE AIC
- y_residual = y - X %*% beta;
- ss_res = sum (y_residual ^ 2);
- eq_deg_of_freedom = m_ext;
- AIC = (2 * eq_deg_of_freedom) + n * log (ss_res / n);
-
- if (fileB != " ") {
-
- fileO = ifdef ($O, " ");
- fileS = $S;
-
- print ("Computing the statistics...");
- avg_tot = sum (y) / n;
- ss_tot = sum (y ^ 2);
- ss_avg_tot = ss_tot - n * avg_tot ^ 2;
- var_tot = ss_avg_tot / (n - 1);
-# y_residual = y - X %*% beta;
- avg_res = sum (y_residual) / n;
-# ss_res = sum (y_residual ^ 2);
- ss_avg_res = ss_res - n * avg_res ^ 2;
-
- plain_R2 = 1 - ss_res / ss_avg_tot;
- if (n > m_ext) {
- dispersion = ss_res / (n - m_ext);
- adjusted_R2 = 1 - dispersion / (ss_avg_tot / (n - 1));
- } else {
- dispersion = 0.0 / 0.0;
- adjusted_R2 = 0.0 / 0.0;
- }
-
- plain_R2_nobias = 1 - ss_avg_res / ss_avg_tot;
- deg_freedom = n - m - 1;
- if (deg_freedom > 0) {
- var_res = ss_avg_res / deg_freedom;
- adjusted_R2_nobias = 1 - var_res / (ss_avg_tot / (n - 1));
- } else {
- var_res = 0.0 / 0.0;
- adjusted_R2_nobias = 0.0 / 0.0;
- print ("Warning: zero or negative number of degrees of freedom.");
- }
-
- plain_R2_vs_0 = 1 - ss_res / ss_tot;
- if (n > m) {
- adjusted_R2_vs_0 = 1 - (ss_res / (n - m)) / (ss_tot / n);
- } else {
- adjusted_R2_vs_0 = 0.0 / 0.0;
- }
-
- str = "AVG_TOT_Y," + avg_tot; # Average of the response value Y
- str = append (str, "STDEV_TOT_Y," + sqrt (var_tot)); # Standard Deviation of the response value Y
- str = append (str, "AVG_RES_Y," + avg_res); # Average of the residual Y - pred(Y|X), i.e. residual bias
- str = append (str, "STDEV_RES_Y," + sqrt (var_res)); # Standard Deviation of the residual Y - pred(Y|X)
- str = append (str, "DISPERSION," + dispersion); # GLM-style dispersion, i.e. residual sum of squares / # d.f.
- str = append (str, "PLAIN_R2," + plain_R2); # Plain R^2 of residual with bias included vs. total average
- str = append (str, "ADJUSTED_R2," + adjusted_R2); # Adjusted R^2 of residual with bias included vs. total average
- str = append (str, "PLAIN_R2_NOBIAS," + plain_R2_nobias); # Plain R^2 of residual with bias subtracted vs. total average
- str = append (str, "ADJUSTED_R2_NOBIAS," + adjusted_R2_nobias); # Adjusted R^2 of residual with bias subtracted vs. total average
- if (intercept_status == 0) {
- str = append (str, "PLAIN_R2_VS_0," + plain_R2_vs_0); # Plain R^2 of residual with bias included vs. zero constant
- str = append (str, "ADJUSTED_R2_VS_0," + adjusted_R2_vs_0); # Adjusted R^2 of residual with bias included vs. zero constant
- }
-
- if (fileO != " ") {
- write (str, fileO);
- } else {
- print (str);
- }
-
- # Prepare the output matrix
- print ("Writing the output matrix...");
- if (intercept_status == 2) {
- beta_out = append (beta, beta_unscaled);
- } else {
- beta_out = beta;
- }
-
- # Output which features give the best AIC and are being used for linear regression
- write (Selected, fileS, format=fmt);
-
- no_selected = ncol (Selected);
- max_selected = max (Selected);
- last = max_selected + 1;
-
- if (intercept_status != 0) {
-
- Selected_ext = append (Selected, as.matrix (last));
- P1 = table (seq (1, ncol (Selected_ext)), t(Selected_ext));
-
- if (intercept_status == 2) {
-
- P1_beta = P1 * beta;
- P2_beta = colSums (P1_beta);
- P1_beta_unscaled = P1 * beta_unscaled;
- P2_beta_unscaled = colSums(P1_beta_unscaled);
-
- if (max_selected < m_orig) {
- P2_beta = append (P2_beta, matrix (0, rows=1, cols=(m_orig - max_selected)));
- P2_beta_unscaled = append (P2_beta_unscaled, matrix (0, rows=1, cols=(m_orig - max_selected)));
-
- P2_beta[1, m_orig+1] = P2_beta[1, max_selected + 1];
- P2_beta[1, max_selected + 1] = 0;
-
- P2_beta_unscaled[1, m_orig+1] = P2_beta_unscaled[1, max_selected + 1];
- P2_beta_unscaled[1, max_selected + 1] = 0;
- }
- beta_out = append (t(P2_beta), t(P2_beta_unscaled));
-
- } else {
-
- P1_beta = P1 * beta;
- P2_beta = colSums (P1_beta);
-
- if (max_selected < m_orig) {
- P2_beta = append (P2_beta, matrix (0, rows=1, cols=(m_orig - max_selected)));
- P2_beta[1, m_orig+1] = P2_beta[1, max_selected + 1] ;
- P2_beta[1, max_selected + 1] = 0;
- }
- beta_out = t(P2_beta);
-
- }
- } else {
-
- P1 = table (seq (1, no_selected), t(Selected));
- P1_beta = P1 * beta;
- P2_beta = colSums (P1_beta);
-
- if (max_selected < m_orig) {
- P2_beta = append (P2_beta, matrix (0, rows=1, cols=(m_orig - max_selected)));
- }
-
- beta_out = t(P2_beta);
- }
-
- write ( beta_out, fileB, format=fmt );
- }
-}
-
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+#
+# THIS SCRIPT CHOOSES A LINEAR MODEL IN A STEPWISE ALGIRITHM USING AIC
+# EACH LINEAR REGRESSION USES A DIRECT SOLVER FOR (X^T X) beta = X^T y
+#
+# INPUT PARAMETERS:
+# --------------------------------------------------------------------------------------------
+# NAME TYPE DEFAULT MEANING
+# --------------------------------------------------------------------------------------------
+# X String --- Location (on HDFS) to read the matrix X of feature vectors
+# Y String --- Location (on HDFS) to read the 1-column matrix Y of response values
+# B String --- Location to store estimated regression parameters (the betas)
+# S String --- Location to write the selected features ordered as computed by the algorithm
+# O String " " Location to write the printed statistics; by default is standard output
+# icpt Int 0 Intercept presence, shifting and rescaling the columns of X:
+# 0 = no intercept, no shifting, no rescaling;
+# 1 = add intercept, but neither shift nor rescale X;
+# 2 = add intercept, shift & rescale X columns to mean = 0, variance = 1
+# thr Double 0.01 Threshold to stop the algorithm: if the decrease in the value of AIC falls below thr
+# no further features are being checked and the algorithm stops
+# fmt String "text" Matrix output format for B (the betas) only, usually "text" or "csv"
+# --------------------------------------------------------------------------------------------
+# OUTPUT: Matrix of regression parameters (the betas) and its size depend on icpt input value:
+# OUTPUT SIZE: OUTPUT CONTENTS: HOW TO PREDICT Y FROM X AND B:
+# icpt=0: ncol(X) x 1 Betas for X only Y ~ X %*% B[1:ncol(X), 1], or just X %*% B
+# icpt=1: ncol(X)+1 x 1 Betas for X and intercept Y ~ X %*% B[1:ncol(X), 1] + B[ncol(X)+1, 1]
+# icpt=2: ncol(X)+1 x 2 Col.1: betas for X & intercept Y ~ X %*% B[1:ncol(X), 1] + B[ncol(X)+1, 1]
+# Col.2: betas for shifted/rescaled X and intercept
+#
+# In addition, in the last run of linear regression some statistics are provided in CSV format, one comma-separated
+# name-value pair per each line, as follows:
+#
+# NAME MEANING
+# -------------------------------------------------------------------------------------
+# AVG_TOT_Y Average of the response value Y
+# STDEV_TOT_Y Standard Deviation of the response value Y
+# AVG_RES_Y Average of the residual Y - pred(Y|X), i.e. residual bias
+# STDEV_RES_Y Standard Deviation of the residual Y - pred(Y|X)
+# DISPERSION GLM-style dispersion, i.e. residual sum of squares / # deg. fr.
+# PLAIN_R2 Plain R^2 of residual with bias included vs. total average
+# ADJUSTED_R2 Adjusted R^2 of residual with bias included vs. total average
+# PLAIN_R2_NOBIAS Plain R^2 of residual with bias subtracted vs. total average
+# ADJUSTED_R2_NOBIAS Adjusted R^2 of residual with bias subtracted vs. total average
+# PLAIN_R2_VS_0 * Plain R^2 of residual with bias included vs. zero constant
+# ADJUSTED_R2_VS_0 * Adjusted R^2 of residual with bias included vs. zero constant
+# -------------------------------------------------------------------------------------
+# * The last two statistics are only printed if there is no intercept (icpt=0)
+# If the best AIC is achieved without any features the matrix of selected features contains 0.
+# Moreover, in this case no further statistics will be produced
+#
+# HOW TO INVOKE THIS SCRIPT - EXAMPLE:
+# hadoop jar SystemML.jar -f StepLinearRegDS.dml -nvargs X=INPUT_DIR/X Y=INPUT_DIR/Y B=OUTPUT_DIR/betas
+# O=OUTPUT_DIR/stats S=OUTPUT_DIR/selected icpt=2 thr=0.01 fmt=csv
+
+fileX = $X;
+fileY = $Y;
+fileB = $B;
+fileS = $S;
+
+# currently only the forward selection strategy in supported: start from one feature and iteratively add
+# features until AIC improves
+dir = "forward";
+
+fmt = ifdef ($fmt, "text");
+intercept_status = ifdef ($icpt, 0);
+thr = ifdef ($thr, 0.01);
+
+print ("BEGIN STEPWISE LINEAR REGRESSION SCRIPT");
+print ("Reading X and Y...");
+X_orig = read (fileX);
+y = read (fileY);
+
+n = nrow (X_orig);
+m_orig = ncol (X_orig);
+
+# BEGIN STEPWISE LINEAR REGRESSION
+
+if (dir == "forward") {
+
+ continue = TRUE;
+ columns_fixed = matrix (0, rows = 1, cols = m_orig);
+ columns_fixed_ordered = matrix (0, rows = 1, cols = 1);
+
+ # X_global stores the best model found at each step
+ X_global = matrix (0, rows = n, cols = 1);
+
+ if (intercept_status == 1 | intercept_status == 2) {
+ beta = mean (y);
+ AIC_best = 2 + n * log(sum((beta - y)^2) / n);
+ } else {
+ beta = 0;
+ AIC_best = n * log(sum(y^2) / n);
+ }
+ AICs = matrix (AIC_best, rows = 1, cols = m_orig);
+ print ("Best AIC without any features: " + AIC_best);
+
+ # First pass to examine single features
+ parfor (i in 1:m_orig) {
+ [AIC_1] = linear_regression (X_orig[,i], y, m_orig, columns_fixed_ordered, " ");
+ AICs[1,i] = AIC_1;
+ }
+
+ # Determine the best AIC
+ column_best = 0;
+ for (k in 1:m_orig) {
+ AIC_cur = as.scalar (AICs[1,k]);
+ if ( (AIC_cur < AIC_best) & ((AIC_best - AIC_cur) > abs (thr * AIC_best)) ) {
+ column_best = k;
+ AIC_best = as.scalar(AICs[1,k]);
+ }
+ }
+
+ if (column_best == 0) {
+ print ("AIC of an empty model is " + AIC_best + " and adding no feature achieves more than " + (thr * 100) + "% decrease in AIC!");
+ S = matrix (0, rows=1, cols=1);
+ if (intercept_status == 0) {
+ B = matrix (beta, rows = m_orig, cols = 1);
+ } else {
+ B_tmp = matrix (0, rows = m_orig + 1, cols = 1);
+ B_tmp[m_orig + 1,] = beta;
+ B = B_tmp;
+ }
+ write (S, fileS, format=fmt);
+ write (B, fileB, format=fmt);
+ stop ("");
+ }
+ print ("Best AIC " + AIC_best + " achieved with feature: " + column_best);
+ columns_fixed[1,column_best] = 1;
+ columns_fixed_ordered[1,1] = column_best;
+ X_global = X_orig[,column_best];
+
+ while (continue) {
+ # Subsequent passes over the features
+ parfor (i in 1:m_orig) {
+ if (as.scalar(columns_fixed[1,i]) == 0) {
+
+ # Construct the feature matrix
+ X = append (X_global, X_orig[,i]);
+
+ [AIC_2] = linear_regression (X, y, m_orig, columns_fixed_ordered, " ");
+ AICs[1,i] = AIC_2;
+ }
+ }
+
+ # Determine the best AIC
+ for (k in 1:m_orig) {
+ AIC_cur = as.scalar (AICs[1,k]);
+ if ( (AIC_cur < AIC_best) & ((AIC_best - AIC_cur) > abs (thr * AIC_best)) & (as.scalar(columns_fixed[1,k]) == 0) ) {
+ column_best = k;
+ AIC_best = as.scalar(AICs[1,k]);
+ }
+ }
+
+ # Append best found features (i.e., columns) to X_global
+ if (as.scalar(columns_fixed[1,column_best]) == 0) { # new best feature found
+ print ("Best AIC " + AIC_best + " achieved with feature: " + column_best);
+ columns_fixed[1,column_best] = 1;
+ columns_fixed_ordered = append (columns_fixed_ordered, as.matrix(column_best));
+ if (ncol(columns_fixed_ordered) == m_orig) { # all features examined
+ X_global = append (X_global, X_orig[,column_best]);
+ continue = FALSE;
+ } else {
+ X_global = append (X_global, X_orig[,column_best]);
+ }
+ } else {
+ continue = FALSE;
+ }
+ }
+
+ # run linear regression with selected set of features
+ print ("Running linear regression with selected features...");
+ [AIC] = linear_regression (X_global, y, m_orig, columns_fixed_ordered, fileB);
+
+} else {
+ stop ("Currently only forward selection strategy is supported!");
+}
+
+
+/*
+* Computes linear regression using a direct solver for (X^T X) beta = X^T y.
+* It also outputs the AIC of the computed model.
+*/
+linear_regression = function (Matrix[Double] X, Matrix[Double] y, Double m_orig, Matrix[Double] Selected, String fileB) return (Double AIC) {
+
+ intercept_status = ifdef ($icpt, 0);
+ n = nrow (X);
+ m = ncol (X);
+
+ # Introduce the intercept, shift and rescale the columns of X if needed
+ if (intercept_status == 1 | intercept_status == 2) { # add the intercept column
+ ones_n = matrix (1, rows = n, cols = 1);
+ X = append (X, ones_n);
+ m = m - 1;
+ }
+ m_ext = ncol(X);
+
+ if (intercept_status == 2) { # scale-&-shift X columns to mean 0, variance 1
+ # Important assumption: X [, m_ext] = ones_n
+ avg_X_cols = t(colSums(X)) / n;
+ var_X_cols = (t(colSums (X ^ 2)) - n * (avg_X_cols ^ 2)) / (n - 1);
+ is_unsafe = ppred (var_X_cols, 0.0, "<=");
+ scale_X = 1.0 / sqrt (var_X_cols * (1 - is_unsafe) + is_unsafe);
+ scale_X [m_ext, 1] = 1;
+ shift_X = - avg_X_cols * scale_X;
+ shift_X [m_ext, 1] = 0;
+ } else {
+ scale_X = matrix (1, rows = m_ext, cols = 1);
+ shift_X = matrix (0, rows = m_ext, cols = 1);
+ }
+
+ # BEGIN THE DIRECT SOLVE ALGORITHM (EXTERNAL CALL)
+
+ A = t(X) %*% X;
+ b = t(X) %*% y;
+ if (intercept_status == 2) {
+ A = t(diag (scale_X) %*% A + shift_X %*% A [m_ext, ]);
+ A = diag (scale_X) %*% A + shift_X %*% A [m_ext, ];
+ b = diag (scale_X) %*% b + shift_X %*% b [m_ext, ];
+ }
+
+ beta_unscaled = solve (A, b);
+
+ # END THE DIRECT SOLVE ALGORITHM
+
+ if (intercept_status == 2) {
+ beta = scale_X * beta_unscaled;
+ beta [m_ext, ] = beta [m_ext, ] + t(shift_X) %*% beta_unscaled;
+ } else {
+ beta = beta_unscaled;
+ }
+
+ # COMPUTE AIC
+ y_residual = y - X %*% beta;
+ ss_res = sum (y_residual ^ 2);
+ eq_deg_of_freedom = m_ext;
+ AIC = (2 * eq_deg_of_freedom) + n * log (ss_res / n);
+
+ if (fileB != " ") {
+
+ fileO = ifdef ($O, " ");
+ fileS = $S;
+
+ print ("Computing the statistics...");
+ avg_tot = sum (y) / n;
+ ss_tot = sum (y ^ 2);
+ ss_avg_tot = ss_tot - n * avg_tot ^ 2;
+ var_tot = ss_avg_tot / (n - 1);
+# y_residual = y - X %*% beta;
+ avg_res = sum (y_residual) / n;
+# ss_res = sum (y_residual ^ 2);
+ ss_avg_res = ss_res - n * avg_res ^ 2;
+
+ plain_R2 = 1 - ss_res / ss_avg_tot;
+ if (n > m_ext) {
+ dispersion = ss_res / (n - m_ext);
+ adjusted_R2 = 1 - dispersion / (ss_avg_tot / (n - 1));
+ } else {
+ dispersion = 0.0 / 0.0;
+ adjusted_R2 = 0.0 / 0.0;
+ }
+
+ plain_R2_nobias = 1 - ss_avg_res / ss_avg_tot;
+ deg_freedom = n - m - 1;
+ if (deg_freedom > 0) {
+ var_res = ss_avg_res / deg_freedom;
+ adjusted_R2_nobias = 1 - var_res / (ss_avg_tot / (n - 1));
+ } else {
+ var_res = 0.0 / 0.0;
+ adjusted_R2_nobias = 0.0 / 0.0;
+ print ("Warning: zero or negative number of degrees of freedom.");
+ }
+
+ plain_R2_vs_0 = 1 - ss_res / ss_tot;
+ if (n > m) {
+ adjusted_R2_vs_0 = 1 - (ss_res / (n - m)) / (ss_tot / n);
+ } else {
+ adjusted_R2_vs_0 = 0.0 / 0.0;
+ }
+
+ str = "AVG_TOT_Y," + avg_tot; # Average of the response value Y
+ str = append (str, "STDEV_TOT_Y," + sqrt (var_tot)); # Standard Deviation of the response value Y
+ str = append (str, "AVG_RES_Y," + avg_res); # Average of the residual Y - pred(Y|X), i.e. residual bias
+ str = append (str, "STDEV_RES_Y," + sqrt (var_res)); # Standard Deviation of the residual Y - pred(Y|X)
+ str = append (str, "DISPERSION," + dispersion); # GLM-style dispersion, i.e. residual sum of squares / # d.f.
+ str = append (str, "PLAIN_R2," + plain_R2); # Plain R^2 of residual with bias included vs. total average
+ str = append (str, "ADJUSTED_R2," + adjusted_R2); # Adjusted R^2 of residual with bias included vs. total average
+ str = append (str, "PLAIN_R2_NOBIAS," + plain_R2_nobias); # Plain R^2 of residual with bias subtracted vs. total average
+ str = append (str, "ADJUSTED_R2_NOBIAS," + adjusted_R2_nobias); # Adjusted R^2 of residual with bias subtracted vs. total average
+ if (intercept_status == 0) {
+ str = append (str, "PLAIN_R2_VS_0," + plain_R2_vs_0); # Plain R^2 of residual with bias included vs. zero constant
+ str = append (str, "ADJUSTED_R2_VS_0," + adjusted_R2_vs_0); # Adjusted R^2 of residual with bias included vs. zero constant
+ }
+
+ if (fileO != " ") {
+ write (str, fileO);
+ } else {
+ print (str);
+ }
+
+ # Prepare the output matrix
+ print ("Writing the output matrix...");
+ if (intercept_status == 2) {
+ beta_out = append (beta, beta_unscaled);
+ } else {
+ beta_out = beta;
+ }
+
+ # Output which features give the best AIC and are being used for linear regression
+ write (Selected, fileS, format=fmt);
+
+ no_selected = ncol (Selected);
+ max_selected = max (Selected);
+ last = max_selected + 1;
+
+ if (intercept_status != 0) {
+
+ Selected_ext = append (Selected, as.matrix (last));
+ P1 = table (seq (1, ncol (Selected_ext)), t(Selected_ext));
+
+ if (intercept_status == 2) {
+
+ P1_beta = P1 * beta;
+ P2_beta = colSums (P1_beta);
+ P1_beta_unscaled = P1 * beta_unscaled;
+ P2_beta_unscaled = colSums(P1_beta_unscaled);
+
+ if (max_selected < m_orig) {
+ P2_beta = append (P2_beta, matrix (0, rows=1, cols=(m_orig - max_selected)));
+ P2_beta_unscaled = append (P2_beta_unscaled, matrix (0, rows=1, cols=(m_orig - max_selected)));
+
+ P2_beta[1, m_orig+1] = P2_beta[1, max_selected + 1];
+ P2_beta[1, max_selected + 1] = 0;
+
+ P2_beta_unscaled[1, m_orig+1] = P2_beta_unscaled[1, max_selected + 1];
+ P2_beta_unscaled[1, max_selected + 1] = 0;
+ }
+ beta_out = append (t(P2_beta), t(P2_beta_unscaled));
+
+ } else {
+
+ P1_beta = P1 * beta;
+ P2_beta = colSums (P1_beta);
+
+ if (max_selected < m_orig) {
+ P2_beta = append (P2_beta, matrix (0, rows=1, cols=(m_orig - max_selected)));
+ P2_beta[1, m_orig+1] = P2_beta[1, max_selected + 1] ;
+ P2_beta[1, max_selected + 1] = 0;
+ }
+ beta_out = t(P2_beta);
+
+ }
+ } else {
+
+ P1 = table (seq (1, no_selected), t(Selected));
+ P1_beta = P1 * beta;
+ P2_beta = colSums (P1_beta);
+
+ if (max_selected < m_orig) {
+ P2_beta = append (P2_beta, matrix (0, rows=1, cols=(m_orig - max_selected)));
+ }
+
+ beta_out = t(P2_beta);
+ }
+
+ write ( beta_out, fileB, format=fmt );
+ }
+}
+
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/816e2db8/scripts/algorithms/Univar-Stats.dml
----------------------------------------------------------------------
diff --git a/scripts/algorithms/Univar-Stats.dml b/scripts/algorithms/Univar-Stats.dml
index abb3fea..62d6a28 100644
--- a/scripts/algorithms/Univar-Stats.dml
+++ b/scripts/algorithms/Univar-Stats.dml
@@ -1,150 +1,150 @@
-#-------------------------------------------------------------
-#
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-#
-#-------------------------------------------------------------
-
-#
-# DML Script to compute univariate statistics for all attributes
-# in a given data set
-#
-# Three inputs:
-# $1) X - input data
-# $2) TYPES - row matrix that denotes the "kind"/"type" of all attributes
-# kind=1 for scale,
-# kind=2 for nominal,
-# kind=3 for ordinal
-#
-# One output:
-# $STATS) output directory in which following three statistics
-# files are created
-# + base.stats - matrix with all 17 statistics (14 scale,
-# 3 categorical) computed for all attributes
-# + categorical.counts - matrix in which each column
-# gives the category-wise counts for all categories in
-# that attribute
-#
-#
-
-A = read($X); # data file
-K = read($TYPES); # attribute kind file
-
-# number of features/attributes
-n = ncol(A);
-
-# number of data records
-m = nrow(A);
-
-# number of statistics
-numBaseStats = 17; # (14 scale stats, 3 categorical stats)
-
-max_kind = max(K);
-
-# matrices to store computed statistics
-baseStats = matrix(0, rows=numBaseStats, cols=n);
-
-# Compute max domain size among all categorical attributes
-maxs = colMaxs(A);
-maxDomainSize = max( ppred(K, 1, ">") * maxs );
-maxDomain = as.integer(maxDomainSize);
-
-
-parfor(i in 1:n, check=0) {
-
- # project out the i^th column
- F = A[,i];
-
- kind = castAsScalar(K[1,i]);
-
- if ( kind == 1 ) {
- #print("[" + i + "] Scale");
- # compute SCALE statistics on the projected column
- minimum = min(F);
- maximum = max(F);
- rng = maximum - minimum;
-
- mu = mean(F);
- m2 = moment(F, 2);
- m3 = moment(F, 3);
- m4 = moment(F, 4);
-
- var = m/(m-1.0)*m2;
- std_dev = sqrt(var);
- se = std_dev/sqrt(m);
- cv = std_dev/mu;
-
- g1 = m3/(std_dev^3);
- g2 = m4/(std_dev^4) - 3;
- #se_g1=sqrt( 6*m*(m-1.0) / ((m-2.0)*(m+1.0)*(m+3.0)) );
- se_g1=sqrt( (6/(m-2.0)) * (m/(m+1.0)) * ((m-1.0)/(m+3.0)) );
-
- #se_g2= sqrt( (4*(m^2-1)*se_g1^2)/((m+5.0)*(m-3.0)) );
- se_g2=sqrt( (4/(m+5.0)) * ((m^2-1)/(m-3.0)) * se_g1^2 );
-
- md = median(F); #quantile(F, 0.5);
- iqm = interQuartileMean(F);
-
- # place the computed statistics in output matrices
- baseStats[1,i] = minimum;
- baseStats[2,i] = maximum;
- baseStats[3,i] = rng;
-
- baseStats[4,i] = mu;
- baseStats[5,i] = var;
- baseStats[6,i] = std_dev;
- baseStats[7,i] = se;
- baseStats[8,i] = cv;
-
- baseStats[9,i] = g1;
- baseStats[10,i] = g2;
- baseStats[11,i] = se_g1;
- baseStats[12,i] = se_g2;
-
- baseStats[13,i] = md;
- baseStats[14,i] = iqm;
- }
- else {
- if (kind == 2 | kind == 3) {
- #print("[" + i + "] Categorical");
-
- # check if the categorical column has valid values
- minF = min(F);
- if (minF <=0) {
- print("ERROR: Categorical attributes can only take values starting from 1. Encountered a value " + minF + " in attribute " + i);
- }
- else {
- # compute CATEGORICAL statistics on the projected column
- num_cat = max(F); # number of categories
- cat_counts = table(F,1, maxDomain, 1); # counts for each category
-
- mode = rowIndexMax(t(cat_counts));
- mx = max(cat_counts)
- modeArr = ppred(cat_counts, mx, "==")
- numModes = sum(modeArr);
-
- # place the computed statistics in output matrices
- baseStats[15,i] = num_cat;
- baseStats[16,i] = mode;
- baseStats[17,i] = numModes;
- }
- }
- }
-}
-
-write(baseStats, $STATS);
-
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+#
+# DML Script to compute univariate statistics for all attributes
+# in a given data set
+#
+# Three inputs:
+# $1) X - input data
+# $2) TYPES - row matrix that denotes the "kind"/"type" of all attributes
+# kind=1 for scale,
+# kind=2 for nominal,
+# kind=3 for ordinal
+#
+# One output:
+# $STATS) output directory in which following three statistics
+# files are created
+# + base.stats - matrix with all 17 statistics (14 scale,
+# 3 categorical) computed for all attributes
+# + categorical.counts - matrix in which each column
+# gives the category-wise counts for all categories in
+# that attribute
+#
+#
+
+A = read($X); # data file
+K = read($TYPES); # attribute kind file
+
+# number of features/attributes
+n = ncol(A);
+
+# number of data records
+m = nrow(A);
+
+# number of statistics
+numBaseStats = 17; # (14 scale stats, 3 categorical stats)
+
+max_kind = max(K);
+
+# matrices to store computed statistics
+baseStats = matrix(0, rows=numBaseStats, cols=n);
+
+# Compute max domain size among all categorical attributes
+maxs = colMaxs(A);
+maxDomainSize = max( ppred(K, 1, ">") * maxs );
+maxDomain = as.integer(maxDomainSize);
+
+
+parfor(i in 1:n, check=0) {
+
+ # project out the i^th column
+ F = A[,i];
+
+ kind = castAsScalar(K[1,i]);
+
+ if ( kind == 1 ) {
+ #print("[" + i + "] Scale");
+ # compute SCALE statistics on the projected column
+ minimum = min(F);
+ maximum = max(F);
+ rng = maximum - minimum;
+
+ mu = mean(F);
+ m2 = moment(F, 2);
+ m3 = moment(F, 3);
+ m4 = moment(F, 4);
+
+ var = m/(m-1.0)*m2;
+ std_dev = sqrt(var);
+ se = std_dev/sqrt(m);
+ cv = std_dev/mu;
+
+ g1 = m3/(std_dev^3);
+ g2 = m4/(std_dev^4) - 3;
+ #se_g1=sqrt( 6*m*(m-1.0) / ((m-2.0)*(m+1.0)*(m+3.0)) );
+ se_g1=sqrt( (6/(m-2.0)) * (m/(m+1.0)) * ((m-1.0)/(m+3.0)) );
+
+ #se_g2= sqrt( (4*(m^2-1)*se_g1^2)/((m+5.0)*(m-3.0)) );
+ se_g2=sqrt( (4/(m+5.0)) * ((m^2-1)/(m-3.0)) * se_g1^2 );
+
+ md = median(F); #quantile(F, 0.5);
+ iqm = interQuartileMean(F);
+
+ # place the computed statistics in output matrices
+ baseStats[1,i] = minimum;
+ baseStats[2,i] = maximum;
+ baseStats[3,i] = rng;
+
+ baseStats[4,i] = mu;
+ baseStats[5,i] = var;
+ baseStats[6,i] = std_dev;
+ baseStats[7,i] = se;
+ baseStats[8,i] = cv;
+
+ baseStats[9,i] = g1;
+ baseStats[10,i] = g2;
+ baseStats[11,i] = se_g1;
+ baseStats[12,i] = se_g2;
+
+ baseStats[13,i] = md;
+ baseStats[14,i] = iqm;
+ }
+ else {
+ if (kind == 2 | kind == 3) {
+ #print("[" + i + "] Categorical");
+
+ # check if the categorical column has valid values
+ minF = min(F);
+ if (minF <=0) {
+ print("ERROR: Categorical attributes can only take values starting from 1. Encountered a value " + minF + " in attribute " + i);
+ }
+ else {
+ # compute CATEGORICAL statistics on the projected column
+ num_cat = max(F); # number of categories
+ cat_counts = table(F,1, maxDomain, 1); # counts for each category
+
+ mode = rowIndexMax(t(cat_counts));
+ mx = max(cat_counts)
+ modeArr = ppred(cat_counts, mx, "==")
+ numModes = sum(modeArr);
+
+ # place the computed statistics in output matrices
+ baseStats[15,i] = num_cat;
+ baseStats[16,i] = mode;
+ baseStats[17,i] = numModes;
+ }
+ }
+ }
+}
+
+write(baseStats, $STATS);
+
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/816e2db8/scripts/algorithms/bivar-stats.dml
----------------------------------------------------------------------
diff --git a/scripts/algorithms/bivar-stats.dml b/scripts/algorithms/bivar-stats.dml
index 4846f56..99549dc 100644
--- a/scripts/algorithms/bivar-stats.dml
+++ b/scripts/algorithms/bivar-stats.dml
@@ -1,398 +1,398 @@
-#-------------------------------------------------------------
-#
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-#
-#-------------------------------------------------------------
-
-#
-#
-# For a given pair of attribute sets, compute bivariate statistics between all attribute pairs
-# Given, index1 = {A_11, A_12, ... A_1m} and index2 = {A_21, A_22, ... A_2n}
-# compute bivariate stats for m*n pairs (A_1i, A_2j), (1<= i <=m) and (1<= j <=n)
-#
-# Six inputs:
-# 1) X - input data
-# 2) index1 - First attribute set {A_11, A_12, ... A_1m}
-# 3) index2 - Second attribute set {A_21, A_22, ... A_2n}
-# 4) types1 - kind for attributes in S1
-# 5) types2 - kind for attributes in S2
-# kind=1 for scale, kind=2 for nominal, kind=3 for ordinal
-#
-# One output:
-# 6) output directory in which following (maximum of) four statistics files are created
-# + bivar.scale.scale.stats - matrix containing scale-scale correlations
-# + bivar.nominal.nominal.stats -
-# + bivar.nominal.scale.stats -
-# + bivar.ordinal.ordinal.stats -
-#
-# hadoop jar SystemML.jar -f bivar-stats.dml -nvargs X=<Data>
-# index1=<Feature Index Set 1>
-# index2=<Feature Index Set 2>
-# types1=<Feature Types 1>
-# types2=<Feature Types 2>
-# OUTDIR=<Output Location>
-
-D = read($X); # input data set
-S1 = read($index1); # attribute set 1
-S2 = read($index2); # attribute set 2
-K1 = read($types1); # kind for attributes in S1
-K2 = read($types2); # kind for attributes in S2
-
-s1size = ncol(S1);
-s2size = ncol(S2);
-numPairs = s1size * s2size;
-
-#test: 1 is Pearson'R, 2 is F-test, 3 is chi-squared, 4 is Spearman'sRho
-# R, (chisq, df, pval, cramersv,) spearman, eta, anovaf, feature_col_index1, feature_col_index2, test
-
-num_scale_scale_tests = 0
-num_nominal_nominal_tests = 0
-num_ordinal_ordinal_tests = 0
-num_nominal_scale_tests = 0
-
-pair2row = matrix(0, rows=numPairs, cols=2)
-for( i in 1:s1size, check=0) {
- pre_a1 = castAsScalar(S1[1,i]);
- pre_k1 = castAsScalar(K1[1,i]);
-
- for( j in 1:s2size, check=0) {
- pre_pairID = (i-1)*s2size+j;
- pre_a2 = castAsScalar(S2[1,j]);
- pre_k2 = castAsScalar(K2[1,j]);
-
- if (pre_k1 == pre_k2) {
- if (pre_k1 == 1) {
- num_scale_scale_tests = num_scale_scale_tests + 1
- pair2row[pre_pairID,1] = num_scale_scale_tests
- } else {
- num_nominal_nominal_tests = num_nominal_nominal_tests + 1
- pair2row[pre_pairID,1] = num_nominal_nominal_tests
-
- if ( pre_k1 == 3 ) {
- num_ordinal_ordinal_tests = num_ordinal_ordinal_tests + 1
- pair2row[pre_pairID, 2] = num_ordinal_ordinal_tests
- }
- }
- }
- else {
- if (pre_k1 == 1 | pre_k2 == 1) {
- num_nominal_scale_tests = num_nominal_scale_tests + 1
- pair2row[pre_pairID,1] = num_nominal_scale_tests
- } else {
- num_nominal_nominal_tests = num_nominal_nominal_tests + 1
- pair2row[pre_pairID,1] = num_nominal_nominal_tests
- }
- }
- }
-}
-
-size_scale_scale_tests = max(num_scale_scale_tests, 1);
-size_nominal_nominal_tests = max(num_nominal_nominal_tests, 1)
-size_ordinal_ordinal_tests = max(num_ordinal_ordinal_tests, 1);
-size_nominal_scale_tests = max(num_nominal_scale_tests, 1);
-
-basestats = matrix(0, rows=11, cols=numPairs);
-basestats_scale_scale = matrix(0, rows=6, cols=size_scale_scale_tests)
-basestats_nominal_nominal = matrix(0, rows=6, cols=size_nominal_nominal_tests)
-basestats_ordinal_ordinal = matrix(0, rows=3, cols=size_ordinal_ordinal_tests)
-basestats_nominal_scale = matrix(0, rows=11, cols=size_nominal_scale_tests)
-
-
-# Compute max domain size among all categorical attributes
-# and check if these cols have been recoded
-
-debug_str = "Stopping execution of DML script due to invalid input";
-
-error_flag = FALSE;
-
-maxs = colMaxs(D);
-mins = colMins(D)
-maxDomainSize = -1.0;
-for(k in 1:ncol(K1) ) {
- type = as.scalar(K1[1,k]);
-
- if ( type > 1) {
- colID = as.scalar(S1[1,k]);
-
- colMaximum = as.scalar(maxs[1,colID]);
- if(maxDomainSize < colMaximum) maxDomainSize = colMaximum;
-
- colMinimum = as.scalar(mins[1,colID]);
- if(colMinimum < 1){
- if(type == 2)
- debug_str = append(debug_str, "Column " + colID + " was declared as nominal but its minimum value is " + colMinimum)
- else
- debug_str = append(debug_str, "Column " + colID + " was declared as ordinal but its minimum value is " + colMinimum)
- error_flag = TRUE;
- }
- }
-}
-
-for(k in 1:ncol(K2) ) {
- type = as.scalar(K2[1,k]);
-
- if ( type > 1) {
- colID = as.scalar(S2[1,k]);
-
- colMaximum = as.scalar(maxs[1,colID]);
- if(maxDomainSize < colMaximum) maxDomainSize = colMaximum;
-
- colMinimum = as.scalar(mins[1,colID]);
- if(colMinimum < 1){
- if(type == 2)
- debug_str = append(debug_str, "Column " + colID + " was declared as nominal but its minimum value is " + colMinimum)
- else
- debug_str = append(debug_str, "Column " + colID + " was declared as ordinal but its minimum value is " + colMinimum)
- error_flag = TRUE;
- }
- }
-}
-maxDomain = as.integer(maxDomainSize);
-
-if(error_flag) stop(debug_str);
-
-parfor( i in 1:s1size, check=0) {
- a1 = castAsScalar(S1[1,i]);
- k1 = castAsScalar(K1[1,i]);
- A1 = D[,a1];
-
- parfor( j in 1:s2size, check=0) {
- pairID = (i-1)*s2size+j;
- a2 = castAsScalar(S2[1,j]);
- k2 = castAsScalar(K2[1,j]);
- A2 = D[,a2];
-
- rowid1 = castAsScalar(pair2row[pairID, 1])
- rowid2 = castAsScalar(pair2row[pairID, 2])
-
- if (k1 == k2) {
- if (k1 == 1) {
- # scale-scale
- print("[" + i + "," + j + "] scale-scale");
- [r, cov, sigma1, sigma2] = bivar_ss(A1,A2);
-
- basestats_scale_scale[1,rowid1] = a1;
- basestats_scale_scale[2,rowid1] = a2;
- basestats_scale_scale[3,rowid1] = r;
- basestats_scale_scale[4,rowid1] = cov;
- basestats_scale_scale[5,rowid1] = sigma1;
- basestats_scale_scale[6,rowid1] = sigma2;
- } else {
- # nominal-nominal or ordinal-ordinal
- print("[" + i + "," + j + "] categorical-categorical");
- [chisq, df, pval, cramersv] = bivar_cc(A1, A2, maxDomain);
-
- basestats_nominal_nominal[1,rowid1] = a1;
- basestats_nominal_nominal[2,rowid1] = a2;
- basestats_nominal_nominal[3,rowid1] = chisq;
- basestats_nominal_nominal[4,rowid1] = df;
- basestats_nominal_nominal[5,rowid1] = pval;
- basestats_nominal_nominal[6,rowid1] = cramersv;
-
- if ( k1 == 3 ) {
- # ordinal-ordinal
- print("[" + i + "," + j + "] ordinal-ordinal");
- sp = bivar_oo(A1, A2, maxDomain);
-
- basestats_ordinal_ordinal[1,rowid2] = a1;
- basestats_ordinal_ordinal[2,rowid2] = a2;
- basestats_ordinal_ordinal[3,rowid2] = sp;
- }
- }
- } else {
- if (k1 == 1 | k2 == 1) {
- # Scale-nominal/ordinal
- print("[" + i + "," + j + "] scale-categorical");
-
- if ( k1 == 1 ) {
- [eta, f, pval, bw_ss, within_ss, bw_df, within_df, bw_mean_square, within_mean_square] = bivar_sc(A1, A2, maxDomain);
- } else {
- [eta, f, pval, bw_ss, within_ss, bw_df, within_df, bw_mean_square, within_mean_square] = bivar_sc(A2, A1, maxDomain);
- }
-
- basestats_nominal_scale[1,rowid1] = a1;
- basestats_nominal_scale[2,rowid1] = a2;
- basestats_nominal_scale[3,rowid1] = eta;
- basestats_nominal_scale[4,rowid1] = f;
- basestats_nominal_scale[5,rowid1] = pval;
- basestats_nominal_scale[6,rowid1] = bw_ss;
- basestats_nominal_scale[7,rowid1] = within_ss;
- basestats_nominal_scale[8,rowid1] = bw_df;
- basestats_nominal_scale[9,rowid1] = within_df;
- basestats_nominal_scale[10,rowid1] = bw_mean_square;
- basestats_nominal_scale[11,rowid1] = within_mean_square;
- } else {
- # nominal-ordinal or ordinal-nominal
- print("[" + i + "," + j + "] categorical-categorical");
- [chisq, df, pval, cramersv] = bivar_cc(A1, A2, maxDomain);
-
- basestats_nominal_nominal[1,rowid1] = a1;
- basestats_nominal_nominal[2,rowid1] = a2;
- basestats_nominal_nominal[3,rowid1] = chisq;
- basestats_nominal_nominal[4,rowid1] = df;
- basestats_nominal_nominal[5,rowid1] = pval;
- basestats_nominal_nominal[6,rowid1] = cramersv;
- }
- }
- }
-}
-
-if(num_scale_scale_tests == size_scale_scale_tests){
- write(basestats_scale_scale, $OUTDIR + "/bivar.scale.scale.stats");
-}
-
-if(num_nominal_scale_tests == size_nominal_scale_tests){
- write(basestats_nominal_scale, $OUTDIR + "/bivar.nominal.scale.stats");
-}
-
-if(num_nominal_nominal_tests == size_nominal_nominal_tests){
- write(basestats_nominal_nominal, $OUTDIR + "/bivar.nominal.nominal.stats");
-}
-
-if(num_ordinal_ordinal_tests == size_ordinal_ordinal_tests){
- write(basestats_ordinal_ordinal, $OUTDIR + "/bivar.ordinal.ordinal.stats");
-}
-
-# -----------------------------------------------------------------------------------------------------------
-
-bivar_cc = function(Matrix[Double] A, Matrix[Double] B, Double maxDomain) return (Double chisq, Double df, Double pval, Double cramersv) {
-
- # Contingency Table
- F = table(A, B, maxDomain, maxDomain);
- F = F[1:max(A), 1:max(B)];
-
- # Chi-Squared
- W = sum(F);
- r = rowSums(F);
- c = colSums(F);
- E = (r %*% c)/W;
- T = (F-E)^2/E;
- chi_squared = sum(T);
-
- # compute p-value
- degFreedom = (nrow(F)-1)*(ncol(F)-1);
- pValue = pchisq(target=chi_squared, df=degFreedom, lower.tail=FALSE);
-
- # Cramer's V
- R = nrow(F);
- C = ncol(F);
- q = min(R,C);
- cramers_v = sqrt(chi_squared/(W*(q-1)));
-
- # Assign return values
- chisq = chi_squared;
- df = as.double(degFreedom);
- pval = pValue;
- cramersv = cramers_v;
-}
-
-# -----------------------------------------------------------------------------------------------------------
-
-bivar_ss = function(Matrix[Double] X, Matrix[Double] Y) return (Double R, Double covXY, Double sigmaX, Double sigmaY) {
-
- # Unweighted co-variance
- covXY = cov(X,Y);
-
- # compute standard deviations for both X and Y by computing 2^nd central moment
- W = nrow(X);
- m2X = moment(X,2);
- m2Y = moment(Y,2);
- sigmaX = sqrt(m2X * (W/(W-1.0)) );
- sigmaY = sqrt(m2Y * (W/(W-1.0)) );
-
- # Pearson's R
- R = covXY / (sigmaX*sigmaY);
-}
-
-# -----------------------------------------------------------------------------------------------------------
-
-# Y points to SCALE variable
-# A points to CATEGORICAL variable
-bivar_sc = function(Matrix[Double] Y, Matrix[Double] A, Double maxDomain)
- return (Double Eta, Double AnovaF, Double pval, Double bw_ss, Double within_ss, Double bw_df, Double within_df, Double bw_mean_square, Double within_mean_square) {
-
- # mean and variance in target variable
- W = nrow(A);
- my = mean(Y);
- varY = moment(Y,2) * W/(W-1.0)
-
- # category-wise (frequencies, means, variances)
- CFreqs = aggregate(target=Y, groups=A, fn="count", ngroups=maxDomain);
- CMeans = aggregate(target=Y, groups=A, fn="mean", ngroups=maxDomain);
- CVars = aggregate(target=Y, groups=A, fn="variance", ngroups=maxDomain);
-
- # number of categories
- R = nrow(CFreqs);
-
- Eta = sqrt(1 - ( sum((CFreqs-1)*CVars) / ((W-1)*varY) ));
-
- bw_ss = sum( (CFreqs*(CMeans-my)^2) );
- bw_df = as.double(R-1);
- bw_mean_square = bw_ss/bw_df;
-
- within_ss = sum( (CFreqs-1)*CVars );
- within_df = as.double(W-R);
- within_mean_square = within_ss/within_df;
-
- AnovaF = bw_mean_square/within_mean_square;
-
- pval = pf(target=AnovaF, df1=bw_df, df2=within_df, lower.tail=FALSE)
-}
-
-
-# -----------------------------------------------------------------------------------------------------------
-# Function to compute ranks
-# takes a column vector as input, and produces a vector of same size in which each cell denotes to the computed score for that category
-computeRanks = function(Matrix[Double] X) return (Matrix[Double] Ranks) {
- Ranks = cumsum(X) - X/2 + 1/2;
-}
-
-#-------------------------------------------------------------------------
-
-bivar_oo = function(Matrix[Double] A, Matrix[Double] B, Double maxDomain) return (Double sp) {
-
- # compute contingency table
- F = table(A, B, maxDomain, maxDomain);
- F = F[1:max(A), 1:max(B)];
-
- catA = nrow(F); # number of categories in A
- catB = ncol(F); # number of categories in B
-
- # compute category-wise counts for both the attributes
- R = rowSums(F);
- S = colSums(F);
-
- # compute scores, both are column vectors
- [C] = computeRanks(R);
- meanX = mean(C,R);
-
- columnS = t(S);
- [D] = computeRanks(columnS);
-
- # scores (C,D) are individual values, and counts (R,S) act as weights
- meanY = mean(D,columnS);
-
- W = sum(F); # total weight, or total #cases
- varX = moment(C,R,2)*(W/(W-1.0));
- varY = moment(D,columnS,2)*(W/(W-1.0));
- covXY = sum( t(F/(W-1) * (C-meanX)) * (D-meanY) );
-
- sp = covXY/(sqrt(varX)*sqrt(varY));
-}
-
-# -----------------------------------------------------------------------------------------------------------
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+#
+#
+# For a given pair of attribute sets, compute bivariate statistics between all attribute pairs
+# Given, index1 = {A_11, A_12, ... A_1m} and index2 = {A_21, A_22, ... A_2n}
+# compute bivariate stats for m*n pairs (A_1i, A_2j), (1<= i <=m) and (1<= j <=n)
+#
+# Six inputs:
+# 1) X - input data
+# 2) index1 - First attribute set {A_11, A_12, ... A_1m}
+# 3) index2 - Second attribute set {A_21, A_22, ... A_2n}
+# 4) types1 - kind for attributes in S1
+# 5) types2 - kind for attributes in S2
+# kind=1 for scale, kind=2 for nominal, kind=3 for ordinal
+#
+# One output:
+# 6) output directory in which following (maximum of) four statistics files are created
+# + bivar.scale.scale.stats - matrix containing scale-scale correlations
+# + bivar.nominal.nominal.stats -
+# + bivar.nominal.scale.stats -
+# + bivar.ordinal.ordinal.stats -
+#
+# hadoop jar SystemML.jar -f bivar-stats.dml -nvargs X=<Data>
+# index1=<Feature Index Set 1>
+# index2=<Feature Index Set 2>
+# types1=<Feature Types 1>
+# types2=<Feature Types 2>
+# OUTDIR=<Output Location>
+
+D = read($X); # input data set
+S1 = read($index1); # attribute set 1
+S2 = read($index2); # attribute set 2
+K1 = read($types1); # kind for attributes in S1
+K2 = read($types2); # kind for attributes in S2
+
+s1size = ncol(S1);
+s2size = ncol(S2);
+numPairs = s1size * s2size;
+
+#test: 1 is Pearson'R, 2 is F-test, 3 is chi-squared, 4 is Spearman'sRho
+# R, (chisq, df, pval, cramersv,) spearman, eta, anovaf, feature_col_index1, feature_col_index2, test
+
+num_scale_scale_tests = 0
+num_nominal_nominal_tests = 0
+num_ordinal_ordinal_tests = 0
+num_nominal_scale_tests = 0
+
+pair2row = matrix(0, rows=numPairs, cols=2)
+for( i in 1:s1size, check=0) {
+ pre_a1 = castAsScalar(S1[1,i]);
+ pre_k1 = castAsScalar(K1[1,i]);
+
+ for( j in 1:s2size, check=0) {
+ pre_pairID = (i-1)*s2size+j;
+ pre_a2 = castAsScalar(S2[1,j]);
+ pre_k2 = castAsScalar(K2[1,j]);
+
+ if (pre_k1 == pre_k2) {
+ if (pre_k1 == 1) {
+ num_scale_scale_tests = num_scale_scale_tests + 1
+ pair2row[pre_pairID,1] = num_scale_scale_tests
+ } else {
+ num_nominal_nominal_tests = num_nominal_nominal_tests + 1
+ pair2row[pre_pairID,1] = num_nominal_nominal_tests
+
+ if ( pre_k1 == 3 ) {
+ num_ordinal_ordinal_tests = num_ordinal_ordinal_tests + 1
+ pair2row[pre_pairID, 2] = num_ordinal_ordinal_tests
+ }
+ }
+ }
+ else {
+ if (pre_k1 == 1 | pre_k2 == 1) {
+ num_nominal_scale_tests = num_nominal_scale_tests + 1
+ pair2row[pre_pairID,1] = num_nominal_scale_tests
+ } else {
+ num_nominal_nominal_tests = num_nominal_nominal_tests + 1
+ pair2row[pre_pairID,1] = num_nominal_nominal_tests
+ }
+ }
+ }
+}
+
+size_scale_scale_tests = max(num_scale_scale_tests, 1);
+size_nominal_nominal_tests = max(num_nominal_nominal_tests, 1)
+size_ordinal_ordinal_tests = max(num_ordinal_ordinal_tests, 1);
+size_nominal_scale_tests = max(num_nominal_scale_tests, 1);
+
+basestats = matrix(0, rows=11, cols=numPairs);
+basestats_scale_scale = matrix(0, rows=6, cols=size_scale_scale_tests)
+basestats_nominal_nominal = matrix(0, rows=6, cols=size_nominal_nominal_tests)
+basestats_ordinal_ordinal = matrix(0, rows=3, cols=size_ordinal_ordinal_tests)
+basestats_nominal_scale = matrix(0, rows=11, cols=size_nominal_scale_tests)
+
+
+# Compute max domain size among all categorical attributes
+# and check if these cols have been recoded
+
+debug_str = "Stopping execution of DML script due to invalid input";
+
+error_flag = FALSE;
+
+maxs = colMaxs(D);
+mins = colMins(D)
+maxDomainSize = -1.0;
+for(k in 1:ncol(K1) ) {
+ type = as.scalar(K1[1,k]);
+
+ if ( type > 1) {
+ colID = as.scalar(S1[1,k]);
+
+ colMaximum = as.scalar(maxs[1,colID]);
+ if(maxDomainSize < colMaximum) maxDomainSize = colMaximum;
+
+ colMinimum = as.scalar(mins[1,colID]);
+ if(colMinimum < 1){
+ if(type == 2)
+ debug_str = append(debug_str, "Column " + colID + " was declared as nominal but its minimum value is " + colMinimum)
+ else
+ debug_str = append(debug_str, "Column " + colID + " was declared as ordinal but its minimum value is " + colMinimum)
+ error_flag = TRUE;
+ }
+ }
+}
+
+for(k in 1:ncol(K2) ) {
+ type = as.scalar(K2[1,k]);
+
+ if ( type > 1) {
+ colID = as.scalar(S2[1,k]);
+
+ colMaximum = as.scalar(maxs[1,colID]);
+ if(maxDomainSize < colMaximum) maxDomainSize = colMaximum;
+
+ colMinimum = as.scalar(mins[1,colID]);
+ if(colMinimum < 1){
+ if(type == 2)
+ debug_str = append(debug_str, "Column " + colID + " was declared as nominal but its minimum value is " + colMinimum)
+ else
+ debug_str = append(debug_str, "Column " + colID + " was declared as ordinal but its minimum value is " + colMinimum)
+ error_flag = TRUE;
+ }
+ }
+}
+maxDomain = as.integer(maxDomainSize);
+
+if(error_flag) stop(debug_str);
+
+parfor( i in 1:s1size, check=0) {
+ a1 = castAsScalar(S1[1,i]);
+ k1 = castAsScalar(K1[1,i]);
+ A1 = D[,a1];
+
+ parfor( j in 1:s2size, check=0) {
+ pairID = (i-1)*s2size+j;
+ a2 = castAsScalar(S2[1,j]);
+ k2 = castAsScalar(K2[1,j]);
+ A2 = D[,a2];
+
+ rowid1 = castAsScalar(pair2row[pairID, 1])
+ rowid2 = castAsScalar(pair2row[pairID, 2])
+
+ if (k1 == k2) {
+ if (k1 == 1) {
+ # scale-scale
+ print("[" + i + "," + j + "] scale-scale");
+ [r, cov, sigma1, sigma2] = bivar_ss(A1,A2);
+
+ basestats_scale_scale[1,rowid1] = a1;
+ basestats_scale_scale[2,rowid1] = a2;
+ basestats_scale_scale[3,rowid1] = r;
+ basestats_scale_scale[4,rowid1] = cov;
+ basestats_scale_scale[5,rowid1] = sigma1;
+ basestats_scale_scale[6,rowid1] = sigma2;
+ } else {
+ # nominal-nominal or ordinal-ordinal
+ print("[" + i + "," + j + "] categorical-categorical");
+ [chisq, df, pval, cramersv] = bivar_cc(A1, A2, maxDomain);
+
+ basestats_nominal_nominal[1,rowid1] = a1;
+ basestats_nominal_nominal[2,rowid1] = a2;
+ basestats_nominal_nominal[3,rowid1] = chisq;
+ basestats_nominal_nominal[4,rowid1] = df;
+ basestats_nominal_nominal[5,rowid1] = pval;
+ basestats_nominal_nominal[6,rowid1] = cramersv;
+
+ if ( k1 == 3 ) {
+ # ordinal-ordinal
+ print("[" + i + "," + j + "] ordinal-ordinal");
+ sp = bivar_oo(A1, A2, maxDomain);
+
+ basestats_ordinal_ordinal[1,rowid2] = a1;
+ basestats_ordinal_ordinal[2,rowid2] = a2;
+ basestats_ordinal_ordinal[3,rowid2] = sp;
+ }
+ }
+ } else {
+ if (k1 == 1 | k2 == 1) {
+ # Scale-nominal/ordinal
+ print("[" + i + "," + j + "] scale-categorical");
+
+ if ( k1 == 1 ) {
+ [eta, f, pval, bw_ss, within_ss, bw_df, within_df, bw_mean_square, within_mean_square] = bivar_sc(A1, A2, maxDomain);
+ } else {
+ [eta, f, pval, bw_ss, within_ss, bw_df, within_df, bw_mean_square, within_mean_square] = bivar_sc(A2, A1, maxDomain);
+ }
+
+ basestats_nominal_scale[1,rowid1] = a1;
+ basestats_nominal_scale[2,rowid1] = a2;
+ basestats_nominal_scale[3,rowid1] = eta;
+ basestats_nominal_scale[4,rowid1] = f;
+ basestats_nominal_scale[5,rowid1] = pval;
+ basestats_nominal_scale[6,rowid1] = bw_ss;
+ basestats_nominal_scale[7,rowid1] = within_ss;
+ basestats_nominal_scale[8,rowid1] = bw_df;
+ basestats_nominal_scale[9,rowid1] = within_df;
+ basestats_nominal_scale[10,rowid1] = bw_mean_square;
+ basestats_nominal_scale[11,rowid1] = within_mean_square;
+ } else {
+ # nominal-ordinal or ordinal-nominal
+ print("[" + i + "," + j + "] categorical-categorical");
+ [chisq, df, pval, cramersv] = bivar_cc(A1, A2, maxDomain);
+
+ basestats_nominal_nominal[1,rowid1] = a1;
+ basestats_nominal_nominal[2,rowid1] = a2;
+ basestats_nominal_nominal[3,rowid1] = chisq;
+ basestats_nominal_nominal[4,rowid1] = df;
+ basestats_nominal_nominal[5,rowid1] = pval;
+ basestats_nominal_nominal[6,rowid1] = cramersv;
+ }
+ }
+ }
+}
+
+if(num_scale_scale_tests == size_scale_scale_tests){
+ write(basestats_scale_scale, $OUTDIR + "/bivar.scale.scale.stats");
+}
+
+if(num_nominal_scale_tests == size_nominal_scale_tests){
+ write(basestats_nominal_scale, $OUTDIR + "/bivar.nominal.scale.stats");
+}
+
+if(num_nominal_nominal_tests == size_nominal_nominal_tests){
+ write(basestats_nominal_nominal, $OUTDIR + "/bivar.nominal.nominal.stats");
+}
+
+if(num_ordinal_ordinal_tests == size_ordinal_ordinal_tests){
+ write(basestats_ordinal_ordinal, $OUTDIR + "/bivar.ordinal.ordinal.stats");
+}
+
+# -----------------------------------------------------------------------------------------------------------
+
+bivar_cc = function(Matrix[Double] A, Matrix[Double] B, Double maxDomain) return (Double chisq, Double df, Double pval, Double cramersv) {
+
+ # Contingency Table
+ F = table(A, B, maxDomain, maxDomain);
+ F = F[1:max(A), 1:max(B)];
+
+ # Chi-Squared
+ W = sum(F);
+ r = rowSums(F);
+ c = colSums(F);
+ E = (r %*% c)/W;
+ T = (F-E)^2/E;
+ chi_squared = sum(T);
+
+ # compute p-value
+ degFreedom = (nrow(F)-1)*(ncol(F)-1);
+ pValue = pchisq(target=chi_squared, df=degFreedom, lower.tail=FALSE);
+
+ # Cramer's V
+ R = nrow(F);
+ C = ncol(F);
+ q = min(R,C);
+ cramers_v = sqrt(chi_squared/(W*(q-1)));
+
+ # Assign return values
+ chisq = chi_squared;
+ df = as.double(degFreedom);
+ pval = pValue;
+ cramersv = cramers_v;
+}
+
+# -----------------------------------------------------------------------------------------------------------
+
+bivar_ss = function(Matrix[Double] X, Matrix[Double] Y) return (Double R, Double covXY, Double sigmaX, Double sigmaY) {
+
+ # Unweighted co-variance
+ covXY = cov(X,Y);
+
+ # compute standard deviations for both X and Y by computing 2^nd central moment
+ W = nrow(X);
+ m2X = moment(X,2);
+ m2Y = moment(Y,2);
+ sigmaX = sqrt(m2X * (W/(W-1.0)) );
+ sigmaY = sqrt(m2Y * (W/(W-1.0)) );
+
+ # Pearson's R
+ R = covXY / (sigmaX*sigmaY);
+}
+
+# -----------------------------------------------------------------------------------------------------------
+
+# Y points to SCALE variable
+# A points to CATEGORICAL variable
+bivar_sc = function(Matrix[Double] Y, Matrix[Double] A, Double maxDomain)
+ return (Double Eta, Double AnovaF, Double pval, Double bw_ss, Double within_ss, Double bw_df, Double within_df, Double bw_mean_square, Double within_mean_square) {
+
+ # mean and variance in target variable
+ W = nrow(A);
+ my = mean(Y);
+ varY = moment(Y,2) * W/(W-1.0)
+
+ # category-wise (frequencies, means, variances)
+ CFreqs = aggregate(target=Y, groups=A, fn="count", ngroups=maxDomain);
+ CMeans = aggregate(target=Y, groups=A, fn="mean", ngroups=maxDomain);
+ CVars = aggregate(target=Y, groups=A, fn="variance", ngroups=maxDomain);
+
+ # number of categories
+ R = nrow(CFreqs);
+
+ Eta = sqrt(1 - ( sum((CFreqs-1)*CVars) / ((W-1)*varY) ));
+
+ bw_ss = sum( (CFreqs*(CMeans-my)^2) );
+ bw_df = as.double(R-1);
+ bw_mean_square = bw_ss/bw_df;
+
+ within_ss = sum( (CFreqs-1)*CVars );
+ within_df = as.double(W-R);
+ within_mean_square = within_ss/within_df;
+
+ AnovaF = bw_mean_square/within_mean_square;
+
+ pval = pf(target=AnovaF, df1=bw_df, df2=within_df, lower.tail=FALSE)
+}
+
+
+# -----------------------------------------------------------------------------------------------------------
+# Function to compute ranks
+# takes a column vector as input, and produces a vector of same size in which each cell denotes to the computed score for that category
+computeRanks = function(Matrix[Double] X) return (Matrix[Double] Ranks) {
+ Ranks = cumsum(X) - X/2 + 1/2;
+}
+
+#-------------------------------------------------------------------------
+
+bivar_oo = function(Matrix[Double] A, Matrix[Double] B, Double maxDomain) return (Double sp) {
+
+ # compute contingency table
+ F = table(A, B, maxDomain, maxDomain);
+ F = F[1:max(A), 1:max(B)];
+
+ catA = nrow(F); # number of categories in A
+ catB = ncol(F); # number of categories in B
+
+ # compute category-wise counts for both the attributes
+ R = rowSums(F);
+ S = colSums(F);
+
+ # compute scores, both are column vectors
+ [C] = computeRanks(R);
+ meanX = mean(C,R);
+
+ columnS = t(S);
+ [D] = computeRanks(columnS);
+
+ # scores (C,D) are individual values, and counts (R,S) act as weights
+ meanY = mean(D,columnS);
+
+ W = sum(F); # total weight, or total #cases
+ varX = moment(C,R,2)*(W/(W-1.0));
+ varY = moment(D,columnS,2)*(W/(W-1.0));
+ covXY = sum( t(F/(W-1) * (C-meanX)) * (D-meanY) );
+
+ sp = covXY/(sqrt(varX)*sqrt(varY));
+}
+
+# -----------------------------------------------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/816e2db8/scripts/algorithms/decision-tree-predict.dml
----------------------------------------------------------------------
diff --git a/scripts/algorithms/decision-tree-predict.dml b/scripts/algorithms/decision-tree-predict.dml
index 9e01adb..3447da6 100644
--- a/scripts/algorithms/decision-tree-predict.dml
+++ b/scripts/algorithms/decision-tree-predict.dml
@@ -1,142 +1,142 @@
-#-------------------------------------------------------------
-#
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-#
-#-------------------------------------------------------------
-
-#
-# THIS SCRIPT COMPUTES LABEL PREDICTIONS MEANT FOR USE WITH A DECISION TREE MODEL ON A HELD OUT TEST SET.
-#
-# INPUT PARAMETERS:
-# ---------------------------------------------------------------------------------------------
-# NAME TYPE DEFAULT MEANING
-# ---------------------------------------------------------------------------------------------
-# X String --- Location to read the test feature matrix X; note that X needs to be both recoded and dummy coded
-# Y String " " Location to read the true label matrix Y if requested; note that Y needs to be both recoded and dummy coded
-# R String " " Location to read matrix R which for each feature in X contains the following information
-# - R[,1]: column ids
-# - R[,2]: start indices
-# - R[,3]: end indices
-# If R is not provided by default all variables are assumed to be scale
-# M String --- Location to read matrix M containing the learned tree in the following format
-# - M[1,j]: id of node j (in a complete binary tree)
-# - M[2,j]: Offset (no. of columns) to left child of j if j is an internal node, otherwise 0
-# - M[3,j]: Feature index of the feature that node j looks at if j is an internal node, otherwise 0
-# - M[4,j]: Type of the feature that node j looks at if j is an internal node: 1 for scale and 2 for categorical features,
-# otherwise the label that leaf node j is supposed to predict
-# - M[5,j]: If j is an internal node: 1 if the feature chosen for j is scale, otherwise the size of the subset of values
-# stored in rows 6,7,... if j is categorical
-# If j is a leaf node: number of misclassified samples reaching at node j
-# - M[6:,j]: If j is an internal node: Threshold the example's feature value is compared to is stored at M[6,j]
-# if the feature chosen for j is scale, otherwise if the feature chosen for j is categorical rows 6,7,...
-# depict the value subset chosen for j
-# If j is a leaf node 1 if j is impure and the number of samples at j > threshold, otherwise 0
-# P String --- Location to store the label predictions for X
-# A String " " Location to write the test accuracy (%) for the prediction if requested
-# CM String " " Location to write the confusion matrix if requested
-# fmt String "text" The output format of the output, such as "text" or "csv"
-# ---------------------------------------------------------------------------------------------
-# OUTPUT:
-# 1- Matrix Y containing the predicted labels for X
-# 2- Test accuracy if requested
-# 3- Confusion matrix C if requested
-# -------------------------------------------------------------------------------------------
-# HOW TO INVOKE THIS SCRIPT - EXAMPLE:
-# hadoop jar SystemML.jar -f decision-tree-predict.dml -nvargs X=INPUT_DIR/X Y=INPUT_DIR/Y R=INPUT_DIR/R M=INPUT_DIR/model P=OUTPUT_DIR/predictions
-# A=OUTPUT_DIR/accuracy CM=OUTPUT_DIR/confusion fmt=csv
-
-fileX = $X;
-fileM = $M;
-fileP = $P;
-fileY = ifdef ($Y, " ");
-fileR = ifdef ($R, " ");
-fileCM = ifdef ($CM, " ");
-fileA = ifdef ($A, " ");
-fmtO = ifdef ($fmt, "text");
-X_test = read (fileX);
-M = read (fileM);
-
-num_records = nrow (X_test);
-Y_predicted = matrix (0, rows = num_records, cols = 1);
-
-R_cat = matrix (0, rows = 1, cols = 1);
-R_scale = matrix (0, rows = 1, cols = 1);
-
-if (fileR != " ") {
- R = read (fileR);
- dummy_coded = ppred (R[,2], R[,3], "!=");
- R_scale = removeEmpty (target = R[,2] * (1 - dummy_coded), margin = "rows");
- R_cat = removeEmpty (target = R[,2:3] * dummy_coded, margin = "rows");
-} else { # only scale features available
- R_scale = seq (1, ncol (X_test));
-}
-
-parfor (i in 1:num_records, check = 0) {
- cur_sample = X_test[i,];
- cur_node_pos = 1;
- label_found = FALSE;
- while (!label_found) {
- cur_feature = as.scalar (M[3,cur_node_pos]);
- type_label = as.scalar (M[4,cur_node_pos]);
- if (cur_feature == 0) { # leaf node
- label_found = TRUE;
- Y_predicted[i,] = type_label;
- } else {
- # determine type: 1 for scale, 2 for categorical
- if (type_label == 1) { # scale feature
- cur_start_ind = as.scalar (R_scale[cur_feature,]);
- cur_value = as.scalar (cur_sample[,cur_start_ind]);
- cur_split = as.scalar (M[6,cur_node_pos]);
- if (cur_value < cur_split) { # go to left branch
- cur_node_pos = cur_node_pos + as.scalar (M[2,cur_node_pos]);
- } else { # go to right branch
- cur_node_pos = cur_node_pos + as.scalar (M[2,cur_node_pos]) + 1;
- }
- } else if (type_label == 2) { # categorical feature
- cur_start_ind = as.scalar (R_cat[cur_feature,1]);
- cur_end_ind = as.scalar (R_cat[cur_feature,2]);
- cur_value = as.scalar (rowIndexMax(cur_sample[,cur_start_ind:cur_end_ind])); # as.scalar (cur_sample[,cur_feature]);
- cur_offset = as.scalar (M[5,cur_node_pos]);
- value_found = sum (ppred (M[6:(6 + cur_offset - 1),cur_node_pos], cur_value, "=="));
- if (value_found) { # go to left branch
- cur_node_pos = cur_node_pos + as.scalar (M[2,cur_node_pos]);
- } else { # go to right branch
- cur_node_pos = cur_node_pos + as.scalar (M[2,cur_node_pos]) + 1;
- }
-}}}}
-
-write (Y_predicted, fileP, format = fmtO);
-
-if (fileY != " ") {
- Y_test = read (fileY);
- num_classes = ncol (Y_test);
- Y_test = rowSums (Y_test * t (seq (1, num_classes)));
- result = ppred (Y_test, Y_predicted, "==");
- result = sum (result);
- accuracy = result / num_records * 100;
- acc_str = "Accuracy (%): " + accuracy;
- if (fileA != " ") {
- write (acc_str, fileA, format = fmtO);
- } else {
- print (acc_str);
- }
- if (fileCM != " ") {
- confusion_mat = table(Y_predicted, Y_test, num_classes, num_classes)
- write(confusion_mat, fileCM, format = fmtO)
- }
-}
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+#
+# THIS SCRIPT COMPUTES LABEL PREDICTIONS MEANT FOR USE WITH A DECISION TREE MODEL ON A HELD OUT TEST SET.
+#
+# INPUT PARAMETERS:
+# ---------------------------------------------------------------------------------------------
+# NAME TYPE DEFAULT MEANING
+# ---------------------------------------------------------------------------------------------
+# X String --- Location to read the test feature matrix X; note that X needs to be both recoded and dummy coded
+# Y String " " Location to read the true label matrix Y if requested; note that Y needs to be both recoded and dummy coded
+# R String " " Location to read matrix R which for each feature in X contains the following information
+# - R[,1]: column ids
+# - R[,2]: start indices
+# - R[,3]: end indices
+# If R is not provided by default all variables are assumed to be scale
+# M String --- Location to read matrix M containing the learned tree in the following format
+# - M[1,j]: id of node j (in a complete binary tree)
+# - M[2,j]: Offset (no. of columns) to left child of j if j is an internal node, otherwise 0
+# - M[3,j]: Feature index of the feature that node j looks at if j is an internal node, otherwise 0
+# - M[4,j]: Type of the feature that node j looks at if j is an internal node: 1 for scale and 2 for categorical features,
+# otherwise the label that leaf node j is supposed to predict
+# - M[5,j]: If j is an internal node: 1 if the feature chosen for j is scale, otherwise the size of the subset of values
+# stored in rows 6,7,... if j is categorical
+# If j is a leaf node: number of misclassified samples reaching at node j
+# - M[6:,j]: If j is an internal node: Threshold the example's feature value is compared to is stored at M[6,j]
+# if the feature chosen for j is scale, otherwise if the feature chosen for j is categorical rows 6,7,...
+# depict the value subset chosen for j
+# If j is a leaf node 1 if j is impure and the number of samples at j > threshold, otherwise 0
+# P String --- Location to store the label predictions for X
+# A String " " Location to write the test accuracy (%) for the prediction if requested
+# CM String " " Location to write the confusion matrix if requested
+# fmt String "text" The output format of the output, such as "text" or "csv"
+# ---------------------------------------------------------------------------------------------
+# OUTPUT:
+# 1- Matrix Y containing the predicted labels for X
+# 2- Test accuracy if requested
+# 3- Confusion matrix C if requested
+# -------------------------------------------------------------------------------------------
+# HOW TO INVOKE THIS SCRIPT - EXAMPLE:
+# hadoop jar SystemML.jar -f decision-tree-predict.dml -nvargs X=INPUT_DIR/X Y=INPUT_DIR/Y R=INPUT_DIR/R M=INPUT_DIR/model P=OUTPUT_DIR/predictions
+# A=OUTPUT_DIR/accuracy CM=OUTPUT_DIR/confusion fmt=csv
+
+fileX = $X;
+fileM = $M;
+fileP = $P;
+fileY = ifdef ($Y, " ");
+fileR = ifdef ($R, " ");
+fileCM = ifdef ($CM, " ");
+fileA = ifdef ($A, " ");
+fmtO = ifdef ($fmt, "text");
+X_test = read (fileX);
+M = read (fileM);
+
+num_records = nrow (X_test);
+Y_predicted = matrix (0, rows = num_records, cols = 1);
+
+R_cat = matrix (0, rows = 1, cols = 1);
+R_scale = matrix (0, rows = 1, cols = 1);
+
+if (fileR != " ") {
+ R = read (fileR);
+ dummy_coded = ppred (R[,2], R[,3], "!=");
+ R_scale = removeEmpty (target = R[,2] * (1 - dummy_coded), margin = "rows");
+ R_cat = removeEmpty (target = R[,2:3] * dummy_coded, margin = "rows");
+} else { # only scale features available
+ R_scale = seq (1, ncol (X_test));
+}
+
+parfor (i in 1:num_records, check = 0) {
+ cur_sample = X_test[i,];
+ cur_node_pos = 1;
+ label_found = FALSE;
+ while (!label_found) {
+ cur_feature = as.scalar (M[3,cur_node_pos]);
+ type_label = as.scalar (M[4,cur_node_pos]);
+ if (cur_feature == 0) { # leaf node
+ label_found = TRUE;
+ Y_predicted[i,] = type_label;
+ } else {
+ # determine type: 1 for scale, 2 for categorical
+ if (type_label == 1) { # scale feature
+ cur_start_ind = as.scalar (R_scale[cur_feature,]);
+ cur_value = as.scalar (cur_sample[,cur_start_ind]);
+ cur_split = as.scalar (M[6,cur_node_pos]);
+ if (cur_value < cur_split) { # go to left branch
+ cur_node_pos = cur_node_pos + as.scalar (M[2,cur_node_pos]);
+ } else { # go to right branch
+ cur_node_pos = cur_node_pos + as.scalar (M[2,cur_node_pos]) + 1;
+ }
+ } else if (type_label == 2) { # categorical feature
+ cur_start_ind = as.scalar (R_cat[cur_feature,1]);
+ cur_end_ind = as.scalar (R_cat[cur_feature,2]);
+ cur_value = as.scalar (rowIndexMax(cur_sample[,cur_start_ind:cur_end_ind])); # as.scalar (cur_sample[,cur_feature]);
+ cur_offset = as.scalar (M[5,cur_node_pos]);
+ value_found = sum (ppred (M[6:(6 + cur_offset - 1),cur_node_pos], cur_value, "=="));
+ if (value_found) { # go to left branch
+ cur_node_pos = cur_node_pos + as.scalar (M[2,cur_node_pos]);
+ } else { # go to right branch
+ cur_node_pos = cur_node_pos + as.scalar (M[2,cur_node_pos]) + 1;
+ }
+}}}}
+
+write (Y_predicted, fileP, format = fmtO);
+
+if (fileY != " ") {
+ Y_test = read (fileY);
+ num_classes = ncol (Y_test);
+ Y_test = rowSums (Y_test * t (seq (1, num_classes)));
+ result = ppred (Y_test, Y_predicted, "==");
+ result = sum (result);
+ accuracy = result / num_records * 100;
+ acc_str = "Accuracy (%): " + accuracy;
+ if (fileA != " ") {
+ write (acc_str, fileA, format = fmtO);
+ } else {
+ print (acc_str);
+ }
+ if (fileCM != " ") {
+ confusion_mat = table(Y_predicted, Y_test, num_classes, num_classes)
+ write(confusion_mat, fileCM, format = fmtO)
+ }
+}