You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@systemml.apache.org by du...@apache.org on 2016/01/26 02:13:07 UTC

[43/55] [partial] incubator-systemml git commit: [SYSTEMML-482] [SYSTEMML-480] Adding a Git attributes file to enfore Unix-styled line endings, and normalizing all of the line endings.

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/816e2db8/scripts/algorithms/StepLinearRegDS.dml
----------------------------------------------------------------------
diff --git a/scripts/algorithms/StepLinearRegDS.dml b/scripts/algorithms/StepLinearRegDS.dml
index afd94ed..953402f 100644
--- a/scripts/algorithms/StepLinearRegDS.dml
+++ b/scripts/algorithms/StepLinearRegDS.dml
@@ -1,388 +1,388 @@
-#-------------------------------------------------------------
-#
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-# 
-#   http://www.apache.org/licenses/LICENSE-2.0
-# 
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-#
-#-------------------------------------------------------------
-
-#
-# THIS SCRIPT CHOOSES A LINEAR MODEL IN A STEPWISE ALGIRITHM USING AIC
-# EACH LINEAR REGRESSION USES A DIRECT SOLVER FOR (X^T X) beta = X^T y
-#
-# INPUT PARAMETERS:
-# --------------------------------------------------------------------------------------------
-# NAME    TYPE    DEFAULT    MEANING
-# --------------------------------------------------------------------------------------------
-# X       String   	---      Location (on HDFS) to read the matrix X of feature vectors
-# Y       String   	---      Location (on HDFS) to read the 1-column matrix Y of response values
-# B       String   	---      Location to store estimated regression parameters (the betas)
-# S       String    ---      Location to write the selected features ordered as computed by the algorithm
-# O       String   	" "      Location to write the printed statistics; by default is standard output
-# icpt    Int        0       Intercept presence, shifting and rescaling the columns of X:
-#                            0 = no intercept, no shifting, no rescaling;
-#                            1 = add intercept, but neither shift nor rescale X;
-#                            2 = add intercept, shift & rescale X columns to mean = 0, variance = 1
-# thr     Double    0.01     Threshold to stop the algorithm: if the decrease in the value of AIC falls below thr
-#                            no further features are being checked and the algorithm stops 
-# fmt     String   "text"    Matrix output format for B (the betas) only, usually "text" or "csv"
-# --------------------------------------------------------------------------------------------
-# OUTPUT: Matrix of regression parameters (the betas) and its size depend on icpt input value:
-#         OUTPUT SIZE:   OUTPUT CONTENTS:                HOW TO PREDICT Y FROM X AND B:
-# icpt=0: ncol(X)   x 1  Betas for X only                Y ~ X %*% B[1:ncol(X), 1], or just X %*% B
-# icpt=1: ncol(X)+1 x 1  Betas for X and intercept       Y ~ X %*% B[1:ncol(X), 1] + B[ncol(X)+1, 1]
-# icpt=2: ncol(X)+1 x 2  Col.1: betas for X & intercept  Y ~ X %*% B[1:ncol(X), 1] + B[ncol(X)+1, 1]
-#                        Col.2: betas for shifted/rescaled X and intercept
-#
-# In addition, in the last run of linear regression some statistics are provided in CSV format, one comma-separated
-# name-value pair per each line, as follows:
-#
-# NAME                  MEANING
-# -------------------------------------------------------------------------------------
-# AVG_TOT_Y             Average of the response value Y
-# STDEV_TOT_Y           Standard Deviation of the response value Y
-# AVG_RES_Y             Average of the residual Y - pred(Y|X), i.e. residual bias
-# STDEV_RES_Y           Standard Deviation of the residual Y - pred(Y|X)
-# DISPERSION            GLM-style dispersion, i.e. residual sum of squares / # deg. fr.
-# PLAIN_R2              Plain R^2 of residual with bias included vs. total average
-# ADJUSTED_R2           Adjusted R^2 of residual with bias included vs. total average
-# PLAIN_R2_NOBIAS       Plain R^2 of residual with bias subtracted vs. total average
-# ADJUSTED_R2_NOBIAS    Adjusted R^2 of residual with bias subtracted vs. total average
-# PLAIN_R2_VS_0         * Plain R^2 of residual with bias included vs. zero constant
-# ADJUSTED_R2_VS_0      * Adjusted R^2 of residual with bias included vs. zero constant
-# -------------------------------------------------------------------------------------
-# * The last two statistics are only printed if there is no intercept (icpt=0)
-# If the best AIC is achieved without any features the matrix of selected features contains 0.  
-# Moreover, in this case no further statistics will be produced  
-#
-# HOW TO INVOKE THIS SCRIPT - EXAMPLE:
-# hadoop jar SystemML.jar -f StepLinearRegDS.dml -nvargs X=INPUT_DIR/X Y=INPUT_DIR/Y B=OUTPUT_DIR/betas
-#     O=OUTPUT_DIR/stats S=OUTPUT_DIR/selected icpt=2 thr=0.01 fmt=csv
-
-fileX = $X;
-fileY = $Y;
-fileB = $B;
-fileS = $S;
-
-# currently only the forward selection strategy in supported: start from one feature and iteratively add 
-# features until AIC improves
-dir = "forward";
-
-fmt  = ifdef ($fmt, "text");
-intercept_status = ifdef ($icpt, 0);   
-thr = ifdef ($thr, 0.01);  
-
-print ("BEGIN STEPWISE LINEAR REGRESSION SCRIPT");
-print ("Reading X and Y...");
-X_orig = read (fileX);
-y = read (fileY);
-
-n = nrow (X_orig);
-m_orig = ncol (X_orig);
-
-# BEGIN STEPWISE LINEAR REGRESSION
-
-if (dir == "forward") {  
-	
-	continue = TRUE;
-	columns_fixed = matrix (0, rows = 1, cols = m_orig);
-	columns_fixed_ordered = matrix (0, rows = 1, cols = 1);
-	
-	# X_global stores the best model found at each step 
-	X_global = matrix (0, rows = n, cols = 1);
-	
-	if (intercept_status == 1 | intercept_status == 2) {
-		beta = mean (y);
-		AIC_best = 2 + n * log(sum((beta - y)^2) / n);
-	} else {
-		beta = 0;
-		AIC_best = n * log(sum(y^2) / n);
-	}
-	AICs = matrix (AIC_best, rows = 1, cols = m_orig);
-	print ("Best AIC without any features: " + AIC_best);
-	
-	# First pass to examine single features
-	parfor (i in 1:m_orig) { 	
-		[AIC_1] = linear_regression (X_orig[,i], y, m_orig, columns_fixed_ordered, " ");					
-		AICs[1,i] = AIC_1;
-	}
-
-	# Determine the best AIC 
-	column_best = 0;	
-	for (k in 1:m_orig) {
-		AIC_cur = as.scalar (AICs[1,k]);
-		if ( (AIC_cur < AIC_best) & ((AIC_best - AIC_cur) > abs (thr * AIC_best)) ) {
-			column_best = k;
-			AIC_best = as.scalar(AICs[1,k]);
-		}
-	}
-	
-	if (column_best == 0) {
-		print ("AIC of an empty model is " + AIC_best + " and adding no feature achieves more than " + (thr * 100) + "% decrease in AIC!");
-		S = matrix (0, rows=1, cols=1);
-		if (intercept_status == 0) {
-			B = matrix (beta, rows = m_orig, cols = 1);
-		} else {
-			B_tmp = matrix (0, rows = m_orig + 1, cols = 1);
-			B_tmp[m_orig + 1,] = beta;
-			B = B_tmp;
-		}
-		write (S, fileS, format=fmt);
-		write (B, fileB, format=fmt);
-		stop ("");
-	}
-	print ("Best AIC " + AIC_best + " achieved with feature: " + column_best);	
-	columns_fixed[1,column_best] = 1;
-	columns_fixed_ordered[1,1] = column_best;
-	X_global = X_orig[,column_best];		
-		
-	while (continue) {
-		# Subsequent passes over the features
-		parfor (i in 1:m_orig) { 
-			if (as.scalar(columns_fixed[1,i]) == 0) {	
-			
-				# Construct the feature matrix
-				X = append (X_global, X_orig[,i]);
-				
-				[AIC_2] = linear_regression (X, y, m_orig, columns_fixed_ordered, " ");
-				AICs[1,i] = AIC_2;
-			}	
-		}
-	
-		# Determine the best AIC
-		for (k in 1:m_orig) {
-			AIC_cur = as.scalar (AICs[1,k]);
-			if ( (AIC_cur < AIC_best) & ((AIC_best - AIC_cur) > abs (thr * AIC_best)) & (as.scalar(columns_fixed[1,k]) == 0) ) {
-				column_best = k;
-				AIC_best = as.scalar(AICs[1,k]);
-			}
-		}
-				
-		# Append best found features (i.e., columns) to X_global
-		if (as.scalar(columns_fixed[1,column_best]) == 0) { # new best feature found
-			print ("Best AIC " + AIC_best + " achieved with feature: " + column_best);
-			columns_fixed[1,column_best] = 1;
-			columns_fixed_ordered = append (columns_fixed_ordered, as.matrix(column_best));
-			if (ncol(columns_fixed_ordered) == m_orig) { # all features examined
-				X_global = append (X_global, X_orig[,column_best]);
-				continue = FALSE;
-			} else {
-				X_global = append (X_global, X_orig[,column_best]);
-			}
-		} else {
-			continue = FALSE;
-		}
-	}
-	
-	# run linear regression with selected set of features
-	print ("Running linear regression with selected features...");
-	[AIC] = linear_regression (X_global, y, m_orig, columns_fixed_ordered, fileB); 
-	
-} else {
-	stop ("Currently only forward selection strategy is supported!");
-} 
-
-
-/*
-* Computes linear regression using a direct solver for (X^T X) beta = X^T y.
-* It also outputs the AIC of the computed model.  
-*/
-linear_regression = function (Matrix[Double] X, Matrix[Double] y, Double m_orig, Matrix[Double] Selected, String fileB) return (Double AIC) {
-		
-	intercept_status = ifdef ($icpt, 0); 		
-	n = nrow (X);	
-	m = ncol (X);
-	
-	# Introduce the intercept, shift and rescale the columns of X if needed
-	if (intercept_status == 1 | intercept_status == 2) { # add the intercept column
-		ones_n = matrix (1, rows = n, cols = 1);
-		X = append (X, ones_n);
-		m = m - 1;
-	}
-	m_ext = ncol(X);
-	
-	if (intercept_status == 2) { # scale-&-shift X columns to mean 0, variance 1
-							     # Important assumption: X [, m_ext] = ones_n
-		avg_X_cols = t(colSums(X)) / n;
-		var_X_cols = (t(colSums (X ^ 2)) - n * (avg_X_cols ^ 2)) / (n - 1);
-		is_unsafe = ppred (var_X_cols, 0.0, "<=");
-		scale_X = 1.0 / sqrt (var_X_cols * (1 - is_unsafe) + is_unsafe);
-		scale_X [m_ext, 1] = 1;
-		shift_X = - avg_X_cols * scale_X;
-		shift_X [m_ext, 1] = 0;
-	} else {
-		scale_X = matrix (1, rows = m_ext, cols = 1);
-		shift_X = matrix (0, rows = m_ext, cols = 1);
-	}
-
-	# BEGIN THE DIRECT SOLVE ALGORITHM (EXTERNAL CALL)
-
-	A = t(X) %*% X;
-	b = t(X) %*% y;
-	if (intercept_status == 2) {
-		A = t(diag (scale_X) %*% A + shift_X %*% A [m_ext, ]);
-		A =   diag (scale_X) %*% A + shift_X %*% A [m_ext, ];
-		b =   diag (scale_X) %*% b + shift_X %*% b [m_ext, ];
-	}
-
-	beta_unscaled = solve (A, b);
-	
-	# END THE DIRECT SOLVE ALGORITHM
-
-	if (intercept_status == 2) {
-		beta = scale_X * beta_unscaled;
-		beta [m_ext, ] = beta [m_ext, ] + t(shift_X) %*% beta_unscaled;
-	} else {
-		beta = beta_unscaled;
-	}
-	
-	# COMPUTE AIC
-	y_residual = y - X %*% beta;
-	ss_res = sum (y_residual ^ 2);
-	eq_deg_of_freedom = m_ext;
-	AIC = (2 * eq_deg_of_freedom) + n * log (ss_res / n);
-	
-	if (fileB != " ") {
-	
-		fileO = ifdef ($O, " ");
-		fileS = $S;
-		
-		print ("Computing the statistics...");
-		avg_tot = sum (y) / n;
-		ss_tot = sum (y ^ 2);
-		ss_avg_tot = ss_tot - n * avg_tot ^ 2;
-		var_tot = ss_avg_tot / (n - 1);
-#		y_residual = y - X %*% beta;
-		avg_res = sum (y_residual) / n;
-#		ss_res = sum (y_residual ^ 2);
-		ss_avg_res = ss_res - n * avg_res ^ 2;
-
-		plain_R2 = 1 - ss_res / ss_avg_tot;
-		if (n > m_ext) {
-			dispersion  = ss_res / (n - m_ext);
-			adjusted_R2 = 1 - dispersion / (ss_avg_tot / (n - 1));
-		} else {
-			dispersion  = 0.0 / 0.0;
-			adjusted_R2 = 0.0 / 0.0;
-		}
-
-		plain_R2_nobias = 1 - ss_avg_res / ss_avg_tot;
-		deg_freedom = n - m - 1;
-		if (deg_freedom > 0) {
-			var_res = ss_avg_res / deg_freedom;
-			adjusted_R2_nobias = 1 - var_res / (ss_avg_tot / (n - 1));
-		} else {
-			var_res = 0.0 / 0.0;
-			adjusted_R2_nobias = 0.0 / 0.0;
-			print ("Warning: zero or negative number of degrees of freedom.");
-		}
-
-		plain_R2_vs_0 = 1 - ss_res / ss_tot;
-		if (n > m) {
-			adjusted_R2_vs_0 = 1 - (ss_res / (n - m)) / (ss_tot / n);
-		} else {
-			adjusted_R2_vs_0 = 0.0 / 0.0;
-		}
-
-		str = "AVG_TOT_Y," + avg_tot;                                    #  Average of the response value Y
-		str = append (str, "STDEV_TOT_Y," + sqrt (var_tot));             #  Standard Deviation of the response value Y
-		str = append (str, "AVG_RES_Y," + avg_res);                      #  Average of the residual Y - pred(Y|X), i.e. residual bias
-		str = append (str, "STDEV_RES_Y," + sqrt (var_res));             #  Standard Deviation of the residual Y - pred(Y|X)
-		str = append (str, "DISPERSION," + dispersion);                  #  GLM-style dispersion, i.e. residual sum of squares / # d.f.
-		str = append (str, "PLAIN_R2," + plain_R2);                      #  Plain R^2 of residual with bias included vs. total average
-		str = append (str, "ADJUSTED_R2," + adjusted_R2);                #  Adjusted R^2 of residual with bias included vs. total average
-		str = append (str, "PLAIN_R2_NOBIAS," + plain_R2_nobias);        #  Plain R^2 of residual with bias subtracted vs. total average
-		str = append (str, "ADJUSTED_R2_NOBIAS," + adjusted_R2_nobias);  #  Adjusted R^2 of residual with bias subtracted vs. total average
-		if (intercept_status == 0) {
-			str = append (str, "PLAIN_R2_VS_0," + plain_R2_vs_0);        #  Plain R^2 of residual with bias included vs. zero constant
-			str = append (str, "ADJUSTED_R2_VS_0," + adjusted_R2_vs_0);  #  Adjusted R^2 of residual with bias included vs. zero constant
-		}
-
-		if (fileO != " ") {
-			write (str, fileO);
-		} else {
-			print (str);
-		}
-
-		# Prepare the output matrix
-		print ("Writing the output matrix...");
-		if (intercept_status == 2) {
-			beta_out = append (beta, beta_unscaled);
-		} else {
-			beta_out = beta;
-		}
-		
-		# Output which features give the best AIC and are being used for linear regression 
-		write (Selected, fileS, format=fmt);
-		
-		no_selected = ncol (Selected);
-		max_selected = max (Selected);
-		last = max_selected + 1;	
-		
-		if (intercept_status != 0) {
-		
-			Selected_ext = append (Selected, as.matrix (last));			
-			P1 = table (seq (1, ncol (Selected_ext)), t(Selected_ext)); 
-
-			if (intercept_status == 2) {
-			
-				P1_beta = P1 * beta;
-				P2_beta = colSums (P1_beta);
-				P1_beta_unscaled = P1 * beta_unscaled;
-				P2_beta_unscaled = colSums(P1_beta_unscaled);
-				
-				if (max_selected < m_orig) {
-					P2_beta = append (P2_beta, matrix (0, rows=1, cols=(m_orig - max_selected)));
-					P2_beta_unscaled = append (P2_beta_unscaled, matrix (0, rows=1, cols=(m_orig - max_selected)));
-					
-					P2_beta[1, m_orig+1] = P2_beta[1, max_selected + 1]; 
-					P2_beta[1, max_selected + 1] = 0;
-				
-					P2_beta_unscaled[1, m_orig+1] = P2_beta_unscaled[1, max_selected + 1]; 
-					P2_beta_unscaled[1, max_selected + 1] = 0;
-				}
-				beta_out = append (t(P2_beta), t(P2_beta_unscaled));
-				
-			} else {
-			
-				P1_beta = P1 * beta;
-				P2_beta = colSums (P1_beta);
-				
-				if (max_selected < m_orig) {
-					P2_beta = append (P2_beta, matrix (0, rows=1, cols=(m_orig - max_selected)));
-					P2_beta[1, m_orig+1] = P2_beta[1, max_selected + 1] ; 
-					P2_beta[1, max_selected + 1] = 0;
-				}
-				beta_out = t(P2_beta);
-				
-			}
-		} else {
-		
-			P1 = table (seq (1, no_selected), t(Selected)); 
-			P1_beta = P1 * beta;
-			P2_beta = colSums (P1_beta);	
-
-			if (max_selected < m_orig) {
-				P2_beta = append (P2_beta, matrix (0, rows=1, cols=(m_orig - max_selected)));
-			}		
-
-			beta_out = t(P2_beta);	
-		}
-		
-		write ( beta_out, fileB, format=fmt );		
-	}
-}
-
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+# 
+#   http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+#
+# THIS SCRIPT CHOOSES A LINEAR MODEL IN A STEPWISE ALGIRITHM USING AIC
+# EACH LINEAR REGRESSION USES A DIRECT SOLVER FOR (X^T X) beta = X^T y
+#
+# INPUT PARAMETERS:
+# --------------------------------------------------------------------------------------------
+# NAME    TYPE    DEFAULT    MEANING
+# --------------------------------------------------------------------------------------------
+# X       String   	---      Location (on HDFS) to read the matrix X of feature vectors
+# Y       String   	---      Location (on HDFS) to read the 1-column matrix Y of response values
+# B       String   	---      Location to store estimated regression parameters (the betas)
+# S       String    ---      Location to write the selected features ordered as computed by the algorithm
+# O       String   	" "      Location to write the printed statistics; by default is standard output
+# icpt    Int        0       Intercept presence, shifting and rescaling the columns of X:
+#                            0 = no intercept, no shifting, no rescaling;
+#                            1 = add intercept, but neither shift nor rescale X;
+#                            2 = add intercept, shift & rescale X columns to mean = 0, variance = 1
+# thr     Double    0.01     Threshold to stop the algorithm: if the decrease in the value of AIC falls below thr
+#                            no further features are being checked and the algorithm stops 
+# fmt     String   "text"    Matrix output format for B (the betas) only, usually "text" or "csv"
+# --------------------------------------------------------------------------------------------
+# OUTPUT: Matrix of regression parameters (the betas) and its size depend on icpt input value:
+#         OUTPUT SIZE:   OUTPUT CONTENTS:                HOW TO PREDICT Y FROM X AND B:
+# icpt=0: ncol(X)   x 1  Betas for X only                Y ~ X %*% B[1:ncol(X), 1], or just X %*% B
+# icpt=1: ncol(X)+1 x 1  Betas for X and intercept       Y ~ X %*% B[1:ncol(X), 1] + B[ncol(X)+1, 1]
+# icpt=2: ncol(X)+1 x 2  Col.1: betas for X & intercept  Y ~ X %*% B[1:ncol(X), 1] + B[ncol(X)+1, 1]
+#                        Col.2: betas for shifted/rescaled X and intercept
+#
+# In addition, in the last run of linear regression some statistics are provided in CSV format, one comma-separated
+# name-value pair per each line, as follows:
+#
+# NAME                  MEANING
+# -------------------------------------------------------------------------------------
+# AVG_TOT_Y             Average of the response value Y
+# STDEV_TOT_Y           Standard Deviation of the response value Y
+# AVG_RES_Y             Average of the residual Y - pred(Y|X), i.e. residual bias
+# STDEV_RES_Y           Standard Deviation of the residual Y - pred(Y|X)
+# DISPERSION            GLM-style dispersion, i.e. residual sum of squares / # deg. fr.
+# PLAIN_R2              Plain R^2 of residual with bias included vs. total average
+# ADJUSTED_R2           Adjusted R^2 of residual with bias included vs. total average
+# PLAIN_R2_NOBIAS       Plain R^2 of residual with bias subtracted vs. total average
+# ADJUSTED_R2_NOBIAS    Adjusted R^2 of residual with bias subtracted vs. total average
+# PLAIN_R2_VS_0         * Plain R^2 of residual with bias included vs. zero constant
+# ADJUSTED_R2_VS_0      * Adjusted R^2 of residual with bias included vs. zero constant
+# -------------------------------------------------------------------------------------
+# * The last two statistics are only printed if there is no intercept (icpt=0)
+# If the best AIC is achieved without any features the matrix of selected features contains 0.  
+# Moreover, in this case no further statistics will be produced  
+#
+# HOW TO INVOKE THIS SCRIPT - EXAMPLE:
+# hadoop jar SystemML.jar -f StepLinearRegDS.dml -nvargs X=INPUT_DIR/X Y=INPUT_DIR/Y B=OUTPUT_DIR/betas
+#     O=OUTPUT_DIR/stats S=OUTPUT_DIR/selected icpt=2 thr=0.01 fmt=csv
+
+fileX = $X;
+fileY = $Y;
+fileB = $B;
+fileS = $S;
+
+# currently only the forward selection strategy in supported: start from one feature and iteratively add 
+# features until AIC improves
+dir = "forward";
+
+fmt  = ifdef ($fmt, "text");
+intercept_status = ifdef ($icpt, 0);   
+thr = ifdef ($thr, 0.01);  
+
+print ("BEGIN STEPWISE LINEAR REGRESSION SCRIPT");
+print ("Reading X and Y...");
+X_orig = read (fileX);
+y = read (fileY);
+
+n = nrow (X_orig);
+m_orig = ncol (X_orig);
+
+# BEGIN STEPWISE LINEAR REGRESSION
+
+if (dir == "forward") {  
+	
+	continue = TRUE;
+	columns_fixed = matrix (0, rows = 1, cols = m_orig);
+	columns_fixed_ordered = matrix (0, rows = 1, cols = 1);
+	
+	# X_global stores the best model found at each step 
+	X_global = matrix (0, rows = n, cols = 1);
+	
+	if (intercept_status == 1 | intercept_status == 2) {
+		beta = mean (y);
+		AIC_best = 2 + n * log(sum((beta - y)^2) / n);
+	} else {
+		beta = 0;
+		AIC_best = n * log(sum(y^2) / n);
+	}
+	AICs = matrix (AIC_best, rows = 1, cols = m_orig);
+	print ("Best AIC without any features: " + AIC_best);
+	
+	# First pass to examine single features
+	parfor (i in 1:m_orig) { 	
+		[AIC_1] = linear_regression (X_orig[,i], y, m_orig, columns_fixed_ordered, " ");					
+		AICs[1,i] = AIC_1;
+	}
+
+	# Determine the best AIC 
+	column_best = 0;	
+	for (k in 1:m_orig) {
+		AIC_cur = as.scalar (AICs[1,k]);
+		if ( (AIC_cur < AIC_best) & ((AIC_best - AIC_cur) > abs (thr * AIC_best)) ) {
+			column_best = k;
+			AIC_best = as.scalar(AICs[1,k]);
+		}
+	}
+	
+	if (column_best == 0) {
+		print ("AIC of an empty model is " + AIC_best + " and adding no feature achieves more than " + (thr * 100) + "% decrease in AIC!");
+		S = matrix (0, rows=1, cols=1);
+		if (intercept_status == 0) {
+			B = matrix (beta, rows = m_orig, cols = 1);
+		} else {
+			B_tmp = matrix (0, rows = m_orig + 1, cols = 1);
+			B_tmp[m_orig + 1,] = beta;
+			B = B_tmp;
+		}
+		write (S, fileS, format=fmt);
+		write (B, fileB, format=fmt);
+		stop ("");
+	}
+	print ("Best AIC " + AIC_best + " achieved with feature: " + column_best);	
+	columns_fixed[1,column_best] = 1;
+	columns_fixed_ordered[1,1] = column_best;
+	X_global = X_orig[,column_best];		
+		
+	while (continue) {
+		# Subsequent passes over the features
+		parfor (i in 1:m_orig) { 
+			if (as.scalar(columns_fixed[1,i]) == 0) {	
+			
+				# Construct the feature matrix
+				X = append (X_global, X_orig[,i]);
+				
+				[AIC_2] = linear_regression (X, y, m_orig, columns_fixed_ordered, " ");
+				AICs[1,i] = AIC_2;
+			}	
+		}
+	
+		# Determine the best AIC
+		for (k in 1:m_orig) {
+			AIC_cur = as.scalar (AICs[1,k]);
+			if ( (AIC_cur < AIC_best) & ((AIC_best - AIC_cur) > abs (thr * AIC_best)) & (as.scalar(columns_fixed[1,k]) == 0) ) {
+				column_best = k;
+				AIC_best = as.scalar(AICs[1,k]);
+			}
+		}
+				
+		# Append best found features (i.e., columns) to X_global
+		if (as.scalar(columns_fixed[1,column_best]) == 0) { # new best feature found
+			print ("Best AIC " + AIC_best + " achieved with feature: " + column_best);
+			columns_fixed[1,column_best] = 1;
+			columns_fixed_ordered = append (columns_fixed_ordered, as.matrix(column_best));
+			if (ncol(columns_fixed_ordered) == m_orig) { # all features examined
+				X_global = append (X_global, X_orig[,column_best]);
+				continue = FALSE;
+			} else {
+				X_global = append (X_global, X_orig[,column_best]);
+			}
+		} else {
+			continue = FALSE;
+		}
+	}
+	
+	# run linear regression with selected set of features
+	print ("Running linear regression with selected features...");
+	[AIC] = linear_regression (X_global, y, m_orig, columns_fixed_ordered, fileB); 
+	
+} else {
+	stop ("Currently only forward selection strategy is supported!");
+} 
+
+
+/*
+* Computes linear regression using a direct solver for (X^T X) beta = X^T y.
+* It also outputs the AIC of the computed model.  
+*/
+linear_regression = function (Matrix[Double] X, Matrix[Double] y, Double m_orig, Matrix[Double] Selected, String fileB) return (Double AIC) {
+		
+	intercept_status = ifdef ($icpt, 0); 		
+	n = nrow (X);	
+	m = ncol (X);
+	
+	# Introduce the intercept, shift and rescale the columns of X if needed
+	if (intercept_status == 1 | intercept_status == 2) { # add the intercept column
+		ones_n = matrix (1, rows = n, cols = 1);
+		X = append (X, ones_n);
+		m = m - 1;
+	}
+	m_ext = ncol(X);
+	
+	if (intercept_status == 2) { # scale-&-shift X columns to mean 0, variance 1
+							     # Important assumption: X [, m_ext] = ones_n
+		avg_X_cols = t(colSums(X)) / n;
+		var_X_cols = (t(colSums (X ^ 2)) - n * (avg_X_cols ^ 2)) / (n - 1);
+		is_unsafe = ppred (var_X_cols, 0.0, "<=");
+		scale_X = 1.0 / sqrt (var_X_cols * (1 - is_unsafe) + is_unsafe);
+		scale_X [m_ext, 1] = 1;
+		shift_X = - avg_X_cols * scale_X;
+		shift_X [m_ext, 1] = 0;
+	} else {
+		scale_X = matrix (1, rows = m_ext, cols = 1);
+		shift_X = matrix (0, rows = m_ext, cols = 1);
+	}
+
+	# BEGIN THE DIRECT SOLVE ALGORITHM (EXTERNAL CALL)
+
+	A = t(X) %*% X;
+	b = t(X) %*% y;
+	if (intercept_status == 2) {
+		A = t(diag (scale_X) %*% A + shift_X %*% A [m_ext, ]);
+		A =   diag (scale_X) %*% A + shift_X %*% A [m_ext, ];
+		b =   diag (scale_X) %*% b + shift_X %*% b [m_ext, ];
+	}
+
+	beta_unscaled = solve (A, b);
+	
+	# END THE DIRECT SOLVE ALGORITHM
+
+	if (intercept_status == 2) {
+		beta = scale_X * beta_unscaled;
+		beta [m_ext, ] = beta [m_ext, ] + t(shift_X) %*% beta_unscaled;
+	} else {
+		beta = beta_unscaled;
+	}
+	
+	# COMPUTE AIC
+	y_residual = y - X %*% beta;
+	ss_res = sum (y_residual ^ 2);
+	eq_deg_of_freedom = m_ext;
+	AIC = (2 * eq_deg_of_freedom) + n * log (ss_res / n);
+	
+	if (fileB != " ") {
+	
+		fileO = ifdef ($O, " ");
+		fileS = $S;
+		
+		print ("Computing the statistics...");
+		avg_tot = sum (y) / n;
+		ss_tot = sum (y ^ 2);
+		ss_avg_tot = ss_tot - n * avg_tot ^ 2;
+		var_tot = ss_avg_tot / (n - 1);
+#		y_residual = y - X %*% beta;
+		avg_res = sum (y_residual) / n;
+#		ss_res = sum (y_residual ^ 2);
+		ss_avg_res = ss_res - n * avg_res ^ 2;
+
+		plain_R2 = 1 - ss_res / ss_avg_tot;
+		if (n > m_ext) {
+			dispersion  = ss_res / (n - m_ext);
+			adjusted_R2 = 1 - dispersion / (ss_avg_tot / (n - 1));
+		} else {
+			dispersion  = 0.0 / 0.0;
+			adjusted_R2 = 0.0 / 0.0;
+		}
+
+		plain_R2_nobias = 1 - ss_avg_res / ss_avg_tot;
+		deg_freedom = n - m - 1;
+		if (deg_freedom > 0) {
+			var_res = ss_avg_res / deg_freedom;
+			adjusted_R2_nobias = 1 - var_res / (ss_avg_tot / (n - 1));
+		} else {
+			var_res = 0.0 / 0.0;
+			adjusted_R2_nobias = 0.0 / 0.0;
+			print ("Warning: zero or negative number of degrees of freedom.");
+		}
+
+		plain_R2_vs_0 = 1 - ss_res / ss_tot;
+		if (n > m) {
+			adjusted_R2_vs_0 = 1 - (ss_res / (n - m)) / (ss_tot / n);
+		} else {
+			adjusted_R2_vs_0 = 0.0 / 0.0;
+		}
+
+		str = "AVG_TOT_Y," + avg_tot;                                    #  Average of the response value Y
+		str = append (str, "STDEV_TOT_Y," + sqrt (var_tot));             #  Standard Deviation of the response value Y
+		str = append (str, "AVG_RES_Y," + avg_res);                      #  Average of the residual Y - pred(Y|X), i.e. residual bias
+		str = append (str, "STDEV_RES_Y," + sqrt (var_res));             #  Standard Deviation of the residual Y - pred(Y|X)
+		str = append (str, "DISPERSION," + dispersion);                  #  GLM-style dispersion, i.e. residual sum of squares / # d.f.
+		str = append (str, "PLAIN_R2," + plain_R2);                      #  Plain R^2 of residual with bias included vs. total average
+		str = append (str, "ADJUSTED_R2," + adjusted_R2);                #  Adjusted R^2 of residual with bias included vs. total average
+		str = append (str, "PLAIN_R2_NOBIAS," + plain_R2_nobias);        #  Plain R^2 of residual with bias subtracted vs. total average
+		str = append (str, "ADJUSTED_R2_NOBIAS," + adjusted_R2_nobias);  #  Adjusted R^2 of residual with bias subtracted vs. total average
+		if (intercept_status == 0) {
+			str = append (str, "PLAIN_R2_VS_0," + plain_R2_vs_0);        #  Plain R^2 of residual with bias included vs. zero constant
+			str = append (str, "ADJUSTED_R2_VS_0," + adjusted_R2_vs_0);  #  Adjusted R^2 of residual with bias included vs. zero constant
+		}
+
+		if (fileO != " ") {
+			write (str, fileO);
+		} else {
+			print (str);
+		}
+
+		# Prepare the output matrix
+		print ("Writing the output matrix...");
+		if (intercept_status == 2) {
+			beta_out = append (beta, beta_unscaled);
+		} else {
+			beta_out = beta;
+		}
+		
+		# Output which features give the best AIC and are being used for linear regression 
+		write (Selected, fileS, format=fmt);
+		
+		no_selected = ncol (Selected);
+		max_selected = max (Selected);
+		last = max_selected + 1;	
+		
+		if (intercept_status != 0) {
+		
+			Selected_ext = append (Selected, as.matrix (last));			
+			P1 = table (seq (1, ncol (Selected_ext)), t(Selected_ext)); 
+
+			if (intercept_status == 2) {
+			
+				P1_beta = P1 * beta;
+				P2_beta = colSums (P1_beta);
+				P1_beta_unscaled = P1 * beta_unscaled;
+				P2_beta_unscaled = colSums(P1_beta_unscaled);
+				
+				if (max_selected < m_orig) {
+					P2_beta = append (P2_beta, matrix (0, rows=1, cols=(m_orig - max_selected)));
+					P2_beta_unscaled = append (P2_beta_unscaled, matrix (0, rows=1, cols=(m_orig - max_selected)));
+					
+					P2_beta[1, m_orig+1] = P2_beta[1, max_selected + 1]; 
+					P2_beta[1, max_selected + 1] = 0;
+				
+					P2_beta_unscaled[1, m_orig+1] = P2_beta_unscaled[1, max_selected + 1]; 
+					P2_beta_unscaled[1, max_selected + 1] = 0;
+				}
+				beta_out = append (t(P2_beta), t(P2_beta_unscaled));
+				
+			} else {
+			
+				P1_beta = P1 * beta;
+				P2_beta = colSums (P1_beta);
+				
+				if (max_selected < m_orig) {
+					P2_beta = append (P2_beta, matrix (0, rows=1, cols=(m_orig - max_selected)));
+					P2_beta[1, m_orig+1] = P2_beta[1, max_selected + 1] ; 
+					P2_beta[1, max_selected + 1] = 0;
+				}
+				beta_out = t(P2_beta);
+				
+			}
+		} else {
+		
+			P1 = table (seq (1, no_selected), t(Selected)); 
+			P1_beta = P1 * beta;
+			P2_beta = colSums (P1_beta);	
+
+			if (max_selected < m_orig) {
+				P2_beta = append (P2_beta, matrix (0, rows=1, cols=(m_orig - max_selected)));
+			}		
+
+			beta_out = t(P2_beta);	
+		}
+		
+		write ( beta_out, fileB, format=fmt );		
+	}
+}
+

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/816e2db8/scripts/algorithms/Univar-Stats.dml
----------------------------------------------------------------------
diff --git a/scripts/algorithms/Univar-Stats.dml b/scripts/algorithms/Univar-Stats.dml
index abb3fea..62d6a28 100644
--- a/scripts/algorithms/Univar-Stats.dml
+++ b/scripts/algorithms/Univar-Stats.dml
@@ -1,150 +1,150 @@
-#-------------------------------------------------------------
-#
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-# 
-#   http://www.apache.org/licenses/LICENSE-2.0
-# 
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-#
-#-------------------------------------------------------------
-
-#
-# DML Script to compute univariate statistics for all attributes 
-# in a given data set
-#
-# Three inputs:
-#     $1) X - input data
-#     $2) TYPES - row matrix that denotes the "kind"/"type" of all attributes
-#             kind=1 for scale, 
-#             kind=2 for nominal,
-#             kind=3 for ordinal
-#
-# One output:
-#     $STATS) output directory in which following three statistics 
-#         files are created
-#         + base.stats - matrix with all 17 statistics (14 scale, 
-#         3 categorical) computed for all attributes
-#         + categorical.counts - matrix in which each column 
-#         gives the category-wise counts for all categories in 
-#         that attribute
-#
-#
-
-A = read($X); # data file
-K = read($TYPES); # attribute kind file
-
-# number of features/attributes
-n = ncol(A);
-
-# number of data records
-m = nrow(A);
-
-# number of statistics
-numBaseStats = 17; # (14 scale stats, 3 categorical stats)
-
-max_kind = max(K);
-
-# matrices to store computed statistics
-baseStats = matrix(0, rows=numBaseStats, cols=n);
-
-# Compute max domain size among all categorical attributes
-maxs = colMaxs(A);
-maxDomainSize = max( ppred(K, 1, ">") * maxs );
-maxDomain = as.integer(maxDomainSize);
-
-
-parfor(i in 1:n, check=0) {
-
-	# project out the i^th column
-	F = A[,i];
-
-	kind = castAsScalar(K[1,i]);
-
-	if ( kind == 1 ) {
-		#print("[" + i + "] Scale");
-		# compute SCALE statistics on the projected column
-		minimum = min(F);
-		maximum = max(F);
-		rng = maximum - minimum;
-
-		mu = mean(F);
-		m2 = moment(F, 2);
-		m3 = moment(F, 3);
-		m4 = moment(F, 4);
-
-		var = m/(m-1.0)*m2;
-		std_dev = sqrt(var);
-		se = std_dev/sqrt(m);
-		cv = std_dev/mu;
-
-		g1 = m3/(std_dev^3);
-		g2 = m4/(std_dev^4) - 3;
-		#se_g1=sqrt( 6*m*(m-1.0) / ((m-2.0)*(m+1.0)*(m+3.0)) ); 
-		se_g1=sqrt( (6/(m-2.0)) * (m/(m+1.0)) * ((m-1.0)/(m+3.0)) ); 
-
-		#se_g2= sqrt( (4*(m^2-1)*se_g1^2)/((m+5.0)*(m-3.0)) );  
-		se_g2=sqrt( (4/(m+5.0)) * ((m^2-1)/(m-3.0)) * se_g1^2 ); 
-
-		md = median(F); #quantile(F, 0.5);
-		iqm = interQuartileMean(F);
-
-		# place the computed statistics in output matrices
-		baseStats[1,i] = minimum;
-		baseStats[2,i] = maximum;
-		baseStats[3,i] = rng;
-
-		baseStats[4,i] = mu;
-		baseStats[5,i] = var;
-		baseStats[6,i] = std_dev;
-		baseStats[7,i] = se;
-		baseStats[8,i] = cv;
-
-		baseStats[9,i] = g1;
-		baseStats[10,i] = g2;
-		baseStats[11,i] = se_g1;
-		baseStats[12,i] = se_g2;
-
-		baseStats[13,i] = md;
-		baseStats[14,i] = iqm;
-	}
-	else {
-		if (kind == 2 | kind == 3) {
-			#print("[" + i + "] Categorical");
-			
-			# check if the categorical column has valid values
-			minF = min(F);
-			if (minF <=0) {
-				print("ERROR: Categorical attributes can only take values starting from 1. Encountered a value " + minF + " in attribute " + i);
-			}
-			else {
-				# compute CATEGORICAL statistics on the projected column
-				num_cat = max(F); # number of categories
-				cat_counts = table(F,1, maxDomain, 1);  # counts for each category
-
-				mode = rowIndexMax(t(cat_counts));
-				mx = max(cat_counts)
-				modeArr =  ppred(cat_counts, mx, "==")
-				numModes = sum(modeArr);
-
-				# place the computed statistics in output matrices
-				baseStats[15,i] = num_cat;
-				baseStats[16,i] = mode;
-				baseStats[17,i] = numModes;
-			}
-		}
-	}
-}
-
-write(baseStats, $STATS);
-
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+# 
+#   http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+#
+# DML Script to compute univariate statistics for all attributes 
+# in a given data set
+#
+# Three inputs:
+#     $1) X - input data
+#     $2) TYPES - row matrix that denotes the "kind"/"type" of all attributes
+#             kind=1 for scale, 
+#             kind=2 for nominal,
+#             kind=3 for ordinal
+#
+# One output:
+#     $STATS) output directory in which following three statistics 
+#         files are created
+#         + base.stats - matrix with all 17 statistics (14 scale, 
+#         3 categorical) computed for all attributes
+#         + categorical.counts - matrix in which each column 
+#         gives the category-wise counts for all categories in 
+#         that attribute
+#
+#
+
+A = read($X); # data file
+K = read($TYPES); # attribute kind file
+
+# number of features/attributes
+n = ncol(A);
+
+# number of data records
+m = nrow(A);
+
+# number of statistics
+numBaseStats = 17; # (14 scale stats, 3 categorical stats)
+
+max_kind = max(K);
+
+# matrices to store computed statistics
+baseStats = matrix(0, rows=numBaseStats, cols=n);
+
+# Compute max domain size among all categorical attributes
+maxs = colMaxs(A);
+maxDomainSize = max( ppred(K, 1, ">") * maxs );
+maxDomain = as.integer(maxDomainSize);
+
+
+parfor(i in 1:n, check=0) {
+
+	# project out the i^th column
+	F = A[,i];
+
+	kind = castAsScalar(K[1,i]);
+
+	if ( kind == 1 ) {
+		#print("[" + i + "] Scale");
+		# compute SCALE statistics on the projected column
+		minimum = min(F);
+		maximum = max(F);
+		rng = maximum - minimum;
+
+		mu = mean(F);
+		m2 = moment(F, 2);
+		m3 = moment(F, 3);
+		m4 = moment(F, 4);
+
+		var = m/(m-1.0)*m2;
+		std_dev = sqrt(var);
+		se = std_dev/sqrt(m);
+		cv = std_dev/mu;
+
+		g1 = m3/(std_dev^3);
+		g2 = m4/(std_dev^4) - 3;
+		#se_g1=sqrt( 6*m*(m-1.0) / ((m-2.0)*(m+1.0)*(m+3.0)) ); 
+		se_g1=sqrt( (6/(m-2.0)) * (m/(m+1.0)) * ((m-1.0)/(m+3.0)) ); 
+
+		#se_g2= sqrt( (4*(m^2-1)*se_g1^2)/((m+5.0)*(m-3.0)) );  
+		se_g2=sqrt( (4/(m+5.0)) * ((m^2-1)/(m-3.0)) * se_g1^2 ); 
+
+		md = median(F); #quantile(F, 0.5);
+		iqm = interQuartileMean(F);
+
+		# place the computed statistics in output matrices
+		baseStats[1,i] = minimum;
+		baseStats[2,i] = maximum;
+		baseStats[3,i] = rng;
+
+		baseStats[4,i] = mu;
+		baseStats[5,i] = var;
+		baseStats[6,i] = std_dev;
+		baseStats[7,i] = se;
+		baseStats[8,i] = cv;
+
+		baseStats[9,i] = g1;
+		baseStats[10,i] = g2;
+		baseStats[11,i] = se_g1;
+		baseStats[12,i] = se_g2;
+
+		baseStats[13,i] = md;
+		baseStats[14,i] = iqm;
+	}
+	else {
+		if (kind == 2 | kind == 3) {
+			#print("[" + i + "] Categorical");
+			
+			# check if the categorical column has valid values
+			minF = min(F);
+			if (minF <=0) {
+				print("ERROR: Categorical attributes can only take values starting from 1. Encountered a value " + minF + " in attribute " + i);
+			}
+			else {
+				# compute CATEGORICAL statistics on the projected column
+				num_cat = max(F); # number of categories
+				cat_counts = table(F,1, maxDomain, 1);  # counts for each category
+
+				mode = rowIndexMax(t(cat_counts));
+				mx = max(cat_counts)
+				modeArr =  ppred(cat_counts, mx, "==")
+				numModes = sum(modeArr);
+
+				# place the computed statistics in output matrices
+				baseStats[15,i] = num_cat;
+				baseStats[16,i] = mode;
+				baseStats[17,i] = numModes;
+			}
+		}
+	}
+}
+
+write(baseStats, $STATS);
+

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/816e2db8/scripts/algorithms/bivar-stats.dml
----------------------------------------------------------------------
diff --git a/scripts/algorithms/bivar-stats.dml b/scripts/algorithms/bivar-stats.dml
index 4846f56..99549dc 100644
--- a/scripts/algorithms/bivar-stats.dml
+++ b/scripts/algorithms/bivar-stats.dml
@@ -1,398 +1,398 @@
-#-------------------------------------------------------------
-#
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-# 
-#   http://www.apache.org/licenses/LICENSE-2.0
-# 
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-#
-#-------------------------------------------------------------
-
-#
-#
-# For a given pair of attribute sets, compute bivariate statistics between all attribute pairs 
-#   Given, index1 = {A_11, A_12, ... A_1m} and index2 = {A_21, A_22, ... A_2n} 
-#          compute bivariate stats for m*n pairs (A_1i, A_2j), (1<= i <=m) and (1<= j <=n)
-#
-# Six inputs:  
-#    1) X  - input data
-#    2) index1 - First attribute set {A_11, A_12, ... A_1m}
-#    3) index2 - Second attribute set {A_21, A_22, ... A_2n}
-#    4) types1 - kind for attributes in S1 
-#    5) types2 - kind for attributes in S2
-#             kind=1 for scale, kind=2 for nominal, kind=3 for ordinal
-# 
-# One output:    
-#    6) output directory in which following (maximum of) four statistics files are created
-#        + bivar.scale.scale.stats - matrix containing scale-scale correlations
-#        + bivar.nominal.nominal.stats - 
-#        + bivar.nominal.scale.stats - 
-#        + bivar.ordinal.ordinal.stats - 
-#
-# hadoop jar SystemML.jar -f bivar-stats.dml -nvargs X=<Data>
-#                                                    index1=<Feature Index Set 1>
-#                                                    index2=<Feature Index Set 2>
-#                                                    types1=<Feature Types 1>
-#                                                    types2=<Feature Types 2>
-#                                                    OUTDIR=<Output Location>
-
-D = read($X);  # input data set
-S1 = read($index1); # attribute set 1
-S2 = read($index2); # attribute set 2
-K1 = read($types1); # kind for attributes in S1
-K2 = read($types2); # kind for attributes in S2
-
-s1size = ncol(S1);
-s2size = ncol(S2);
-numPairs = s1size * s2size;
-
-#test: 1 is Pearson'R, 2 is F-test, 3 is chi-squared, 4 is Spearman'sRho
-# R, (chisq, df, pval, cramersv,) spearman, eta, anovaf, feature_col_index1, feature_col_index2, test
-
-num_scale_scale_tests = 0
-num_nominal_nominal_tests = 0
-num_ordinal_ordinal_tests = 0
-num_nominal_scale_tests = 0
-
-pair2row = matrix(0, rows=numPairs, cols=2)
-for( i in 1:s1size, check=0) {
-    pre_a1 = castAsScalar(S1[1,i]);
-    pre_k1 = castAsScalar(K1[1,i]);
-
-    for( j in 1:s2size, check=0) {
-        pre_pairID = (i-1)*s2size+j; 
-        pre_a2 = castAsScalar(S2[1,j]);
-        pre_k2 = castAsScalar(K2[1,j]);
-	
-	if (pre_k1 == pre_k2) {
-            if (pre_k1 == 1) {
-	        	num_scale_scale_tests = num_scale_scale_tests + 1
-				pair2row[pre_pairID,1] = num_scale_scale_tests
-            } else {
-	      		num_nominal_nominal_tests = num_nominal_nominal_tests + 1
-				pair2row[pre_pairID,1] = num_nominal_nominal_tests
-		
-                if ( pre_k1 == 3 ) {
-		    		num_ordinal_ordinal_tests = num_ordinal_ordinal_tests + 1
-		    		pair2row[pre_pairID, 2] = num_ordinal_ordinal_tests
-                }
-            }
-        }
-        else {
-            if (pre_k1 == 1 | pre_k2 == 1) {
-	        	num_nominal_scale_tests = num_nominal_scale_tests + 1
-				pair2row[pre_pairID,1] = num_nominal_scale_tests
-            } else {
-	        	num_nominal_nominal_tests = num_nominal_nominal_tests + 1
-				pair2row[pre_pairID,1] = num_nominal_nominal_tests 
-            }
-		}
-    }
-}
-
-size_scale_scale_tests     = max(num_scale_scale_tests, 1);
-size_nominal_nominal_tests = max(num_nominal_nominal_tests, 1)
-size_ordinal_ordinal_tests = max(num_ordinal_ordinal_tests, 1);
-size_nominal_scale_tests   = max(num_nominal_scale_tests, 1);
-
-basestats                 = matrix(0, rows=11, cols=numPairs);
-basestats_scale_scale     = matrix(0, rows=6, cols=size_scale_scale_tests)
-basestats_nominal_nominal = matrix(0, rows=6, cols=size_nominal_nominal_tests)
-basestats_ordinal_ordinal = matrix(0, rows=3, cols=size_ordinal_ordinal_tests)
-basestats_nominal_scale   = matrix(0, rows=11, cols=size_nominal_scale_tests)
-
-
-# Compute max domain size among all categorical attributes
-# and check if these cols have been recoded
-
-debug_str = "Stopping execution of DML script due to invalid input";
-
-error_flag = FALSE;
-
-maxs = colMaxs(D);
-mins = colMins(D)
-maxDomainSize = -1.0;
-for(k in 1:ncol(K1) ) {
-  type = as.scalar(K1[1,k]);
-  
-  if ( type > 1) {
-    colID = as.scalar(S1[1,k]);
-    
-    colMaximum = as.scalar(maxs[1,colID]);
-    if(maxDomainSize < colMaximum) maxDomainSize = colMaximum;
-  
-  	colMinimum = as.scalar(mins[1,colID]);
-  	if(colMinimum < 1){
-  	  if(type == 2)
-  	    debug_str = append(debug_str, "Column " + colID + " was declared as nominal but its minimum value is " + colMinimum)
-  	  else
-  	    debug_str = append(debug_str, "Column " + colID + " was declared as ordinal but its minimum value is " + colMinimum)
-  	  error_flag = TRUE;
-  	}
-  }
-}
-
-for(k in 1:ncol(K2) ) {
-  type = as.scalar(K2[1,k]);
-  
-  if ( type > 1) {
-    colID = as.scalar(S2[1,k]);
-    
-    colMaximum = as.scalar(maxs[1,colID]);
-    if(maxDomainSize < colMaximum) maxDomainSize = colMaximum;
-  
-  	colMinimum = as.scalar(mins[1,colID]);
-  	if(colMinimum < 1){
-  	  if(type == 2)
-  	    debug_str = append(debug_str, "Column " + colID + " was declared as nominal but its minimum value is " + colMinimum)
-  	  else 
-  	  	debug_str = append(debug_str, "Column " + colID + " was declared as ordinal but its minimum value is " + colMinimum)
-  	  error_flag = TRUE;
-  	}
-  }
-}
-maxDomain = as.integer(maxDomainSize);
-
-if(error_flag) stop(debug_str);
-
-parfor( i in 1:s1size, check=0) {
-    a1 = castAsScalar(S1[1,i]);
-    k1 = castAsScalar(K1[1,i]);
-    A1 = D[,a1];
-
-    parfor( j in 1:s2size, check=0) {
-        pairID = (i-1)*s2size+j; 
-        a2 = castAsScalar(S2[1,j]);
-        k2 = castAsScalar(K2[1,j]);
-        A2 = D[,a2];
-
-		rowid1 = castAsScalar(pair2row[pairID, 1])
-    	rowid2 = castAsScalar(pair2row[pairID, 2])
-
-        if (k1 == k2) {
-            if (k1 == 1) {
-                # scale-scale
-                print("[" + i + "," + j + "] scale-scale");
-                [r, cov, sigma1, sigma2] = bivar_ss(A1,A2);   
-		
-    			basestats_scale_scale[1,rowid1] = a1;
-				basestats_scale_scale[2,rowid1] = a2;	
-                basestats_scale_scale[3,rowid1] = r;
-                basestats_scale_scale[4,rowid1] = cov;
-                basestats_scale_scale[5,rowid1] = sigma1;
-                basestats_scale_scale[6,rowid1] = sigma2;
-            } else {
-                # nominal-nominal or ordinal-ordinal
-                print("[" + i + "," + j + "] categorical-categorical");
-                [chisq, df, pval, cramersv]  = bivar_cc(A1, A2, maxDomain);
-
-                basestats_nominal_nominal[1,rowid1] = a1;
-				basestats_nominal_nominal[2,rowid1] = a2;	
-                basestats_nominal_nominal[3,rowid1] = chisq;
-                basestats_nominal_nominal[4,rowid1] = df;
-                basestats_nominal_nominal[5,rowid1] = pval;
-                basestats_nominal_nominal[6,rowid1] = cramersv;
-
-                if ( k1 == 3 ) {
-                    # ordinal-ordinal
-                    print("[" + i + "," + j + "] ordinal-ordinal");
-                    sp = bivar_oo(A1, A2, maxDomain);
-
-                    basestats_ordinal_ordinal[1,rowid2] = a1;
-                    basestats_ordinal_ordinal[2,rowid2] = a2;
-                    basestats_ordinal_ordinal[3,rowid2] = sp;
-                }
-            }
-        } else {
-            if (k1 == 1 | k2 == 1) {
-                # Scale-nominal/ordinal     
-                print("[" + i + "," + j + "] scale-categorical");
-                
-               	if ( k1 == 1 ) {
-                	[eta, f, pval, bw_ss, within_ss, bw_df, within_df, bw_mean_square, within_mean_square] = bivar_sc(A1, A2, maxDomain);
-                } else {
-                    [eta, f, pval, bw_ss, within_ss, bw_df, within_df, bw_mean_square, within_mean_square] = bivar_sc(A2, A1, maxDomain);
-                }
-		
-                basestats_nominal_scale[1,rowid1] = a1;
-                basestats_nominal_scale[2,rowid1] = a2;
-                basestats_nominal_scale[3,rowid1] = eta;
-                basestats_nominal_scale[4,rowid1] = f;
-                basestats_nominal_scale[5,rowid1] = pval;
-                basestats_nominal_scale[6,rowid1] = bw_ss;
-                basestats_nominal_scale[7,rowid1] = within_ss;
-                basestats_nominal_scale[8,rowid1] = bw_df;
-                basestats_nominal_scale[9,rowid1] = within_df;
-                basestats_nominal_scale[10,rowid1] = bw_mean_square;
-                basestats_nominal_scale[11,rowid1] = within_mean_square;
-            } else {
-                # nominal-ordinal or ordinal-nominal
-                print("[" + i + "," + j + "] categorical-categorical");
-                [chisq, df, pval, cramersv]  = bivar_cc(A1, A2, maxDomain);
-
-				basestats_nominal_nominal[1,rowid1] = a1;
-				basestats_nominal_nominal[2,rowid1] = a2;
-                basestats_nominal_nominal[3,rowid1] = chisq;
-                basestats_nominal_nominal[4,rowid1] = df;
-                basestats_nominal_nominal[5,rowid1] = pval;
-                basestats_nominal_nominal[6,rowid1] = cramersv;
-            }
-        }
-    }
-}
-
-if(num_scale_scale_tests == size_scale_scale_tests){
-  write(basestats_scale_scale, $OUTDIR + "/bivar.scale.scale.stats");
-}
-
-if(num_nominal_scale_tests == size_nominal_scale_tests){
-  write(basestats_nominal_scale, $OUTDIR + "/bivar.nominal.scale.stats");
-}
-
-if(num_nominal_nominal_tests == size_nominal_nominal_tests){
-  write(basestats_nominal_nominal, $OUTDIR + "/bivar.nominal.nominal.stats");
-}
-
-if(num_ordinal_ordinal_tests == size_ordinal_ordinal_tests){
-  write(basestats_ordinal_ordinal, $OUTDIR + "/bivar.ordinal.ordinal.stats");
-}
-
-# -----------------------------------------------------------------------------------------------------------
-
-bivar_cc = function(Matrix[Double] A, Matrix[Double] B, Double maxDomain) return (Double chisq, Double df, Double pval, Double cramersv) {
-
-    # Contingency Table
-    F = table(A, B, maxDomain, maxDomain);
-    F = F[1:max(A), 1:max(B)];
-
-    # Chi-Squared
-    W = sum(F);
-    r = rowSums(F);
-    c = colSums(F);
-    E = (r %*% c)/W;
-    T = (F-E)^2/E;
-    chi_squared = sum(T);
-
-    # compute p-value
-    degFreedom = (nrow(F)-1)*(ncol(F)-1);
-    pValue = pchisq(target=chi_squared, df=degFreedom, lower.tail=FALSE);
-
-    # Cramer's V
-    R = nrow(F);
-    C = ncol(F);
-    q = min(R,C);
-    cramers_v = sqrt(chi_squared/(W*(q-1)));
-
-    # Assign return values
-    chisq = chi_squared;
-    df = as.double(degFreedom);
-    pval = pValue;
-    cramersv = cramers_v;
-}
-
-# -----------------------------------------------------------------------------------------------------------
-
-bivar_ss = function(Matrix[Double] X, Matrix[Double] Y) return (Double R, Double covXY, Double sigmaX, Double sigmaY) {
-
-    # Unweighted co-variance
-    covXY = cov(X,Y);
-
-    # compute standard deviations for both X and Y by computing 2^nd central moment
-    W = nrow(X);
-    m2X = moment(X,2);
-    m2Y = moment(Y,2);
-    sigmaX = sqrt(m2X * (W/(W-1.0)) );
-    sigmaY = sqrt(m2Y * (W/(W-1.0)) );
-
-    # Pearson's R
-    R = covXY / (sigmaX*sigmaY);
-}
-
-# -----------------------------------------------------------------------------------------------------------
-
-# Y points to SCALE variable
-# A points to CATEGORICAL variable
-bivar_sc = function(Matrix[Double] Y, Matrix[Double] A, Double maxDomain) 
-		   return (Double Eta, Double AnovaF, Double pval, Double bw_ss, Double within_ss, Double bw_df, Double within_df, Double bw_mean_square, Double within_mean_square) {
-
-    # mean and variance in target variable
-    W = nrow(A);
-    my = mean(Y);
-    varY = moment(Y,2) * W/(W-1.0)
-
-    # category-wise (frequencies, means, variances)
-    CFreqs = aggregate(target=Y, groups=A, fn="count", ngroups=maxDomain); 
-    CMeans = aggregate(target=Y, groups=A, fn="mean", ngroups=maxDomain);
-    CVars =  aggregate(target=Y, groups=A, fn="variance", ngroups=maxDomain);
-    
-    # number of categories
-    R = nrow(CFreqs);
-
-    Eta = sqrt(1 - ( sum((CFreqs-1)*CVars) / ((W-1)*varY) ));
-
-    bw_ss = sum( (CFreqs*(CMeans-my)^2) );
-    bw_df = as.double(R-1);
-    bw_mean_square = bw_ss/bw_df;
-	
-    within_ss = sum( (CFreqs-1)*CVars );
-    within_df = as.double(W-R);
-    within_mean_square = within_ss/within_df;
-	
-    AnovaF = bw_mean_square/within_mean_square;
-    
-    pval = pf(target=AnovaF, df1=bw_df, df2=within_df, lower.tail=FALSE)
-}
-
-
-# -----------------------------------------------------------------------------------------------------------
-# Function to compute ranks
-# takes a column vector as input, and produces a vector of same size in which each cell denotes to the computed score for that category
-computeRanks = function(Matrix[Double] X) return (Matrix[Double] Ranks) {
-    Ranks = cumsum(X) - X/2 + 1/2;
-}
-
-#-------------------------------------------------------------------------
-
-bivar_oo = function(Matrix[Double] A, Matrix[Double] B, Double maxDomain) return (Double sp) {
-
-    # compute contingency table
-    F = table(A, B, maxDomain, maxDomain);
-    F = F[1:max(A), 1:max(B)];
-    
-    catA = nrow(F);  # number of categories in A
-    catB = ncol(F);  # number of categories in B
-
-    # compute category-wise counts for both the attributes
-    R = rowSums(F);
-    S = colSums(F);
-
-    # compute scores, both are column vectors
-    [C] = computeRanks(R);
-    meanX = mean(C,R); 
-
-    columnS = t(S);
-    [D] = computeRanks(columnS);
-
-    # scores (C,D) are individual values, and counts (R,S) act as weights
-    meanY = mean(D,columnS);
-
-    W = sum(F); # total weight, or total #cases
-    varX = moment(C,R,2)*(W/(W-1.0));
-    varY = moment(D,columnS,2)*(W/(W-1.0));
-    covXY = sum( t(F/(W-1) * (C-meanX)) * (D-meanY) );
-
-    sp = covXY/(sqrt(varX)*sqrt(varY));
-}
-
-# -----------------------------------------------------------------------------------------------------------
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+# 
+#   http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+#
+#
+# For a given pair of attribute sets, compute bivariate statistics between all attribute pairs 
+#   Given, index1 = {A_11, A_12, ... A_1m} and index2 = {A_21, A_22, ... A_2n} 
+#          compute bivariate stats for m*n pairs (A_1i, A_2j), (1<= i <=m) and (1<= j <=n)
+#
+# Six inputs:  
+#    1) X  - input data
+#    2) index1 - First attribute set {A_11, A_12, ... A_1m}
+#    3) index2 - Second attribute set {A_21, A_22, ... A_2n}
+#    4) types1 - kind for attributes in S1 
+#    5) types2 - kind for attributes in S2
+#             kind=1 for scale, kind=2 for nominal, kind=3 for ordinal
+# 
+# One output:    
+#    6) output directory in which following (maximum of) four statistics files are created
+#        + bivar.scale.scale.stats - matrix containing scale-scale correlations
+#        + bivar.nominal.nominal.stats - 
+#        + bivar.nominal.scale.stats - 
+#        + bivar.ordinal.ordinal.stats - 
+#
+# hadoop jar SystemML.jar -f bivar-stats.dml -nvargs X=<Data>
+#                                                    index1=<Feature Index Set 1>
+#                                                    index2=<Feature Index Set 2>
+#                                                    types1=<Feature Types 1>
+#                                                    types2=<Feature Types 2>
+#                                                    OUTDIR=<Output Location>
+
+D = read($X);  # input data set
+S1 = read($index1); # attribute set 1
+S2 = read($index2); # attribute set 2
+K1 = read($types1); # kind for attributes in S1
+K2 = read($types2); # kind for attributes in S2
+
+s1size = ncol(S1);
+s2size = ncol(S2);
+numPairs = s1size * s2size;
+
+#test: 1 is Pearson'R, 2 is F-test, 3 is chi-squared, 4 is Spearman'sRho
+# R, (chisq, df, pval, cramersv,) spearman, eta, anovaf, feature_col_index1, feature_col_index2, test
+
+num_scale_scale_tests = 0
+num_nominal_nominal_tests = 0
+num_ordinal_ordinal_tests = 0
+num_nominal_scale_tests = 0
+
+pair2row = matrix(0, rows=numPairs, cols=2)
+for( i in 1:s1size, check=0) {
+    pre_a1 = castAsScalar(S1[1,i]);
+    pre_k1 = castAsScalar(K1[1,i]);
+
+    for( j in 1:s2size, check=0) {
+        pre_pairID = (i-1)*s2size+j; 
+        pre_a2 = castAsScalar(S2[1,j]);
+        pre_k2 = castAsScalar(K2[1,j]);
+	
+	if (pre_k1 == pre_k2) {
+            if (pre_k1 == 1) {
+	        	num_scale_scale_tests = num_scale_scale_tests + 1
+				pair2row[pre_pairID,1] = num_scale_scale_tests
+            } else {
+	      		num_nominal_nominal_tests = num_nominal_nominal_tests + 1
+				pair2row[pre_pairID,1] = num_nominal_nominal_tests
+		
+                if ( pre_k1 == 3 ) {
+		    		num_ordinal_ordinal_tests = num_ordinal_ordinal_tests + 1
+		    		pair2row[pre_pairID, 2] = num_ordinal_ordinal_tests
+                }
+            }
+        }
+        else {
+            if (pre_k1 == 1 | pre_k2 == 1) {
+	        	num_nominal_scale_tests = num_nominal_scale_tests + 1
+				pair2row[pre_pairID,1] = num_nominal_scale_tests
+            } else {
+	        	num_nominal_nominal_tests = num_nominal_nominal_tests + 1
+				pair2row[pre_pairID,1] = num_nominal_nominal_tests 
+            }
+		}
+    }
+}
+
+size_scale_scale_tests     = max(num_scale_scale_tests, 1);
+size_nominal_nominal_tests = max(num_nominal_nominal_tests, 1)
+size_ordinal_ordinal_tests = max(num_ordinal_ordinal_tests, 1);
+size_nominal_scale_tests   = max(num_nominal_scale_tests, 1);
+
+basestats                 = matrix(0, rows=11, cols=numPairs);
+basestats_scale_scale     = matrix(0, rows=6, cols=size_scale_scale_tests)
+basestats_nominal_nominal = matrix(0, rows=6, cols=size_nominal_nominal_tests)
+basestats_ordinal_ordinal = matrix(0, rows=3, cols=size_ordinal_ordinal_tests)
+basestats_nominal_scale   = matrix(0, rows=11, cols=size_nominal_scale_tests)
+
+
+# Compute max domain size among all categorical attributes
+# and check if these cols have been recoded
+
+debug_str = "Stopping execution of DML script due to invalid input";
+
+error_flag = FALSE;
+
+maxs = colMaxs(D);
+mins = colMins(D)
+maxDomainSize = -1.0;
+for(k in 1:ncol(K1) ) {
+  type = as.scalar(K1[1,k]);
+  
+  if ( type > 1) {
+    colID = as.scalar(S1[1,k]);
+    
+    colMaximum = as.scalar(maxs[1,colID]);
+    if(maxDomainSize < colMaximum) maxDomainSize = colMaximum;
+  
+  	colMinimum = as.scalar(mins[1,colID]);
+  	if(colMinimum < 1){
+  	  if(type == 2)
+  	    debug_str = append(debug_str, "Column " + colID + " was declared as nominal but its minimum value is " + colMinimum)
+  	  else
+  	    debug_str = append(debug_str, "Column " + colID + " was declared as ordinal but its minimum value is " + colMinimum)
+  	  error_flag = TRUE;
+  	}
+  }
+}
+
+for(k in 1:ncol(K2) ) {
+  type = as.scalar(K2[1,k]);
+  
+  if ( type > 1) {
+    colID = as.scalar(S2[1,k]);
+    
+    colMaximum = as.scalar(maxs[1,colID]);
+    if(maxDomainSize < colMaximum) maxDomainSize = colMaximum;
+  
+  	colMinimum = as.scalar(mins[1,colID]);
+  	if(colMinimum < 1){
+  	  if(type == 2)
+  	    debug_str = append(debug_str, "Column " + colID + " was declared as nominal but its minimum value is " + colMinimum)
+  	  else 
+  	  	debug_str = append(debug_str, "Column " + colID + " was declared as ordinal but its minimum value is " + colMinimum)
+  	  error_flag = TRUE;
+  	}
+  }
+}
+maxDomain = as.integer(maxDomainSize);
+
+if(error_flag) stop(debug_str);
+
+parfor( i in 1:s1size, check=0) {
+    a1 = castAsScalar(S1[1,i]);
+    k1 = castAsScalar(K1[1,i]);
+    A1 = D[,a1];
+
+    parfor( j in 1:s2size, check=0) {
+        pairID = (i-1)*s2size+j; 
+        a2 = castAsScalar(S2[1,j]);
+        k2 = castAsScalar(K2[1,j]);
+        A2 = D[,a2];
+
+		rowid1 = castAsScalar(pair2row[pairID, 1])
+    	rowid2 = castAsScalar(pair2row[pairID, 2])
+
+        if (k1 == k2) {
+            if (k1 == 1) {
+                # scale-scale
+                print("[" + i + "," + j + "] scale-scale");
+                [r, cov, sigma1, sigma2] = bivar_ss(A1,A2);   
+		
+    			basestats_scale_scale[1,rowid1] = a1;
+				basestats_scale_scale[2,rowid1] = a2;	
+                basestats_scale_scale[3,rowid1] = r;
+                basestats_scale_scale[4,rowid1] = cov;
+                basestats_scale_scale[5,rowid1] = sigma1;
+                basestats_scale_scale[6,rowid1] = sigma2;
+            } else {
+                # nominal-nominal or ordinal-ordinal
+                print("[" + i + "," + j + "] categorical-categorical");
+                [chisq, df, pval, cramersv]  = bivar_cc(A1, A2, maxDomain);
+
+                basestats_nominal_nominal[1,rowid1] = a1;
+				basestats_nominal_nominal[2,rowid1] = a2;	
+                basestats_nominal_nominal[3,rowid1] = chisq;
+                basestats_nominal_nominal[4,rowid1] = df;
+                basestats_nominal_nominal[5,rowid1] = pval;
+                basestats_nominal_nominal[6,rowid1] = cramersv;
+
+                if ( k1 == 3 ) {
+                    # ordinal-ordinal
+                    print("[" + i + "," + j + "] ordinal-ordinal");
+                    sp = bivar_oo(A1, A2, maxDomain);
+
+                    basestats_ordinal_ordinal[1,rowid2] = a1;
+                    basestats_ordinal_ordinal[2,rowid2] = a2;
+                    basestats_ordinal_ordinal[3,rowid2] = sp;
+                }
+            }
+        } else {
+            if (k1 == 1 | k2 == 1) {
+                # Scale-nominal/ordinal     
+                print("[" + i + "," + j + "] scale-categorical");
+                
+               	if ( k1 == 1 ) {
+                	[eta, f, pval, bw_ss, within_ss, bw_df, within_df, bw_mean_square, within_mean_square] = bivar_sc(A1, A2, maxDomain);
+                } else {
+                    [eta, f, pval, bw_ss, within_ss, bw_df, within_df, bw_mean_square, within_mean_square] = bivar_sc(A2, A1, maxDomain);
+                }
+		
+                basestats_nominal_scale[1,rowid1] = a1;
+                basestats_nominal_scale[2,rowid1] = a2;
+                basestats_nominal_scale[3,rowid1] = eta;
+                basestats_nominal_scale[4,rowid1] = f;
+                basestats_nominal_scale[5,rowid1] = pval;
+                basestats_nominal_scale[6,rowid1] = bw_ss;
+                basestats_nominal_scale[7,rowid1] = within_ss;
+                basestats_nominal_scale[8,rowid1] = bw_df;
+                basestats_nominal_scale[9,rowid1] = within_df;
+                basestats_nominal_scale[10,rowid1] = bw_mean_square;
+                basestats_nominal_scale[11,rowid1] = within_mean_square;
+            } else {
+                # nominal-ordinal or ordinal-nominal
+                print("[" + i + "," + j + "] categorical-categorical");
+                [chisq, df, pval, cramersv]  = bivar_cc(A1, A2, maxDomain);
+
+				basestats_nominal_nominal[1,rowid1] = a1;
+				basestats_nominal_nominal[2,rowid1] = a2;
+                basestats_nominal_nominal[3,rowid1] = chisq;
+                basestats_nominal_nominal[4,rowid1] = df;
+                basestats_nominal_nominal[5,rowid1] = pval;
+                basestats_nominal_nominal[6,rowid1] = cramersv;
+            }
+        }
+    }
+}
+
+if(num_scale_scale_tests == size_scale_scale_tests){
+  write(basestats_scale_scale, $OUTDIR + "/bivar.scale.scale.stats");
+}
+
+if(num_nominal_scale_tests == size_nominal_scale_tests){
+  write(basestats_nominal_scale, $OUTDIR + "/bivar.nominal.scale.stats");
+}
+
+if(num_nominal_nominal_tests == size_nominal_nominal_tests){
+  write(basestats_nominal_nominal, $OUTDIR + "/bivar.nominal.nominal.stats");
+}
+
+if(num_ordinal_ordinal_tests == size_ordinal_ordinal_tests){
+  write(basestats_ordinal_ordinal, $OUTDIR + "/bivar.ordinal.ordinal.stats");
+}
+
+# -----------------------------------------------------------------------------------------------------------
+
+bivar_cc = function(Matrix[Double] A, Matrix[Double] B, Double maxDomain) return (Double chisq, Double df, Double pval, Double cramersv) {
+
+    # Contingency Table
+    F = table(A, B, maxDomain, maxDomain);
+    F = F[1:max(A), 1:max(B)];
+
+    # Chi-Squared
+    W = sum(F);
+    r = rowSums(F);
+    c = colSums(F);
+    E = (r %*% c)/W;
+    T = (F-E)^2/E;
+    chi_squared = sum(T);
+
+    # compute p-value
+    degFreedom = (nrow(F)-1)*(ncol(F)-1);
+    pValue = pchisq(target=chi_squared, df=degFreedom, lower.tail=FALSE);
+
+    # Cramer's V
+    R = nrow(F);
+    C = ncol(F);
+    q = min(R,C);
+    cramers_v = sqrt(chi_squared/(W*(q-1)));
+
+    # Assign return values
+    chisq = chi_squared;
+    df = as.double(degFreedom);
+    pval = pValue;
+    cramersv = cramers_v;
+}
+
+# -----------------------------------------------------------------------------------------------------------
+
+bivar_ss = function(Matrix[Double] X, Matrix[Double] Y) return (Double R, Double covXY, Double sigmaX, Double sigmaY) {
+
+    # Unweighted co-variance
+    covXY = cov(X,Y);
+
+    # compute standard deviations for both X and Y by computing 2^nd central moment
+    W = nrow(X);
+    m2X = moment(X,2);
+    m2Y = moment(Y,2);
+    sigmaX = sqrt(m2X * (W/(W-1.0)) );
+    sigmaY = sqrt(m2Y * (W/(W-1.0)) );
+
+    # Pearson's R
+    R = covXY / (sigmaX*sigmaY);
+}
+
+# -----------------------------------------------------------------------------------------------------------
+
+# Y points to SCALE variable
+# A points to CATEGORICAL variable
+bivar_sc = function(Matrix[Double] Y, Matrix[Double] A, Double maxDomain) 
+		   return (Double Eta, Double AnovaF, Double pval, Double bw_ss, Double within_ss, Double bw_df, Double within_df, Double bw_mean_square, Double within_mean_square) {
+
+    # mean and variance in target variable
+    W = nrow(A);
+    my = mean(Y);
+    varY = moment(Y,2) * W/(W-1.0)
+
+    # category-wise (frequencies, means, variances)
+    CFreqs = aggregate(target=Y, groups=A, fn="count", ngroups=maxDomain); 
+    CMeans = aggregate(target=Y, groups=A, fn="mean", ngroups=maxDomain);
+    CVars =  aggregate(target=Y, groups=A, fn="variance", ngroups=maxDomain);
+    
+    # number of categories
+    R = nrow(CFreqs);
+
+    Eta = sqrt(1 - ( sum((CFreqs-1)*CVars) / ((W-1)*varY) ));
+
+    bw_ss = sum( (CFreqs*(CMeans-my)^2) );
+    bw_df = as.double(R-1);
+    bw_mean_square = bw_ss/bw_df;
+	
+    within_ss = sum( (CFreqs-1)*CVars );
+    within_df = as.double(W-R);
+    within_mean_square = within_ss/within_df;
+	
+    AnovaF = bw_mean_square/within_mean_square;
+    
+    pval = pf(target=AnovaF, df1=bw_df, df2=within_df, lower.tail=FALSE)
+}
+
+
+# -----------------------------------------------------------------------------------------------------------
+# Function to compute ranks
+# takes a column vector as input, and produces a vector of same size in which each cell denotes to the computed score for that category
+computeRanks = function(Matrix[Double] X) return (Matrix[Double] Ranks) {
+    Ranks = cumsum(X) - X/2 + 1/2;
+}
+
+#-------------------------------------------------------------------------
+
+bivar_oo = function(Matrix[Double] A, Matrix[Double] B, Double maxDomain) return (Double sp) {
+
+    # compute contingency table
+    F = table(A, B, maxDomain, maxDomain);
+    F = F[1:max(A), 1:max(B)];
+    
+    catA = nrow(F);  # number of categories in A
+    catB = ncol(F);  # number of categories in B
+
+    # compute category-wise counts for both the attributes
+    R = rowSums(F);
+    S = colSums(F);
+
+    # compute scores, both are column vectors
+    [C] = computeRanks(R);
+    meanX = mean(C,R); 
+
+    columnS = t(S);
+    [D] = computeRanks(columnS);
+
+    # scores (C,D) are individual values, and counts (R,S) act as weights
+    meanY = mean(D,columnS);
+
+    W = sum(F); # total weight, or total #cases
+    varX = moment(C,R,2)*(W/(W-1.0));
+    varY = moment(D,columnS,2)*(W/(W-1.0));
+    covXY = sum( t(F/(W-1) * (C-meanX)) * (D-meanY) );
+
+    sp = covXY/(sqrt(varX)*sqrt(varY));
+}
+
+# -----------------------------------------------------------------------------------------------------------

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/816e2db8/scripts/algorithms/decision-tree-predict.dml
----------------------------------------------------------------------
diff --git a/scripts/algorithms/decision-tree-predict.dml b/scripts/algorithms/decision-tree-predict.dml
index 9e01adb..3447da6 100644
--- a/scripts/algorithms/decision-tree-predict.dml
+++ b/scripts/algorithms/decision-tree-predict.dml
@@ -1,142 +1,142 @@
-#-------------------------------------------------------------
-#
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-# 
-#   http://www.apache.org/licenses/LICENSE-2.0
-# 
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-#
-#-------------------------------------------------------------
-
-#  
-# THIS SCRIPT COMPUTES LABEL PREDICTIONS MEANT FOR USE WITH A DECISION TREE MODEL ON A HELD OUT TEST SET.
-#
-# INPUT         PARAMETERS:
-# ---------------------------------------------------------------------------------------------
-# NAME          TYPE     DEFAULT      MEANING
-# ---------------------------------------------------------------------------------------------
-# X             String   ---          Location to read the test feature matrix X; note that X needs to be both recoded and dummy coded 
-# Y	 		    String   " "		  Location to read the true label matrix Y if requested; note that Y needs to be both recoded and dummy coded
-# R   	  		String   " "	      Location to read matrix R which for each feature in X contains the following information 
-#										- R[,1]: column ids
-#										- R[,2]: start indices 
-#										- R[,3]: end indices
-#									  If R is not provided by default all variables are assumed to be scale
-# M             String 	 ---	   	  Location to read matrix M containing the learned tree in the following format
-#								 		- M[1,j]: id of node j (in a complete binary tree)
-#	 									- M[2,j]: Offset (no. of columns) to left child of j if j is an internal node, otherwise 0
-#	 									- M[3,j]: Feature index of the feature that node j looks at if j is an internal node, otherwise 0
-#	 									- M[4,j]: Type of the feature that node j looks at if j is an internal node: 1 for scale and 2 for categorical features, 
-#		     									  otherwise the label that leaf node j is supposed to predict
-#	 									- M[5,j]: If j is an internal node: 1 if the feature chosen for j is scale, otherwise the size of the subset of values 
-#			 									  stored in rows 6,7,... if j is categorical 
-#						 						  If j is a leaf node: number of misclassified samples reaching at node j 
-#	 									- M[6:,j]: If j is an internal node: Threshold the example's feature value is compared to is stored at M[6,j] 
-#							   					   if the feature chosen for j is scale, otherwise if the feature chosen for j is categorical rows 6,7,... 
-#												   depict the value subset chosen for j
-#	          									   If j is a leaf node 1 if j is impure and the number of samples at j > threshold, otherwise 0
-# P				String   ---		  Location to store the label predictions for X
-# A     		String   " "          Location to write the test accuracy (%) for the prediction if requested
-# CM     		String   " "		  Location to write the confusion matrix if requested 
-# fmt     	    String   "text"       The output format of the output, such as "text" or "csv"
-# ---------------------------------------------------------------------------------------------
-# OUTPUT: 
-#	1- Matrix Y containing the predicted labels for X 
-#   2- Test accuracy if requested
-#   3- Confusion matrix C if requested
-# -------------------------------------------------------------------------------------------
-# HOW TO INVOKE THIS SCRIPT - EXAMPLE:
-# hadoop jar SystemML.jar -f decision-tree-predict.dml -nvargs X=INPUT_DIR/X Y=INPUT_DIR/Y R=INPUT_DIR/R M=INPUT_DIR/model P=OUTPUT_DIR/predictions
-#														A=OUTPUT_DIR/accuracy CM=OUTPUT_DIR/confusion fmt=csv
-
-fileX = $X;
-fileM = $M;
-fileP = $P;
-fileY = ifdef ($Y, " ");
-fileR = ifdef ($R, " ");
-fileCM = ifdef ($CM, " ");
-fileA = ifdef ($A, " ");
-fmtO = ifdef ($fmt, "text");
-X_test = read (fileX);
-M = read (fileM);
-
-num_records = nrow (X_test);
-Y_predicted = matrix (0, rows = num_records, cols = 1);
-
-R_cat = matrix (0, rows = 1, cols = 1);
-R_scale = matrix (0, rows = 1, cols = 1);
-
-if (fileR != " ") {
-	R = read (fileR);
-	dummy_coded = ppred (R[,2], R[,3], "!=");
-	R_scale = removeEmpty (target = R[,2] * (1 - dummy_coded), margin = "rows");
-	R_cat = removeEmpty (target = R[,2:3] * dummy_coded, margin = "rows");
-} else { # only scale features available
-	R_scale = seq (1, ncol (X_test));
-}
-
-parfor (i in 1:num_records, check = 0) {
-	cur_sample = X_test[i,];
-	cur_node_pos = 1;
-	label_found = FALSE;
-	while (!label_found) {
-		cur_feature = as.scalar (M[3,cur_node_pos]);	
-		type_label = as.scalar (M[4,cur_node_pos]);
-		if (cur_feature == 0) { # leaf node
-			label_found = TRUE;
-			Y_predicted[i,] = type_label;
-		} else {
-			# determine type: 1 for scale, 2 for categorical 
-			if (type_label == 1) { # scale feature
-				cur_start_ind = as.scalar (R_scale[cur_feature,]);
-				cur_value = as.scalar (cur_sample[,cur_start_ind]);
-				cur_split = as.scalar (M[6,cur_node_pos]);
-				if (cur_value < cur_split) { # go to left branch
-					cur_node_pos = cur_node_pos + as.scalar (M[2,cur_node_pos]);
-				} else { # go to right branch
-					cur_node_pos = cur_node_pos + as.scalar (M[2,cur_node_pos]) + 1;
-				}
-			} else if (type_label == 2) { # categorical feature				
-				cur_start_ind = as.scalar (R_cat[cur_feature,1]);
-				cur_end_ind = as.scalar (R_cat[cur_feature,2]);					
-				cur_value = as.scalar (rowIndexMax(cur_sample[,cur_start_ind:cur_end_ind])); # as.scalar (cur_sample[,cur_feature]);
-				cur_offset = as.scalar (M[5,cur_node_pos]);
-				value_found = sum (ppred (M[6:(6 + cur_offset - 1),cur_node_pos], cur_value, "=="));
-				if (value_found) { # go to left branch
-					cur_node_pos = cur_node_pos + as.scalar (M[2,cur_node_pos]);
-				} else { # go to right branch
-					cur_node_pos = cur_node_pos + as.scalar (M[2,cur_node_pos]) + 1;
-				}
-}}}}
-
-write (Y_predicted, fileP, format = fmtO);
-
-if (fileY != " ") {
-	Y_test = read (fileY);
-	num_classes = ncol (Y_test);
-	Y_test = rowSums (Y_test * t (seq (1, num_classes)));
-	result = ppred (Y_test, Y_predicted, "==");
-	result = sum (result);
-	accuracy = result / num_records * 100;
-	acc_str = "Accuracy (%): " + accuracy;
-	if (fileA != " ") {
-		write (acc_str, fileA, format = fmtO);
-	} else {
-		print (acc_str);
-	}
-	if (fileCM != " ") {
-		confusion_mat = table(Y_predicted, Y_test, num_classes, num_classes)
-        write(confusion_mat, fileCM, format = fmtO)
-	}
-}
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+# 
+#   http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+#  
+# THIS SCRIPT COMPUTES LABEL PREDICTIONS MEANT FOR USE WITH A DECISION TREE MODEL ON A HELD OUT TEST SET.
+#
+# INPUT         PARAMETERS:
+# ---------------------------------------------------------------------------------------------
+# NAME          TYPE     DEFAULT      MEANING
+# ---------------------------------------------------------------------------------------------
+# X             String   ---          Location to read the test feature matrix X; note that X needs to be both recoded and dummy coded 
+# Y	 		    String   " "		  Location to read the true label matrix Y if requested; note that Y needs to be both recoded and dummy coded
+# R   	  		String   " "	      Location to read matrix R which for each feature in X contains the following information 
+#										- R[,1]: column ids
+#										- R[,2]: start indices 
+#										- R[,3]: end indices
+#									  If R is not provided by default all variables are assumed to be scale
+# M             String 	 ---	   	  Location to read matrix M containing the learned tree in the following format
+#								 		- M[1,j]: id of node j (in a complete binary tree)
+#	 									- M[2,j]: Offset (no. of columns) to left child of j if j is an internal node, otherwise 0
+#	 									- M[3,j]: Feature index of the feature that node j looks at if j is an internal node, otherwise 0
+#	 									- M[4,j]: Type of the feature that node j looks at if j is an internal node: 1 for scale and 2 for categorical features, 
+#		     									  otherwise the label that leaf node j is supposed to predict
+#	 									- M[5,j]: If j is an internal node: 1 if the feature chosen for j is scale, otherwise the size of the subset of values 
+#			 									  stored in rows 6,7,... if j is categorical 
+#						 						  If j is a leaf node: number of misclassified samples reaching at node j 
+#	 									- M[6:,j]: If j is an internal node: Threshold the example's feature value is compared to is stored at M[6,j] 
+#							   					   if the feature chosen for j is scale, otherwise if the feature chosen for j is categorical rows 6,7,... 
+#												   depict the value subset chosen for j
+#	          									   If j is a leaf node 1 if j is impure and the number of samples at j > threshold, otherwise 0
+# P				String   ---		  Location to store the label predictions for X
+# A     		String   " "          Location to write the test accuracy (%) for the prediction if requested
+# CM     		String   " "		  Location to write the confusion matrix if requested 
+# fmt     	    String   "text"       The output format of the output, such as "text" or "csv"
+# ---------------------------------------------------------------------------------------------
+# OUTPUT: 
+#	1- Matrix Y containing the predicted labels for X 
+#   2- Test accuracy if requested
+#   3- Confusion matrix C if requested
+# -------------------------------------------------------------------------------------------
+# HOW TO INVOKE THIS SCRIPT - EXAMPLE:
+# hadoop jar SystemML.jar -f decision-tree-predict.dml -nvargs X=INPUT_DIR/X Y=INPUT_DIR/Y R=INPUT_DIR/R M=INPUT_DIR/model P=OUTPUT_DIR/predictions
+#														A=OUTPUT_DIR/accuracy CM=OUTPUT_DIR/confusion fmt=csv
+
+fileX = $X;
+fileM = $M;
+fileP = $P;
+fileY = ifdef ($Y, " ");
+fileR = ifdef ($R, " ");
+fileCM = ifdef ($CM, " ");
+fileA = ifdef ($A, " ");
+fmtO = ifdef ($fmt, "text");
+X_test = read (fileX);
+M = read (fileM);
+
+num_records = nrow (X_test);
+Y_predicted = matrix (0, rows = num_records, cols = 1);
+
+R_cat = matrix (0, rows = 1, cols = 1);
+R_scale = matrix (0, rows = 1, cols = 1);
+
+if (fileR != " ") {
+	R = read (fileR);
+	dummy_coded = ppred (R[,2], R[,3], "!=");
+	R_scale = removeEmpty (target = R[,2] * (1 - dummy_coded), margin = "rows");
+	R_cat = removeEmpty (target = R[,2:3] * dummy_coded, margin = "rows");
+} else { # only scale features available
+	R_scale = seq (1, ncol (X_test));
+}
+
+parfor (i in 1:num_records, check = 0) {
+	cur_sample = X_test[i,];
+	cur_node_pos = 1;
+	label_found = FALSE;
+	while (!label_found) {
+		cur_feature = as.scalar (M[3,cur_node_pos]);	
+		type_label = as.scalar (M[4,cur_node_pos]);
+		if (cur_feature == 0) { # leaf node
+			label_found = TRUE;
+			Y_predicted[i,] = type_label;
+		} else {
+			# determine type: 1 for scale, 2 for categorical 
+			if (type_label == 1) { # scale feature
+				cur_start_ind = as.scalar (R_scale[cur_feature,]);
+				cur_value = as.scalar (cur_sample[,cur_start_ind]);
+				cur_split = as.scalar (M[6,cur_node_pos]);
+				if (cur_value < cur_split) { # go to left branch
+					cur_node_pos = cur_node_pos + as.scalar (M[2,cur_node_pos]);
+				} else { # go to right branch
+					cur_node_pos = cur_node_pos + as.scalar (M[2,cur_node_pos]) + 1;
+				}
+			} else if (type_label == 2) { # categorical feature				
+				cur_start_ind = as.scalar (R_cat[cur_feature,1]);
+				cur_end_ind = as.scalar (R_cat[cur_feature,2]);					
+				cur_value = as.scalar (rowIndexMax(cur_sample[,cur_start_ind:cur_end_ind])); # as.scalar (cur_sample[,cur_feature]);
+				cur_offset = as.scalar (M[5,cur_node_pos]);
+				value_found = sum (ppred (M[6:(6 + cur_offset - 1),cur_node_pos], cur_value, "=="));
+				if (value_found) { # go to left branch
+					cur_node_pos = cur_node_pos + as.scalar (M[2,cur_node_pos]);
+				} else { # go to right branch
+					cur_node_pos = cur_node_pos + as.scalar (M[2,cur_node_pos]) + 1;
+				}
+}}}}
+
+write (Y_predicted, fileP, format = fmtO);
+
+if (fileY != " ") {
+	Y_test = read (fileY);
+	num_classes = ncol (Y_test);
+	Y_test = rowSums (Y_test * t (seq (1, num_classes)));
+	result = ppred (Y_test, Y_predicted, "==");
+	result = sum (result);
+	accuracy = result / num_records * 100;
+	acc_str = "Accuracy (%): " + accuracy;
+	if (fileA != " ") {
+		write (acc_str, fileA, format = fmtO);
+	} else {
+		print (acc_str);
+	}
+	if (fileCM != " ") {
+		confusion_mat = table(Y_predicted, Y_test, num_classes, num_classes)
+        write(confusion_mat, fileCM, format = fmtO)
+	}
+}