You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@systemml.apache.org by du...@apache.org on 2016/01/22 17:34:06 UTC

[30/51] [partial] incubator-systemml git commit: [SYSTEMML-482] [SYSTEMML-480] Adding a Git attributes file to enforce Unix-styled line endings, and normalizing all of the line endings.

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/05d2c0a8/src/test/scripts/applications/ctableStats/stratstats.dml
----------------------------------------------------------------------
diff --git a/src/test/scripts/applications/ctableStats/stratstats.dml b/src/test/scripts/applications/ctableStats/stratstats.dml
index 3ac684a..7eb1858 100644
--- a/src/test/scripts/applications/ctableStats/stratstats.dml
+++ b/src/test/scripts/applications/ctableStats/stratstats.dml
@@ -1,350 +1,350 @@
-#-------------------------------------------------------------
-#
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-# 
-#   http://www.apache.org/licenses/LICENSE-2.0
-# 
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-#
-#-------------------------------------------------------------
-
-# STRATIFIED BIVARIATE STATISTICS, VERSION 2
-# 
-# INPUT  1: Dataset with records as rows (matrix filename)
-# INPUT  2: The stratum ID column number (integer)
-#   Stratum ID must be a small positive integer; fractional values are rounded; if 0 or less, shifted to positive.
-# INPUT  3: 1st variate column numbers (matrix filename)
-# INPUT  4: 2nd variate column numbers (matrix filename)
-# INPUT  5: Output (matrix filename)
-#
-# OUTPUT 1: Output Matrix with 40 columns, containing the following information:
-#     Rows: One row per each distinct pair (1st variate, 2nd variate)
-#     Col 01: 1st variate column number
-#     Col 02: 1st variate global presence count
-#     Col 03: 1st variate global mean
-#     Col 04: 1st variate global standard deviation
-#     Col 05: 1st variate stratified standard deviation
-#     Col 06: R-squared, 1st variate vs. strata
-#     Col 07: P-value, 1st variate vs. strata
-#     Col 08-10: Reserved
-#     Col 11: 2nd variate column number
-#     Col 12: 2nd variate global presence count
-#     Col 13: 2nd variate global mean
-#     Col 14: 2nd variate global standard deviation
-#     Col 15: 2nd variate stratified standard deviation
-#     Col 16: R-squared, 2nd variate vs. strata
-#     Col 17: P-value, 2nd variate vs. strata
-#     Col 18-20: Reserved
-#     Col 21: Global 1st & 2nd variate presence count
-#     Col 22: Global regression slope (2nd variate vs. 1st variate)
-#     Col 23: Global regression slope standard deviation
-#     Col 24: Global correlation = +/- sqrt(R-squared)
-#     Col 25: Global residual standard deviation
-#     Col 26: Global R-squared
-#     Col 27: Global P-value for hypothesis "slope = 0"
-#     Col 28-30: Reserved
-#     Col 31: Stratified 1st & 2nd variate presence count
-#     Col 32: Stratified regression slope (2nd variate vs. 1st variate)
-#     Col 33: Stratified regression slope standard deviation
-#     Col 34: Stratified correlation = +/- sqrt(R-squared)
-#     Col 35: Stratified residual standard deviation
-#     Col 36: Stratified R-squared
-#     Col 37: Stratified P-value for hypothesis "slope = 0"
-#     Col 38: Number of strata with at least two counted points
-#     Col 39-40: Reserved
-#     TO DO: GOODNESS OF FIT MEASURE
-#
-# EXAMPLE:
-# hadoop jar SystemML.jar -f PATH/stratstats.dml -exec singlenode -args PATH/stratstats_test_data.mtx 1 PATH/stratstats_test_X.mtx PATH/stratstats_test_Y.mtx PATH/stratstats_test_output.mtx
-
-NaN = 0/0;
-
-print ("BEGIN STRATIFIED STATISTICS SCRIPT");
-
-print ("Reading the input matrices...");
-
-DataWithNaNs = read ($1, format = "text");
-Xcols = read ($3, format = "text");
-Ycols = read ($4, format = "text");
-stratum_column_id = $2;
-num_records  = nrow(DataWithNaNs);
-num_attrs    = ncol(DataWithNaNs);
-num_attrs_X  = ncol(Xcols);
-num_attrs_Y  = ncol(Ycols);
-num_attrs_XY = num_attrs_X * num_attrs_Y;
-
-
-print ("Preparing the variates...");
-
-Data = deNaN (DataWithNaNs);
-DataNaNmask = ppred (DataWithNaNs, NaN, "==");
-
-tXcols = t(Xcols);
-ones = matrix (1.0, rows = num_attrs_X, cols = 1);
-one_to_num_attrs_X = sumup (ones);
-ProjX = matrix (0.0, rows = num_attrs, cols = num_attrs_X);
-ProjX_ctable = table (tXcols, one_to_num_attrs_X);
-ProjX [1:nrow(ProjX_ctable), ] = ProjX_ctable;
-X = Data %*% ProjX;
-X_mask = 1 - (DataNaNmask %*% ProjX);
-
-tYcols = t(Ycols);
-ones = matrix (1.0, rows = num_attrs_Y, cols = 1);
-one_to_num_attrs_Y = sumup (ones);
-ProjY = matrix (0.0, rows = num_attrs, cols = num_attrs_Y);
-ProjY_ctable = table (tYcols, one_to_num_attrs_Y);
-ProjY [1:nrow(ProjY_ctable), ] = ProjY_ctable;
-Y = Data %*% ProjY;
-Y_mask = 1 - (DataNaNmask %*% ProjY);
-
-
-print ("Preparing the strata...");
-
-Proj_to_deNaN_strata = diag (1 - DataNaNmask [, stratum_column_id]);
-Proj_to_deNaN_strata = removeEmpty (target = Proj_to_deNaN_strata, margin = "rows");
-vector_of_strata_with_empty_but_no_NaNs = round (Proj_to_deNaN_strata %*% (Data [, stratum_column_id]));
-vector_of_strata_with_empty_but_no_NaNs = vector_of_strata_with_empty_but_no_NaNs + (1 - min (vector_of_strata_with_empty_but_no_NaNs));
-num_strata_with_empty_but_no_NaNs = max (vector_of_strata_with_empty_but_no_NaNs);
-num_records_with_nonNaN_strata = nrow (Proj_to_deNaN_strata);
-ones = matrix (1.0, rows = num_records_with_nonNaN_strata, cols = 1);
-one_to_num_records_with_nonNaN_strata = sumup (ones);
-StrataSummator_with_empty_from_nonNaNs = table (vector_of_strata_with_empty_but_no_NaNs, one_to_num_records_with_nonNaN_strata);
-StrataSummator_from_nonNaNs = removeEmpty (target = StrataSummator_with_empty_from_nonNaNs, margin = "rows");
-StrataSummator = StrataSummator_from_nonNaNs %*% Proj_to_deNaN_strata;
-num_strata = nrow (StrataSummator);
-num_empty_strata = num_strata_with_empty_but_no_NaNs - num_strata;
-print ("There are " + num_strata + " nonempty strata and " + num_empty_strata + " empty but non-NaN strata.");
-
-print ("Computing the global single-variate statistics...");
-
-cnt_X_global = colSums (X_mask);
-cnt_Y_global = colSums (Y_mask);
-avg_X_global = colSums (X) / cnt_X_global;
-avg_Y_global = colSums (Y) / cnt_Y_global;
-var_sumX_global = colSums (X * X) - cnt_X_global * (avg_X_global * avg_X_global);
-var_sumY_global = colSums (Y * Y) - cnt_Y_global * (avg_Y_global * avg_Y_global);
-                 sqrt_failsafe_input_1 = var_sumX_global / (cnt_X_global - 1);
-stdev_X_global = sqrt_failsafe (sqrt_failsafe_input_1);
-                 sqrt_failsafe_input_2 = var_sumY_global / (cnt_Y_global - 1)
-stdev_Y_global = sqrt_failsafe (sqrt_failsafe_input_2);
-
-print ("Computing the stratified single-variate statistics...");
-
-# Compute per-stratum statistics, prevent div-0 for locally empty (NaN-filled) strata
-
-Cnt_X_per_stratum = StrataSummator %*% X_mask;
-Cnt_Y_per_stratum = StrataSummator %*% Y_mask;
-Is_none_X_per_stratum = ppred (Cnt_X_per_stratum, 0, "==");
-Is_none_Y_per_stratum = ppred (Cnt_Y_per_stratum, 0, "==");
-One_over_cnt_X_per_stratum = (1 - Is_none_X_per_stratum) / (Cnt_X_per_stratum + Is_none_X_per_stratum);
-One_over_cnt_Y_per_stratum = (1 - Is_none_Y_per_stratum) / (Cnt_Y_per_stratum + Is_none_Y_per_stratum);
-num_X_nonempty_strata = num_strata - colSums (Is_none_X_per_stratum);
-num_Y_nonempty_strata = num_strata - colSums (Is_none_Y_per_stratum);
-
-Sum_X_per_stratum  = StrataSummator %*% X;
-Sum_Y_per_stratum  = StrataSummator %*% Y;
-
-# Recompute some global statistics to exclude NaN-stratum records
-
-cnt_X_excluding_NaNstrata = colSums (Cnt_X_per_stratum);
-cnt_Y_excluding_NaNstrata = colSums (Cnt_Y_per_stratum);
-sum_X_excluding_NaNstrata = colSums (Sum_X_per_stratum);
-sum_Y_excluding_NaNstrata = colSums (Sum_Y_per_stratum);
-var_sumX_excluding_NaNstrata = colSums (StrataSummator %*% (X * X)) - (sum_X_excluding_NaNstrata * sum_X_excluding_NaNstrata) / cnt_X_excluding_NaNstrata;
-var_sumY_excluding_NaNstrata = colSums (StrataSummator %*% (Y * Y)) - (sum_Y_excluding_NaNstrata * sum_Y_excluding_NaNstrata) / cnt_Y_excluding_NaNstrata;
-
-# Compute the stratified statistics
-
-var_sumX_stratified = colSums (StrataSummator %*% (X * X)) - colSums (One_over_cnt_X_per_stratum * Sum_X_per_stratum * Sum_X_per_stratum);
-var_sumY_stratified = colSums (StrataSummator %*% (Y * Y)) - colSums (One_over_cnt_Y_per_stratum * Sum_Y_per_stratum * Sum_Y_per_stratum);
-                      sqrt_failsafe_input_3 = var_sumX_stratified / (cnt_X_excluding_NaNstrata - num_X_nonempty_strata);
-stdev_X_stratified  = sqrt_failsafe (sqrt_failsafe_input_3);
-                      sqrt_failsafe_input_4 = var_sumY_stratified / (cnt_Y_excluding_NaNstrata - num_Y_nonempty_strata);
-stdev_Y_stratified  = sqrt_failsafe (sqrt_failsafe_input_4);
-r_sqr_X_vs_strata   = 1 - var_sumX_stratified / var_sumX_excluding_NaNstrata;
-r_sqr_Y_vs_strata   = 1 - var_sumY_stratified / var_sumY_excluding_NaNstrata;
-fStat_X_vs_strata   = ((var_sumX_excluding_NaNstrata - var_sumX_stratified) / (num_X_nonempty_strata - 1)) / (var_sumX_stratified / (cnt_X_excluding_NaNstrata - num_X_nonempty_strata));
-fStat_Y_vs_strata   = ((var_sumY_excluding_NaNstrata - var_sumY_stratified) / (num_Y_nonempty_strata - 1)) / (var_sumY_stratified / (cnt_Y_excluding_NaNstrata - num_Y_nonempty_strata));
-p_val_X_vs_strata   = fStat_tailprob (fStat_X_vs_strata, num_X_nonempty_strata - 1, cnt_X_excluding_NaNstrata - num_X_nonempty_strata);
-p_val_Y_vs_strata   = fStat_tailprob (fStat_Y_vs_strata, num_Y_nonempty_strata - 1, cnt_Y_excluding_NaNstrata - num_Y_nonempty_strata);
-
-print ("Computing the global bivariate statistics...");
-
-# Compute the aggregate X vs. Y statistics and map them into proper positions
-
-cnt_XY_rectangle       = t(X_mask) %*% Y_mask;
-sum_X_forXY_rectangle  = t(X)      %*% Y_mask;
-sum_XX_forXY_rectangle = t(X * X)  %*% Y_mask;
-sum_Y_forXY_rectangle  = t(X_mask) %*% Y;
-sum_YY_forXY_rectangle = t(X_mask) %*% (Y * Y);
-sum_XY_rectangle       = t(X)      %*% Y;
-cnt_XY_global       = matrix (cnt_XY_rectangle,       rows = 1, cols = num_attrs_XY, byrow = TRUE);
-sum_X_forXY_global  = matrix (sum_X_forXY_rectangle,  rows = 1, cols = num_attrs_XY, byrow = TRUE);
-sum_XX_forXY_global = matrix (sum_XX_forXY_rectangle, rows = 1, cols = num_attrs_XY, byrow = TRUE);
-sum_Y_forXY_global  = matrix (sum_Y_forXY_rectangle,  rows = 1, cols = num_attrs_XY, byrow = TRUE);
-sum_YY_forXY_global = matrix (sum_YY_forXY_rectangle, rows = 1, cols = num_attrs_XY, byrow = TRUE);
-sum_XY_global       = matrix (sum_XY_rectangle,       rows = 1, cols = num_attrs_XY, byrow = TRUE);
-ones_XY = matrix (1.0, rows = 1, cols = num_attrs_XY);
-
-# Compute the global bivariate statistics for output
-
-cov_sumX_sumY_global   = sum_XY_global - sum_X_forXY_global * sum_Y_forXY_global / cnt_XY_global;
-var_sumX_forXY_global  = sum_XX_forXY_global - sum_X_forXY_global * sum_X_forXY_global / cnt_XY_global;
-var_sumY_forXY_global  = sum_YY_forXY_global - sum_Y_forXY_global * sum_Y_forXY_global / cnt_XY_global;
-slope_XY_global        = cov_sumX_sumY_global / var_sumX_forXY_global;
-                                                sqrt_failsafe_input_5 = var_sumX_forXY_global * var_sumY_forXY_global;
-                                                sqrt_failsafe_output_5 = sqrt_failsafe (sqrt_failsafe_input_5);
-corr_XY_global         = cov_sumX_sumY_global / sqrt_failsafe_output_5;
-r_sqr_X_vs_Y_global    = cov_sumX_sumY_global * cov_sumX_sumY_global / (var_sumX_forXY_global * var_sumY_forXY_global);
-                         sqrt_failsafe_input_6 = (1 - r_sqr_X_vs_Y_global) * var_sumY_forXY_global / var_sumX_forXY_global / (cnt_XY_global - 2)
-stdev_slope_XY_global  = sqrt_failsafe (sqrt_failsafe_input_6);
-                         sqrt_failsafe_input_7 = (1 - r_sqr_X_vs_Y_global) * var_sumY_forXY_global / (cnt_XY_global - 2)
-stdev_errY_vs_X_global = sqrt_failsafe (sqrt_failsafe_input_7);
-fStat_Y_vs_X_global    = (cnt_XY_global - 2) * r_sqr_X_vs_Y_global / (1 - r_sqr_X_vs_Y_global);
-p_val_Y_vs_X_global    = fStat_tailprob (fStat_Y_vs_X_global, ones_XY, cnt_XY_global - 2);
-
-print ("Computing the stratified bivariate statistics...");
-
-# Create projections to "intermingle" X and Y into attribute pairs
-
-Proj_X_to_XY = matrix (0.0, rows = num_attrs_X, cols = num_attrs_XY);
-Proj_Y_to_XY = matrix (0.0, rows = num_attrs_Y, cols = num_attrs_XY);
-ones_Y_row   = matrix (1.0, rows = 1, cols = num_attrs_Y);
-for (i in 1:num_attrs_X) {
-    start_cid = (i - 1) * num_attrs_Y + 1;
-    end_cid = i * num_attrs_Y;
-    Proj_X_to_XY [i, start_cid:end_cid] = ones_Y_row;
-    Proj_Y_to_XY [ , start_cid:end_cid] = diag (ones_Y_row);
-}
-
-# Compute per-stratum statistics, prevent div-0 for locally empty (NaN-filled) strata
-
-Cnt_XY_per_stratum       = StrataSummator %*% (( X_mask %*% Proj_X_to_XY) * ( Y_mask %*% Proj_Y_to_XY));
-Sum_X_forXY_per_stratum  = StrataSummator %*% (( X      %*% Proj_X_to_XY) * ( Y_mask %*% Proj_Y_to_XY));
-Sum_XX_forXY_per_stratum = StrataSummator %*% (((X * X) %*% Proj_X_to_XY) * ( Y_mask %*% Proj_Y_to_XY));
-Sum_Y_forXY_per_stratum  = StrataSummator %*% (( X_mask %*% Proj_X_to_XY) * ( Y      %*% Proj_Y_to_XY));
-Sum_YY_forXY_per_stratum = StrataSummator %*% (( X_mask %*% Proj_X_to_XY) * ((Y * Y) %*% Proj_Y_to_XY));
-Sum_XY_per_stratum       = StrataSummator %*% (( X      %*% Proj_X_to_XY) * ( Y      %*% Proj_Y_to_XY));
-
-Is_none_XY_per_stratum = ppred (Cnt_XY_per_stratum, 0, "==");
-One_over_cnt_XY_per_stratum = (1 - Is_none_XY_per_stratum) / (Cnt_XY_per_stratum + Is_none_XY_per_stratum);
-num_XY_nonempty_strata = num_strata - colSums (Is_none_XY_per_stratum);
-
-# Recompute some global aggregate X vs. Y statistics to exclude NaN-stratum records
-
-cnt_XY_excluding_NaNstrata = colSums (Cnt_XY_per_stratum);
-sum_XX_forXY_excluding_NaNstrata = colSums (Sum_XX_forXY_per_stratum);
-sum_YY_forXY_excluding_NaNstrata = colSums (Sum_YY_forXY_per_stratum);
-sum_XY_excluding_NaNstrata = colSums (Sum_XY_per_stratum);
-
-# Compute the stratified bivariate statistics
-
-var_sumX_forXY_stratified = sum_XX_forXY_excluding_NaNstrata - colSums (Sum_X_forXY_per_stratum * Sum_X_forXY_per_stratum * One_over_cnt_XY_per_stratum);
-var_sumY_forXY_stratified = sum_YY_forXY_excluding_NaNstrata - colSums (Sum_Y_forXY_per_stratum * Sum_Y_forXY_per_stratum * One_over_cnt_XY_per_stratum);
-cov_sumX_sumY_stratified  = sum_XY_excluding_NaNstrata       - colSums (Sum_X_forXY_per_stratum * Sum_Y_forXY_per_stratum * One_over_cnt_XY_per_stratum);
-
-slope_XY_stratified     = cov_sumX_sumY_stratified / var_sumX_forXY_stratified;
-                                                     sqrt_failsafe_input_8 = var_sumX_forXY_stratified * var_sumY_forXY_stratified;
-                                                     sqrt_failsafe_output_8 = sqrt_failsafe (sqrt_failsafe_input_8);
-corr_XY_stratified      = cov_sumX_sumY_stratified / sqrt_failsafe_output_8;
-r_sqr_X_vs_Y_stratified = cov_sumX_sumY_stratified * cov_sumX_sumY_stratified / (var_sumX_forXY_stratified * var_sumY_forXY_stratified);
-r_sqr_X_vs_Y_stratified = corr_XY_stratified * corr_XY_stratified;
-                             sqrt_failsafe_input_9 = (1 - r_sqr_X_vs_Y_stratified) * var_sumY_forXY_stratified / var_sumX_forXY_stratified / (cnt_XY_excluding_NaNstrata - num_XY_nonempty_strata - 1);
-stdev_slope_XY_stratified  = sqrt_failsafe (sqrt_failsafe_input_9);
-                             sqrt_failsafe_input_10 = (1 - r_sqr_X_vs_Y_stratified) * var_sumY_forXY_stratified / (cnt_XY_excluding_NaNstrata - num_XY_nonempty_strata - 1);
-stdev_errY_vs_X_stratified = sqrt_failsafe (sqrt_failsafe_input_10);
-fStat_Y_vs_X_stratified = (cnt_XY_excluding_NaNstrata - num_XY_nonempty_strata - 1) * r_sqr_X_vs_Y_stratified / (1 - r_sqr_X_vs_Y_stratified);
-p_val_Y_vs_X_stratified = fStat_tailprob (fStat_Y_vs_X_stratified, ones_XY, cnt_XY_excluding_NaNstrata - num_XY_nonempty_strata - 1);
-
-print ("Preparing the output matrix...");
-OutMtx = matrix (0.0, rows = 40, cols = num_attrs_XY);
-
-OutMtx [ 1, ] = Xcols              %*% Proj_X_to_XY;  # 1st variate column number
-OutMtx [ 2, ] = cnt_X_global       %*% Proj_X_to_XY;  # 1st variate global presence count
-OutMtx [ 3, ] = avg_X_global       %*% Proj_X_to_XY;  # 1st variate global mean
-OutMtx [ 4, ] = stdev_X_global     %*% Proj_X_to_XY;  # 1st variate global standard deviation
-OutMtx [ 5, ] = stdev_X_stratified %*% Proj_X_to_XY;  # 1st variate stratified standard deviation
-OutMtx [ 6, ] = r_sqr_X_vs_strata  %*% Proj_X_to_XY;  # R-squared, 1st variate vs. strata
-OutMtx [ 7, ] = p_val_X_vs_strata  %*% Proj_X_to_XY;  # P-value, 1st variate vs. strata
-OutMtx [11, ] = Ycols              %*% Proj_Y_to_XY;  # 2nd variate column number
-OutMtx [12, ] = cnt_Y_global       %*% Proj_Y_to_XY;  # 2nd variate global presence count
-OutMtx [13, ] = avg_Y_global       %*% Proj_Y_to_XY;  # 2nd variate global mean
-OutMtx [14, ] = stdev_Y_global     %*% Proj_Y_to_XY;  # 2nd variate global standard deviation
-OutMtx [15, ] = stdev_Y_stratified %*% Proj_Y_to_XY;  # 2nd variate stratified standard deviation
-OutMtx [16, ] = r_sqr_Y_vs_strata  %*% Proj_Y_to_XY;  # R-squared, 2nd variate vs. strata
-OutMtx [17, ] = p_val_Y_vs_strata  %*% Proj_Y_to_XY;  # P-value, 2nd variate vs. strata
-
-
-OutMtx [21, ] = cnt_XY_global;              # Global 1st & 2nd variate presence count
-OutMtx [22, ] = slope_XY_global;            # Global regression slope (2nd variate vs. 1st variate)
-OutMtx [23, ] = stdev_slope_XY_global;      # Global regression slope standard deviation
-OutMtx [24, ] = corr_XY_global;             # Global correlation = +/- sqrt(R-squared)
-OutMtx [25, ] = stdev_errY_vs_X_global;     # Global residual standard deviation
-OutMtx [26, ] = r_sqr_X_vs_Y_global;        # Global R-squared
-OutMtx [27, ] = p_val_Y_vs_X_global;        # Global P-value for hypothesis "slope = 0"
-OutMtx [31, ] = cnt_XY_excluding_NaNstrata; # Stratified 1st & 2nd variate presence count
-OutMtx [32, ] = slope_XY_stratified;        # Stratified regression slope (2nd variate vs. 1st variate)
-OutMtx [33, ] = stdev_slope_XY_stratified;  # Stratified regression slope standard deviation
-OutMtx [34, ] = corr_XY_stratified;         # Stratified correlation = +/- sqrt(R-squared)
-OutMtx [35, ] = stdev_errY_vs_X_stratified; # Stratified residual standard deviation
-OutMtx [36, ] = r_sqr_X_vs_Y_stratified;    # Stratified R-squared
-OutMtx [37, ] = p_val_Y_vs_X_stratified;    # Stratified P-value for hypothesis "slope = 0"
-OutMtx [38, ] = colSums (ppred (Cnt_XY_per_stratum, 2, ">="));  # Number of strata with at least two counted points
-
-OutMtx = t(OutMtx);
-
-print ("Writing the output matrix...");
-write (OutMtx, $5, format="text");
-print ("END STRATIFIED STATISTICS SCRIPT");
-
-
-deNaN = externalFunction (Matrix[Double] A) return (Matrix[Double] B)
-        implemented in (classname = "org.apache.sysml.udf.lib.DeNaNWrapper", exectype = "mem");
-
-fStat_tailprob = function (Matrix[double] fStat, Matrix[double] df_1, Matrix[double] df_2) return (Matrix[double] tailprob)
-{ # TEMPORARY IMPLEMENTATION
-    tailprob = fStat;
-    for (i in 1:nrow(fStat)) {
-      for (j in 1:ncol(fStat)) {
-        q = castAsScalar (fStat [i, j]);
-        d1 = castAsScalar (df_1 [i, j]);
-        d2 = castAsScalar (df_2 [i, j]);
-        if (d1 >= 1 & d2 >= 1 & q >= 0.0) {
-            tailprob  [i, j] = pf (target = q, df1 = d1, df2 = d2, lower.tail=FALSE);
-        } else {
-            tailprob  [i, j] = 0/0;
-        }
-    } }
-}
-
-sqrt_failsafe = function (Matrix[double] input_A) return (Matrix[double] output_A)
-{
-    NaN = 0/0;
-    mask_A = ppred (input_A, 0.0, ">=");
-    prep_A = input_A * mask_A;
-    mask_A = mask_A - mask_A * (ppred (prep_A, NaN, "=="));
-    prep_A = deNaN (prep_A);
-    output_A = sqrt (prep_A) / mask_A;
-}
-
-sumup = function (Matrix[double] A) return (Matrix[double] sum_A)
-{
-    shift = 1;
-    m_A = nrow(A);
-    sum_A = A;
-    while (shift < m_A) {
-        sum_A [(shift+1):m_A, ] = sum_A [(shift+1):m_A, ] + sum_A [1:(m_A-shift), ];
-        shift = 2 * shift;
-    } 
-}
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+# 
+#   http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+# STRATIFIED BIVARIATE STATISTICS, VERSION 2
+# 
+# INPUT  1: Dataset with records as rows (matrix filename)
+# INPUT  2: The stratum ID column number (integer)
+#   Stratum ID must be a small positive integer; fractional values are rounded; if 0 or less, shifted to positive.
+# INPUT  3: 1st variate column numbers (matrix filename)
+# INPUT  4: 2nd variate column numbers (matrix filename)
+# INPUT  5: Output (matrix filename)
+#
+# OUTPUT 1: Output Matrix with 40 columns, containing the following information:
+#     Rows: One row per each distinct pair (1st variate, 2nd variate)
+#     Col 01: 1st variate column number
+#     Col 02: 1st variate global presence count
+#     Col 03: 1st variate global mean
+#     Col 04: 1st variate global standard deviation
+#     Col 05: 1st variate stratified standard deviation
+#     Col 06: R-squared, 1st variate vs. strata
+#     Col 07: P-value, 1st variate vs. strata
+#     Col 08-10: Reserved
+#     Col 11: 2nd variate column number
+#     Col 12: 2nd variate global presence count
+#     Col 13: 2nd variate global mean
+#     Col 14: 2nd variate global standard deviation
+#     Col 15: 2nd variate stratified standard deviation
+#     Col 16: R-squared, 2nd variate vs. strata
+#     Col 17: P-value, 2nd variate vs. strata
+#     Col 18-20: Reserved
+#     Col 21: Global 1st & 2nd variate presence count
+#     Col 22: Global regression slope (2nd variate vs. 1st variate)
+#     Col 23: Global regression slope standard deviation
+#     Col 24: Global correlation = +/- sqrt(R-squared)
+#     Col 25: Global residual standard deviation
+#     Col 26: Global R-squared
+#     Col 27: Global P-value for hypothesis "slope = 0"
+#     Col 28-30: Reserved
+#     Col 31: Stratified 1st & 2nd variate presence count
+#     Col 32: Stratified regression slope (2nd variate vs. 1st variate)
+#     Col 33: Stratified regression slope standard deviation
+#     Col 34: Stratified correlation = +/- sqrt(R-squared)
+#     Col 35: Stratified residual standard deviation
+#     Col 36: Stratified R-squared
+#     Col 37: Stratified P-value for hypothesis "slope = 0"
+#     Col 38: Number of strata with at least two counted points
+#     Col 39-40: Reserved
+#     TO DO: GOODNESS OF FIT MEASURE
+#
+# EXAMPLE:
+# hadoop jar SystemML.jar -f PATH/stratstats.dml -exec singlenode -args PATH/stratstats_test_data.mtx 1 PATH/stratstats_test_X.mtx PATH/stratstats_test_Y.mtx PATH/stratstats_test_output.mtx
+
+NaN = 0/0;
+
+print ("BEGIN STRATIFIED STATISTICS SCRIPT");
+
+print ("Reading the input matrices...");
+
+DataWithNaNs = read ($1, format = "text");
+Xcols = read ($3, format = "text");
+Ycols = read ($4, format = "text");
+stratum_column_id = $2;
+num_records  = nrow(DataWithNaNs);
+num_attrs    = ncol(DataWithNaNs);
+num_attrs_X  = ncol(Xcols);
+num_attrs_Y  = ncol(Ycols);
+num_attrs_XY = num_attrs_X * num_attrs_Y;
+
+
+print ("Preparing the variates...");
+
+Data = deNaN (DataWithNaNs);
+DataNaNmask = ppred (DataWithNaNs, NaN, "==");
+
+tXcols = t(Xcols);
+ones = matrix (1.0, rows = num_attrs_X, cols = 1);
+one_to_num_attrs_X = sumup (ones);
+ProjX = matrix (0.0, rows = num_attrs, cols = num_attrs_X);
+ProjX_ctable = table (tXcols, one_to_num_attrs_X);
+ProjX [1:nrow(ProjX_ctable), ] = ProjX_ctable;
+X = Data %*% ProjX;
+X_mask = 1 - (DataNaNmask %*% ProjX);
+
+tYcols = t(Ycols);
+ones = matrix (1.0, rows = num_attrs_Y, cols = 1);
+one_to_num_attrs_Y = sumup (ones);
+ProjY = matrix (0.0, rows = num_attrs, cols = num_attrs_Y);
+ProjY_ctable = table (tYcols, one_to_num_attrs_Y);
+ProjY [1:nrow(ProjY_ctable), ] = ProjY_ctable;
+Y = Data %*% ProjY;
+Y_mask = 1 - (DataNaNmask %*% ProjY);
+
+
+print ("Preparing the strata...");
+
+Proj_to_deNaN_strata = diag (1 - DataNaNmask [, stratum_column_id]);
+Proj_to_deNaN_strata = removeEmpty (target = Proj_to_deNaN_strata, margin = "rows");
+vector_of_strata_with_empty_but_no_NaNs = round (Proj_to_deNaN_strata %*% (Data [, stratum_column_id]));
+vector_of_strata_with_empty_but_no_NaNs = vector_of_strata_with_empty_but_no_NaNs + (1 - min (vector_of_strata_with_empty_but_no_NaNs));
+num_strata_with_empty_but_no_NaNs = max (vector_of_strata_with_empty_but_no_NaNs);
+num_records_with_nonNaN_strata = nrow (Proj_to_deNaN_strata);
+ones = matrix (1.0, rows = num_records_with_nonNaN_strata, cols = 1);
+one_to_num_records_with_nonNaN_strata = sumup (ones);
+StrataSummator_with_empty_from_nonNaNs = table (vector_of_strata_with_empty_but_no_NaNs, one_to_num_records_with_nonNaN_strata);
+StrataSummator_from_nonNaNs = removeEmpty (target = StrataSummator_with_empty_from_nonNaNs, margin = "rows");
+StrataSummator = StrataSummator_from_nonNaNs %*% Proj_to_deNaN_strata;
+num_strata = nrow (StrataSummator);
+num_empty_strata = num_strata_with_empty_but_no_NaNs - num_strata;
+print ("There are " + num_strata + " nonempty strata and " + num_empty_strata + " empty but non-NaN strata.");
+
+print ("Computing the global single-variate statistics...");
+
+cnt_X_global = colSums (X_mask);
+cnt_Y_global = colSums (Y_mask);
+avg_X_global = colSums (X) / cnt_X_global;
+avg_Y_global = colSums (Y) / cnt_Y_global;
+var_sumX_global = colSums (X * X) - cnt_X_global * (avg_X_global * avg_X_global);
+var_sumY_global = colSums (Y * Y) - cnt_Y_global * (avg_Y_global * avg_Y_global);
+                 sqrt_failsafe_input_1 = var_sumX_global / (cnt_X_global - 1);
+stdev_X_global = sqrt_failsafe (sqrt_failsafe_input_1);
+                 sqrt_failsafe_input_2 = var_sumY_global / (cnt_Y_global - 1)
+stdev_Y_global = sqrt_failsafe (sqrt_failsafe_input_2);
+
+print ("Computing the stratified single-variate statistics...");
+
+# Compute per-stratum statistics, prevent div-0 for locally empty (NaN-filled) strata
+
+Cnt_X_per_stratum = StrataSummator %*% X_mask;
+Cnt_Y_per_stratum = StrataSummator %*% Y_mask;
+Is_none_X_per_stratum = ppred (Cnt_X_per_stratum, 0, "==");
+Is_none_Y_per_stratum = ppred (Cnt_Y_per_stratum, 0, "==");
+One_over_cnt_X_per_stratum = (1 - Is_none_X_per_stratum) / (Cnt_X_per_stratum + Is_none_X_per_stratum);
+One_over_cnt_Y_per_stratum = (1 - Is_none_Y_per_stratum) / (Cnt_Y_per_stratum + Is_none_Y_per_stratum);
+num_X_nonempty_strata = num_strata - colSums (Is_none_X_per_stratum);
+num_Y_nonempty_strata = num_strata - colSums (Is_none_Y_per_stratum);
+
+Sum_X_per_stratum  = StrataSummator %*% X;
+Sum_Y_per_stratum  = StrataSummator %*% Y;
+
+# Recompute some global statistics to exclude NaN-stratum records
+
+cnt_X_excluding_NaNstrata = colSums (Cnt_X_per_stratum);
+cnt_Y_excluding_NaNstrata = colSums (Cnt_Y_per_stratum);
+sum_X_excluding_NaNstrata = colSums (Sum_X_per_stratum);
+sum_Y_excluding_NaNstrata = colSums (Sum_Y_per_stratum);
+var_sumX_excluding_NaNstrata = colSums (StrataSummator %*% (X * X)) - (sum_X_excluding_NaNstrata * sum_X_excluding_NaNstrata) / cnt_X_excluding_NaNstrata;
+var_sumY_excluding_NaNstrata = colSums (StrataSummator %*% (Y * Y)) - (sum_Y_excluding_NaNstrata * sum_Y_excluding_NaNstrata) / cnt_Y_excluding_NaNstrata;
+
+# Compute the stratified statistics
+
+var_sumX_stratified = colSums (StrataSummator %*% (X * X)) - colSums (One_over_cnt_X_per_stratum * Sum_X_per_stratum * Sum_X_per_stratum);
+var_sumY_stratified = colSums (StrataSummator %*% (Y * Y)) - colSums (One_over_cnt_Y_per_stratum * Sum_Y_per_stratum * Sum_Y_per_stratum);
+                      sqrt_failsafe_input_3 = var_sumX_stratified / (cnt_X_excluding_NaNstrata - num_X_nonempty_strata);
+stdev_X_stratified  = sqrt_failsafe (sqrt_failsafe_input_3);
+                      sqrt_failsafe_input_4 = var_sumY_stratified / (cnt_Y_excluding_NaNstrata - num_Y_nonempty_strata);
+stdev_Y_stratified  = sqrt_failsafe (sqrt_failsafe_input_4);
+r_sqr_X_vs_strata   = 1 - var_sumX_stratified / var_sumX_excluding_NaNstrata;
+r_sqr_Y_vs_strata   = 1 - var_sumY_stratified / var_sumY_excluding_NaNstrata;
+fStat_X_vs_strata   = ((var_sumX_excluding_NaNstrata - var_sumX_stratified) / (num_X_nonempty_strata - 1)) / (var_sumX_stratified / (cnt_X_excluding_NaNstrata - num_X_nonempty_strata));
+fStat_Y_vs_strata   = ((var_sumY_excluding_NaNstrata - var_sumY_stratified) / (num_Y_nonempty_strata - 1)) / (var_sumY_stratified / (cnt_Y_excluding_NaNstrata - num_Y_nonempty_strata));
+p_val_X_vs_strata   = fStat_tailprob (fStat_X_vs_strata, num_X_nonempty_strata - 1, cnt_X_excluding_NaNstrata - num_X_nonempty_strata);
+p_val_Y_vs_strata   = fStat_tailprob (fStat_Y_vs_strata, num_Y_nonempty_strata - 1, cnt_Y_excluding_NaNstrata - num_Y_nonempty_strata);
+
+print ("Computing the global bivariate statistics...");
+
+# Compute the aggregate X vs. Y statistics and map them into proper positions
+
+cnt_XY_rectangle       = t(X_mask) %*% Y_mask;
+sum_X_forXY_rectangle  = t(X)      %*% Y_mask;
+sum_XX_forXY_rectangle = t(X * X)  %*% Y_mask;
+sum_Y_forXY_rectangle  = t(X_mask) %*% Y;
+sum_YY_forXY_rectangle = t(X_mask) %*% (Y * Y);
+sum_XY_rectangle       = t(X)      %*% Y;
+cnt_XY_global       = matrix (cnt_XY_rectangle,       rows = 1, cols = num_attrs_XY, byrow = TRUE);
+sum_X_forXY_global  = matrix (sum_X_forXY_rectangle,  rows = 1, cols = num_attrs_XY, byrow = TRUE);
+sum_XX_forXY_global = matrix (sum_XX_forXY_rectangle, rows = 1, cols = num_attrs_XY, byrow = TRUE);
+sum_Y_forXY_global  = matrix (sum_Y_forXY_rectangle,  rows = 1, cols = num_attrs_XY, byrow = TRUE);
+sum_YY_forXY_global = matrix (sum_YY_forXY_rectangle, rows = 1, cols = num_attrs_XY, byrow = TRUE);
+sum_XY_global       = matrix (sum_XY_rectangle,       rows = 1, cols = num_attrs_XY, byrow = TRUE);
+ones_XY = matrix (1.0, rows = 1, cols = num_attrs_XY);
+
+# Compute the global bivariate statistics for output
+
+cov_sumX_sumY_global   = sum_XY_global - sum_X_forXY_global * sum_Y_forXY_global / cnt_XY_global;
+var_sumX_forXY_global  = sum_XX_forXY_global - sum_X_forXY_global * sum_X_forXY_global / cnt_XY_global;
+var_sumY_forXY_global  = sum_YY_forXY_global - sum_Y_forXY_global * sum_Y_forXY_global / cnt_XY_global;
+slope_XY_global        = cov_sumX_sumY_global / var_sumX_forXY_global;
+                                                sqrt_failsafe_input_5 = var_sumX_forXY_global * var_sumY_forXY_global;
+                                                sqrt_failsafe_output_5 = sqrt_failsafe (sqrt_failsafe_input_5);
+corr_XY_global         = cov_sumX_sumY_global / sqrt_failsafe_output_5;
+r_sqr_X_vs_Y_global    = cov_sumX_sumY_global * cov_sumX_sumY_global / (var_sumX_forXY_global * var_sumY_forXY_global);
+                         sqrt_failsafe_input_6 = (1 - r_sqr_X_vs_Y_global) * var_sumY_forXY_global / var_sumX_forXY_global / (cnt_XY_global - 2)
+stdev_slope_XY_global  = sqrt_failsafe (sqrt_failsafe_input_6);
+                         sqrt_failsafe_input_7 = (1 - r_sqr_X_vs_Y_global) * var_sumY_forXY_global / (cnt_XY_global - 2)
+stdev_errY_vs_X_global = sqrt_failsafe (sqrt_failsafe_input_7);
+fStat_Y_vs_X_global    = (cnt_XY_global - 2) * r_sqr_X_vs_Y_global / (1 - r_sqr_X_vs_Y_global);
+p_val_Y_vs_X_global    = fStat_tailprob (fStat_Y_vs_X_global, ones_XY, cnt_XY_global - 2);
+
+print ("Computing the stratified bivariate statistics...");
+
+# Create projections to "intermingle" X and Y into attribute pairs
+
+Proj_X_to_XY = matrix (0.0, rows = num_attrs_X, cols = num_attrs_XY);
+Proj_Y_to_XY = matrix (0.0, rows = num_attrs_Y, cols = num_attrs_XY);
+ones_Y_row   = matrix (1.0, rows = 1, cols = num_attrs_Y);
+for (i in 1:num_attrs_X) {
+    start_cid = (i - 1) * num_attrs_Y + 1;
+    end_cid = i * num_attrs_Y;
+    Proj_X_to_XY [i, start_cid:end_cid] = ones_Y_row;
+    Proj_Y_to_XY [ , start_cid:end_cid] = diag (ones_Y_row);
+}
+
+# Compute per-stratum statistics, prevent div-0 for locally empty (NaN-filled) strata
+
+Cnt_XY_per_stratum       = StrataSummator %*% (( X_mask %*% Proj_X_to_XY) * ( Y_mask %*% Proj_Y_to_XY));
+Sum_X_forXY_per_stratum  = StrataSummator %*% (( X      %*% Proj_X_to_XY) * ( Y_mask %*% Proj_Y_to_XY));
+Sum_XX_forXY_per_stratum = StrataSummator %*% (((X * X) %*% Proj_X_to_XY) * ( Y_mask %*% Proj_Y_to_XY));
+Sum_Y_forXY_per_stratum  = StrataSummator %*% (( X_mask %*% Proj_X_to_XY) * ( Y      %*% Proj_Y_to_XY));
+Sum_YY_forXY_per_stratum = StrataSummator %*% (( X_mask %*% Proj_X_to_XY) * ((Y * Y) %*% Proj_Y_to_XY));
+Sum_XY_per_stratum       = StrataSummator %*% (( X      %*% Proj_X_to_XY) * ( Y      %*% Proj_Y_to_XY));
+
+Is_none_XY_per_stratum = ppred (Cnt_XY_per_stratum, 0, "==");
+One_over_cnt_XY_per_stratum = (1 - Is_none_XY_per_stratum) / (Cnt_XY_per_stratum + Is_none_XY_per_stratum);
+num_XY_nonempty_strata = num_strata - colSums (Is_none_XY_per_stratum);
+
+# Recompute some global aggregate X vs. Y statistics to exclude NaN-stratum records
+
+cnt_XY_excluding_NaNstrata = colSums (Cnt_XY_per_stratum);
+sum_XX_forXY_excluding_NaNstrata = colSums (Sum_XX_forXY_per_stratum);
+sum_YY_forXY_excluding_NaNstrata = colSums (Sum_YY_forXY_per_stratum);
+sum_XY_excluding_NaNstrata = colSums (Sum_XY_per_stratum);
+
+# Compute the stratified bivariate statistics
+
+var_sumX_forXY_stratified = sum_XX_forXY_excluding_NaNstrata - colSums (Sum_X_forXY_per_stratum * Sum_X_forXY_per_stratum * One_over_cnt_XY_per_stratum);
+var_sumY_forXY_stratified = sum_YY_forXY_excluding_NaNstrata - colSums (Sum_Y_forXY_per_stratum * Sum_Y_forXY_per_stratum * One_over_cnt_XY_per_stratum);
+cov_sumX_sumY_stratified  = sum_XY_excluding_NaNstrata       - colSums (Sum_X_forXY_per_stratum * Sum_Y_forXY_per_stratum * One_over_cnt_XY_per_stratum);
+
+slope_XY_stratified     = cov_sumX_sumY_stratified / var_sumX_forXY_stratified;
+                                                     sqrt_failsafe_input_8 = var_sumX_forXY_stratified * var_sumY_forXY_stratified;
+                                                     sqrt_failsafe_output_8 = sqrt_failsafe (sqrt_failsafe_input_8);
+corr_XY_stratified      = cov_sumX_sumY_stratified / sqrt_failsafe_output_8;
+r_sqr_X_vs_Y_stratified = cov_sumX_sumY_stratified * cov_sumX_sumY_stratified / (var_sumX_forXY_stratified * var_sumY_forXY_stratified);
+r_sqr_X_vs_Y_stratified = corr_XY_stratified * corr_XY_stratified;
+                             sqrt_failsafe_input_9 = (1 - r_sqr_X_vs_Y_stratified) * var_sumY_forXY_stratified / var_sumX_forXY_stratified / (cnt_XY_excluding_NaNstrata - num_XY_nonempty_strata - 1);
+stdev_slope_XY_stratified  = sqrt_failsafe (sqrt_failsafe_input_9);
+                             sqrt_failsafe_input_10 = (1 - r_sqr_X_vs_Y_stratified) * var_sumY_forXY_stratified / (cnt_XY_excluding_NaNstrata - num_XY_nonempty_strata - 1);
+stdev_errY_vs_X_stratified = sqrt_failsafe (sqrt_failsafe_input_10);
+fStat_Y_vs_X_stratified = (cnt_XY_excluding_NaNstrata - num_XY_nonempty_strata - 1) * r_sqr_X_vs_Y_stratified / (1 - r_sqr_X_vs_Y_stratified);
+p_val_Y_vs_X_stratified = fStat_tailprob (fStat_Y_vs_X_stratified, ones_XY, cnt_XY_excluding_NaNstrata - num_XY_nonempty_strata - 1);
+
+print ("Preparing the output matrix...");
+OutMtx = matrix (0.0, rows = 40, cols = num_attrs_XY);
+
+OutMtx [ 1, ] = Xcols              %*% Proj_X_to_XY;  # 1st variate column number
+OutMtx [ 2, ] = cnt_X_global       %*% Proj_X_to_XY;  # 1st variate global presence count
+OutMtx [ 3, ] = avg_X_global       %*% Proj_X_to_XY;  # 1st variate global mean
+OutMtx [ 4, ] = stdev_X_global     %*% Proj_X_to_XY;  # 1st variate global standard deviation
+OutMtx [ 5, ] = stdev_X_stratified %*% Proj_X_to_XY;  # 1st variate stratified standard deviation
+OutMtx [ 6, ] = r_sqr_X_vs_strata  %*% Proj_X_to_XY;  # R-squared, 1st variate vs. strata
+OutMtx [ 7, ] = p_val_X_vs_strata  %*% Proj_X_to_XY;  # P-value, 1st variate vs. strata
+OutMtx [11, ] = Ycols              %*% Proj_Y_to_XY;  # 2nd variate column number
+OutMtx [12, ] = cnt_Y_global       %*% Proj_Y_to_XY;  # 2nd variate global presence count
+OutMtx [13, ] = avg_Y_global       %*% Proj_Y_to_XY;  # 2nd variate global mean
+OutMtx [14, ] = stdev_Y_global     %*% Proj_Y_to_XY;  # 2nd variate global standard deviation
+OutMtx [15, ] = stdev_Y_stratified %*% Proj_Y_to_XY;  # 2nd variate stratified standard deviation
+OutMtx [16, ] = r_sqr_Y_vs_strata  %*% Proj_Y_to_XY;  # R-squared, 2nd variate vs. strata
+OutMtx [17, ] = p_val_Y_vs_strata  %*% Proj_Y_to_XY;  # P-value, 2nd variate vs. strata
+
+
+OutMtx [21, ] = cnt_XY_global;              # Global 1st & 2nd variate presence count
+OutMtx [22, ] = slope_XY_global;            # Global regression slope (2nd variate vs. 1st variate)
+OutMtx [23, ] = stdev_slope_XY_global;      # Global regression slope standard deviation
+OutMtx [24, ] = corr_XY_global;             # Global correlation = +/- sqrt(R-squared)
+OutMtx [25, ] = stdev_errY_vs_X_global;     # Global residual standard deviation
+OutMtx [26, ] = r_sqr_X_vs_Y_global;        # Global R-squared
+OutMtx [27, ] = p_val_Y_vs_X_global;        # Global P-value for hypothesis "slope = 0"
+OutMtx [31, ] = cnt_XY_excluding_NaNstrata; # Stratified 1st & 2nd variate presence count
+OutMtx [32, ] = slope_XY_stratified;        # Stratified regression slope (2nd variate vs. 1st variate)
+OutMtx [33, ] = stdev_slope_XY_stratified;  # Stratified regression slope standard deviation
+OutMtx [34, ] = corr_XY_stratified;         # Stratified correlation = +/- sqrt(R-squared)
+OutMtx [35, ] = stdev_errY_vs_X_stratified; # Stratified residual standard deviation
+OutMtx [36, ] = r_sqr_X_vs_Y_stratified;    # Stratified R-squared
+OutMtx [37, ] = p_val_Y_vs_X_stratified;    # Stratified P-value for hypothesis "slope = 0"
+OutMtx [38, ] = colSums (ppred (Cnt_XY_per_stratum, 2, ">="));  # Number of strata with at least two counted points
+
+OutMtx = t(OutMtx);
+
+print ("Writing the output matrix...");
+write (OutMtx, $5, format="text");
+print ("END STRATIFIED STATISTICS SCRIPT");
+
+
+deNaN = externalFunction (Matrix[Double] A) return (Matrix[Double] B)
+        implemented in (classname = "org.apache.sysml.udf.lib.DeNaNWrapper", exectype = "mem");
+
+fStat_tailprob = function (Matrix[double] fStat, Matrix[double] df_1, Matrix[double] df_2) return (Matrix[double] tailprob)
+{ # TEMPORARY IMPLEMENTATION
+    tailprob = fStat;
+    for (i in 1:nrow(fStat)) {
+      for (j in 1:ncol(fStat)) {
+        q = castAsScalar (fStat [i, j]);
+        d1 = castAsScalar (df_1 [i, j]);
+        d2 = castAsScalar (df_2 [i, j]);
+        if (d1 >= 1 & d2 >= 1 & q >= 0.0) {
+            tailprob  [i, j] = pf (target = q, df1 = d1, df2 = d2, lower.tail=FALSE);
+        } else {
+            tailprob  [i, j] = 0/0;
+        }
+    } }
+}
+
+sqrt_failsafe = function (Matrix[double] input_A) return (Matrix[double] output_A)
+{
+    NaN = 0/0;
+    mask_A = ppred (input_A, 0.0, ">=");
+    prep_A = input_A * mask_A;
+    mask_A = mask_A - mask_A * (ppred (prep_A, NaN, "=="));
+    prep_A = deNaN (prep_A);
+    output_A = sqrt (prep_A) / mask_A;
+}
+
+sumup = function (Matrix[double] A) return (Matrix[double] sum_A)
+{
+    shift = 1;
+    m_A = nrow(A);
+    sum_A = A;
+    while (shift < m_A) {
+        sum_A [(shift+1):m_A, ] = sum_A [(shift+1):m_A, ] + sum_A [1:(m_A-shift), ];
+        shift = 2 * shift;
+    } 
+}

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/05d2c0a8/src/test/scripts/applications/ctableStats/wilson_score.dml
----------------------------------------------------------------------
diff --git a/src/test/scripts/applications/ctableStats/wilson_score.dml b/src/test/scripts/applications/ctableStats/wilson_score.dml
index 90db3fc..27d0899 100644
--- a/src/test/scripts/applications/ctableStats/wilson_score.dml
+++ b/src/test/scripts/applications/ctableStats/wilson_score.dml
@@ -1,145 +1,145 @@
-#-------------------------------------------------------------
-#
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-# 
-#   http://www.apache.org/licenses/LICENSE-2.0
-# 
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-#
-#-------------------------------------------------------------
-
-# Computes 95% confidence intervals for binomial ratios using Wilson Score and Exact Score
-# INPUT 1: Matrix [rows, 2] of integer counts (m, n) where 0 <= m <= n
-# INPUT 2: The number of rows
-# INPUT 3: The output file
-# OUTPUT : Matrix [rows, 15] of doubles, containing the following information:
-#     (m / sum(m), Wilson 95%-conf.left, Wilson 95%-conf.right, Exact 95%-conf.left, Exact 95%-conf.right, 
-#      n / sum(n), Wilson 95%-conf.left, Wilson 95%-conf.right, Exact 95%-conf.left, Exact 95%-conf.right, 
-#      m / n,      Wilson 95%-conf.left, Wilson 95%-conf.right, Exact 95%-conf.left, Exact 95%-conf.right)
-# PLEASE BE AWARE THAT FOR EXTREMELY SMALL COUNTS THE WILSON INTERVALS WILL BE WRONG! THEY USE GAUSSIAN APPROXIMATION!
-# EXAMPLE: wilson_score.dml -args "test/scripts/applications/ctableStats/wilson_test_input.mtx" 7 "test/scripts/applications/ctableStats/wilson_test_output.mtx"
-
-setwd ("test/scripts/applications/ctableStats");
-source ("Binomial.dml");
-
-# test_n = Rand (rows = 1, cols = 1, min = 6, max = 6);
-# test_m = Rand (rows = 1, cols = 1, min = 0, max = 0);
-# test_p = Rand (rows = 1, cols = 1, min = 0.00421, max = 0.00421);
-# [alpha] = binomProb (test_n, test_m, test_p);
-# print ("TEST:  Prob [Binom (" + castAsScalar (test_n) + ", " + castAsScalar (test_p) + ") <= " + castAsScalar (test_m) + "]  =  " + castAsScalar (alpha));
-
-print ("BEGIN WILSON SCORE SCRIPT");
-print ("Reading X...");
-X = read ($1, rows = $2, cols = 2, format = "text");
-num_rows = $2;
-print ("Performing the computation...");
-
-M = X [, 1];
-N = X [, 2];
-blahh = Rand (rows = num_rows, cols = 1, min = 1.0, max = 1.0);
-sum_M = blahh * sum(M);
-sum_N = blahh * sum(N);
-
-[p_m_sum, l_m_sum_wilson, r_m_sum_wilson] = wilson_confidence (sum_M, M);
-[p_n_sum, l_n_sum_wilson, r_n_sum_wilson] = wilson_confidence (sum_N, N);
-[p_m_n, l_m_n_wilson, r_m_n_wilson] = wilson_confidence (N, M);
-
-M_minus_1 = M - 1;
-N_minus_1 = N - 1;
-big_alpha   = 0.975 * blahh;
-small_alpha = 0.025 * blahh;
-
-[l_m_sum_exact]   = binomQuantile (sum_M, M_minus_1, big_alpha);
-[r_m_sum_exact]   = binomQuantile (sum_M, M, small_alpha);
-[l_n_sum_exact]   = binomQuantile (sum_N, N_minus_1, big_alpha);
-[r_n_sum_exact]   = binomQuantile (sum_N, N, small_alpha);
-[l_m_n_exact]     = binomQuantile (N, M_minus_1, big_alpha);
-[r_m_n_exact]     = binomQuantile (N, M, small_alpha);
-
-result = Rand (rows = num_rows, cols = 15, min = 0.0, max = 0.0);
-result [,  1] = p_m_sum;
-result [,  2] = l_m_sum_wilson;
-result [,  3] = r_m_sum_wilson;
-result [,  4] = l_m_sum_exact;
-result [,  5] = r_m_sum_exact;
-result [,  6] = p_n_sum;
-result [,  7] = l_n_sum_wilson;
-result [,  8] = r_n_sum_wilson;
-result [,  9] = l_n_sum_exact;
-result [, 10] = r_n_sum_exact;
-result [, 11] = p_m_n;
-result [, 12] = l_m_n_wilson;
-result [, 13] = r_m_n_wilson;
-result [, 14] = l_m_n_exact;
-result [, 15] = r_m_n_exact;
-
-print ("M / sum(M)  RESULTS:  Wilson, Exact");
-
-for (i in 1:num_rows) {
-    p1  = castAsScalar (round (result [i,  1] * 100000) / 1000);
-    lw1 = castAsScalar (round (result [i,  2] * 100000) / 1000);
-    rw1 = castAsScalar (round (result [i,  3] * 100000) / 1000);
-    le1 = castAsScalar (round (result [i,  4] * 100000) / 1000);
-    re1 = castAsScalar (round (result [i,  5] * 100000) / 1000);
-    print ("Row " + i + ":   "
-        + castAsScalar (M [i, 1]) + "/" + castAsScalar (sum_M [i, 1]) + " = " 
-        + p1 + "%  [" + lw1 + "%, " + rw1 + "%]   [" + le1 + "%, " + re1 + "%]");
-}
-
-print ("N / sum(N)  RESULTS:  Wilson, Exact");
-
-for (i in 1:num_rows) {
-    p2  = castAsScalar (round (result [i,  6] * 100000) / 1000);
-    lw2 = castAsScalar (round (result [i,  7] * 100000) / 1000);
-    rw2 = castAsScalar (round (result [i,  8] * 100000) / 1000);
-    le2 = castAsScalar (round (result [i,  9] * 100000) / 1000);
-    re2 = castAsScalar (round (result [i, 10] * 100000) / 1000);
-    print ("Row " + i + ":   "
-        + castAsScalar (N [i, 1]) + "/" + castAsScalar (sum_N [i, 1]) + " = " 
-        + p2 + "%  [" + lw2 + "%, " + rw2 + "%]   [" + le2 + "%, " + re2 + "%]   ");
-}
-
-print ("M / N  RESULTS:  Wilson, Exact");
-
-for (i in 1:num_rows) {
-    p3  = castAsScalar (round (result [i, 11] * 100000) / 1000);
-    lw3 = castAsScalar (round (result [i, 12] * 100000) / 1000);
-    rw3 = castAsScalar (round (result [i, 13] * 100000) / 1000);
-    le3 = castAsScalar (round (result [i, 14] * 100000) / 1000);
-    re3 = castAsScalar (round (result [i, 15] * 100000) / 1000);
-    print ("Row " + i + ":   "
-        + castAsScalar (M [i, 1]) + "/" + castAsScalar (    N [i, 1]) + " = " 
-        + p3 + "%  [" + lw3 + "%, " + rw3 + "%]   [" + le3 + "%, " + re3 + "%]   ");
-}
-
-
-
-print ("Writing the results...");
-write (result, $3, format = "text");
-print ("END WILSON SCORE SCRIPT");
-
-
-wilson_confidence = function (Matrix[double] n, Matrix[double] m)
-return (Matrix[double] ratio, Matrix[double] conf_left, Matrix[double] conf_right)
-{
-    z = 1.96;      # 97.5% normal percentile, for 95% confidence interval
-    z_sq_n = z * z * n;
-    qroot = sqrt (z_sq_n * (m * (n - m) + z_sq_n / 4));
-    midpt = n * m + z_sq_n / 2;
-    denom = n * n + z_sq_n;
-    ratio = m / n;
-    conf_left  = (midpt - qroot) / denom;
-    conf_right = (midpt + qroot) / denom;
-}
-
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+# 
+#   http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+# Computes 95% confidence intervals for binomial ratios using Wilson Score and Exact Score
+# INPUT 1: Matrix [rows, 2] of integer counts (m, n) where 0 <= m <= n
+# INPUT 2: The number of rows
+# INPUT 3: The output file
+# OUTPUT : Matrix [rows, 15] of doubles, containing the following information:
+#     (m / sum(m), Wilson 95%-conf.left, Wilson 95%-conf.right, Exact 95%-conf.left, Exact 95%-conf.right, 
+#      n / sum(n), Wilson 95%-conf.left, Wilson 95%-conf.right, Exact 95%-conf.left, Exact 95%-conf.right, 
+#      m / n,      Wilson 95%-conf.left, Wilson 95%-conf.right, Exact 95%-conf.left, Exact 95%-conf.right)
+# PLEASE BE AWARE THAT FOR EXTREMELY SMALL COUNTS THE WILSON INTERVALS WILL BE WRONG! THEY USE GAUSSIAN APPROXIMATION!
+# EXAMPLE: wilson_score.dml -args "test/scripts/applications/ctableStats/wilson_test_input.mtx" 7 "test/scripts/applications/ctableStats/wilson_test_output.mtx"
+
+setwd ("test/scripts/applications/ctableStats");
+source ("Binomial.dml");
+
+# test_n = Rand (rows = 1, cols = 1, min = 6, max = 6);
+# test_m = Rand (rows = 1, cols = 1, min = 0, max = 0);
+# test_p = Rand (rows = 1, cols = 1, min = 0.00421, max = 0.00421);
+# [alpha] = binomProb (test_n, test_m, test_p);
+# print ("TEST:  Prob [Binom (" + castAsScalar (test_n) + ", " + castAsScalar (test_p) + ") <= " + castAsScalar (test_m) + "]  =  " + castAsScalar (alpha));
+
+print ("BEGIN WILSON SCORE SCRIPT");
+print ("Reading X...");
+X = read ($1, rows = $2, cols = 2, format = "text");
+num_rows = $2;
+print ("Performing the computation...");
+
+M = X [, 1];
+N = X [, 2];
+blahh = Rand (rows = num_rows, cols = 1, min = 1.0, max = 1.0);
+sum_M = blahh * sum(M);
+sum_N = blahh * sum(N);
+
+[p_m_sum, l_m_sum_wilson, r_m_sum_wilson] = wilson_confidence (sum_M, M);
+[p_n_sum, l_n_sum_wilson, r_n_sum_wilson] = wilson_confidence (sum_N, N);
+[p_m_n, l_m_n_wilson, r_m_n_wilson] = wilson_confidence (N, M);
+
+M_minus_1 = M - 1;
+N_minus_1 = N - 1;
+big_alpha   = 0.975 * blahh;
+small_alpha = 0.025 * blahh;
+
+[l_m_sum_exact]   = binomQuantile (sum_M, M_minus_1, big_alpha);
+[r_m_sum_exact]   = binomQuantile (sum_M, M, small_alpha);
+[l_n_sum_exact]   = binomQuantile (sum_N, N_minus_1, big_alpha);
+[r_n_sum_exact]   = binomQuantile (sum_N, N, small_alpha);
+[l_m_n_exact]     = binomQuantile (N, M_minus_1, big_alpha);
+[r_m_n_exact]     = binomQuantile (N, M, small_alpha);
+
+result = Rand (rows = num_rows, cols = 15, min = 0.0, max = 0.0);
+result [,  1] = p_m_sum;
+result [,  2] = l_m_sum_wilson;
+result [,  3] = r_m_sum_wilson;
+result [,  4] = l_m_sum_exact;
+result [,  5] = r_m_sum_exact;
+result [,  6] = p_n_sum;
+result [,  7] = l_n_sum_wilson;
+result [,  8] = r_n_sum_wilson;
+result [,  9] = l_n_sum_exact;
+result [, 10] = r_n_sum_exact;
+result [, 11] = p_m_n;
+result [, 12] = l_m_n_wilson;
+result [, 13] = r_m_n_wilson;
+result [, 14] = l_m_n_exact;
+result [, 15] = r_m_n_exact;
+
+print ("M / sum(M)  RESULTS:  Wilson, Exact");
+
+for (i in 1:num_rows) {
+    p1  = castAsScalar (round (result [i,  1] * 100000) / 1000);
+    lw1 = castAsScalar (round (result [i,  2] * 100000) / 1000);
+    rw1 = castAsScalar (round (result [i,  3] * 100000) / 1000);
+    le1 = castAsScalar (round (result [i,  4] * 100000) / 1000);
+    re1 = castAsScalar (round (result [i,  5] * 100000) / 1000);
+    print ("Row " + i + ":   "
+        + castAsScalar (M [i, 1]) + "/" + castAsScalar (sum_M [i, 1]) + " = " 
+        + p1 + "%  [" + lw1 + "%, " + rw1 + "%]   [" + le1 + "%, " + re1 + "%]");
+}
+
+print ("N / sum(N)  RESULTS:  Wilson, Exact");
+
+for (i in 1:num_rows) {
+    p2  = castAsScalar (round (result [i,  6] * 100000) / 1000);
+    lw2 = castAsScalar (round (result [i,  7] * 100000) / 1000);
+    rw2 = castAsScalar (round (result [i,  8] * 100000) / 1000);
+    le2 = castAsScalar (round (result [i,  9] * 100000) / 1000);
+    re2 = castAsScalar (round (result [i, 10] * 100000) / 1000);
+    print ("Row " + i + ":   "
+        + castAsScalar (N [i, 1]) + "/" + castAsScalar (sum_N [i, 1]) + " = " 
+        + p2 + "%  [" + lw2 + "%, " + rw2 + "%]   [" + le2 + "%, " + re2 + "%]   ");
+}
+
+print ("M / N  RESULTS:  Wilson, Exact");
+
+for (i in 1:num_rows) {
+    p3  = castAsScalar (round (result [i, 11] * 100000) / 1000);
+    lw3 = castAsScalar (round (result [i, 12] * 100000) / 1000);
+    rw3 = castAsScalar (round (result [i, 13] * 100000) / 1000);
+    le3 = castAsScalar (round (result [i, 14] * 100000) / 1000);
+    re3 = castAsScalar (round (result [i, 15] * 100000) / 1000);
+    print ("Row " + i + ":   "
+        + castAsScalar (M [i, 1]) + "/" + castAsScalar (    N [i, 1]) + " = " 
+        + p3 + "%  [" + lw3 + "%, " + rw3 + "%]   [" + le3 + "%, " + re3 + "%]   ");
+}
+
+
+
+print ("Writing the results...");
+write (result, $3, format = "text");
+print ("END WILSON SCORE SCRIPT");
+
+
+wilson_confidence = function (Matrix[double] n, Matrix[double] m)
+return (Matrix[double] ratio, Matrix[double] conf_left, Matrix[double] conf_right)
+{
+    z = 1.96;      # 97.5% normal percentile, for 95% confidence interval
+    z_sq_n = z * z * n;
+    qroot = sqrt (z_sq_n * (m * (n - m) + z_sq_n / 4));
+    midpt = n * m + z_sq_n / 2;
+    denom = n * n + z_sq_n;
+    ratio = m / n;
+    conf_left  = (midpt - qroot) / denom;
+    conf_right = (midpt + qroot) / denom;
+}
+

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/05d2c0a8/src/test/scripts/applications/ctableStats/zipftest.dml
----------------------------------------------------------------------
diff --git a/src/test/scripts/applications/ctableStats/zipftest.dml b/src/test/scripts/applications/ctableStats/zipftest.dml
index 08fe217..50ac5f4 100644
--- a/src/test/scripts/applications/ctableStats/zipftest.dml
+++ b/src/test/scripts/applications/ctableStats/zipftest.dml
@@ -1,77 +1,77 @@
-#-------------------------------------------------------------
-#
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-# 
-#   http://www.apache.org/licenses/LICENSE-2.0
-# 
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-#
-#-------------------------------------------------------------
-
-# Generator of random records with boolean features
-# Average record (row) and feature (column) densities follow
-#   power laws:  E(#1s in line k) = const / (k + add)^pow
-# Cell[1, 1] has the highest probability to be 1, also input 
-
-# By setting num_features >> num_records we allow lots of rare
-# features while keeping most records nonempty.
-# The power ("pow") in the power law determines the tail behavior;
-# The additive ("add") determines how steeply the density changes
-#   in the first few records or features.
-
-num_records = 1000;    # The number of records (rows)
-num_features = 50000;  # The number of boolean features (columns)
-
-pow_records = 2.0;     # The Zipf law power for record  density
-pow_features = 1.0;    # The Zipf law power for feature density
-
-add_records = 100.0;   # The additive shift for record  density
-add_features = 20.0;   # The additive shift for feature density
-
-max_cell_prob = 1.0;   # The probability for Cell[1, 1] to be 1
-
-############
-
-c = max_cell_prob * ((1.0 + add_records)^pow_records) * ((1.0 + add_features)^pow_features);
-
-vec_records = matrix (1.0, rows = num_records, cols = 1);
-vec_records = sumup (vec_records);
-vec_records = 1.0 / ((vec_records + add_records)^pow_records);
-
-vec_features = matrix (1.0, rows = num_features, cols = 1);
-vec_features = sumup (vec_features);
-vec_features = 1.0 / ((t(vec_features) + add_features)^pow_features);
-
-Probs = c * (vec_records %*% vec_features);
-avg_density_records = rowSums (Probs);
-avg_density_features = colSums (Probs);
-
-Tosses = Rand (rows = num_records, cols = num_features, min = 0.0, max = 1.0);
-Data = ppred (Tosses, Probs, "<=");
-
-write (avg_density_records,  "Zipf.AvgDensity.Rows", format="text");
-write (avg_density_features, "Zipf.AvgDensity.Cols", format="text");
-write (Data, "Zipf.Data", format="text");
-
-
-sumup = function (Matrix[double] A) return (Matrix[double] sum_A)
-{
-    shift = 1;
-    m_A = nrow(A);
-    sum_A = A;
-    while (shift < m_A) {
-        sum_A [(shift+1):m_A, ] = sum_A [(shift+1):m_A, ] + sum_A [1:(m_A-shift), ];
-        shift = 2 * shift;
-    } 
-}
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+# 
+#   http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+# Generator of random records with boolean features
+# Average record (row) and feature (column) densities follow
+#   power laws:  E(#1s in line k) = const / (k + add)^pow
+# Cell[1, 1] has the highest probability to be 1, also input 
+
+# By setting num_features >> num_records we allow lots of rare
+# features while keeping most records nonempty.
+# The power ("pow") in the power law determines the tail behavior;
+# The additive ("add") determines how steeply the density changes
+#   in the first few records or features.
+
+num_records = 1000;    # The number of records (rows)
+num_features = 50000;  # The number of boolean features (columns)
+
+pow_records = 2.0;     # The Zipf law power for record  density
+pow_features = 1.0;    # The Zipf law power for feature density
+
+add_records = 100.0;   # The additive shift for record  density
+add_features = 20.0;   # The additive shift for feature density
+
+max_cell_prob = 1.0;   # The probability for Cell[1, 1] to be 1
+
+############
+
+c = max_cell_prob * ((1.0 + add_records)^pow_records) * ((1.0 + add_features)^pow_features);
+
+vec_records = matrix (1.0, rows = num_records, cols = 1);
+vec_records = sumup (vec_records);
+vec_records = 1.0 / ((vec_records + add_records)^pow_records);
+
+vec_features = matrix (1.0, rows = num_features, cols = 1);
+vec_features = sumup (vec_features);
+vec_features = 1.0 / ((t(vec_features) + add_features)^pow_features);
+
+Probs = c * (vec_records %*% vec_features);
+avg_density_records = rowSums (Probs);
+avg_density_features = colSums (Probs);
+
+Tosses = Rand (rows = num_records, cols = num_features, min = 0.0, max = 1.0);
+Data = ppred (Tosses, Probs, "<=");
+
+write (avg_density_records,  "Zipf.AvgDensity.Rows", format="text");
+write (avg_density_features, "Zipf.AvgDensity.Cols", format="text");
+write (Data, "Zipf.Data", format="text");
+
+
+sumup = function (Matrix[double] A) return (Matrix[double] sum_A)
+{
+    shift = 1;
+    m_A = nrow(A);
+    sum_A = A;
+    while (shift < m_A) {
+        sum_A [(shift+1):m_A, ] = sum_A [(shift+1):m_A, ] + sum_A [1:(m_A-shift), ];
+        shift = 2 * shift;
+    } 
+}

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/05d2c0a8/src/test/scripts/applications/descriptivestats/Categorical.R
----------------------------------------------------------------------
diff --git a/src/test/scripts/applications/descriptivestats/Categorical.R b/src/test/scripts/applications/descriptivestats/Categorical.R
index 46c380d..f88c8bf 100644
--- a/src/test/scripts/applications/descriptivestats/Categorical.R
+++ b/src/test/scripts/applications/descriptivestats/Categorical.R
@@ -1,57 +1,57 @@
-#-------------------------------------------------------------
-#
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-# 
-#   http://www.apache.org/licenses/LICENSE-2.0
-# 
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-#
-#-------------------------------------------------------------
-
-# JUnit test class: dml.test.integration.descriptivestats.UnivariateStatsTest.java
-# command line invocation assuming $C_HOME is set to the home of the R script
-# Rscript $C_HOME/Categorical.R $C_HOME/in/ $C_HOME/expected/
-args <- commandArgs(TRUE)
-options(digits=22)
-
-#library("batch")
-library("Matrix")
-
-V = readMM(paste(args[1], "vector.mtx", sep=""))
-
-tab = table(V[,1])
-cat = t(as.numeric(names(tab)))
-Nc = t(as.vector(tab))
-
-# the number of categories of a categorical variable
-R = length(Nc)
-
-# total count
-s = sum(Nc)
-
-# percentage values of each categorical compare to the total case number
-Pc = Nc / s
-
-# all categorical values of a categorical variable
-C = (Nc > 0)
-
-# mode
-mx = max(Nc)
-Mode = (Nc == mx)
-
-writeMM(as(t(Nc),"CsparseMatrix"), paste(args[2], "Nc", sep=""), format="text");
-write(R, paste(args[2], "R", sep=""));
-writeMM(as(t(Pc),"CsparseMatrix"), paste(args[2], "Pc", sep=""), format="text");
-writeMM(as(t(C),"CsparseMatrix"), paste(args[2], "C", sep=""), format="text");
-writeMM(as(t(Mode),"CsparseMatrix"), paste(args[2], "Mode", sep=""), format="text");
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+# 
+#   http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+# JUnit test class: dml.test.integration.descriptivestats.UnivariateStatsTest.java
+# command line invocation assuming $C_HOME is set to the home of the R script
+# Rscript $C_HOME/Categorical.R $C_HOME/in/ $C_HOME/expected/
+args <- commandArgs(TRUE)
+options(digits=22)
+
+#library("batch")
+library("Matrix")
+
+V = readMM(paste(args[1], "vector.mtx", sep=""))
+
+tab = table(V[,1])
+cat = t(as.numeric(names(tab)))
+Nc = t(as.vector(tab))
+
+# the number of categories of a categorical variable
+R = length(Nc)
+
+# total count
+s = sum(Nc)
+
+# percentage values of each categorical compare to the total case number
+Pc = Nc / s
+
+# all categorical values of a categorical variable
+C = (Nc > 0)
+
+# mode
+mx = max(Nc)
+Mode = (Nc == mx)
+
+writeMM(as(t(Nc),"CsparseMatrix"), paste(args[2], "Nc", sep=""), format="text");
+write(R, paste(args[2], "R", sep=""));
+writeMM(as(t(Pc),"CsparseMatrix"), paste(args[2], "Pc", sep=""), format="text");
+writeMM(as(t(C),"CsparseMatrix"), paste(args[2], "C", sep=""), format="text");
+writeMM(as(t(Mode),"CsparseMatrix"), paste(args[2], "Mode", sep=""), format="text");

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/05d2c0a8/src/test/scripts/applications/descriptivestats/Categorical.dml
----------------------------------------------------------------------
diff --git a/src/test/scripts/applications/descriptivestats/Categorical.dml b/src/test/scripts/applications/descriptivestats/Categorical.dml
index 735f5d6..7599d06 100644
--- a/src/test/scripts/applications/descriptivestats/Categorical.dml
+++ b/src/test/scripts/applications/descriptivestats/Categorical.dml
@@ -1,55 +1,55 @@
-#-------------------------------------------------------------
-#
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-# 
-#   http://www.apache.org/licenses/LICENSE-2.0
-# 
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-#
-#-------------------------------------------------------------
-
-# Note this script is externalized to customers, please do not change w/o consulting component owner.
-# How to invoke this dml script Categorical.dml?
-# Assume C_HOME is set to the home of the dml script
-# Assume input and output directories are on hdfs as INPUT_DIR and OUTPUT_DIR
-# Assume rows = 10000 for vector
-# hadoop jar SystemML.jar -f $C_HOME/Categorical.dml -args "$INPUT_DIR/vector" 10000 "$OUTPUT_DIR/Nc" "$OUPUT_DIR/R" "$OUTPUT_DIR/Pc" "$OUTPUT_DIR/C" "$OUTPUT_DIR/Mode"
-
-V = read($1, rows=$2, cols=1, format="text")
-
-# a set of number of values specify the number of cases of each categorical
-Nc = table(V,1);
-
-# the number of categories of a categorical variable
-R = nrow(Nc)
-
-# total count
-s = sum(Nc)
-
-# percentage values of each categorical compare to the total case number
-Pc = Nc / s
-
-# all categorical values of a categorical variable
-C = ppred(Nc, 0, ">")
-
-# mode
-mx = max(Nc)
-Mode =  ppred(Nc, mx, "==")
-
-write(Nc, $3, format="text")
-write(R, $4)
-write(Pc, $5, format="text")
-write(C, $6, format="text")
-write(Mode, $7, format="text")
-
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+# 
+#   http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+# Note this script is externalized to customers, please do not change w/o consulting component owner.
+# How to invoke this dml script Categorical.dml?
+# Assume C_HOME is set to the home of the dml script
+# Assume input and output directories are on hdfs as INPUT_DIR and OUTPUT_DIR
+# Assume rows = 10000 for vector
+# hadoop jar SystemML.jar -f $C_HOME/Categorical.dml -args "$INPUT_DIR/vector" 10000 "$OUTPUT_DIR/Nc" "$OUPUT_DIR/R" "$OUTPUT_DIR/Pc" "$OUTPUT_DIR/C" "$OUTPUT_DIR/Mode"
+
+V = read($1, rows=$2, cols=1, format="text")
+
+# a set of number of values specify the number of cases of each categorical
+Nc = table(V,1);
+
+# the number of categories of a categorical variable
+R = nrow(Nc)
+
+# total count
+s = sum(Nc)
+
+# percentage values of each categorical compare to the total case number
+Pc = Nc / s
+
+# all categorical values of a categorical variable
+C = ppred(Nc, 0, ">")
+
+# mode
+mx = max(Nc)
+Mode =  ppred(Nc, mx, "==")
+
+write(Nc, $3, format="text")
+write(R, $4)
+write(Pc, $5, format="text")
+write(C, $6, format="text")
+write(Mode, $7, format="text")
+

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/05d2c0a8/src/test/scripts/applications/descriptivestats/CategoricalCategorical.R
----------------------------------------------------------------------
diff --git a/src/test/scripts/applications/descriptivestats/CategoricalCategorical.R b/src/test/scripts/applications/descriptivestats/CategoricalCategorical.R
index 7c9785c..24a391f 100644
--- a/src/test/scripts/applications/descriptivestats/CategoricalCategorical.R
+++ b/src/test/scripts/applications/descriptivestats/CategoricalCategorical.R
@@ -1,49 +1,49 @@
-#-------------------------------------------------------------
-#
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-# 
-#   http://www.apache.org/licenses/LICENSE-2.0
-# 
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-#
-#-------------------------------------------------------------
-
-# JUnit test class: dml.test.integration.descriptivestats.CategoricalCategoricalTest.java
-# command line invocation assuming $CC_HOME is set to the home of the R script
-# Rscript $CC_HOME/CategoricalCategorical.R $CC_HOME/in/ $CC_HOME/expected/
-args <- commandArgs(TRUE)
-options(digits=22)
-
-library("Matrix")
-
-A = readMM(paste(args[1], "A.mtx", sep=""));
-B = readMM(paste(args[1], "B.mtx", sep=""));
-
-F = table(A[,1],B[,1]);
-
-# chisq.test returns a list containing statistic, p-value, etc.
-cst = chisq.test(F);
-
-# get the chi-squared coefficient from the list
-chi_squared = as.numeric(cst[1]);
-pValue = as.numeric(cst[3]);
-
-write(pValue, paste(args[2], "PValue", sep=""));
-
-q = min(dim(F));
-W = sum(F);
-cramers_v = sqrt(chi_squared/(W*(q-1)));
-
-write(cramers_v, paste(args[2], "CramersV", sep=""));
-
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+# 
+#   http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+# JUnit test class: dml.test.integration.descriptivestats.CategoricalCategoricalTest.java
+# command line invocation assuming $CC_HOME is set to the home of the R script
+# Rscript $CC_HOME/CategoricalCategorical.R $CC_HOME/in/ $CC_HOME/expected/
+args <- commandArgs(TRUE)
+options(digits=22)
+
+library("Matrix")
+
+A = readMM(paste(args[1], "A.mtx", sep=""));
+B = readMM(paste(args[1], "B.mtx", sep=""));
+
+F = table(A[,1],B[,1]);
+
+# chisq.test returns a list containing statistic, p-value, etc.
+cst = chisq.test(F);
+
+# get the chi-squared coefficient from the list
+chi_squared = as.numeric(cst[1]);
+pValue = as.numeric(cst[3]);
+
+write(pValue, paste(args[2], "PValue", sep=""));
+
+q = min(dim(F));
+W = sum(F);
+cramers_v = sqrt(chi_squared/(W*(q-1)));
+
+write(cramers_v, paste(args[2], "CramersV", sep=""));
+

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/05d2c0a8/src/test/scripts/applications/descriptivestats/CategoricalCategorical.dml
----------------------------------------------------------------------
diff --git a/src/test/scripts/applications/descriptivestats/CategoricalCategorical.dml b/src/test/scripts/applications/descriptivestats/CategoricalCategorical.dml
index 626bfeb..4301f91 100644
--- a/src/test/scripts/applications/descriptivestats/CategoricalCategorical.dml
+++ b/src/test/scripts/applications/descriptivestats/CategoricalCategorical.dml
@@ -1,56 +1,56 @@
-#-------------------------------------------------------------
-#
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-# 
-#   http://www.apache.org/licenses/LICENSE-2.0
-# 
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-#
-#-------------------------------------------------------------
-
-# Note this script is externalized to customers, please do not change w/o consulting component owner.
-# How to invoke this dml script CategoricalCategorical.dml?
-# Assume CC_HOME is set to the home of the dml script
-# Assume input and output directories are on hdfs as INPUT_DIR and OUTPUT_DIR
-# Assume rows = 10000 for both A and B
-# hadoop jar SystemML.jar -f $CC_HOME/CategoricalCategorical.dml -args "$INPUT_DIR/A" 10000 "$INPUT_DIR/B" "$OUPUT_DIR/PValue" "$OUTPUT_DIR/CramersV"
-
-A = read($1, rows=$2, cols=1, format="text");
-B = read($3, rows=$2, cols=1, format="text");
-
-# Contingency Table
-F = table(A,B);
-
-# Chi-Squared
-W = sum(F);
-r = rowSums(F);
-c = colSums(F);
-E = (r %*% c)/W;
-T = (F-E)^2/E;
-chi_squared = sum(T);
-
-# compute p-value
-degFreedom = (nrow(F)-1)*(ncol(F)-1);
-pValue = pchisq(target=chi_squared, df=degFreedom, lower.tail=FALSE);
-
-# Cramer's V
-R = nrow(F);
-C = ncol(F);
-q = min(R,C);
-cramers_v = sqrt(chi_squared/(W*(q-1)));
-
-write(pValue, $4);
-write(cramers_v, $5);
-
-
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+# 
+#   http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+# Note this script is externalized to customers, please do not change w/o consulting component owner.
+# How to invoke this dml script CategoricalCategorical.dml?
+# Assume CC_HOME is set to the home of the dml script
+# Assume input and output directories are on hdfs as INPUT_DIR and OUTPUT_DIR
+# Assume rows = 10000 for both A and B
+# hadoop jar SystemML.jar -f $CC_HOME/CategoricalCategorical.dml -args "$INPUT_DIR/A" 10000 "$INPUT_DIR/B" "$OUPUT_DIR/PValue" "$OUTPUT_DIR/CramersV"
+
+A = read($1, rows=$2, cols=1, format="text");
+B = read($3, rows=$2, cols=1, format="text");
+
+# Contingency Table
+F = table(A,B);
+
+# Chi-Squared
+W = sum(F);
+r = rowSums(F);
+c = colSums(F);
+E = (r %*% c)/W;
+T = (F-E)^2/E;
+chi_squared = sum(T);
+
+# compute p-value
+degFreedom = (nrow(F)-1)*(ncol(F)-1);
+pValue = pchisq(target=chi_squared, df=degFreedom, lower.tail=FALSE);
+
+# Cramer's V
+R = nrow(F);
+C = ncol(F);
+q = min(R,C);
+cramers_v = sqrt(chi_squared/(W*(q-1)));
+
+write(pValue, $4);
+write(cramers_v, $5);
+
+

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/05d2c0a8/src/test/scripts/applications/descriptivestats/CategoricalCategoricalWithWeightsTest.R
----------------------------------------------------------------------
diff --git a/src/test/scripts/applications/descriptivestats/CategoricalCategoricalWithWeightsTest.R b/src/test/scripts/applications/descriptivestats/CategoricalCategoricalWithWeightsTest.R
index 9e3f797..bacc7e3 100644
--- a/src/test/scripts/applications/descriptivestats/CategoricalCategoricalWithWeightsTest.R
+++ b/src/test/scripts/applications/descriptivestats/CategoricalCategoricalWithWeightsTest.R
@@ -1,67 +1,67 @@
-#-------------------------------------------------------------
-#
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-# 
-#   http://www.apache.org/licenses/LICENSE-2.0
-# 
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-#
-#-------------------------------------------------------------
-
-# JUnit test class: dml.test.integration.descriptivestats.CategoricalCategoricalTest.java
-# command line invocation assuming $CC_HOME is set to the home of the R script
-# Rscript $CC_HOME/CategoricalCategoricalWithWeightsTest.R $CC_HOME/in/ $CC_HOME/expected/
-# Usage: R --vanilla -args Xfile X < CategoricalCategoricalWithWeightsTest.R
-
-args <- commandArgs(TRUE)
-options(digits=22)
-
-library("Matrix")
-
-#parseCommandArgs()
-######################
-
-print(commandArgs(TRUE)[1])
-
-A = readMM(paste(args[1], "A.mtx", sep=""));
-B = readMM(paste(args[1], "B.mtx", sep=""));
-WM = readMM(paste(args[1], "WM.mtx", sep=""));
-
-Av = A[,1];
-Bv = B[,1];
-WMv = WM[,1];
-
-# create a data frame with vectors A, B, WM
-df = data.frame(Av,Bv,WMv);
-
-# contingency table with weights
-F = xtabs ( WMv ~ Av + Bv, df);
-
-# chisq.test returns a list containing statistic, p-value, etc.
-cst = chisq.test(F);
-
-# get the chi-squared coefficient from the list
-chi_squared = as.numeric(cst[1]);
-pValue = as.numeric(cst[3]);
-
-write(pValue, paste(args[2], "PValue", sep=""));
-
-#######################
-
-q = min(dim(F));
-W = sum(F);
-cramers_v = sqrt(chi_squared/(W*(q-1)));
-
-write(cramers_v, paste(args[2], "CramersV", sep=""));
-
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+# 
+#   http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+# JUnit test class: dml.test.integration.descriptivestats.CategoricalCategoricalTest.java
+# command line invocation assuming $CC_HOME is set to the home of the R script
+# Rscript $CC_HOME/CategoricalCategoricalWithWeightsTest.R $CC_HOME/in/ $CC_HOME/expected/
+# Usage: R --vanilla -args Xfile X < CategoricalCategoricalWithWeightsTest.R
+
+args <- commandArgs(TRUE)
+options(digits=22)
+
+library("Matrix")
+
+#parseCommandArgs()
+######################
+
+print(commandArgs(TRUE)[1])
+
+A = readMM(paste(args[1], "A.mtx", sep=""));
+B = readMM(paste(args[1], "B.mtx", sep=""));
+WM = readMM(paste(args[1], "WM.mtx", sep=""));
+
+Av = A[,1];
+Bv = B[,1];
+WMv = WM[,1];
+
+# create a data frame with vectors A, B, WM
+df = data.frame(Av,Bv,WMv);
+
+# contingency table with weights
+F = xtabs ( WMv ~ Av + Bv, df);
+
+# chisq.test returns a list containing statistic, p-value, etc.
+cst = chisq.test(F);
+
+# get the chi-squared coefficient from the list
+chi_squared = as.numeric(cst[1]);
+pValue = as.numeric(cst[3]);
+
+write(pValue, paste(args[2], "PValue", sep=""));
+
+#######################
+
+q = min(dim(F));
+W = sum(F);
+cramers_v = sqrt(chi_squared/(W*(q-1)));
+
+write(cramers_v, paste(args[2], "CramersV", sep=""));
+

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/05d2c0a8/src/test/scripts/applications/descriptivestats/CategoricalCategoricalWithWeightsTest.dml
----------------------------------------------------------------------
diff --git a/src/test/scripts/applications/descriptivestats/CategoricalCategoricalWithWeightsTest.dml b/src/test/scripts/applications/descriptivestats/CategoricalCategoricalWithWeightsTest.dml
index f92f42c..70e50f4 100644
--- a/src/test/scripts/applications/descriptivestats/CategoricalCategoricalWithWeightsTest.dml
+++ b/src/test/scripts/applications/descriptivestats/CategoricalCategoricalWithWeightsTest.dml
@@ -1,60 +1,60 @@
-#-------------------------------------------------------------
-#
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-# 
-#   http://www.apache.org/licenses/LICENSE-2.0
-# 
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-#
-#-------------------------------------------------------------
-
-# Note this script is externalized to customers, please do not change w/o consulting component owner.
-# How to invoke this dml script CategoricalCategorical.dml?
-# Assume CC_HOME is set to the home of the dml script
-# Assume input and output directories are on hdfs as INPUT_DIR and OUTPUT_DIR
-# Assume rows = 10000 for both A and B
-# hadoop jar SystemML.jar -f $CC_HOME/CategoricalCategoricalWithWeightsTest.dml -args "$INPUT_DIR/A" 10000 "$INPUT_DIR/B" "$INPUT_DIR/WM" "$OUPUT_DIR/PValue" "$OUTPUT_DIR/CramersV"
-
-# A <- nominal 
-# B <- nominal 
-# WM <- weights
-
-A = read($1, rows=$2, cols=1, format="text");
-B = read($3, rows=$2, cols=1, format="text");
-WM = read($4, rows=$2, cols=1, format="text");
-
-# Contingency Table
-F = table(A,B,WM);
-
-# Chi-Squared
-W = sum(F);
-r = rowSums(F);
-c = colSums(F);
-E = (r %*% c)/W;
-T = (F-E)^2/E;
-chi_squared = sum(T);
-
-# compute p-value
-degFreedom = (nrow(F)-1)*(ncol(F)-1);
-pValue = pchisq(target=chi_squared, df=degFreedom, lower.tail=FALSE);
-
-# Cramer's V
-R = nrow(F);
-C = ncol(F);
-q = min(R,C);
-cramers_v = sqrt(chi_squared/(W*(q-1)));
-
-write(pValue, $5);
-write(cramers_v, $6);
-
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+# 
+#   http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+# Note this script is externalized to customers, please do not change w/o consulting component owner.
+# How to invoke this dml script CategoricalCategorical.dml?
+# Assume CC_HOME is set to the home of the dml script
+# Assume input and output directories are on hdfs as INPUT_DIR and OUTPUT_DIR
+# Assume rows = 10000 for both A and B
+# hadoop jar SystemML.jar -f $CC_HOME/CategoricalCategoricalWithWeightsTest.dml -args "$INPUT_DIR/A" 10000 "$INPUT_DIR/B" "$INPUT_DIR/WM" "$OUPUT_DIR/PValue" "$OUTPUT_DIR/CramersV"
+
+# A <- nominal 
+# B <- nominal 
+# WM <- weights
+
+A = read($1, rows=$2, cols=1, format="text");
+B = read($3, rows=$2, cols=1, format="text");
+WM = read($4, rows=$2, cols=1, format="text");
+
+# Contingency Table
+F = table(A,B,WM);
+
+# Chi-Squared
+W = sum(F);
+r = rowSums(F);
+c = colSums(F);
+E = (r %*% c)/W;
+T = (F-E)^2/E;
+chi_squared = sum(T);
+
+# compute p-value
+degFreedom = (nrow(F)-1)*(ncol(F)-1);
+pValue = pchisq(target=chi_squared, df=degFreedom, lower.tail=FALSE);
+
+# Cramer's V
+R = nrow(F);
+C = ncol(F);
+q = min(R,C);
+cramers_v = sqrt(chi_squared/(W*(q-1)));
+
+write(pValue, $5);
+write(cramers_v, $6);
+