You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@systemml.apache.org by du...@apache.org on 2016/01/22 17:34:06 UTC
[30/51] [partial] incubator-systemml git commit: [SYSTEMML-482]
[SYSTEMML-480] Adding a Git attributes file to enforce Unix-styled line
endings, and normalizing all of the line endings.
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/05d2c0a8/src/test/scripts/applications/ctableStats/stratstats.dml
----------------------------------------------------------------------
diff --git a/src/test/scripts/applications/ctableStats/stratstats.dml b/src/test/scripts/applications/ctableStats/stratstats.dml
index 3ac684a..7eb1858 100644
--- a/src/test/scripts/applications/ctableStats/stratstats.dml
+++ b/src/test/scripts/applications/ctableStats/stratstats.dml
@@ -1,350 +1,350 @@
-#-------------------------------------------------------------
-#
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-#
-#-------------------------------------------------------------
-
-# STRATIFIED BIVARIATE STATISTICS, VERSION 2
-#
-# INPUT 1: Dataset with records as rows (matrix filename)
-# INPUT 2: The stratum ID column number (integer)
-# Stratum ID must be a small positive integer; fractional values are rounded; if 0 or less, shifted to positive.
-# INPUT 3: 1st variate column numbers (matrix filename)
-# INPUT 4: 2nd variate column numbers (matrix filename)
-# INPUT 5: Output (matrix filename)
-#
-# OUTPUT 1: Output Matrix with 40 columns, containing the following information:
-# Rows: One row per each distinct pair (1st variate, 2nd variate)
-# Col 01: 1st variate column number
-# Col 02: 1st variate global presence count
-# Col 03: 1st variate global mean
-# Col 04: 1st variate global standard deviation
-# Col 05: 1st variate stratified standard deviation
-# Col 06: R-squared, 1st variate vs. strata
-# Col 07: P-value, 1st variate vs. strata
-# Col 08-10: Reserved
-# Col 11: 2nd variate column number
-# Col 12: 2nd variate global presence count
-# Col 13: 2nd variate global mean
-# Col 14: 2nd variate global standard deviation
-# Col 15: 2nd variate stratified standard deviation
-# Col 16: R-squared, 2nd variate vs. strata
-# Col 17: P-value, 2nd variate vs. strata
-# Col 18-20: Reserved
-# Col 21: Global 1st & 2nd variate presence count
-# Col 22: Global regression slope (2nd variate vs. 1st variate)
-# Col 23: Global regression slope standard deviation
-# Col 24: Global correlation = +/- sqrt(R-squared)
-# Col 25: Global residual standard deviation
-# Col 26: Global R-squared
-# Col 27: Global P-value for hypothesis "slope = 0"
-# Col 28-30: Reserved
-# Col 31: Stratified 1st & 2nd variate presence count
-# Col 32: Stratified regression slope (2nd variate vs. 1st variate)
-# Col 33: Stratified regression slope standard deviation
-# Col 34: Stratified correlation = +/- sqrt(R-squared)
-# Col 35: Stratified residual standard deviation
-# Col 36: Stratified R-squared
-# Col 37: Stratified P-value for hypothesis "slope = 0"
-# Col 38: Number of strata with at least two counted points
-# Col 39-40: Reserved
-# TO DO: GOODNESS OF FIT MEASURE
-#
-# EXAMPLE:
-# hadoop jar SystemML.jar -f PATH/stratstats.dml -exec singlenode -args PATH/stratstats_test_data.mtx 1 PATH/stratstats_test_X.mtx PATH/stratstats_test_Y.mtx PATH/stratstats_test_output.mtx
-
-NaN = 0/0;
-
-print ("BEGIN STRATIFIED STATISTICS SCRIPT");
-
-print ("Reading the input matrices...");
-
-DataWithNaNs = read ($1, format = "text");
-Xcols = read ($3, format = "text");
-Ycols = read ($4, format = "text");
-stratum_column_id = $2;
-num_records = nrow(DataWithNaNs);
-num_attrs = ncol(DataWithNaNs);
-num_attrs_X = ncol(Xcols);
-num_attrs_Y = ncol(Ycols);
-num_attrs_XY = num_attrs_X * num_attrs_Y;
-
-
-print ("Preparing the variates...");
-
-Data = deNaN (DataWithNaNs);
-DataNaNmask = ppred (DataWithNaNs, NaN, "==");
-
-tXcols = t(Xcols);
-ones = matrix (1.0, rows = num_attrs_X, cols = 1);
-one_to_num_attrs_X = sumup (ones);
-ProjX = matrix (0.0, rows = num_attrs, cols = num_attrs_X);
-ProjX_ctable = table (tXcols, one_to_num_attrs_X);
-ProjX [1:nrow(ProjX_ctable), ] = ProjX_ctable;
-X = Data %*% ProjX;
-X_mask = 1 - (DataNaNmask %*% ProjX);
-
-tYcols = t(Ycols);
-ones = matrix (1.0, rows = num_attrs_Y, cols = 1);
-one_to_num_attrs_Y = sumup (ones);
-ProjY = matrix (0.0, rows = num_attrs, cols = num_attrs_Y);
-ProjY_ctable = table (tYcols, one_to_num_attrs_Y);
-ProjY [1:nrow(ProjY_ctable), ] = ProjY_ctable;
-Y = Data %*% ProjY;
-Y_mask = 1 - (DataNaNmask %*% ProjY);
-
-
-print ("Preparing the strata...");
-
-Proj_to_deNaN_strata = diag (1 - DataNaNmask [, stratum_column_id]);
-Proj_to_deNaN_strata = removeEmpty (target = Proj_to_deNaN_strata, margin = "rows");
-vector_of_strata_with_empty_but_no_NaNs = round (Proj_to_deNaN_strata %*% (Data [, stratum_column_id]));
-vector_of_strata_with_empty_but_no_NaNs = vector_of_strata_with_empty_but_no_NaNs + (1 - min (vector_of_strata_with_empty_but_no_NaNs));
-num_strata_with_empty_but_no_NaNs = max (vector_of_strata_with_empty_but_no_NaNs);
-num_records_with_nonNaN_strata = nrow (Proj_to_deNaN_strata);
-ones = matrix (1.0, rows = num_records_with_nonNaN_strata, cols = 1);
-one_to_num_records_with_nonNaN_strata = sumup (ones);
-StrataSummator_with_empty_from_nonNaNs = table (vector_of_strata_with_empty_but_no_NaNs, one_to_num_records_with_nonNaN_strata);
-StrataSummator_from_nonNaNs = removeEmpty (target = StrataSummator_with_empty_from_nonNaNs, margin = "rows");
-StrataSummator = StrataSummator_from_nonNaNs %*% Proj_to_deNaN_strata;
-num_strata = nrow (StrataSummator);
-num_empty_strata = num_strata_with_empty_but_no_NaNs - num_strata;
-print ("There are " + num_strata + " nonempty strata and " + num_empty_strata + " empty but non-NaN strata.");
-
-print ("Computing the global single-variate statistics...");
-
-cnt_X_global = colSums (X_mask);
-cnt_Y_global = colSums (Y_mask);
-avg_X_global = colSums (X) / cnt_X_global;
-avg_Y_global = colSums (Y) / cnt_Y_global;
-var_sumX_global = colSums (X * X) - cnt_X_global * (avg_X_global * avg_X_global);
-var_sumY_global = colSums (Y * Y) - cnt_Y_global * (avg_Y_global * avg_Y_global);
- sqrt_failsafe_input_1 = var_sumX_global / (cnt_X_global - 1);
-stdev_X_global = sqrt_failsafe (sqrt_failsafe_input_1);
- sqrt_failsafe_input_2 = var_sumY_global / (cnt_Y_global - 1)
-stdev_Y_global = sqrt_failsafe (sqrt_failsafe_input_2);
-
-print ("Computing the stratified single-variate statistics...");
-
-# Compute per-stratum statistics, prevent div-0 for locally empty (NaN-filled) strata
-
-Cnt_X_per_stratum = StrataSummator %*% X_mask;
-Cnt_Y_per_stratum = StrataSummator %*% Y_mask;
-Is_none_X_per_stratum = ppred (Cnt_X_per_stratum, 0, "==");
-Is_none_Y_per_stratum = ppred (Cnt_Y_per_stratum, 0, "==");
-One_over_cnt_X_per_stratum = (1 - Is_none_X_per_stratum) / (Cnt_X_per_stratum + Is_none_X_per_stratum);
-One_over_cnt_Y_per_stratum = (1 - Is_none_Y_per_stratum) / (Cnt_Y_per_stratum + Is_none_Y_per_stratum);
-num_X_nonempty_strata = num_strata - colSums (Is_none_X_per_stratum);
-num_Y_nonempty_strata = num_strata - colSums (Is_none_Y_per_stratum);
-
-Sum_X_per_stratum = StrataSummator %*% X;
-Sum_Y_per_stratum = StrataSummator %*% Y;
-
-# Recompute some global statistics to exclude NaN-stratum records
-
-cnt_X_excluding_NaNstrata = colSums (Cnt_X_per_stratum);
-cnt_Y_excluding_NaNstrata = colSums (Cnt_Y_per_stratum);
-sum_X_excluding_NaNstrata = colSums (Sum_X_per_stratum);
-sum_Y_excluding_NaNstrata = colSums (Sum_Y_per_stratum);
-var_sumX_excluding_NaNstrata = colSums (StrataSummator %*% (X * X)) - (sum_X_excluding_NaNstrata * sum_X_excluding_NaNstrata) / cnt_X_excluding_NaNstrata;
-var_sumY_excluding_NaNstrata = colSums (StrataSummator %*% (Y * Y)) - (sum_Y_excluding_NaNstrata * sum_Y_excluding_NaNstrata) / cnt_Y_excluding_NaNstrata;
-
-# Compute the stratified statistics
-
-var_sumX_stratified = colSums (StrataSummator %*% (X * X)) - colSums (One_over_cnt_X_per_stratum * Sum_X_per_stratum * Sum_X_per_stratum);
-var_sumY_stratified = colSums (StrataSummator %*% (Y * Y)) - colSums (One_over_cnt_Y_per_stratum * Sum_Y_per_stratum * Sum_Y_per_stratum);
- sqrt_failsafe_input_3 = var_sumX_stratified / (cnt_X_excluding_NaNstrata - num_X_nonempty_strata);
-stdev_X_stratified = sqrt_failsafe (sqrt_failsafe_input_3);
- sqrt_failsafe_input_4 = var_sumY_stratified / (cnt_Y_excluding_NaNstrata - num_Y_nonempty_strata);
-stdev_Y_stratified = sqrt_failsafe (sqrt_failsafe_input_4);
-r_sqr_X_vs_strata = 1 - var_sumX_stratified / var_sumX_excluding_NaNstrata;
-r_sqr_Y_vs_strata = 1 - var_sumY_stratified / var_sumY_excluding_NaNstrata;
-fStat_X_vs_strata = ((var_sumX_excluding_NaNstrata - var_sumX_stratified) / (num_X_nonempty_strata - 1)) / (var_sumX_stratified / (cnt_X_excluding_NaNstrata - num_X_nonempty_strata));
-fStat_Y_vs_strata = ((var_sumY_excluding_NaNstrata - var_sumY_stratified) / (num_Y_nonempty_strata - 1)) / (var_sumY_stratified / (cnt_Y_excluding_NaNstrata - num_Y_nonempty_strata));
-p_val_X_vs_strata = fStat_tailprob (fStat_X_vs_strata, num_X_nonempty_strata - 1, cnt_X_excluding_NaNstrata - num_X_nonempty_strata);
-p_val_Y_vs_strata = fStat_tailprob (fStat_Y_vs_strata, num_Y_nonempty_strata - 1, cnt_Y_excluding_NaNstrata - num_Y_nonempty_strata);
-
-print ("Computing the global bivariate statistics...");
-
-# Compute the aggregate X vs. Y statistics and map them into proper positions
-
-cnt_XY_rectangle = t(X_mask) %*% Y_mask;
-sum_X_forXY_rectangle = t(X) %*% Y_mask;
-sum_XX_forXY_rectangle = t(X * X) %*% Y_mask;
-sum_Y_forXY_rectangle = t(X_mask) %*% Y;
-sum_YY_forXY_rectangle = t(X_mask) %*% (Y * Y);
-sum_XY_rectangle = t(X) %*% Y;
-cnt_XY_global = matrix (cnt_XY_rectangle, rows = 1, cols = num_attrs_XY, byrow = TRUE);
-sum_X_forXY_global = matrix (sum_X_forXY_rectangle, rows = 1, cols = num_attrs_XY, byrow = TRUE);
-sum_XX_forXY_global = matrix (sum_XX_forXY_rectangle, rows = 1, cols = num_attrs_XY, byrow = TRUE);
-sum_Y_forXY_global = matrix (sum_Y_forXY_rectangle, rows = 1, cols = num_attrs_XY, byrow = TRUE);
-sum_YY_forXY_global = matrix (sum_YY_forXY_rectangle, rows = 1, cols = num_attrs_XY, byrow = TRUE);
-sum_XY_global = matrix (sum_XY_rectangle, rows = 1, cols = num_attrs_XY, byrow = TRUE);
-ones_XY = matrix (1.0, rows = 1, cols = num_attrs_XY);
-
-# Compute the global bivariate statistics for output
-
-cov_sumX_sumY_global = sum_XY_global - sum_X_forXY_global * sum_Y_forXY_global / cnt_XY_global;
-var_sumX_forXY_global = sum_XX_forXY_global - sum_X_forXY_global * sum_X_forXY_global / cnt_XY_global;
-var_sumY_forXY_global = sum_YY_forXY_global - sum_Y_forXY_global * sum_Y_forXY_global / cnt_XY_global;
-slope_XY_global = cov_sumX_sumY_global / var_sumX_forXY_global;
- sqrt_failsafe_input_5 = var_sumX_forXY_global * var_sumY_forXY_global;
- sqrt_failsafe_output_5 = sqrt_failsafe (sqrt_failsafe_input_5);
-corr_XY_global = cov_sumX_sumY_global / sqrt_failsafe_output_5;
-r_sqr_X_vs_Y_global = cov_sumX_sumY_global * cov_sumX_sumY_global / (var_sumX_forXY_global * var_sumY_forXY_global);
- sqrt_failsafe_input_6 = (1 - r_sqr_X_vs_Y_global) * var_sumY_forXY_global / var_sumX_forXY_global / (cnt_XY_global - 2)
-stdev_slope_XY_global = sqrt_failsafe (sqrt_failsafe_input_6);
- sqrt_failsafe_input_7 = (1 - r_sqr_X_vs_Y_global) * var_sumY_forXY_global / (cnt_XY_global - 2)
-stdev_errY_vs_X_global = sqrt_failsafe (sqrt_failsafe_input_7);
-fStat_Y_vs_X_global = (cnt_XY_global - 2) * r_sqr_X_vs_Y_global / (1 - r_sqr_X_vs_Y_global);
-p_val_Y_vs_X_global = fStat_tailprob (fStat_Y_vs_X_global, ones_XY, cnt_XY_global - 2);
-
-print ("Computing the stratified bivariate statistics...");
-
-# Create projections to "intermingle" X and Y into attribute pairs
-
-Proj_X_to_XY = matrix (0.0, rows = num_attrs_X, cols = num_attrs_XY);
-Proj_Y_to_XY = matrix (0.0, rows = num_attrs_Y, cols = num_attrs_XY);
-ones_Y_row = matrix (1.0, rows = 1, cols = num_attrs_Y);
-for (i in 1:num_attrs_X) {
- start_cid = (i - 1) * num_attrs_Y + 1;
- end_cid = i * num_attrs_Y;
- Proj_X_to_XY [i, start_cid:end_cid] = ones_Y_row;
- Proj_Y_to_XY [ , start_cid:end_cid] = diag (ones_Y_row);
-}
-
-# Compute per-stratum statistics, prevent div-0 for locally empty (NaN-filled) strata
-
-Cnt_XY_per_stratum = StrataSummator %*% (( X_mask %*% Proj_X_to_XY) * ( Y_mask %*% Proj_Y_to_XY));
-Sum_X_forXY_per_stratum = StrataSummator %*% (( X %*% Proj_X_to_XY) * ( Y_mask %*% Proj_Y_to_XY));
-Sum_XX_forXY_per_stratum = StrataSummator %*% (((X * X) %*% Proj_X_to_XY) * ( Y_mask %*% Proj_Y_to_XY));
-Sum_Y_forXY_per_stratum = StrataSummator %*% (( X_mask %*% Proj_X_to_XY) * ( Y %*% Proj_Y_to_XY));
-Sum_YY_forXY_per_stratum = StrataSummator %*% (( X_mask %*% Proj_X_to_XY) * ((Y * Y) %*% Proj_Y_to_XY));
-Sum_XY_per_stratum = StrataSummator %*% (( X %*% Proj_X_to_XY) * ( Y %*% Proj_Y_to_XY));
-
-Is_none_XY_per_stratum = ppred (Cnt_XY_per_stratum, 0, "==");
-One_over_cnt_XY_per_stratum = (1 - Is_none_XY_per_stratum) / (Cnt_XY_per_stratum + Is_none_XY_per_stratum);
-num_XY_nonempty_strata = num_strata - colSums (Is_none_XY_per_stratum);
-
-# Recompute some global aggregate X vs. Y statistics to exclude NaN-stratum records
-
-cnt_XY_excluding_NaNstrata = colSums (Cnt_XY_per_stratum);
-sum_XX_forXY_excluding_NaNstrata = colSums (Sum_XX_forXY_per_stratum);
-sum_YY_forXY_excluding_NaNstrata = colSums (Sum_YY_forXY_per_stratum);
-sum_XY_excluding_NaNstrata = colSums (Sum_XY_per_stratum);
-
-# Compute the stratified bivariate statistics
-
-var_sumX_forXY_stratified = sum_XX_forXY_excluding_NaNstrata - colSums (Sum_X_forXY_per_stratum * Sum_X_forXY_per_stratum * One_over_cnt_XY_per_stratum);
-var_sumY_forXY_stratified = sum_YY_forXY_excluding_NaNstrata - colSums (Sum_Y_forXY_per_stratum * Sum_Y_forXY_per_stratum * One_over_cnt_XY_per_stratum);
-cov_sumX_sumY_stratified = sum_XY_excluding_NaNstrata - colSums (Sum_X_forXY_per_stratum * Sum_Y_forXY_per_stratum * One_over_cnt_XY_per_stratum);
-
-slope_XY_stratified = cov_sumX_sumY_stratified / var_sumX_forXY_stratified;
- sqrt_failsafe_input_8 = var_sumX_forXY_stratified * var_sumY_forXY_stratified;
- sqrt_failsafe_output_8 = sqrt_failsafe (sqrt_failsafe_input_8);
-corr_XY_stratified = cov_sumX_sumY_stratified / sqrt_failsafe_output_8;
-r_sqr_X_vs_Y_stratified = cov_sumX_sumY_stratified * cov_sumX_sumY_stratified / (var_sumX_forXY_stratified * var_sumY_forXY_stratified);
-r_sqr_X_vs_Y_stratified = corr_XY_stratified * corr_XY_stratified;
- sqrt_failsafe_input_9 = (1 - r_sqr_X_vs_Y_stratified) * var_sumY_forXY_stratified / var_sumX_forXY_stratified / (cnt_XY_excluding_NaNstrata - num_XY_nonempty_strata - 1);
-stdev_slope_XY_stratified = sqrt_failsafe (sqrt_failsafe_input_9);
- sqrt_failsafe_input_10 = (1 - r_sqr_X_vs_Y_stratified) * var_sumY_forXY_stratified / (cnt_XY_excluding_NaNstrata - num_XY_nonempty_strata - 1);
-stdev_errY_vs_X_stratified = sqrt_failsafe (sqrt_failsafe_input_10);
-fStat_Y_vs_X_stratified = (cnt_XY_excluding_NaNstrata - num_XY_nonempty_strata - 1) * r_sqr_X_vs_Y_stratified / (1 - r_sqr_X_vs_Y_stratified);
-p_val_Y_vs_X_stratified = fStat_tailprob (fStat_Y_vs_X_stratified, ones_XY, cnt_XY_excluding_NaNstrata - num_XY_nonempty_strata - 1);
-
-print ("Preparing the output matrix...");
-OutMtx = matrix (0.0, rows = 40, cols = num_attrs_XY);
-
-OutMtx [ 1, ] = Xcols %*% Proj_X_to_XY; # 1st variate column number
-OutMtx [ 2, ] = cnt_X_global %*% Proj_X_to_XY; # 1st variate global presence count
-OutMtx [ 3, ] = avg_X_global %*% Proj_X_to_XY; # 1st variate global mean
-OutMtx [ 4, ] = stdev_X_global %*% Proj_X_to_XY; # 1st variate global standard deviation
-OutMtx [ 5, ] = stdev_X_stratified %*% Proj_X_to_XY; # 1st variate stratified standard deviation
-OutMtx [ 6, ] = r_sqr_X_vs_strata %*% Proj_X_to_XY; # R-squared, 1st variate vs. strata
-OutMtx [ 7, ] = p_val_X_vs_strata %*% Proj_X_to_XY; # P-value, 1st variate vs. strata
-OutMtx [11, ] = Ycols %*% Proj_Y_to_XY; # 2nd variate column number
-OutMtx [12, ] = cnt_Y_global %*% Proj_Y_to_XY; # 2nd variate global presence count
-OutMtx [13, ] = avg_Y_global %*% Proj_Y_to_XY; # 2nd variate global mean
-OutMtx [14, ] = stdev_Y_global %*% Proj_Y_to_XY; # 2nd variate global standard deviation
-OutMtx [15, ] = stdev_Y_stratified %*% Proj_Y_to_XY; # 2nd variate stratified standard deviation
-OutMtx [16, ] = r_sqr_Y_vs_strata %*% Proj_Y_to_XY; # R-squared, 2nd variate vs. strata
-OutMtx [17, ] = p_val_Y_vs_strata %*% Proj_Y_to_XY; # P-value, 2nd variate vs. strata
-
-
-OutMtx [21, ] = cnt_XY_global; # Global 1st & 2nd variate presence count
-OutMtx [22, ] = slope_XY_global; # Global regression slope (2nd variate vs. 1st variate)
-OutMtx [23, ] = stdev_slope_XY_global; # Global regression slope standard deviation
-OutMtx [24, ] = corr_XY_global; # Global correlation = +/- sqrt(R-squared)
-OutMtx [25, ] = stdev_errY_vs_X_global; # Global residual standard deviation
-OutMtx [26, ] = r_sqr_X_vs_Y_global; # Global R-squared
-OutMtx [27, ] = p_val_Y_vs_X_global; # Global P-value for hypothesis "slope = 0"
-OutMtx [31, ] = cnt_XY_excluding_NaNstrata; # Stratified 1st & 2nd variate presence count
-OutMtx [32, ] = slope_XY_stratified; # Stratified regression slope (2nd variate vs. 1st variate)
-OutMtx [33, ] = stdev_slope_XY_stratified; # Stratified regression slope standard deviation
-OutMtx [34, ] = corr_XY_stratified; # Stratified correlation = +/- sqrt(R-squared)
-OutMtx [35, ] = stdev_errY_vs_X_stratified; # Stratified residual standard deviation
-OutMtx [36, ] = r_sqr_X_vs_Y_stratified; # Stratified R-squared
-OutMtx [37, ] = p_val_Y_vs_X_stratified; # Stratified P-value for hypothesis "slope = 0"
-OutMtx [38, ] = colSums (ppred (Cnt_XY_per_stratum, 2, ">=")); # Number of strata with at least two counted points
-
-OutMtx = t(OutMtx);
-
-print ("Writing the output matrix...");
-write (OutMtx, $5, format="text");
-print ("END STRATIFIED STATISTICS SCRIPT");
-
-
-deNaN = externalFunction (Matrix[Double] A) return (Matrix[Double] B)
- implemented in (classname = "org.apache.sysml.udf.lib.DeNaNWrapper", exectype = "mem");
-
-fStat_tailprob = function (Matrix[double] fStat, Matrix[double] df_1, Matrix[double] df_2) return (Matrix[double] tailprob)
-{ # TEMPORARY IMPLEMENTATION
- tailprob = fStat;
- for (i in 1:nrow(fStat)) {
- for (j in 1:ncol(fStat)) {
- q = castAsScalar (fStat [i, j]);
- d1 = castAsScalar (df_1 [i, j]);
- d2 = castAsScalar (df_2 [i, j]);
- if (d1 >= 1 & d2 >= 1 & q >= 0.0) {
- tailprob [i, j] = pf (target = q, df1 = d1, df2 = d2, lower.tail=FALSE);
- } else {
- tailprob [i, j] = 0/0;
- }
- } }
-}
-
-sqrt_failsafe = function (Matrix[double] input_A) return (Matrix[double] output_A)
-{
- NaN = 0/0;
- mask_A = ppred (input_A, 0.0, ">=");
- prep_A = input_A * mask_A;
- mask_A = mask_A - mask_A * (ppred (prep_A, NaN, "=="));
- prep_A = deNaN (prep_A);
- output_A = sqrt (prep_A) / mask_A;
-}
-
-sumup = function (Matrix[double] A) return (Matrix[double] sum_A)
-{
- shift = 1;
- m_A = nrow(A);
- sum_A = A;
- while (shift < m_A) {
- sum_A [(shift+1):m_A, ] = sum_A [(shift+1):m_A, ] + sum_A [1:(m_A-shift), ];
- shift = 2 * shift;
- }
-}
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+# STRATIFIED BIVARIATE STATISTICS, VERSION 2
+#
+# INPUT 1: Dataset with records as rows (matrix filename)
+# INPUT 2: The stratum ID column number (integer)
+# Stratum ID must be a small positive integer; fractional values are rounded; if 0 or less, shifted to positive.
+# INPUT 3: 1st variate column numbers (matrix filename)
+# INPUT 4: 2nd variate column numbers (matrix filename)
+# INPUT 5: Output (matrix filename)
+#
+# OUTPUT 1: Output Matrix with 40 columns, containing the following information:
+# Rows: One row per each distinct pair (1st variate, 2nd variate)
+# Col 01: 1st variate column number
+# Col 02: 1st variate global presence count
+# Col 03: 1st variate global mean
+# Col 04: 1st variate global standard deviation
+# Col 05: 1st variate stratified standard deviation
+# Col 06: R-squared, 1st variate vs. strata
+# Col 07: P-value, 1st variate vs. strata
+# Col 08-10: Reserved
+# Col 11: 2nd variate column number
+# Col 12: 2nd variate global presence count
+# Col 13: 2nd variate global mean
+# Col 14: 2nd variate global standard deviation
+# Col 15: 2nd variate stratified standard deviation
+# Col 16: R-squared, 2nd variate vs. strata
+# Col 17: P-value, 2nd variate vs. strata
+# Col 18-20: Reserved
+# Col 21: Global 1st & 2nd variate presence count
+# Col 22: Global regression slope (2nd variate vs. 1st variate)
+# Col 23: Global regression slope standard deviation
+# Col 24: Global correlation = +/- sqrt(R-squared)
+# Col 25: Global residual standard deviation
+# Col 26: Global R-squared
+# Col 27: Global P-value for hypothesis "slope = 0"
+# Col 28-30: Reserved
+# Col 31: Stratified 1st & 2nd variate presence count
+# Col 32: Stratified regression slope (2nd variate vs. 1st variate)
+# Col 33: Stratified regression slope standard deviation
+# Col 34: Stratified correlation = +/- sqrt(R-squared)
+# Col 35: Stratified residual standard deviation
+# Col 36: Stratified R-squared
+# Col 37: Stratified P-value for hypothesis "slope = 0"
+# Col 38: Number of strata with at least two counted points
+# Col 39-40: Reserved
+# TO DO: GOODNESS OF FIT MEASURE
+#
+# EXAMPLE:
+# hadoop jar SystemML.jar -f PATH/stratstats.dml -exec singlenode -args PATH/stratstats_test_data.mtx 1 PATH/stratstats_test_X.mtx PATH/stratstats_test_Y.mtx PATH/stratstats_test_output.mtx
+
+NaN = 0/0;
+
+print ("BEGIN STRATIFIED STATISTICS SCRIPT");
+
+print ("Reading the input matrices...");
+
+DataWithNaNs = read ($1, format = "text");
+Xcols = read ($3, format = "text");
+Ycols = read ($4, format = "text");
+stratum_column_id = $2;
+num_records = nrow(DataWithNaNs);
+num_attrs = ncol(DataWithNaNs);
+num_attrs_X = ncol(Xcols);
+num_attrs_Y = ncol(Ycols);
+num_attrs_XY = num_attrs_X * num_attrs_Y;
+
+
+print ("Preparing the variates...");
+
+Data = deNaN (DataWithNaNs);
+DataNaNmask = ppred (DataWithNaNs, NaN, "==");
+
+tXcols = t(Xcols);
+ones = matrix (1.0, rows = num_attrs_X, cols = 1);
+one_to_num_attrs_X = sumup (ones);
+ProjX = matrix (0.0, rows = num_attrs, cols = num_attrs_X);
+ProjX_ctable = table (tXcols, one_to_num_attrs_X);
+ProjX [1:nrow(ProjX_ctable), ] = ProjX_ctable;
+X = Data %*% ProjX;
+X_mask = 1 - (DataNaNmask %*% ProjX);
+
+tYcols = t(Ycols);
+ones = matrix (1.0, rows = num_attrs_Y, cols = 1);
+one_to_num_attrs_Y = sumup (ones);
+ProjY = matrix (0.0, rows = num_attrs, cols = num_attrs_Y);
+ProjY_ctable = table (tYcols, one_to_num_attrs_Y);
+ProjY [1:nrow(ProjY_ctable), ] = ProjY_ctable;
+Y = Data %*% ProjY;
+Y_mask = 1 - (DataNaNmask %*% ProjY);
+
+
+print ("Preparing the strata...");
+
+Proj_to_deNaN_strata = diag (1 - DataNaNmask [, stratum_column_id]);
+Proj_to_deNaN_strata = removeEmpty (target = Proj_to_deNaN_strata, margin = "rows");
+vector_of_strata_with_empty_but_no_NaNs = round (Proj_to_deNaN_strata %*% (Data [, stratum_column_id]));
+vector_of_strata_with_empty_but_no_NaNs = vector_of_strata_with_empty_but_no_NaNs + (1 - min (vector_of_strata_with_empty_but_no_NaNs));
+num_strata_with_empty_but_no_NaNs = max (vector_of_strata_with_empty_but_no_NaNs);
+num_records_with_nonNaN_strata = nrow (Proj_to_deNaN_strata);
+ones = matrix (1.0, rows = num_records_with_nonNaN_strata, cols = 1);
+one_to_num_records_with_nonNaN_strata = sumup (ones);
+StrataSummator_with_empty_from_nonNaNs = table (vector_of_strata_with_empty_but_no_NaNs, one_to_num_records_with_nonNaN_strata);
+StrataSummator_from_nonNaNs = removeEmpty (target = StrataSummator_with_empty_from_nonNaNs, margin = "rows");
+StrataSummator = StrataSummator_from_nonNaNs %*% Proj_to_deNaN_strata;
+num_strata = nrow (StrataSummator);
+num_empty_strata = num_strata_with_empty_but_no_NaNs - num_strata;
+print ("There are " + num_strata + " nonempty strata and " + num_empty_strata + " empty but non-NaN strata.");
+
+print ("Computing the global single-variate statistics...");
+
+cnt_X_global = colSums (X_mask);
+cnt_Y_global = colSums (Y_mask);
+avg_X_global = colSums (X) / cnt_X_global;
+avg_Y_global = colSums (Y) / cnt_Y_global;
+var_sumX_global = colSums (X * X) - cnt_X_global * (avg_X_global * avg_X_global);
+var_sumY_global = colSums (Y * Y) - cnt_Y_global * (avg_Y_global * avg_Y_global);
+ sqrt_failsafe_input_1 = var_sumX_global / (cnt_X_global - 1);
+stdev_X_global = sqrt_failsafe (sqrt_failsafe_input_1);
+ sqrt_failsafe_input_2 = var_sumY_global / (cnt_Y_global - 1)
+stdev_Y_global = sqrt_failsafe (sqrt_failsafe_input_2);
+
+print ("Computing the stratified single-variate statistics...");
+
+# Compute per-stratum statistics, prevent div-0 for locally empty (NaN-filled) strata
+
+Cnt_X_per_stratum = StrataSummator %*% X_mask;
+Cnt_Y_per_stratum = StrataSummator %*% Y_mask;
+Is_none_X_per_stratum = ppred (Cnt_X_per_stratum, 0, "==");
+Is_none_Y_per_stratum = ppred (Cnt_Y_per_stratum, 0, "==");
+One_over_cnt_X_per_stratum = (1 - Is_none_X_per_stratum) / (Cnt_X_per_stratum + Is_none_X_per_stratum);
+One_over_cnt_Y_per_stratum = (1 - Is_none_Y_per_stratum) / (Cnt_Y_per_stratum + Is_none_Y_per_stratum);
+num_X_nonempty_strata = num_strata - colSums (Is_none_X_per_stratum);
+num_Y_nonempty_strata = num_strata - colSums (Is_none_Y_per_stratum);
+
+Sum_X_per_stratum = StrataSummator %*% X;
+Sum_Y_per_stratum = StrataSummator %*% Y;
+
+# Recompute some global statistics to exclude NaN-stratum records
+
+cnt_X_excluding_NaNstrata = colSums (Cnt_X_per_stratum);
+cnt_Y_excluding_NaNstrata = colSums (Cnt_Y_per_stratum);
+sum_X_excluding_NaNstrata = colSums (Sum_X_per_stratum);
+sum_Y_excluding_NaNstrata = colSums (Sum_Y_per_stratum);
+var_sumX_excluding_NaNstrata = colSums (StrataSummator %*% (X * X)) - (sum_X_excluding_NaNstrata * sum_X_excluding_NaNstrata) / cnt_X_excluding_NaNstrata;
+var_sumY_excluding_NaNstrata = colSums (StrataSummator %*% (Y * Y)) - (sum_Y_excluding_NaNstrata * sum_Y_excluding_NaNstrata) / cnt_Y_excluding_NaNstrata;
+
+# Compute the stratified statistics
+
+var_sumX_stratified = colSums (StrataSummator %*% (X * X)) - colSums (One_over_cnt_X_per_stratum * Sum_X_per_stratum * Sum_X_per_stratum);
+var_sumY_stratified = colSums (StrataSummator %*% (Y * Y)) - colSums (One_over_cnt_Y_per_stratum * Sum_Y_per_stratum * Sum_Y_per_stratum);
+ sqrt_failsafe_input_3 = var_sumX_stratified / (cnt_X_excluding_NaNstrata - num_X_nonempty_strata);
+stdev_X_stratified = sqrt_failsafe (sqrt_failsafe_input_3);
+ sqrt_failsafe_input_4 = var_sumY_stratified / (cnt_Y_excluding_NaNstrata - num_Y_nonempty_strata);
+stdev_Y_stratified = sqrt_failsafe (sqrt_failsafe_input_4);
+r_sqr_X_vs_strata = 1 - var_sumX_stratified / var_sumX_excluding_NaNstrata;
+r_sqr_Y_vs_strata = 1 - var_sumY_stratified / var_sumY_excluding_NaNstrata;
+fStat_X_vs_strata = ((var_sumX_excluding_NaNstrata - var_sumX_stratified) / (num_X_nonempty_strata - 1)) / (var_sumX_stratified / (cnt_X_excluding_NaNstrata - num_X_nonempty_strata));
+fStat_Y_vs_strata = ((var_sumY_excluding_NaNstrata - var_sumY_stratified) / (num_Y_nonempty_strata - 1)) / (var_sumY_stratified / (cnt_Y_excluding_NaNstrata - num_Y_nonempty_strata));
+p_val_X_vs_strata = fStat_tailprob (fStat_X_vs_strata, num_X_nonempty_strata - 1, cnt_X_excluding_NaNstrata - num_X_nonempty_strata);
+p_val_Y_vs_strata = fStat_tailprob (fStat_Y_vs_strata, num_Y_nonempty_strata - 1, cnt_Y_excluding_NaNstrata - num_Y_nonempty_strata);
+
+print ("Computing the global bivariate statistics...");
+
+# Compute the aggregate X vs. Y statistics and map them into proper positions
+
+cnt_XY_rectangle = t(X_mask) %*% Y_mask;
+sum_X_forXY_rectangle = t(X) %*% Y_mask;
+sum_XX_forXY_rectangle = t(X * X) %*% Y_mask;
+sum_Y_forXY_rectangle = t(X_mask) %*% Y;
+sum_YY_forXY_rectangle = t(X_mask) %*% (Y * Y);
+sum_XY_rectangle = t(X) %*% Y;
+cnt_XY_global = matrix (cnt_XY_rectangle, rows = 1, cols = num_attrs_XY, byrow = TRUE);
+sum_X_forXY_global = matrix (sum_X_forXY_rectangle, rows = 1, cols = num_attrs_XY, byrow = TRUE);
+sum_XX_forXY_global = matrix (sum_XX_forXY_rectangle, rows = 1, cols = num_attrs_XY, byrow = TRUE);
+sum_Y_forXY_global = matrix (sum_Y_forXY_rectangle, rows = 1, cols = num_attrs_XY, byrow = TRUE);
+sum_YY_forXY_global = matrix (sum_YY_forXY_rectangle, rows = 1, cols = num_attrs_XY, byrow = TRUE);
+sum_XY_global = matrix (sum_XY_rectangle, rows = 1, cols = num_attrs_XY, byrow = TRUE);
+ones_XY = matrix (1.0, rows = 1, cols = num_attrs_XY);
+
+# Compute the global bivariate statistics for output
+
+cov_sumX_sumY_global = sum_XY_global - sum_X_forXY_global * sum_Y_forXY_global / cnt_XY_global;
+var_sumX_forXY_global = sum_XX_forXY_global - sum_X_forXY_global * sum_X_forXY_global / cnt_XY_global;
+var_sumY_forXY_global = sum_YY_forXY_global - sum_Y_forXY_global * sum_Y_forXY_global / cnt_XY_global;
+slope_XY_global = cov_sumX_sumY_global / var_sumX_forXY_global;
+ sqrt_failsafe_input_5 = var_sumX_forXY_global * var_sumY_forXY_global;
+ sqrt_failsafe_output_5 = sqrt_failsafe (sqrt_failsafe_input_5);
+corr_XY_global = cov_sumX_sumY_global / sqrt_failsafe_output_5;
+r_sqr_X_vs_Y_global = cov_sumX_sumY_global * cov_sumX_sumY_global / (var_sumX_forXY_global * var_sumY_forXY_global);
+ sqrt_failsafe_input_6 = (1 - r_sqr_X_vs_Y_global) * var_sumY_forXY_global / var_sumX_forXY_global / (cnt_XY_global - 2)
+stdev_slope_XY_global = sqrt_failsafe (sqrt_failsafe_input_6);
+ sqrt_failsafe_input_7 = (1 - r_sqr_X_vs_Y_global) * var_sumY_forXY_global / (cnt_XY_global - 2)
+stdev_errY_vs_X_global = sqrt_failsafe (sqrt_failsafe_input_7);
+fStat_Y_vs_X_global = (cnt_XY_global - 2) * r_sqr_X_vs_Y_global / (1 - r_sqr_X_vs_Y_global);
+p_val_Y_vs_X_global = fStat_tailprob (fStat_Y_vs_X_global, ones_XY, cnt_XY_global - 2);
+
+print ("Computing the stratified bivariate statistics...");
+
+# Create projections to "intermingle" X and Y into attribute pairs
+
+Proj_X_to_XY = matrix (0.0, rows = num_attrs_X, cols = num_attrs_XY);
+Proj_Y_to_XY = matrix (0.0, rows = num_attrs_Y, cols = num_attrs_XY);
+ones_Y_row = matrix (1.0, rows = 1, cols = num_attrs_Y);
+for (i in 1:num_attrs_X) {
+ start_cid = (i - 1) * num_attrs_Y + 1;
+ end_cid = i * num_attrs_Y;
+ Proj_X_to_XY [i, start_cid:end_cid] = ones_Y_row;
+ Proj_Y_to_XY [ , start_cid:end_cid] = diag (ones_Y_row);
+}
+
+# Compute per-stratum statistics, prevent div-0 for locally empty (NaN-filled) strata
+
+Cnt_XY_per_stratum = StrataSummator %*% (( X_mask %*% Proj_X_to_XY) * ( Y_mask %*% Proj_Y_to_XY));
+Sum_X_forXY_per_stratum = StrataSummator %*% (( X %*% Proj_X_to_XY) * ( Y_mask %*% Proj_Y_to_XY));
+Sum_XX_forXY_per_stratum = StrataSummator %*% (((X * X) %*% Proj_X_to_XY) * ( Y_mask %*% Proj_Y_to_XY));
+Sum_Y_forXY_per_stratum = StrataSummator %*% (( X_mask %*% Proj_X_to_XY) * ( Y %*% Proj_Y_to_XY));
+Sum_YY_forXY_per_stratum = StrataSummator %*% (( X_mask %*% Proj_X_to_XY) * ((Y * Y) %*% Proj_Y_to_XY));
+Sum_XY_per_stratum = StrataSummator %*% (( X %*% Proj_X_to_XY) * ( Y %*% Proj_Y_to_XY));
+
+Is_none_XY_per_stratum = ppred (Cnt_XY_per_stratum, 0, "==");
+One_over_cnt_XY_per_stratum = (1 - Is_none_XY_per_stratum) / (Cnt_XY_per_stratum + Is_none_XY_per_stratum);
+num_XY_nonempty_strata = num_strata - colSums (Is_none_XY_per_stratum);
+
+# Recompute some global aggregate X vs. Y statistics to exclude NaN-stratum records
+
+cnt_XY_excluding_NaNstrata = colSums (Cnt_XY_per_stratum);
+sum_XX_forXY_excluding_NaNstrata = colSums (Sum_XX_forXY_per_stratum);
+sum_YY_forXY_excluding_NaNstrata = colSums (Sum_YY_forXY_per_stratum);
+sum_XY_excluding_NaNstrata = colSums (Sum_XY_per_stratum);
+
+# Compute the stratified bivariate statistics
+
+var_sumX_forXY_stratified = sum_XX_forXY_excluding_NaNstrata - colSums (Sum_X_forXY_per_stratum * Sum_X_forXY_per_stratum * One_over_cnt_XY_per_stratum);
+var_sumY_forXY_stratified = sum_YY_forXY_excluding_NaNstrata - colSums (Sum_Y_forXY_per_stratum * Sum_Y_forXY_per_stratum * One_over_cnt_XY_per_stratum);
+cov_sumX_sumY_stratified = sum_XY_excluding_NaNstrata - colSums (Sum_X_forXY_per_stratum * Sum_Y_forXY_per_stratum * One_over_cnt_XY_per_stratum);
+
+slope_XY_stratified = cov_sumX_sumY_stratified / var_sumX_forXY_stratified;
+ sqrt_failsafe_input_8 = var_sumX_forXY_stratified * var_sumY_forXY_stratified;
+ sqrt_failsafe_output_8 = sqrt_failsafe (sqrt_failsafe_input_8);
+corr_XY_stratified = cov_sumX_sumY_stratified / sqrt_failsafe_output_8;
+# R-squared is derived from the failsafe correlation below (avoids div-0/NaN on degenerate variances):
+r_sqr_X_vs_Y_stratified = corr_XY_stratified * corr_XY_stratified;
+ sqrt_failsafe_input_9 = (1 - r_sqr_X_vs_Y_stratified) * var_sumY_forXY_stratified / var_sumX_forXY_stratified / (cnt_XY_excluding_NaNstrata - num_XY_nonempty_strata - 1);
+stdev_slope_XY_stratified = sqrt_failsafe (sqrt_failsafe_input_9);
+ sqrt_failsafe_input_10 = (1 - r_sqr_X_vs_Y_stratified) * var_sumY_forXY_stratified / (cnt_XY_excluding_NaNstrata - num_XY_nonempty_strata - 1);
+stdev_errY_vs_X_stratified = sqrt_failsafe (sqrt_failsafe_input_10);
+fStat_Y_vs_X_stratified = (cnt_XY_excluding_NaNstrata - num_XY_nonempty_strata - 1) * r_sqr_X_vs_Y_stratified / (1 - r_sqr_X_vs_Y_stratified);
+p_val_Y_vs_X_stratified = fStat_tailprob (fStat_Y_vs_X_stratified, ones_XY, cnt_XY_excluding_NaNstrata - num_XY_nonempty_strata - 1);
+
+print ("Preparing the output matrix...");
+OutMtx = matrix (0.0, rows = 40, cols = num_attrs_XY);
+
+OutMtx [ 1, ] = Xcols %*% Proj_X_to_XY; # 1st variate column number
+OutMtx [ 2, ] = cnt_X_global %*% Proj_X_to_XY; # 1st variate global presence count
+OutMtx [ 3, ] = avg_X_global %*% Proj_X_to_XY; # 1st variate global mean
+OutMtx [ 4, ] = stdev_X_global %*% Proj_X_to_XY; # 1st variate global standard deviation
+OutMtx [ 5, ] = stdev_X_stratified %*% Proj_X_to_XY; # 1st variate stratified standard deviation
+OutMtx [ 6, ] = r_sqr_X_vs_strata %*% Proj_X_to_XY; # R-squared, 1st variate vs. strata
+OutMtx [ 7, ] = p_val_X_vs_strata %*% Proj_X_to_XY; # P-value, 1st variate vs. strata
+OutMtx [11, ] = Ycols %*% Proj_Y_to_XY; # 2nd variate column number
+OutMtx [12, ] = cnt_Y_global %*% Proj_Y_to_XY; # 2nd variate global presence count
+OutMtx [13, ] = avg_Y_global %*% Proj_Y_to_XY; # 2nd variate global mean
+OutMtx [14, ] = stdev_Y_global %*% Proj_Y_to_XY; # 2nd variate global standard deviation
+OutMtx [15, ] = stdev_Y_stratified %*% Proj_Y_to_XY; # 2nd variate stratified standard deviation
+OutMtx [16, ] = r_sqr_Y_vs_strata %*% Proj_Y_to_XY; # R-squared, 2nd variate vs. strata
+OutMtx [17, ] = p_val_Y_vs_strata %*% Proj_Y_to_XY; # P-value, 2nd variate vs. strata
+
+
+OutMtx [21, ] = cnt_XY_global; # Global 1st & 2nd variate presence count
+OutMtx [22, ] = slope_XY_global; # Global regression slope (2nd variate vs. 1st variate)
+OutMtx [23, ] = stdev_slope_XY_global; # Global regression slope standard deviation
+OutMtx [24, ] = corr_XY_global; # Global correlation = +/- sqrt(R-squared)
+OutMtx [25, ] = stdev_errY_vs_X_global; # Global residual standard deviation
+OutMtx [26, ] = r_sqr_X_vs_Y_global; # Global R-squared
+OutMtx [27, ] = p_val_Y_vs_X_global; # Global P-value for hypothesis "slope = 0"
+OutMtx [31, ] = cnt_XY_excluding_NaNstrata; # Stratified 1st & 2nd variate presence count
+OutMtx [32, ] = slope_XY_stratified; # Stratified regression slope (2nd variate vs. 1st variate)
+OutMtx [33, ] = stdev_slope_XY_stratified; # Stratified regression slope standard deviation
+OutMtx [34, ] = corr_XY_stratified; # Stratified correlation = +/- sqrt(R-squared)
+OutMtx [35, ] = stdev_errY_vs_X_stratified; # Stratified residual standard deviation
+OutMtx [36, ] = r_sqr_X_vs_Y_stratified; # Stratified R-squared
+OutMtx [37, ] = p_val_Y_vs_X_stratified; # Stratified P-value for hypothesis "slope = 0"
+OutMtx [38, ] = colSums (ppred (Cnt_XY_per_stratum, 2, ">=")); # Number of strata with at least two counted points
+
+OutMtx = t(OutMtx);
+
+print ("Writing the output matrix...");
+write (OutMtx, $5, format="text");
+print ("END STRATIFIED STATISTICS SCRIPT");
+
+
+deNaN = externalFunction (Matrix[Double] A) return (Matrix[Double] B)
+ implemented in (classname = "org.apache.sysml.udf.lib.DeNaNWrapper", exectype = "mem");
+
+fStat_tailprob = function (Matrix[double] fStat, Matrix[double] df_1, Matrix[double] df_2) return (Matrix[double] tailprob)
+{ # TEMPORARY IMPLEMENTATION
+ tailprob = fStat;
+ for (i in 1:nrow(fStat)) {
+ for (j in 1:ncol(fStat)) {
+ q = castAsScalar (fStat [i, j]);
+ d1 = castAsScalar (df_1 [i, j]);
+ d2 = castAsScalar (df_2 [i, j]);
+ if (d1 >= 1 & d2 >= 1 & q >= 0.0) {
+ tailprob [i, j] = pf (target = q, df1 = d1, df2 = d2, lower.tail=FALSE);
+ } else {
+ tailprob [i, j] = 0/0;
+ }
+ } }
+}
+
+sqrt_failsafe = function (Matrix[double] input_A) return (Matrix[double] output_A)
+{
+ NaN = 0/0;
+ mask_A = ppred (input_A, 0.0, ">=");
+ prep_A = input_A * mask_A;
+ mask_A = mask_A - mask_A * (ppred (prep_A, NaN, "=="));
+ prep_A = deNaN (prep_A);
+ output_A = sqrt (prep_A) / mask_A;
+}
+
+sumup = function (Matrix[double] A) return (Matrix[double] sum_A)
+{
+ shift = 1;
+ m_A = nrow(A);
+ sum_A = A;
+ while (shift < m_A) {
+ sum_A [(shift+1):m_A, ] = sum_A [(shift+1):m_A, ] + sum_A [1:(m_A-shift), ];
+ shift = 2 * shift;
+ }
+}
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/05d2c0a8/src/test/scripts/applications/ctableStats/wilson_score.dml
----------------------------------------------------------------------
diff --git a/src/test/scripts/applications/ctableStats/wilson_score.dml b/src/test/scripts/applications/ctableStats/wilson_score.dml
index 90db3fc..27d0899 100644
--- a/src/test/scripts/applications/ctableStats/wilson_score.dml
+++ b/src/test/scripts/applications/ctableStats/wilson_score.dml
@@ -1,145 +1,145 @@
-#-------------------------------------------------------------
-#
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-#
-#-------------------------------------------------------------
-
-# Computes 95% confidence intervals for binomial ratios using Wilson Score and Exact Score
-# INPUT 1: Matrix [rows, 2] of integer counts (m, n) where 0 <= m <= n
-# INPUT 2: The number of rows
-# INPUT 3: The output file
-# OUTPUT : Matrix [rows, 15] of doubles, containing the following information:
-# (m / sum(m), Wilson 95%-conf.left, Wilson 95%-conf.right, Exact 95%-conf.left, Exact 95%-conf.right,
-# n / sum(n), Wilson 95%-conf.left, Wilson 95%-conf.right, Exact 95%-conf.left, Exact 95%-conf.right,
-# m / n, Wilson 95%-conf.left, Wilson 95%-conf.right, Exact 95%-conf.left, Exact 95%-conf.right)
-# PLEASE BE AWARE THAT FOR EXTREMELY SMALL COUNTS THE WILSON INTERVALS WILL BE WRONG! THEY USE GAUSSIAN APPROXIMATION!
-# EXAMPLE: wilson_score.dml -args "test/scripts/applications/ctableStats/wilson_test_input.mtx" 7 "test/scripts/applications/ctableStats/wilson_test_output.mtx"
-
-setwd ("test/scripts/applications/ctableStats");
-source ("Binomial.dml");
-
-# test_n = Rand (rows = 1, cols = 1, min = 6, max = 6);
-# test_m = Rand (rows = 1, cols = 1, min = 0, max = 0);
-# test_p = Rand (rows = 1, cols = 1, min = 0.00421, max = 0.00421);
-# [alpha] = binomProb (test_n, test_m, test_p);
-# print ("TEST: Prob [Binom (" + castAsScalar (test_n) + ", " + castAsScalar (test_p) + ") <= " + castAsScalar (test_m) + "] = " + castAsScalar (alpha));
-
-print ("BEGIN WILSON SCORE SCRIPT");
-print ("Reading X...");
-X = read ($1, rows = $2, cols = 2, format = "text");
-num_rows = $2;
-print ("Performing the computation...");
-
-M = X [, 1];
-N = X [, 2];
-blahh = Rand (rows = num_rows, cols = 1, min = 1.0, max = 1.0);
-sum_M = blahh * sum(M);
-sum_N = blahh * sum(N);
-
-[p_m_sum, l_m_sum_wilson, r_m_sum_wilson] = wilson_confidence (sum_M, M);
-[p_n_sum, l_n_sum_wilson, r_n_sum_wilson] = wilson_confidence (sum_N, N);
-[p_m_n, l_m_n_wilson, r_m_n_wilson] = wilson_confidence (N, M);
-
-M_minus_1 = M - 1;
-N_minus_1 = N - 1;
-big_alpha = 0.975 * blahh;
-small_alpha = 0.025 * blahh;
-
-[l_m_sum_exact] = binomQuantile (sum_M, M_minus_1, big_alpha);
-[r_m_sum_exact] = binomQuantile (sum_M, M, small_alpha);
-[l_n_sum_exact] = binomQuantile (sum_N, N_minus_1, big_alpha);
-[r_n_sum_exact] = binomQuantile (sum_N, N, small_alpha);
-[l_m_n_exact] = binomQuantile (N, M_minus_1, big_alpha);
-[r_m_n_exact] = binomQuantile (N, M, small_alpha);
-
-result = Rand (rows = num_rows, cols = 15, min = 0.0, max = 0.0);
-result [, 1] = p_m_sum;
-result [, 2] = l_m_sum_wilson;
-result [, 3] = r_m_sum_wilson;
-result [, 4] = l_m_sum_exact;
-result [, 5] = r_m_sum_exact;
-result [, 6] = p_n_sum;
-result [, 7] = l_n_sum_wilson;
-result [, 8] = r_n_sum_wilson;
-result [, 9] = l_n_sum_exact;
-result [, 10] = r_n_sum_exact;
-result [, 11] = p_m_n;
-result [, 12] = l_m_n_wilson;
-result [, 13] = r_m_n_wilson;
-result [, 14] = l_m_n_exact;
-result [, 15] = r_m_n_exact;
-
-print ("M / sum(M) RESULTS: Wilson, Exact");
-
-for (i in 1:num_rows) {
- p1 = castAsScalar (round (result [i, 1] * 100000) / 1000);
- lw1 = castAsScalar (round (result [i, 2] * 100000) / 1000);
- rw1 = castAsScalar (round (result [i, 3] * 100000) / 1000);
- le1 = castAsScalar (round (result [i, 4] * 100000) / 1000);
- re1 = castAsScalar (round (result [i, 5] * 100000) / 1000);
- print ("Row " + i + ": "
- + castAsScalar (M [i, 1]) + "/" + castAsScalar (sum_M [i, 1]) + " = "
- + p1 + "% [" + lw1 + "%, " + rw1 + "%] [" + le1 + "%, " + re1 + "%]");
-}
-
-print ("N / sum(N) RESULTS: Wilson, Exact");
-
-for (i in 1:num_rows) {
- p2 = castAsScalar (round (result [i, 6] * 100000) / 1000);
- lw2 = castAsScalar (round (result [i, 7] * 100000) / 1000);
- rw2 = castAsScalar (round (result [i, 8] * 100000) / 1000);
- le2 = castAsScalar (round (result [i, 9] * 100000) / 1000);
- re2 = castAsScalar (round (result [i, 10] * 100000) / 1000);
- print ("Row " + i + ": "
- + castAsScalar (N [i, 1]) + "/" + castAsScalar (sum_N [i, 1]) + " = "
- + p2 + "% [" + lw2 + "%, " + rw2 + "%] [" + le2 + "%, " + re2 + "%] ");
-}
-
-print ("M / N RESULTS: Wilson, Exact");
-
-for (i in 1:num_rows) {
- p3 = castAsScalar (round (result [i, 11] * 100000) / 1000);
- lw3 = castAsScalar (round (result [i, 12] * 100000) / 1000);
- rw3 = castAsScalar (round (result [i, 13] * 100000) / 1000);
- le3 = castAsScalar (round (result [i, 14] * 100000) / 1000);
- re3 = castAsScalar (round (result [i, 15] * 100000) / 1000);
- print ("Row " + i + ": "
- + castAsScalar (M [i, 1]) + "/" + castAsScalar ( N [i, 1]) + " = "
- + p3 + "% [" + lw3 + "%, " + rw3 + "%] [" + le3 + "%, " + re3 + "%] ");
-}
-
-
-
-print ("Writing the results...");
-write (result, $3, format = "text");
-print ("END WILSON SCORE SCRIPT");
-
-
-wilson_confidence = function (Matrix[double] n, Matrix[double] m)
-return (Matrix[double] ratio, Matrix[double] conf_left, Matrix[double] conf_right)
-{
- z = 1.96; # 97.5% normal percentile, for 95% confidence interval
- z_sq_n = z * z * n;
- qroot = sqrt (z_sq_n * (m * (n - m) + z_sq_n / 4));
- midpt = n * m + z_sq_n / 2;
- denom = n * n + z_sq_n;
- ratio = m / n;
- conf_left = (midpt - qroot) / denom;
- conf_right = (midpt + qroot) / denom;
-}
-
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+# Computes 95% confidence intervals for binomial ratios using Wilson Score and Exact Score
+# INPUT 1: Matrix [rows, 2] of integer counts (m, n) where 0 <= m <= n
+# INPUT 2: The number of rows
+# INPUT 3: The output file
+# OUTPUT : Matrix [rows, 15] of doubles, containing the following information:
+# (m / sum(m), Wilson 95%-conf.left, Wilson 95%-conf.right, Exact 95%-conf.left, Exact 95%-conf.right,
+# n / sum(n), Wilson 95%-conf.left, Wilson 95%-conf.right, Exact 95%-conf.left, Exact 95%-conf.right,
+# m / n, Wilson 95%-conf.left, Wilson 95%-conf.right, Exact 95%-conf.left, Exact 95%-conf.right)
+# PLEASE BE AWARE THAT FOR EXTREMELY SMALL COUNTS THE WILSON INTERVALS WILL BE WRONG! THEY USE GAUSSIAN APPROXIMATION!
+# EXAMPLE: wilson_score.dml -args "test/scripts/applications/ctableStats/wilson_test_input.mtx" 7 "test/scripts/applications/ctableStats/wilson_test_output.mtx"
+
+setwd ("test/scripts/applications/ctableStats");
+source ("Binomial.dml");
+
+# test_n = Rand (rows = 1, cols = 1, min = 6, max = 6);
+# test_m = Rand (rows = 1, cols = 1, min = 0, max = 0);
+# test_p = Rand (rows = 1, cols = 1, min = 0.00421, max = 0.00421);
+# [alpha] = binomProb (test_n, test_m, test_p);
+# print ("TEST: Prob [Binom (" + castAsScalar (test_n) + ", " + castAsScalar (test_p) + ") <= " + castAsScalar (test_m) + "] = " + castAsScalar (alpha));
+
+print ("BEGIN WILSON SCORE SCRIPT");
+print ("Reading X...");
+X = read ($1, rows = $2, cols = 2, format = "text");
+num_rows = $2;
+print ("Performing the computation...");
+
+M = X [, 1];
+N = X [, 2];
+blahh = Rand (rows = num_rows, cols = 1, min = 1.0, max = 1.0);
+sum_M = blahh * sum(M);
+sum_N = blahh * sum(N);
+
+[p_m_sum, l_m_sum_wilson, r_m_sum_wilson] = wilson_confidence (sum_M, M);
+[p_n_sum, l_n_sum_wilson, r_n_sum_wilson] = wilson_confidence (sum_N, N);
+[p_m_n, l_m_n_wilson, r_m_n_wilson] = wilson_confidence (N, M);
+
+M_minus_1 = M - 1;
+N_minus_1 = N - 1;
+big_alpha = 0.975 * blahh;
+small_alpha = 0.025 * blahh;
+
+[l_m_sum_exact] = binomQuantile (sum_M, M_minus_1, big_alpha);
+[r_m_sum_exact] = binomQuantile (sum_M, M, small_alpha);
+[l_n_sum_exact] = binomQuantile (sum_N, N_minus_1, big_alpha);
+[r_n_sum_exact] = binomQuantile (sum_N, N, small_alpha);
+[l_m_n_exact] = binomQuantile (N, M_minus_1, big_alpha);
+[r_m_n_exact] = binomQuantile (N, M, small_alpha);
+
+result = Rand (rows = num_rows, cols = 15, min = 0.0, max = 0.0);
+result [, 1] = p_m_sum;
+result [, 2] = l_m_sum_wilson;
+result [, 3] = r_m_sum_wilson;
+result [, 4] = l_m_sum_exact;
+result [, 5] = r_m_sum_exact;
+result [, 6] = p_n_sum;
+result [, 7] = l_n_sum_wilson;
+result [, 8] = r_n_sum_wilson;
+result [, 9] = l_n_sum_exact;
+result [, 10] = r_n_sum_exact;
+result [, 11] = p_m_n;
+result [, 12] = l_m_n_wilson;
+result [, 13] = r_m_n_wilson;
+result [, 14] = l_m_n_exact;
+result [, 15] = r_m_n_exact;
+
+print ("M / sum(M) RESULTS: Wilson, Exact");
+
+for (i in 1:num_rows) {
+ p1 = castAsScalar (round (result [i, 1] * 100000) / 1000);
+ lw1 = castAsScalar (round (result [i, 2] * 100000) / 1000);
+ rw1 = castAsScalar (round (result [i, 3] * 100000) / 1000);
+ le1 = castAsScalar (round (result [i, 4] * 100000) / 1000);
+ re1 = castAsScalar (round (result [i, 5] * 100000) / 1000);
+ print ("Row " + i + ": "
+ + castAsScalar (M [i, 1]) + "/" + castAsScalar (sum_M [i, 1]) + " = "
+ + p1 + "% [" + lw1 + "%, " + rw1 + "%] [" + le1 + "%, " + re1 + "%]");
+}
+
+print ("N / sum(N) RESULTS: Wilson, Exact");
+
+for (i in 1:num_rows) {
+ p2 = castAsScalar (round (result [i, 6] * 100000) / 1000);
+ lw2 = castAsScalar (round (result [i, 7] * 100000) / 1000);
+ rw2 = castAsScalar (round (result [i, 8] * 100000) / 1000);
+ le2 = castAsScalar (round (result [i, 9] * 100000) / 1000);
+ re2 = castAsScalar (round (result [i, 10] * 100000) / 1000);
+ print ("Row " + i + ": "
+ + castAsScalar (N [i, 1]) + "/" + castAsScalar (sum_N [i, 1]) + " = "
+ + p2 + "% [" + lw2 + "%, " + rw2 + "%] [" + le2 + "%, " + re2 + "%] ");
+}
+
+print ("M / N RESULTS: Wilson, Exact");
+
+for (i in 1:num_rows) {
+ p3 = castAsScalar (round (result [i, 11] * 100000) / 1000);
+ lw3 = castAsScalar (round (result [i, 12] * 100000) / 1000);
+ rw3 = castAsScalar (round (result [i, 13] * 100000) / 1000);
+ le3 = castAsScalar (round (result [i, 14] * 100000) / 1000);
+ re3 = castAsScalar (round (result [i, 15] * 100000) / 1000);
+ print ("Row " + i + ": "
+ + castAsScalar (M [i, 1]) + "/" + castAsScalar ( N [i, 1]) + " = "
+ + p3 + "% [" + lw3 + "%, " + rw3 + "%] [" + le3 + "%, " + re3 + "%] ");
+}
+
+
+
+print ("Writing the results...");
+write (result, $3, format = "text");
+print ("END WILSON SCORE SCRIPT");
+
+
+wilson_confidence = function (Matrix[double] n, Matrix[double] m)
+return (Matrix[double] ratio, Matrix[double] conf_left, Matrix[double] conf_right)
+{
+ z = 1.96; # 97.5% normal percentile, for 95% confidence interval
+ z_sq_n = z * z * n;
+ qroot = sqrt (z_sq_n * (m * (n - m) + z_sq_n / 4));
+ midpt = n * m + z_sq_n / 2;
+ denom = n * n + z_sq_n;
+ ratio = m / n;
+ conf_left = (midpt - qroot) / denom;
+ conf_right = (midpt + qroot) / denom;
+}
+
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/05d2c0a8/src/test/scripts/applications/ctableStats/zipftest.dml
----------------------------------------------------------------------
diff --git a/src/test/scripts/applications/ctableStats/zipftest.dml b/src/test/scripts/applications/ctableStats/zipftest.dml
index 08fe217..50ac5f4 100644
--- a/src/test/scripts/applications/ctableStats/zipftest.dml
+++ b/src/test/scripts/applications/ctableStats/zipftest.dml
@@ -1,77 +1,77 @@
-#-------------------------------------------------------------
-#
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-#
-#-------------------------------------------------------------
-
-# Generator of random records with boolean features
-# Average record (row) and feature (column) densities follow
-# power laws: E(#1s in line k) = const / (k + add)^pow
-# Cell[1, 1] has the highest probability to be 1, also input
-
-# By setting num_features >> num_records we allow lots of rare
-# features while keeping most records nonempty.
-# The power ("pow") in the power law determines the tail behavior;
-# The additive ("add") determines how steeply the density changes
-# in the first few records or features.
-
-num_records = 1000; # The number of records (rows)
-num_features = 50000; # The number of boolean features (columns)
-
-pow_records = 2.0; # The Zipf law power for record density
-pow_features = 1.0; # The Zipf law power for feature density
-
-add_records = 100.0; # The additive shift for record density
-add_features = 20.0; # The additive shift for feature density
-
-max_cell_prob = 1.0; # The probability for Cell[1, 1] to be 1
-
-############
-
-c = max_cell_prob * ((1.0 + add_records)^pow_records) * ((1.0 + add_features)^pow_features);
-
-vec_records = matrix (1.0, rows = num_records, cols = 1);
-vec_records = sumup (vec_records);
-vec_records = 1.0 / ((vec_records + add_records)^pow_records);
-
-vec_features = matrix (1.0, rows = num_features, cols = 1);
-vec_features = sumup (vec_features);
-vec_features = 1.0 / ((t(vec_features) + add_features)^pow_features);
-
-Probs = c * (vec_records %*% vec_features);
-avg_density_records = rowSums (Probs);
-avg_density_features = colSums (Probs);
-
-Tosses = Rand (rows = num_records, cols = num_features, min = 0.0, max = 1.0);
-Data = ppred (Tosses, Probs, "<=");
-
-write (avg_density_records, "Zipf.AvgDensity.Rows", format="text");
-write (avg_density_features, "Zipf.AvgDensity.Cols", format="text");
-write (Data, "Zipf.Data", format="text");
-
-
-sumup = function (Matrix[double] A) return (Matrix[double] sum_A)
-{
- shift = 1;
- m_A = nrow(A);
- sum_A = A;
- while (shift < m_A) {
- sum_A [(shift+1):m_A, ] = sum_A [(shift+1):m_A, ] + sum_A [1:(m_A-shift), ];
- shift = 2 * shift;
- }
-}
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+# Generator of random records with boolean features
+# Average record (row) and feature (column) densities follow
+# power laws: E(#1s in line k) = const / (k + add)^pow
+# Cell[1, 1] has the highest probability to be 1, also input
+
+# By setting num_features >> num_records we allow lots of rare
+# features while keeping most records nonempty.
+# The power ("pow") in the power law determines the tail behavior;
+# The additive ("add") determines how steeply the density changes
+# in the first few records or features.
+
+num_records = 1000; # The number of records (rows)
+num_features = 50000; # The number of boolean features (columns)
+
+pow_records = 2.0; # The Zipf law power for record density
+pow_features = 1.0; # The Zipf law power for feature density
+
+add_records = 100.0; # The additive shift for record density
+add_features = 20.0; # The additive shift for feature density
+
+max_cell_prob = 1.0; # The probability for Cell[1, 1] to be 1
+
+############
+
+c = max_cell_prob * ((1.0 + add_records)^pow_records) * ((1.0 + add_features)^pow_features);
+
+vec_records = matrix (1.0, rows = num_records, cols = 1);
+vec_records = sumup (vec_records);
+vec_records = 1.0 / ((vec_records + add_records)^pow_records);
+
+vec_features = matrix (1.0, rows = num_features, cols = 1);
+vec_features = sumup (vec_features);
+vec_features = 1.0 / ((t(vec_features) + add_features)^pow_features);
+
+Probs = c * (vec_records %*% vec_features);
+avg_density_records = rowSums (Probs);
+avg_density_features = colSums (Probs);
+
+Tosses = Rand (rows = num_records, cols = num_features, min = 0.0, max = 1.0);
+Data = ppred (Tosses, Probs, "<=");
+
+write (avg_density_records, "Zipf.AvgDensity.Rows", format="text");
+write (avg_density_features, "Zipf.AvgDensity.Cols", format="text");
+write (Data, "Zipf.Data", format="text");
+
+
+sumup = function (Matrix[double] A) return (Matrix[double] sum_A)
+{
+ shift = 1;
+ m_A = nrow(A);
+ sum_A = A;
+ while (shift < m_A) {
+ sum_A [(shift+1):m_A, ] = sum_A [(shift+1):m_A, ] + sum_A [1:(m_A-shift), ];
+ shift = 2 * shift;
+ }
+}
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/05d2c0a8/src/test/scripts/applications/descriptivestats/Categorical.R
----------------------------------------------------------------------
diff --git a/src/test/scripts/applications/descriptivestats/Categorical.R b/src/test/scripts/applications/descriptivestats/Categorical.R
index 46c380d..f88c8bf 100644
--- a/src/test/scripts/applications/descriptivestats/Categorical.R
+++ b/src/test/scripts/applications/descriptivestats/Categorical.R
@@ -1,57 +1,57 @@
-#-------------------------------------------------------------
-#
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-#
-#-------------------------------------------------------------
-
-# JUnit test class: dml.test.integration.descriptivestats.UnivariateStatsTest.java
-# command line invocation assuming $C_HOME is set to the home of the R script
-# Rscript $C_HOME/Categorical.R $C_HOME/in/ $C_HOME/expected/
-args <- commandArgs(TRUE)
-options(digits=22)
-
-#library("batch")
-library("Matrix")
-
-V = readMM(paste(args[1], "vector.mtx", sep=""))
-
-tab = table(V[,1])
-cat = t(as.numeric(names(tab)))
-Nc = t(as.vector(tab))
-
-# the number of categories of a categorical variable
-R = length(Nc)
-
-# total count
-s = sum(Nc)
-
-# percentage values of each categorical compare to the total case number
-Pc = Nc / s
-
-# all categorical values of a categorical variable
-C = (Nc > 0)
-
-# mode
-mx = max(Nc)
-Mode = (Nc == mx)
-
-writeMM(as(t(Nc),"CsparseMatrix"), paste(args[2], "Nc", sep=""), format="text");
-write(R, paste(args[2], "R", sep=""));
-writeMM(as(t(Pc),"CsparseMatrix"), paste(args[2], "Pc", sep=""), format="text");
-writeMM(as(t(C),"CsparseMatrix"), paste(args[2], "C", sep=""), format="text");
-writeMM(as(t(Mode),"CsparseMatrix"), paste(args[2], "Mode", sep=""), format="text");
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+# JUnit test class: dml.test.integration.descriptivestats.UnivariateStatsTest.java
+# command line invocation assuming $C_HOME is set to the home of the R script
+# Rscript $C_HOME/Categorical.R $C_HOME/in/ $C_HOME/expected/
+args <- commandArgs(TRUE)
+options(digits=22)
+
+#library("batch")
+library("Matrix")
+
+V = readMM(paste(args[1], "vector.mtx", sep=""))
+
+tab = table(V[,1])
+cat = t(as.numeric(names(tab)))
+Nc = t(as.vector(tab))
+
+# the number of categories of a categorical variable
+R = length(Nc)
+
+# total count
+s = sum(Nc)
+
+# percentage values of each categorical compare to the total case number
+Pc = Nc / s
+
+# all categorical values of a categorical variable
+C = (Nc > 0)
+
+# mode
+mx = max(Nc)
+Mode = (Nc == mx)
+
+writeMM(as(t(Nc),"CsparseMatrix"), paste(args[2], "Nc", sep=""), format="text");
+write(R, paste(args[2], "R", sep=""));
+writeMM(as(t(Pc),"CsparseMatrix"), paste(args[2], "Pc", sep=""), format="text");
+writeMM(as(t(C),"CsparseMatrix"), paste(args[2], "C", sep=""), format="text");
+writeMM(as(t(Mode),"CsparseMatrix"), paste(args[2], "Mode", sep=""), format="text");
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/05d2c0a8/src/test/scripts/applications/descriptivestats/Categorical.dml
----------------------------------------------------------------------
diff --git a/src/test/scripts/applications/descriptivestats/Categorical.dml b/src/test/scripts/applications/descriptivestats/Categorical.dml
index 735f5d6..7599d06 100644
--- a/src/test/scripts/applications/descriptivestats/Categorical.dml
+++ b/src/test/scripts/applications/descriptivestats/Categorical.dml
@@ -1,55 +1,55 @@
-#-------------------------------------------------------------
-#
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-#
-#-------------------------------------------------------------
-
-# Note this script is externalized to customers, please do not change w/o consulting component owner.
-# How to invoke this dml script Categorical.dml?
-# Assume C_HOME is set to the home of the dml script
-# Assume input and output directories are on hdfs as INPUT_DIR and OUTPUT_DIR
-# Assume rows = 10000 for vector
-# hadoop jar SystemML.jar -f $C_HOME/Categorical.dml -args "$INPUT_DIR/vector" 10000 "$OUTPUT_DIR/Nc" "$OUPUT_DIR/R" "$OUTPUT_DIR/Pc" "$OUTPUT_DIR/C" "$OUTPUT_DIR/Mode"
-
-V = read($1, rows=$2, cols=1, format="text")
-
-# a set of number of values specify the number of cases of each categorical
-Nc = table(V,1);
-
-# the number of categories of a categorical variable
-R = nrow(Nc)
-
-# total count
-s = sum(Nc)
-
-# percentage values of each categorical compare to the total case number
-Pc = Nc / s
-
-# all categorical values of a categorical variable
-C = ppred(Nc, 0, ">")
-
-# mode
-mx = max(Nc)
-Mode = ppred(Nc, mx, "==")
-
-write(Nc, $3, format="text")
-write(R, $4)
-write(Pc, $5, format="text")
-write(C, $6, format="text")
-write(Mode, $7, format="text")
-
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+# Note this script is externalized to customers, please do not change w/o consulting component owner.
+# How to invoke this dml script Categorical.dml?
+# Assume C_HOME is set to the home of the dml script
+# Assume input and output directories are on hdfs as INPUT_DIR and OUTPUT_DIR
+# Assume rows = 10000 for vector
+# hadoop jar SystemML.jar -f $C_HOME/Categorical.dml -args "$INPUT_DIR/vector" 10000 "$OUTPUT_DIR/Nc" "$OUPUT_DIR/R" "$OUTPUT_DIR/Pc" "$OUTPUT_DIR/C" "$OUTPUT_DIR/Mode"
+
+V = read($1, rows=$2, cols=1, format="text")
+
+# a set of number of values specify the number of cases of each categorical
+Nc = table(V,1);
+
+# the number of categories of a categorical variable
+R = nrow(Nc)
+
+# total count
+s = sum(Nc)
+
+# percentage values of each categorical compare to the total case number
+Pc = Nc / s
+
+# all categorical values of a categorical variable
+C = ppred(Nc, 0, ">")
+
+# mode
+mx = max(Nc)
+Mode = ppred(Nc, mx, "==")
+
+write(Nc, $3, format="text")
+write(R, $4)
+write(Pc, $5, format="text")
+write(C, $6, format="text")
+write(Mode, $7, format="text")
+
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/05d2c0a8/src/test/scripts/applications/descriptivestats/CategoricalCategorical.R
----------------------------------------------------------------------
diff --git a/src/test/scripts/applications/descriptivestats/CategoricalCategorical.R b/src/test/scripts/applications/descriptivestats/CategoricalCategorical.R
index 7c9785c..24a391f 100644
--- a/src/test/scripts/applications/descriptivestats/CategoricalCategorical.R
+++ b/src/test/scripts/applications/descriptivestats/CategoricalCategorical.R
@@ -1,49 +1,49 @@
-#-------------------------------------------------------------
-#
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-#
-#-------------------------------------------------------------
-
-# JUnit test class: dml.test.integration.descriptivestats.CategoricalCategoricalTest.java
-# command line invocation assuming $CC_HOME is set to the home of the R script
-# Rscript $CC_HOME/CategoricalCategorical.R $CC_HOME/in/ $CC_HOME/expected/
-args <- commandArgs(TRUE)
-options(digits=22)
-
-library("Matrix")
-
-A = readMM(paste(args[1], "A.mtx", sep=""));
-B = readMM(paste(args[1], "B.mtx", sep=""));
-
-F = table(A[,1],B[,1]);
-
-# chisq.test returns a list containing statistic, p-value, etc.
-cst = chisq.test(F);
-
-# get the chi-squared coefficient from the list
-chi_squared = as.numeric(cst[1]);
-pValue = as.numeric(cst[3]);
-
-write(pValue, paste(args[2], "PValue", sep=""));
-
-q = min(dim(F));
-W = sum(F);
-cramers_v = sqrt(chi_squared/(W*(q-1)));
-
-write(cramers_v, paste(args[2], "CramersV", sep=""));
-
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+# JUnit test class: dml.test.integration.descriptivestats.CategoricalCategoricalTest.java
+# command line invocation assuming $CC_HOME is set to the home of the R script
+# Rscript $CC_HOME/CategoricalCategorical.R $CC_HOME/in/ $CC_HOME/expected/
+args <- commandArgs(TRUE)
+options(digits=22)
+
+library("Matrix")
+
+A = readMM(paste(args[1], "A.mtx", sep=""));
+B = readMM(paste(args[1], "B.mtx", sep=""));
+
+F = table(A[,1],B[,1]);
+
+# chisq.test returns a list containing statistic, p-value, etc.
+cst = chisq.test(F);
+
+# get the chi-squared coefficient from the list
+chi_squared = as.numeric(cst[1]);
+pValue = as.numeric(cst[3]);
+
+write(pValue, paste(args[2], "PValue", sep=""));
+
+q = min(dim(F));
+W = sum(F);
+cramers_v = sqrt(chi_squared/(W*(q-1)));
+
+write(cramers_v, paste(args[2], "CramersV", sep=""));
+
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/05d2c0a8/src/test/scripts/applications/descriptivestats/CategoricalCategorical.dml
----------------------------------------------------------------------
diff --git a/src/test/scripts/applications/descriptivestats/CategoricalCategorical.dml b/src/test/scripts/applications/descriptivestats/CategoricalCategorical.dml
index 626bfeb..4301f91 100644
--- a/src/test/scripts/applications/descriptivestats/CategoricalCategorical.dml
+++ b/src/test/scripts/applications/descriptivestats/CategoricalCategorical.dml
@@ -1,56 +1,56 @@
-#-------------------------------------------------------------
-#
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-#
-#-------------------------------------------------------------
-
-# Note this script is externalized to customers, please do not change w/o consulting component owner.
-# How to invoke this dml script CategoricalCategorical.dml?
-# Assume CC_HOME is set to the home of the dml script
-# Assume input and output directories are on hdfs as INPUT_DIR and OUTPUT_DIR
-# Assume rows = 10000 for both A and B
-# hadoop jar SystemML.jar -f $CC_HOME/CategoricalCategorical.dml -args "$INPUT_DIR/A" 10000 "$INPUT_DIR/B" "$OUPUT_DIR/PValue" "$OUTPUT_DIR/CramersV"
-
-A = read($1, rows=$2, cols=1, format="text");
-B = read($3, rows=$2, cols=1, format="text");
-
-# Contingency Table
-F = table(A,B);
-
-# Chi-Squared
-W = sum(F);
-r = rowSums(F);
-c = colSums(F);
-E = (r %*% c)/W;
-T = (F-E)^2/E;
-chi_squared = sum(T);
-
-# compute p-value
-degFreedom = (nrow(F)-1)*(ncol(F)-1);
-pValue = pchisq(target=chi_squared, df=degFreedom, lower.tail=FALSE);
-
-# Cramer's V
-R = nrow(F);
-C = ncol(F);
-q = min(R,C);
-cramers_v = sqrt(chi_squared/(W*(q-1)));
-
-write(pValue, $4);
-write(cramers_v, $5);
-
-
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+# Note this script is externalized to customers, please do not change w/o consulting component owner.
+# How to invoke this dml script CategoricalCategorical.dml?
+# Assume CC_HOME is set to the home of the dml script
+# Assume input and output directories are on hdfs as INPUT_DIR and OUTPUT_DIR
+# Assume rows = 10000 for both A and B
+# hadoop jar SystemML.jar -f $CC_HOME/CategoricalCategorical.dml -args "$INPUT_DIR/A" 10000 "$INPUT_DIR/B" "$OUPUT_DIR/PValue" "$OUTPUT_DIR/CramersV"
+
+A = read($1, rows=$2, cols=1, format="text");
+B = read($3, rows=$2, cols=1, format="text");
+
+# Contingency Table
+F = table(A,B);
+
+# Chi-Squared
+W = sum(F);
+r = rowSums(F);
+c = colSums(F);
+E = (r %*% c)/W;
+T = (F-E)^2/E;
+chi_squared = sum(T);
+
+# compute p-value
+degFreedom = (nrow(F)-1)*(ncol(F)-1);
+pValue = pchisq(target=chi_squared, df=degFreedom, lower.tail=FALSE);
+
+# Cramer's V
+R = nrow(F);
+C = ncol(F);
+q = min(R,C);
+cramers_v = sqrt(chi_squared/(W*(q-1)));
+
+write(pValue, $4);
+write(cramers_v, $5);
+
+
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/05d2c0a8/src/test/scripts/applications/descriptivestats/CategoricalCategoricalWithWeightsTest.R
----------------------------------------------------------------------
diff --git a/src/test/scripts/applications/descriptivestats/CategoricalCategoricalWithWeightsTest.R b/src/test/scripts/applications/descriptivestats/CategoricalCategoricalWithWeightsTest.R
index 9e3f797..bacc7e3 100644
--- a/src/test/scripts/applications/descriptivestats/CategoricalCategoricalWithWeightsTest.R
+++ b/src/test/scripts/applications/descriptivestats/CategoricalCategoricalWithWeightsTest.R
@@ -1,67 +1,67 @@
-#-------------------------------------------------------------
-#
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-#
-#-------------------------------------------------------------
-
-# JUnit test class: dml.test.integration.descriptivestats.CategoricalCategoricalTest.java
-# command line invocation assuming $CC_HOME is set to the home of the R script
-# Rscript $CC_HOME/CategoricalCategoricalWithWeightsTest.R $CC_HOME/in/ $CC_HOME/expected/
-# Usage: R --vanilla -args Xfile X < CategoricalCategoricalWithWeightsTest.R
-
-args <- commandArgs(TRUE)
-options(digits=22)
-
-library("Matrix")
-
-#parseCommandArgs()
-######################
-
-print(commandArgs(TRUE)[1])
-
-A = readMM(paste(args[1], "A.mtx", sep=""));
-B = readMM(paste(args[1], "B.mtx", sep=""));
-WM = readMM(paste(args[1], "WM.mtx", sep=""));
-
-Av = A[,1];
-Bv = B[,1];
-WMv = WM[,1];
-
-# create a data frame with vectors A, B, WM
-df = data.frame(Av,Bv,WMv);
-
-# contingency table with weights
-F = xtabs ( WMv ~ Av + Bv, df);
-
-# chisq.test returns a list containing statistic, p-value, etc.
-cst = chisq.test(F);
-
-# get the chi-squared coefficient from the list
-chi_squared = as.numeric(cst[1]);
-pValue = as.numeric(cst[3]);
-
-write(pValue, paste(args[2], "PValue", sep=""));
-
-#######################
-
-q = min(dim(F));
-W = sum(F);
-cramers_v = sqrt(chi_squared/(W*(q-1)));
-
-write(cramers_v, paste(args[2], "CramersV", sep=""));
-
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+# JUnit test class: dml.test.integration.descriptivestats.CategoricalCategoricalTest.java
+# command line invocation assuming $CC_HOME is set to the home of the R script
+# Rscript $CC_HOME/CategoricalCategoricalWithWeightsTest.R $CC_HOME/in/ $CC_HOME/expected/
+# Usage: R --vanilla -args Xfile X < CategoricalCategoricalWithWeightsTest.R
+
+args <- commandArgs(TRUE)
+options(digits=22)
+
+library("Matrix")
+
+#parseCommandArgs()
+######################
+
+print(commandArgs(TRUE)[1])
+
+A = readMM(paste(args[1], "A.mtx", sep=""));
+B = readMM(paste(args[1], "B.mtx", sep=""));
+WM = readMM(paste(args[1], "WM.mtx", sep=""));
+
+Av = A[,1];
+Bv = B[,1];
+WMv = WM[,1];
+
+# create a data frame with vectors A, B, WM
+df = data.frame(Av,Bv,WMv);
+
+# contingency table with weights
+F = xtabs ( WMv ~ Av + Bv, df);
+
+# chisq.test returns a list containing statistic, p-value, etc.
+cst = chisq.test(F);
+
+# get the chi-squared coefficient from the list
+chi_squared = as.numeric(cst[1]);
+pValue = as.numeric(cst[3]);
+
+write(pValue, paste(args[2], "PValue", sep=""));
+
+#######################
+
+q = min(dim(F));
+W = sum(F);
+cramers_v = sqrt(chi_squared/(W*(q-1)));
+
+write(cramers_v, paste(args[2], "CramersV", sep=""));
+
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/05d2c0a8/src/test/scripts/applications/descriptivestats/CategoricalCategoricalWithWeightsTest.dml
----------------------------------------------------------------------
diff --git a/src/test/scripts/applications/descriptivestats/CategoricalCategoricalWithWeightsTest.dml b/src/test/scripts/applications/descriptivestats/CategoricalCategoricalWithWeightsTest.dml
index f92f42c..70e50f4 100644
--- a/src/test/scripts/applications/descriptivestats/CategoricalCategoricalWithWeightsTest.dml
+++ b/src/test/scripts/applications/descriptivestats/CategoricalCategoricalWithWeightsTest.dml
@@ -1,60 +1,60 @@
-#-------------------------------------------------------------
-#
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-#
-#-------------------------------------------------------------
-
-# Note this script is externalized to customers, please do not change w/o consulting component owner.
-# How to invoke this dml script CategoricalCategorical.dml?
-# Assume CC_HOME is set to the home of the dml script
-# Assume input and output directories are on hdfs as INPUT_DIR and OUTPUT_DIR
-# Assume rows = 10000 for both A and B
-# hadoop jar SystemML.jar -f $CC_HOME/CategoricalCategoricalWithWeightsTest.dml -args "$INPUT_DIR/A" 10000 "$INPUT_DIR/B" "$INPUT_DIR/WM" "$OUPUT_DIR/PValue" "$OUTPUT_DIR/CramersV"
-
-# A <- nominal
-# B <- nominal
-# WM <- weights
-
-A = read($1, rows=$2, cols=1, format="text");
-B = read($3, rows=$2, cols=1, format="text");
-WM = read($4, rows=$2, cols=1, format="text");
-
-# Contingency Table
-F = table(A,B,WM);
-
-# Chi-Squared
-W = sum(F);
-r = rowSums(F);
-c = colSums(F);
-E = (r %*% c)/W;
-T = (F-E)^2/E;
-chi_squared = sum(T);
-
-# compute p-value
-degFreedom = (nrow(F)-1)*(ncol(F)-1);
-pValue = pchisq(target=chi_squared, df=degFreedom, lower.tail=FALSE);
-
-# Cramer's V
-R = nrow(F);
-C = ncol(F);
-q = min(R,C);
-cramers_v = sqrt(chi_squared/(W*(q-1)));
-
-write(pValue, $5);
-write(cramers_v, $6);
-
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+# Note this script is externalized to customers, please do not change w/o consulting component owner.
+# How to invoke this dml script CategoricalCategorical.dml?
+# Assume CC_HOME is set to the home of the dml script
+# Assume input and output directories are on hdfs as INPUT_DIR and OUTPUT_DIR
+# Assume rows = 10000 for both A and B
+# hadoop jar SystemML.jar -f $CC_HOME/CategoricalCategoricalWithWeightsTest.dml -args "$INPUT_DIR/A" 10000 "$INPUT_DIR/B" "$INPUT_DIR/WM" "$OUPUT_DIR/PValue" "$OUTPUT_DIR/CramersV"
+
+# A <- nominal
+# B <- nominal
+# WM <- weights
+
+A = read($1, rows=$2, cols=1, format="text");
+B = read($3, rows=$2, cols=1, format="text");
+WM = read($4, rows=$2, cols=1, format="text");
+
+# Contingency Table
+F = table(A,B,WM);
+
+# Chi-Squared
+W = sum(F);
+r = rowSums(F);
+c = colSums(F);
+E = (r %*% c)/W;
+T = (F-E)^2/E;
+chi_squared = sum(T);
+
+# compute p-value
+degFreedom = (nrow(F)-1)*(ncol(F)-1);
+pValue = pchisq(target=chi_squared, df=degFreedom, lower.tail=FALSE);
+
+# Cramer's V
+R = nrow(F);
+C = ncol(F);
+q = min(R,C);
+cramers_v = sqrt(chi_squared/(W*(q-1)));
+
+write(pValue, $5);
+write(cramers_v, $6);
+