You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@systemds.apache.org by mb...@apache.org on 2021/07/29 21:21:34 UTC
[systemds] branch master updated: [SYSTEMDS-2845] New builtin
function stratified statistics
This is an automated email from the ASF dual-hosted git repository.
mboehm7 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/systemds.git
The following commit(s) were added to refs/heads/master by this push:
new b550f69 [SYSTEMDS-2845] New builtin function stratified statistics
b550f69 is described below
commit b550f69ac9c16f5207e395f0d24bc34cd43da461
Author: juulez <Ju...@student.tugraz.at>
AuthorDate: Thu Jul 29 22:37:45 2021 +0200
[SYSTEMDS-2845] New builtin function stratified statistics
AMLS project SS2021.
Closes #1354.
---
scripts/builtin/stratstats.dml | 379 +++++++++++++++++++++
.../java/org/apache/sysds/common/Builtins.java | 1 +
.../functions/builtin/BuiltinStratstatsTest.java | 95 ++++++
.../codegenalg/partone/AlgorithmAutoEncoder.java | 2 +-
src/test/scripts/functions/builtin/stratstats.R | 281 +++++++++++++++
src/test/scripts/functions/builtin/stratstats.dml | 26 ++
6 files changed, 783 insertions(+), 1 deletion(-)
diff --git a/scripts/builtin/stratstats.dml b/scripts/builtin/stratstats.dml
new file mode 100644
index 0000000..13500cb
--- /dev/null
+++ b/scripts/builtin/stratstats.dml
@@ -0,0 +1,379 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+
+# INPUT PARAMETERS:
+# ---------------------------------------------------------------------------------------------
+# NAME TYPE DEFAULT MEANING
+# ---------------------------------------------------------------------------------------------
+# X Double --- Matrix X that has all 1-st covariates
+# Y Double " " Matrix Y that has all 2-nd covariates
+# the default value " " means "use X in place of Y"
+# S Double " " Matrix S that has the stratum column
+# the default value " " means "use X in place of S"
+# Xcid Double " " 1-st covariate X-column indices
+# the default value " " means "use columns 1 : ncol(X)"
+# Ycid Double " " 2-nd covariate Y-column indices
+# the default value " " means "use columns 1 : ncol(Y)"
+# Scid Int 1 Column index of the stratum column in S
+# ---------------------------------------------------------------------------------------------
+
+
+# OUTPUT MATRIX:
+# ---------------------------------------------------------------------------------------------
+# OutMtx Double --- Output matrix, one row per each distinct pair
+# (1st covariante, 2nd covariante)
+# 40 columns containing the following information:
+# Col 01: 1st covariate X-column number
+# Col 02: 1st covariate global presence count
+# Col 03: 1st covariate global mean
+# Col 04: 1st covariate global standard deviation
+# Col 05: 1st covariate stratified standard deviation
+# Col 06: R-squared, 1st covariate vs. strata
+# Col 07: adjusted R-squared, 1st covariate vs. strata
+# Col 08: P-value, 1st covariate vs. strata
+# Col 09-10: Reserved
+# Col 11: 2nd covariate Y-column number
+# Col 12: 2nd covariate global presence count
+# Col 13: 2nd covariate global mean
+# Col 14: 2nd covariate global standard deviation
+# Col 15: 2nd covariate stratified standard deviation
+# Col 16: R-squared, 2nd covariate vs. strata
+# Col 17: adjusted R-squared, 2nd covariate vs. strata
+# Col 18: P-value, 2nd covariate vs. strata
+# Col 19-20: Reserved
+# Col 21: Global 1st & 2nd covariate presence count
+# Col 22: Global regression slope (2nd vs. 1st covariate)
+# Col 23: Global regression slope standard deviation
+# Col 24: Global correlation = +/- sqrt(R-squared)
+# Col 25: Global residual standard deviation
+# Col 26: Global R-squared
+# Col 27: Global adjusted R-squared
+# Col 28: Global P-value for hypothesis "slope = 0"
+# Col 29-30: Reserved
+# Col 31: Stratified 1st & 2nd covariate presence count
+# Col 32: Stratified regression slope (2nd vs. 1st covariate)
+# Col 33: Stratified regression slope standard deviation
+# Col 34: Stratified correlation = +/- sqrt(R-squared)
+# Col 35: Stratified residual standard deviation
+# Col 36: Stratified R-squared
+# Col 37: Stratified adjusted R-squared
+# Col 38: Stratified P-value for hypothesis "slope = 0"
+# Col 39: Number of strata with at least two counted points
+# Col 40: Reserved
+# ---------------------------------------------------------------------------------------------
+
+
+m_stratstats = function(Matrix[Double] X, Matrix[Double] Y = matrix(0.0, rows=1,cols=1),
+ Matrix[Double] S = matrix(0.0, rows=1,cols=1), Matrix[Double] Xcid = matrix(0.0, rows=1,cols=1),
+ Matrix[Double] Ycid = matrix(0.0, rows=1,cols=1), Integer Scid = 1, Boolean debug = FALSE)
+ return(Matrix[Double] OutMtx)
+{
+ stratum_column_id = Scid;
+ XwithNaNs = X;
+ if (nrow(Y) == 1 & ncol(Y) == 1) {
+ YwithNaNs = XwithNaNs;
+ } else {
+ YwithNaNs = Y;
+ }
+
+ if (nrow(S) == 1 & ncol(S) == 1) {
+ SwithNaNs = XwithNaNs[, stratum_column_id];
+ } else {
+ SwithNaNsFull = S;
+ SwithNaNs = SwithNaNsFull[, stratum_column_id];
+ }
+
+ if (nrow(Xcid) == 1 & ncol(Xcid) == 1) {
+ Xcols = t(seq(1, ncol(XwithNaNs), 1));
+ } else {
+ Xcols = Xcid;
+ }
+
+ if (nrow(Ycid) == 1 & ncol(Ycid) == 1) {
+ Ycols = t(seq(1, ncol(YwithNaNs), 1));
+ } else {
+ Ycols = Ycid;
+ }
+ tXcols = t(Xcols);
+ tYcols = t(Ycols);
+
+ num_records = nrow (XwithNaNs);
+ num_attrs = ncol (XwithNaNs);
+ num_attrs_X = ncol (Xcols);
+ num_attrs_Y = ncol (Ycols);
+ num_attrs_XY = num_attrs_X * num_attrs_Y;
+
+ if (debug)
+ print ("Preparing the covariates...");
+
+ XnoNaNs = replace (target = XwithNaNs, pattern = NaN, replacement = 0);
+ YnoNaNs = replace (target = YwithNaNs, pattern = NaN, replacement = 0);
+ XNaNmask = (XwithNaNs == XwithNaNs);
+ YNaNmask = (YwithNaNs == YwithNaNs);
+ one_to_num_attrs_X = seq (1, num_attrs_X, 1);
+ one_to_num_attrs_Y = seq (1, num_attrs_Y, 1);
+ ProjX = matrix (0, rows = num_attrs, cols = num_attrs_X);
+ ProjY = matrix (0, rows = num_attrs, cols = num_attrs_Y);
+
+ ProjX_ctable = table (tXcols, one_to_num_attrs_X);
+ ProjX [1 : nrow (ProjX_ctable), ] = ProjX_ctable;
+
+ ProjY_ctable = table (tYcols, one_to_num_attrs_Y);
+ ProjY [1 : nrow (ProjY_ctable), ] = ProjY_ctable;
+
+ X = XnoNaNs %*% ProjX;
+ Y = YnoNaNs %*% ProjY;
+ X_mask = XNaNmask %*% ProjX;
+ Y_mask = YNaNmask %*% ProjY;
+
+ if (debug)
+ print ("Preparing the strata...");
+ SnoNaNs = replace (target = SwithNaNs, pattern = NaN, replacement = 0);
+ S = round (SnoNaNs) * (SnoNaNs > 0);
+ Proj_good_stratumID = diag (S > 0);
+ Proj_good_stratumID = removeEmpty (target = Proj_good_stratumID, margin = "rows");
+ vector_of_good_stratumIDs = Proj_good_stratumID %*% S;
+ vector_of_good_stratumIDs = vector_of_good_stratumIDs + (1 - min (vector_of_good_stratumIDs));
+ num_records_with_good_stratumID = nrow(Proj_good_stratumID)
+ one_to_num_records_with_good_stratumID = seq(1, num_records_with_good_stratumID, 1);
+
+ # Create a group-by summation matrix for records over stratum IDs
+ # "with_empty" means with stratum IDs that never occur in records
+
+ num_strata_with_empty = max(vector_of_good_stratumIDs);
+ StrataSummator_with_empty = table(vector_of_good_stratumIDs, one_to_num_records_with_good_stratumID);
+ StrataSummator = removeEmpty(target = StrataSummator_with_empty, margin = "rows");
+ StrataSummator = StrataSummator %*% Proj_good_stratumID;
+ num_strata = nrow (StrataSummator);
+ num_empty_strata = num_strata_with_empty - num_strata;
+ if (debug){
+ print ("There are " + num_strata + " nonempty strata and " + num_empty_strata + " empty but positive-ID strata.");
+ print ("Computing the global single-variate statistics...");
+ }
+
+ cnt_X_global = colSums (X_mask);
+ cnt_Y_global = colSums (Y_mask);
+ avg_X_global = colSums (X) / cnt_X_global;
+ avg_Y_global = colSums (Y) / cnt_Y_global;
+ var_sumX_global = colSums (X * X) - cnt_X_global * (avg_X_global * avg_X_global);
+ var_sumY_global = colSums (Y * Y) - cnt_Y_global * (avg_Y_global * avg_Y_global);
+ sqrt_failsafe_input_1 = var_sumX_global / (cnt_X_global - 1);
+ stdev_X_global = sqrt_failsafe (sqrt_failsafe_input_1);
+ sqrt_failsafe_input_2 = var_sumY_global / (cnt_Y_global - 1);
+ stdev_Y_global = sqrt_failsafe (sqrt_failsafe_input_2);
+
+ if (debug)
+ print ("Computing the stratified single-variate statistics...");
+ # Compute per-stratum statistics, prevent div-0 for locally empty (due to NaNs in X or Y) strata
+ Cnt_X_per_stratum = StrataSummator %*% X_mask;
+ Cnt_Y_per_stratum = StrataSummator %*% Y_mask;
+ Is_none_X_per_stratum = (Cnt_X_per_stratum == 0);
+ Is_none_Y_per_stratum = (Cnt_Y_per_stratum == 0);
+ One_over_cnt_X_per_stratum = (1 - Is_none_X_per_stratum) / (Cnt_X_per_stratum + Is_none_X_per_stratum);
+ One_over_cnt_Y_per_stratum = (1 - Is_none_Y_per_stratum) / (Cnt_Y_per_stratum + Is_none_Y_per_stratum);
+ num_X_nonempty_strata = num_strata - colSums (Is_none_X_per_stratum);
+ num_Y_nonempty_strata = num_strata - colSums (Is_none_Y_per_stratum);
+
+ Sum_X_per_stratum = StrataSummator %*% X;
+ Sum_Y_per_stratum = StrataSummator %*% Y;
+
+ # Recompute some global statistics to exclude bad stratum-ID records
+ cnt_X_with_good_stratumID = colSums (Cnt_X_per_stratum);
+ cnt_Y_with_good_stratumID = colSums (Cnt_Y_per_stratum);
+ sum_X_with_good_stratumID = colSums (Sum_X_per_stratum);
+ sum_Y_with_good_stratumID = colSums (Sum_Y_per_stratum);
+ var_sumX_with_good_stratumID = colSums (StrataSummator %*% (X * X)) - (sum_X_with_good_stratumID * sum_X_with_good_stratumID) / cnt_X_with_good_stratumID;
+ var_sumY_with_good_stratumID = colSums (StrataSummator %*% (Y * Y)) - (sum_Y_with_good_stratumID * sum_Y_with_good_stratumID) / cnt_Y_with_good_stratumID;
+
+ # Compute the stratified statistics
+ var_sumX_stratified = colSums (StrataSummator %*% (X * X)) - colSums (One_over_cnt_X_per_stratum * Sum_X_per_stratum * Sum_X_per_stratum);
+ var_sumY_stratified = colSums (StrataSummator %*% (Y * Y)) - colSums (One_over_cnt_Y_per_stratum * Sum_Y_per_stratum * Sum_Y_per_stratum);
+ sqrt_failsafe_input_3 = var_sumX_stratified / (cnt_X_with_good_stratumID - num_X_nonempty_strata);
+ stdev_X_stratified = sqrt_failsafe (sqrt_failsafe_input_3);
+ sqrt_failsafe_input_4 = var_sumY_stratified / (cnt_Y_with_good_stratumID - num_Y_nonempty_strata);
+ stdev_Y_stratified = sqrt_failsafe (sqrt_failsafe_input_4);
+ r_sqr_X_vs_strata = 1 - var_sumX_stratified / var_sumX_with_good_stratumID;
+ r_sqr_Y_vs_strata = 1 - var_sumY_stratified / var_sumY_with_good_stratumID;
+ adj_r_sqr_X_vs_strata = 1 - (var_sumX_stratified / (cnt_X_with_good_stratumID - num_X_nonempty_strata)) / (var_sumX_with_good_stratumID / (cnt_X_with_good_stratumID - 1));
+ adj_r_sqr_Y_vs_strata = 1 - (var_sumY_stratified / (cnt_Y_with_good_stratumID - num_Y_nonempty_strata)) / (var_sumY_with_good_stratumID / (cnt_Y_with_good_stratumID - 1));
+ fStat_X_vs_strata = ((var_sumX_with_good_stratumID - var_sumX_stratified) / (num_X_nonempty_strata - 1)) / (var_sumX_stratified / (cnt_X_with_good_stratumID - num_X_nonempty_strata));
+ fStat_Y_vs_strata = ((var_sumY_with_good_stratumID - var_sumY_stratified) / (num_Y_nonempty_strata - 1)) / (var_sumY_stratified / (cnt_Y_with_good_stratumID - num_Y_nonempty_strata));
+ p_val_X_vs_strata = fStat_tailprob (fStat_X_vs_strata, num_X_nonempty_strata - 1, cnt_X_with_good_stratumID - num_X_nonempty_strata);
+ p_val_Y_vs_strata = fStat_tailprob (fStat_Y_vs_strata, num_Y_nonempty_strata - 1, cnt_Y_with_good_stratumID - num_Y_nonempty_strata);
+
+ if (debug)
+ print ("Computing the global bivariate statistics...");
+ # Compute the aggregate X vs. Y statistics and map them into proper positions
+ cnt_XY_rectangle = t(X_mask) %*% Y_mask;
+ sum_X_forXY_rectangle = t(X) %*% Y_mask;
+ sum_XX_forXY_rectangle = t(X * X) %*% Y_mask;
+ sum_Y_forXY_rectangle = t(X_mask) %*% Y;
+ sum_YY_forXY_rectangle = t(X_mask) %*% (Y * Y);
+ sum_XY_rectangle = t(X) %*% Y;
+ cnt_XY_global = matrix(cnt_XY_rectangle, rows = 1, cols = num_attrs_XY, byrow = TRUE);
+ sum_X_forXY_global = matrix(sum_X_forXY_rectangle, rows = 1, cols = num_attrs_XY, byrow = TRUE);
+ sum_XX_forXY_global = matrix(sum_XX_forXY_rectangle, rows = 1, cols = num_attrs_XY, byrow = TRUE);
+ sum_Y_forXY_global = matrix(sum_Y_forXY_rectangle, rows = 1, cols = num_attrs_XY, byrow = TRUE);
+ sum_YY_forXY_global = matrix(sum_YY_forXY_rectangle, rows = 1, cols = num_attrs_XY, byrow = TRUE);
+ sum_XY_global = matrix(sum_XY_rectangle, rows = 1, cols = num_attrs_XY, byrow = TRUE);
+ ones_XY = matrix(1.0, rows = 1, cols = num_attrs_XY);
+
+ # Compute the global bivariate statistics for output
+ cov_sumX_sumY_global = sum_XY_global - sum_X_forXY_global * sum_Y_forXY_global / cnt_XY_global;
+ var_sumX_forXY_global = sum_XX_forXY_global - sum_X_forXY_global * sum_X_forXY_global / cnt_XY_global;
+ var_sumY_forXY_global = sum_YY_forXY_global - sum_Y_forXY_global * sum_Y_forXY_global / cnt_XY_global;
+
+ slope_XY_global = cov_sumX_sumY_global / var_sumX_forXY_global;
+ sqrt_failsafe_input_5 = var_sumX_forXY_global * var_sumY_forXY_global;
+ sqrt_failsafe_output_5 = sqrt_failsafe (sqrt_failsafe_input_5);
+ corr_XY_global = cov_sumX_sumY_global / sqrt_failsafe_output_5;
+ r_sqr_X_vs_Y_global = cov_sumX_sumY_global * cov_sumX_sumY_global / (var_sumX_forXY_global * var_sumY_forXY_global);
+ adj_r_sqr_X_vs_Y_global = 1 - (1 - r_sqr_X_vs_Y_global) * (cnt_XY_global - 1) / (cnt_XY_global - 2);
+ sqrt_failsafe_input_6 = (1 - r_sqr_X_vs_Y_global) * var_sumY_forXY_global / var_sumX_forXY_global / (cnt_XY_global - 2);
+ stdev_slope_XY_global = sqrt_failsafe (sqrt_failsafe_input_6);
+ sqrt_failsafe_input_7 = (1 - r_sqr_X_vs_Y_global) * var_sumY_forXY_global / (cnt_XY_global - 2);
+ stdev_errY_vs_X_global = sqrt_failsafe (sqrt_failsafe_input_7);
+ fStat_Y_vs_X_global = (cnt_XY_global - 2) * r_sqr_X_vs_Y_global / (1 - r_sqr_X_vs_Y_global);
+ p_val_Y_vs_X_global = fStat_tailprob (fStat_Y_vs_X_global, ones_XY, cnt_XY_global - 2);
+
+ if (debug)
+ print ("Computing the stratified bivariate statistics...");
+ # Create projections to "intermingle" X and Y into attribute pairs
+ Proj_X_to_XY = matrix (0.0, rows = num_attrs_X, cols = num_attrs_XY);
+ Proj_Y_to_XY = matrix (0.0, rows = num_attrs_Y, cols = num_attrs_XY);
+ ones_Y_col = matrix (1.0, rows = num_attrs_Y, cols = 1);
+ for (i in 1:num_attrs_X) {
+ start_cid = (i - 1) * num_attrs_Y + 1;
+ end_cid = i * num_attrs_Y;
+ Proj_X_to_XY [i, start_cid:end_cid] = t(ones_Y_col);
+ Proj_Y_to_XY [ , start_cid:end_cid] = diag(ones_Y_col);
+ }
+
+
+ # Compute per-stratum statistics, prevent div-0 for locally empty (due to NaNs in X or Y) strata
+ Cnt_XY_per_stratum = StrataSummator %*% (( X_mask %*% Proj_X_to_XY) * ( Y_mask %*% Proj_Y_to_XY));
+ Sum_X_forXY_per_stratum = StrataSummator %*% (( X %*% Proj_X_to_XY) * ( Y_mask %*% Proj_Y_to_XY));
+ Sum_XX_forXY_per_stratum = StrataSummator %*% (((X * X) %*% Proj_X_to_XY) * ( Y_mask %*% Proj_Y_to_XY));
+ Sum_Y_forXY_per_stratum = StrataSummator %*% (( X_mask %*% Proj_X_to_XY) * ( Y %*% Proj_Y_to_XY));
+ Sum_YY_forXY_per_stratum = StrataSummator %*% (( X_mask %*% Proj_X_to_XY) * ((Y * Y) %*% Proj_Y_to_XY));
+ Sum_XY_per_stratum = StrataSummator %*% (( X %*% Proj_X_to_XY) * ( Y %*% Proj_Y_to_XY));
+
+ Is_none_XY_per_stratum = (Cnt_XY_per_stratum == 0);
+ One_over_cnt_XY_per_stratum = (1 - Is_none_XY_per_stratum) / (Cnt_XY_per_stratum + Is_none_XY_per_stratum);
+ num_XY_nonempty_strata = num_strata - colSums (Is_none_XY_per_stratum);
+
+ # Recompute some global aggregate X vs. Y statistics to exclude bad stratum-ID records
+ cnt_XY_with_good_stratumID = colSums (Cnt_XY_per_stratum);
+ sum_XX_forXY_with_good_stratumID = colSums (Sum_XX_forXY_per_stratum);
+ sum_YY_forXY_with_good_stratumID = colSums (Sum_YY_forXY_per_stratum);
+ sum_XY_with_good_stratumID = colSums (Sum_XY_per_stratum);
+
+ # Compute the stratified bivariate statistics
+ var_sumX_forXY_stratified = sum_XX_forXY_with_good_stratumID - colSums (Sum_X_forXY_per_stratum * Sum_X_forXY_per_stratum * One_over_cnt_XY_per_stratum);
+ var_sumY_forXY_stratified = sum_YY_forXY_with_good_stratumID - colSums (Sum_Y_forXY_per_stratum * Sum_Y_forXY_per_stratum * One_over_cnt_XY_per_stratum);
+ cov_sumX_sumY_stratified = sum_XY_with_good_stratumID - colSums (Sum_X_forXY_per_stratum * Sum_Y_forXY_per_stratum * One_over_cnt_XY_per_stratum);
+
+ slope_XY_stratified = cov_sumX_sumY_stratified / var_sumX_forXY_stratified;
+ sqrt_failsafe_input_8 = var_sumX_forXY_stratified * var_sumY_forXY_stratified;
+ sqrt_failsafe_output_8 = sqrt_failsafe(sqrt_failsafe_input_8);
+ corr_XY_stratified = cov_sumX_sumY_stratified / sqrt_failsafe_output_8;
+ r_sqr_X_vs_Y_stratified = (cov_sumX_sumY_stratified ^ 2) / (var_sumX_forXY_stratified * var_sumY_forXY_stratified);
+ temp_X_vs_Y_stratified = (1 - r_sqr_X_vs_Y_stratified) / (cnt_XY_with_good_stratumID - num_XY_nonempty_strata - 1);
+ adj_r_sqr_X_vs_Y_stratified = 1 - temp_X_vs_Y_stratified * (cnt_XY_with_good_stratumID - num_XY_nonempty_strata);
+ sqrt_failsafe_input_9 = temp_X_vs_Y_stratified * var_sumY_forXY_stratified;
+
+ stdev_errY_vs_X_stratified = sqrt_failsafe(sqrt_failsafe_input_9);
+ sqrt_failsafe_input_10 = sqrt_failsafe_input_9 / var_sumX_forXY_stratified;
+ stdev_slope_XY_stratified = sqrt_failsafe(sqrt_failsafe_input_10);
+ fStat_Y_vs_X_stratified = (cnt_XY_with_good_stratumID - num_XY_nonempty_strata - 1) * r_sqr_X_vs_Y_stratified / (1 - r_sqr_X_vs_Y_stratified);
+ p_val_Y_vs_X_stratified = fStat_tailprob (fStat_Y_vs_X_stratified, ones_XY, cnt_XY_with_good_stratumID - num_XY_nonempty_strata - 1);
+
+ if (debug)
+ print ("Preparing the output matrix...");
+ OutMtx = matrix (0.0, rows = 40, cols = num_attrs_XY);
+ OutMtx [ 1, ] = Xcols %*% Proj_X_to_XY; # 1st covariate column number
+ OutMtx [ 2, ] = cnt_X_global %*% Proj_X_to_XY; # 1st covariate global presence count
+ OutMtx [ 3, ] = avg_X_global %*% Proj_X_to_XY; # 1st covariate global mean
+ OutMtx [ 4, ] = stdev_X_global %*% Proj_X_to_XY; # 1st covariate global standard deviation
+ OutMtx [ 5, ] = stdev_X_stratified %*% Proj_X_to_XY; # 1st covariate stratified standard deviation
+ OutMtx [ 6, ] = r_sqr_X_vs_strata %*% Proj_X_to_XY; # R-squared, 1st covariate vs. strata
+ OutMtx [ 7, ] = adj_r_sqr_X_vs_strata %*% Proj_X_to_XY; # adjusted R-squared, 1st covariate vs. strata
+ OutMtx [ 8, ] = p_val_X_vs_strata %*% Proj_X_to_XY; # P-value, 1st covariate vs. strata
+
+ OutMtx [11, ] = Ycols %*% Proj_Y_to_XY; # 2nd covariate column number
+ OutMtx [12, ] = cnt_Y_global %*% Proj_Y_to_XY; # 2nd covariate global presence count
+ OutMtx [13, ] = avg_Y_global %*% Proj_Y_to_XY; # 2nd covariate global mean
+ OutMtx [14, ] = stdev_Y_global %*% Proj_Y_to_XY; # 2nd covariate global standard deviation
+ OutMtx [15, ] = stdev_Y_stratified %*% Proj_Y_to_XY; # 2nd covariate stratified standard deviation
+ OutMtx [16, ] = r_sqr_Y_vs_strata %*% Proj_Y_to_XY; # R-squared, 2nd covariate vs. strata
+ OutMtx [17, ] = adj_r_sqr_Y_vs_strata %*% Proj_Y_to_XY; # adjusted R-squared, 2nd covariate vs. strata
+ OutMtx [18, ] = p_val_Y_vs_strata %*% Proj_Y_to_XY; # P-value, 2nd covariate vs. strata
+
+ OutMtx [21, ] = cnt_XY_global; # Global 1st & 2nd covariate presence count
+ OutMtx [22, ] = slope_XY_global; # Global regression slope (2nd vs. 1st covariate)
+ OutMtx [23, ] = stdev_slope_XY_global; # Global regression slope standard deviation
+ OutMtx [24, ] = corr_XY_global; # Global correlation = +/- sqrt(R-squared)
+ OutMtx [25, ] = stdev_errY_vs_X_global; # Global residual standard deviation
+ OutMtx [26, ] = r_sqr_X_vs_Y_global; # Global R-squared
+ OutMtx [27, ] = adj_r_sqr_X_vs_Y_global; # Global adjusted R-squared
+ OutMtx [28, ] = p_val_Y_vs_X_global; # Global P-value for hypothesis "slope = 0"
+
+ OutMtx [31, ] = cnt_XY_with_good_stratumID; # Stratified 1st & 2nd covariate presence count
+ OutMtx [32, ] = slope_XY_stratified; # Stratified regression slope (2nd vs. 1st covariate)
+ OutMtx [33, ] = stdev_slope_XY_stratified; # Stratified regression slope standard deviation
+ OutMtx [34, ] = corr_XY_stratified; # Stratified correlation = +/- sqrt(R-squared)
+ OutMtx [35, ] = stdev_errY_vs_X_stratified; # Stratified residual standard deviation
+ OutMtx [36, ] = r_sqr_X_vs_Y_stratified; # Stratified R-squared
+ OutMtx [37, ] = adj_r_sqr_X_vs_Y_stratified; # Stratified adjusted R-squared
+ OutMtx [38, ] = p_val_Y_vs_X_stratified; # Stratified P-value for hypothesis "slope = 0"
+ OutMtx [39, ] = colSums (Cnt_XY_per_stratum >= 2); # Number of strata with at least two counted points
+
+
+ OutMtx = t(OutMtx);
+}
+
+fStat_tailprob = function (Matrix[double] fStat, Matrix[double] df_1, Matrix[double] df_2) return (Matrix[double] tailprob)
+{
+ #TODO support matrix inputs in density functions
+ #tailprob = ifelse(df_1 >= 1 & df_2 >= 1 & fStat >= 0,
+ # pf(target = fStat, df1 = df_1, df2 = df_2, lower.tail=FALSE), NaN);
+
+ tailprob = matrix(NaN, nrow(fStat), ncol(fStat));
+ for (i in 1:nrow(fStat)) {
+ for (j in 1:ncol(fStat)) {
+ q = as.scalar (fStat [i, j]);
+ d1 = as.scalar (df_1 [i, j]);
+ d2 = as.scalar (df_2 [i, j]);
+ if( d1 >= 1 & d2 >= 1 & q >= 0 )
+ tailprob[i,j] = pf(target = q, df1 = d1, df2 = d2, lower.tail=FALSE);
+ }
+ }
+}
+
+sqrt_failsafe = function (Matrix[double] input_A) return (Matrix[double] output_A)
+{
+ mask_A = (input_A >= 0);
+ prep_A = input_A * mask_A;
+ mask_A = mask_A * (prep_A == prep_A);
+ prep_A = replace (target = prep_A, pattern = NaN, replacement = 0);
+ output_A = sqrt (prep_A) / mask_A;
+}
diff --git a/src/main/java/org/apache/sysds/common/Builtins.java b/src/main/java/org/apache/sysds/common/Builtins.java
index e252b20..a063b47 100644
--- a/src/main/java/org/apache/sysds/common/Builtins.java
+++ b/src/main/java/org/apache/sysds/common/Builtins.java
@@ -248,6 +248,7 @@ public enum Builtins {
SPLIT_BALANCED("splitBalanced", true),
STABLE_MARRIAGE("stableMarriage", true),
STATSNA("statsNA", true),
+ STRATSTATS("stratstats", true),
STEPLM("steplm",true, ReturnType.MULTI_RETURN),
SQRT("sqrt", false),
SUM("sum", false),
diff --git a/src/test/java/org/apache/sysds/test/functions/builtin/BuiltinStratstatsTest.java b/src/test/java/org/apache/sysds/test/functions/builtin/BuiltinStratstatsTest.java
new file mode 100644
index 0000000..bb94cce
--- /dev/null
+++ b/src/test/java/org/apache/sysds/test/functions/builtin/BuiltinStratstatsTest.java
@@ -0,0 +1,95 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.sysds.test.functions.builtin;
+
+import org.apache.sysds.common.Types;
+import org.apache.sysds.common.Types.ExecType;
+import org.apache.sysds.runtime.matrix.data.MatrixValue.CellIndex;
+import org.apache.sysds.runtime.meta.MatrixCharacteristics;
+import org.apache.sysds.test.AutomatedTestBase;
+import org.apache.sysds.test.TestConfiguration;
+import org.apache.sysds.test.TestUtils;
+import org.junit.Test;
+import org.junit.runner.RunWith;
+import org.junit.runners.Parameterized;
+import org.junit.runners.Parameterized.Parameters;
+
+import java.util.Arrays;
+import java.util.Collection;
+import java.util.HashMap;
+
+@RunWith(value = Parameterized.class)
+public class BuiltinStratstatsTest extends AutomatedTestBase {
+ private final static String TEST_NAME = "stratstats";
+ private final static String TEST_DIR = "functions/builtin/";
+ private static final String TEST_CLASS_DIR = TEST_DIR + BuiltinStratstatsTest.class.getSimpleName() + "/";
+
+ protected String X;
+ protected int rows, cols;
+
+ public BuiltinStratstatsTest(String X, int cols, int rows){
+ this.X = X;
+ this.cols = cols;
+ this.rows = rows;
+ }
+
+ @Parameters
+ public static Collection<Object[]> data() {
+ return Arrays.asList(new Object[][] {
+ {"randfile", 10, 6},
+ {"randfile", 4, 18}});
+ }
+
+ @Override
+ public void setUp() {
+ addTestConfiguration(TEST_NAME, new TestConfiguration(TEST_CLASS_DIR, TEST_NAME, new String[]{"B"}));
+ }
+
+ @Test
+ public void testStratstats(){
+ Types.ExecMode platformOld = setExecMode(ExecType.CP);
+ try {
+ loadTestConfiguration(getTestConfiguration(TEST_NAME));
+ String HOME = SCRIPT_DIR + TEST_DIR;
+ fullDMLScriptName = HOME + TEST_NAME + ".dml";
+ fullRScriptName = HOME + TEST_NAME + ".R";
+
+ programArgs = new String[]{
+ "-nvargs", "X=" + input("random.mtx"), "O=" + output("result"), "Y=" + input("doesnotexist"),};
+ rCmd = getRCmd(input("random.mtx"), expected("result"));
+
+ double[][] values = getRandomMatrix(rows, cols, 10,40, 1, 54321);
+ MatrixCharacteristics mc = new MatrixCharacteristics(rows,cols,-1,-1);
+ writeInputMatrixWithMTD("random", values, true, mc);
+
+ runTest(true, false, null, -1);
+ runRScript(true);
+
+ HashMap<CellIndex, Double> stats_SYSTEMDS = readDMLMatrixFromOutputDir("result");
+ HashMap<CellIndex, Double> stats_real = readRMatrixFromExpectedDir("result");
+
+ double tol = Math.pow(10, -3);
+ TestUtils.compareMatrices(stats_real, stats_SYSTEMDS, tol, "stats_real", "stats_SYSTEMDS", true);
+ }
+ finally {
+ rtplatform = platformOld;
+ }
+ }
+}
diff --git a/src/test/java/org/apache/sysds/test/functions/codegenalg/partone/AlgorithmAutoEncoder.java b/src/test/java/org/apache/sysds/test/functions/codegenalg/partone/AlgorithmAutoEncoder.java
index e72708f..b790a4c 100644
--- a/src/test/java/org/apache/sysds/test/functions/codegenalg/partone/AlgorithmAutoEncoder.java
+++ b/src/test/java/org/apache/sysds/test/functions/codegenalg/partone/AlgorithmAutoEncoder.java
@@ -44,7 +44,7 @@ public class AlgorithmAutoEncoder extends AutomatedTestBase
private final static double sparsity1 = 0.7; //dense
private final static double sparsity2 = 0.1; //sparse
- private final static double eps = 3e-4;
+ private final static double eps = 4e-4;
private final static int H1 = 500;
private final static int H2 = 2;
diff --git a/src/test/scripts/functions/builtin/stratstats.R b/src/test/scripts/functions/builtin/stratstats.R
new file mode 100644
index 0000000..8476c82
--- /dev/null
+++ b/src/test/scripts/functions/builtin/stratstats.R
@@ -0,0 +1,281 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+library("Matrix")
+args <- commandArgs(TRUE)
+
+fStat_tailprob = function(fStat, df_1, df_2){
+ rows = nrow(fStat)
+ if(is.null(rows)){
+ rows = 1
+ fStat = t(matrix(fStat))
+ }
+
+ cols = ncol(fStat)
+ if(is.null(cols))
+ cols = length(fStat)
+
+ if(is.null(nrow(df_1)))
+ df_1 = t(matrix(df_1))
+
+ if(is.null(nrow(df_2)))
+ df_2 = t(matrix(df_2))
+
+ tailprob = fStat
+ for (i in 1:rows) {
+ for (j in 1:cols) {
+ q = fStat[i, j]
+ d1 = df_1[i, j]
+ d2 = df_2[i, j]
+ if (d1 >= 1 & d2 >= 1 & q >= 0)
+ tailprob[i, j] = pf(q, df1 = d1, df2 = d2, lower.tail=FALSE)
+ else
+ tailprob[i, j] = 0/0
+ }
+ }
+ return(tailprob)
+}
+
+sqrt_failsafe = function(input_A){
+ mask_A = (input_A >= 0)
+ prep_A = input_A * mask_A
+ mask_A = mask_A * (prep_A == prep_A)
+ #prep_A = replace (target = prep_A, pattern = NaN, replacement = 0)
+ output_A = sqrt (prep_A) / mask_A
+ return(output_A)
+}
+
+
+
+
+stratum_column_id = 1
+XwithNaNs = readMM(args[1])
+YwithNaNs = XwithNaNs
+SwithNaNs = XwithNaNs[, stratum_column_id]
+Xcols = c(1:ncol(XwithNaNs))
+Ycols = c(1:ncol(YwithNaNs))
+
+num_records = nrow(XwithNaNs)
+num_attrs = ncol(XwithNaNs)
+num_attrs_X = length(Xcols)
+num_attrs_Y = length(Ycols)
+num_attrs_XY = num_attrs_X * num_attrs_Y
+
+one_to_num_attrs_X = c(1:num_attrs_X)
+one_to_num_attrs_Y = c(1:num_attrs_Y)
+ProjX = matrix(0, nrow = num_attrs, ncol = num_attrs_X)
+ProjY = matrix(0, nrow = num_attrs, ncol = num_attrs_Y)
+
+ProjX_ctable = table(t(Xcols), one_to_num_attrs_X)
+ProjX[1:nrow(ProjX_ctable), ] = ProjX_ctable
+ProjY_ctable = table(t(Ycols), one_to_num_attrs_Y)
+ProjY[1:nrow(ProjY_ctable), ] = ProjY_ctable
+
+X = XwithNaNs %*% ProjX
+Y = XwithNaNs %*% ProjY
+S = round(SwithNaNs) * (SwithNaNs > 0)
+
+Proj_good_stratumID = diag(S > 0)
+Proj_good_stratumID = Proj_good_stratumID[rowSums(Proj_good_stratumID[])>0,]
+vector_of_good_stratumIDs = Proj_good_stratumID %*% S
+vector_of_good_stratumIDs = vector_of_good_stratumIDs + (1 - min(vector_of_good_stratumIDs))
+num_records_with_good_stratumID = nrow(Proj_good_stratumID)
+if (is.null(num_records_with_good_stratumID))
+ num_records_with_good_stratumID = 1
+one_to_num_records_with_good_stratumID = seq(1,num_records_with_good_stratumID,by=1)
+
+num_strata_with_empty = max(vector_of_good_stratumIDs)
+StrataSummator_with_empty = table(vector_of_good_stratumIDs, one_to_num_records_with_good_stratumID)
+StrataSummator = StrataSummator_with_empty[rowSums(StrataSummator_with_empty[])>0,]
+StrataSummator = StrataSummator %*% Proj_good_stratumID
+num_strata = nrow(StrataSummator)
+num_empty_strata = num_strata_with_empty - num_strata
+
+XNaNmask = (XwithNaNs == XwithNaNs)
+YNaNmask = (YwithNaNs == YwithNaNs)
+X_mask = XNaNmask %*% ProjX
+Y_mask = YNaNmask %*% ProjY
+
+cnt_X_global = colSums(X_mask)
+cnt_Y_global = colSums(Y_mask)
+avg_X_global = colSums(X) / cnt_X_global
+avg_Y_global = colSums(Y) / cnt_Y_global
+var_sumX_global = colSums (X * X) - cnt_X_global * (avg_X_global * avg_X_global)
+var_sumY_global = colSums (Y * Y) - cnt_Y_global * (avg_Y_global * avg_Y_global)
+sqrt_failsafe_input_1 = var_sumX_global / (cnt_X_global - 1)
+stdev_X_global = sqrt_failsafe (sqrt_failsafe_input_1)
+sqrt_failsafe_input_2 = var_sumY_global / (cnt_Y_global - 1)
+stdev_Y_global = sqrt_failsafe (sqrt_failsafe_input_2)
+
+Cnt_X_per_stratum = StrataSummator %*% X_mask
+Cnt_Y_per_stratum = StrataSummator %*% Y_mask
+Is_none_X_per_stratum = (Cnt_X_per_stratum == 0)
+Is_none_Y_per_stratum = (Cnt_Y_per_stratum == 0)
+One_over_cnt_X_per_stratum = (1 - Is_none_X_per_stratum) / (Cnt_X_per_stratum + Is_none_X_per_stratum)
+One_over_cnt_Y_per_stratum = (1 - Is_none_Y_per_stratum) / (Cnt_Y_per_stratum + Is_none_Y_per_stratum)
+num_X_nonempty_strata = num_strata - colSums (Is_none_X_per_stratum)
+num_Y_nonempty_strata = num_strata - colSums (Is_none_Y_per_stratum)
+
+Sum_X_per_stratum = StrataSummator %*% X
+Sum_Y_per_stratum = StrataSummator %*% Y
+
+cnt_X_with_good_stratumID = colSums(Cnt_X_per_stratum)
+cnt_Y_with_good_stratumID = colSums(Cnt_Y_per_stratum)
+sum_X_with_good_stratumID = colSums(Sum_X_per_stratum)
+sum_Y_with_good_stratumID = colSums(Sum_Y_per_stratum)
+var_sumX_with_good_stratumID = colSums(StrataSummator %*% (X * X)) - (sum_X_with_good_stratumID * sum_X_with_good_stratumID) / cnt_X_with_good_stratumID
+var_sumY_with_good_stratumID = colSums(StrataSummator %*% (Y * Y)) - (sum_Y_with_good_stratumID * sum_Y_with_good_stratumID) / cnt_Y_with_good_stratumID
+
+var_sumX_stratified = colSums (StrataSummator %*% (X * X)) - colSums (One_over_cnt_X_per_stratum * Sum_X_per_stratum * Sum_X_per_stratum)
+var_sumY_stratified = colSums (StrataSummator %*% (Y * Y)) - colSums (One_over_cnt_Y_per_stratum * Sum_Y_per_stratum * Sum_Y_per_stratum)
+sqrt_failsafe_input_3 = var_sumX_stratified / (cnt_X_with_good_stratumID - num_X_nonempty_strata)
+stdev_X_stratified = sqrt_failsafe (sqrt_failsafe_input_3)
+sqrt_failsafe_input_4 = var_sumY_stratified / (cnt_Y_with_good_stratumID - num_Y_nonempty_strata)
+stdev_Y_stratified = sqrt_failsafe (sqrt_failsafe_input_4)
+r_sqr_X_vs_strata = 1 - var_sumX_stratified / var_sumX_with_good_stratumID
+r_sqr_Y_vs_strata = 1 - var_sumY_stratified / var_sumY_with_good_stratumID
+adj_r_sqr_X_vs_strata = 1 - (var_sumX_stratified / (cnt_X_with_good_stratumID - num_X_nonempty_strata)) / (var_sumX_with_good_stratumID / (cnt_X_with_good_stratumID - 1))
+adj_r_sqr_Y_vs_strata = 1 - (var_sumY_stratified / (cnt_Y_with_good_stratumID - num_Y_nonempty_strata)) / (var_sumY_with_good_stratumID / (cnt_Y_with_good_stratumID - 1))
+fStat_X_vs_strata = ((var_sumX_with_good_stratumID - var_sumX_stratified) / (num_X_nonempty_strata - 1)) / (var_sumX_stratified / (cnt_X_with_good_stratumID - num_X_nonempty_strata))
+fStat_Y_vs_strata = ((var_sumY_with_good_stratumID - var_sumY_stratified) / (num_Y_nonempty_strata - 1)) / (var_sumY_stratified / (cnt_Y_with_good_stratumID - num_Y_nonempty_strata))
+p_val_X_vs_strata = fStat_tailprob (fStat_X_vs_strata, num_X_nonempty_strata - 1, cnt_X_with_good_stratumID - num_X_nonempty_strata)
+p_val_Y_vs_strata = fStat_tailprob (fStat_Y_vs_strata, num_Y_nonempty_strata - 1, cnt_Y_with_good_stratumID - num_Y_nonempty_strata)
+
+cnt_XY_rectangle = t(X_mask) %*% Y_mask
+sum_X_forXY_rectangle = t(X) %*% Y_mask
+sum_XX_forXY_rectangle = t(X * X) %*% Y_mask
+sum_Y_forXY_rectangle = t(X_mask) %*% Y
+sum_YY_forXY_rectangle = t(X_mask) %*% (Y * Y)
+sum_XY_rectangle = t(X) %*% Y
+cnt_XY_global = matrix(t(cnt_XY_rectangle), nrow = 1, ncol = num_attrs_XY)
+sum_X_forXY_global = matrix(t(sum_X_forXY_rectangle), nrow = 1, byrow = TRUE)
+sum_XX_forXY_global = matrix(t(sum_XX_forXY_rectangle), nrow = 1, ncol = num_attrs_XY, byrow = TRUE)
+sum_Y_forXY_global = matrix(t(sum_Y_forXY_rectangle), nrow = 1, ncol = num_attrs_XY, byrow = TRUE)
+sum_YY_forXY_global = matrix(t(sum_YY_forXY_rectangle), nrow = 1, ncol = num_attrs_XY, byrow = TRUE)
+sum_XY_global = matrix(sum_XY_rectangle, nrow = 1, ncol = num_attrs_XY, byrow = TRUE)
+ones_XY = matrix(1.0, nrow = 1, ncol = num_attrs_XY)
+
+cov_sumX_sumY_global = sum_XY_global - sum_X_forXY_global * sum_Y_forXY_global / cnt_XY_global
+var_sumX_forXY_global = sum_XX_forXY_global - sum_X_forXY_global * sum_X_forXY_global / cnt_XY_global
+var_sumY_forXY_global = sum_YY_forXY_global - sum_Y_forXY_global * sum_Y_forXY_global / cnt_XY_global
+
+slope_XY_global = cov_sumX_sumY_global / var_sumX_forXY_global
+sqrt_failsafe_input_5 = var_sumX_forXY_global * var_sumY_forXY_global
+sqrt_failsafe_output_5 = sqrt_failsafe(sqrt_failsafe_input_5)
+corr_XY_global = cov_sumX_sumY_global / sqrt_failsafe_output_5
+r_sqr_X_vs_Y_global = cov_sumX_sumY_global * cov_sumX_sumY_global / (var_sumX_forXY_global * var_sumY_forXY_global)
+adj_r_sqr_X_vs_Y_global = 1 - (1 - r_sqr_X_vs_Y_global) * (cnt_XY_global - 1) / (cnt_XY_global - 2)
+sqrt_failsafe_input_6 = (1 - r_sqr_X_vs_Y_global) * var_sumY_forXY_global / var_sumX_forXY_global / (cnt_XY_global - 2)
+stdev_slope_XY_global = sqrt_failsafe(sqrt_failsafe_input_6)
+sqrt_failsafe_input_7 = (1 - r_sqr_X_vs_Y_global) * var_sumY_forXY_global / (cnt_XY_global - 2)
+stdev_errY_vs_X_global = sqrt_failsafe(sqrt_failsafe_input_7)
+fStat_Y_vs_X_global = (cnt_XY_global - 2) * r_sqr_X_vs_Y_global / (1 - r_sqr_X_vs_Y_global)
+p_val_Y_vs_X_global = fStat_tailprob(fStat_Y_vs_X_global, ones_XY, cnt_XY_global - 2)
+
+Proj_X_to_XY = matrix(0.0, nrow = num_attrs_X, ncol = num_attrs_XY)
+Proj_Y_to_XY = matrix(0.0, nrow = num_attrs_Y, ncol = num_attrs_XY)
+ones_Y_col = matrix(1.0, nrow = num_attrs_Y, ncol = 1)
+for (i in 1:num_attrs_X) {
+ start_cid = (i - 1) * num_attrs_Y + 1
+ end_cid = i * num_attrs_Y
+ Proj_X_to_XY [i, start_cid:end_cid] = t(ones_Y_col)
+ Proj_Y_to_XY [ , start_cid:end_cid] = diag(nrow=nrow(ones_Y_col), ncol=nrow(ones_Y_col))
+}
+
+Cnt_XY_per_stratum = StrataSummator %*% (( X_mask %*% Proj_X_to_XY) * ( Y_mask %*% Proj_Y_to_XY))
+Sum_X_forXY_per_stratum = StrataSummator %*% (( X %*% Proj_X_to_XY) * ( Y_mask %*% Proj_Y_to_XY))
+Sum_XX_forXY_per_stratum = StrataSummator %*% (((X * X) %*% Proj_X_to_XY) * ( Y_mask %*% Proj_Y_to_XY))
+Sum_Y_forXY_per_stratum = StrataSummator %*% (( X_mask %*% Proj_X_to_XY) * ( Y %*% Proj_Y_to_XY))
+Sum_YY_forXY_per_stratum = StrataSummator %*% (( X_mask %*% Proj_X_to_XY) * ((Y * Y) %*% Proj_Y_to_XY))
+Sum_XY_per_stratum = StrataSummator %*% (( X %*% Proj_X_to_XY) * ( Y %*% Proj_Y_to_XY))
+
+Is_none_XY_per_stratum = (Cnt_XY_per_stratum == 0)
+One_over_cnt_XY_per_stratum = (1 - Is_none_XY_per_stratum) / (Cnt_XY_per_stratum + Is_none_XY_per_stratum)
+num_XY_nonempty_strata = num_strata - colSums (Is_none_XY_per_stratum)
+
+cnt_XY_with_good_stratumID = colSums(Cnt_XY_per_stratum)
+sum_XX_forXY_with_good_stratumID = colSums(Sum_XX_forXY_per_stratum)
+sum_YY_forXY_with_good_stratumID = colSums(Sum_YY_forXY_per_stratum)
+sum_XY_with_good_stratumID = colSums(Sum_XY_per_stratum)
+
+var_sumX_forXY_stratified = sum_XX_forXY_with_good_stratumID - colSums(Sum_X_forXY_per_stratum * Sum_X_forXY_per_stratum * One_over_cnt_XY_per_stratum)
+var_sumY_forXY_stratified = sum_YY_forXY_with_good_stratumID - colSums(Sum_Y_forXY_per_stratum * Sum_Y_forXY_per_stratum * One_over_cnt_XY_per_stratum)
+cov_sumX_sumY_stratified = sum_XY_with_good_stratumID - colSums(Sum_X_forXY_per_stratum * Sum_Y_forXY_per_stratum * One_over_cnt_XY_per_stratum)
+
+slope_XY_stratified = cov_sumX_sumY_stratified / var_sumX_forXY_stratified
+sqrt_failsafe_input_8 = var_sumX_forXY_stratified * var_sumY_forXY_stratified
+sqrt_failsafe_output_8 = sqrt_failsafe(sqrt_failsafe_input_8)
+corr_XY_stratified = cov_sumX_sumY_stratified / sqrt_failsafe_output_8
+r_sqr_X_vs_Y_stratified = (cov_sumX_sumY_stratified ^ 2) / (var_sumX_forXY_stratified * var_sumY_forXY_stratified)
+temp_X_vs_Y_stratified = (1 - r_sqr_X_vs_Y_stratified) / (cnt_XY_with_good_stratumID - num_XY_nonempty_strata - 1)
+adj_r_sqr_X_vs_Y_stratified = 1 - temp_X_vs_Y_stratified * (cnt_XY_with_good_stratumID - num_XY_nonempty_strata)
+sqrt_failsafe_input_9 = temp_X_vs_Y_stratified * var_sumY_forXY_stratified
+
+if(all(var_sumY_forXY_stratified == 0)){
+ sqrt_failsafe_input_9 = var_sumY_forXY_stratified
+}
+
+stdev_errY_vs_X_stratified = sqrt_failsafe (sqrt_failsafe_input_9)
+sqrt_failsafe_input_10 = sqrt_failsafe_input_9 / var_sumX_forXY_stratified
+stdev_slope_XY_stratified = sqrt_failsafe(sqrt_failsafe_input_10)
+#stdev_slope_XY_stratified[is.na(stdev_slope_XY_stratified)] = NaN
+fStat_Y_vs_X_stratified = (cnt_XY_with_good_stratumID - num_XY_nonempty_strata - 1) * r_sqr_X_vs_Y_stratified / (1 - r_sqr_X_vs_Y_stratified)
+p_val_Y_vs_X_stratified = fStat_tailprob (fStat_Y_vs_X_stratified, ones_XY, cnt_XY_with_good_stratumID - num_XY_nonempty_strata - 1)
+
+OutMtx = matrix(0.0, nrow = 40, ncol = num_attrs_XY)
+OutMtx [ 1, ] = Xcols %*% Proj_X_to_XY
+OutMtx [ 2, ] = cnt_X_global %*% Proj_X_to_XY
+OutMtx [ 3, ] = avg_X_global %*% Proj_X_to_XY
+OutMtx [ 4, ] = stdev_X_global %*% Proj_X_to_XY
+OutMtx [ 5, ] = stdev_X_stratified %*% Proj_X_to_XY
+OutMtx [ 6, ] = r_sqr_X_vs_strata %*% Proj_X_to_XY
+OutMtx [ 7, ] = adj_r_sqr_X_vs_strata %*% Proj_X_to_XY
+OutMtx [ 8, ] = p_val_X_vs_strata %*% Proj_X_to_XY
+
+OutMtx [11, ] = Ycols %*% Proj_Y_to_XY
+OutMtx [12, ] = cnt_Y_global %*% Proj_Y_to_XY
+OutMtx [13, ] = avg_Y_global %*% Proj_Y_to_XY
+OutMtx [14, ] = stdev_Y_global %*% Proj_Y_to_XY
+OutMtx [15, ] = stdev_Y_stratified %*% Proj_Y_to_XY
+OutMtx [16, ] = r_sqr_Y_vs_strata %*% Proj_Y_to_XY
+OutMtx [17, ] = adj_r_sqr_Y_vs_strata %*% Proj_Y_to_XY
+OutMtx [18, ] = p_val_Y_vs_strata %*% Proj_Y_to_XY
+
+OutMtx [21, ] = cnt_XY_global
+OutMtx [22, ] = slope_XY_global
+OutMtx [23, ] = stdev_slope_XY_global
+OutMtx [24, ] = corr_XY_global
+OutMtx [25, ] = stdev_errY_vs_X_global
+OutMtx [26, ] = r_sqr_X_vs_Y_global
+OutMtx [27, ] = adj_r_sqr_X_vs_Y_global
+OutMtx [28, ] = p_val_Y_vs_X_global
+
+OutMtx [31, ] = cnt_XY_with_good_stratumID
+OutMtx [32, ] = slope_XY_stratified
+OutMtx [33, ] = stdev_slope_XY_stratified
+OutMtx [34, ] = corr_XY_stratified
+OutMtx [35, ] = stdev_errY_vs_X_stratified
+OutMtx [36, ] = r_sqr_X_vs_Y_stratified
+OutMtx [37, ] = adj_r_sqr_X_vs_Y_stratified
+OutMtx [38, ] = p_val_Y_vs_X_stratified
+OutMtx [39, ] = colSums (Cnt_XY_per_stratum >= 2)
+
+OutMtx = t(OutMtx)
+writeMM(as(OutMtx, "CsparseMatrix"), file=args[2])
diff --git a/src/test/scripts/functions/builtin/stratstats.dml b/src/test/scripts/functions/builtin/stratstats.dml
new file mode 100644
index 0000000..234fb40
--- /dev/null
+++ b/src/test/scripts/functions/builtin/stratstats.dml
@@ -0,0 +1,26 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+inX = read($X)
+inY = read($X)
+inS = read($X)
+outMat = stratstats(X=inX, Y=inY, S=inS)
+write(outMat, $O, sparse=TRUE)