You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@systemds.apache.org by mb...@apache.org on 2021/07/29 21:21:34 UTC

[systemds] branch master updated: [SYSTEMDS-2845] New builtin function stratified statistics

This is an automated email from the ASF dual-hosted git repository.

mboehm7 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/systemds.git


The following commit(s) were added to refs/heads/master by this push:
     new b550f69  [SYSTEMDS-2845] New builtin function stratified statistics
b550f69 is described below

commit b550f69ac9c16f5207e395f0d24bc34cd43da461
Author: juulez <Ju...@student.tugraz.at>
AuthorDate: Thu Jul 29 22:37:45 2021 +0200

    [SYSTEMDS-2845] New builtin function stratified statistics
    
    AMLS project SS2021.
    Closes #1354.
---
 scripts/builtin/stratstats.dml                     | 379 +++++++++++++++++++++
 .../java/org/apache/sysds/common/Builtins.java     |   1 +
 .../functions/builtin/BuiltinStratstatsTest.java   |  95 ++++++
 .../codegenalg/partone/AlgorithmAutoEncoder.java   |   2 +-
 src/test/scripts/functions/builtin/stratstats.R    | 281 +++++++++++++++
 src/test/scripts/functions/builtin/stratstats.dml  |  26 ++
 6 files changed, 783 insertions(+), 1 deletion(-)

diff --git a/scripts/builtin/stratstats.dml b/scripts/builtin/stratstats.dml
new file mode 100644
index 0000000..13500cb
--- /dev/null
+++ b/scripts/builtin/stratstats.dml
@@ -0,0 +1,379 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+
+# INPUT PARAMETERS:
+# ---------------------------------------------------------------------------------------------
+# NAME  TYPE   DEFAULT  MEANING
+# ---------------------------------------------------------------------------------------------
+# X     Double  ---     Matrix X that has all 1-st covariates
+# Y     Double  " "     Matrix Y that has all 2-nd covariates
+#                       the default value " " means "use X in place of Y"
+# S     Double  " "     Matrix S that has the stratum column
+#                       the default value " " means "use X in place of S"
+# Xcid  Double  " "     1-st covariate X-column indices
+#                       the default value " " means "use columns 1 : ncol(X)"
+# Ycid  Double  " "     2-nd covariate Y-column indices
+#                       the default value " " means "use columns 1 : ncol(Y)"
+# Scid  Int      1      Column index of the stratum column in S
+# ---------------------------------------------------------------------------------------------
+ 
+
+# OUTPUT MATRIX:
+# ---------------------------------------------------------------------------------------------
+# OutMtx          Double   ---     Output matrix, one row per each distinct pair
+#                                  (1st covariante, 2nd covariante)
+#                                  40 columns containing the following information:
+#                 Col 01: 1st covariate X-column number
+#                 Col 02: 1st covariate global presence count
+#                 Col 03: 1st covariate global mean
+#                 Col 04: 1st covariate global standard deviation
+#                 Col 05: 1st covariate stratified standard deviation
+#                 Col 06: R-squared, 1st covariate vs. strata
+#                 Col 07: adjusted R-squared, 1st covariate vs. strata
+#                 Col 08: P-value, 1st covariate vs. strata
+#                 Col 09-10: Reserved
+#                 Col 11: 2nd covariate Y-column number
+#                 Col 12: 2nd covariate global presence count
+#                 Col 13: 2nd covariate global mean
+#                 Col 14: 2nd covariate global standard deviation
+#                 Col 15: 2nd covariate stratified standard deviation
+#                 Col 16: R-squared, 2nd covariate vs. strata
+#                 Col 17: adjusted R-squared, 2nd covariate vs. strata
+#                 Col 18: P-value, 2nd covariate vs. strata
+#                 Col 19-20: Reserved
+#                 Col 21: Global 1st & 2nd covariate presence count
+#                 Col 22: Global regression slope (2nd vs. 1st covariate)
+#                 Col 23: Global regression slope standard deviation
+#                 Col 24: Global correlation = +/- sqrt(R-squared)
+#                 Col 25: Global residual standard deviation
+#                 Col 26: Global R-squared
+#                 Col 27: Global adjusted R-squared
+#                 Col 28: Global P-value for hypothesis "slope = 0"
+#                 Col 29-30: Reserved
+#                 Col 31: Stratified 1st & 2nd covariate presence count
+#                 Col 32: Stratified regression slope (2nd vs. 1st covariate)
+#                 Col 33: Stratified regression slope standard deviation
+#                 Col 34: Stratified correlation = +/- sqrt(R-squared)
+#                 Col 35: Stratified residual standard deviation
+#                 Col 36: Stratified R-squared
+#                 Col 37: Stratified adjusted R-squared
+#                 Col 38: Stratified P-value for hypothesis "slope = 0"
+#                 Col 39: Number of strata with at least two counted points
+#                 Col 40: Reserved
+# ---------------------------------------------------------------------------------------------
+
+
+m_stratstats = function(Matrix[Double] X, Matrix[Double] Y = matrix(0.0, rows=1,cols=1),
+  Matrix[Double] S = matrix(0.0, rows=1,cols=1), Matrix[Double] Xcid = matrix(0.0, rows=1,cols=1),
+  Matrix[Double] Ycid = matrix(0.0, rows=1,cols=1), Integer Scid = 1, Boolean debug = FALSE)
+  return(Matrix[Double] OutMtx)
+{
+  stratum_column_id = Scid;
+  XwithNaNs = X;
+  if (nrow(Y) == 1 & ncol(Y) == 1) {
+    YwithNaNs = XwithNaNs;
+  } else {
+    YwithNaNs = Y;
+  }
+
+  if (nrow(S) == 1 & ncol(S) == 1) {
+    SwithNaNs = XwithNaNs[, stratum_column_id];
+  } else {
+    SwithNaNsFull = S;
+    SwithNaNs = SwithNaNsFull[, stratum_column_id];
+  }
+
+  if (nrow(Xcid) == 1 & ncol(Xcid) == 1) {
+    Xcols = t(seq(1, ncol(XwithNaNs), 1));
+  } else {
+    Xcols = Xcid;
+  }
+
+  if (nrow(Ycid) == 1 & ncol(Ycid) == 1) {
+    Ycols = t(seq(1, ncol(YwithNaNs), 1));
+  } else {
+    Ycols = Ycid;
+  }
+  tXcols = t(Xcols);
+  tYcols = t(Ycols);
+
+  num_records  = nrow (XwithNaNs);
+  num_attrs    = ncol (XwithNaNs);
+  num_attrs_X  = ncol (Xcols);
+  num_attrs_Y  = ncol (Ycols);
+  num_attrs_XY = num_attrs_X * num_attrs_Y;
+
+  if (debug)
+    print ("Preparing the covariates...");
+
+  XnoNaNs = replace (target = XwithNaNs, pattern = NaN, replacement = 0);
+  YnoNaNs = replace (target = YwithNaNs, pattern = NaN, replacement = 0);
+  XNaNmask = (XwithNaNs == XwithNaNs);
+  YNaNmask = (YwithNaNs == YwithNaNs);
+  one_to_num_attrs_X = seq (1, num_attrs_X, 1);
+  one_to_num_attrs_Y = seq (1, num_attrs_Y, 1);
+  ProjX = matrix (0, rows = num_attrs, cols = num_attrs_X);
+  ProjY = matrix (0, rows = num_attrs, cols = num_attrs_Y);
+
+  ProjX_ctable = table (tXcols, one_to_num_attrs_X);
+  ProjX [1 : nrow (ProjX_ctable), ] = ProjX_ctable;
+
+  ProjY_ctable = table (tYcols, one_to_num_attrs_Y);
+  ProjY [1 : nrow (ProjY_ctable), ] = ProjY_ctable;
+
+  X = XnoNaNs %*% ProjX;
+  Y = YnoNaNs %*% ProjY;
+  X_mask = XNaNmask %*% ProjX;
+  Y_mask = YNaNmask %*% ProjY;
+
+  if (debug)
+    print ("Preparing the strata...");
+  SnoNaNs = replace (target = SwithNaNs, pattern = NaN, replacement = 0);
+  S = round (SnoNaNs) * (SnoNaNs > 0);
+  Proj_good_stratumID = diag (S > 0);
+  Proj_good_stratumID = removeEmpty (target = Proj_good_stratumID, margin = "rows");
+  vector_of_good_stratumIDs = Proj_good_stratumID %*% S;
+  vector_of_good_stratumIDs = vector_of_good_stratumIDs + (1 - min (vector_of_good_stratumIDs));
+  num_records_with_good_stratumID = nrow(Proj_good_stratumID)
+  one_to_num_records_with_good_stratumID = seq(1, num_records_with_good_stratumID, 1);
+
+  # Create a group-by summation matrix for records over stratum IDs
+  # "with_empty" means with stratum IDs that never occur in records
+
+  num_strata_with_empty = max(vector_of_good_stratumIDs);
+  StrataSummator_with_empty = table(vector_of_good_stratumIDs, one_to_num_records_with_good_stratumID);
+  StrataSummator = removeEmpty(target = StrataSummator_with_empty, margin = "rows");
+  StrataSummator = StrataSummator %*% Proj_good_stratumID;
+  num_strata = nrow (StrataSummator);
+  num_empty_strata = num_strata_with_empty - num_strata;
+  if (debug){
+    print ("There are " + num_strata + " nonempty strata and " + num_empty_strata + " empty but positive-ID strata.");
+    print ("Computing the global single-variate statistics...");
+  }
+
+  cnt_X_global = colSums (X_mask);
+  cnt_Y_global = colSums (Y_mask);
+  avg_X_global = colSums (X) / cnt_X_global;
+  avg_Y_global = colSums (Y) / cnt_Y_global;
+  var_sumX_global = colSums (X * X) - cnt_X_global * (avg_X_global * avg_X_global);
+  var_sumY_global = colSums (Y * Y) - cnt_Y_global * (avg_Y_global * avg_Y_global);
+  sqrt_failsafe_input_1 = var_sumX_global / (cnt_X_global - 1);
+  stdev_X_global = sqrt_failsafe (sqrt_failsafe_input_1);
+  sqrt_failsafe_input_2 = var_sumY_global / (cnt_Y_global - 1);
+  stdev_Y_global = sqrt_failsafe (sqrt_failsafe_input_2);
+
+  if (debug)
+    print ("Computing the stratified single-variate statistics...");
+  # Compute per-stratum statistics, prevent div-0 for locally empty (due to NaNs in X or Y) strata
+  Cnt_X_per_stratum = StrataSummator %*% X_mask;
+  Cnt_Y_per_stratum = StrataSummator %*% Y_mask;
+  Is_none_X_per_stratum = (Cnt_X_per_stratum == 0);
+  Is_none_Y_per_stratum = (Cnt_Y_per_stratum == 0);
+  One_over_cnt_X_per_stratum = (1 - Is_none_X_per_stratum) / (Cnt_X_per_stratum + Is_none_X_per_stratum);
+  One_over_cnt_Y_per_stratum = (1 - Is_none_Y_per_stratum) / (Cnt_Y_per_stratum + Is_none_Y_per_stratum);
+  num_X_nonempty_strata = num_strata - colSums (Is_none_X_per_stratum);
+  num_Y_nonempty_strata = num_strata - colSums (Is_none_Y_per_stratum);
+
+  Sum_X_per_stratum  = StrataSummator %*% X;
+  Sum_Y_per_stratum  = StrataSummator %*% Y;
+
+  # Recompute some global statistics to exclude bad stratum-ID records
+  cnt_X_with_good_stratumID = colSums (Cnt_X_per_stratum);
+  cnt_Y_with_good_stratumID = colSums (Cnt_Y_per_stratum);
+  sum_X_with_good_stratumID = colSums (Sum_X_per_stratum);
+  sum_Y_with_good_stratumID = colSums (Sum_Y_per_stratum);
+  var_sumX_with_good_stratumID = colSums (StrataSummator %*% (X * X)) - (sum_X_with_good_stratumID * sum_X_with_good_stratumID) / cnt_X_with_good_stratumID;
+  var_sumY_with_good_stratumID = colSums (StrataSummator %*% (Y * Y)) - (sum_Y_with_good_stratumID * sum_Y_with_good_stratumID) / cnt_Y_with_good_stratumID;
+
+  # Compute the stratified statistics
+  var_sumX_stratified   = colSums (StrataSummator %*% (X * X)) - colSums (One_over_cnt_X_per_stratum * Sum_X_per_stratum * Sum_X_per_stratum);
+  var_sumY_stratified   = colSums (StrataSummator %*% (Y * Y)) - colSums (One_over_cnt_Y_per_stratum * Sum_Y_per_stratum * Sum_Y_per_stratum);
+  sqrt_failsafe_input_3 = var_sumX_stratified / (cnt_X_with_good_stratumID - num_X_nonempty_strata);
+  stdev_X_stratified    = sqrt_failsafe (sqrt_failsafe_input_3);
+  sqrt_failsafe_input_4 = var_sumY_stratified / (cnt_Y_with_good_stratumID - num_Y_nonempty_strata);
+  stdev_Y_stratified    = sqrt_failsafe (sqrt_failsafe_input_4);
+  r_sqr_X_vs_strata     = 1 - var_sumX_stratified / var_sumX_with_good_stratumID;
+  r_sqr_Y_vs_strata     = 1 - var_sumY_stratified / var_sumY_with_good_stratumID;
+  adj_r_sqr_X_vs_strata = 1 - (var_sumX_stratified / (cnt_X_with_good_stratumID - num_X_nonempty_strata)) / (var_sumX_with_good_stratumID / (cnt_X_with_good_stratumID - 1));
+  adj_r_sqr_Y_vs_strata = 1 - (var_sumY_stratified / (cnt_Y_with_good_stratumID - num_Y_nonempty_strata)) / (var_sumY_with_good_stratumID / (cnt_Y_with_good_stratumID - 1));
+  fStat_X_vs_strata     = ((var_sumX_with_good_stratumID - var_sumX_stratified) / (num_X_nonempty_strata - 1)) / (var_sumX_stratified / (cnt_X_with_good_stratumID - num_X_nonempty_strata));
+  fStat_Y_vs_strata     = ((var_sumY_with_good_stratumID - var_sumY_stratified) / (num_Y_nonempty_strata - 1)) / (var_sumY_stratified / (cnt_Y_with_good_stratumID - num_Y_nonempty_strata));
+  p_val_X_vs_strata     = fStat_tailprob (fStat_X_vs_strata, num_X_nonempty_strata - 1, cnt_X_with_good_stratumID - num_X_nonempty_strata);
+  p_val_Y_vs_strata     = fStat_tailprob (fStat_Y_vs_strata, num_Y_nonempty_strata - 1, cnt_Y_with_good_stratumID - num_Y_nonempty_strata);
+
+  if (debug)
+    print ("Computing the global bivariate statistics...");
+  # Compute the aggregate X vs. Y statistics and map them into proper positions
+  cnt_XY_rectangle       = t(X_mask) %*% Y_mask;
+  sum_X_forXY_rectangle  = t(X)      %*% Y_mask;
+  sum_XX_forXY_rectangle = t(X * X)  %*% Y_mask;
+  sum_Y_forXY_rectangle  = t(X_mask) %*% Y;
+  sum_YY_forXY_rectangle = t(X_mask) %*% (Y * Y);
+  sum_XY_rectangle       = t(X)      %*% Y;
+  cnt_XY_global       = matrix(cnt_XY_rectangle,       rows = 1, cols = num_attrs_XY, byrow = TRUE);
+  sum_X_forXY_global  = matrix(sum_X_forXY_rectangle,  rows = 1, cols = num_attrs_XY, byrow = TRUE);
+  sum_XX_forXY_global = matrix(sum_XX_forXY_rectangle, rows = 1, cols = num_attrs_XY, byrow = TRUE);
+  sum_Y_forXY_global  = matrix(sum_Y_forXY_rectangle,  rows = 1, cols = num_attrs_XY, byrow = TRUE);
+  sum_YY_forXY_global = matrix(sum_YY_forXY_rectangle, rows = 1, cols = num_attrs_XY, byrow = TRUE);
+  sum_XY_global       = matrix(sum_XY_rectangle,       rows = 1, cols = num_attrs_XY, byrow = TRUE);
+  ones_XY = matrix(1.0, rows = 1, cols = num_attrs_XY);
+
+  # Compute the global bivariate statistics for output
+  cov_sumX_sumY_global    = sum_XY_global - sum_X_forXY_global * sum_Y_forXY_global / cnt_XY_global;
+  var_sumX_forXY_global   = sum_XX_forXY_global - sum_X_forXY_global * sum_X_forXY_global / cnt_XY_global;
+  var_sumY_forXY_global   = sum_YY_forXY_global - sum_Y_forXY_global * sum_Y_forXY_global / cnt_XY_global;
+
+  slope_XY_global         = cov_sumX_sumY_global / var_sumX_forXY_global;
+  sqrt_failsafe_input_5 = var_sumX_forXY_global * var_sumY_forXY_global;
+  sqrt_failsafe_output_5 = sqrt_failsafe (sqrt_failsafe_input_5);
+  corr_XY_global          = cov_sumX_sumY_global / sqrt_failsafe_output_5;
+  r_sqr_X_vs_Y_global     = cov_sumX_sumY_global * cov_sumX_sumY_global / (var_sumX_forXY_global * var_sumY_forXY_global);
+  adj_r_sqr_X_vs_Y_global = 1 - (1 - r_sqr_X_vs_Y_global) * (cnt_XY_global - 1) / (cnt_XY_global - 2);
+  sqrt_failsafe_input_6 = (1 - r_sqr_X_vs_Y_global) * var_sumY_forXY_global / var_sumX_forXY_global / (cnt_XY_global - 2);
+  stdev_slope_XY_global   = sqrt_failsafe (sqrt_failsafe_input_6);
+  sqrt_failsafe_input_7 = (1 - r_sqr_X_vs_Y_global) * var_sumY_forXY_global / (cnt_XY_global - 2);
+  stdev_errY_vs_X_global  = sqrt_failsafe (sqrt_failsafe_input_7);
+  fStat_Y_vs_X_global     = (cnt_XY_global - 2) * r_sqr_X_vs_Y_global / (1 - r_sqr_X_vs_Y_global);
+  p_val_Y_vs_X_global     = fStat_tailprob (fStat_Y_vs_X_global, ones_XY, cnt_XY_global - 2);
+
+  if (debug)
+    print ("Computing the stratified bivariate statistics...");
+  # Create projections to "intermingle" X and Y into attribute pairs
+  Proj_X_to_XY = matrix (0.0, rows = num_attrs_X, cols = num_attrs_XY);
+  Proj_Y_to_XY = matrix (0.0, rows = num_attrs_Y, cols = num_attrs_XY);
+  ones_Y_col   = matrix (1.0, rows = num_attrs_Y, cols = 1);
+  for (i in 1:num_attrs_X) {
+    start_cid = (i - 1) * num_attrs_Y + 1;
+    end_cid = i * num_attrs_Y;
+    Proj_X_to_XY [i, start_cid:end_cid] = t(ones_Y_col);
+    Proj_Y_to_XY [ , start_cid:end_cid] = diag(ones_Y_col);
+  }
+
+
+  # Compute per-stratum statistics, prevent div-0 for locally empty (due to NaNs in X or Y) strata
+  Cnt_XY_per_stratum       = StrataSummator %*% (( X_mask %*% Proj_X_to_XY) * ( Y_mask %*% Proj_Y_to_XY));
+  Sum_X_forXY_per_stratum  = StrataSummator %*% (( X      %*% Proj_X_to_XY) * ( Y_mask %*% Proj_Y_to_XY));
+  Sum_XX_forXY_per_stratum = StrataSummator %*% (((X * X) %*% Proj_X_to_XY) * ( Y_mask %*% Proj_Y_to_XY));
+  Sum_Y_forXY_per_stratum  = StrataSummator %*% (( X_mask %*% Proj_X_to_XY) * ( Y      %*% Proj_Y_to_XY));
+  Sum_YY_forXY_per_stratum = StrataSummator %*% (( X_mask %*% Proj_X_to_XY) * ((Y * Y) %*% Proj_Y_to_XY));
+  Sum_XY_per_stratum       = StrataSummator %*% (( X      %*% Proj_X_to_XY) * ( Y      %*% Proj_Y_to_XY));
+
+  Is_none_XY_per_stratum = (Cnt_XY_per_stratum == 0);
+  One_over_cnt_XY_per_stratum = (1 - Is_none_XY_per_stratum) / (Cnt_XY_per_stratum + Is_none_XY_per_stratum);
+  num_XY_nonempty_strata = num_strata - colSums (Is_none_XY_per_stratum);
+
+  # Recompute some global aggregate X vs. Y statistics to exclude bad stratum-ID records
+  cnt_XY_with_good_stratumID = colSums (Cnt_XY_per_stratum);
+  sum_XX_forXY_with_good_stratumID = colSums (Sum_XX_forXY_per_stratum);
+  sum_YY_forXY_with_good_stratumID = colSums (Sum_YY_forXY_per_stratum);
+  sum_XY_with_good_stratumID = colSums (Sum_XY_per_stratum);
+
+  # Compute the stratified bivariate statistics
+  var_sumX_forXY_stratified = sum_XX_forXY_with_good_stratumID - colSums (Sum_X_forXY_per_stratum * Sum_X_forXY_per_stratum * One_over_cnt_XY_per_stratum);
+  var_sumY_forXY_stratified = sum_YY_forXY_with_good_stratumID - colSums (Sum_Y_forXY_per_stratum * Sum_Y_forXY_per_stratum * One_over_cnt_XY_per_stratum);
+  cov_sumX_sumY_stratified  = sum_XY_with_good_stratumID       - colSums (Sum_X_forXY_per_stratum * Sum_Y_forXY_per_stratum * One_over_cnt_XY_per_stratum);
+
+  slope_XY_stratified     = cov_sumX_sumY_stratified / var_sumX_forXY_stratified;
+  sqrt_failsafe_input_8 = var_sumX_forXY_stratified * var_sumY_forXY_stratified;
+  sqrt_failsafe_output_8 = sqrt_failsafe(sqrt_failsafe_input_8);
+  corr_XY_stratified      = cov_sumX_sumY_stratified / sqrt_failsafe_output_8;
+  r_sqr_X_vs_Y_stratified = (cov_sumX_sumY_stratified ^ 2) / (var_sumX_forXY_stratified * var_sumY_forXY_stratified);
+  temp_X_vs_Y_stratified  = (1 - r_sqr_X_vs_Y_stratified) / (cnt_XY_with_good_stratumID - num_XY_nonempty_strata - 1);
+  adj_r_sqr_X_vs_Y_stratified = 1 - temp_X_vs_Y_stratified * (cnt_XY_with_good_stratumID - num_XY_nonempty_strata);
+  sqrt_failsafe_input_9  = temp_X_vs_Y_stratified * var_sumY_forXY_stratified;
+
+  stdev_errY_vs_X_stratified  = sqrt_failsafe(sqrt_failsafe_input_9);
+  sqrt_failsafe_input_10 = sqrt_failsafe_input_9  / var_sumX_forXY_stratified;
+  stdev_slope_XY_stratified   = sqrt_failsafe(sqrt_failsafe_input_10);
+  fStat_Y_vs_X_stratified = (cnt_XY_with_good_stratumID - num_XY_nonempty_strata - 1) * r_sqr_X_vs_Y_stratified / (1 - r_sqr_X_vs_Y_stratified);
+  p_val_Y_vs_X_stratified = fStat_tailprob (fStat_Y_vs_X_stratified, ones_XY, cnt_XY_with_good_stratumID - num_XY_nonempty_strata - 1);
+
+  if (debug)
+    print ("Preparing the output matrix...");
+  OutMtx = matrix (0.0, rows = 40, cols = num_attrs_XY);
+  OutMtx [ 1, ] = Xcols                 %*% Proj_X_to_XY;  # 1st covariate column number
+  OutMtx [ 2, ] = cnt_X_global          %*% Proj_X_to_XY;  # 1st covariate global presence count
+  OutMtx [ 3, ] = avg_X_global          %*% Proj_X_to_XY;  # 1st covariate global mean
+  OutMtx [ 4, ] = stdev_X_global        %*% Proj_X_to_XY;  # 1st covariate global standard deviation
+  OutMtx [ 5, ] = stdev_X_stratified    %*% Proj_X_to_XY;  # 1st covariate stratified standard deviation
+  OutMtx [ 6, ] = r_sqr_X_vs_strata     %*% Proj_X_to_XY;  # R-squared, 1st covariate vs. strata
+  OutMtx [ 7, ] = adj_r_sqr_X_vs_strata %*% Proj_X_to_XY;  # adjusted R-squared, 1st covariate vs. strata
+  OutMtx [ 8, ] = p_val_X_vs_strata     %*% Proj_X_to_XY;  # P-value, 1st covariate vs. strata
+
+  OutMtx [11, ] = Ycols                 %*% Proj_Y_to_XY;  # 2nd covariate column number
+  OutMtx [12, ] = cnt_Y_global          %*% Proj_Y_to_XY;  # 2nd covariate global presence count
+  OutMtx [13, ] = avg_Y_global          %*% Proj_Y_to_XY;  # 2nd covariate global mean
+  OutMtx [14, ] = stdev_Y_global        %*% Proj_Y_to_XY;  # 2nd covariate global standard deviation
+  OutMtx [15, ] = stdev_Y_stratified    %*% Proj_Y_to_XY;  # 2nd covariate stratified standard deviation
+  OutMtx [16, ] = r_sqr_Y_vs_strata     %*% Proj_Y_to_XY;  # R-squared, 2nd covariate vs. strata
+  OutMtx [17, ] = adj_r_sqr_Y_vs_strata %*% Proj_Y_to_XY;  # adjusted R-squared, 2nd covariate vs. strata
+  OutMtx [18, ] = p_val_Y_vs_strata     %*% Proj_Y_to_XY;  # P-value, 2nd covariate vs. strata
+
+  OutMtx [21, ] = cnt_XY_global;               # Global 1st & 2nd covariate presence count
+  OutMtx [22, ] = slope_XY_global;             # Global regression slope (2nd vs. 1st covariate)
+  OutMtx [23, ] = stdev_slope_XY_global;       # Global regression slope standard deviation
+  OutMtx [24, ] = corr_XY_global;              # Global correlation = +/- sqrt(R-squared)
+  OutMtx [25, ] = stdev_errY_vs_X_global;      # Global residual standard deviation
+  OutMtx [26, ] = r_sqr_X_vs_Y_global;         # Global R-squared
+  OutMtx [27, ] = adj_r_sqr_X_vs_Y_global;     # Global adjusted R-squared
+  OutMtx [28, ] = p_val_Y_vs_X_global;         # Global P-value for hypothesis "slope = 0"
+
+  OutMtx [31, ] = cnt_XY_with_good_stratumID;  # Stratified 1st & 2nd covariate presence count
+  OutMtx [32, ] = slope_XY_stratified;         # Stratified regression slope (2nd vs. 1st covariate)
+  OutMtx [33, ] = stdev_slope_XY_stratified;   # Stratified regression slope standard deviation
+  OutMtx [34, ] = corr_XY_stratified;          # Stratified correlation = +/- sqrt(R-squared)
+  OutMtx [35, ] = stdev_errY_vs_X_stratified;  # Stratified residual standard deviation
+  OutMtx [36, ] = r_sqr_X_vs_Y_stratified;     # Stratified R-squared
+  OutMtx [37, ] = adj_r_sqr_X_vs_Y_stratified; # Stratified adjusted R-squared
+  OutMtx [38, ] = p_val_Y_vs_X_stratified;     # Stratified P-value for hypothesis "slope = 0"
+  OutMtx [39, ] = colSums (Cnt_XY_per_stratum >= 2);  # Number of strata with at least two counted points
+
+
+  OutMtx = t(OutMtx);
+}
+
+fStat_tailprob = function (Matrix[double] fStat, Matrix[double] df_1, Matrix[double] df_2) return (Matrix[double] tailprob)
+{
+  #TODO support matrix inputs in density functions 
+  #tailprob = ifelse(df_1 >= 1 & df_2 >= 1 & fStat >= 0,
+  #  pf(target = fStat, df1 = df_1, df2 = df_2, lower.tail=FALSE), NaN);
+
+  tailprob = matrix(NaN, nrow(fStat), ncol(fStat));
+  for (i in 1:nrow(fStat)) {
+    for (j in 1:ncol(fStat)) {
+      q = as.scalar (fStat [i, j]);
+      d1 = as.scalar (df_1 [i, j]);
+      d2 = as.scalar (df_2 [i, j]);
+      if( d1 >= 1 & d2 >= 1 & q >= 0 )
+        tailprob[i,j] = pf(target = q, df1 = d1, df2 = d2, lower.tail=FALSE);
+    }
+  }
+}
+
+sqrt_failsafe = function (Matrix[double] input_A) return (Matrix[double] output_A)
+{
+  mask_A = (input_A >= 0);
+  prep_A = input_A * mask_A;
+  mask_A = mask_A * (prep_A == prep_A);
+  prep_A = replace (target = prep_A, pattern = NaN, replacement = 0);
+  output_A = sqrt (prep_A) / mask_A;
+}
diff --git a/src/main/java/org/apache/sysds/common/Builtins.java b/src/main/java/org/apache/sysds/common/Builtins.java
index e252b20..a063b47 100644
--- a/src/main/java/org/apache/sysds/common/Builtins.java
+++ b/src/main/java/org/apache/sysds/common/Builtins.java
@@ -248,6 +248,7 @@ public enum Builtins {
 	SPLIT_BALANCED("splitBalanced", true),
 	STABLE_MARRIAGE("stableMarriage", true),
 	STATSNA("statsNA", true),
+	STRATSTATS("stratstats", true),
 	STEPLM("steplm",true, ReturnType.MULTI_RETURN),
 	SQRT("sqrt", false),
 	SUM("sum", false),
diff --git a/src/test/java/org/apache/sysds/test/functions/builtin/BuiltinStratstatsTest.java b/src/test/java/org/apache/sysds/test/functions/builtin/BuiltinStratstatsTest.java
new file mode 100644
index 0000000..bb94cce
--- /dev/null
+++ b/src/test/java/org/apache/sysds/test/functions/builtin/BuiltinStratstatsTest.java
@@ -0,0 +1,95 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.sysds.test.functions.builtin;
+
+import org.apache.sysds.common.Types;
+import org.apache.sysds.common.Types.ExecType;
+import org.apache.sysds.runtime.matrix.data.MatrixValue.CellIndex;
+import org.apache.sysds.runtime.meta.MatrixCharacteristics;
+import org.apache.sysds.test.AutomatedTestBase;
+import org.apache.sysds.test.TestConfiguration;
+import org.apache.sysds.test.TestUtils;
+import org.junit.Test;
+import org.junit.runner.RunWith;
+import org.junit.runners.Parameterized;
+import org.junit.runners.Parameterized.Parameters;
+
+import java.util.Arrays;
+import java.util.Collection;
+import java.util.HashMap;
+
+@RunWith(value = Parameterized.class)
+public class BuiltinStratstatsTest extends AutomatedTestBase {
+	private final static String TEST_NAME = "stratstats";
+	private final static String TEST_DIR = "functions/builtin/";
+	private static final String TEST_CLASS_DIR = TEST_DIR + BuiltinStratstatsTest.class.getSimpleName() + "/";
+
+	protected String X;
+	protected int rows, cols;
+
+	public BuiltinStratstatsTest(String X, int cols, int rows){
+		this.X = X;
+		this.cols = cols;
+		this.rows = rows;
+	}
+
+	@Parameters
+	public static Collection<Object[]> data() {
+		return Arrays.asList(new Object[][] {
+			{"randfile", 10, 6},
+			{"randfile", 4, 18}});
+	}
+
+	@Override
+	public void setUp() {
+		addTestConfiguration(TEST_NAME, new TestConfiguration(TEST_CLASS_DIR, TEST_NAME, new String[]{"B"}));
+	}
+
+	@Test
+	public void testStratstats(){
+		Types.ExecMode platformOld = setExecMode(ExecType.CP);
+		try {
+			loadTestConfiguration(getTestConfiguration(TEST_NAME));
+			String HOME = SCRIPT_DIR + TEST_DIR;
+			fullDMLScriptName = HOME + TEST_NAME + ".dml";
+			fullRScriptName = HOME + TEST_NAME + ".R";
+
+			programArgs = new String[]{
+					"-nvargs", "X=" + input("random.mtx"), "O=" + output("result"), "Y=" + input("doesnotexist"),};
+			rCmd = getRCmd(input("random.mtx"), expected("result"));
+
+			double[][] values = getRandomMatrix(rows, cols, 10,40, 1, 54321);
+			MatrixCharacteristics mc = new MatrixCharacteristics(rows,cols,-1,-1);
+			writeInputMatrixWithMTD("random", values, true, mc);
+
+			runTest(true, false, null, -1);
+			runRScript(true);
+
+			HashMap<CellIndex, Double> stats_SYSTEMDS = readDMLMatrixFromOutputDir("result");
+			HashMap<CellIndex, Double> stats_real = readRMatrixFromExpectedDir("result");
+
+			double tol = Math.pow(10, -3);
+			TestUtils.compareMatrices(stats_real, stats_SYSTEMDS, tol, "stats_real", "stats_SYSTEMDS", true);
+		}
+		finally {
+			rtplatform = platformOld;
+		}
+	}
+}
diff --git a/src/test/java/org/apache/sysds/test/functions/codegenalg/partone/AlgorithmAutoEncoder.java b/src/test/java/org/apache/sysds/test/functions/codegenalg/partone/AlgorithmAutoEncoder.java
index e72708f..b790a4c 100644
--- a/src/test/java/org/apache/sysds/test/functions/codegenalg/partone/AlgorithmAutoEncoder.java
+++ b/src/test/java/org/apache/sysds/test/functions/codegenalg/partone/AlgorithmAutoEncoder.java
@@ -44,7 +44,7 @@ public class AlgorithmAutoEncoder extends AutomatedTestBase
 	
 	private final static double sparsity1 = 0.7; //dense
 	private final static double sparsity2 = 0.1; //sparse
-	private final static double eps       = 3e-4;
+	private final static double eps       = 4e-4;
 	
 	private final static int H1 = 500;
 	private final static int H2 = 2;
diff --git a/src/test/scripts/functions/builtin/stratstats.R b/src/test/scripts/functions/builtin/stratstats.R
new file mode 100644
index 0000000..8476c82
--- /dev/null
+++ b/src/test/scripts/functions/builtin/stratstats.R
@@ -0,0 +1,281 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+library("Matrix")
+args <- commandArgs(TRUE)
+
+fStat_tailprob = function(fStat, df_1, df_2){
+ rows = nrow(fStat)
+ if(is.null(rows)){
+   rows = 1
+   fStat = t(matrix(fStat))
+ }
+
+ cols = ncol(fStat)
+ if(is.null(cols))
+    cols = length(fStat)
+
+ if(is.null(nrow(df_1)))
+    df_1 = t(matrix(df_1))
+
+ if(is.null(nrow(df_2)))
+    df_2 = t(matrix(df_2))
+
+ tailprob = fStat
+ for (i in 1:rows) {
+   for (j in 1:cols) {
+     q = fStat[i, j]
+     d1 = df_1[i, j]
+     d2 = df_2[i, j]
+     if (d1 >= 1 & d2 >= 1 & q >= 0)
+        tailprob[i, j] = pf(q, df1 = d1, df2 = d2, lower.tail=FALSE)
+      else
+        tailprob[i, j] = 0/0
+    }
+   }
+   return(tailprob)
+}
+
+sqrt_failsafe = function(input_A){
+ mask_A = (input_A >= 0)
+ prep_A = input_A * mask_A
+ mask_A = mask_A * (prep_A == prep_A)
+ #prep_A = replace (target = prep_A, pattern = NaN, replacement = 0)
+ output_A = sqrt (prep_A) / mask_A
+ return(output_A)
+}
+
+
+
+
+stratum_column_id = 1
+XwithNaNs = readMM(args[1])
+YwithNaNs = XwithNaNs
+SwithNaNs = XwithNaNs[, stratum_column_id]
+Xcols = c(1:ncol(XwithNaNs))
+Ycols = c(1:ncol(YwithNaNs))
+
+num_records  = nrow(XwithNaNs)
+num_attrs    = ncol(XwithNaNs)
+num_attrs_X  = length(Xcols)
+num_attrs_Y  = length(Ycols)
+num_attrs_XY = num_attrs_X * num_attrs_Y
+
+one_to_num_attrs_X = c(1:num_attrs_X)
+one_to_num_attrs_Y = c(1:num_attrs_Y)
+ProjX = matrix(0, nrow = num_attrs, ncol = num_attrs_X)
+ProjY = matrix(0, nrow = num_attrs, ncol = num_attrs_Y)
+
+ProjX_ctable = table(t(Xcols), one_to_num_attrs_X)
+ProjX[1:nrow(ProjX_ctable), ] = ProjX_ctable
+ProjY_ctable = table(t(Ycols), one_to_num_attrs_Y)
+ProjY[1:nrow(ProjY_ctable), ] = ProjY_ctable
+
+X = XwithNaNs %*% ProjX
+Y = XwithNaNs %*% ProjY
+S = round(SwithNaNs) * (SwithNaNs > 0)
+
+Proj_good_stratumID = diag(S > 0)
+Proj_good_stratumID = Proj_good_stratumID[rowSums(Proj_good_stratumID[])>0,]
+vector_of_good_stratumIDs = Proj_good_stratumID %*% S
+vector_of_good_stratumIDs = vector_of_good_stratumIDs + (1 - min(vector_of_good_stratumIDs))
+num_records_with_good_stratumID = nrow(Proj_good_stratumID)
+if (is.null(num_records_with_good_stratumID))
+    num_records_with_good_stratumID = 1
+one_to_num_records_with_good_stratumID = seq(1,num_records_with_good_stratumID,by=1)
+
+num_strata_with_empty = max(vector_of_good_stratumIDs)
+StrataSummator_with_empty = table(vector_of_good_stratumIDs, one_to_num_records_with_good_stratumID)
+StrataSummator = StrataSummator_with_empty[rowSums(StrataSummator_with_empty[])>0,]
+StrataSummator = StrataSummator %*% Proj_good_stratumID
+num_strata = nrow(StrataSummator)
+num_empty_strata = num_strata_with_empty - num_strata
+
+XNaNmask = (XwithNaNs == XwithNaNs)
+YNaNmask = (YwithNaNs == YwithNaNs)
+X_mask = XNaNmask %*% ProjX
+Y_mask = YNaNmask %*% ProjY
+
+cnt_X_global = colSums(X_mask)
+cnt_Y_global = colSums(Y_mask)
+avg_X_global = colSums(X) / cnt_X_global
+avg_Y_global = colSums(Y) / cnt_Y_global
+var_sumX_global = colSums (X * X) - cnt_X_global * (avg_X_global * avg_X_global)
+var_sumY_global = colSums (Y * Y) - cnt_Y_global * (avg_Y_global * avg_Y_global)
+sqrt_failsafe_input_1 = var_sumX_global / (cnt_X_global - 1)
+stdev_X_global = sqrt_failsafe (sqrt_failsafe_input_1)
+sqrt_failsafe_input_2 = var_sumY_global / (cnt_Y_global - 1)
+stdev_Y_global = sqrt_failsafe (sqrt_failsafe_input_2)
+
+Cnt_X_per_stratum = StrataSummator %*% X_mask
+Cnt_Y_per_stratum = StrataSummator %*% Y_mask
+Is_none_X_per_stratum = (Cnt_X_per_stratum == 0)
+Is_none_Y_per_stratum = (Cnt_Y_per_stratum == 0)
+One_over_cnt_X_per_stratum = (1 - Is_none_X_per_stratum) / (Cnt_X_per_stratum + Is_none_X_per_stratum)
+One_over_cnt_Y_per_stratum = (1 - Is_none_Y_per_stratum) / (Cnt_Y_per_stratum + Is_none_Y_per_stratum)
+num_X_nonempty_strata = num_strata - colSums (Is_none_X_per_stratum)
+num_Y_nonempty_strata = num_strata - colSums (Is_none_Y_per_stratum)
+
+Sum_X_per_stratum  = StrataSummator %*% X
+Sum_Y_per_stratum  = StrataSummator %*% Y
+
+cnt_X_with_good_stratumID = colSums(Cnt_X_per_stratum)
+cnt_Y_with_good_stratumID = colSums(Cnt_Y_per_stratum)
+sum_X_with_good_stratumID = colSums(Sum_X_per_stratum)
+sum_Y_with_good_stratumID = colSums(Sum_Y_per_stratum)
+var_sumX_with_good_stratumID = colSums(StrataSummator %*% (X * X)) - (sum_X_with_good_stratumID * sum_X_with_good_stratumID) / cnt_X_with_good_stratumID
+var_sumY_with_good_stratumID = colSums(StrataSummator %*% (Y * Y)) - (sum_Y_with_good_stratumID * sum_Y_with_good_stratumID) / cnt_Y_with_good_stratumID
+
+var_sumX_stratified   = colSums (StrataSummator %*% (X * X)) - colSums (One_over_cnt_X_per_stratum * Sum_X_per_stratum * Sum_X_per_stratum)
+var_sumY_stratified   = colSums (StrataSummator %*% (Y * Y)) - colSums (One_over_cnt_Y_per_stratum * Sum_Y_per_stratum * Sum_Y_per_stratum)
+sqrt_failsafe_input_3 = var_sumX_stratified / (cnt_X_with_good_stratumID - num_X_nonempty_strata)
+stdev_X_stratified    = sqrt_failsafe (sqrt_failsafe_input_3)
+sqrt_failsafe_input_4 = var_sumY_stratified / (cnt_Y_with_good_stratumID - num_Y_nonempty_strata)
+stdev_Y_stratified    = sqrt_failsafe (sqrt_failsafe_input_4)
+r_sqr_X_vs_strata     = 1 - var_sumX_stratified / var_sumX_with_good_stratumID
+r_sqr_Y_vs_strata     = 1 - var_sumY_stratified / var_sumY_with_good_stratumID
+adj_r_sqr_X_vs_strata = 1 - (var_sumX_stratified / (cnt_X_with_good_stratumID - num_X_nonempty_strata)) / (var_sumX_with_good_stratumID / (cnt_X_with_good_stratumID - 1))
+adj_r_sqr_Y_vs_strata = 1 - (var_sumY_stratified / (cnt_Y_with_good_stratumID - num_Y_nonempty_strata)) / (var_sumY_with_good_stratumID / (cnt_Y_with_good_stratumID - 1))
+fStat_X_vs_strata     = ((var_sumX_with_good_stratumID - var_sumX_stratified) / (num_X_nonempty_strata - 1)) / (var_sumX_stratified / (cnt_X_with_good_stratumID - num_X_nonempty_strata))
+fStat_Y_vs_strata     = ((var_sumY_with_good_stratumID - var_sumY_stratified) / (num_Y_nonempty_strata - 1)) / (var_sumY_stratified / (cnt_Y_with_good_stratumID - num_Y_nonempty_strata))
+p_val_X_vs_strata     = fStat_tailprob (fStat_X_vs_strata, num_X_nonempty_strata - 1, cnt_X_with_good_stratumID - num_X_nonempty_strata)
+p_val_Y_vs_strata     = fStat_tailprob (fStat_Y_vs_strata, num_Y_nonempty_strata - 1, cnt_Y_with_good_stratumID - num_Y_nonempty_strata)
+
+cnt_XY_rectangle       = t(X_mask) %*% Y_mask
+sum_X_forXY_rectangle  = t(X)      %*% Y_mask
+sum_XX_forXY_rectangle = t(X * X)  %*% Y_mask
+sum_Y_forXY_rectangle  = t(X_mask) %*% Y
+sum_YY_forXY_rectangle = t(X_mask) %*% (Y * Y)
+sum_XY_rectangle       = t(X)      %*% Y
+cnt_XY_global       = matrix(t(cnt_XY_rectangle),       nrow = 1, ncol = num_attrs_XY)
+sum_X_forXY_global  = matrix(t(sum_X_forXY_rectangle), nrow = 1, byrow = TRUE)
+sum_XX_forXY_global = matrix(t(sum_XX_forXY_rectangle), nrow = 1, ncol = num_attrs_XY, byrow = TRUE)
+sum_Y_forXY_global  = matrix(t(sum_Y_forXY_rectangle),  nrow = 1, ncol = num_attrs_XY, byrow = TRUE)
+sum_YY_forXY_global = matrix(t(sum_YY_forXY_rectangle), nrow = 1, ncol = num_attrs_XY, byrow = TRUE)
+sum_XY_global       = matrix(sum_XY_rectangle,       nrow = 1, ncol = num_attrs_XY, byrow = TRUE)
+ones_XY = matrix(1.0, nrow = 1, ncol = num_attrs_XY)
+
+cov_sumX_sumY_global    = sum_XY_global - sum_X_forXY_global * sum_Y_forXY_global / cnt_XY_global
+var_sumX_forXY_global   = sum_XX_forXY_global - sum_X_forXY_global * sum_X_forXY_global / cnt_XY_global
+var_sumY_forXY_global   = sum_YY_forXY_global - sum_Y_forXY_global * sum_Y_forXY_global / cnt_XY_global
+
+slope_XY_global         = cov_sumX_sumY_global / var_sumX_forXY_global
+sqrt_failsafe_input_5 = var_sumX_forXY_global * var_sumY_forXY_global
+sqrt_failsafe_output_5 = sqrt_failsafe(sqrt_failsafe_input_5)
+corr_XY_global          = cov_sumX_sumY_global / sqrt_failsafe_output_5
+r_sqr_X_vs_Y_global     = cov_sumX_sumY_global * cov_sumX_sumY_global / (var_sumX_forXY_global * var_sumY_forXY_global)
+adj_r_sqr_X_vs_Y_global = 1 - (1 - r_sqr_X_vs_Y_global) * (cnt_XY_global - 1) / (cnt_XY_global - 2)
+sqrt_failsafe_input_6 = (1 - r_sqr_X_vs_Y_global) * var_sumY_forXY_global / var_sumX_forXY_global / (cnt_XY_global - 2)
+stdev_slope_XY_global   = sqrt_failsafe(sqrt_failsafe_input_6)
+sqrt_failsafe_input_7 = (1 - r_sqr_X_vs_Y_global) * var_sumY_forXY_global / (cnt_XY_global - 2)
+stdev_errY_vs_X_global  = sqrt_failsafe(sqrt_failsafe_input_7)
+fStat_Y_vs_X_global     = (cnt_XY_global - 2) * r_sqr_X_vs_Y_global / (1 - r_sqr_X_vs_Y_global)
+p_val_Y_vs_X_global     = fStat_tailprob(fStat_Y_vs_X_global, ones_XY, cnt_XY_global - 2)
+
+Proj_X_to_XY = matrix(0.0, nrow = num_attrs_X, ncol = num_attrs_XY)
+Proj_Y_to_XY = matrix(0.0, nrow = num_attrs_Y, ncol = num_attrs_XY)
+ones_Y_col   = matrix(1.0, nrow = num_attrs_Y, ncol = 1)
+for (i in 1:num_attrs_X) {
+    start_cid = (i - 1) * num_attrs_Y + 1
+    end_cid = i * num_attrs_Y
+    Proj_X_to_XY [i, start_cid:end_cid] = t(ones_Y_col)
+    Proj_Y_to_XY [ , start_cid:end_cid] = diag(nrow=nrow(ones_Y_col), ncol=nrow(ones_Y_col))
+}
+
+Cnt_XY_per_stratum       = StrataSummator %*% (( X_mask %*% Proj_X_to_XY) * ( Y_mask %*% Proj_Y_to_XY))
+Sum_X_forXY_per_stratum  = StrataSummator %*% (( X      %*% Proj_X_to_XY) * ( Y_mask %*% Proj_Y_to_XY))
+Sum_XX_forXY_per_stratum = StrataSummator %*% (((X * X) %*% Proj_X_to_XY) * ( Y_mask %*% Proj_Y_to_XY))
+Sum_Y_forXY_per_stratum  = StrataSummator %*% (( X_mask %*% Proj_X_to_XY) * ( Y      %*% Proj_Y_to_XY))
+Sum_YY_forXY_per_stratum = StrataSummator %*% (( X_mask %*% Proj_X_to_XY) * ((Y * Y) %*% Proj_Y_to_XY))
+Sum_XY_per_stratum       = StrataSummator %*% (( X      %*% Proj_X_to_XY) * ( Y      %*% Proj_Y_to_XY))
+
+Is_none_XY_per_stratum = (Cnt_XY_per_stratum == 0)
+One_over_cnt_XY_per_stratum = (1 - Is_none_XY_per_stratum) / (Cnt_XY_per_stratum + Is_none_XY_per_stratum)
+num_XY_nonempty_strata = num_strata - colSums (Is_none_XY_per_stratum)
+
+cnt_XY_with_good_stratumID = colSums(Cnt_XY_per_stratum)
+sum_XX_forXY_with_good_stratumID = colSums(Sum_XX_forXY_per_stratum)
+sum_YY_forXY_with_good_stratumID = colSums(Sum_YY_forXY_per_stratum)
+sum_XY_with_good_stratumID = colSums(Sum_XY_per_stratum)
+
+var_sumX_forXY_stratified = sum_XX_forXY_with_good_stratumID - colSums(Sum_X_forXY_per_stratum * Sum_X_forXY_per_stratum * One_over_cnt_XY_per_stratum)
+var_sumY_forXY_stratified = sum_YY_forXY_with_good_stratumID - colSums(Sum_Y_forXY_per_stratum * Sum_Y_forXY_per_stratum * One_over_cnt_XY_per_stratum)
+cov_sumX_sumY_stratified  = sum_XY_with_good_stratumID       - colSums(Sum_X_forXY_per_stratum * Sum_Y_forXY_per_stratum * One_over_cnt_XY_per_stratum)
+
+slope_XY_stratified     = cov_sumX_sumY_stratified / var_sumX_forXY_stratified
+sqrt_failsafe_input_8 = var_sumX_forXY_stratified * var_sumY_forXY_stratified
+sqrt_failsafe_output_8 = sqrt_failsafe(sqrt_failsafe_input_8)
+corr_XY_stratified      = cov_sumX_sumY_stratified / sqrt_failsafe_output_8
+r_sqr_X_vs_Y_stratified = (cov_sumX_sumY_stratified ^ 2) / (var_sumX_forXY_stratified * var_sumY_forXY_stratified)
+temp_X_vs_Y_stratified  = (1 - r_sqr_X_vs_Y_stratified) / (cnt_XY_with_good_stratumID - num_XY_nonempty_strata - 1)
+adj_r_sqr_X_vs_Y_stratified = 1 - temp_X_vs_Y_stratified * (cnt_XY_with_good_stratumID - num_XY_nonempty_strata)
+sqrt_failsafe_input_9  = temp_X_vs_Y_stratified * var_sumY_forXY_stratified
+
+if(all(var_sumY_forXY_stratified == 0)){
+  sqrt_failsafe_input_9 = var_sumY_forXY_stratified
+}
+
+stdev_errY_vs_X_stratified  = sqrt_failsafe (sqrt_failsafe_input_9)
+sqrt_failsafe_input_10 = sqrt_failsafe_input_9  / var_sumX_forXY_stratified
+stdev_slope_XY_stratified   = sqrt_failsafe(sqrt_failsafe_input_10)
+#stdev_slope_XY_stratified[is.na(stdev_slope_XY_stratified)] = NaN
+fStat_Y_vs_X_stratified = (cnt_XY_with_good_stratumID - num_XY_nonempty_strata - 1) * r_sqr_X_vs_Y_stratified / (1 - r_sqr_X_vs_Y_stratified)
+p_val_Y_vs_X_stratified = fStat_tailprob (fStat_Y_vs_X_stratified, ones_XY, cnt_XY_with_good_stratumID - num_XY_nonempty_strata - 1)
+
+OutMtx = matrix(0.0, nrow = 40, ncol = num_attrs_XY)
+OutMtx [ 1, ] = Xcols                 %*% Proj_X_to_XY
+OutMtx [ 2, ] = cnt_X_global          %*% Proj_X_to_XY
+OutMtx [ 3, ] = avg_X_global          %*% Proj_X_to_XY
+OutMtx [ 4, ] = stdev_X_global        %*% Proj_X_to_XY
+OutMtx [ 5, ] = stdev_X_stratified    %*% Proj_X_to_XY
+OutMtx [ 6, ] = r_sqr_X_vs_strata     %*% Proj_X_to_XY
+OutMtx [ 7, ] = adj_r_sqr_X_vs_strata %*% Proj_X_to_XY
+OutMtx [ 8, ] = p_val_X_vs_strata     %*% Proj_X_to_XY
+
+OutMtx [11, ] = Ycols                 %*% Proj_Y_to_XY
+OutMtx [12, ] = cnt_Y_global          %*% Proj_Y_to_XY
+OutMtx [13, ] = avg_Y_global          %*% Proj_Y_to_XY
+OutMtx [14, ] = stdev_Y_global        %*% Proj_Y_to_XY
+OutMtx [15, ] = stdev_Y_stratified    %*% Proj_Y_to_XY
+OutMtx [16, ] = r_sqr_Y_vs_strata     %*% Proj_Y_to_XY
+OutMtx [17, ] = adj_r_sqr_Y_vs_strata %*% Proj_Y_to_XY
+OutMtx [18, ] = p_val_Y_vs_strata     %*% Proj_Y_to_XY
+
+OutMtx [21, ] = cnt_XY_global
+OutMtx [22, ] = slope_XY_global
+OutMtx [23, ] = stdev_slope_XY_global
+OutMtx [24, ] = corr_XY_global
+OutMtx [25, ] = stdev_errY_vs_X_global
+OutMtx [26, ] = r_sqr_X_vs_Y_global
+OutMtx [27, ] = adj_r_sqr_X_vs_Y_global
+OutMtx [28, ] = p_val_Y_vs_X_global
+
+OutMtx [31, ] = cnt_XY_with_good_stratumID
+OutMtx [32, ] = slope_XY_stratified
+OutMtx [33, ] = stdev_slope_XY_stratified
+OutMtx [34, ] = corr_XY_stratified
+OutMtx [35, ] = stdev_errY_vs_X_stratified
+OutMtx [36, ] = r_sqr_X_vs_Y_stratified
+OutMtx [37, ] = adj_r_sqr_X_vs_Y_stratified
+OutMtx [38, ] = p_val_Y_vs_X_stratified
+OutMtx [39, ] = colSums (Cnt_XY_per_stratum >= 2)
+
+OutMtx = t(OutMtx)
+writeMM(as(OutMtx, "CsparseMatrix"), file=args[2])
diff --git a/src/test/scripts/functions/builtin/stratstats.dml b/src/test/scripts/functions/builtin/stratstats.dml
new file mode 100644
index 0000000..234fb40
--- /dev/null
+++ b/src/test/scripts/functions/builtin/stratstats.dml
@@ -0,0 +1,26 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+inX = read($X)
+inY = read($X)
+inS = read($X)
+outMat = stratstats(X=inX, Y=inY, S=inS)
+write(outMat, $O, sparse=TRUE)