You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@systemds.apache.org by ss...@apache.org on 2022/02/15 15:12:09 UTC

[systemds] branch main updated: [MINOR] Removing null values from mean/median computation - Instead of replacing nulls with zeros for computation now we remove the rows with nulls and then compute column mean/median values

This is an automated email from the ASF dual-hosted git repository.

ssiddiqi pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/systemds.git


The following commit(s) were added to refs/heads/main by this push:
     new 8e49d69  [MINOR] Removing null values from mean/median computation   - Instead of replacing nulls with zeros for computation now we remove      the rows with nulls and then compute column mean/median values
8e49d69 is described below

commit 8e49d695af0c31bead989513de2f3d7bc5dc05e2
Author: Shafaq Siddiqi <sh...@tugraz.at>
AuthorDate: Tue Feb 15 16:07:59 2022 +0100

    [MINOR] Removing null values from mean/median computation
      - Instead of replacing nulls with zeros for the computation, we now remove
         the rows with nulls and then compute the column mean/median values
---
 scripts/builtin/imputeByMean.dml                    | 13 ++++++++++---
 scripts/builtin/imputeByMedian.dml                  | 16 ++++++++--------
 scripts/builtin/mice.dml                            | 21 ++++++++++++++-------
 src/test/scripts/functions/builtin/meanImputation.R |  4 ++--
 .../scripts/functions/builtin/medianImputation.R    |  4 ++--
 5 files changed, 36 insertions(+), 22 deletions(-)

diff --git a/scripts/builtin/imputeByMean.dml b/scripts/builtin/imputeByMean.dml
index 0697012..210f650 100644
--- a/scripts/builtin/imputeByMean.dml
+++ b/scripts/builtin/imputeByMean.dml
@@ -40,10 +40,17 @@
 m_imputeByMean = function(Matrix[Double] X, Matrix[Double] mask)
 return(Matrix[Double] X, Matrix[Double] imputedVec)
 {
-  nX = X*(mask==0)
-  nX = replace(target=nX, pattern=NaN, replacement=0);
+
   #  mean imputation
-  colMean = colMeans(nX)
+  colMean = matrix(0, rows=1, cols=ncol(X))
+  parfor(i in 1:ncol(X))
+  {
+    if(as.scalar(mask[1, i]) == 0)
+    {
+      nX = removeEmpty(target=X[, i], margin="rows", select = (is.na(X[, i]) == 0))
+      colMean[1, i] = mean(nX)
+    }
+  }
 
   if(sum(mask) > 0)
   {
diff --git a/scripts/builtin/imputeByMedian.dml b/scripts/builtin/imputeByMedian.dml
index 6e74e52..c40c9b1 100644
--- a/scripts/builtin/imputeByMedian.dml
+++ b/scripts/builtin/imputeByMedian.dml
@@ -45,16 +45,16 @@ return(Matrix[Double] X,  Matrix[Double] imputedVec)
 # return(List[Unknown] out)
 {
 
-  nX = X * (mask==0)
-  nX = replace(target=nX, pattern=NaN, replacement=0);
-  cols = ncol(nX)
+  cols = ncol(X)
   #  median imputation
   colMedian = matrix(0, 1, cols)
-  for(i in 1:cols, check=0) {
-    if(sum(nX[,i]) == 0)
-      colMedian[1, i] = 0
-    else
-      colMedian[1, i] = median(nX[,i])
+  parfor(i in 1:ncol(X))
+  {
+    if(as.scalar(mask[1, i]) == 0)
+    {
+      nX = removeEmpty(target=X[, i], margin="rows", select = (is.na(X[, i]) == 0))
+      colMedian[1, i] = median(nX)
+    }
   }
   if(sum(mask) > 0)
   {
diff --git a/scripts/builtin/mice.dml b/scripts/builtin/mice.dml
index b9343c0..aaa41c7 100644
--- a/scripts/builtin/mice.dml
+++ b/scripts/builtin/mice.dml
@@ -82,7 +82,7 @@ m_mice= function(Matrix[Double] X, Matrix[Double] cMask, Integer iter = 3,
   # specifications for one-hot encoding of categorical features
   jspecDC = "{ids:true, dummycode:["+index+"]}";
   [dX, dM] = transformencode(target=as.frame(X1), spec=jspecDC);
-  
+  dist = colDist(X1, cMask) # number of distinct items in categorical features
   for(k in 1:iter) # start iterative imputation
   {
     betaList = list()
@@ -90,7 +90,6 @@ m_mice= function(Matrix[Double] X, Matrix[Double] cMask, Integer iter = 3,
     Mask_Filled = Mask1 # use this to store predictions for missing values
     weightMatrix = Mask1 # uses this to keep track of probabilities less than threshold
     inverseMask = Mask1 == 0
-    dist = colDist(X1, cMask) # number of distinct items in categorical features
     meta = rbind(meta, dist)
     i=1; j=1; in_c=1;
 
@@ -182,11 +181,19 @@ m_mice= function(Matrix[Double] X, Matrix[Double] cMask, Integer iter = 3,
 }
 
 
-colDist = function(Matrix[Double] X, Matrix[Double] mask)
-return (Matrix[Double] dist) {
-  catCols = X * mask
-  colDist = colMaxs(catCols)
-  dist = (mask == 0) + colDist
+colDist= function(Matrix[Double] X, Matrix[Double] mask)
+return (Matrix[Double] dist){
+ 
+  dist = matrix(1, 1, ncol(X))
+  X = replace(target=X, pattern=0, replacement=max(X)+1)
+  parfor(i in 1:ncol(X))
+  {
+    if(as.scalar(mask[,i]) == 1)
+    {
+      distT = table(X[, i], 1)
+      dist[1, i] = sum(distT != 0)
+    }
+  }
 }
 
 getInitialImputation = function(Matrix[Double] X, Matrix[Double] mask)
diff --git a/src/test/scripts/functions/builtin/meanImputation.R b/src/test/scripts/functions/builtin/meanImputation.R
index b3d35a9..62304c7 100644
--- a/src/test/scripts/functions/builtin/meanImputation.R
+++ b/src/test/scripts/functions/builtin/meanImputation.R
@@ -32,8 +32,8 @@ mode = Mode(Salaries$yrs.since.phd, na.rm = TRUE)
 Salaries$yrs.since.phd[is.na(Salaries$yrs.since.phd)]<-mode
 
 t = Salaries$yrs.service
-t[is.na(t)]<-0
-mean = mean(t)
+
+mean = mean(t, na.rm = TRUE)
 
 Salaries$yrs.service[is.na(Salaries$yrs.service)]<-mean
 output = cbind(Salaries$yrs.since.phd, Salaries$yrs.service)
diff --git a/src/test/scripts/functions/builtin/medianImputation.R b/src/test/scripts/functions/builtin/medianImputation.R
index bd2f1c5..7785545 100644
--- a/src/test/scripts/functions/builtin/medianImputation.R
+++ b/src/test/scripts/functions/builtin/medianImputation.R
@@ -32,8 +32,8 @@ mode = Mode(Salaries$yrs.since.phd, na.rm = TRUE)
 Salaries$yrs.since.phd[is.na(Salaries$yrs.since.phd)]<-mode
 
 t = Salaries$yrs.service
-t[is.na(t)]<-0
-median = median(t)
+
+median = median(t, na.rm = TRUE)
 
 Salaries$yrs.service[is.na(Salaries$yrs.service)]<-median
 output = cbind(Salaries$yrs.since.phd, Salaries$yrs.service)