You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@systemds.apache.org by ss...@apache.org on 2022/02/15 15:12:09 UTC
[systemds] branch main updated: [MINOR] Removing null values from mean/median computation - Instead of replacing nulls with zeros for the computation, we now remove the rows with nulls and then compute the column mean/median values
This is an automated email from the ASF dual-hosted git repository.
ssiddiqi pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/systemds.git
The following commit(s) were added to refs/heads/main by this push:
new 8e49d69 [MINOR] Removing null values from mean/median computation - Instead of replacing nulls with zeros for the computation, we now remove the rows with nulls and then compute the column mean/median values
8e49d69 is described below
commit 8e49d695af0c31bead989513de2f3d7bc5dc05e2
Author: Shafaq Siddiqi <sh...@tugraz.at>
AuthorDate: Tue Feb 15 16:07:59 2022 +0100
[MINOR] Removing null values from mean/median computation
- Instead of replacing nulls with zeros for the computation, we now remove
the rows with nulls and then compute the column mean/median values
---
scripts/builtin/imputeByMean.dml | 13 ++++++++++---
scripts/builtin/imputeByMedian.dml | 16 ++++++++--------
scripts/builtin/mice.dml | 21 ++++++++++++++-------
src/test/scripts/functions/builtin/meanImputation.R | 4 ++--
.../scripts/functions/builtin/medianImputation.R | 4 ++--
5 files changed, 36 insertions(+), 22 deletions(-)
diff --git a/scripts/builtin/imputeByMean.dml b/scripts/builtin/imputeByMean.dml
index 0697012..210f650 100644
--- a/scripts/builtin/imputeByMean.dml
+++ b/scripts/builtin/imputeByMean.dml
@@ -40,10 +40,17 @@
m_imputeByMean = function(Matrix[Double] X, Matrix[Double] mask)
return(Matrix[Double] X, Matrix[Double] imputedVec)
{
- nX = X*(mask==0)
- nX = replace(target=nX, pattern=NaN, replacement=0);
+
# mean imputation
- colMean = colMeans(nX)
+ colMean = matrix(0, rows=1, cols=ncol(X))
+ parfor(i in 1:ncol(X))
+ {
+ if(as.scalar(mask[1, i]) == 0)
+ {
+ nX = removeEmpty(target=X[, i], margin="rows", select = (is.na(X[, i]) == 0))
+ colMean[1, i] = mean(nX)
+ }
+ }
if(sum(mask) > 0)
{
diff --git a/scripts/builtin/imputeByMedian.dml b/scripts/builtin/imputeByMedian.dml
index 6e74e52..c40c9b1 100644
--- a/scripts/builtin/imputeByMedian.dml
+++ b/scripts/builtin/imputeByMedian.dml
@@ -45,16 +45,16 @@ return(Matrix[Double] X, Matrix[Double] imputedVec)
# return(List[Unknown] out)
{
- nX = X * (mask==0)
- nX = replace(target=nX, pattern=NaN, replacement=0);
- cols = ncol(nX)
+ cols = ncol(X)
# median imputation
colMedian = matrix(0, 1, cols)
- for(i in 1:cols, check=0) {
- if(sum(nX[,i]) == 0)
- colMedian[1, i] = 0
- else
- colMedian[1, i] = median(nX[,i])
+ parfor(i in 1:ncol(X))
+ {
+ if(as.scalar(mask[1, i]) == 0)
+ {
+ nX = removeEmpty(target=X[, i], margin="rows", select = (is.na(X[, i]) == 0))
+ colMedian[1, i] = median(nX)
+ }
}
if(sum(mask) > 0)
{
diff --git a/scripts/builtin/mice.dml b/scripts/builtin/mice.dml
index b9343c0..aaa41c7 100644
--- a/scripts/builtin/mice.dml
+++ b/scripts/builtin/mice.dml
@@ -82,7 +82,7 @@ m_mice= function(Matrix[Double] X, Matrix[Double] cMask, Integer iter = 3,
# specifications for one-hot encoding of categorical features
jspecDC = "{ids:true, dummycode:["+index+"]}";
[dX, dM] = transformencode(target=as.frame(X1), spec=jspecDC);
-
+ dist = colDist(X1, cMask) # number of distinct items in categorical features
for(k in 1:iter) # start iterative imputation
{
betaList = list()
@@ -90,7 +90,6 @@ m_mice= function(Matrix[Double] X, Matrix[Double] cMask, Integer iter = 3,
Mask_Filled = Mask1 # use this to store predictions for missing values
weightMatrix = Mask1 # uses this to keep track of probabilities less than threshold
inverseMask = Mask1 == 0
- dist = colDist(X1, cMask) # number of distinct items in categorical features
meta = rbind(meta, dist)
i=1; j=1; in_c=1;
@@ -182,11 +181,19 @@ m_mice= function(Matrix[Double] X, Matrix[Double] cMask, Integer iter = 3,
}
-colDist = function(Matrix[Double] X, Matrix[Double] mask)
-return (Matrix[Double] dist) {
- catCols = X * mask
- colDist = colMaxs(catCols)
- dist = (mask == 0) + colDist
+colDist= function(Matrix[Double] X, Matrix[Double] mask)
+return (Matrix[Double] dist){
+
+ dist = matrix(1, 1, ncol(X))
+ X = replace(target=X, pattern=0, replacement=max(X)+1)
+ parfor(i in 1:ncol(X))
+ {
+ if(as.scalar(mask[,i]) == 1)
+ {
+ distT = table(X[, i], 1)
+ dist[1, i] = sum(distT != 0)
+ }
+ }
}
getInitialImputation = function(Matrix[Double] X, Matrix[Double] mask)
diff --git a/src/test/scripts/functions/builtin/meanImputation.R b/src/test/scripts/functions/builtin/meanImputation.R
index b3d35a9..62304c7 100644
--- a/src/test/scripts/functions/builtin/meanImputation.R
+++ b/src/test/scripts/functions/builtin/meanImputation.R
@@ -32,8 +32,8 @@ mode = Mode(Salaries$yrs.since.phd, na.rm = TRUE)
Salaries$yrs.since.phd[is.na(Salaries$yrs.since.phd)]<-mode
t = Salaries$yrs.service
-t[is.na(t)]<-0
-mean = mean(t)
+
+mean = mean(t, na.rm = TRUE)
Salaries$yrs.service[is.na(Salaries$yrs.service)]<-mean
output = cbind(Salaries$yrs.since.phd, Salaries$yrs.service)
diff --git a/src/test/scripts/functions/builtin/medianImputation.R b/src/test/scripts/functions/builtin/medianImputation.R
index bd2f1c5..7785545 100644
--- a/src/test/scripts/functions/builtin/medianImputation.R
+++ b/src/test/scripts/functions/builtin/medianImputation.R
@@ -32,8 +32,8 @@ mode = Mode(Salaries$yrs.since.phd, na.rm = TRUE)
Salaries$yrs.since.phd[is.na(Salaries$yrs.since.phd)]<-mode
t = Salaries$yrs.service
-t[is.na(t)]<-0
-median = median(t)
+
+median = median(t, na.rm = TRUE)
Salaries$yrs.service[is.na(Salaries$yrs.service)]<-median
output = cbind(Salaries$yrs.since.phd, Salaries$yrs.service)