You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@systemds.apache.org by mb...@apache.org on 2020/09/06 15:30:41 UTC
[systemds] branch master updated: [SYSTEMDS-2635] New builtin for
forward and backward NA filling
This is an automated email from the ASF dual-hosted git repository.
mboehm7 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/systemds.git
The following commit(s) were added to refs/heads/master by this push:
new f4e2412 [SYSTEMDS-2635] New builtin for forward and backward NA filling
f4e2412 is described below
commit f4e2412bf15abdda41bf0d0099cc1d0a2c40c1f1
Author: Shafaq Siddiqi <sh...@tugraz.at>
AuthorDate: Sun Sep 6 17:29:51 2020 +0200
[SYSTEMDS-2635] New builtin for forward and backward NA filling
Closes #1036.
---
scripts/builtin/na_locf.dml | 80 ++++++++++++++
.../java/org/apache/sysds/common/Builtins.java | 1 +
.../sysds/runtime/compress/BitmapEncoder.java | 4 +-
.../runtime/compress/CompressedMatrixBlock.java | 2 +-
.../test/functions/builtin/BuiltinNaLocfTest.java | 119 +++++++++++++++++++++
src/test/scripts/functions/builtin/na_locfTest.R | 34 ++++++
src/test/scripts/functions/builtin/na_locfTest.dml | 26 +++++
7 files changed, 263 insertions(+), 3 deletions(-)
diff --git a/scripts/builtin/na_locf.dml b/scripts/builtin/na_locf.dml
new file mode 100644
index 0000000..d211501
--- /dev/null
+++ b/scripts/builtin/na_locf.dml
@@ -0,0 +1,80 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+# Builtin function for imputing missing values using forward fill and backward fill techniques
+
+# INPUT PARAMETERS:
+# ---------------------------------------------------------------------------------------------
+# NAME TYPE DEFAULT MEANING
+# ---------------------------------------------------------------------------------------------
+# X Double --- Matrix X
+# option String "locf" String "locf" (last observation carried forward) to do forward fill
+# String "nocb" (next observation carried backward) to do backward fill
+# verbose Boolean FALSE to print output on screen
+# ---------------------------------------------------------------------------------------------
+
+#Output(s)
+# ---------------------------------------------------------------------------------------------
+# NAME TYPE DEFAULT MEANING
+# ---------------------------------------------------------------------------------------------
+# output Double --- Matrix with no missing values
+
+# Imputes missing values (NaN) of X per column, either by carrying the last
+# observation forward ("locf") or the next observation backward ("nocb").
+# Backward fill is expressed as forward fill on the row-reversed matrix.
+# If X has no NaN at all, it is returned unchanged.
+m_na_locf = function(Matrix[Double] X, String option = "locf", Boolean verbose = FALSE)
+  return(Matrix[Double] output)
+{
+  # reject unknown options instead of silently falling back to backward fill
+  if(option != "locf" & option != "nocb")
+    stop("na_locf: invalid option '" + option + "', expected 'locf' or 'nocb'")
+
+  output = X
+  # only do the fill work if there is at least one missing value
+  if(sum(is.nan(X)) > 0) {
+    if(option == "locf")
+      output = locf(X)
+    else
+      output = rev(locf(rev(X)))
+  }
+
+  if(verbose)
+    print(toString(output))
+}
+
+# Forward fill (last observation carried forward) applied to every column of X.
+# NaNs that precede the first observation of a column have nothing to carry
+# forward and are instead filled backward from that first observation; a column
+# without any observation comes out as all zeros.
+locf = function(Matrix[Double] X)
+  return(Matrix[Double] outputLocf)
+{
+  # store mask of missing values
+  mask = is.nan(X)
+
+  # replace NaN with a number i.e., zeros
+  X = replace(target=X, pattern = NaN, replacement = 0)
+
+  # use the cumsumprod built-in to do fill forward:
+  # y[k] = X[k] + mask[k] * y[k-1], i.e., keep X[k] if observed, else carry y[k-1]
+  output = matrix(0, nrow(X), ncol(X))
+  parfor(i in 1:ncol(X))
+    output[ ,i] = cumsumprod(cbind(X[,i],mask[,i]))
+
+  # leading NAs are the cells before the first observation of a column, i.e.,
+  # where the running count of observed cells is still zero; testing for
+  # (output == 0) instead would also flag NaNs that follow a genuine 0 value
+  leading_NA = (cumsum(1 - mask) == 0)
+  outputR = matrix(0, nrow(X), ncol(X))
+
+  if(sum(leading_NA) > 0) {
+    # doing fill forward in reverse, restricted to the leading NAs
+    parfor(i in 1:ncol(X))
+      outputR[ ,i] = rev(cumsumprod(rev(cbind(X[,i],leading_NA[,i]))))
+  }
+
+  # leading NAs take the back-filled value, all other cells the forward fill
+  outputLocf = (outputR * leading_NA) + output
+}
diff --git a/src/main/java/org/apache/sysds/common/Builtins.java b/src/main/java/org/apache/sysds/common/Builtins.java
index 28acc5b..9ff8f78 100644
--- a/src/main/java/org/apache/sysds/common/Builtins.java
+++ b/src/main/java/org/apache/sysds/common/Builtins.java
@@ -140,6 +140,7 @@ public enum Builtins {
MSVMPREDICT("msvmPredict", true),
MULTILOGREG("multiLogReg", true),
MULTILOGREGPREDICT("multiLogRegPredict", true),
+ NA_LOCF("na_locf", true),
NCOL("ncol", false),
NORMALIZE("normalize", true),
NROW("nrow", false),
diff --git a/src/main/java/org/apache/sysds/runtime/compress/BitmapEncoder.java b/src/main/java/org/apache/sysds/runtime/compress/BitmapEncoder.java
index e9be28a..6683d64 100644
--- a/src/main/java/org/apache/sysds/runtime/compress/BitmapEncoder.java
+++ b/src/main/java/org/apache/sysds/runtime/compress/BitmapEncoder.java
@@ -289,7 +289,7 @@ public class BitmapEncoder {
lengths.put(scaledValues[idx], lengths.get(scaledValues[idx]) + fullSizeOffsetsLists[idx].size());
}
else {
- Queue<IntArrayList> offsets = new LinkedList<IntArrayList>();
+ Queue<IntArrayList> offsets = new LinkedList<>();
offsets.add(fullSizeOffsetsLists[idx]);
values.put(scaledValues[idx], offsets);
lengths.put(scaledValues[idx], fullSizeOffsetsLists[idx].size());
@@ -347,7 +347,7 @@ public class BitmapEncoder {
lengths.put(array, lengths.get(array) + fullSizeOffsetsLists[idx / numColumns].size());
}
else {
- Queue<IntArrayList> offsets = new LinkedList<IntArrayList>();
+ Queue<IntArrayList> offsets = new LinkedList<>();
offsets.add(fullSizeOffsetsLists[idx / numColumns]);
values.put(array, offsets);
lengths.put(array, fullSizeOffsetsLists[idx / numColumns].size());
diff --git a/src/main/java/org/apache/sysds/runtime/compress/CompressedMatrixBlock.java b/src/main/java/org/apache/sysds/runtime/compress/CompressedMatrixBlock.java
index db2b8ba..aaf75901 100644
--- a/src/main/java/org/apache/sysds/runtime/compress/CompressedMatrixBlock.java
+++ b/src/main/java/org/apache/sysds/runtime/compress/CompressedMatrixBlock.java
@@ -1638,7 +1638,7 @@ public class CompressedMatrixBlock extends AbstractCompressedMatrixBlock {
@Override
public List<ColGroup> call() {
- List<ColGroup> res = new ArrayList<ColGroup>();
+ List<ColGroup> res = new ArrayList<>();
for(ColGroup x : _colGroups) {
res.add(x.scalarOperation(_sop));
}
diff --git a/src/test/java/org/apache/sysds/test/functions/builtin/BuiltinNaLocfTest.java b/src/test/java/org/apache/sysds/test/functions/builtin/BuiltinNaLocfTest.java
new file mode 100644
index 0000000..8f5b1e1
--- /dev/null
+++ b/src/test/java/org/apache/sysds/test/functions/builtin/BuiltinNaLocfTest.java
@@ -0,0 +1,119 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.sysds.test.functions.builtin;
+
+import org.apache.commons.lang.ArrayUtils;
+import org.apache.sysds.common.Types;
+import org.apache.sysds.lops.LopProperties;
+import org.apache.sysds.runtime.lineage.LineageCacheConfig.ReuseCacheType;
+import org.apache.sysds.runtime.matrix.data.MatrixValue;
+import org.apache.sysds.test.AutomatedTestBase;
+import org.apache.sysds.test.TestConfiguration;
+import org.apache.sysds.test.TestUtils;
+import org.junit.Test;
+
+import java.util.HashMap;
+
+/**
+ * Runs the {@code na_locfTest} DML script and the R script of the same name on
+ * one generated matrix and compares their outputs, for both fill options
+ * ("locf" and "nocb"), with and without lineage arguments, on the CP and SPARK
+ * execution types.
+ */
+public class BuiltinNaLocfTest extends AutomatedTestBase {
+	private final static String TEST_NAME = "na_locfTest";
+	private final static String TEST_DIR = "functions/builtin/";
+	private static final String TEST_CLASS_DIR = TEST_DIR + BuiltinNaLocfTest.class.getSimpleName() + "/";
+
+	// tolerance used when comparing the DML result against the R result
+	private final static double eps = 1e-10;
+	// dimensions of the generated input matrix
+	private final static int rows = 25;
+	private final static int cols = 25;
+
+	@Override
+	public void setUp() {
+		addTestConfiguration(TEST_NAME, new TestConfiguration(TEST_CLASS_DIR, TEST_NAME, new String[] {"O"}));
+	}
+
+	// ---- forward fill ("locf") ----
+
+	@Test
+	public void tesLocfNoLineageCP() {
+		runLocfTest(false, "locf", LopProperties.ExecType.CP);
+	}
+
+	@Test
+	public void tesLocfLineageCP() {
+		runLocfTest(true, "locf", LopProperties.ExecType.CP);
+	}
+
+	@Test
+	public void tesLocfNoLineageSPARK() {
+		runLocfTest(false, "locf", LopProperties.ExecType.SPARK);
+	}
+
+	@Test
+	public void tesLocfLineageSPARK() {
+		runLocfTest(true, "locf", LopProperties.ExecType.SPARK);
+	}
+
+	// ---- backward fill ("nocb") ----
+
+	@Test
+	public void tesnocbNoLineageCP() {
+		runLocfTest(false, "nocb", LopProperties.ExecType.CP);
+	}
+
+	@Test
+	public void tesnocbLineageCP() {
+		runLocfTest(true, "nocb", LopProperties.ExecType.CP);
+	}
+
+	@Test
+	public void tesnocbNoLineageSPARK() {
+		runLocfTest(false, "nocb", LopProperties.ExecType.SPARK);
+	}
+
+	@Test
+	public void tesnocbLineageSPARK() {
+		runLocfTest(true, "nocb", LopProperties.ExecType.SPARK);
+	}
+
+	/**
+	 * Executes the DML and R scripts on the same generated input and compares
+	 * the two result matrices cell by cell within {@code eps}.
+	 *
+	 * @param lineage  if true, appends "-stats -lineage reuse_hybrid" to the DML program arguments
+	 * @param option   fill option handed to both scripts ("locf" or "nocb")
+	 * @param instType execution type used to set the execution mode for this run
+	 */
+	private void runLocfTest(boolean lineage, String option, LopProperties.ExecType instType) {
+		final Types.ExecMode previousMode = setExecMode(instType);
+		try {
+			setOutputBuffering(false);
+			loadTestConfiguration(getTestConfiguration(TEST_NAME));
+
+			final String scriptHome = SCRIPT_DIR + TEST_DIR;
+			fullDMLScriptName = scriptHome + TEST_NAME + ".dml";
+			fullRScriptName = scriptHome + TEST_NAME + ".R";
+
+			final String[] baseArgs = {"-nvargs", "X=" + input("A"), "option=" + option, "O=" + output("O")};
+			programArgs = lineage
+				? (String[]) ArrayUtils.addAll(baseArgs,
+					new String[] {"-stats", "-lineage", ReuseCacheType.REUSE_HYBRID.name().toLowerCase()})
+				: baseArgs;
+			rCmd = getRCmd(inputDir(), option, expectedDir());
+
+			// generate the input data; both scripts turn its zero cells into missing values
+			final double[][] data = getRandomMatrix(rows, cols, -10, 10, 0.6, 7);
+			writeInputMatrixWithMTD("A", data, true);
+
+			runTest(true, false, null, -1);
+			runRScript(true);
+
+			// compare the DML output with the R reference output
+			final HashMap<MatrixValue.CellIndex, Double> dmlResult = readDMLMatrixFromHDFS("O");
+			final HashMap<MatrixValue.CellIndex, Double> rResult = readRMatrixFromFS("O");
+			TestUtils.compareMatrices(dmlResult, rResult, eps, "Stat-DML", "Stat-R");
+		}
+		finally {
+			// restore the execution mode that was active before this test
+			rtplatform = previousMode;
+		}
+	}
+}
diff --git a/src/test/scripts/functions/builtin/na_locfTest.R b/src/test/scripts/functions/builtin/na_locfTest.R
new file mode 100644
index 0000000..2b19180
--- /dev/null
+++ b/src/test/scripts/functions/builtin/na_locfTest.R
@@ -0,0 +1,34 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+# Command-line arguments: [1] path prefix of the input "A.mtx",
+# [2] fill option passed to na_locf, [3] path prefix of the output "O"
+args <- commandArgs(TRUE)
+
+library("Matrix")
+library("imputeTS")
+
+# read the input matrix and mark its zero cells as missing values
+A <- as.matrix(readMM(paste0(args[1], "A.mtx")))
+A[A == 0] <- NA
+
+# impute the missing values with imputeTS using the requested option
+B <- na_locf(A, option = args[2])
+
+# write the imputed matrix in MatrixMarket format
+writeMM(as(B, "CsparseMatrix"), paste0(args[3], "O"))
\ No newline at end of file
diff --git a/src/test/scripts/functions/builtin/na_locfTest.dml b/src/test/scripts/functions/builtin/na_locfTest.dml
new file mode 100644
index 0000000..f90ce14
--- /dev/null
+++ b/src/test/scripts/functions/builtin/na_locfTest.dml
@@ -0,0 +1,26 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+# read the input matrix passed as $X
+raw = read($X)
+# replace zeros with NaN
+withNaN = replace(target=raw, pattern=0, replacement=NaN)
+# impute with the requested fill option, verbose printing disabled
+imputed = na_locf(withNaN, $option, FALSE)
+write(imputed, $O)