You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@systemds.apache.org by mb...@apache.org on 2020/09/06 15:30:41 UTC

[systemds] branch master updated: [SYSTEMDS-2635] New builtin for forward and backward NA filling

This is an automated email from the ASF dual-hosted git repository.

mboehm7 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/systemds.git


The following commit(s) were added to refs/heads/master by this push:
     new f4e2412  [SYSTEMDS-2635] New builtin for forward and backward NA filling
f4e2412 is described below

commit f4e2412bf15abdda41bf0d0099cc1d0a2c40c1f1
Author: Shafaq Siddiqi <sh...@tugraz.at>
AuthorDate: Sun Sep 6 17:29:51 2020 +0200

    [SYSTEMDS-2635] New builtin for forward and backward NA filling
    
    Closes #1036.
---
 scripts/builtin/na_locf.dml                        |  80 ++++++++++++++
 .../java/org/apache/sysds/common/Builtins.java     |   1 +
 .../sysds/runtime/compress/BitmapEncoder.java      |   4 +-
 .../runtime/compress/CompressedMatrixBlock.java    |   2 +-
 .../test/functions/builtin/BuiltinNaLocfTest.java  | 119 +++++++++++++++++++++
 src/test/scripts/functions/builtin/na_locfTest.R   |  34 ++++++
 src/test/scripts/functions/builtin/na_locfTest.dml |  26 +++++
 7 files changed, 263 insertions(+), 3 deletions(-)

diff --git a/scripts/builtin/na_locf.dml b/scripts/builtin/na_locf.dml
new file mode 100644
index 0000000..d211501
--- /dev/null
+++ b/scripts/builtin/na_locf.dml
@@ -0,0 +1,80 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+# Builtin function for imputing missing values using forward fill and backward fill techniques
+
+# INPUT PARAMETERS:
+# ---------------------------------------------------------------------------------------------
+# NAME            TYPE      DEFAULT   MEANING
+# ---------------------------------------------------------------------------------------------
+# X               Double    ---       Matrix X
+# option          String    "locf"    String "locf" (last observation carried forward) to do forward fill
+#                                     String "nocb" (next observation carried backward) to do backward fill
+# verbose         Boolean   FALSE     to print output on screen
+# ---------------------------------------------------------------------------------------------
+
+#Output(s)
+# ---------------------------------------------------------------------------------------------
+# NAME            TYPE    DEFAULT     MEANING
+# ---------------------------------------------------------------------------------------------
+# output          Double  ---         Matrix with no missing values
+
+m_na_locf = function(Matrix[Double] X, String option = "locf", Boolean verbose = FALSE)
+  return(Matrix[Double] output)
+{
+  # Entry point: fills the NaN cells of X either forward ("locf") or
+  # backward ("nocb"). The backward fill is realized as a forward fill over
+  # the reversed matrix, which is reversed back afterwards.
+
+  # reject unknown options instead of silently treating them as "nocb"
+  if(option != "locf" & option != "nocb")
+    stop("na_locf: unsupported option '" + option + "', expected 'locf' or 'nocb'")
+
+  output = X
+  # X is returned unchanged when it has no missing values
+  if(sum(is.nan(X)) > 0) {
+    if(option == "locf")
+      output = locf(X)
+    else
+      output = rev(locf(rev(X)))
+  }
+
+  if(verbose)
+    print(toString(output))
+}
+
+locf = function(Matrix[Double] X)
+  return(Matrix[Double] outputLocf)
+{
+  # Forward-fills every column of X: a NaN takes the closest observed value
+  # above it; NaNs that precede the first observed value of a column take
+  # that first observed value instead.
+
+  # store mask of missing values
+  mask = is.nan(X)
+
+  # replace NaN with a number i.e., zeros
+  X = replace(target=X, pattern = NaN, replacement = 0)
+
+  # use the cumsumprod built-in to do fill forward
+  output = matrix(0, nrow(X), ncol(X))
+  parfor(i in 1:ncol(X))
+    output[ ,i] = cumsumprod(cbind(X[,i],mask[,i]))
+
+  # leading NAs are the cells above the first observed value of a column,
+  # i.e., where the running count of observed cells is still zero; testing
+  # output == 0 instead would also flag NAs that follow a genuine 0
+  leading_NA = (cumsum(1 - mask) == 0)
+  outputR = matrix(0, nrow(X), ncol(X))
+
+  if(sum(leading_NA) > 0) {
+    # doing fill forward in reverse
+    parfor(i in 1:ncol(X))
+      outputR[ ,i] = rev(cumsumprod(rev(cbind(X[,i],leading_NA[,i]))))
+  }
+
+  # columns without any observed value stay 0
+  outputLocf = (outputR * leading_NA) + output
+}
diff --git a/src/main/java/org/apache/sysds/common/Builtins.java b/src/main/java/org/apache/sysds/common/Builtins.java
index 28acc5b..9ff8f78 100644
--- a/src/main/java/org/apache/sysds/common/Builtins.java
+++ b/src/main/java/org/apache/sysds/common/Builtins.java
@@ -140,6 +140,7 @@ public enum Builtins {
 	MSVMPREDICT("msvmPredict", true),
 	MULTILOGREG("multiLogReg", true),
 	MULTILOGREGPREDICT("multiLogRegPredict", true),
+	NA_LOCF("na_locf", true),
 	NCOL("ncol", false),
 	NORMALIZE("normalize", true),
 	NROW("nrow", false),
diff --git a/src/main/java/org/apache/sysds/runtime/compress/BitmapEncoder.java b/src/main/java/org/apache/sysds/runtime/compress/BitmapEncoder.java
index e9be28a..6683d64 100644
--- a/src/main/java/org/apache/sysds/runtime/compress/BitmapEncoder.java
+++ b/src/main/java/org/apache/sysds/runtime/compress/BitmapEncoder.java
@@ -289,7 +289,7 @@ public class BitmapEncoder {
 					lengths.put(scaledValues[idx], lengths.get(scaledValues[idx]) + fullSizeOffsetsLists[idx].size());
 				}
 				else {
-					Queue<IntArrayList> offsets = new LinkedList<IntArrayList>();
+					Queue<IntArrayList> offsets = new LinkedList<>();
 					offsets.add(fullSizeOffsetsLists[idx]);
 					values.put(scaledValues[idx], offsets);
 					lengths.put(scaledValues[idx], fullSizeOffsetsLists[idx].size());
@@ -347,7 +347,7 @@ public class BitmapEncoder {
 					lengths.put(array, lengths.get(array) + fullSizeOffsetsLists[idx / numColumns].size());
 				}
 				else {
-					Queue<IntArrayList> offsets = new LinkedList<IntArrayList>();
+					Queue<IntArrayList> offsets = new LinkedList<>();
 					offsets.add(fullSizeOffsetsLists[idx / numColumns]);
 					values.put(array, offsets);
 					lengths.put(array, fullSizeOffsetsLists[idx / numColumns].size());
diff --git a/src/main/java/org/apache/sysds/runtime/compress/CompressedMatrixBlock.java b/src/main/java/org/apache/sysds/runtime/compress/CompressedMatrixBlock.java
index db2b8ba..aaf75901 100644
--- a/src/main/java/org/apache/sysds/runtime/compress/CompressedMatrixBlock.java
+++ b/src/main/java/org/apache/sysds/runtime/compress/CompressedMatrixBlock.java
@@ -1638,7 +1638,7 @@ public class CompressedMatrixBlock extends AbstractCompressedMatrixBlock {
 
 		@Override
 		public List<ColGroup> call() {
-			List<ColGroup> res = new ArrayList<ColGroup>();
+			List<ColGroup> res = new ArrayList<>();
 			for(ColGroup x : _colGroups) {
 				res.add(x.scalarOperation(_sop));
 			}
diff --git a/src/test/java/org/apache/sysds/test/functions/builtin/BuiltinNaLocfTest.java b/src/test/java/org/apache/sysds/test/functions/builtin/BuiltinNaLocfTest.java
new file mode 100644
index 0000000..8f5b1e1
--- /dev/null
+++ b/src/test/java/org/apache/sysds/test/functions/builtin/BuiltinNaLocfTest.java
@@ -0,0 +1,119 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.sysds.test.functions.builtin;
+
+import org.apache.commons.lang.ArrayUtils;
+import org.apache.sysds.common.Types;
+import org.apache.sysds.lops.LopProperties;
+import org.apache.sysds.runtime.lineage.LineageCacheConfig.ReuseCacheType;
+import org.apache.sysds.runtime.matrix.data.MatrixValue;
+import org.apache.sysds.test.AutomatedTestBase;
+import org.apache.sysds.test.TestConfiguration;
+import org.apache.sysds.test.TestUtils;
+import org.junit.Test;
+
+import java.util.HashMap;
+
+/**
+ * Compares the DML builtin na_locf against the R reference script
+ * na_locfTest.R for forward ("locf") and backward ("nocb") filling, with and
+ * without lineage reuse, in CP and SPARK execution mode.
+ */
+public class BuiltinNaLocfTest extends AutomatedTestBase {
+	private final static String TEST_NAME = "na_locfTest";
+	private final static String TEST_DIR = "functions/builtin/";
+	private static final String TEST_CLASS_DIR = TEST_DIR + BuiltinNaLocfTest.class.getSimpleName() + "/";
+
+	// tolerance used when comparing the DML result with the R result
+	private final static double eps = 1e-10;
+	private final static int rows = 25;
+	private final static int cols = 25;
+
+	@Override
+	public void setUp() {
+		addTestConfiguration(TEST_NAME, new TestConfiguration(TEST_CLASS_DIR, TEST_NAME, new String[] {"O"}));
+	}
+
+	@Test
+	public void tesLocfNoLineageCP() {
+		runLocfTest(false, "locf", LopProperties.ExecType.CP);
+	}
+
+	@Test
+	public void tesLocfLineageCP() {
+		runLocfTest(true, "locf", LopProperties.ExecType.CP);
+	}
+
+	@Test
+	public void tesLocfNoLineageSPARK() {
+		runLocfTest(false, "locf", LopProperties.ExecType.SPARK);
+	}
+
+	@Test
+	public void tesLocfLineageSPARK() {
+		runLocfTest(true, "locf", LopProperties.ExecType.SPARK);
+	}
+
+	@Test
+	public void tesnocbNoLineageCP() {
+		runLocfTest(false, "nocb", LopProperties.ExecType.CP);
+	}
+
+	@Test
+	public void tesnocbLineageCP() {
+		runLocfTest(true, "nocb", LopProperties.ExecType.CP);
+	}
+
+	@Test
+	public void tesnocbNoLineageSPARK() {
+		runLocfTest(false, "nocb", LopProperties.ExecType.SPARK);
+	}
+
+	@Test
+	public void tesnocbLineageSPARK() {
+		runLocfTest(true, "nocb", LopProperties.ExecType.SPARK);
+	}
+
+	/**
+	 * Runs na_locfTest.dml and na_locfTest.R on the same generated matrix and
+	 * compares their outputs cell by cell.
+	 *
+	 * @param lineage  if true, the DML run additionally gets "-stats -lineage reuse_hybrid"
+	 * @param option   fill direction handed to both scripts ("locf" or "nocb")
+	 * @param instType execution type to run the DML script with
+	 */
+	private void runLocfTest(boolean lineage, String option, LopProperties.ExecType instType) {
+		Types.ExecMode platformOld = setExecMode(instType);
+		try {
+			setOutputBuffering(false);
+			loadTestConfiguration(getTestConfiguration(TEST_NAME));
+			String HOME = SCRIPT_DIR + TEST_DIR;
+			fullDMLScriptName = HOME + TEST_NAME + ".dml";
+			programArgs = new String[] {"-nvargs", "X=" + input("A"), "option=" + option, "O=" + output("O")};
+			if(lineage) {
+				// Locale.ROOT keeps the flag value plain ASCII regardless of the default
+				// locale (a Turkish default locale would lowercase 'I' to a dotless i)
+				String reuse = ReuseCacheType.REUSE_HYBRID.name().toLowerCase(java.util.Locale.ROOT);
+				String[] lin = new String[] {"-stats", "-lineage", reuse};
+				programArgs = (String[]) ArrayUtils.addAll(programArgs, lin);
+			}
+
+			fullRScriptName = HOME + TEST_NAME + ".R";
+			rCmd = getRCmd(inputDir(), option, expectedDir());
+
+			// generate actual dataset; both scripts treat its zero cells as missing values
+			// (0.6 and 7 are presumably sparsity and seed -- confirm against getRandomMatrix)
+			double[][] A = getRandomMatrix(rows, cols, -10, 10, 0.6, 7);
+			writeInputMatrixWithMTD("A", A, true);
+
+			runTest(true, false, null, -1);
+			runRScript(true);
+
+			// compare matrices
+			HashMap<MatrixValue.CellIndex, Double> dmlfile = readDMLMatrixFromHDFS("O");
+			HashMap<MatrixValue.CellIndex, Double> rfile = readRMatrixFromFS("O");
+			TestUtils.compareMatrices(dmlfile, rfile, eps, "Stat-DML", "Stat-R");
+		}
+		finally {
+			// restore the execution mode that was active before this test
+			rtplatform = platformOld;
+		}
+	}
+}
diff --git a/src/test/scripts/functions/builtin/na_locfTest.R b/src/test/scripts/functions/builtin/na_locfTest.R
new file mode 100644
index 0000000..2b19180
--- /dev/null
+++ b/src/test/scripts/functions/builtin/na_locfTest.R
@@ -0,0 +1,34 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+cmdArgs <- commandArgs(TRUE)
+
+library("Matrix")
+library("imputeTS")
+
+# read input matrix A; zeros stand in for missing values
+input <- as.matrix(readMM(paste0(cmdArgs[1], "A.mtx")))
+input[input == 0] <- NA
+
+# reference result from imputeTS; the fill option is forwarded unchanged
+filled <- na_locf(input, option = cmdArgs[2])
+
+writeMM(as(filled, "CsparseMatrix"), paste0(cmdArgs[3], "O"))
\ No newline at end of file
diff --git a/src/test/scripts/functions/builtin/na_locfTest.dml b/src/test/scripts/functions/builtin/na_locfTest.dml
new file mode 100644
index 0000000..f90ce14
--- /dev/null
+++ b/src/test/scripts/functions/builtin/na_locfTest.dml
@@ -0,0 +1,26 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+input = read($X);
+# zeros mark the missing cells: turn them into NaN before imputing
+withMissing = replace(target=input, pattern=0, replacement=NaN)
+filled = na_locf(X=withMissing, option=$option, verbose=FALSE)
+write(filled, $O);