You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@systemds.apache.org by mb...@apache.org on 2021/12/27 20:22:37 UTC
[systemds] branch main updated: [SYSTEMDS-3261] Extended min-max normalization built-in functions
This is an automated email from the ASF dual-hosted git repository.
mboehm7 pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/systemds.git
The following commit(s) were added to refs/heads/main by this push:
new bd68831 [SYSTEMDS-3261] Extended min-max normalization built-in functions
bd68831 is described below
commit bd688311b262bf759c6efa68999fae13d6126a7d
Author: Matthias Boehm <mb...@gmail.com>
AuthorDate: Mon Dec 27 21:19:22 2021 +0100
[SYSTEMDS-3261] Extended min-max normalization built-in functions
This patch adds a normalizeApply function, documentation, and
extended tests for min-max normalization (which is necessary for our
TPCx-AI implementation).
---
scripts/builtin/normalize.dml | 25 +++++++++++--
scripts/builtin/normalizeApply.dml | 43 ++++++++++++++++++++++
.../java/org/apache/sysds/common/Builtins.java | 1 +
.../builtin/part2/BuiltinNormalizeTest.java | 43 +++++++++++++++++++---
.../scripts/functions/builtin/normalizeAll.dml | 9 +++--
5 files changed, 108 insertions(+), 13 deletions(-)
diff --git a/scripts/builtin/normalize.dml b/scripts/builtin/normalize.dml
index ac29667..e2a32be 100644
--- a/scripts/builtin/normalize.dml
+++ b/scripts/builtin/normalize.dml
@@ -19,7 +19,26 @@
#
#-------------------------------------------------------------
-m_normalize = function(Matrix[Double] X) return (Matrix[Double] Y) {
- # normalize features to range [0,1]
- Y = (X - colMins(X)) / (colMaxs(X) - colMins(X));
+# Min-max normalization (a.k.a. min-max scaling) to range [0,1]. For matrices
+# of positive values, this normalization preserves the input sparsity.
+#
+# ------------------------------------------------------------------------------
+# NAME TYPE DEFAULT MEANING
+# ------------------------------------------------------------------------------
+# X Matrix --- Input feature matrix of shape n-by-m
+# ------------------------------------------------------------------------------
+# Y Matrix --- Modified output feature matrix of shape n-by-m
+# cmin Matrix --- Column minima of shape 1-by-m
+# cmax Matrix --- Column maxima of shape 1-by-m
+# ------------------------------------------------------------------------------
+
+
+m_normalize = function(Matrix[Double] X)
+ return (Matrix[Double] Y, Matrix[Double] cmin, Matrix[Double] cmax)
+{
+ # compute feature ranges for transformations
+ cmin = colMins(X);
+ cmax = colMaxs(X);
+ # normalize features to range [0,1]
+ Y = normalizeApply(X, cmin, cmax);
}
diff --git a/scripts/builtin/normalizeApply.dml b/scripts/builtin/normalizeApply.dml
new file mode 100644
index 0000000..07fad33
--- /dev/null
+++ b/scripts/builtin/normalizeApply.dml
@@ -0,0 +1,43 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+# Min-max normalization (a.k.a. min-max scaling) to range [0,1], given
+# existing min-max ranges. For matrices of positive values, this normalization
+# preserves the input sparsity. The validity of the provided min-max range
+# and post-processing is under control of the caller.
+#
+# ------------------------------------------------------------------------------
+# NAME TYPE DEFAULT MEANING
+# ------------------------------------------------------------------------------
+# X Matrix --- Input feature matrix of shape n-by-m
+# cmin Matrix --- Column minima of shape 1-by-m
+# cmax Matrix --- Column maxima of shape 1-by-m
+# ------------------------------------------------------------------------------
+# Y Matrix --- Modified output feature matrix of shape n-by-m
+# ------------------------------------------------------------------------------
+
+
+m_normalizeApply = function(Matrix[Double] X, Matrix[Double] cmin, Matrix[Double] cmax)
+ return (Matrix[Double] Y)
+{
+ # normalize features to given range ([0,1] if indeed min/max)
+ Y = (X - cmin) / (cmax - cmin);
+}
diff --git a/src/main/java/org/apache/sysds/common/Builtins.java b/src/main/java/org/apache/sysds/common/Builtins.java
index f44cf42..58fced1 100644
--- a/src/main/java/org/apache/sysds/common/Builtins.java
+++ b/src/main/java/org/apache/sysds/common/Builtins.java
@@ -205,6 +205,7 @@ public enum Builtins {
NAIVEBAYESPREDICT("naiveBayesPredict", true, false),
NCOL("ncol", false),
NORMALIZE("normalize", true),
+ NORMALIZEAPPLY("normalizeApply", true),
NROW("nrow", false),
OUTER("outer", false),
OUTLIER("outlier", true, false), //TODO parameterize opposite
diff --git a/src/test/java/org/apache/sysds/test/functions/builtin/part2/BuiltinNormalizeTest.java b/src/test/java/org/apache/sysds/test/functions/builtin/part2/BuiltinNormalizeTest.java
index f87f2ce..6bc7028 100644
--- a/src/test/java/org/apache/sysds/test/functions/builtin/part2/BuiltinNormalizeTest.java
+++ b/src/test/java/org/apache/sysds/test/functions/builtin/part2/BuiltinNormalizeTest.java
@@ -21,17 +21,22 @@ package org.apache.sysds.test.functions.builtin.part2;
import java.util.HashMap;
+import org.junit.Assert;
import org.junit.Test;
+
import org.apache.sysds.common.Types.ExecMode;
import org.apache.sysds.common.Types.ExecType;
import org.apache.sysds.runtime.matrix.data.MatrixValue.CellIndex;
import org.apache.sysds.test.AutomatedTestBase;
import org.apache.sysds.test.TestConfiguration;
import org.apache.sysds.test.TestUtils;
+import org.apache.sysds.utils.Statistics;
public class BuiltinNormalizeTest extends AutomatedTestBase
{
private final static String TEST_NAME = "normalize";
+ private final static String TEST_NAME2 = "normalizeAll";
+
private final static String TEST_DIR = "functions/builtin/";
private static final String TEST_CLASS_DIR = TEST_DIR + BuiltinNormalizeTest.class.getSimpleName() + "/";
@@ -48,25 +53,45 @@ public class BuiltinNormalizeTest extends AutomatedTestBase
@Test
public void testNormalizeMatrixDenseCP() {
- runNormalizeTest(false, false, ExecType.CP);
+ runNormalizeTest(TEST_NAME, false, ExecType.CP);
}
@Test
public void testNormalizeMatrixSparseCP() {
- runNormalizeTest(false, true, ExecType.CP);
+ runNormalizeTest(TEST_NAME, true, ExecType.CP);
}
@Test
public void testNormalizeMatrixDenseSP() {
- runNormalizeTest(false, false, ExecType.SPARK);
+ runNormalizeTest(TEST_NAME, false, ExecType.SPARK);
}
@Test
public void testNormalizeMatrixSparseSP() {
- runNormalizeTest(false, true, ExecType.SPARK);
+ runNormalizeTest(TEST_NAME, true, ExecType.SPARK);
+ }
+
+ @Test
+ public void testNormalize2MatrixDenseCP() {
+ runNormalizeTest(TEST_NAME2, false, ExecType.CP);
+ }
+
+ @Test
+ public void testNormalize2MatrixSparseCP() {
+ runNormalizeTest(TEST_NAME2, true, ExecType.CP);
}
- private void runNormalizeTest(boolean scalar, boolean sparse, ExecType instType)
+ @Test
+ public void testNormalize2MatrixDenseSP() {
+ runNormalizeTest(TEST_NAME2, false, ExecType.SPARK);
+ }
+
+ @Test
+ public void testNormalize2MatrixSparseSP() {
+ runNormalizeTest(TEST_NAME2, true, ExecType.SPARK);
+ }
+
+ private void runNormalizeTest(String testname, boolean sparse, ExecType instType)
{
ExecMode platformOld = setExecMode(instType);
@@ -76,7 +101,7 @@ public class BuiltinNormalizeTest extends AutomatedTestBase
double sparsity = sparse ? spSparse : spDense;
String HOME = SCRIPT_DIR + TEST_DIR;
- fullDMLScriptName = HOME + TEST_NAME + ".dml";
+ fullDMLScriptName = HOME + testname + ".dml";
programArgs = new String[]{"-args", input("A"), output("B") };
fullRScriptName = HOME + TEST_NAME + ".R";
rCmd = "Rscript" + " " + fullRScriptName + " " + inputDir() + " " + expectedDir();
@@ -92,6 +117,12 @@ public class BuiltinNormalizeTest extends AutomatedTestBase
HashMap<CellIndex, Double> dmlfile = readDMLMatrixFromOutputDir("B");
HashMap<CellIndex, Double> rfile = readRMatrixFromExpectedDir("B");
TestUtils.compareMatrices(dmlfile, rfile, eps, "Stat-DML", "Stat-R");
+
+ //check number of compiled Spark instructions
+ if( instType == ExecType.CP ) {
+ Assert.assertEquals(1, Statistics.getNoOfCompiledSPInst()); //reblock
+ Assert.assertEquals(0, Statistics.getNoOfExecutedSPInst());
+ }
}
finally {
rtplatform = platformOld;
diff --git a/scripts/builtin/normalize.dml b/src/test/scripts/functions/builtin/normalizeAll.dml
similarity index 85%
copy from scripts/builtin/normalize.dml
copy to src/test/scripts/functions/builtin/normalizeAll.dml
index ac29667..a1c7527 100644
--- a/scripts/builtin/normalize.dml
+++ b/src/test/scripts/functions/builtin/normalizeAll.dml
@@ -19,7 +19,8 @@
#
#-------------------------------------------------------------
-m_normalize = function(Matrix[Double] X) return (Matrix[Double] Y) {
- # normalize features to range [0,1]
- Y = (X - colMins(X)) / (colMaxs(X) - colMins(X));
-}
+X = read($1);
+[Y, mins, maxs] = normalize(X);
+Y = normalizeApply(X, mins, maxs);
+
+write(Y, $2);