You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@systemds.apache.org by mb...@apache.org on 2021/12/27 20:22:37 UTC

[systemds] branch main updated: [SYSTEMDS-3261] Extended min-max normalization built-in functions

This is an automated email from the ASF dual-hosted git repository.

mboehm7 pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/systemds.git


The following commit(s) were added to refs/heads/main by this push:
     new bd68831  [SYSTEMDS-3261] Extended min-max normalization built-in functions
bd68831 is described below

commit bd688311b262bf759c6efa68999fae13d6126a7d
Author: Matthias Boehm <mb...@gmail.com>
AuthorDate: Mon Dec 27 21:19:22 2021 +0100

    [SYSTEMDS-3261] Extended min-max normalization built-in functions
    
    This patch adds a normalizeApply function, documentation, and
    extended tests for min-max normalization (which is necessary for our
    TPCx-AI implementation).
---
 scripts/builtin/normalize.dml                      | 25 +++++++++++--
 scripts/builtin/normalizeApply.dml                 | 43 ++++++++++++++++++++++
 .../java/org/apache/sysds/common/Builtins.java     |  1 +
 .../builtin/part2/BuiltinNormalizeTest.java        | 43 +++++++++++++++++++---
 .../scripts/functions/builtin/normalizeAll.dml     |  9 +++--
 5 files changed, 108 insertions(+), 13 deletions(-)

diff --git a/scripts/builtin/normalize.dml b/scripts/builtin/normalize.dml
index ac29667..e2a32be 100644
--- a/scripts/builtin/normalize.dml
+++ b/scripts/builtin/normalize.dml
@@ -19,7 +19,26 @@
 #
 #-------------------------------------------------------------
 
-m_normalize = function(Matrix[Double] X) return (Matrix[Double] Y) {
-  # normalize features to range [0,1]
-  Y = (X - colMins(X)) / (colMaxs(X) - colMins(X));
+# Min-max normalization (a.k.a. min-max scaling) to range [0,1]. For matrices 
+# of positive values, this normalization preserves the input sparsity.
+#
+# ------------------------------------------------------------------------------
+# NAME     TYPE     DEFAULT   MEANING
+# ------------------------------------------------------------------------------
+# X        Matrix    ---      Input feature matrix of shape n-by-m
+# ------------------------------------------------------------------------------
+# Y        Matrix    ---      Modified output feature matrix of shape n-by-m
+# cmin     Matrix    ---      Column minima of shape 1-by-m
+# cmax     Matrix    ---      Column maxima of shape 1-by-m
+# ------------------------------------------------------------------------------
+
+
+m_normalize = function(Matrix[Double] X)
+  return (Matrix[Double] Y, Matrix[Double] cmin, Matrix[Double] cmax)
+{
+  # compute per-column minima/maxima; returned so callers can reuse them
+  cmin = colMins(X);
+  cmax = colMaxs(X);
+  # normalize features to range [0,1]
+  Y = normalizeApply(X, cmin, cmax);
+}
diff --git a/scripts/builtin/normalizeApply.dml b/scripts/builtin/normalizeApply.dml
new file mode 100644
index 0000000..07fad33
--- /dev/null
+++ b/scripts/builtin/normalizeApply.dml
@@ -0,0 +1,43 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+# Min-max normalization (a.k.a. min-max scaling) to range [0,1], given 
+# existing min-max ranges. For matrices of positive values, this normalization 
+# preserves the input sparsity. The validity of the provided min-max range
+# and post-processing is under control of the caller. 
+#
+# ------------------------------------------------------------------------------
+# NAME     TYPE     DEFAULT   MEANING
+# ------------------------------------------------------------------------------
+# X        Matrix    ---      Input feature matrix of shape n-by-m
+# cmin     Matrix    ---      Column minima of shape 1-by-m
+# cmax     Matrix    ---      Column maxima of shape 1-by-m
+# ------------------------------------------------------------------------------
+# Y        Matrix    ---      Modified output feature matrix of shape n-by-m
+# ------------------------------------------------------------------------------
+
+
+m_normalizeApply = function(Matrix[Double] X, Matrix[Double] cmin, Matrix[Double] cmax)
+  return (Matrix[Double] Y)
+{
+  # scale by given ranges ([0,1] if true min/max); cmax==cmin divides by zero
+  Y = (X - cmin) / (cmax - cmin);
+}
diff --git a/src/main/java/org/apache/sysds/common/Builtins.java b/src/main/java/org/apache/sysds/common/Builtins.java
index f44cf42..58fced1 100644
--- a/src/main/java/org/apache/sysds/common/Builtins.java
+++ b/src/main/java/org/apache/sysds/common/Builtins.java
@@ -205,6 +205,7 @@ public enum Builtins {
 	NAIVEBAYESPREDICT("naiveBayesPredict", true, false),
 	NCOL("ncol", false),
 	NORMALIZE("normalize", true),
+	NORMALIZEAPPLY("normalizeApply", true),
 	NROW("nrow", false),
 	OUTER("outer", false),
 	OUTLIER("outlier", true, false), //TODO parameterize opposite
diff --git a/src/test/java/org/apache/sysds/test/functions/builtin/part2/BuiltinNormalizeTest.java b/src/test/java/org/apache/sysds/test/functions/builtin/part2/BuiltinNormalizeTest.java
index f87f2ce..6bc7028 100644
--- a/src/test/java/org/apache/sysds/test/functions/builtin/part2/BuiltinNormalizeTest.java
+++ b/src/test/java/org/apache/sysds/test/functions/builtin/part2/BuiltinNormalizeTest.java
@@ -21,17 +21,22 @@ package org.apache.sysds.test.functions.builtin.part2;
 
 import java.util.HashMap;
 
+import org.junit.Assert;
 import org.junit.Test;
+
 import org.apache.sysds.common.Types.ExecMode;
 import org.apache.sysds.common.Types.ExecType;
 import org.apache.sysds.runtime.matrix.data.MatrixValue.CellIndex;
 import org.apache.sysds.test.AutomatedTestBase;
 import org.apache.sysds.test.TestConfiguration;
 import org.apache.sysds.test.TestUtils;
+import org.apache.sysds.utils.Statistics;
 
 public class BuiltinNormalizeTest extends AutomatedTestBase 
 {
 	private final static String TEST_NAME = "normalize";
+	private final static String TEST_NAME2 = "normalizeAll";
+	
 	private final static String TEST_DIR = "functions/builtin/";
 	private static final String TEST_CLASS_DIR = TEST_DIR + BuiltinNormalizeTest.class.getSimpleName() + "/";
 	
@@ -48,25 +53,45 @@ public class BuiltinNormalizeTest extends AutomatedTestBase
 
 	@Test
 	public void testNormalizeMatrixDenseCP() {
-		runNormalizeTest(false, false, ExecType.CP);
+		runNormalizeTest(TEST_NAME, false, ExecType.CP);
 	}
 	
 	@Test
 	public void testNormalizeMatrixSparseCP() {
-		runNormalizeTest(false, true, ExecType.CP);
+		runNormalizeTest(TEST_NAME, true, ExecType.CP);
 	}
 	
 	@Test
 	public void testNormalizeMatrixDenseSP() {
-		runNormalizeTest(false, false, ExecType.SPARK);
+		runNormalizeTest(TEST_NAME, false, ExecType.SPARK);
 	}
 	
 	@Test
 	public void testNormalizeMatrixSparseSP() {
-		runNormalizeTest(false, true, ExecType.SPARK);
+		runNormalizeTest(TEST_NAME, true, ExecType.SPARK);
+	}
+	
+	@Test
+	public void testNormalize2MatrixDenseCP() {
+		runNormalizeTest(TEST_NAME2, false, ExecType.CP);
+	}
+	
+	@Test
+	public void testNormalize2MatrixSparseCP() {
+		runNormalizeTest(TEST_NAME2, true, ExecType.CP);
 	}
 	
-	private void runNormalizeTest(boolean scalar, boolean sparse, ExecType instType)
+	@Test
+	public void testNormalize2MatrixDenseSP() {
+		runNormalizeTest(TEST_NAME2, false, ExecType.SPARK);
+	}
+	
+	@Test
+	public void testNormalize2MatrixSparseSP() {
+		runNormalizeTest(TEST_NAME2, true, ExecType.SPARK);
+	}
+	
+	private void runNormalizeTest(String testname, boolean sparse, ExecType instType)
 	{
 		ExecMode platformOld = setExecMode(instType);
 		
@@ -76,7 +101,7 @@ public class BuiltinNormalizeTest extends AutomatedTestBase
 			double sparsity = sparse ? spSparse : spDense;
 			
 			String HOME = SCRIPT_DIR + TEST_DIR;
-			fullDMLScriptName = HOME + TEST_NAME + ".dml";
+			fullDMLScriptName = HOME + testname + ".dml";
 			programArgs = new String[]{"-args", input("A"), output("B") };
 			fullRScriptName = HOME + TEST_NAME + ".R";
 			rCmd = "Rscript" + " " + fullRScriptName + " " + inputDir() + " " + expectedDir();
@@ -92,6 +117,12 @@ public class BuiltinNormalizeTest extends AutomatedTestBase
 			HashMap<CellIndex, Double> dmlfile = readDMLMatrixFromOutputDir("B");
 			HashMap<CellIndex, Double> rfile  = readRMatrixFromExpectedDir("B");
 			TestUtils.compareMatrices(dmlfile, rfile, eps, "Stat-DML", "Stat-R");
+		
+			//check number of compiled Spark instructions
+			if( instType == ExecType.CP ) {
+				Assert.assertEquals(1, Statistics.getNoOfCompiledSPInst()); //reblock
+				Assert.assertEquals(0, Statistics.getNoOfExecutedSPInst());
+			}
 		}
 		finally {
 			rtplatform = platformOld;
diff --git a/scripts/builtin/normalize.dml b/src/test/scripts/functions/builtin/normalizeAll.dml
similarity index 85%
copy from scripts/builtin/normalize.dml
copy to src/test/scripts/functions/builtin/normalizeAll.dml
index ac29667..a1c7527 100644
--- a/scripts/builtin/normalize.dml
+++ b/src/test/scripts/functions/builtin/normalizeAll.dml
@@ -19,7 +19,8 @@
 #
 #-------------------------------------------------------------
 
-m_normalize = function(Matrix[Double] X) return (Matrix[Double] Y) {
-  # normalize features to range [0,1]
-  Y = (X - colMins(X)) / (colMaxs(X) - colMins(X));
-}
+X = read($1);
+[Y, mins, maxs] = normalize(X);
+Y = normalizeApply(X, mins, maxs);
+
+write(Y, $2);