You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@systemml.apache.org by mb...@apache.org on 2017/08/17 05:32:43 UTC

systemml git commit: [SYSTEMML-1846] Fix transformapply w/ subset of column names

Repository: systemml
Updated Branches:
  refs/heads/master 2610a79d2 -> 5b3b990ad


[SYSTEMML-1846] Fix transformapply w/ subset of column names 

This patch fixes special cases of transformapply with a transform
specification based on column names, where the input data has only a
subset of the columns of the given meta data frame. So far, the join
over column names used a binary search on the meta data column names
and thus mistakenly assumed that they are sorted, which failed for
certain scenarios. The positions are now looked up via a hash map of
column names, with an explicit error for names missing in the meta
data. This patch also adds tests to better cover these scenarios.


Project: http://git-wip-us.apache.org/repos/asf/systemml/repo
Commit: http://git-wip-us.apache.org/repos/asf/systemml/commit/5b3b990a
Tree: http://git-wip-us.apache.org/repos/asf/systemml/tree/5b3b990a
Diff: http://git-wip-us.apache.org/repos/asf/systemml/diff/5b3b990a

Branch: refs/heads/master
Commit: 5b3b990ad283d28dc6b13d166311c787bff7039c
Parents: 2610a79
Author: Matthias Boehm <mb...@gmail.com>
Authored: Wed Aug 16 22:32:21 2017 -0700
Committer: Matthias Boehm <mb...@gmail.com>
Committed: Wed Aug 16 22:32:21 2017 -0700

----------------------------------------------------------------------
 .../transform/encode/EncoderFactory.java        | 15 +++++++-
 .../TransformFrameEncodeApplySubsetTest.java    | 38 ++++++++++++++------
 .../TransformFrameEncodeApplySubset.dml         | 32 -----------------
 .../TransformFrameEncodeApplySubset1.dml        | 32 +++++++++++++++++
 .../TransformFrameEncodeApplySubset2.dml        | 32 +++++++++++++++++
 5 files changed, 106 insertions(+), 43 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/systemml/blob/5b3b990a/src/main/java/org/apache/sysml/runtime/transform/encode/EncoderFactory.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/transform/encode/EncoderFactory.java b/src/main/java/org/apache/sysml/runtime/transform/encode/EncoderFactory.java
index df506e0..8cc22a8 100644
--- a/src/main/java/org/apache/sysml/runtime/transform/encode/EncoderFactory.java
+++ b/src/main/java/org/apache/sysml/runtime/transform/encode/EncoderFactory.java
@@ -21,6 +21,7 @@ package org.apache.sysml.runtime.transform.encode;
 
 import java.util.ArrayList;
 import java.util.Arrays;
+import java.util.HashMap;
 import java.util.List;
 
 import org.apache.commons.collections.CollectionUtils;
@@ -101,11 +102,16 @@ public class EncoderFactory
 				if( !TfMetaUtils.isIDSpec(jSpec) && colnames!=null && colnames2!=null 
 					&& !ArrayUtils.isEquals(colnames, colnames2) ) 
 				{
+					HashMap<String, Integer> colPos = getColumnPositions(colnames2);
 					//create temporary meta frame block w/ shallow column copy
 					FrameBlock meta2 = new FrameBlock(meta.getSchema(), colnames2);
 					meta2.setNumRows(meta.getNumRows());
 					for( int i=0; i<colnames.length; i++ ) {
-						int pos = Arrays.binarySearch(colnames2, colnames[i]);
+						if( !colPos.containsKey(colnames[i]) ) {
+							throw new DMLRuntimeException("Column name not found in meta data: "
+								+colnames[i]+" (meta: "+Arrays.toString(colnames2)+")");
+						}
+						int pos = colPos.get(colnames[i]);
 						meta2.setColumn(i, meta.getColumn(pos));
 						meta2.setColumnMetadata(i, meta.getColumnMetadata(pos));
 					}
@@ -120,4 +126,11 @@ public class EncoderFactory
 		
 		return encoder;
 	}
+	
+	private static HashMap<String, Integer> getColumnPositions(String[] colnames) {
+		HashMap<String, Integer> ret = new HashMap<>();
+		for(int i=0; i<colnames.length; i++)
+			ret.put(colnames[i], i);
+		return ret;
+	}
 }

http://git-wip-us.apache.org/repos/asf/systemml/blob/5b3b990a/src/test/java/org/apache/sysml/test/integration/functions/transform/TransformFrameEncodeApplySubsetTest.java
----------------------------------------------------------------------
diff --git a/src/test/java/org/apache/sysml/test/integration/functions/transform/TransformFrameEncodeApplySubsetTest.java b/src/test/java/org/apache/sysml/test/integration/functions/transform/TransformFrameEncodeApplySubsetTest.java
index b06bf92..16e1057 100644
--- a/src/test/java/org/apache/sysml/test/integration/functions/transform/TransformFrameEncodeApplySubsetTest.java
+++ b/src/test/java/org/apache/sysml/test/integration/functions/transform/TransformFrameEncodeApplySubsetTest.java
@@ -30,7 +30,9 @@ import org.apache.sysml.test.utils.TestUtils;
 
 public class TransformFrameEncodeApplySubsetTest extends AutomatedTestBase 
 {
-	private final static String TEST_NAME1 = "TransformFrameEncodeApplySubset";
+	private final static String TEST_NAME1 = "TransformFrameEncodeApplySubset1";
+	private final static String TEST_NAME2 = "TransformFrameEncodeApplySubset2";
+	
 	private final static String TEST_DIR = "functions/transform/";
 	private final static String TEST_CLASS_DIR = TEST_DIR + TransformFrameEncodeApplySubsetTest.class.getSimpleName() + "/";
 	
@@ -41,21 +43,37 @@ public class TransformFrameEncodeApplySubsetTest extends AutomatedTestBase
 	public void setUp()  {
 		TestUtils.clearAssertionInformation();
 		addTestConfiguration(TEST_NAME1, new TestConfiguration(TEST_CLASS_DIR, TEST_NAME1, new String[] { "y" }) );
+		addTestConfiguration(TEST_NAME2, new TestConfiguration(TEST_CLASS_DIR, TEST_NAME2, new String[] { "y" }) );
+	}
+	
+	@Test
+	public void testHomesRecodeColnames1SingleNodeCSV() {
+		runTransformTest(TEST_NAME1, RUNTIME_PLATFORM.SINGLE_NODE, "csv", true);
+	}
+	
+	@Test
+	public void testHomesRecodeColnames1SparkCSV() {
+		runTransformTest(TEST_NAME1, RUNTIME_PLATFORM.SPARK, "csv", true);
+	}
+	
+	@Test
+	public void testHomesRecodeColnames1HybridCSV() {
+		runTransformTest(TEST_NAME1, RUNTIME_PLATFORM.HYBRID_SPARK, "csv", true);
 	}
 	
 	@Test
-	public void testHomesRecodeColnamesSingleNodeCSV() {
-		runTransformTest(RUNTIME_PLATFORM.SINGLE_NODE, "csv", true);
+	public void testHomesRecodeColnames2SingleNodeCSV() {
+		runTransformTest(TEST_NAME2, RUNTIME_PLATFORM.SINGLE_NODE, "csv", true);
 	}
 	
 	@Test
-	public void testHomesRecodeColnamesSparkCSV() {
-		runTransformTest(RUNTIME_PLATFORM.SPARK, "csv", true);
+	public void testHomesRecodeColnames2SparkCSV() {
+		runTransformTest(TEST_NAME2, RUNTIME_PLATFORM.SPARK, "csv", true);
 	}
 	
 	@Test
-	public void testHomesRecodeColnamesHybridCSV() {
-		runTransformTest(RUNTIME_PLATFORM.HYBRID_SPARK, "csv", true);
+	public void testHomesRecodeColnames2HybridCSV() {
+		runTransformTest(TEST_NAME2, RUNTIME_PLATFORM.HYBRID_SPARK, "csv", true);
 	}
 	
 	
@@ -65,7 +83,7 @@ public class TransformFrameEncodeApplySubsetTest extends AutomatedTestBase
 	 * @param ofmt
 	 * @param dataset
 	 */
-	private void runTransformTest(RUNTIME_PLATFORM rt, String ofmt, boolean colnames)
+	private void runTransformTest(String testname, RUNTIME_PLATFORM rt, String ofmt, boolean colnames)
 	{
 		//set runtime platform
 		RUNTIME_PLATFORM rtold = rtplatform;
@@ -80,10 +98,10 @@ public class TransformFrameEncodeApplySubsetTest extends AutomatedTestBase
 		
 		try
 		{
-			getAndLoadTestConfiguration(TEST_NAME1);
+			getAndLoadTestConfiguration(testname);
 			
 			String HOME = SCRIPT_DIR + TEST_DIR;
-			fullDMLScriptName = HOME + TEST_NAME1 + ".dml";
+			fullDMLScriptName = HOME + testname + ".dml";
 			programArgs = new String[]{"-explain", "recompile_hops", "-args", 
 				HOME + "input/" + DATASET1, output("R") };
 	

http://git-wip-us.apache.org/repos/asf/systemml/blob/5b3b990a/src/test/scripts/functions/transform/TransformFrameEncodeApplySubset.dml
----------------------------------------------------------------------
diff --git a/src/test/scripts/functions/transform/TransformFrameEncodeApplySubset.dml b/src/test/scripts/functions/transform/TransformFrameEncodeApplySubset.dml
deleted file mode 100644
index 1e55af4..0000000
--- a/src/test/scripts/functions/transform/TransformFrameEncodeApplySubset.dml
+++ /dev/null
@@ -1,32 +0,0 @@
-#-------------------------------------------------------------
-#
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-# 
-#   http://www.apache.org/licenses/LICENSE-2.0
-# 
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-#
-#-------------------------------------------------------------
-
-F = read($1, data_type="frame", format="csv");
-
-spec = "{ids: false, recode: [ zipcode, district, view ]}";
-[X, M] = transformencode(target=F, spec=spec);
-
-spec2 = "{ids: false, recode: [ district ]}";
-X2 = transformapply(target=F[,2], spec=spec2, meta=M);
-
-R = as.matrix(sum(X[,2]==X2));
-
-write(R, $2);

http://git-wip-us.apache.org/repos/asf/systemml/blob/5b3b990a/src/test/scripts/functions/transform/TransformFrameEncodeApplySubset1.dml
----------------------------------------------------------------------
diff --git a/src/test/scripts/functions/transform/TransformFrameEncodeApplySubset1.dml b/src/test/scripts/functions/transform/TransformFrameEncodeApplySubset1.dml
new file mode 100644
index 0000000..1e55af4
--- /dev/null
+++ b/src/test/scripts/functions/transform/TransformFrameEncodeApplySubset1.dml
@@ -0,0 +1,32 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+# 
+#   http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+F = read($1, data_type="frame", format="csv");
+
+spec = "{ids: false, recode: [ zipcode, district, view ]}";
+[X, M] = transformencode(target=F, spec=spec);
+
+spec2 = "{ids: false, recode: [ district ]}";
+X2 = transformapply(target=F[,2], spec=spec2, meta=M);
+
+R = as.matrix(sum(X[,2]==X2));
+
+write(R, $2);

http://git-wip-us.apache.org/repos/asf/systemml/blob/5b3b990a/src/test/scripts/functions/transform/TransformFrameEncodeApplySubset2.dml
----------------------------------------------------------------------
diff --git a/src/test/scripts/functions/transform/TransformFrameEncodeApplySubset2.dml b/src/test/scripts/functions/transform/TransformFrameEncodeApplySubset2.dml
new file mode 100644
index 0000000..d586e11
--- /dev/null
+++ b/src/test/scripts/functions/transform/TransformFrameEncodeApplySubset2.dml
@@ -0,0 +1,32 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+# 
+#   http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+F = read($1, data_type="frame", format="csv");
+
+spec = "{ids: false, recode: [ zipcode, district, view ]}";
+[X, M] = transformencode(target=F, spec=spec);
+
+spec2 = "{ids: false, recode: [ zipcode ]}";
+X2 = transformapply(target=F[,1], spec=spec2, meta=M);
+
+R = as.matrix(sum(X[,1]==X2));
+
+write(R, $2);