You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@systemml.apache.org by mb...@apache.org on 2017/08/02 02:31:14 UTC

[2/2] systemml git commit: [SYSTEMML-1824] Fix transformapply by colnames w/ meta data superset

[SYSTEMML-1824] Fix transformapply by colnames w/ meta data superset 

This patch improves the robustness of transformapply. So far, we required
that the input data and meta data have an equal number of columns. With
this change, we support meta data that covers a superset of the input
columns if the transform specification uses column names (which allows a
join by name).


Project: http://git-wip-us.apache.org/repos/asf/systemml/repo
Commit: http://git-wip-us.apache.org/repos/asf/systemml/commit/2c9694de
Tree: http://git-wip-us.apache.org/repos/asf/systemml/tree/2c9694de
Diff: http://git-wip-us.apache.org/repos/asf/systemml/diff/2c9694de

Branch: refs/heads/master
Commit: 2c9694decfe7908cc57f567ee8ff0d279a37d34f
Parents: 61a0931
Author: Matthias Boehm <mb...@gmail.com>
Authored: Tue Aug 1 19:22:03 2017 -0700
Committer: Matthias Boehm <mb...@gmail.com>
Committed: Tue Aug 1 19:30:57 2017 -0700

----------------------------------------------------------------------
 ...ReturnParameterizedBuiltinSPInstruction.java |   2 +-
 .../ParameterizedBuiltinSPInstruction.java      |   2 +-
 .../sysml/runtime/matrix/data/FrameBlock.java   |  12 ++-
 .../transform/encode/EncoderFactory.java        |  24 ++++-
 .../runtime/transform/meta/TfMetaUtils.java     |   8 +-
 .../sysml/runtime/util/DataConverter.java       |   2 +-
 .../TransformFrameEncodeApplySubsetTest.java    | 101 +++++++++++++++++++
 .../TransformFrameEncodeApplySubset.dml         |  32 ++++++
 .../functions/transform/ZPackageSuite.java      |   1 +
 9 files changed, 174 insertions(+), 10 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/systemml/blob/2c9694de/src/main/java/org/apache/sysml/runtime/instructions/spark/MultiReturnParameterizedBuiltinSPInstruction.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/instructions/spark/MultiReturnParameterizedBuiltinSPInstruction.java b/src/main/java/org/apache/sysml/runtime/instructions/spark/MultiReturnParameterizedBuiltinSPInstruction.java
index 254c3d7..586d282 100644
--- a/src/main/java/org/apache/sysml/runtime/instructions/spark/MultiReturnParameterizedBuiltinSPInstruction.java
+++ b/src/main/java/org/apache/sysml/runtime/instructions/spark/MultiReturnParameterizedBuiltinSPInstruction.java
@@ -116,7 +116,7 @@ public class MultiReturnParameterizedBuiltinSPInstruction extends ComputationSPI
 			String spec = ec.getScalarInput(input2.getName(), input2.getValueType(), input2.isLiteral()).getStringValue();
 			MatrixCharacteristics mcIn = sec.getMatrixCharacteristics(input1.getName());
 			MatrixCharacteristics mcOut = sec.getMatrixCharacteristics(output.getName());
-			String[] colnames = !TfMetaUtils.isIDSpecification(spec) ?
+			String[] colnames = !TfMetaUtils.isIDSpec(spec) ?
 					in.lookup(1L).get(0).getColumnNames() : null; 
 					
 			//step 1: build transform meta data

http://git-wip-us.apache.org/repos/asf/systemml/blob/2c9694de/src/main/java/org/apache/sysml/runtime/instructions/spark/ParameterizedBuiltinSPInstruction.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/instructions/spark/ParameterizedBuiltinSPInstruction.java b/src/main/java/org/apache/sysml/runtime/instructions/spark/ParameterizedBuiltinSPInstruction.java
index e62dd60..01f4512 100644
--- a/src/main/java/org/apache/sysml/runtime/instructions/spark/ParameterizedBuiltinSPInstruction.java
+++ b/src/main/java/org/apache/sysml/runtime/instructions/spark/ParameterizedBuiltinSPInstruction.java
@@ -438,7 +438,7 @@ public class ParameterizedBuiltinSPInstruction  extends ComputationSPInstruction
 			FrameBlock meta = sec.getFrameInput(params.get("meta"));		
 			MatrixCharacteristics mcIn = sec.getMatrixCharacteristics(params.get("target"));
 			MatrixCharacteristics mcOut = sec.getMatrixCharacteristics(output.getName());
-			String[] colnames = !TfMetaUtils.isIDSpecification(params.get("spec")) ?
+			String[] colnames = !TfMetaUtils.isIDSpec(params.get("spec")) ?
 					in.lookup(1L).get(0).getColumnNames() : null; 
 			
 			//compute omit offset map for block shifts

http://git-wip-us.apache.org/repos/asf/systemml/blob/2c9694de/src/main/java/org/apache/sysml/runtime/matrix/data/FrameBlock.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/matrix/data/FrameBlock.java b/src/main/java/org/apache/sysml/runtime/matrix/data/FrameBlock.java
index 5e6404b..10b39fe 100644
--- a/src/main/java/org/apache/sysml/runtime/matrix/data/FrameBlock.java
+++ b/src/main/java/org/apache/sysml/runtime/matrix/data/FrameBlock.java
@@ -477,7 +477,7 @@ public class FrameBlock implements Writable, CacheBlock, Externalizable
 		_numRows = cols[0].length;
 	}
 
-	public Object getColumn(int c) {
+	public Object getColumnData(int c) {
 		switch(_schema[c]) {
 			case STRING:  return ((StringArray)_coldata[c])._data; 
 			case BOOLEAN: return ((BooleanArray)_coldata[c])._data;
@@ -487,6 +487,16 @@ public class FrameBlock implements Writable, CacheBlock, Externalizable
 	 	}
 	}
 	
+	public Array getColumn(int c) {
+		return _coldata[c]; 
+	}
+	
+	public void setColumn(int c, Array column) {
+		if( _coldata == null )
+			_coldata = new Array[getNumColumns()];
+		_coldata[c] = column; 
+	}
+	
 	/**
 	 * Get a row iterator over the frame where all fields are encoded
 	 * as strings independent of their value types.  

http://git-wip-us.apache.org/repos/asf/systemml/blob/2c9694de/src/main/java/org/apache/sysml/runtime/transform/encode/EncoderFactory.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/transform/encode/EncoderFactory.java b/src/main/java/org/apache/sysml/runtime/transform/encode/EncoderFactory.java
index 5e0a178..df506e0 100644
--- a/src/main/java/org/apache/sysml/runtime/transform/encode/EncoderFactory.java
+++ b/src/main/java/org/apache/sysml/runtime/transform/encode/EncoderFactory.java
@@ -45,9 +45,9 @@ public class EncoderFactory
 	}
 
 	@SuppressWarnings("unchecked")
-	public static Encoder createEncoder(String spec,  String[] colnames, ValueType[] schema, FrameBlock meta) 
+	public static Encoder createEncoder(String spec, String[] colnames, ValueType[] schema, FrameBlock meta) 
 		throws DMLRuntimeException 
-	{	
+	{
 		Encoder encoder = null;
 		int clen = schema.length;
 		
@@ -93,10 +93,26 @@ public class EncoderFactory
 			}
 			
 			//create composite decoder of all created encoders
-			//and initialize meta data (recode, dummy, bin, mv)
 			encoder = new EncoderComposite(lencoders);
-			if( meta != null )
+			
+			//initialize meta data w/ robustness for superset of cols
+			if( meta != null ) {
+				String[] colnames2 = meta.getColumnNames();
+				if( !TfMetaUtils.isIDSpec(jSpec) && colnames!=null && colnames2!=null 
+					&& !ArrayUtils.isEquals(colnames, colnames2) ) 
+				{
+					//create temporary meta frame block w/ shallow column copy
+					FrameBlock meta2 = new FrameBlock(meta.getSchema(), colnames2);
+					meta2.setNumRows(meta.getNumRows());
+					for( int i=0; i<colnames.length; i++ ) {
+						int pos = Arrays.binarySearch(colnames2, colnames[i]);
+						meta2.setColumn(i, meta.getColumn(pos));
+						meta2.setColumnMetadata(i, meta.getColumnMetadata(pos));
+					}
+					meta = meta2;
+				}
 				encoder.initMetaData(meta);
+			}
 		}
 		catch(Exception ex) {
 			throw new DMLRuntimeException(ex);

http://git-wip-us.apache.org/repos/asf/systemml/blob/2c9694de/src/main/java/org/apache/sysml/runtime/transform/meta/TfMetaUtils.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/transform/meta/TfMetaUtils.java b/src/main/java/org/apache/sysml/runtime/transform/meta/TfMetaUtils.java
index afb7ee9..abaad26 100644
--- a/src/main/java/org/apache/sysml/runtime/transform/meta/TfMetaUtils.java
+++ b/src/main/java/org/apache/sysml/runtime/transform/meta/TfMetaUtils.java
@@ -50,15 +50,19 @@ import org.apache.wink.json4j.JSONObject;
 
 public class TfMetaUtils 
 {
-	public static boolean isIDSpecification(String spec) throws DMLRuntimeException {
+	public static boolean isIDSpec(String spec) throws DMLRuntimeException {
 		try {
 			JSONObject jSpec = new JSONObject(spec);
-			return jSpec.containsKey("ids") && jSpec.getBoolean("ids");
+			return isIDSpec(jSpec);
 		}
 		catch(JSONException ex) {
 			throw new DMLRuntimeException(ex);
 		}
 	}
+	
+	public static boolean isIDSpec(JSONObject spec) throws JSONException {
+		return spec.containsKey("ids") && spec.getBoolean("ids");
+	}
 
 	public static boolean containsOmitSpec(String spec, String[] colnames) throws DMLRuntimeException {
 		return (TfMetaUtils.parseJsonIDList(spec, colnames, TfUtils.TXMETHOD_OMIT).length > 0);	

http://git-wip-us.apache.org/repos/asf/systemml/blob/2c9694de/src/main/java/org/apache/sysml/runtime/util/DataConverter.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/util/DataConverter.java b/src/main/java/org/apache/sysml/runtime/util/DataConverter.java
index ac1e80e..10f043b 100644
--- a/src/main/java/org/apache/sysml/runtime/util/DataConverter.java
+++ b/src/main/java/org/apache/sysml/runtime/util/DataConverter.java
@@ -528,7 +528,7 @@ public class DataConverter
 			double[][] a = new double[n][];
 			double[] c = mb.getDenseBlock();
 			for( int j=0; j<n; j++ )
-				a[j] = (double[])frame.getColumn(j);			
+				a[j] = (double[])frame.getColumnData(j);			
 			int blocksizeIJ = 16; //blocks of a+overhead/c in L1 cache
 			for( int bi=0; bi<m; bi+=blocksizeIJ )
 				for( int bj=0; bj<n; bj+=blocksizeIJ ) {

http://git-wip-us.apache.org/repos/asf/systemml/blob/2c9694de/src/test/java/org/apache/sysml/test/integration/functions/transform/TransformFrameEncodeApplySubsetTest.java
----------------------------------------------------------------------
diff --git a/src/test/java/org/apache/sysml/test/integration/functions/transform/TransformFrameEncodeApplySubsetTest.java b/src/test/java/org/apache/sysml/test/integration/functions/transform/TransformFrameEncodeApplySubsetTest.java
new file mode 100644
index 0000000..b06bf92
--- /dev/null
+++ b/src/test/java/org/apache/sysml/test/integration/functions/transform/TransformFrameEncodeApplySubsetTest.java
@@ -0,0 +1,101 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ * 
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.sysml.test.integration.functions.transform;
+
+import org.junit.Assert;
+import org.junit.Test;
+import org.apache.sysml.api.DMLScript;
+import org.apache.sysml.api.DMLScript.RUNTIME_PLATFORM;
+import org.apache.sysml.runtime.matrix.data.MatrixValue.CellIndex;
+import org.apache.sysml.test.integration.AutomatedTestBase;
+import org.apache.sysml.test.integration.TestConfiguration;
+import org.apache.sysml.test.utils.TestUtils;
+
+public class TransformFrameEncodeApplySubsetTest extends AutomatedTestBase 
+{
+	private final static String TEST_NAME1 = "TransformFrameEncodeApplySubset";
+	private final static String TEST_DIR = "functions/transform/";
+	private final static String TEST_CLASS_DIR = TEST_DIR + TransformFrameEncodeApplySubsetTest.class.getSimpleName() + "/";
+	
+	//dataset and transform tasks without missing values
+	private final static String DATASET1 	= "homes3/homes.csv";
+	
+	@Override
+	public void setUp()  {
+		TestUtils.clearAssertionInformation();
+		addTestConfiguration(TEST_NAME1, new TestConfiguration(TEST_CLASS_DIR, TEST_NAME1, new String[] { "y" }) );
+	}
+	
+	@Test
+	public void testHomesRecodeColnamesSingleNodeCSV() {
+		runTransformTest(RUNTIME_PLATFORM.SINGLE_NODE, "csv", true);
+	}
+	
+	@Test
+	public void testHomesRecodeColnamesSparkCSV() {
+		runTransformTest(RUNTIME_PLATFORM.SPARK, "csv", true);
+	}
+	
+	@Test
+	public void testHomesRecodeColnamesHybridCSV() {
+		runTransformTest(RUNTIME_PLATFORM.HYBRID_SPARK, "csv", true);
+	}
+	
+	
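+	/**
+	 * Runs the TransformFrameEncodeApplySubset script and checks its result.
+	 * @param rt runtime platform to execute the script on
+	 * @param ofmt output format (only "csv" is accepted)
+	 * @param colnames currently not read by this method
+	 */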
+	private void runTransformTest(RUNTIME_PLATFORM rt, String ofmt, boolean colnames)
+	{
+		//set runtime platform
+		RUNTIME_PLATFORM rtold = rtplatform;
+		rtplatform = rt;
+
+		boolean sparkConfigOld = DMLScript.USE_LOCAL_SPARK_CONFIG;
+		if( rtplatform == RUNTIME_PLATFORM.SPARK || rtplatform == RUNTIME_PLATFORM.HYBRID_SPARK)
+			DMLScript.USE_LOCAL_SPARK_CONFIG = true;
+
+		if( !ofmt.equals("csv") )
+			throw new RuntimeException("Unsupported test output format");
+		
+		try
+		{
+			getAndLoadTestConfiguration(TEST_NAME1);
+			
+			String HOME = SCRIPT_DIR + TEST_DIR;
+			fullDMLScriptName = HOME + TEST_NAME1 + ".dml";
+			programArgs = new String[]{"-explain", "recompile_hops", "-args", 
+				HOME + "input/" + DATASET1, output("R") };
+	
+			runTest(true, false, null, -1); 
+			
+			//check output 
+			Assert.assertEquals(Double.valueOf(148), 
+				readDMLMatrixFromHDFS("R").get(new CellIndex(1,1)));
+		}
+		finally {
+			rtplatform = rtold;
+			DMLScript.USE_LOCAL_SPARK_CONFIG = sparkConfigOld;
+		}
+	}
+}
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/systemml/blob/2c9694de/src/test/scripts/functions/transform/TransformFrameEncodeApplySubset.dml
----------------------------------------------------------------------
diff --git a/src/test/scripts/functions/transform/TransformFrameEncodeApplySubset.dml b/src/test/scripts/functions/transform/TransformFrameEncodeApplySubset.dml
new file mode 100644
index 0000000..1e55af4
--- /dev/null
+++ b/src/test/scripts/functions/transform/TransformFrameEncodeApplySubset.dml
@@ -0,0 +1,32 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+# 
+#   http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+F = read($1, data_type="frame", format="csv");
+
+spec = "{ids: false, recode: [ zipcode, district, view ]}";
+[X, M] = transformencode(target=F, spec=spec);
+
+spec2 = "{ids: false, recode: [ district ]}";
+X2 = transformapply(target=F[,2], spec=spec2, meta=M);
+
+R = as.matrix(sum(X[,2]==X2));
+
+write(R, $2);

http://git-wip-us.apache.org/repos/asf/systemml/blob/2c9694de/src/test_suites/java/org/apache/sysml/test/integration/functions/transform/ZPackageSuite.java
----------------------------------------------------------------------
diff --git a/src/test_suites/java/org/apache/sysml/test/integration/functions/transform/ZPackageSuite.java b/src/test_suites/java/org/apache/sysml/test/integration/functions/transform/ZPackageSuite.java
index 645a468..736cfc7 100644
--- a/src/test_suites/java/org/apache/sysml/test/integration/functions/transform/ZPackageSuite.java
+++ b/src/test_suites/java/org/apache/sysml/test/integration/functions/transform/ZPackageSuite.java
@@ -30,6 +30,7 @@ import org.junit.runners.Suite;
 	TransformCSVFrameEncodeDecodeTest.class,
 	TransformCSVFrameEncodeReadTest.class,
 	TransformEncodeDecodeTest.class,
+	TransformFrameEncodeApplySubsetTest.class,
 	TransformFrameEncodeApplyTest.class,
 	TransformFrameEncodeDecodeTest.class,
 	TransformFrameEncodeDecodeTokenTest.class,