You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@systemml.apache.org by mb...@apache.org on 2017/09/12 01:32:12 UTC

systemml git commit: [SYSTEMML-1902] Fix transformencode w/ mix of recode/dummycode columns

Repository: systemml
Updated Branches:
  refs/heads/master cddd2a4f6 -> 816a900b5


[SYSTEMML-1902] Fix transformencode w/ mix of recode/dummycode columns

This patch fixes an issue of transformencode for scenarios where the
transform specification contains a mix of recoded and dummy coded
columns. Specifically, recoded columns that were not in the set of
dummy coded columns took their values from the original input frame
(instead of the recoded output) whenever there was at least one dummy
coded column.

Furthermore, this patch adds related test cases, improves error
handling, and improves the performance of dummy coding (and subsequent
operations) by creating the output in sparse format whenever applicable.


Project: http://git-wip-us.apache.org/repos/asf/systemml/repo
Commit: http://git-wip-us.apache.org/repos/asf/systemml/commit/816a900b
Tree: http://git-wip-us.apache.org/repos/asf/systemml/tree/816a900b
Diff: http://git-wip-us.apache.org/repos/asf/systemml/diff/816a900b

Branch: refs/heads/master
Commit: 816a900b5be4979a2cc11a326feddc709a4177ef
Parents: cddd2a4
Author: Matthias Boehm <mb...@gmail.com>
Authored: Mon Sep 11 18:32:00 2017 -0700
Committer: Matthias Boehm <mb...@gmail.com>
Committed: Mon Sep 11 18:32:12 2017 -0700

----------------------------------------------------------------------
 .../sysml/runtime/transform/encode/Encoder.java |  4 ++
 .../transform/encode/EncoderComposite.java      | 59 +++++++++++++-----
 .../transform/encode/EncoderDummycode.java      | 34 ++++++-----
 .../runtime/transform/encode/EncoderRecode.java |  6 +-
 .../TransformFrameEncodeApplyTest.java          | 64 ++++++++++++++------
 .../input/homes3/homes.tfspec_recode_dummy.json |  2 +
 .../homes3/homes.tfspec_recode_dummy2.json      |  3 +
 7 files changed, 119 insertions(+), 53 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/systemml/blob/816a900b/src/main/java/org/apache/sysml/runtime/transform/encode/Encoder.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/transform/encode/Encoder.java b/src/main/java/org/apache/sysml/runtime/transform/encode/Encoder.java
index e4af8a6..4117c67 100644
--- a/src/main/java/org/apache/sysml/runtime/transform/encode/Encoder.java
+++ b/src/main/java/org/apache/sysml/runtime/transform/encode/Encoder.java
@@ -21,6 +21,9 @@ package org.apache.sysml.runtime.transform.encode;
 
 import java.io.Serializable;
 import java.util.Arrays;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
 import org.apache.sysml.runtime.matrix.data.FrameBlock;
 import org.apache.sysml.runtime.matrix.data.MatrixBlock;
 import org.apache.sysml.runtime.util.UtilFunctions;
@@ -34,6 +37,7 @@ import org.apache.wink.json4j.JSONArray;
 public abstract class Encoder implements Serializable
 {
 	private static final long serialVersionUID = 2299156350718979064L;
+	protected static final Log LOG = LogFactory.getLog(Encoder.class.getName());
 	
 	protected int _clen = -1; 
 	protected int[] _colList = null;

http://git-wip-us.apache.org/repos/asf/systemml/blob/816a900b/src/main/java/org/apache/sysml/runtime/transform/encode/EncoderComposite.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/transform/encode/EncoderComposite.java b/src/main/java/org/apache/sysml/runtime/transform/encode/EncoderComposite.java
index ffff1df..c04a011 100644
--- a/src/main/java/org/apache/sysml/runtime/transform/encode/EncoderComposite.java
+++ b/src/main/java/org/apache/sysml/runtime/transform/encode/EncoderComposite.java
@@ -19,6 +19,7 @@
 
 package org.apache.sysml.runtime.transform.encode;
 
+import java.util.Arrays;
 import java.util.List;
 
 import org.apache.sysml.parser.Expression.ValueType;
@@ -57,21 +58,27 @@ public class EncoderComposite extends Encoder
 	
 	@Override
 	public MatrixBlock encode(FrameBlock in, MatrixBlock out) {
-		//build meta data first (for all encoders)
-		for( Encoder encoder : _encoders )
-			encoder.build(in);
-		
-		//propagate meta data 
-		_meta = new FrameBlock(in.getNumColumns(), ValueType.STRING);
-		for( Encoder encoder : _encoders )
-			_meta = encoder.getMetaData(_meta);
-		for( Encoder encoder : _encoders )
-			encoder.initMetaData(_meta);
-		
-		//apply meta data
-		for( Encoder encoder : _encoders )
-			out = encoder.apply(in, out);
+		try {
+			//build meta data first (for all encoders)
+			for( Encoder encoder : _encoders )
+				encoder.build(in);
 			
+			//propagate meta data 
+			_meta = new FrameBlock(in.getNumColumns(), ValueType.STRING);
+			for( Encoder encoder : _encoders )
+				_meta = encoder.getMetaData(_meta);
+			for( Encoder encoder : _encoders )
+				encoder.initMetaData(_meta);
+			
+			//apply meta data
+			for( Encoder encoder : _encoders )
+				out = encoder.apply(in, out);
+		}
+		catch(Exception ex) {
+			LOG.error("Failed transform-encode frame with \n" + this);
+			throw ex;
+		}
+		
 		return out;
 	}
 
@@ -83,8 +90,14 @@ public class EncoderComposite extends Encoder
 	
 	@Override 
 	public MatrixBlock apply(FrameBlock in, MatrixBlock out) {
-		for( Encoder encoder : _encoders )
-			out = encoder.apply(in, out);
+		try {
+			for( Encoder encoder : _encoders )
+				out = encoder.apply(in, out);
+		}
+		catch(Exception ex) {
+			LOG.error("Failed to transform-apply frame with \n" + this);
+			throw ex;
+		}
 		return out;
 	}
 	
@@ -102,4 +115,18 @@ public class EncoderComposite extends Encoder
 		for( Encoder encoder : _encoders )
 			encoder.initMetaData(out);
 	}
+	
+	@Override
+	public String toString() {
+		StringBuilder sb = new StringBuilder();
+		sb.append("CompositeEncoder("+_encoders.size()+"):\n");
+		for( Encoder encoder : _encoders ) {
+			sb.append("-- ");
+			sb.append(encoder.getClass().getSimpleName());
+			sb.append(": ");
+			sb.append(Arrays.toString(encoder.getColList()));
+			sb.append("\n");
+		}
+		return sb.toString();
+	}
 }

http://git-wip-us.apache.org/repos/asf/systemml/blob/816a900b/src/main/java/org/apache/sysml/runtime/transform/encode/EncoderDummycode.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/transform/encode/EncoderDummycode.java b/src/main/java/org/apache/sysml/runtime/transform/encode/EncoderDummycode.java
index 9a2f059..9d7a5e9 100644
--- a/src/main/java/org/apache/sysml/runtime/transform/encode/EncoderDummycode.java
+++ b/src/main/java/org/apache/sysml/runtime/transform/encode/EncoderDummycode.java
@@ -23,16 +23,15 @@ import org.apache.sysml.runtime.matrix.data.FrameBlock;
 import org.apache.sysml.runtime.matrix.data.MatrixBlock;
 import org.apache.sysml.runtime.transform.TfUtils;
 import org.apache.sysml.runtime.transform.meta.TfMetaUtils;
-import org.apache.sysml.runtime.util.UtilFunctions;
 import org.apache.wink.json4j.JSONException;
 import org.apache.wink.json4j.JSONObject;
 
 public class EncoderDummycode extends Encoder 
-{		
+{
 	private static final long serialVersionUID = 5832130477659116489L;
 
-	private int[] _domainSizes = null;			// length = #of dummycoded columns
-	private long _dummycodedLength = 0;			// #of columns after dummycoded
+	private int[] _domainSizes = null;  // length = #of dummycoded columns
+	private long _dummycodedLength = 0; // #of columns after dummycoded
 
 	public EncoderDummycode(JSONObject parsedSpec, String[] colnames, int clen) throws JSONException {
 		super(null, clen);
@@ -59,26 +58,29 @@ public class EncoderDummycode extends Encoder
 	}
 	
 	@Override
-	public MatrixBlock apply(FrameBlock in, MatrixBlock out) 
-	{
-		MatrixBlock ret = new MatrixBlock(out.getNumRows(), (int)_dummycodedLength, false);
+	public MatrixBlock apply(FrameBlock in, MatrixBlock out) {
+		//allocate output in dense or sparse representation
+		final boolean sparse = MatrixBlock.evalSparseFormatInMemory(
+			out.getNumRows(), getNumCols(), out.getNonZeros());
+		MatrixBlock ret = new MatrixBlock(out.getNumRows(), getNumCols(), sparse);
 		
+		//append dummy coded or unchanged values to output
+		final int clen = out.getNumColumns();
 		for( int i=0; i<out.getNumRows(); i++ ) {
-			for(int colID=1, idx=0, ncolID=1; colID <= out.getNumColumns(); colID++) {
+			for(int colID=1, idx=0, ncolID=1; colID <= clen; colID++) {
 				double val = out.quickGetValue(i, colID-1);
-				if(idx < _colList.length && colID==_colList[idx]) {
-					ret.quickSetValue(i, ncolID-1+(int)val-1, 1);
+				if( idx < _colList.length && colID==_colList[idx] ) {
+					ret.appendValue(i, ncolID-1+(int)val-1, 1);
 					ncolID += _domainSizes[idx];
-					idx++;
+					idx ++;
 				}
 				else {
-					double ptval = UtilFunctions.objectToDouble(in.getSchema()[colID-1], in.get(i, colID-1));
-					ret.quickSetValue(i, ncolID-1, ptval);
-					ncolID++;
+					double ptval = out.quickGetValue(i, colID-1);
+					ret.appendValue(i, ncolID-1, ptval);
+					ncolID ++;
 				}
 			}
 		}
-		
 		return ret;
 	}
 
@@ -95,7 +97,7 @@ public class EncoderDummycode extends Encoder
 		for( int j=0; j<_colList.length; j++ ) {
 			int colID = _colList[j]; //1-based
 			_domainSizes[j] = (int)meta.getColumnMetadata()[colID-1].getNumDistinct();
-			_dummycodedLength +=  _domainSizes[j]-1;
+			_dummycodedLength += _domainSizes[j]-1;
 		}
 	}
 }

http://git-wip-us.apache.org/repos/asf/systemml/blob/816a900b/src/main/java/org/apache/sysml/runtime/transform/encode/EncoderRecode.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/transform/encode/EncoderRecode.java b/src/main/java/org/apache/sysml/runtime/transform/encode/EncoderRecode.java
index 2a7e405..d090e66 100644
--- a/src/main/java/org/apache/sysml/runtime/transform/encode/EncoderRecode.java
+++ b/src/main/java/org/apache/sysml/runtime/transform/encode/EncoderRecode.java
@@ -33,9 +33,9 @@ import org.apache.wink.json4j.JSONException;
 import org.apache.wink.json4j.JSONObject;
 
 public class EncoderRecode extends Encoder 
-{	
+{
 	private static final long serialVersionUID = 8213163881283341874L;
-
+	
 	//recode maps and custom map for partial recode maps 
 	private HashMap<Integer, HashMap<String, Long>> _rcdMaps  = new HashMap<Integer, HashMap<String, Long>>();
 	private HashMap<Integer, HashSet<Object>> _rcdMapsPart = null;
@@ -80,7 +80,7 @@ public class EncoderRecode extends Encoder
 	@Override
 	public void build(FrameBlock in) {
 		if( !isApplicable() )
-			return;		
+			return;
 
 		Iterator<String[]> iter = in.getStringRowIterator(_colList);
 		while( iter.hasNext() ) {

http://git-wip-us.apache.org/repos/asf/systemml/blob/816a900b/src/test/java/org/apache/sysml/test/integration/functions/transform/TransformFrameEncodeApplyTest.java
----------------------------------------------------------------------
diff --git a/src/test/java/org/apache/sysml/test/integration/functions/transform/TransformFrameEncodeApplyTest.java b/src/test/java/org/apache/sysml/test/integration/functions/transform/TransformFrameEncodeApplyTest.java
index 405661b..5e1350b 100644
--- a/src/test/java/org/apache/sysml/test/integration/functions/transform/TransformFrameEncodeApplyTest.java
+++ b/src/test/java/org/apache/sysml/test/integration/functions/transform/TransformFrameEncodeApplyTest.java
@@ -38,24 +38,27 @@ public class TransformFrameEncodeApplyTest extends AutomatedTestBase
 	private final static String TEST_CLASS_DIR = TEST_DIR + TransformFrameEncodeApplyTest.class.getSimpleName() + "/";
 	
 	//dataset and transform tasks without missing values
-	private final static String DATASET1 	= "homes3/homes.csv";
-	private final static String SPEC1 		= "homes3/homes.tfspec_recode.json"; 
-	private final static String SPEC1b 		= "homes3/homes.tfspec_recode2.json"; 
-	private final static String SPEC2 		= "homes3/homes.tfspec_dummy.json";
-	private final static String SPEC2b 		= "homes3/homes.tfspec_dummy2.json";
-	private final static String SPEC3 		= "homes3/homes.tfspec_bin.json"; //incl recode
-	private final static String SPEC3b 		= "homes3/homes.tfspec_bin2.json"; //incl recode
+	private final static String DATASET1 = "homes3/homes.csv";
+	private final static String SPEC1    = "homes3/homes.tfspec_recode.json"; 
+	private final static String SPEC1b   = "homes3/homes.tfspec_recode2.json"; 
+	private final static String SPEC2    = "homes3/homes.tfspec_dummy.json";
+	private final static String SPEC2b   = "homes3/homes.tfspec_dummy2.json";
+	private final static String SPEC3    = "homes3/homes.tfspec_bin.json"; //incl recode
+	private final static String SPEC3b   = "homes3/homes.tfspec_bin2.json"; //incl recode
+	private final static String SPEC6    = "homes3/homes.tfspec_recode_dummy.json"; 
+	private final static String SPEC6b   = "homes3/homes.tfspec_recode_dummy2.json"; 
 	
 	//dataset and transform tasks with missing values
-	private final static String DATASET2 	= "homes/homes.csv";
-	private final static String SPEC4 		= "homes3/homes.tfspec_impute.json";
-	private final static String SPEC4b 		= "homes3/homes.tfspec_impute2.json";
-	private final static String SPEC5 		= "homes3/homes.tfspec_omit.json";
-	private final static String SPEC5b 		= "homes3/homes.tfspec_omit2.json";
+	private final static String DATASET2 = "homes/homes.csv";
+	private final static String SPEC4    = "homes3/homes.tfspec_impute.json";
+	private final static String SPEC4b   = "homes3/homes.tfspec_impute2.json";
+	private final static String SPEC5    = "homes3/homes.tfspec_omit.json";
+	private final static String SPEC5b   = "homes3/homes.tfspec_omit2.json";
 	
 	public enum TransformType {
 		RECODE,
 		DUMMY,
+		RECODE_DUMMY,
 		BIN,
 		IMPUTE,
 		OMIT,
@@ -98,6 +101,21 @@ public class TransformFrameEncodeApplyTest extends AutomatedTestBase
 	}
 	
 	@Test
+	public void testHomesRecodeDummycodeIDsSingleNodeCSV() {
+		runTransformTest(RUNTIME_PLATFORM.SINGLE_NODE, "csv", TransformType.RECODE_DUMMY, false);
+	}
+	
+	@Test
+	public void testHomesRecodeDummycodeIDsSparkCSV() {
+		runTransformTest(RUNTIME_PLATFORM.SPARK, "csv", TransformType.RECODE_DUMMY, false);
+	}
+	
+	@Test
+	public void testHomesRecodeDummycodeIDsHybridCSV() {
+		runTransformTest(RUNTIME_PLATFORM.HYBRID_SPARK, "csv", TransformType.RECODE_DUMMY, false);
+	}
+	
+	@Test
 	public void testHomesBinningIDsSingleNodeCSV() {
 		runTransformTest(RUNTIME_PLATFORM.SINGLE_NODE, "csv", TransformType.BIN, false);
 	}
@@ -173,6 +191,21 @@ public class TransformFrameEncodeApplyTest extends AutomatedTestBase
 	}
 	
 	@Test
+	public void testHomesRecodeDummycodeColnamesSingleNodeCSV() {
+		runTransformTest(RUNTIME_PLATFORM.SINGLE_NODE, "csv", TransformType.RECODE_DUMMY, true);
+	}
+	
+	@Test
+	public void testHomesRecodeDummycodeColnamesSparkCSV() {
+		runTransformTest(RUNTIME_PLATFORM.SPARK, "csv", TransformType.RECODE_DUMMY, true);
+	}
+	
+	@Test
+	public void testHomesRecodeDummycodeColnamesHybridCSV() {
+		runTransformTest(RUNTIME_PLATFORM.HYBRID_SPARK, "csv", TransformType.RECODE_DUMMY, true);
+	}
+	
+	@Test
 	public void testHomesBinningColnamesSingleNodeCSV() {
 		runTransformTest(RUNTIME_PLATFORM.SINGLE_NODE, "csv", TransformType.BIN, true);
 	}
@@ -217,12 +250,6 @@ public class TransformFrameEncodeApplyTest extends AutomatedTestBase
 		runTransformTest(RUNTIME_PLATFORM.HYBRID_SPARK, "csv", TransformType.IMPUTE, true);
 	}
 	
-	/**
-	 * 
-	 * @param rt
-	 * @param ofmt
-	 * @param dataset
-	 */
 	private void runTransformTest( RUNTIME_PLATFORM rt, String ofmt, TransformType type, boolean colnames )
 	{
 		//set runtime platform
@@ -241,6 +268,7 @@ public class TransformFrameEncodeApplyTest extends AutomatedTestBase
 			case BIN:    SPEC = colnames?SPEC3b:SPEC3; DATASET = DATASET1; break;
 			case IMPUTE: SPEC = colnames?SPEC4b:SPEC4; DATASET = DATASET2; break;
 			case OMIT:   SPEC = colnames?SPEC5b:SPEC5; DATASET = DATASET2; break;
+			case RECODE_DUMMY: SPEC = colnames?SPEC6b:SPEC6; DATASET = DATASET1; break;
 		}
 
 		if( !ofmt.equals("csv") )

http://git-wip-us.apache.org/repos/asf/systemml/blob/816a900b/src/test/scripts/functions/transform/input/homes3/homes.tfspec_recode_dummy.json
----------------------------------------------------------------------
diff --git a/src/test/scripts/functions/transform/input/homes3/homes.tfspec_recode_dummy.json b/src/test/scripts/functions/transform/input/homes3/homes.tfspec_recode_dummy.json
new file mode 100644
index 0000000..cb54cf3
--- /dev/null
+++ b/src/test/scripts/functions/transform/input/homes3/homes.tfspec_recode_dummy.json
@@ -0,0 +1,2 @@
+{
+ "ids": true, "recode": [ 2, 1, 7 ], "dummycode": [ 1, 7 ] }
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/systemml/blob/816a900b/src/test/scripts/functions/transform/input/homes3/homes.tfspec_recode_dummy2.json
----------------------------------------------------------------------
diff --git a/src/test/scripts/functions/transform/input/homes3/homes.tfspec_recode_dummy2.json b/src/test/scripts/functions/transform/input/homes3/homes.tfspec_recode_dummy2.json
new file mode 100644
index 0000000..872095b
--- /dev/null
+++ b/src/test/scripts/functions/transform/input/homes3/homes.tfspec_recode_dummy2.json
@@ -0,0 +1,3 @@
+{
+ "recode": [ "district", "zipcode", "view" ], 
+ "dummycode": [ "zipcode", "view"  ] }
\ No newline at end of file