You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@systemml.apache.org by mb...@apache.org on 2019/01/26 21:44:02 UTC

[systemml] branch master updated: [SYSTEMML-2509] Fix binning support in transformencode over frames

This is an automated email from the ASF dual-hosted git repository.

mboehm7 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/systemml.git


The following commit(s) were added to refs/heads/master by this push:
     new 3d09c4b  [SYSTEMML-2509] Fix binning support in transformencode over frames
3d09c4b is described below

commit 3d09c4b1621ef8f7db3841da1e7d36d64298aef1
Author: Matthias Boehm <mb...@gmail.com>
AuthorDate: Sat Jan 26 22:43:41 2019 +0100

    [SYSTEMML-2509] Fix binning support in transformencode over frames
    
    This patch fixes missing binning support in transformencode over frames.
    So far, only the apply was working properly but no meta data was built,
    which corrupted the returned output matrix and meta data. Now, local CP
    operations work as intended but distributed operations and sequences of
    binning/dummy-coding require additional work.
---
 .../sysml/runtime/transform/encode/EncoderBin.java | 114 +++++++++++----------
 .../runtime/transform/encode/EncoderFactory.java   |   8 +-
 .../runtime/transform/encode/EncoderRecode.java    |   2 +-
 .../sysml/runtime/transform/meta/TfMetaUtils.java  |   6 +-
 .../transform/TransformEncodeDecodeTest.java       |   1 -
 .../transform/TransformFrameEncodeApplyTest.java   |  16 ++-
 .../transform/TransformFrameEncodeApply.dml        |   1 -
 7 files changed, 81 insertions(+), 67 deletions(-)

diff --git a/src/main/java/org/apache/sysml/runtime/transform/encode/EncoderBin.java b/src/main/java/org/apache/sysml/runtime/transform/encode/EncoderBin.java
index 016adb4..2f94003 100644
--- a/src/main/java/org/apache/sysml/runtime/transform/encode/EncoderBin.java
+++ b/src/main/java/org/apache/sysml/runtime/transform/encode/EncoderBin.java
@@ -35,7 +35,7 @@ import org.apache.sysml.runtime.transform.meta.TfMetaUtils;
 import org.apache.sysml.runtime.util.UtilFunctions;
 
 public class EncoderBin extends Encoder 
-{	
+{
 	private static final long serialVersionUID = 1917445005206076078L;
 
 	public static final String MIN_PREFIX = "min";
@@ -43,70 +43,36 @@ public class EncoderBin extends Encoder
 	public static final String NBINS_PREFIX = "nbins";
 
 	private int[] _numBins = null;
-	private double[] _min=null, _max=null;	// min and max among non-missing values
 	
 	//frame transform-apply attributes
+	//TODO binMins is redundant and could be removed
 	private double[][] _binMins = null;
 	private double[][] _binMaxs = null;
-	
-	public EncoderBin(JSONObject parsedSpec, String[] colnames, int clen) 
-		throws JSONException, IOException 
-	{
-		this(parsedSpec, colnames, clen, false);
-	}
 
-	public EncoderBin(JSONObject parsedSpec, String[] colnames, int clen, boolean colsOnly) 
+	public EncoderBin(JSONObject parsedSpec, String[] colnames, int clen) 
 		throws JSONException, IOException 
 	{
-		super( null, clen );		
+		super( null, clen );
 		if ( !parsedSpec.containsKey(TfUtils.TXMETHOD_BIN) )
 			return;
 		
-		if( colsOnly ) {
-			List<Integer> collist = TfMetaUtils.parseBinningColIDs(parsedSpec, colnames);
-			initColList(ArrayUtils.toPrimitive(collist.toArray(new Integer[0])));
-		}
-		else 
-		{
-			JSONObject obj = (JSONObject) parsedSpec.get(TfUtils.TXMETHOD_BIN);		
-			JSONArray attrs = (JSONArray) obj.get(TfUtils.JSON_ATTRS);
-			JSONArray nbins = (JSONArray) obj.get(TfUtils.JSON_NBINS);
-			initColList(attrs);
-			
-			_numBins = new int[attrs.size()];
-			for(int i=0; i < _numBins.length; i++)
-				_numBins[i] = UtilFunctions.toInt(nbins.get(i)); 
-			
-			// initialize internal transformation metadata
-			_min = new double[_colList.length];
-			Arrays.fill(_min, Double.POSITIVE_INFINITY);
-			_max = new double[_colList.length];
-			Arrays.fill(_max, Double.NEGATIVE_INFINITY);
-		}
-	}
-
-	public void prepare(String[] words, TfUtils agents) {
-		if ( !isApplicable() )
-			return;
+		//parse column names or column ids
+		List<Integer> collist = TfMetaUtils.parseBinningColIDs(parsedSpec, colnames);
+		initColList(ArrayUtils.toPrimitive(collist.toArray(new Integer[0])));
 		
-		for(int i=0; i <_colList.length; i++) {
-			int colID = _colList[i];
-			
-			String w = null;
-			double d = 0;
-				
-			// equi-width
-			w = UtilFunctions.unquote(words[colID-1].trim());
-			if(!TfUtils.isNA(agents.getNAStrings(),w)) {
-				d = UtilFunctions.parseToDouble(w);
-				if(d < _min[i])
-					_min[i] = d;
-				if(d > _max[i])
-					_max[i] = d;
-			}
+		//parse number of bins per column
+		boolean ids = parsedSpec.containsKey("ids") && parsedSpec.getBoolean("ids");
+		JSONArray group = (JSONArray) parsedSpec.get(TfUtils.TXMETHOD_BIN);
+		_numBins = new int[collist.size()];
+		for(int i=0; i < _numBins.length; i++) {
+			JSONObject colspec = (JSONObject) group.get(i);
+			int pos = collist.indexOf(ids ? colspec.getInt("id") :
+				ArrayUtils.indexOf(colnames, colspec.get("name"))+1);
+			_numBins[pos] = colspec.containsKey("numbins") ?
+				colspec.getInt("numbins"): 1;
 		}
 	}
-		
+	
 	@Override
 	public MatrixBlock encode(FrameBlock in, MatrixBlock out) {
 		build(in);
@@ -115,7 +81,30 @@ public class EncoderBin extends Encoder
 
 	@Override
 	public void build(FrameBlock in) {
-		// nothing to do
+		if ( !isApplicable() )
+			return;
+		// initialize internal transformation metadata
+		_binMins = new double[_colList.length][];
+		_binMaxs = new double[_colList.length][];
+		
+		// derive bin boundaries from min/max per column
+		for(int j=0; j <_colList.length; j++) {
+			double min = Double.POSITIVE_INFINITY;
+			double max = Double.NEGATIVE_INFINITY;
+			int colID = _colList[j];
+			for( int i=0; i<in.getNumRows(); i++ ) {
+				double inVal = UtilFunctions.objectToDouble(
+					in.getSchema()[colID-1], in.get(i, colID-1));
+				min = Math.min(min, inVal);
+				max = Math.max(max, inVal);
+			}
+			_binMins[j] = new double[_numBins[j]];
+			_binMaxs[j] = new double[_numBins[j]];
+			for(int i=0; i<_numBins[j]; i++) {
+				_binMins[j][i] = min + i*(max-min)/_numBins[j];
+				_binMaxs[j][i] = min + (i+1)*(max-min)/_numBins[j];
+			}
+		}
 	}
 	
 	@Override
@@ -126,20 +115,35 @@ public class EncoderBin extends Encoder
 				double inVal = UtilFunctions.objectToDouble(
 						in.getSchema()[colID-1], in.get(i, colID-1));
 				int ix = Arrays.binarySearch(_binMaxs[j], inVal);
-				int binID = ((ix < 0) ? Math.abs(ix+1) : ix) + 1;		
+				int binID = ((ix < 0) ? Math.abs(ix+1) : ix) + 1;
 				out.quickSetValue(i, colID-1, binID);
-			}	
+			}
 		}
 		return out;
 	}
 
 	@Override
 	public FrameBlock getMetaData(FrameBlock meta) {
+		//serialize the internal state into frame meta data
+		for( int j=0; j<_colList.length; j++ ) {
+			int colID = _colList[j]; //1-based
+			meta.getColumnMetadata(colID-1).setNumDistinct(_numBins[j]);
+			for( int i=0; i<_binMaxs[j].length; i++ ) {
+				StringBuilder sb = new StringBuilder(16);
+				sb.append(_binMins[j][i]);
+				sb.append(Lop.DATATYPE_PREFIX);
+				sb.append(_binMaxs[j][i]);
+				meta.set(i, colID-1, sb.toString());
+			}
+		}
 		return meta;
 	}
 	
 	@Override
 	public void initMetaData(FrameBlock meta) {
+		if( meta == null || _binMaxs != null )
+			return;
+		//deserialize the frame meta data into internal state
 		_binMins = new double[_colList.length][];
 		_binMaxs = new double[_colList.length][];
 		for( int j=0; j<_colList.length; j++ ) {
diff --git a/src/main/java/org/apache/sysml/runtime/transform/encode/EncoderFactory.java b/src/main/java/org/apache/sysml/runtime/transform/encode/EncoderFactory.java
index 3914f11..3d2a100 100644
--- a/src/main/java/org/apache/sysml/runtime/transform/encode/EncoderFactory.java
+++ b/src/main/java/org/apache/sysml/runtime/transform/encode/EncoderFactory.java
@@ -62,7 +62,7 @@ public class EncoderFactory
 			List<Integer> dcIDs = Arrays.asList(ArrayUtils.toObject(
 					TfMetaUtils.parseJsonIDList(jSpec, colnames, TfUtils.TXMETHOD_DUMMYCODE))); 
 			rcIDs = new ArrayList<Integer>(CollectionUtils.union(rcIDs, dcIDs));
-			List<Integer> binIDs = TfMetaUtils.parseBinningColIDs(jSpec, colnames); 
+			List<Integer> binIDs = TfMetaUtils.parseBinningColIDs(jSpec, colnames);
 			List<Integer> ptIDs = new ArrayList<Integer>(CollectionUtils.subtract(
 					CollectionUtils.subtract(UtilFunctions.getSeqList(1, clen, 1), rcIDs), binIDs)); 
 			List<Integer> oIDs = Arrays.asList(ArrayUtils.toObject(
@@ -74,15 +74,15 @@ public class EncoderFactory
 			if( !rcIDs.isEmpty() ) {
 				EncoderRecode ra = new EncoderRecode(jSpec, colnames, clen);
 				ra.setColList(ArrayUtils.toPrimitive(rcIDs.toArray(new Integer[0])));
-				lencoders.add(ra);	
+				lencoders.add(ra);
 			}
 			if( !ptIDs.isEmpty() )
 				lencoders.add(new EncoderPassThrough(
-						ArrayUtils.toPrimitive(ptIDs.toArray(new Integer[0])), clen));	
+					ArrayUtils.toPrimitive(ptIDs.toArray(new Integer[0])), clen));
 			if( !dcIDs.isEmpty() )
 				lencoders.add(new EncoderDummycode(jSpec, colnames, schema.length));
 			if( !binIDs.isEmpty() )
-				lencoders.add(new EncoderBin(jSpec, colnames, schema.length, true));
+				lencoders.add(new EncoderBin(jSpec, colnames, schema.length));
 			if( !oIDs.isEmpty() )
 				lencoders.add(new EncoderOmit(jSpec, colnames, schema.length));
 			if( !mvIDs.isEmpty() ) {
diff --git a/src/main/java/org/apache/sysml/runtime/transform/encode/EncoderRecode.java b/src/main/java/org/apache/sysml/runtime/transform/encode/EncoderRecode.java
index 11667ce..122d29d 100644
--- a/src/main/java/org/apache/sysml/runtime/transform/encode/EncoderRecode.java
+++ b/src/main/java/org/apache/sysml/runtime/transform/encode/EncoderRecode.java
@@ -215,7 +215,7 @@ public class EncoderRecode extends Encoder
 	 * @return string array of token and code
 	 */
 	public static String[] splitRecodeMapEntry(String value) {
-		// Instead of using splitCSV which is forcing string with RFC-4180 format, 
+		// Instead of using splitCSV which is forcing string with RFC-4180 format,
 		// using Lop.DATATYPE_PREFIX separator to split token and code 
 		int pos = value.toString().lastIndexOf(Lop.DATATYPE_PREFIX);
 		return new String[] {value.substring(0, pos), value.substring(pos+1)};
diff --git a/src/main/java/org/apache/sysml/runtime/transform/meta/TfMetaUtils.java b/src/main/java/org/apache/sysml/runtime/transform/meta/TfMetaUtils.java
index 2d89502..c3f3b34 100644
--- a/src/main/java/org/apache/sysml/runtime/transform/meta/TfMetaUtils.java
+++ b/src/main/java/org/apache/sysml/runtime/transform/meta/TfMetaUtils.java
@@ -101,7 +101,7 @@ public class TfMetaUtils
 				ids = true; //file-based transform outputs ids w/o id tags
 			}
 			else
-				attrs = (JSONArray)spec.get(group);			
+				attrs = (JSONArray)spec.get(group);
 			
 			//construct ID list array
 			colList = new int[attrs.size()];
@@ -378,11 +378,11 @@ public class TfMetaUtils
 		try {
 			if( jSpec.containsKey(TfUtils.TXMETHOD_BIN) && jSpec.get(TfUtils.TXMETHOD_BIN) instanceof JSONArray ) {
 				return Arrays.asList(ArrayUtils.toObject(
-						TfMetaUtils.parseJsonObjectIDList(jSpec, colnames, TfUtils.TXMETHOD_BIN)));	
+						TfMetaUtils.parseJsonObjectIDList(jSpec, colnames, TfUtils.TXMETHOD_BIN)));
 			}
 			else { //internally generates
 				return Arrays.asList(ArrayUtils.toObject(
-						TfMetaUtils.parseJsonIDList(jSpec, colnames, TfUtils.TXMETHOD_BIN)));	
+						TfMetaUtils.parseJsonIDList(jSpec, colnames, TfUtils.TXMETHOD_BIN)));
 			}
 		}
 		catch(JSONException ex) {
diff --git a/src/test/java/org/apache/sysml/test/integration/functions/transform/TransformEncodeDecodeTest.java b/src/test/java/org/apache/sysml/test/integration/functions/transform/TransformEncodeDecodeTest.java
index eeddfb2..9aed893 100644
--- a/src/test/java/org/apache/sysml/test/integration/functions/transform/TransformEncodeDecodeTest.java
+++ b/src/test/java/org/apache/sysml/test/integration/functions/transform/TransformEncodeDecodeTest.java
@@ -23,7 +23,6 @@ import java.util.HashMap;
 import java.util.Iterator;
 
 import org.junit.Test;
-import org.apache.sysml.api.DMLScript;
 import org.apache.sysml.api.DMLScript.RUNTIME_PLATFORM;
 import org.apache.sysml.lops.LopProperties.ExecType;
 import org.apache.sysml.runtime.io.FrameReader;
diff --git a/src/test/java/org/apache/sysml/test/integration/functions/transform/TransformFrameEncodeApplyTest.java b/src/test/java/org/apache/sysml/test/integration/functions/transform/TransformFrameEncodeApplyTest.java
index a0343cf..c27a4a2 100644
--- a/src/test/java/org/apache/sysml/test/integration/functions/transform/TransformFrameEncodeApplyTest.java
+++ b/src/test/java/org/apache/sysml/test/integration/functions/transform/TransformFrameEncodeApplyTest.java
@@ -19,6 +19,7 @@
 
 package org.apache.sysml.test.integration.functions.transform;
 
+import org.junit.Assert;
 import org.junit.Test;
 import org.apache.sysml.api.DMLScript;
 import org.apache.sysml.api.DMLScript.RUNTIME_PLATFORM;
@@ -292,12 +293,23 @@ public class TransformFrameEncodeApplyTest extends AutomatedTestBase
 			double[][] R2 = DataConverter.convertToDoubleMatrix(MatrixReaderFactory
 				.createMatrixReader(InputInfo.CSVInputInfo)
 				.readMatrixFromHDFS(output("tfout2"), -1L, -1L, 1000, 1000, -1));
-			TestUtils.compareMatrices(R1, R2, R1.length, R1[0].length, 0);		
+			TestUtils.compareMatrices(R1, R2, R1.length, R1[0].length, 0);
 			
 			if( rt == RUNTIME_PLATFORM.HYBRID_SPARK ) {
-				assertEquals("Wrong number of executed Spark instructions: " + 
+				assertEquals("Wrong number of executed Spark instructions: " +
 					Statistics.getNoOfExecutedSPInst(), new Long(2), new Long(Statistics.getNoOfExecutedSPInst()));
 			}
+			
+			//additional checks for binning as encode-decode impossible
+			//TODO fix distributed binning as well
+			if( type == TransformType.BIN && rt != RUNTIME_PLATFORM.SPARK ) {
+				int[] col3 = new int[]{1,4,2,3,3,2,4};
+				int[] col8 = new int[]{1,2,2,2,2,2,3};
+				for(int i=0; i<7; i++) {
+					Assert.assertEquals(col3[i], R1[i][2], 1e-8);
+					Assert.assertEquals(col8[i], R1[i][7], 1e-8);
+				}
+			}
 		}
 		catch(Exception ex) {
 			throw new RuntimeException(ex);
diff --git a/src/test/scripts/functions/transform/TransformFrameEncodeApply.dml b/src/test/scripts/functions/transform/TransformFrameEncodeApply.dml
index f7be1aa..f4132d7 100644
--- a/src/test/scripts/functions/transform/TransformFrameEncodeApply.dml
+++ b/src/test/scripts/functions/transform/TransformFrameEncodeApply.dml
@@ -20,7 +20,6 @@
 #-------------------------------------------------------------
 
 F1 = read($DATA, data_type="frame", format="csv");
-
 jspec = read($TFSPEC, data_type="scalar", value_type="string");
 
 [X, M] = transformencode(target=F1, spec=jspec);