You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@systemml.apache.org by mb...@apache.org on 2019/01/26 21:44:02 UTC
[systemml] branch master updated: [SYSTEMML-2509] Fix binning
support in transformencode over frames
This is an automated email from the ASF dual-hosted git repository.
mboehm7 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/systemml.git
The following commit(s) were added to refs/heads/master by this push:
new 3d09c4b [SYSTEMML-2509] Fix binning support in transformencode over frames
3d09c4b is described below
commit 3d09c4b1621ef8f7db3841da1e7d36d64298aef1
Author: Matthias Boehm <mb...@gmail.com>
AuthorDate: Sat Jan 26 22:43:41 2019 +0100
[SYSTEMML-2509] Fix binning support in transformencode over frames
This patch fixes missing binning support in transformencode over frames.
So far, only the apply was working properly but no meta data was built,
which corrupted the returned output matrix and meta data. Now, local CP
operations work as intended but distributed operations and sequences of
binning/dummy-coding require additional work.
---
.../sysml/runtime/transform/encode/EncoderBin.java | 114 +++++++++++----------
.../runtime/transform/encode/EncoderFactory.java | 8 +-
.../runtime/transform/encode/EncoderRecode.java | 2 +-
.../sysml/runtime/transform/meta/TfMetaUtils.java | 6 +-
.../transform/TransformEncodeDecodeTest.java | 1 -
.../transform/TransformFrameEncodeApplyTest.java | 16 ++-
.../transform/TransformFrameEncodeApply.dml | 1 -
7 files changed, 81 insertions(+), 67 deletions(-)
diff --git a/src/main/java/org/apache/sysml/runtime/transform/encode/EncoderBin.java b/src/main/java/org/apache/sysml/runtime/transform/encode/EncoderBin.java
index 016adb4..2f94003 100644
--- a/src/main/java/org/apache/sysml/runtime/transform/encode/EncoderBin.java
+++ b/src/main/java/org/apache/sysml/runtime/transform/encode/EncoderBin.java
@@ -35,7 +35,7 @@ import org.apache.sysml.runtime.transform.meta.TfMetaUtils;
import org.apache.sysml.runtime.util.UtilFunctions;
public class EncoderBin extends Encoder
-{
+{
private static final long serialVersionUID = 1917445005206076078L;
public static final String MIN_PREFIX = "min";
@@ -43,70 +43,36 @@ public class EncoderBin extends Encoder
public static final String NBINS_PREFIX = "nbins";
private int[] _numBins = null;
- private double[] _min=null, _max=null; // min and max among non-missing values
//frame transform-apply attributes
+ //TODO binMins is redundant and could be removed
private double[][] _binMins = null;
private double[][] _binMaxs = null;
-
- public EncoderBin(JSONObject parsedSpec, String[] colnames, int clen)
- throws JSONException, IOException
- {
- this(parsedSpec, colnames, clen, false);
- }
- public EncoderBin(JSONObject parsedSpec, String[] colnames, int clen, boolean colsOnly)
+ public EncoderBin(JSONObject parsedSpec, String[] colnames, int clen)
throws JSONException, IOException
{
- super( null, clen );
+ super( null, clen );
if ( !parsedSpec.containsKey(TfUtils.TXMETHOD_BIN) )
return;
- if( colsOnly ) {
- List<Integer> collist = TfMetaUtils.parseBinningColIDs(parsedSpec, colnames);
- initColList(ArrayUtils.toPrimitive(collist.toArray(new Integer[0])));
- }
- else
- {
- JSONObject obj = (JSONObject) parsedSpec.get(TfUtils.TXMETHOD_BIN);
- JSONArray attrs = (JSONArray) obj.get(TfUtils.JSON_ATTRS);
- JSONArray nbins = (JSONArray) obj.get(TfUtils.JSON_NBINS);
- initColList(attrs);
-
- _numBins = new int[attrs.size()];
- for(int i=0; i < _numBins.length; i++)
- _numBins[i] = UtilFunctions.toInt(nbins.get(i));
-
- // initialize internal transformation metadata
- _min = new double[_colList.length];
- Arrays.fill(_min, Double.POSITIVE_INFINITY);
- _max = new double[_colList.length];
- Arrays.fill(_max, Double.NEGATIVE_INFINITY);
- }
- }
-
- public void prepare(String[] words, TfUtils agents) {
- if ( !isApplicable() )
- return;
+ //parse column names or column ids
+ List<Integer> collist = TfMetaUtils.parseBinningColIDs(parsedSpec, colnames);
+ initColList(ArrayUtils.toPrimitive(collist.toArray(new Integer[0])));
- for(int i=0; i <_colList.length; i++) {
- int colID = _colList[i];
-
- String w = null;
- double d = 0;
-
- // equi-width
- w = UtilFunctions.unquote(words[colID-1].trim());
- if(!TfUtils.isNA(agents.getNAStrings(),w)) {
- d = UtilFunctions.parseToDouble(w);
- if(d < _min[i])
- _min[i] = d;
- if(d > _max[i])
- _max[i] = d;
- }
+ //parse number of bins per column
+ boolean ids = parsedSpec.containsKey("ids") && parsedSpec.getBoolean("ids");
+ JSONArray group = (JSONArray) parsedSpec.get(TfUtils.TXMETHOD_BIN);
+ _numBins = new int[collist.size()];
+ for(int i=0; i < _numBins.length; i++) {
+ JSONObject colspec = (JSONObject) group.get(i);
+ int pos = collist.indexOf(ids ? colspec.getInt("id") :
+ ArrayUtils.indexOf(colnames, colspec.get("name"))+1);
+ _numBins[pos] = colspec.containsKey("numbins") ?
+ colspec.getInt("numbins"): 1;
}
}
-
+
@Override
public MatrixBlock encode(FrameBlock in, MatrixBlock out) {
build(in);
@@ -115,7 +81,30 @@ public class EncoderBin extends Encoder
@Override
public void build(FrameBlock in) {
- // nothing to do
+ if ( !isApplicable() )
+ return;
+ // initialize internal transformation metadata
+ _binMins = new double[_colList.length][];
+ _binMaxs = new double[_colList.length][];
+
+ // derive bin boundaries from min/max per column
+ for(int j=0; j <_colList.length; j++) {
+ double min = Double.POSITIVE_INFINITY;
+ double max = Double.NEGATIVE_INFINITY;
+ int colID = _colList[j];
+ for( int i=0; i<in.getNumRows(); i++ ) {
+ double inVal = UtilFunctions.objectToDouble(
+ in.getSchema()[colID-1], in.get(i, colID-1));
+ min = Math.min(min, inVal);
+ max = Math.max(max, inVal);
+ }
+ _binMins[j] = new double[_numBins[j]];
+ _binMaxs[j] = new double[_numBins[j]];
+ for(int i=0; i<_numBins[j]; i++) {
+ _binMins[j][i] = min + i*(max-min)/_numBins[j];
+ _binMaxs[j][i] = min + (i+1)*(max-min)/_numBins[j];
+ }
+ }
}
@Override
@@ -126,20 +115,35 @@ public class EncoderBin extends Encoder
double inVal = UtilFunctions.objectToDouble(
in.getSchema()[colID-1], in.get(i, colID-1));
int ix = Arrays.binarySearch(_binMaxs[j], inVal);
- int binID = ((ix < 0) ? Math.abs(ix+1) : ix) + 1;
+ int binID = ((ix < 0) ? Math.abs(ix+1) : ix) + 1;
out.quickSetValue(i, colID-1, binID);
- }
+ }
}
return out;
}
@Override
public FrameBlock getMetaData(FrameBlock meta) {
+ //serialize the internal state into frame meta data
+ for( int j=0; j<_colList.length; j++ ) {
+ int colID = _colList[j]; //1-based
+ meta.getColumnMetadata(colID-1).setNumDistinct(_numBins[j]);
+ for( int i=0; i<_binMaxs[j].length; i++ ) {
+ StringBuilder sb = new StringBuilder(16);
+ sb.append(_binMins[j][i]);
+ sb.append(Lop.DATATYPE_PREFIX);
+ sb.append(_binMaxs[j][i]);
+ meta.set(i, colID-1, sb.toString());
+ }
+ }
return meta;
}
@Override
public void initMetaData(FrameBlock meta) {
+ if( meta == null || _binMaxs != null )
+ return;
+ //deserialize the frame meta data into internal state
_binMins = new double[_colList.length][];
_binMaxs = new double[_colList.length][];
for( int j=0; j<_colList.length; j++ ) {
diff --git a/src/main/java/org/apache/sysml/runtime/transform/encode/EncoderFactory.java b/src/main/java/org/apache/sysml/runtime/transform/encode/EncoderFactory.java
index 3914f11..3d2a100 100644
--- a/src/main/java/org/apache/sysml/runtime/transform/encode/EncoderFactory.java
+++ b/src/main/java/org/apache/sysml/runtime/transform/encode/EncoderFactory.java
@@ -62,7 +62,7 @@ public class EncoderFactory
List<Integer> dcIDs = Arrays.asList(ArrayUtils.toObject(
TfMetaUtils.parseJsonIDList(jSpec, colnames, TfUtils.TXMETHOD_DUMMYCODE)));
rcIDs = new ArrayList<Integer>(CollectionUtils.union(rcIDs, dcIDs));
- List<Integer> binIDs = TfMetaUtils.parseBinningColIDs(jSpec, colnames);
+ List<Integer> binIDs = TfMetaUtils.parseBinningColIDs(jSpec, colnames);
List<Integer> ptIDs = new ArrayList<Integer>(CollectionUtils.subtract(
CollectionUtils.subtract(UtilFunctions.getSeqList(1, clen, 1), rcIDs), binIDs));
List<Integer> oIDs = Arrays.asList(ArrayUtils.toObject(
@@ -74,15 +74,15 @@ public class EncoderFactory
if( !rcIDs.isEmpty() ) {
EncoderRecode ra = new EncoderRecode(jSpec, colnames, clen);
ra.setColList(ArrayUtils.toPrimitive(rcIDs.toArray(new Integer[0])));
- lencoders.add(ra);
+ lencoders.add(ra);
}
if( !ptIDs.isEmpty() )
lencoders.add(new EncoderPassThrough(
- ArrayUtils.toPrimitive(ptIDs.toArray(new Integer[0])), clen));
+ ArrayUtils.toPrimitive(ptIDs.toArray(new Integer[0])), clen));
if( !dcIDs.isEmpty() )
lencoders.add(new EncoderDummycode(jSpec, colnames, schema.length));
if( !binIDs.isEmpty() )
- lencoders.add(new EncoderBin(jSpec, colnames, schema.length, true));
+ lencoders.add(new EncoderBin(jSpec, colnames, schema.length));
if( !oIDs.isEmpty() )
lencoders.add(new EncoderOmit(jSpec, colnames, schema.length));
if( !mvIDs.isEmpty() ) {
diff --git a/src/main/java/org/apache/sysml/runtime/transform/encode/EncoderRecode.java b/src/main/java/org/apache/sysml/runtime/transform/encode/EncoderRecode.java
index 11667ce..122d29d 100644
--- a/src/main/java/org/apache/sysml/runtime/transform/encode/EncoderRecode.java
+++ b/src/main/java/org/apache/sysml/runtime/transform/encode/EncoderRecode.java
@@ -215,7 +215,7 @@ public class EncoderRecode extends Encoder
* @return string array of token and code
*/
public static String[] splitRecodeMapEntry(String value) {
- // Instead of using splitCSV which is forcing string with RFC-4180 format,
+ // Instead of using splitCSV which is forcing string with RFC-4180 format,
// using Lop.DATATYPE_PREFIX separator to split token and code
int pos = value.toString().lastIndexOf(Lop.DATATYPE_PREFIX);
return new String[] {value.substring(0, pos), value.substring(pos+1)};
diff --git a/src/main/java/org/apache/sysml/runtime/transform/meta/TfMetaUtils.java b/src/main/java/org/apache/sysml/runtime/transform/meta/TfMetaUtils.java
index 2d89502..c3f3b34 100644
--- a/src/main/java/org/apache/sysml/runtime/transform/meta/TfMetaUtils.java
+++ b/src/main/java/org/apache/sysml/runtime/transform/meta/TfMetaUtils.java
@@ -101,7 +101,7 @@ public class TfMetaUtils
ids = true; //file-based transform outputs ids w/o id tags
}
else
- attrs = (JSONArray)spec.get(group);
+ attrs = (JSONArray)spec.get(group);
//construct ID list array
colList = new int[attrs.size()];
@@ -378,11 +378,11 @@ public class TfMetaUtils
try {
if( jSpec.containsKey(TfUtils.TXMETHOD_BIN) && jSpec.get(TfUtils.TXMETHOD_BIN) instanceof JSONArray ) {
return Arrays.asList(ArrayUtils.toObject(
- TfMetaUtils.parseJsonObjectIDList(jSpec, colnames, TfUtils.TXMETHOD_BIN)));
+ TfMetaUtils.parseJsonObjectIDList(jSpec, colnames, TfUtils.TXMETHOD_BIN)));
}
else { //internally generates
return Arrays.asList(ArrayUtils.toObject(
- TfMetaUtils.parseJsonIDList(jSpec, colnames, TfUtils.TXMETHOD_BIN)));
+ TfMetaUtils.parseJsonIDList(jSpec, colnames, TfUtils.TXMETHOD_BIN)));
}
}
catch(JSONException ex) {
diff --git a/src/test/java/org/apache/sysml/test/integration/functions/transform/TransformEncodeDecodeTest.java b/src/test/java/org/apache/sysml/test/integration/functions/transform/TransformEncodeDecodeTest.java
index eeddfb2..9aed893 100644
--- a/src/test/java/org/apache/sysml/test/integration/functions/transform/TransformEncodeDecodeTest.java
+++ b/src/test/java/org/apache/sysml/test/integration/functions/transform/TransformEncodeDecodeTest.java
@@ -23,7 +23,6 @@ import java.util.HashMap;
import java.util.Iterator;
import org.junit.Test;
-import org.apache.sysml.api.DMLScript;
import org.apache.sysml.api.DMLScript.RUNTIME_PLATFORM;
import org.apache.sysml.lops.LopProperties.ExecType;
import org.apache.sysml.runtime.io.FrameReader;
diff --git a/src/test/java/org/apache/sysml/test/integration/functions/transform/TransformFrameEncodeApplyTest.java b/src/test/java/org/apache/sysml/test/integration/functions/transform/TransformFrameEncodeApplyTest.java
index a0343cf..c27a4a2 100644
--- a/src/test/java/org/apache/sysml/test/integration/functions/transform/TransformFrameEncodeApplyTest.java
+++ b/src/test/java/org/apache/sysml/test/integration/functions/transform/TransformFrameEncodeApplyTest.java
@@ -19,6 +19,7 @@
package org.apache.sysml.test.integration.functions.transform;
+import org.junit.Assert;
import org.junit.Test;
import org.apache.sysml.api.DMLScript;
import org.apache.sysml.api.DMLScript.RUNTIME_PLATFORM;
@@ -292,12 +293,23 @@ public class TransformFrameEncodeApplyTest extends AutomatedTestBase
double[][] R2 = DataConverter.convertToDoubleMatrix(MatrixReaderFactory
.createMatrixReader(InputInfo.CSVInputInfo)
.readMatrixFromHDFS(output("tfout2"), -1L, -1L, 1000, 1000, -1));
- TestUtils.compareMatrices(R1, R2, R1.length, R1[0].length, 0);
+ TestUtils.compareMatrices(R1, R2, R1.length, R1[0].length, 0);
if( rt == RUNTIME_PLATFORM.HYBRID_SPARK ) {
- assertEquals("Wrong number of executed Spark instructions: " +
+ assertEquals("Wrong number of executed Spark instructions: " +
Statistics.getNoOfExecutedSPInst(), new Long(2), new Long(Statistics.getNoOfExecutedSPInst()));
}
+
+ //additional checks for binning as encode-decode impossible
+ //TODO fix distributed binning as well
+ if( type == TransformType.BIN && rt != RUNTIME_PLATFORM.SPARK ) {
+ int[] col3 = new int[]{1,4,2,3,3,2,4};
+ int[] col8 = new int[]{1,2,2,2,2,2,3};
+ for(int i=0; i<7; i++) {
+ Assert.assertEquals(col3[i], R1[i][2], 1e-8);
+ Assert.assertEquals(col8[i], R1[i][7], 1e-8);
+ }
+ }
}
catch(Exception ex) {
throw new RuntimeException(ex);
diff --git a/src/test/scripts/functions/transform/TransformFrameEncodeApply.dml b/src/test/scripts/functions/transform/TransformFrameEncodeApply.dml
index f7be1aa..f4132d7 100644
--- a/src/test/scripts/functions/transform/TransformFrameEncodeApply.dml
+++ b/src/test/scripts/functions/transform/TransformFrameEncodeApply.dml
@@ -20,7 +20,6 @@
#-------------------------------------------------------------
F1 = read($DATA, data_type="frame", format="csv");
-
jspec = read($TFSPEC, data_type="scalar", value_type="string");
[X, M] = transformencode(target=F1, spec=jspec);