You are viewing a plain text version of this content. The canonical link for it is here.

Posted to commits@systemml.apache.org by mb...@apache.org on 2017/04/12 06:21:26 UTC

[1/2] incubator-systemml git commit: [SYSTEMML-1513] Additional unary/binary codegen row vector primitives

Repository: incubator-systemml
Updated Branches:
  refs/heads/master 2bc266298 -> fb82482b0


[SYSTEMML-1513] Additional unary/binary codegen row vector primitives

This patch adds compiler and runtime support for the following commonly
used row vector primitives: abs, round, ceil, floor, sign, pow2, mult2,
sqrt, min, max, plus, pow. Furthermore, this also includes additional
tests for complex rowwise fusion patterns.

Project: http://git-wip-us.apache.org/repos/asf/incubator-systemml/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-systemml/commit/f9f70b3a
Tree: http://git-wip-us.apache.org/repos/asf/incubator-systemml/tree/f9f70b3a
Diff: http://git-wip-us.apache.org/repos/asf/incubator-systemml/diff/f9f70b3a

Branch: refs/heads/master
Commit: f9f70b3a216a77414d81fe44dcb3a25cbc8a902d
Parents: 2bc2662
Author: Matthias Boehm <mb...@gmail.com>
Authored: Tue Apr 11 20:40:07 2017 -0700
Committer: Matthias Boehm <mb...@gmail.com>
Committed: Tue Apr 11 20:40:07 2017 -0700

----------------------------------------------------------------------
 .../sysml/hops/codegen/cplan/CNodeBinary.java   |  38 ++-
 .../sysml/hops/codegen/cplan/CNodeUnary.java    |  46 ++-
 .../hops/codegen/template/TemplateRow.java      |   8 +-
 .../runtime/codegen/LibSpoofPrimitives.java     | 324 ++++++++++++++++++-
 .../functions/codegen/RowAggTmplTest.java       |  34 +-
 .../scripts/functions/codegen/rowAggPattern14.R |  34 ++
 .../functions/codegen/rowAggPattern14.dml       |  28 ++
 .../scripts/functions/codegen/rowAggPattern15.R |  35 ++
 .../functions/codegen/rowAggPattern15.dml       |  29 ++
 9 files changed, 553 insertions(+), 23 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/f9f70b3a/src/main/java/org/apache/sysml/hops/codegen/cplan/CNodeBinary.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/hops/codegen/cplan/CNodeBinary.java b/src/main/java/org/apache/sysml/hops/codegen/cplan/CNodeBinary.java
index 4d54cd1..180d352 100644
--- a/src/main/java/org/apache/sysml/hops/codegen/cplan/CNodeBinary.java
+++ b/src/main/java/org/apache/sysml/hops/codegen/cplan/CNodeBinary.java
@@ -29,10 +29,12 @@ public class CNodeBinary extends CNode
 {
 	public enum BinType {
 		DOT_PRODUCT,
-		VECT_MULT_ADD, VECT_DIV_ADD, VECT_MINUS_ADD,
+		VECT_MULT_ADD, VECT_DIV_ADD, VECT_MINUS_ADD, VECT_PLUS_ADD,
+		VECT_POW_ADD, VECT_MIN_ADD, VECT_MAX_ADD,
 		VECT_EQUAL_ADD, VECT_NOTEQUAL_ADD, VECT_LESS_ADD, 
 		VECT_LESSEQUAL_ADD, VECT_GREATER_ADD, VECT_GREATEREQUAL_ADD,
-		VECT_MULT_SCALAR, VECT_DIV_SCALAR, VECT_MINUS_SCALAR, 
+		VECT_MULT_SCALAR, VECT_DIV_SCALAR, VECT_MINUS_SCALAR, VECT_PLUS_SCALAR,
+		VECT_POW_SCALAR, VECT_MIN_SCALAR, VECT_MAX_SCALAR,
 		VECT_EQUAL_SCALAR, VECT_NOTEQUAL_SCALAR, VECT_LESS_SCALAR, 
 		VECT_LESSEQUAL_SCALAR, VECT_GREATER_SCALAR, VECT_GREATEREQUAL_SCALAR,
 		MULT, DIV, PLUS, MINUS, MODULUS, INTDIV, 
@@ -61,6 +63,11 @@ public class CNodeBinary extends CNode
 				
 				case VECT_MULT_ADD:
 				case VECT_DIV_ADD:
+				case VECT_MINUS_ADD:
+				case VECT_PLUS_ADD:
+				case VECT_POW_ADD:
+				case VECT_MIN_ADD:
+				case VECT_MAX_ADD:	
 				case VECT_EQUAL_ADD:
 				case VECT_NOTEQUAL_ADD:
 				case VECT_LESS_ADD:
@@ -72,9 +79,13 @@ public class CNodeBinary extends CNode
 									"    LibSpoofPrimitives.vect"+vectName+"Add(%IN1%, %IN2%, %OUT%, %POS1%, %POSOUT%, %LEN%);\n";
 				}
 				
+				case VECT_MULT_SCALAR:
 				case VECT_DIV_SCALAR:
 				case VECT_MINUS_SCALAR:
-				case VECT_MULT_SCALAR:
+				case VECT_PLUS_SCALAR:
+				case VECT_POW_SCALAR:
+				case VECT_MIN_SCALAR:
+				case VECT_MAX_SCALAR:	
 				case VECT_EQUAL_SCALAR:
 				case VECT_NOTEQUAL_SCALAR:
 				case VECT_LESS_SCALAR:
@@ -133,7 +144,10 @@ public class CNodeBinary extends CNode
 			}
 		}
 		public boolean isVectorScalarPrimitive() {
-			return this == VECT_DIV_SCALAR || this == VECT_MULT_SCALAR || this == VECT_MINUS_SCALAR
+			return this == VECT_DIV_SCALAR || this == VECT_MULT_SCALAR 
+				|| this == VECT_MINUS_SCALAR || this == VECT_PLUS_SCALAR
+				|| this == VECT_POW_SCALAR 
+				|| this == VECT_MIN_SCALAR || this == VECT_MAX_SCALAR
 				|| this == VECT_EQUAL_SCALAR || this == VECT_NOTEQUAL_SCALAR
 				|| this == VECT_LESS_SCALAR || this == VECT_LESSEQUAL_SCALAR
 				|| this == VECT_GREATER_SCALAR || this == VECT_GREATEREQUAL_SCALAR;
@@ -215,6 +229,10 @@ public class CNodeBinary extends CNode
 			case VECT_MULT_ADD: return "b(vma)";
 			case VECT_DIV_ADD: return "b(vda)";
 			case VECT_MINUS_ADD: return "b(vmia)";
+			case VECT_PLUS_ADD: return "b(vpa)";
+			case VECT_POW_ADD: return "b(vpowa)";
+			case VECT_MIN_ADD: return "b(vmina)";
+			case VECT_MAX_ADD: return "b(vmaxa)";
 			case VECT_EQUAL_ADD: return "b(veqa)";
 			case VECT_NOTEQUAL_ADD: return "b(vneqa)";
 			case VECT_LESS_ADD: return "b(vlta)";
@@ -224,6 +242,10 @@ public class CNodeBinary extends CNode
 			case VECT_MULT_SCALAR:  return "b(vm)";
 			case VECT_DIV_SCALAR:  return "b(vd)";
 			case VECT_MINUS_SCALAR:  return "b(vmi)";
+			case VECT_PLUS_SCALAR: return "b(vp)";
+			case VECT_POW_SCALAR: return "b(vpow)";
+			case VECT_MIN_SCALAR: return "b(vmin)";
+			case VECT_MAX_SCALAR: return "b(vmax)";
 			case VECT_EQUAL_SCALAR: return "b(veq)";
 			case VECT_NOTEQUAL_SCALAR: return "b(vneq)";
 			case VECT_LESS_SCALAR: return "b(vlt)";
@@ -259,6 +281,10 @@ public class CNodeBinary extends CNode
 			case VECT_MULT_ADD: 
 			case VECT_DIV_ADD:
 			case VECT_MINUS_ADD:
+			case VECT_PLUS_ADD:
+			case VECT_POW_ADD:
+			case VECT_MIN_ADD:
+			case VECT_MAX_ADD:
 			case VECT_EQUAL_ADD: 
 			case VECT_NOTEQUAL_ADD: 
 			case VECT_LESS_ADD: 
@@ -273,6 +299,10 @@ public class CNodeBinary extends CNode
 			case VECT_DIV_SCALAR: 	
 			case VECT_MULT_SCALAR:
 			case VECT_MINUS_SCALAR:
+			case VECT_PLUS_SCALAR:
+			case VECT_POW_SCALAR:
+			case VECT_MIN_SCALAR:
+			case VECT_MAX_SCALAR:
 			case VECT_EQUAL_SCALAR: 
 			case VECT_NOTEQUAL_SCALAR: 
 			case VECT_LESS_SCALAR: 

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/f9f70b3a/src/main/java/org/apache/sysml/hops/codegen/cplan/CNodeUnary.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/hops/codegen/cplan/CNodeUnary.java b/src/main/java/org/apache/sysml/hops/codegen/cplan/CNodeUnary.java
index 437100f..30752a2 100644
--- a/src/main/java/org/apache/sysml/hops/codegen/cplan/CNodeUnary.java
+++ b/src/main/java/org/apache/sysml/hops/codegen/cplan/CNodeUnary.java
@@ -30,7 +30,8 @@ public class CNodeUnary extends CNode
 	public enum UnaryType {
 		LOOKUP_R, LOOKUP_RC, LOOKUP0, //codegen specific
 		ROW_SUMS, ROW_MINS, ROW_MAXS, //codegen specific
-		VECT_EXP_SCALAR, VECT_LOG_SCALAR,
+		VECT_EXP, VECT_POW2, VECT_MULT2, VECT_SQRT, VECT_LOG,
+		VECT_ABS, VECT_ROUND, VECT_CEIL, VECT_FLOOR, VECT_SIGN, 
 		EXP, POW2, MULT2, SQRT, LOG, LOG_NZ,
 		ABS, ROUND, CEIL, FLOOR, SIGN, 
 		SIN, COS, TAN, ASIN, ACOS, ATAN,
@@ -53,8 +54,16 @@ public class CNodeUnary extends CNode
 									"    double %TMP% = LibSpoofPrimitives.vect"+vectName+"(%IN1%, %POS1%, %LEN%);\n"; 
 				}
 			
-				case VECT_EXP_SCALAR:
-				case VECT_LOG_SCALAR: {
+				case VECT_EXP:
+				case VECT_POW2:
+				case VECT_MULT2: 
+				case VECT_SQRT: 
+				case VECT_LOG:
+				case VECT_ABS:
+				case VECT_ROUND:
+				case VECT_CEIL:
+				case VECT_FLOOR:
+				case VECT_SIGN: {
 					String vectName = getVectorPrimitiveName();
 					return sparse ? "    double[] %TMP% = LibSpoofPrimitives.vect"+vectName+"Write(%IN1v%, %IN1i%, %POS1%, %LEN%);\n" : 
 									"    double[] %TMP% = LibSpoofPrimitives.vect"+vectName+"Write(%IN1%, %POS1%, %LEN%);\n";
@@ -112,8 +121,11 @@ public class CNodeUnary extends CNode
 			}
 		}
 		public boolean isVectorScalarPrimitive() {
-			return this == UnaryType.VECT_EXP_SCALAR 
-				|| this == UnaryType.VECT_LOG_SCALAR;
+			return this == VECT_EXP || this == VECT_POW2
+				|| this == VECT_MULT2 || this == VECT_SQRT
+				|| this == VECT_LOG || this == VECT_ABS
+				|| this == VECT_ROUND || this == VECT_CEIL
+				|| this == VECT_FLOOR || this == VECT_SIGN;
 		}
 		public UnaryType getVectorAddPrimitive() {
 			return UnaryType.valueOf("VECT_"+getVectorPrimitiveName().toUpperCase()+"_ADD");
@@ -184,8 +196,16 @@ public class CNodeUnary extends CNode
 			case ROW_SUMS:  return "u(R+)";
 			case ROW_MINS:  return "u(Rmin)";
 			case ROW_MAXS:  return "u(Rmax)";
-			case VECT_EXP_SCALAR: return "u(vexp)";
-			case VECT_LOG_SCALAR: return "u(vlog)";
+			case VECT_EXP:
+			case VECT_POW2:
+			case VECT_MULT2: 
+			case VECT_SQRT: 
+			case VECT_LOG:
+			case VECT_ABS:
+			case VECT_ROUND:
+			case VECT_CEIL:
+			case VECT_FLOOR:
+			case VECT_SIGN: return "u(v"+_type.name().toLowerCase()+")";
 			case LOOKUP_R:	return "u(ixr)";
 			case LOOKUP_RC:	return "u(ixrc)";
 			case LOOKUP0:	return "u(ix0)";
@@ -197,8 +217,16 @@ public class CNodeUnary extends CNode
 	@Override
 	public void setOutputDims() {
 		switch(_type) {
-			case VECT_EXP_SCALAR:
-			case VECT_LOG_SCALAR:	
+			case VECT_EXP:
+			case VECT_POW2:
+			case VECT_MULT2: 
+			case VECT_SQRT: 
+			case VECT_LOG:
+			case VECT_ABS:
+			case VECT_ROUND:
+			case VECT_CEIL:
+			case VECT_FLOOR:
+			case VECT_SIGN:	
 				_rows = _inputs.get(0)._rows;
 				_cols = _inputs.get(0)._cols;
 				_dataType= DataType.MATRIX;

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/f9f70b3a/src/main/java/org/apache/sysml/hops/codegen/template/TemplateRow.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/hops/codegen/template/TemplateRow.java b/src/main/java/org/apache/sysml/hops/codegen/template/TemplateRow.java
index 2e1d9f8..ca9776d 100644
--- a/src/main/java/org/apache/sysml/hops/codegen/template/TemplateRow.java
+++ b/src/main/java/org/apache/sysml/hops/codegen/template/TemplateRow.java
@@ -57,8 +57,10 @@ import org.apache.sysml.runtime.matrix.data.Pair;
 public class TemplateRow extends TemplateBase 
 {
 	private static final Hop.AggOp[] SUPPORTED_ROW_AGG = new AggOp[]{AggOp.SUM, AggOp.MIN, AggOp.MAX};
-	private static final Hop.OpOp1[] SUPPORTED_VECT_UNARY = new OpOp1[]{OpOp1.EXP, OpOp1.LOG};
-	private static final Hop.OpOp2[] SUPPORTED_VECT_BINARY = new OpOp2[]{OpOp2.MULT, OpOp2.DIV, OpOp2.MINUS, 
+	private static final Hop.OpOp1[] SUPPORTED_VECT_UNARY = new OpOp1[]{
+			OpOp1.EXP, OpOp1.SQRT, OpOp1.LOG, OpOp1.ABS, OpOp1.ROUND, OpOp1.CEIL, OpOp1.FLOOR, OpOp1.SIGN};
+	private static final Hop.OpOp2[] SUPPORTED_VECT_BINARY = new OpOp2[]{
+			OpOp2.MULT, OpOp2.DIV, OpOp2.MINUS, OpOp2.PLUS, OpOp2.POW, OpOp2.MIN, OpOp2.MAX,
 			OpOp2.EQUAL, OpOp2.NOTEQUAL, OpOp2.LESS, OpOp2.LESSEQUAL, OpOp2.GREATER, OpOp2.GREATEREQUAL};
 	
 	public TemplateRow() {
@@ -216,7 +218,7 @@ public class TemplateRow extends TemplateBase
 			if(hop.getInput().get(0).getDim1() > 1 && hop.getInput().get(0).getDim2() > 1 ) 
 			{
 				if( HopRewriteUtils.isUnary(hop, SUPPORTED_VECT_UNARY) ) {
-					String opname = "VECT_"+((UnaryOp)hop).getOp().name()+"_SCALAR";
+					String opname = "VECT_"+((UnaryOp)hop).getOp().name();
 					out = new CNodeUnary(cdata1, UnaryType.valueOf(opname));
 				}
 				else 

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/f9f70b3a/src/main/java/org/apache/sysml/runtime/codegen/LibSpoofPrimitives.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/codegen/LibSpoofPrimitives.java b/src/main/java/org/apache/sysml/runtime/codegen/LibSpoofPrimitives.java
index 9283c46..7a9adeb 100644
--- a/src/main/java/org/apache/sysml/runtime/codegen/LibSpoofPrimitives.java
+++ b/src/main/java/org/apache/sysml/runtime/codegen/LibSpoofPrimitives.java
@@ -127,7 +127,7 @@ public class LibSpoofPrimitives
 		for( int i = ai; i < ai+len; i++ )
 			val = Math.min(a[i], val);
 		return val; 
-	} 
+	}
 	
 	public static double vectMin(double[] avals, int[] aix, int ai, int len) {
 		double val = Double.MAX_VALUE;
@@ -155,7 +155,7 @@ public class LibSpoofPrimitives
 	public static void vectDivAdd(double[] a, double bval, double[] c, int ai, int ci, int len) {
 		for( int j = ai; j < ai+len; j++, ci++)
 			c[ci] +=  a[j] / bval;
-	} 
+	}
 
 	public static void vectDivAdd(double[] a, double bval, double[] c, int[] aix, int ai, int ci, int len) {
 		for( int j = ai; j < ai+len; j++ )
@@ -181,7 +181,7 @@ public class LibSpoofPrimitives
 	public static void vectMinusAdd(double[] a, double bval, double[] c, int ai, int ci, int len) {
 		for( int j = ai; j < ai+len; j++, ci++)
 			c[ci] +=  a[j] - bval;
-	} 
+	}
 
 	public static void vectMinusAdd(double[] a, double bval, double[] c, int[] aix, int ai, int ci, int len) {
 		for( int j = ai; j < ai+len; j++ )
@@ -194,20 +194,124 @@ public class LibSpoofPrimitives
 			c[j] = a[ai] - bval;
 		return c;
 	}
-
+	
 	public static double[] vectMinusWrite(double[] a, double bval, int[] aix, int ai, int len) {
 		double[] c = allocVector(len, true);
 		for( int j = ai; j < ai+len; j++ )
 			c[aix[j]] = a[j] - bval;
 		return c;
 	}
+	
+	//custom vector plus
+	
+	public static void vectPlusAdd(double[] a, double bval, double[] c, int ai, int ci, int len) {
+		for( int j = ai; j < ai+len; j++, ci++)
+			c[ci] +=  a[j] + bval;
+	}
+
+	public static void vectPlusAdd(double[] a, double bval, double[] c, int[] aix, int ai, int ci, int len) {
+		for( int j = ai; j < ai+len; j++ )
+			c[ci + aix[j]] += a[j] + bval;
+	}
+	
+	public static double[] vectPlusWrite(double[] a, double bval, int ai, int len) {
+		double[] c = allocVector(len, false);
+		for( int j = 0; j < len; j++, ai++)
+			c[j] = a[ai] + bval;
+		return c;
+	}
+
+	public static double[] vectPlusWrite(double[] a, double bval, int[] aix, int ai, int len) {
+		double[] c = allocVector(len, true);
+		for( int j = ai; j < ai+len; j++ )
+			c[aix[j]] = a[j] + bval;
+		return c;
+	}
+	
+	//custom vector pow
+	
+	public static void vectPowAdd(double[] a, double bval, double[] c, int ai, int ci, int len) {
+		for( int j = ai; j < ai+len; j++, ci++)
+			c[ci] += Math.pow(a[j], bval);
+	}
+
+	public static void vectPowAdd(double[] a, double bval, double[] c, int[] aix, int ai, int ci, int len) {
+		for( int j = ai; j < ai+len; j++ )
+			c[ci + aix[j]] += Math.pow(a[j], bval);
+	}
+	
+	public static double[] vectPowWrite(double[] a, double bval, int ai, int len) {
+		double[] c = allocVector(len, false);
+		for( int j = 0; j < len; j++, ai++)
+			c[j] = Math.pow(a[ai], bval);
+		return c;
+	}
+
+	public static double[] vectPowWrite(double[] a, double bval, int[] aix, int ai, int len) {
+		double[] c = allocVector(len, true);
+		for( int j = ai; j < ai+len; j++ )
+			c[aix[j]] = Math.pow(a[j], bval);
+		return c;
+	}
+	
+	//custom vector min
+	
+	public static void vectMinAdd(double[] a, double bval, double[] c, int ai, int ci, int len) {
+		for( int j = ai; j < ai+len; j++, ci++)
+			c[ci] += Math.min(a[j], bval);
+	}
+
+	public static void vectMinAdd(double[] a, double bval, double[] c, int[] aix, int ai, int ci, int len) {
+		for( int j = ai; j < ai+len; j++ )
+			c[ci + aix[j]] += Math.min(a[j], bval);
+	}
+	
+	public static double[] vectMinWrite(double[] a, double bval, int ai, int len) {
+		double[] c = allocVector(len, false);
+		for( int j = 0; j < len; j++, ai++)
+			c[j] = Math.min(a[ai], bval);
+		return c;
+	}
+
+	public static double[] vectMinWrite(double[] a, double bval, int[] aix, int ai, int len) {
+		double[] c = allocVector(len, true);
+		for( int j = ai; j < ai+len; j++ )
+			c[aix[j]] = Math.min(a[j], bval);
+		return c;
+	}
+	
+	//custom vector max
+	
+	public static void vectMaxAdd(double[] a, double bval, double[] c, int ai, int ci, int len) {
+		for( int j = ai; j < ai+len; j++, ci++)
+			c[ci] += Math.max(a[j], bval);
+	}
+
+	public static void vectMaxAdd(double[] a, double bval, double[] c, int[] aix, int ai, int ci, int len) {
+		for( int j = ai; j < ai+len; j++ )
+			c[ci + aix[j]] += Math.max(a[j], bval);
+	}
+	
+	public static double[] vectMaxWrite(double[] a, double bval, int ai, int len) {
+		double[] c = allocVector(len, false);
+		for( int j = 0; j < len; j++, ai++)
+			c[j] = Math.max(a[ai], bval);
+		return c;
+	}
+
+	public static double[] vectMaxWrite(double[] a, double bval, int[] aix, int ai, int len) {
+		double[] c = allocVector(len, true);
+		for( int j = ai; j < ai+len; j++ )
+			c[aix[j]] = Math.max(a[j], bval);
+		return c;
+	}
 
 	//custom exp
 	
 	public static void vectExpAdd(double[] a, double[] c, int ai, int ci, int len) {
 		for( int j = ai; j < ai+len; j++, ci++)
 			c[ci] +=  FastMath.exp(a[j]);
-	} 
+	}
 
 	public static void vectExpAdd(double[] a, double[] c, int[] aix, int ai, int ci, int len) {
 		for( int j = ai; j < ai+len; j++ )
@@ -233,7 +337,7 @@ public class LibSpoofPrimitives
 	public static void vectLogAdd(double[] a, double[] c, int ai, int ci, int len) {
 		for( int j = ai; j < ai+len; j++, ci++)
 			c[ci] +=  FastMath.log(a[j]);
-	} 
+	}
 
 	public static void vectLogAdd(double[] a, double[] c, int[] aix, int ai, int ci, int len) {
 		for( int j = ai; j < ai+len; j++ )
@@ -254,6 +358,214 @@ public class LibSpoofPrimitives
 		return c;
 	}
 	
+	//custom abs
+	
+	public static void vectAbsAdd(double[] a, double[] c, int ai, int ci, int len) {
+		for( int j = ai; j < ai+len; j++, ci++)
+			c[ci] +=  Math.abs(a[j]);
+	}
+
+	public static void vectAbsAdd(double[] a, double[] c, int[] aix, int ai, int ci, int len) { //sparse input: values a, column indexes aix
+		for( int j = ai; j < ai+len; j++ ) //iterate over the len entries starting at position ai
+			c[ci + aix[j]] += Math.abs(a[j]); //accumulate |a[j]| at output offset ci plus column index (was Math.log)
+	}
+	
+	public static double[] vectAbsWrite(double[] a, int ai, int len) { //dense input: returns c with c[j] = |a[ai+j]| for j in [0,len)
+		double[] c = allocVector(len, false);
+		for( int j = 0; j < len; j++, ai++)
+			c[j] = Math.abs(a[ai]); //abs primitive must take the absolute value (was Math.log)
+		return c;
+	}
+
+	public static double[] vectAbsWrite(double[] a, int[] aix, int ai, int len) { //sparse input: values a, column indexes aix
+		double[] c = allocVector(len, true);
+		for( int j = ai; j < ai+len; j++ ) //iterate over the len entries starting at position ai
+			c[aix[j]] = Math.abs(a[j]); //write |a[j]| at its column index (was Math.log)
+		return c;
+	}
+	
+	//custom round
+	
+	public static void vectRoundAdd(double[] a, double[] c, int ai, int ci, int len) {
+		for( int j = ai; j < ai+len; j++, ci++)
+			c[ci] +=  Math.round(a[j]);
+	}
+
+	public static void vectRoundAdd(double[] a, double[] c, int[] aix, int ai, int ci, int len) {
+		for( int j = ai; j < ai+len; j++ )
+			c[ci + aix[j]] += Math.round(a[j]);
+	}
+	
+	public static double[] vectRoundWrite(double[] a, int ai, int len) {
+		double[] c = allocVector(len, false);
+		for( int j = 0; j < len; j++, ai++)
+			c[j] = Math.round(a[ai]);
+		return c;
+	}
+
+	public static double[] vectRoundWrite(double[] a, int[] aix, int ai, int len) {
+		double[] c = allocVector(len, true);
+		for( int j = ai; j < ai+len; j++ )
+			c[aix[j]] = Math.round(a[j]);
+		return c;
+	}
+	
+	//custom ceil
+	
+	public static void vectCeilAdd(double[] a, double[] c, int ai, int ci, int len) {
+		for( int j = ai; j < ai+len; j++, ci++)
+			c[ci] +=  FastMath.ceil(a[j]);
+	}
+
+	public static void vectCeilAdd(double[] a, double[] c, int[] aix, int ai, int ci, int len) {
+		for( int j = ai; j < ai+len; j++ )
+			c[ci + aix[j]] += FastMath.ceil(a[j]);
+	}
+	
+	public static double[] vectCeilWrite(double[] a, int ai, int len) {
+		double[] c = allocVector(len, false);
+		for( int j = 0; j < len; j++, ai++)
+			c[j] = FastMath.ceil(a[ai]);
+		return c;
+	}
+
+	public static double[] vectCeilWrite(double[] a, int[] aix, int ai, int len) {
+		double[] c = allocVector(len, true);
+		for( int j = ai; j < ai+len; j++ )
+			c[aix[j]] = FastMath.ceil(a[j]);
+		return c;
+	}
+	
+	//custom floor
+	
+	public static void vectFloorAdd(double[] a, double[] c, int ai, int ci, int len) {
+		for( int j = ai; j < ai+len; j++, ci++)
+			c[ci] +=  FastMath.floor(a[j]);
+	}
+
+	public static void vectFloorAdd(double[] a, double[] c, int[] aix, int ai, int ci, int len) {
+		for( int j = ai; j < ai+len; j++ )
+			c[ci + aix[j]] += FastMath.floor(a[j]);
+	}
+	
+	public static double[] vectFloorWrite(double[] a, int ai, int len) {
+		double[] c = allocVector(len, false);
+		for( int j = 0; j < len; j++, ai++)
+			c[j] = FastMath.floor(a[ai]);
+		return c;
+	}
+
+	public static double[] vectFloorWrite(double[] a, int[] aix, int ai, int len) {
+		double[] c = allocVector(len, true);
+		for( int j = ai; j < ai+len; j++ )
+			c[aix[j]] = FastMath.floor(a[j]);
+		return c;
+	}
+	
+	//custom sign
+	
+	public static void vectSignAdd(double[] a, double[] c, int ai, int ci, int len) {
+		for( int j = ai; j < ai+len; j++, ci++)
+			c[ci] +=  FastMath.signum(a[j]);
+	}
+
+	public static void vectSignAdd(double[] a, double[] c, int[] aix, int ai, int ci, int len) {
+		for( int j = ai; j < ai+len; j++ )
+			c[ci + aix[j]] += FastMath.signum(a[j]);
+	}
+	
+	public static double[] vectSignWrite(double[] a, int ai, int len) {
+		double[] c = allocVector(len, false);
+		for( int j = 0; j < len; j++, ai++)
+			c[j] = FastMath.signum(a[ai]);
+		return c;
+	}
+
+	public static double[] vectSignWrite(double[] a, int[] aix, int ai, int len) {
+		double[] c = allocVector(len, true);
+		for( int j = ai; j < ai+len; j++ )
+			c[aix[j]] = FastMath.signum(a[j]);
+		return c;
+	}
+	
+	//custom pow2
+	
+	public static void vectPow2Add(double[] a, double[] c, int ai, int ci, int len) {
+		for( int j = ai; j < ai+len; j++, ci++)
+			c[ci] +=  a[j] * a[j];
+	}
+
+	public static void vectPow2Add(double[] a, double[] c, int[] aix, int ai, int ci, int len) {
+		for( int j = ai; j < ai+len; j++ )
+			c[ci + aix[j]] += a[j] * a[j];
+	}
+	
+	public static double[] vectPow2Write(double[] a, int ai, int len) {
+		double[] c = allocVector(len, false);
+		for( int j = 0; j < len; j++, ai++)
+			c[j] = a[ai] * a[ai];
+		return c;
+	}
+
+	public static double[] vectPow2Write(double[] a, int[] aix, int ai, int len) {
+		double[] c = allocVector(len, true);
+		for( int j = ai; j < ai+len; j++ )
+			c[aix[j]] = a[j] * a[j];
+		return c;
+	}
+	
+	//custom mult2
+	
+	public static void vectMult2Add(double[] a, double[] c, int ai, int ci, int len) {
+		for( int j = ai; j < ai+len; j++, ci++)
+			c[ci] +=  a[j] + a[j];
+	}
+
+	public static void vectMult2Add(double[] a, double[] c, int[] aix, int ai, int ci, int len) {
+		for( int j = ai; j < ai+len; j++ )
+			c[ci + aix[j]] += a[j] + a[j];
+	}
+	
+	public static double[] vectMult2Write(double[] a, int ai, int len) {
+		double[] c = allocVector(len, false);
+		for( int j = 0; j < len; j++, ai++)
+			c[j] = a[ai] + a[ai];
+		return c;
+	}
+
+	public static double[] vectMult2Write(double[] a, int[] aix, int ai, int len) {
+		double[] c = allocVector(len, true);
+		for( int j = ai; j < ai+len; j++ )
+			c[aix[j]] = a[j] + a[j];
+		return c;
+	}
+	
+	//custom sqrt
+	
+	public static void vectSqrtAdd(double[] a, double[] c, int ai, int ci, int len) {
+		for( int j = ai; j < ai+len; j++, ci++)
+			c[ci] +=  Math.sqrt(a[j]);
+	}
+
+	public static void vectSqrtAdd(double[] a, double[] c, int[] aix, int ai, int ci, int len) {
+		for( int j = ai; j < ai+len; j++ )
+			c[ci + aix[j]] += Math.sqrt(a[j]);
+	}
+	
+	public static double[] vectSqrtWrite(double[] a, int ai, int len) {
+		double[] c = allocVector(len, false);
+		for( int j = 0; j < len; j++, ai++)
+			c[j] = Math.sqrt(a[ai]);
+		return c;
+	}
+
+	public static double[] vectSqrtWrite(double[] a, int[] aix, int ai, int len) {
+		double[] c = allocVector(len, true);
+		for( int j = ai; j < ai+len; j++ )
+			c[aix[j]] = Math.sqrt(a[j]);
+		return c;
+	}
+	
 	//custom vector equal
 	
 	public static void vectEqualAdd(double[] a, double bval, double[] c, int ai, int ci, int len) {

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/f9f70b3a/src/test/java/org/apache/sysml/test/integration/functions/codegen/RowAggTmplTest.java
----------------------------------------------------------------------
diff --git a/src/test/java/org/apache/sysml/test/integration/functions/codegen/RowAggTmplTest.java b/src/test/java/org/apache/sysml/test/integration/functions/codegen/RowAggTmplTest.java
index 865080a..4037edb 100644
--- a/src/test/java/org/apache/sysml/test/integration/functions/codegen/RowAggTmplTest.java
+++ b/src/test/java/org/apache/sysml/test/integration/functions/codegen/RowAggTmplTest.java
@@ -49,6 +49,8 @@ public class RowAggTmplTest extends AutomatedTestBase
 	private static final String TEST_NAME11 = TEST_NAME+"11"; //y - X %*% v
 	private static final String TEST_NAME12 = TEST_NAME+"12"; //Y=(X>=v); R=Y/rowSums(Y)
 	private static final String TEST_NAME13 = TEST_NAME+"13"; //rowSums(X)+rowSums(Y)
+	private static final String TEST_NAME14 = TEST_NAME+"14"; //colSums(max(floor(round(abs(min(sign(X+Y),1)))),7))
+	private static final String TEST_NAME15 = TEST_NAME+"15"; //systemml nn - softmax backward (partially)
 	
 	private static final String TEST_DIR = "functions/codegen/";
 	private static final String TEST_CLASS_DIR = TEST_DIR + RowAggTmplTest.class.getSimpleName() + "/";
@@ -60,7 +62,7 @@ public class RowAggTmplTest extends AutomatedTestBase
 	@Override
 	public void setUp() {
 		TestUtils.clearAssertionInformation();
-		for(int i=1; i<=13; i++)
+		for(int i=1; i<=15; i++)
 			addTestConfiguration( TEST_NAME+i, new TestConfiguration(TEST_CLASS_DIR, TEST_NAME+i, new String[] { String.valueOf(i) }) );
 	}
 	
@@ -259,6 +261,36 @@ public class RowAggTmplTest extends AutomatedTestBase
 		testCodegenIntegration( TEST_NAME13, false, ExecType.SPARK );
 	}
 	
+	@Test	
+	public void testCodegenRowAggRewrite14CP() {
+		testCodegenIntegration( TEST_NAME14, true, ExecType.CP );
+	}
+	
+	@Test
+	public void testCodegenRowAgg14CP() {
+		testCodegenIntegration( TEST_NAME14, false, ExecType.CP );
+	}
+	
+	@Test
+	public void testCodegenRowAgg14SP() {
+		testCodegenIntegration( TEST_NAME14, false, ExecType.SPARK );
+	}
+	
+	@Test	
+	public void testCodegenRowAggRewrite15CP() {
+		testCodegenIntegration( TEST_NAME15, true, ExecType.CP );
+	}
+	
+	@Test
+	public void testCodegenRowAgg15CP() {
+		testCodegenIntegration( TEST_NAME15, false, ExecType.CP );
+	}
+	
+	@Test
+	public void testCodegenRowAgg15SP() {
+		testCodegenIntegration( TEST_NAME15, false, ExecType.SPARK );
+	}
+	
 	private void testCodegenIntegration( String testname, boolean rewrites, ExecType instType )
 	{	
 		boolean oldFlag = OptimizerUtils.ALLOW_ALGEBRAIC_SIMPLIFICATION;

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/f9f70b3a/src/test/scripts/functions/codegen/rowAggPattern14.R
----------------------------------------------------------------------
diff --git a/src/test/scripts/functions/codegen/rowAggPattern14.R b/src/test/scripts/functions/codegen/rowAggPattern14.R
new file mode 100644
index 0000000..34589e1
--- /dev/null
+++ b/src/test/scripts/functions/codegen/rowAggPattern14.R
@@ -0,0 +1,34 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+# 
+#   http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+args<-commandArgs(TRUE)
+options(digits=22)
+library("Matrix")
+library("matrixStats")
+
+
+X = matrix(seq(1,1500), 150, 10, byrow=TRUE);
+y = seq(1,150);
+
+Z = pmax(floor(round(abs(pmin(sign(X+y),1)))),7);
+R = t(colSums(Z));
+
+writeMM(as(R, "CsparseMatrix"), paste(args[2], "S", sep="")); 

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/f9f70b3a/src/test/scripts/functions/codegen/rowAggPattern14.dml
----------------------------------------------------------------------
diff --git a/src/test/scripts/functions/codegen/rowAggPattern14.dml b/src/test/scripts/functions/codegen/rowAggPattern14.dml
new file mode 100644
index 0000000..f13c1ff
--- /dev/null
+++ b/src/test/scripts/functions/codegen/rowAggPattern14.dml
@@ -0,0 +1,28 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+# 
+#   http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+X = matrix(seq(1,1500), rows=150, cols=10);
+y = seq(1,150);
+
+Z = max(floor(round(abs(min(sign(X+y),1)))),7)
+R = colSums(Z); 
+
+write(R, $1)

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/f9f70b3a/src/test/scripts/functions/codegen/rowAggPattern15.R
----------------------------------------------------------------------
diff --git a/src/test/scripts/functions/codegen/rowAggPattern15.R b/src/test/scripts/functions/codegen/rowAggPattern15.R
new file mode 100644
index 0000000..a24679a
--- /dev/null
+++ b/src/test/scripts/functions/codegen/rowAggPattern15.R
@@ -0,0 +1,35 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+# 
+#   http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+args<-commandArgs(TRUE)
+options(digits=22)
+library("Matrix")
+library("matrixStats")
+
+
+X = matrix(seq(1,1500), 150, 10, byrow=TRUE);
+
+Y1 = X - rowMaxs(X) 
+Y2 = exp(Y1)
+Y3 = Y2 / rowSums(Y2)
+R = Y3 * rowSums(Y3)
+
+writeMM(as(R, "CsparseMatrix"), paste(args[2], "S", sep="")); 

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/f9f70b3a/src/test/scripts/functions/codegen/rowAggPattern15.dml
----------------------------------------------------------------------
diff --git a/src/test/scripts/functions/codegen/rowAggPattern15.dml b/src/test/scripts/functions/codegen/rowAggPattern15.dml
new file mode 100644
index 0000000..d51397a
--- /dev/null
+++ b/src/test/scripts/functions/codegen/rowAggPattern15.dml
@@ -0,0 +1,29 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+# 
+#   http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+X = matrix(seq(1,1500), rows=150, cols=10);
+
+Y1 = X - rowMaxs(X) 
+Y2 = exp(Y1)
+Y3 = Y2 / rowSums(Y2)
+R = Y3 * rowSums(Y3)
+
+write(R, $1)

[2/2] incubator-systemml git commit: [SYSTEMML-1514] Fix codegen cost estimation (two-level memoization)

Posted by mb...@apache.org.

[SYSTEMML-1514] Fix codegen cost estimation (two-level memoization)

This patch fixes the cost estimation of fusion plans for complex fused
operators with internal DAG structures, where we mistakenly double
counted compute costs. Similarly, we incorrectly double counted costs of
materialized intermediates. The core idea is a two-level memoization,
i.e., memoization on pairs of hop IDs and cost vector IDs. This avoids
double counting within a single fused operator, while still accounting
for the redundant computation of overlapping fused operators.

Additionally, this patch also hardens the compilation of
multi-aggregates to ensure matching input dimensions and to exclude
partial rowwise fusion plans that cover matrix multiplications.


Project: http://git-wip-us.apache.org/repos/asf/incubator-systemml/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-systemml/commit/fb82482b
Tree: http://git-wip-us.apache.org/repos/asf/incubator-systemml/tree/fb82482b
Diff: http://git-wip-us.apache.org/repos/asf/incubator-systemml/diff/fb82482b

Branch: refs/heads/master
Commit: fb82482b09f851b428fc1c0e70994e9e6d94c007
Parents: f9f70b3
Author: Matthias Boehm <mb...@gmail.com>
Authored: Tue Apr 11 23:08:30 2017 -0700
Committer: Matthias Boehm <mb...@gmail.com>
Committed: Tue Apr 11 23:08:30 2017 -0700

----------------------------------------------------------------------
 .../template/PlanSelectionFuseCostBased.java    | 47 ++++++++++++++------
 1 file changed, 33 insertions(+), 14 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/fb82482b/src/main/java/org/apache/sysml/hops/codegen/template/PlanSelectionFuseCostBased.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/hops/codegen/template/PlanSelectionFuseCostBased.java b/src/main/java/org/apache/sysml/hops/codegen/template/PlanSelectionFuseCostBased.java
index 3c98090..8ba2490 100644
--- a/src/main/java/org/apache/sysml/hops/codegen/template/PlanSelectionFuseCostBased.java
+++ b/src/main/java/org/apache/sysml/hops/codegen/template/PlanSelectionFuseCostBased.java
@@ -31,6 +31,7 @@ import java.util.Iterator;
 import java.util.List;
 
 import org.apache.commons.collections.CollectionUtils;
+import org.apache.commons.lang3.tuple.Pair;
 import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;
 import org.apache.sysml.hops.AggBinaryOp;
@@ -48,7 +49,7 @@ import org.apache.sysml.hops.codegen.template.CPlanMemoTable.MemoTableEntry;
 import org.apache.sysml.hops.codegen.template.TemplateBase.TemplateType;
 import org.apache.sysml.hops.rewrite.HopRewriteUtils;
 import org.apache.sysml.runtime.controlprogram.parfor.stat.InfrastructureAnalyzer;
-
+import org.apache.sysml.runtime.controlprogram.parfor.util.IDSequence;
 
 /**
  * This cost-based plan selection algorithm chooses fused operators
@@ -68,7 +69,8 @@ public class PlanSelectionFuseCostBased extends PlanSelection
 	private static final double COMPUTE_BANDWIDTH = 2d*1024*1024*1024 //2GFLOPs/core
 		* InfrastructureAnalyzer.getLocalParallelism();
 	
-	private final static TemplateRow ROW_TPL = new TemplateRow();
+	private static final IDSequence COST_ID = new IDSequence();
+	private static final TemplateRow ROW_TPL = new TemplateRow();
 	
 	@Override
 	public void selectPlans(CPlanMemoTable memo, ArrayList<Hop> roots) 
@@ -315,10 +317,12 @@ public class PlanSelectionFuseCostBased extends PlanSelection
 				LOG.trace(info);
 		}
 		
+		//filter aggregations w/ matmults to ensure consistent dims
 		//sort aggregations by num dependencies to simplify merging
 		//clusters of aggregations with parallel dependencies
-		aggInfos = aggInfos.stream().sorted(Comparator.comparing(
-			a -> a._inputAggs.size())).collect(Collectors.toList());
+		aggInfos = aggInfos.stream().filter(a -> !a.containsMatMult)
+			.sorted(Comparator.comparing(a -> a._inputAggs.size()))
+			.collect(Collectors.toList());
 		
 		//greedy grouping of multi-agg candidates
 		boolean converged = false;
@@ -409,6 +413,10 @@ public class PlanSelectionFuseCostBased extends PlanSelection
 			aggInfo.addInputAggregate(current.getHopID());
 		}
 		
+		//collect included matrix multiplications
+		if( type != null && HopRewriteUtils.isMatrixMultiply(current) )
+			aggInfo.setContainsMatMult();
+		
 		//recursively process children
 		MemoTableEntry me = (type!=null) ? memo.getBest(current.getHopID()) : null;
 		for( int i=0; i< current.getInput().size(); i++ ) {
@@ -612,7 +620,7 @@ public class PlanSelectionFuseCostBased extends PlanSelection
 		//READ costs by the input sizes, and COMPUTE by operation specific FLOP
 		//counts times number of cells of main input, disregarding sparsity for now.
 		
-		HashSet<Long> visited = new HashSet<Long>();
+		HashSet<Pair<Long,Long>> visited = new HashSet<Pair<Long,Long>>();
 		double costs = 0;
 		for( Long hopID : R )
 			costs += rGetPlanCosts(memo, memo._hopRefs.get(hopID), 
@@ -620,11 +628,17 @@ public class PlanSelectionFuseCostBased extends PlanSelection
 		return costs;
 	}
 	
-	private static double rGetPlanCosts(CPlanMemoTable memo, Hop current, HashSet<Long> visited, HashSet<Long> partition, 
-			ArrayList<Long> M, boolean[] plan, HashMap<Long, Double> computeCosts, OperatorStats costsCurrent, TemplateType currentType) 
+	private static double rGetPlanCosts(CPlanMemoTable memo, Hop current, HashSet<Pair<Long,Long>> visited, HashSet<Long> partition, 
+			ArrayList<Long> M, boolean[] plan, HashMap<Long, Double> computeCosts, CostVector costsCurrent, TemplateType currentType) 
 	{
-		if( visited.contains(current.getHopID()) )
-			return 0; //dont double count 
+		//memoization per hop id and cost vector to account for redundant
+		//computation without double counting materialized results or compute
+		//costs of complex operation DAGs within a single fused operator
+		Pair<Long,Long> tag = Pair.of(current.getHopID(), 
+			(costsCurrent==null)?0:costsCurrent.ID);
+		if( visited.contains(tag) )
+			return 0; 
+		visited.add(tag);	
 		
 		//open template if necessary, including memoization
 		//under awareness of current plan choice
@@ -637,7 +651,6 @@ public class PlanSelectionFuseCostBased extends PlanSelection
 					.filter(p -> hasNoRefToMaterialization(p, M, plan))
 					.min(new BasicPlanComparator()).orElse(null);
 				opened = true;
-				visited.add(current.getHopID());
 			}
 			else {
 				best = memo.get(current.getHopID()).stream()
@@ -649,8 +662,8 @@ public class PlanSelectionFuseCostBased extends PlanSelection
 		}
 		
 		//create new cost vector if opened, initialized with write costs
-		OperatorStats costVect = !opened ? costsCurrent : 
-			new OperatorStats(Math.max(current.getDim1(),1)*Math.max(current.getDim2(),1));
+		CostVector costVect = !opened ? costsCurrent : 
+			new CostVector(Math.max(current.getDim1(),1)*Math.max(current.getDim2(),1));
 		
 		//add compute costs of current operator to costs vector 
 		if( partition.contains(current.getHopID()) )
@@ -821,12 +834,14 @@ public class PlanSelectionFuseCostBased extends PlanSelection
 		return ret;
 	}
 	
-	private static class OperatorStats {
+	private static class CostVector {
+		public final long ID;
 		public final double outSize; 
 		public double computeCosts = 0;
 		public final HashMap<Long, Double> inSizes = new HashMap<Long, Double>();
 		
-		public OperatorStats(double outputSize) {
+		public CostVector(double outputSize) {
+			ID = COST_ID.getNextID();
 			outSize = outputSize;
 		}
 		public void addInputSize(long hopID, double inputSize) {
@@ -853,6 +868,7 @@ public class PlanSelectionFuseCostBased extends PlanSelection
 		public final HashMap<Long,Hop> _aggregates;
 		public final HashSet<Long> _inputAggs = new HashSet<Long>();
 		public final HashSet<Long> _fusedInputs = new HashSet<Long>();
+		public boolean containsMatMult = false;
 		public AggregateInfo(Hop aggregate) {
 			_aggregates = new HashMap<Long, Hop>();
 			_aggregates.put(aggregate.getHopID(), aggregate);
@@ -863,6 +879,9 @@ public class PlanSelectionFuseCostBased extends PlanSelection
 		public void addFusedInput(long hopID) {
 			_fusedInputs.add(hopID);
 		}
+		public void setContainsMatMult() {
+			containsMatMult = true;
+		}
 		public boolean isMergable(AggregateInfo that) {
 			//check independence
 			boolean ret = _aggregates.size()<3