You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@systemml.apache.org by mb...@apache.org on 2017/04/11 05:56:57 UTC

[2/2] incubator-systemml git commit: [SYSTEMML-1507, 1508] Generalized codegen rowwise template (agg types)

[SYSTEMML-1507,1508] Generalized codegen rowwise template (agg types)

So far the codegen row aggregate template only supported rowwise column
aggregations like colSums(X <= rowMins(X)) or t(X) %*% (w * (X %*% v)).
Similar to the existing cellwise template, this patch now generalizes
the row aggregate template to a rowwise template with column
aggregation, row aggregation, or no aggregation. This enables fusion of
complex rowwise patterns, which is important for algorithms like Kmeans
and Mlogreg. For example, we are now able to fuse the following
expressions into a single rowwise operation:

* Example without aggregation (single operator)
P = (D <= rowMins (D));
P = P / rowSums (P);
 
* Example with row aggregation (single operator)
exp_LT = exp (LT - rowMaxs (LT));
V = log (rowSums (exp_LT));

Furthermore, this patch includes the following code generator extensions
and fixes:

* Validity checks for merging rowwise templates (common input)
* Pruning of invalid plans after decisions on materialization points
* New row vector primitives for exp, log, and minus
* New row vector primitives for vector writes 
* Fix min, max, and sparse sum row vector primitives 
* Fix missing import of FastMath for builtins in multi aggregates
* Fix check for supported binary operations in row templates
* Cleanup and extensions of hop rewrite utils
* Various new test cases for rowwise templates, incl. Spark operations
* Refactoring all class names of row aggregate to rowwise


Project: http://git-wip-us.apache.org/repos/asf/incubator-systemml/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-systemml/commit/eeb4f270
Tree: http://git-wip-us.apache.org/repos/asf/incubator-systemml/tree/eeb4f270
Diff: http://git-wip-us.apache.org/repos/asf/incubator-systemml/diff/eeb4f270

Branch: refs/heads/master
Commit: eeb4f2708d96d2b0741cfcd9a4a03f775b97cdc2
Parents: cfc73fe
Author: Matthias Boehm <mb...@gmail.com>
Authored: Mon Apr 10 12:53:13 2017 -0700
Committer: Matthias Boehm <mb...@gmail.com>
Committed: Mon Apr 10 22:53:11 2017 -0700

----------------------------------------------------------------------
 .../sysml/hops/codegen/SpoofCompiler.java       |   3 +-
 .../sysml/hops/codegen/cplan/CNodeBinary.java   |  17 +-
 .../sysml/hops/codegen/cplan/CNodeMultiAgg.java |   1 +
 .../sysml/hops/codegen/cplan/CNodeRow.java      | 173 ++++++++++
 .../sysml/hops/codegen/cplan/CNodeRowAgg.java   | 134 -------
 .../sysml/hops/codegen/cplan/CNodeTernary.java  |   7 +-
 .../sysml/hops/codegen/cplan/CNodeUnary.java    |  32 +-
 .../hops/codegen/template/CPlanMemoTable.java   |   2 +-
 .../hops/codegen/template/PlanSelection.java    |   2 +-
 .../template/PlanSelectionFuseCostBased.java    |  49 +++
 .../hops/codegen/template/TemplateBase.java     |   2 +-
 .../hops/codegen/template/TemplateRow.java      | 345 +++++++++++++++++++
 .../hops/codegen/template/TemplateRowAgg.java   | 325 -----------------
 .../hops/codegen/template/TemplateUtils.java    |  55 ++-
 .../sysml/hops/rewrite/HopRewriteUtils.java     |  14 +-
 .../runtime/codegen/LibSpoofPrimitives.java     | 109 ++++--
 .../runtime/codegen/SpoofRowAggregate.java      | 208 -----------
 .../sysml/runtime/codegen/SpoofRowwise.java     | 285 +++++++++++++++
 .../instructions/spark/SpoofSPInstruction.java  |  50 ++-
 .../functions/codegen/RowAggTmplTest.java       | 174 ++++++++--
 .../scripts/functions/codegen/rowAggPattern11.R |  34 ++
 .../functions/codegen/rowAggPattern11.dml       |  28 ++
 .../scripts/functions/codegen/rowAggPattern12.R |  34 ++
 .../functions/codegen/rowAggPattern12.dml       |  28 ++
 .../scripts/functions/codegen/rowAggPattern13.R |  33 ++
 .../functions/codegen/rowAggPattern13.dml       |  27 ++
 26 files changed, 1410 insertions(+), 761 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/eeb4f270/src/main/java/org/apache/sysml/hops/codegen/SpoofCompiler.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/hops/codegen/SpoofCompiler.java b/src/main/java/org/apache/sysml/hops/codegen/SpoofCompiler.java
index 3dfb452..fdb8d9d 100644
--- a/src/main/java/org/apache/sysml/hops/codegen/SpoofCompiler.java
+++ b/src/main/java/org/apache/sysml/hops/codegen/SpoofCompiler.java
@@ -440,7 +440,8 @@ public class SpoofCompiler
 				Hop input2 = hop.getInput().get(k);
 				if( memo.contains(input2.getHopID()) && !memo.get(input2.getHopID()).get(0).closed
 					&& TemplateUtils.isType(memo.get(input2.getHopID()).get(0).type, tpl.getType(), TemplateType.CellTpl)
-					&& tpl.merge(hop, input2) ) 
+					&& tpl.merge(hop, input2) && (tpl.getType()!=TemplateType.RowTpl || pos==-1 
+						|| TemplateUtils.hasCommonRowTemplateMatrixInput(hop.getInput().get(pos), input2, memo)))
 					P.crossProduct(k, -1L, input2.getHopID());
 				else
 					P.crossProduct(k, -1L);

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/eeb4f270/src/main/java/org/apache/sysml/hops/codegen/cplan/CNodeBinary.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/hops/codegen/cplan/CNodeBinary.java b/src/main/java/org/apache/sysml/hops/codegen/cplan/CNodeBinary.java
index b6b6ce5..4d54cd1 100644
--- a/src/main/java/org/apache/sysml/hops/codegen/cplan/CNodeBinary.java
+++ b/src/main/java/org/apache/sysml/hops/codegen/cplan/CNodeBinary.java
@@ -29,10 +29,12 @@ public class CNodeBinary extends CNode
 {
 	public enum BinType {
 		DOT_PRODUCT,
-		VECT_MULT_ADD, VECT_DIV_ADD, VECT_EQUAL_ADD, VECT_NOTEQUAL_ADD, 
-		VECT_LESS_ADD, VECT_LESSEQUAL_ADD, VECT_GREATER_ADD, VECT_GREATEREQUAL_ADD,
-		VECT_MULT_SCALAR, VECT_DIV_SCALAR, VECT_EQUAL_SCALAR, VECT_NOTEQUAL_SCALAR, 
-		VECT_LESS_SCALAR, VECT_LESSEQUAL_SCALAR, VECT_GREATER_SCALAR, VECT_GREATEREQUAL_SCALAR,
+		VECT_MULT_ADD, VECT_DIV_ADD, VECT_MINUS_ADD,
+		VECT_EQUAL_ADD, VECT_NOTEQUAL_ADD, VECT_LESS_ADD, 
+		VECT_LESSEQUAL_ADD, VECT_GREATER_ADD, VECT_GREATEREQUAL_ADD,
+		VECT_MULT_SCALAR, VECT_DIV_SCALAR, VECT_MINUS_SCALAR, 
+		VECT_EQUAL_SCALAR, VECT_NOTEQUAL_SCALAR, VECT_LESS_SCALAR, 
+		VECT_LESSEQUAL_SCALAR, VECT_GREATER_SCALAR, VECT_GREATEREQUAL_SCALAR,
 		MULT, DIV, PLUS, MINUS, MODULUS, INTDIV, 
 		LESS, LESSEQUAL, GREATER, GREATEREQUAL, EQUAL,NOTEQUAL,
 		MIN, MAX, AND, OR, LOG, LOG_NZ, POW,
@@ -71,6 +73,7 @@ public class CNodeBinary extends CNode
 				}
 				
 				case VECT_DIV_SCALAR:
+				case VECT_MINUS_SCALAR:
 				case VECT_MULT_SCALAR:
 				case VECT_EQUAL_SCALAR:
 				case VECT_NOTEQUAL_SCALAR:
@@ -130,7 +133,7 @@ public class CNodeBinary extends CNode
 			}
 		}
 		public boolean isVectorScalarPrimitive() {
-			return this == VECT_DIV_SCALAR || this == VECT_MULT_SCALAR
+			return this == VECT_DIV_SCALAR || this == VECT_MULT_SCALAR || this == VECT_MINUS_SCALAR
 				|| this == VECT_EQUAL_SCALAR || this == VECT_NOTEQUAL_SCALAR
 				|| this == VECT_LESS_SCALAR || this == VECT_LESSEQUAL_SCALAR
 				|| this == VECT_GREATER_SCALAR || this == VECT_GREATEREQUAL_SCALAR;
@@ -211,6 +214,7 @@ public class CNodeBinary extends CNode
 			case DOT_PRODUCT: return "b(dot)";
 			case VECT_MULT_ADD: return "b(vma)";
 			case VECT_DIV_ADD: return "b(vda)";
+			case VECT_MINUS_ADD: return "b(vmia)";
 			case VECT_EQUAL_ADD: return "b(veqa)";
 			case VECT_NOTEQUAL_ADD: return "b(vneqa)";
 			case VECT_LESS_ADD: return "b(vlta)";
@@ -219,6 +223,7 @@ public class CNodeBinary extends CNode
 			case VECT_GREATER_ADD: return "b(vgta)";
 			case VECT_MULT_SCALAR:  return "b(vm)";
 			case VECT_DIV_SCALAR:  return "b(vd)";
+			case VECT_MINUS_SCALAR:  return "b(vmi)";
 			case VECT_EQUAL_SCALAR: return "b(veq)";
 			case VECT_NOTEQUAL_SCALAR: return "b(vneq)";
 			case VECT_LESS_SCALAR: return "b(vlt)";
@@ -253,6 +258,7 @@ public class CNodeBinary extends CNode
 			//VECT
 			case VECT_MULT_ADD: 
 			case VECT_DIV_ADD:
+			case VECT_MINUS_ADD:
 			case VECT_EQUAL_ADD: 
 			case VECT_NOTEQUAL_ADD: 
 			case VECT_LESS_ADD: 
@@ -266,6 +272,7 @@ public class CNodeBinary extends CNode
 				
 			case VECT_DIV_SCALAR: 	
 			case VECT_MULT_SCALAR:
+			case VECT_MINUS_SCALAR:
 			case VECT_EQUAL_SCALAR: 
 			case VECT_NOTEQUAL_SCALAR: 
 			case VECT_LESS_SCALAR: 

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/eeb4f270/src/main/java/org/apache/sysml/hops/codegen/cplan/CNodeMultiAgg.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/hops/codegen/cplan/CNodeMultiAgg.java b/src/main/java/org/apache/sysml/hops/codegen/cplan/CNodeMultiAgg.java
index d9502be..95e1f75 100644
--- a/src/main/java/org/apache/sysml/hops/codegen/cplan/CNodeMultiAgg.java
+++ b/src/main/java/org/apache/sysml/hops/codegen/cplan/CNodeMultiAgg.java
@@ -35,6 +35,7 @@ public class CNodeMultiAgg extends CNodeTpl
 			+ "import org.apache.sysml.runtime.codegen.SpoofMultiAggregate;\n"
 			+ "import org.apache.sysml.runtime.codegen.SpoofCellwise;\n"
 			+ "import org.apache.sysml.runtime.codegen.SpoofCellwise.AggOp;\n"
+			+ "import org.apache.commons.math3.util.FastMath;\n"
 			+ "\n"
 			+ "public final class %TMP% extends SpoofMultiAggregate { \n"
 			+ "  public %TMP%() {\n"

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/eeb4f270/src/main/java/org/apache/sysml/hops/codegen/cplan/CNodeRow.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/hops/codegen/cplan/CNodeRow.java b/src/main/java/org/apache/sysml/hops/codegen/cplan/CNodeRow.java
new file mode 100644
index 0000000..3cc2e3b
--- /dev/null
+++ b/src/main/java/org/apache/sysml/hops/codegen/cplan/CNodeRow.java
@@ -0,0 +1,173 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ * 
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.sysml.hops.codegen.cplan;
+
+import java.util.ArrayList;
+import java.util.Arrays;
+
+import org.apache.sysml.hops.codegen.SpoofFusedOp.SpoofOutputDimsType;
+import org.apache.sysml.runtime.codegen.SpoofRowwise.RowType;
+
+public class CNodeRow extends CNodeTpl
+{
+	private static final String TEMPLATE = 
+			  "package codegen;\n"
+			+ "import org.apache.sysml.runtime.codegen.LibSpoofPrimitives;\n"
+			+ "import org.apache.sysml.runtime.codegen.SpoofRowwise;\n"
+			+ "import org.apache.sysml.runtime.codegen.SpoofRowwise.RowType;\n"
+			+ "import org.apache.commons.math3.util.FastMath;\n"
+			+ "\n"
+			+ "public final class %TMP% extends SpoofRowwise { \n"
+			+ "  public %TMP%() {\n"
+			+ "    super(RowType.%TYPE%, %VECT_MEM%);\n"
+			+ "  }\n"
+			+ "  protected void genexecRowDense( double[] a, int ai, double[][] b, double[] scalars, double[] c, int len, int rowIndex ) { \n"
+			+ "%BODY_dense%"
+			+ "  }\n"
+			+ "  protected void genexecRowSparse( double[] avals, int[] aix, int ai, double[][] b, double[] scalars, double[] c, int len, int rowIndex ) { \n"
+			+ "%BODY_sparse%"
+			+ "  }\n"			
+			+ "}\n";
+
+	private static final String TEMPLATE_ROWAGG_OUT = "    c[rowIndex] = %IN%;\n";
+	private static final String TEMPLATE_NOAGG_OUT = "    LibSpoofPrimitives.vectWrite(%IN%, c, rowIndex*len, len);\n";
+	
+	public CNodeRow(ArrayList<CNode> inputs, CNode output ) {
+		super(inputs, output);
+	}
+	
+	private RowType _type = null; //access pattern 
+	private int _numVectors = -1; //number of intermediate vectors
+	
+	public void setNumVectorIntermediates(int num) {
+		_numVectors = num;
+	}
+	
+	public int getNumVectorIntermediates() {
+		return _numVectors;
+	}
+	
+	public void setRowType(RowType type) {
+		_type = type;
+		_hash = 0;
+	}
+	
+	public RowType getRowType() {
+		return _type;
+	}
+	
+	@Override
+	public String codegen(boolean sparse) {
+		// note: ignore sparse flag, generate both
+		String tmp = TEMPLATE;
+		
+		//rename inputs
+		rReplaceDataNode(_output, _inputs.get(0), "a"); // input matrix
+		renameInputs(_inputs, 1);
+		
+		//generate dense/sparse bodies
+		String tmpDense = _output.codegen(false)
+			+ getOutputStatement(_output.getVarname());
+		_output.resetGenerated();
+		String tmpSparse = _output.codegen(true)
+			+ getOutputStatement(_output.getVarname());
+		tmp = tmp.replaceAll("%TMP%", createVarname());
+		tmp = tmp.replaceAll("%BODY_dense%", tmpDense);
+		tmp = tmp.replaceAll("%BODY_sparse%", tmpSparse);
+		
+		//replace outputs 
+		tmp = tmp.replaceAll("%OUT%", "c");
+		tmp = tmp.replaceAll("%POSOUT%", "0");
+		
+		//replace size information
+		tmp = tmp.replaceAll("%LEN%", "len");
+		
+		//replace colvector information and number of vector intermediates
+		tmp = tmp.replaceAll("%TYPE%", _type.name());
+		tmp = tmp.replaceAll("%VECT_MEM%", String.valueOf(_numVectors));
+		
+		return tmp;
+	}
+	
+	private String getOutputStatement(String varName) {
+		if( !_type.isColumnAgg() ) {
+			String tmp = (_type==RowType.NO_AGG) ?
+				TEMPLATE_NOAGG_OUT : TEMPLATE_ROWAGG_OUT;
+			return tmp.replace("%IN%", varName);
+		}
+		return "";
+	}
+
+	@Override
+	public void setOutputDims() {
+		// TODO Auto-generated method stub
+		
+	}
+
+	@Override
+	public SpoofOutputDimsType getOutputDimType() {
+		return (_output._cols==1) ? 
+			SpoofOutputDimsType.COLUMN_DIMS_ROWS : //column vector
+			SpoofOutputDimsType.COLUMN_DIMS_COLS;  //row vector
+	}
+	
+	@Override
+	public CNodeTpl clone() {
+		CNodeRow tmp = new CNodeRow(_inputs, _output);
+		tmp.setRowType(_type);
+		tmp.setNumVectorIntermediates(_numVectors);
+		return tmp;
+	}
+	
+	@Override
+	public int hashCode() {
+		if( _hash == 0 ) {
+			int h1 = super.hashCode();
+			int h2 = _type.hashCode();
+			int h3 = _numVectors;
+			_hash = Arrays.hashCode(new int[]{h1,h2,h3});
+		}
+		return _hash;
+	}
+	
+	@Override 
+	public boolean equals(Object o) {
+		if(!(o instanceof CNodeRow))
+			return false;
+		
+		CNodeRow that = (CNodeRow)o;
+		return super.equals(o)
+			&& _type == that._type
+			&& _numVectors == that._numVectors	
+			&& equalInputReferences(
+				_output, that._output, _inputs, that._inputs);
+	}
+	
+	@Override
+	public String getTemplateInfo() {
+		StringBuilder sb = new StringBuilder();
+		sb.append("SPOOF ROWAGGREGATE [type=");
+		sb.append(_type.name());
+		sb.append(", reqVectMem=");
+		sb.append(_numVectors);
+		sb.append("]");
+		return sb.toString();
+	}
+}

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/eeb4f270/src/main/java/org/apache/sysml/hops/codegen/cplan/CNodeRowAgg.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/hops/codegen/cplan/CNodeRowAgg.java b/src/main/java/org/apache/sysml/hops/codegen/cplan/CNodeRowAgg.java
deleted file mode 100644
index 846c88d..0000000
--- a/src/main/java/org/apache/sysml/hops/codegen/cplan/CNodeRowAgg.java
+++ /dev/null
@@ -1,134 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- * 
- *   http://www.apache.org/licenses/LICENSE-2.0
- * 
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-package org.apache.sysml.hops.codegen.cplan;
-
-import java.util.ArrayList;
-
-import org.apache.sysml.hops.codegen.SpoofFusedOp.SpoofOutputDimsType;
-
-public class CNodeRowAgg extends CNodeTpl
-{
-	private static final String TEMPLATE = 
-			  "package codegen;\n"
-			+ "import org.apache.sysml.runtime.codegen.LibSpoofPrimitives;\n"
-			+ "import org.apache.sysml.runtime.codegen.SpoofRowAggregate;\n"
-			+ "\n"
-			+ "public final class %TMP% extends SpoofRowAggregate { \n"
-			+ "  public %TMP%() {\n"
-			+ "    super(%COL_VECTOR%, %VECT_MEM%);\n"
-			+ "  }\n"
-			+ "  protected void genexecRowDense( double[] a, int ai, double[][] b, double[] scalars, double[] c, int len, int rowIndex ) { \n"
-			+ "%BODY_dense%"
-			+ "  }\n"
-			+ "  protected void genexecRowSparse( double[] avals, int[] aix, int ai, double[][] b, double[] scalars, double[] c, int len, int rowIndex ) { \n"
-			+ "%BODY_sparse%"
-			+ "  }\n"			
-			+ "}\n";
-
-	public CNodeRowAgg(ArrayList<CNode> inputs, CNode output ) {
-		super(inputs, output);
-	}
-	
-	//number of intermediate vectors
-	private int _numVectors = -1;
-	
-	public void setNumVectorIntermediates(int num) {
-		_numVectors = num;
-	}
-	
-	public int getNumVectorIntermediates() {
-		return _numVectors;
-	}
-	
-	@Override
-	public String codegen(boolean sparse) {
-		// note: ignore sparse flag, generate both
-		String tmp = TEMPLATE;
-		
-		//rename inputs
-		rReplaceDataNode(_output, _inputs.get(0), "a"); // input matrix
-		renameInputs(_inputs, 1);
-		
-		//generate dense/sparse bodies
-		String tmpDense = _output.codegen(false);
-		_output.resetGenerated();
-		String tmpSparse = _output.codegen(true);
-		tmp = tmp.replaceAll("%TMP%", createVarname());
-		tmp = tmp.replaceAll("%BODY_dense%", tmpDense);
-		tmp = tmp.replaceAll("%BODY_sparse%", tmpSparse);
-		
-		//replace outputs 
-		tmp = tmp.replaceAll("%OUT%", "c");
-		tmp = tmp.replaceAll("%POSOUT%", "0");
-		
-		//replace size information
-		tmp = tmp.replaceAll("%LEN%", "len");
-		
-		//replace colvector information and number of vector intermediates
-		tmp = tmp.replaceAll("%COL_VECTOR%", String.valueOf(_output._cols==1));
-		tmp = tmp.replaceAll("%VECT_MEM%", String.valueOf(_numVectors));
-		
-		return tmp;
-	}
-
-	@Override
-	public void setOutputDims() {
-		// TODO Auto-generated method stub
-		
-	}
-
-	@Override
-	public SpoofOutputDimsType getOutputDimType() {
-		return (_output._cols==1) ? 
-			SpoofOutputDimsType.COLUMN_DIMS_ROWS : //column vector
-			SpoofOutputDimsType.COLUMN_DIMS_COLS;  //row vector
-	}
-	
-	@Override
-	public CNodeTpl clone() {
-		return new CNodeRowAgg(_inputs, _output);
-	}
-	
-	@Override
-	public int hashCode() {
-		return super.hashCode();
-	}
-	
-	@Override 
-	public boolean equals(Object o) {
-		if(!(o instanceof CNodeRowAgg))
-			return false;
-		
-		CNodeRowAgg that = (CNodeRowAgg)o;
-		return super.equals(o)
-			&& _numVectors == that._numVectors	
-			&& equalInputReferences(
-				_output, that._output, _inputs, that._inputs);
-	}
-	
-	@Override
-	public String getTemplateInfo() {
-		StringBuilder sb = new StringBuilder();
-		sb.append("SPOOF ROWAGGREGATE [reqVectMem=");
-		sb.append(_numVectors);
-		sb.append("]");
-		return sb.toString();
-	}
-}

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/eeb4f270/src/main/java/org/apache/sysml/hops/codegen/cplan/CNodeTernary.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/hops/codegen/cplan/CNodeTernary.java b/src/main/java/org/apache/sysml/hops/codegen/cplan/CNodeTernary.java
index b3b1942..2a868f8 100644
--- a/src/main/java/org/apache/sysml/hops/codegen/cplan/CNodeTernary.java
+++ b/src/main/java/org/apache/sysml/hops/codegen/cplan/CNodeTernary.java
@@ -54,7 +54,9 @@ public class CNodeTernary extends CNode
 					return "    double %TMP% = Double.isNaN(%IN1%) ? %IN3% : %IN1%;\n";
 					
 				case LOOKUP_RC1:
-					return "    double %TMP% = getValue(%IN1%, rowIndex*%IN2%+%IN3%-1);\n";	
+					return sparse ?
+							"    double %TMP% = getValue(%IN1v%, rowIndex*%IN2%+%IN3%-1);\n" :	
+							"    double %TMP% = getValue(%IN1%, rowIndex*%IN2%+%IN3%-1);\n";	
 					
 				default: 
 					throw new RuntimeException("Invalid ternary type: "+this.toString());
@@ -94,6 +96,9 @@ public class CNodeTernary extends CNode
 		tmp = tmp.replaceAll("%TMP%", var);
 		for( int j=1; j<=3; j++ ) {
 			String varj = _inputs.get(j-1).getVarname();
+			//replace sparse and dense inputs
+			tmp = tmp.replaceAll("%IN"+j+"v%", 
+				varj+(varj.startsWith("b")?"":"vals") );
 			tmp = tmp.replaceAll("%IN"+j+"%", varj );
 		}
 		sb.append(tmp);

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/eeb4f270/src/main/java/org/apache/sysml/hops/codegen/cplan/CNodeUnary.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/hops/codegen/cplan/CNodeUnary.java b/src/main/java/org/apache/sysml/hops/codegen/cplan/CNodeUnary.java
index 025033b..437100f 100644
--- a/src/main/java/org/apache/sysml/hops/codegen/cplan/CNodeUnary.java
+++ b/src/main/java/org/apache/sysml/hops/codegen/cplan/CNodeUnary.java
@@ -30,6 +30,7 @@ public class CNodeUnary extends CNode
 	public enum UnaryType {
 		LOOKUP_R, LOOKUP_RC, LOOKUP0, //codegen specific
 		ROW_SUMS, ROW_MINS, ROW_MAXS, //codegen specific
+		VECT_EXP_SCALAR, VECT_LOG_SCALAR,
 		EXP, POW2, MULT2, SQRT, LOG, LOG_NZ,
 		ABS, ROUND, CEIL, FLOOR, SIGN, 
 		SIN, COS, TAN, ASIN, ACOS, ATAN,
@@ -46,10 +47,19 @@ public class CNodeUnary extends CNode
 			switch( this ) {
 				case ROW_SUMS:
 				case ROW_MINS:
-				case ROW_MAXS:
+				case ROW_MAXS: {
 					String vectName = StringUtils.capitalize(this.toString().substring(4,7).toLowerCase());
 					return sparse ? "    double %TMP% = LibSpoofPrimitives.vect"+vectName+"(%IN1v%, %IN1i%, %POS1%, %LEN%);\n": 
 									"    double %TMP% = LibSpoofPrimitives.vect"+vectName+"(%IN1%, %POS1%, %LEN%);\n"; 
+				}
+			
+				case VECT_EXP_SCALAR:
+				case VECT_LOG_SCALAR: {
+					String vectName = getVectorPrimitiveName();
+					return sparse ? "    double[] %TMP% = LibSpoofPrimitives.vect"+vectName+"Write(%IN1v%, %IN1i%, %POS1%, %LEN%);\n" : 
+									"    double[] %TMP% = LibSpoofPrimitives.vect"+vectName+"Write(%IN1%, %POS1%, %LEN%);\n";
+				}
+					
 				case EXP:
 					return "    double %TMP% = FastMath.exp(%IN1%);\n";
 			    case LOOKUP_R:
@@ -101,6 +111,17 @@ public class CNodeUnary extends CNode
 					throw new RuntimeException("Invalid unary type: "+this.toString());
 			}
 		}
+		public boolean isVectorScalarPrimitive() {
+			return this == UnaryType.VECT_EXP_SCALAR 
+				|| this == UnaryType.VECT_LOG_SCALAR;
+		}
+		public UnaryType getVectorAddPrimitive() {
+			return UnaryType.valueOf("VECT_"+getVectorPrimitiveName().toUpperCase()+"_ADD");
+		}
+		public String getVectorPrimitiveName() {
+			String [] tmp = this.name().split("_");
+			return StringUtils.capitalize(tmp[1].toLowerCase());
+		}
 	}
 	
 	private UnaryType _type;
@@ -163,6 +184,8 @@ public class CNodeUnary extends CNode
 			case ROW_SUMS:  return "u(R+)";
 			case ROW_MINS:  return "u(Rmin)";
 			case ROW_MAXS:  return "u(Rmax)";
+			case VECT_EXP_SCALAR: return "u(vexp)";
+			case VECT_LOG_SCALAR: return "u(vlog)";
 			case LOOKUP_R:	return "u(ixr)";
 			case LOOKUP_RC:	return "u(ixrc)";
 			case LOOKUP0:	return "u(ix0)";
@@ -174,6 +197,13 @@ public class CNodeUnary extends CNode
 	@Override
 	public void setOutputDims() {
 		switch(_type) {
+			case VECT_EXP_SCALAR:
+			case VECT_LOG_SCALAR:	
+				_rows = _inputs.get(0)._rows;
+				_cols = _inputs.get(0)._cols;
+				_dataType= DataType.MATRIX;
+				break;
+			
 			case ROW_SUMS:
 			case ROW_MINS:
 			case ROW_MAXS:

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/eeb4f270/src/main/java/org/apache/sysml/hops/codegen/template/CPlanMemoTable.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/hops/codegen/template/CPlanMemoTable.java b/src/main/java/org/apache/sysml/hops/codegen/template/CPlanMemoTable.java
index d306f0f..8f0a8fb 100644
--- a/src/main/java/org/apache/sysml/hops/codegen/template/CPlanMemoTable.java
+++ b/src/main/java/org/apache/sysml/hops/codegen/template/CPlanMemoTable.java
@@ -253,7 +253,7 @@ public class CPlanMemoTable
 	
 	public static class MemoTableEntry 
 	{
-		public final TemplateType type;
+		public TemplateType type;
 		public final long input1; 
 		public final long input2;
 		public final long input3;

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/eeb4f270/src/main/java/org/apache/sysml/hops/codegen/template/PlanSelection.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/hops/codegen/template/PlanSelection.java b/src/main/java/org/apache/sysml/hops/codegen/template/PlanSelection.java
index 142040b..85126da 100644
--- a/src/main/java/org/apache/sysml/hops/codegen/template/PlanSelection.java
+++ b/src/main/java/org/apache/sysml/hops/codegen/template/PlanSelection.java
@@ -58,7 +58,7 @@ public abstract class PlanSelection
 	protected static boolean isValid(MemoTableEntry me, Hop hop) {
 		return (me.type == TemplateType.OuterProdTpl 
 				&& (me.closed || HopRewriteUtils.isBinaryMatrixMatrixOperation(hop)))
-			|| (me.type == TemplateType.RowAggTpl && me.closed)	
+			|| (me.type == TemplateType.RowTpl)	
 			|| (me.type == TemplateType.CellTpl)
 			|| (me.type == TemplateType.MultiAggTpl);
 	}

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/eeb4f270/src/main/java/org/apache/sysml/hops/codegen/template/PlanSelectionFuseCostBased.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/hops/codegen/template/PlanSelectionFuseCostBased.java b/src/main/java/org/apache/sysml/hops/codegen/template/PlanSelectionFuseCostBased.java
index a9cd90e..3c98090 100644
--- a/src/main/java/org/apache/sysml/hops/codegen/template/PlanSelectionFuseCostBased.java
+++ b/src/main/java/org/apache/sysml/hops/codegen/template/PlanSelectionFuseCostBased.java
@@ -68,6 +68,8 @@ public class PlanSelectionFuseCostBased extends PlanSelection
 	private static final double COMPUTE_BANDWIDTH = 2d*1024*1024*1024 //2GFLOPs/core
 		* InfrastructureAnalyzer.getLocalParallelism();
 	
+	private final static TemplateRow ROW_TPL = new TemplateRow();
+	
 	@Override
 	public void selectPlans(CPlanMemoTable memo, ArrayList<Hop> roots) 
 	{
@@ -466,6 +468,11 @@ public class PlanSelectionFuseCostBased extends PlanSelection
 			for( Long hopID : R )
 				rPruneSuboptimalPlans(memo, memo._hopRefs.get(hopID), 
 					visited, partition, M, bestPlan);
+			HashSet<Long> visited2 = new HashSet<Long>();
+			for( Long hopID : R )
+				rPruneInvalidPlans(memo, memo._hopRefs.get(hopID), 
+					visited2, partition, M, bestPlan);
+			
 			for( Long hopID : R )
 				rSelectPlansFuseAll(memo, 
 					memo._hopRefs.get(hopID), null, partition);
@@ -498,6 +505,48 @@ public class PlanSelectionFuseCostBased extends PlanSelection
 		visited.add(current.getHopID());		
 	}
 	
+	private static void rPruneInvalidPlans(CPlanMemoTable memo, Hop current, HashSet<Long> visited, HashSet<Long> partition, ArrayList<Long> M, boolean[] plan) {
+		//memoization (not via hops because in middle of dag)
+		if( visited.contains(current.getHopID()) )
+			return;
+		
+		//process children recursively
+		for( Hop c : current.getInput() )
+			rPruneInvalidPlans(memo, c, visited, partition, M, plan);
+		
+		//find invalid row aggregate leaf nodes (see TemplateRow.open) w/o matrix inputs, 
+		//i.e., plans that become invalid after the previous pruning step
+		long hopID = current.getHopID();
+		if( partition.contains(hopID) && memo.contains(hopID, TemplateType.RowTpl) ) {
+			for( MemoTableEntry me : memo.get(hopID) ) {
+				if( me.type==TemplateType.RowTpl ) {
+					//convert leaf node with pure vector inputs
+					if( !me.hasPlanRef() && !TemplateUtils.hasMatrixInput(current) ) {
+						me.type = TemplateType.CellTpl;
+						if( LOG.isTraceEnabled() )
+							LOG.trace("Converted leaf memo table entry from row to cell: "+me);
+					}
+					
+					//convert inner node without row template input
+					if( me.hasPlanRef() && !ROW_TPL.open(current) ) {
+						boolean hasRowInput = false;
+						for( int i=0; i<3; i++ )
+							if( me.isPlanRef(i) )
+								hasRowInput |= memo.contains(me.input(i), TemplateType.RowTpl);
+						if( !hasRowInput ) {
+							me.type = TemplateType.CellTpl;
+							if( LOG.isTraceEnabled() )
+								LOG.trace("Converted inner memo table entry from row to cell: "+me);	
+						}
+					}
+					
+				}
+			}
+		}
+		
+		visited.add(current.getHopID());		
+	}
+	
 	private void rSelectPlansFuseAll(CPlanMemoTable memo, Hop current, TemplateType currentType, HashSet<Long> partition) 
 	{	
 		if( isVisited(current.getHopID(), currentType) 

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/eeb4f270/src/main/java/org/apache/sysml/hops/codegen/template/TemplateBase.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/hops/codegen/template/TemplateBase.java b/src/main/java/org/apache/sysml/hops/codegen/template/TemplateBase.java
index 4fceb8a..8ed52f6 100644
--- a/src/main/java/org/apache/sysml/hops/codegen/template/TemplateBase.java
+++ b/src/main/java/org/apache/sysml/hops/codegen/template/TemplateBase.java
@@ -28,7 +28,7 @@ public abstract class TemplateBase
 	public enum TemplateType {
 		//ordering specifies type preferences
 		MultiAggTpl,
-		RowAggTpl,
+		RowTpl,
 		OuterProdTpl,
 		CellTpl;
 		public int getRank() {

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/eeb4f270/src/main/java/org/apache/sysml/hops/codegen/template/TemplateRow.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/hops/codegen/template/TemplateRow.java b/src/main/java/org/apache/sysml/hops/codegen/template/TemplateRow.java
new file mode 100644
index 0000000..5e48e44
--- /dev/null
+++ b/src/main/java/org/apache/sysml/hops/codegen/template/TemplateRow.java
@@ -0,0 +1,345 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ * 
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.sysml.hops.codegen.template;
+
+import java.util.ArrayList;
+import java.util.Comparator;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.List;
+import java.util.stream.Collectors;
+
+import org.apache.sysml.hops.AggBinaryOp;
+import org.apache.sysml.hops.AggUnaryOp;
+import org.apache.sysml.hops.BinaryOp;
+import org.apache.sysml.hops.Hop;
+import org.apache.sysml.hops.IndexingOp;
+import org.apache.sysml.hops.LiteralOp;
+import org.apache.sysml.hops.ParameterizedBuiltinOp;
+import org.apache.sysml.hops.TernaryOp;
+import org.apache.sysml.hops.UnaryOp;
+import org.apache.sysml.hops.codegen.cplan.CNode;
+import org.apache.sysml.hops.codegen.cplan.CNodeBinary;
+import org.apache.sysml.hops.codegen.cplan.CNodeBinary.BinType;
+import org.apache.sysml.hops.codegen.cplan.CNodeTernary.TernaryType;
+import org.apache.sysml.hops.codegen.cplan.CNodeData;
+import org.apache.sysml.hops.codegen.cplan.CNodeRow;
+import org.apache.sysml.hops.codegen.cplan.CNodeTernary;
+import org.apache.sysml.hops.codegen.cplan.CNodeTpl;
+import org.apache.sysml.hops.codegen.cplan.CNodeUnary;
+import org.apache.sysml.hops.codegen.cplan.CNodeUnary.UnaryType;
+import org.apache.sysml.hops.codegen.template.CPlanMemoTable.MemoTableEntry;
+import org.apache.sysml.hops.rewrite.HopRewriteUtils;
+import org.apache.sysml.hops.Hop.AggOp;
+import org.apache.sysml.hops.Hop.Direction;
+import org.apache.sysml.hops.Hop.OpOp1;
+import org.apache.sysml.hops.Hop.OpOp2;
+import org.apache.sysml.parser.Expression.DataType;
+import org.apache.sysml.runtime.matrix.data.Pair;
+
+public class TemplateRow extends TemplateBase 
+{
+	private static final Hop.AggOp[] SUPPORTED_ROW_AGG = new AggOp[]{AggOp.SUM, AggOp.MIN, AggOp.MAX};
+	private static final Hop.OpOp1[] SUPPORTED_VECT_UNARY = new OpOp1[]{OpOp1.EXP, OpOp1.LOG};
+	private static final Hop.OpOp2[] SUPPORTED_VECT_BINARY = new OpOp2[]{OpOp2.MULT, OpOp2.DIV, OpOp2.MINUS, 
+			OpOp2.EQUAL, OpOp2.NOTEQUAL, OpOp2.LESS, OpOp2.LESSEQUAL, OpOp2.GREATER, OpOp2.GREATEREQUAL};
+	
+	public TemplateRow() {
+		super(TemplateType.RowTpl);
+	}
+	
+	public TemplateRow(boolean closed) {
+		super(TemplateType.RowTpl, closed);
+	}
+	
+	@Override
+	public boolean open(Hop hop) {
+		return (hop instanceof AggBinaryOp && hop.getDim2()==1
+			&& hop.getInput().get(0).getDim1()>1 && hop.getInput().get(0).getDim2()>1)
+			|| (hop instanceof AggUnaryOp && ((AggUnaryOp)hop).getDirection()!=Direction.RowCol 
+				&& hop.getInput().get(0).getDim1()>1 && hop.getInput().get(0).getDim2()>1);
+	}
+
+	@Override
+	public boolean fuse(Hop hop, Hop input) {
+		return !isClosed() && 
+			(  (hop instanceof BinaryOp && TemplateUtils.isOperationSupported(hop) 
+				&& (HopRewriteUtils.isBinaryMatrixColVectorOperation(hop)
+					|| HopRewriteUtils.isBinaryMatrixScalarOperation(hop)) ) 
+			|| ((hop instanceof UnaryOp || hop instanceof ParameterizedBuiltinOp) 
+					&& TemplateCell.isValidOperation(hop))		
+			|| (hop instanceof AggUnaryOp && ((AggUnaryOp)hop).getDirection()!=Direction.RowCol)
+			|| (hop instanceof AggBinaryOp && hop.getDim1()>1 
+				&& HopRewriteUtils.isTransposeOperation(hop.getInput().get(0))));
+	}
+
+	@Override
+	public boolean merge(Hop hop, Hop input) {
+		//merge row tpl with cell tpl if input is a column vector
+		return !isClosed() &&
+			((hop instanceof BinaryOp && input.getDim2()==1 //matrix-scalar/vector-vector ops
+				&& TemplateUtils.isOperationSupported(hop))
+			 ||(hop instanceof AggBinaryOp && input.getDim2()==1
+				&& HopRewriteUtils.isTransposeOperation(hop.getInput().get(0))));
+	}
+
+	@Override
+	public CloseType close(Hop hop) {
+		//close on column aggregate (e.g., colSums, t(X)%*%y)
+		if( hop instanceof AggUnaryOp && ((AggUnaryOp)hop).getDirection()==Direction.Col
+			|| (hop instanceof AggBinaryOp && HopRewriteUtils.isTransposeOperation(hop.getInput().get(0))) )
+			return CloseType.CLOSED_VALID;
+		else
+			return CloseType.OPEN;
+	}
+
+	@Override
+	public Pair<Hop[], CNodeTpl> constructCplan(Hop hop, CPlanMemoTable memo, boolean compileLiterals) {
+		//recursively process required cplan output
+		HashSet<Hop> inHops = new HashSet<Hop>();
+		HashMap<String, Hop> inHops2 = new HashMap<String,Hop>();
+		HashMap<Long, CNode> tmp = new HashMap<Long, CNode>();
+		hop.resetVisitStatus();
+		rConstructCplan(hop, memo, tmp, inHops, inHops2, compileLiterals);
+		hop.resetVisitStatus();
+		
+		//reorder inputs (ensure matrix is first input, and other inputs ordered by size)
+		List<Hop> sinHops = inHops.stream()
+			.filter(h -> !(h.getDataType().isScalar() && tmp.get(h.getHopID()).isLiteral()))
+			.sorted(new HopInputComparator(inHops2.get("X"))).collect(Collectors.toList());
+		
+		//construct template node
+		ArrayList<CNode> inputs = new ArrayList<CNode>();
+		for( Hop in : sinHops )
+			inputs.add(tmp.get(in.getHopID()));
+		CNode output = tmp.get(hop.getHopID());
+		CNodeRow tpl = new CNodeRow(inputs, output);
+		tpl.setRowType(TemplateUtils.getRowType(hop, sinHops.get(0)));
+		tpl.setNumVectorIntermediates(TemplateUtils
+			.countVectorIntermediates(output));
+		
+		// return cplan instance
+		return new Pair<Hop[],CNodeTpl>(sinHops.toArray(new Hop[0]), tpl);
+	}
+
+	private void rConstructCplan(Hop hop, CPlanMemoTable memo, HashMap<Long, CNode> tmp, HashSet<Hop> inHops, HashMap<String, Hop> inHops2, boolean compileLiterals) 
+	{	
+		//memoization for common subexpression elimination and to avoid redundant work 
+		if( tmp.containsKey(hop.getHopID()) )
+			return;
+		
+		//recursively process required children
+		MemoTableEntry me = memo.getBest(hop.getHopID(), TemplateType.RowTpl);
+		for( int i=0; i<hop.getInput().size(); i++ ) {
+			Hop c = hop.getInput().get(i);
+			if( me.isPlanRef(i) )
+				rConstructCplan(c, memo, tmp, inHops, inHops2, compileLiterals);
+			else {
+				CNodeData cdata = TemplateUtils.createCNodeData(c, compileLiterals);	
+				tmp.put(c.getHopID(), cdata);
+				inHops.add(c);
+			}
+		}
+		
+		//construct cnode for current hop
+		CNode out = null;
+		if(hop instanceof AggUnaryOp)
+		{
+			CNode cdata1 = tmp.get(hop.getInput().get(0).getHopID());
+			if( ((AggUnaryOp)hop).getDirection() == Direction.Row && HopRewriteUtils.isAggUnaryOp(hop, SUPPORTED_ROW_AGG) ) {
+				if(hop.getInput().get(0).getDim2()==1)
+					out = (cdata1.getDataType()==DataType.SCALAR) ? cdata1 : new CNodeUnary(cdata1,UnaryType.LOOKUP_R);
+				else {
+					String opcode = "ROW_"+((AggUnaryOp)hop).getOp().name().toUpperCase()+"S";
+					out = new CNodeUnary(cdata1, UnaryType.valueOf(opcode));
+					inHops2.put("X", hop.getInput().get(0));
+				}
+			}
+			else  if (((AggUnaryOp)hop).getDirection() == Direction.Col && ((AggUnaryOp)hop).getOp() == AggOp.SUM ) {
+				//vector add without temporary copy
+				if( cdata1 instanceof CNodeBinary && ((CNodeBinary)cdata1).getType().isVectorScalarPrimitive() )
+					out = new CNodeBinary(cdata1.getInput().get(0), cdata1.getInput().get(1), 
+							((CNodeBinary)cdata1).getType().getVectorAddPrimitive());
+				else	
+					out = cdata1;
+			}
+		}
+		else if(hop instanceof AggBinaryOp)
+		{
+			CNode cdata1 = tmp.get(hop.getInput().get(0).getHopID());
+			CNode cdata2 = tmp.get(hop.getInput().get(1).getHopID());
+			
+			if( HopRewriteUtils.isTransposeOperation(hop.getInput().get(0)) )
+			{
+				//correct input under transpose
+				cdata1 = TemplateUtils.skipTranspose(cdata1, hop.getInput().get(0), tmp, compileLiterals);
+				inHops.remove(hop.getInput().get(0)); 
+				inHops.add(hop.getInput().get(0).getInput().get(0));
+				
+				out = new CNodeBinary(cdata1, cdata2, BinType.VECT_MULT_ADD);
+			}
+			else
+			{
+				if(hop.getInput().get(0).getDim2()==1 && hop.getInput().get(1).getDim2()==1)
+					out = new CNodeBinary((cdata1.getDataType()==DataType.SCALAR)? cdata1 : new CNodeUnary(cdata1, UnaryType.LOOKUP0),
+						(cdata2.getDataType()==DataType.SCALAR)? cdata2 : new CNodeUnary(cdata2, UnaryType.LOOKUP0), BinType.MULT);
+				else {
+					out = new CNodeBinary(cdata1, cdata2, BinType.DOT_PRODUCT);
+					inHops2.put("X", hop.getInput().get(0));
+				}
+			}
+		}
+		else if(hop instanceof UnaryOp)
+		{
+			CNode cdata1 = tmp.get(hop.getInput().get(0).getHopID());
+			
+			// if the input is a matrix then we need to use a vector primitive
+			if(hop.getInput().get(0).getDim1() > 1 && hop.getInput().get(0).getDim2() > 1 ) 
+			{
+				if( HopRewriteUtils.isUnary(hop, SUPPORTED_VECT_UNARY) ) {
+					String opname = "VECT_"+((UnaryOp)hop).getOp().name()+"_SCALAR";
+					out = new CNodeUnary(cdata1, UnaryType.valueOf(opname));
+				}
+				else 
+					throw new RuntimeException("Unsupported unary matrix "
+							+ "operation: " + ((UnaryOp)hop).getOp().name());
+			}
+			else //general scalar case
+			{
+				if( TemplateUtils.isColVector(cdata1) )
+					cdata1 = new CNodeUnary(cdata1, UnaryType.LOOKUP_R);
+				else if( cdata1 instanceof CNodeData && hop.getInput().get(0).getDataType().isMatrix() )
+					cdata1 = new CNodeUnary(cdata1, UnaryType.LOOKUP_RC);
+				
+				String primitiveOpName = ((UnaryOp)hop).getOp().toString();
+				out = new CNodeUnary(cdata1, UnaryType.valueOf(primitiveOpName));
+			}
+		}
+		else if(hop instanceof BinaryOp)
+		{
+			CNode cdata1 = tmp.get(hop.getInput().get(0).getHopID());
+			CNode cdata2 = tmp.get(hop.getInput().get(1).getHopID());
+			
+			// if the first input is a matrix then we need to do vector by scalar operations
+			if(hop.getInput().get(0).getDim1() > 1 && hop.getInput().get(0).getDim2() > 1 )
+			{
+				if( HopRewriteUtils.isBinary(hop, SUPPORTED_VECT_BINARY) ) {
+					String opname = "VECT_"+((BinaryOp)hop).getOp().name()+"_SCALAR";
+					if( TemplateUtils.isColVector(cdata2) )
+						cdata2 = new CNodeUnary(cdata2, UnaryType.LOOKUP_R);
+					out = new CNodeBinary(cdata1, cdata2, BinType.valueOf(opname));
+				}
+				else 
+					throw new RuntimeException("Unsupported binary matrix "
+							+ "operation: " + ((BinaryOp)hop).getOp().name());
+			}
+			else //one input is a vector/scalar other is a scalar
+			{
+				String primitiveOpName = ((BinaryOp)hop).getOp().toString();
+				if( TemplateUtils.isColVector(cdata1) )
+					cdata1 = new CNodeUnary(cdata1, UnaryType.LOOKUP_R);
+				if( TemplateUtils.isColVector(cdata2) )
+					cdata2 = new CNodeUnary(cdata2, UnaryType.LOOKUP_R);
+				out = new CNodeBinary(cdata1, cdata2, BinType.valueOf(primitiveOpName));	
+			}
+		}
+		else if(hop instanceof TernaryOp) 
+		{
+			TernaryOp top = (TernaryOp) hop;
+			CNode cdata1 = tmp.get(hop.getInput().get(0).getHopID());
+			CNode cdata2 = tmp.get(hop.getInput().get(1).getHopID());
+			CNode cdata3 = tmp.get(hop.getInput().get(2).getHopID());
+			
+			//cdata1 is vector
+			if( TemplateUtils.isColVector(cdata1) )
+				cdata1 = new CNodeUnary(cdata1, UnaryType.LOOKUP_R);
+			else if( cdata1 instanceof CNodeData && hop.getInput().get(0).getDataType().isMatrix() )
+				cdata1 = new CNodeUnary(cdata1, UnaryType.LOOKUP_RC);
+			
+			//cdata3 is vector
+			if( TemplateUtils.isColVector(cdata3) )
+				cdata3 = new CNodeUnary(cdata3, UnaryType.LOOKUP_R);
+			else if( cdata3 instanceof CNodeData && hop.getInput().get(2).getDataType().isMatrix() )
+				cdata3 = new CNodeUnary(cdata3, UnaryType.LOOKUP_RC);
+			
+			//construct ternary cnode, primitive operation derived from OpOp3
+			out = new CNodeTernary(cdata1, cdata2, cdata3, 
+					TernaryType.valueOf(top.getOp().toString()));
+		}
+		else if( hop instanceof ParameterizedBuiltinOp ) 
+		{
+			CNode cdata1 = tmp.get(((ParameterizedBuiltinOp)hop).getTargetHop().getHopID());
+			if( TemplateUtils.isColVector(cdata1) )
+				cdata1 = new CNodeUnary(cdata1, UnaryType.LOOKUP_R);
+			else if( cdata1 instanceof CNodeData && hop.getInput().get(0).getDataType().isMatrix() )
+				cdata1 = new CNodeUnary(cdata1, UnaryType.LOOKUP_RC);
+			
+			CNode cdata2 = tmp.get(((ParameterizedBuiltinOp)hop).getParameterHop("pattern").getHopID());
+			CNode cdata3 = tmp.get(((ParameterizedBuiltinOp)hop).getParameterHop("replacement").getHopID());
+			TernaryType ttype = (cdata2.isLiteral() && cdata2.getVarname().equals("Double.NaN")) ? 
+					TernaryType.REPLACE_NAN : TernaryType.REPLACE;
+			out = new CNodeTernary(cdata1, cdata2, cdata3, ttype);
+		}
+		else if( hop instanceof IndexingOp ) 
+		{
+			CNode cdata1 = tmp.get(hop.getInput().get(0).getHopID());
+			out = new CNodeTernary(cdata1, 
+					TemplateUtils.createCNodeData(new LiteralOp(hop.getInput().get(0).getDim2()), true), 
+					TemplateUtils.createCNodeData(hop.getInput().get(4), true),
+					TernaryType.LOOKUP_RC1);
+		}
+		
+		if( out == null ) {
+			throw new RuntimeException(hop.getHopID()+" "+hop.getOpString());
+		}
+		
+		if( out.getDataType().isMatrix() ) {
+			out.setNumRows(hop.getDim1());
+			out.setNumCols(hop.getDim2());
+		}
+		
+		tmp.put(hop.getHopID(), out);
+	}
+	
+	/**
+	 * Comparator to order input hops of the rowwise template. We try 
+	 * to order matrices-vectors-scalars via sorting by number of cells but 
+	 * we keep the given main input always at the first position.
+	 */
+	public static class HopInputComparator implements Comparator<Hop> 
+	{
+		private final Hop _X;
+		
+		public HopInputComparator(Hop X) {
+			_X = X;
+		}
+		
+		@Override
+		public int compare(Hop h1, Hop h2) {
+			long ncells1 = h1.getDataType()==DataType.SCALAR ? Long.MIN_VALUE : 
+				(h1==_X) ? Long.MAX_VALUE : 
+				h1.dimsKnown() ? h1.getDim1()*h1.getDim2() : Long.MAX_VALUE-1;
+			long ncells2 = h2.getDataType()==DataType.SCALAR ? Long.MIN_VALUE : 
+				(h2==_X) ? Long.MAX_VALUE : 
+				h2.dimsKnown() ? h2.getDim1()*h2.getDim2() : Long.MAX_VALUE-1;
+			return (ncells1 > ncells2) ? -1 : (ncells1 < ncells2) ? 1 : 0; 
+		}
+	}
+}

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/eeb4f270/src/main/java/org/apache/sysml/hops/codegen/template/TemplateRowAgg.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/hops/codegen/template/TemplateRowAgg.java b/src/main/java/org/apache/sysml/hops/codegen/template/TemplateRowAgg.java
deleted file mode 100644
index 49d0cb8..0000000
--- a/src/main/java/org/apache/sysml/hops/codegen/template/TemplateRowAgg.java
+++ /dev/null
@@ -1,325 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- * 
- *   http://www.apache.org/licenses/LICENSE-2.0
- * 
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-package org.apache.sysml.hops.codegen.template;
-
-import java.util.ArrayList;
-import java.util.Comparator;
-import java.util.HashMap;
-import java.util.HashSet;
-import java.util.List;
-import java.util.stream.Collectors;
-
-import org.apache.sysml.hops.AggBinaryOp;
-import org.apache.sysml.hops.AggUnaryOp;
-import org.apache.sysml.hops.BinaryOp;
-import org.apache.sysml.hops.Hop;
-import org.apache.sysml.hops.IndexingOp;
-import org.apache.sysml.hops.LiteralOp;
-import org.apache.sysml.hops.ParameterizedBuiltinOp;
-import org.apache.sysml.hops.TernaryOp;
-import org.apache.sysml.hops.UnaryOp;
-import org.apache.sysml.hops.codegen.cplan.CNode;
-import org.apache.sysml.hops.codegen.cplan.CNodeBinary;
-import org.apache.sysml.hops.codegen.cplan.CNodeBinary.BinType;
-import org.apache.sysml.hops.codegen.cplan.CNodeTernary.TernaryType;
-import org.apache.sysml.hops.codegen.cplan.CNodeData;
-import org.apache.sysml.hops.codegen.cplan.CNodeRowAgg;
-import org.apache.sysml.hops.codegen.cplan.CNodeTernary;
-import org.apache.sysml.hops.codegen.cplan.CNodeTpl;
-import org.apache.sysml.hops.codegen.cplan.CNodeUnary;
-import org.apache.sysml.hops.codegen.cplan.CNodeUnary.UnaryType;
-import org.apache.sysml.hops.codegen.template.CPlanMemoTable.MemoTableEntry;
-import org.apache.sysml.hops.rewrite.HopRewriteUtils;
-import org.apache.sysml.hops.Hop.AggOp;
-import org.apache.sysml.hops.Hop.Direction;
-import org.apache.sysml.hops.Hop.OpOp2;
-import org.apache.sysml.parser.Expression.DataType;
-import org.apache.sysml.runtime.matrix.data.Pair;
-
-public class TemplateRowAgg extends TemplateBase 
-{
-	private static final Hop.AggOp[] SUPPORTED_ROW_AGG = new AggOp[]{AggOp.SUM, AggOp.MIN, AggOp.MAX};
-	private static final Hop.OpOp2[] SUPPORTED_VECT_BINARY = new OpOp2[]{OpOp2.MULT, OpOp2.DIV, 
-			OpOp2.EQUAL, OpOp2.NOTEQUAL, OpOp2.LESS, OpOp2.LESSEQUAL, OpOp2.GREATER, OpOp2.GREATEREQUAL};
-	
-	public TemplateRowAgg() {
-		super(TemplateType.RowAggTpl);
-	}
-	
-	public TemplateRowAgg(boolean closed) {
-		super(TemplateType.RowAggTpl, closed);
-	}
-	
-	@Override
-	public boolean open(Hop hop) {
-		return (hop instanceof AggBinaryOp && hop.getDim2()==1
-			&& hop.getInput().get(0).getDim1()>1 && hop.getInput().get(0).getDim2()>1)
-			|| (hop instanceof AggUnaryOp && ((AggUnaryOp)hop).getDirection()!=Direction.RowCol 
-				&& hop.getInput().get(0).getDim1()>1 && hop.getInput().get(0).getDim2()>1);
-	}
-
-	@Override
-	public boolean fuse(Hop hop, Hop input) {
-		return !isClosed() && 
-			(  (hop instanceof BinaryOp && (HopRewriteUtils.isBinaryMatrixColVectorOperation(hop)
-					|| HopRewriteUtils.isBinaryMatrixScalarOperation(hop)) ) 
-			|| ((hop instanceof UnaryOp || hop instanceof ParameterizedBuiltinOp) 
-					&& TemplateCell.isValidOperation(hop))		
-			|| (hop instanceof AggUnaryOp && ((AggUnaryOp)hop).getDirection()!=Direction.RowCol)
-			|| (hop instanceof AggBinaryOp && hop.getDim1()>1 
-				&& HopRewriteUtils.isTransposeOperation(hop.getInput().get(0))));
-	}
-
-	@Override
-	public boolean merge(Hop hop, Hop input) {
-		//merge rowagg tpl with cell tpl if input is a vector
-		return !isClosed() &&
-			((hop instanceof BinaryOp && input.getDim2()==1) //matrix-scalar/vector-vector ops )
-			 ||(hop instanceof AggBinaryOp && input.getDim2()==1
-				&& HopRewriteUtils.isTransposeOperation(hop.getInput().get(0))));
-	}
-
-	@Override
-	public CloseType close(Hop hop) {
-		//close on column aggregate (e.g., colSums, t(X)%*%y)
-		if( hop instanceof AggUnaryOp && ((AggUnaryOp)hop).getDirection()==Direction.Col
-			|| (hop instanceof AggBinaryOp && HopRewriteUtils.isTransposeOperation(hop.getInput().get(0))) )
-			return CloseType.CLOSED_VALID;
-		else
-			return CloseType.OPEN;
-	}
-
-	@Override
-	public Pair<Hop[], CNodeTpl> constructCplan(Hop hop, CPlanMemoTable memo, boolean compileLiterals) {
-		//recursively process required cplan output
-		HashSet<Hop> inHops = new HashSet<Hop>();
-		HashMap<String, Hop> inHops2 = new HashMap<String,Hop>();
-		HashMap<Long, CNode> tmp = new HashMap<Long, CNode>();
-		hop.resetVisitStatus();
-		rConstructCplan(hop, memo, tmp, inHops, inHops2, compileLiterals);
-		hop.resetVisitStatus();
-		
-		//reorder inputs (ensure matrix is first input, and other inputs ordered by size)
-		List<Hop> sinHops = inHops.stream()
-			.filter(h -> !(h.getDataType().isScalar() && tmp.get(h.getHopID()).isLiteral()))
-			.sorted(new HopInputComparator(inHops2.get("X"))).collect(Collectors.toList());
-		
-		//construct template node
-		ArrayList<CNode> inputs = new ArrayList<CNode>();
-		for( Hop in : sinHops )
-			inputs.add(tmp.get(in.getHopID()));
-		CNode output = tmp.get(hop.getHopID());
-		CNodeRowAgg tpl = new CNodeRowAgg(inputs, output);
-		tpl.setNumVectorIntermediates(TemplateUtils
-			.countVectorIntermediates(output));
-		
-		// return cplan instance
-		return new Pair<Hop[],CNodeTpl>(sinHops.toArray(new Hop[0]), tpl);
-	}
-
-	private void rConstructCplan(Hop hop, CPlanMemoTable memo, HashMap<Long, CNode> tmp, HashSet<Hop> inHops, HashMap<String, Hop> inHops2, boolean compileLiterals) 
-	{	
-		//memoization for common subexpression elimination and to avoid redundant work 
-		if( tmp.containsKey(hop.getHopID()) )
-			return;
-		
-		//recursively process required childs
-		MemoTableEntry me = memo.getBest(hop.getHopID(), TemplateType.RowAggTpl);
-		for( int i=0; i<hop.getInput().size(); i++ ) {
-			Hop c = hop.getInput().get(i);
-			if( me.isPlanRef(i) )
-				rConstructCplan(c, memo, tmp, inHops, inHops2, compileLiterals);
-			else {
-				CNodeData cdata = TemplateUtils.createCNodeData(c, compileLiterals);	
-				tmp.put(c.getHopID(), cdata);
-				inHops.add(c);
-			}
-		}
-		
-		//construct cnode for current hop
-		CNode out = null;
-		if(hop instanceof AggUnaryOp)
-		{
-			CNode cdata1 = tmp.get(hop.getInput().get(0).getHopID());
-			if( ((AggUnaryOp)hop).getDirection() == Direction.Row && HopRewriteUtils.isAggUnaryOp(hop, SUPPORTED_ROW_AGG) ) {
-				if(hop.getInput().get(0).getDim2()==1)
-					out = (cdata1.getDataType()==DataType.SCALAR) ? cdata1 : new CNodeUnary(cdata1,UnaryType.LOOKUP_R);
-				else {
-					String opcode = "ROW_"+((AggUnaryOp)hop).getOp().name().toUpperCase()+"S";
-					out = new CNodeUnary(cdata1, UnaryType.valueOf(opcode));
-					inHops2.put("X", hop.getInput().get(0));
-				}
-			}
-			else  if (((AggUnaryOp)hop).getDirection() == Direction.Col && ((AggUnaryOp)hop).getOp() == AggOp.SUM ) {
-				//vector add without temporary copy
-				if( cdata1 instanceof CNodeBinary && ((CNodeBinary)cdata1).getType().isVectorScalarPrimitive() )
-					out = new CNodeBinary(cdata1.getInput().get(0), cdata1.getInput().get(1), 
-							((CNodeBinary)cdata1).getType().getVectorAddPrimitive());
-				else	
-					out = cdata1;
-			}
-		}
-		else if(hop instanceof AggBinaryOp)
-		{
-			CNode cdata1 = tmp.get(hop.getInput().get(0).getHopID());
-			CNode cdata2 = tmp.get(hop.getInput().get(1).getHopID());
-			
-			if( HopRewriteUtils.isTransposeOperation(hop.getInput().get(0)) )
-			{
-				//correct input under transpose
-				cdata1 = TemplateUtils.skipTranspose(cdata1, hop.getInput().get(0), tmp, compileLiterals);
-				inHops.remove(hop.getInput().get(0)); 
-				inHops.add(hop.getInput().get(0).getInput().get(0));
-				
-				out = new CNodeBinary(cdata1, cdata2, BinType.VECT_MULT_ADD);
-			}
-			else
-			{
-				if(hop.getInput().get(0).getDim2()==1 && hop.getInput().get(1).getDim2()==1)
-					out = new CNodeBinary((cdata1.getDataType()==DataType.SCALAR)? cdata1 : new CNodeUnary(cdata1, UnaryType.LOOKUP0),
-						(cdata2.getDataType()==DataType.SCALAR)? cdata2 : new CNodeUnary(cdata2, UnaryType.LOOKUP0), BinType.MULT);
-				else {
-					out = new CNodeBinary(cdata1, cdata2, BinType.DOT_PRODUCT);
-					inHops2.put("X", hop.getInput().get(0));
-				}
-			}
-		}
-		else if(hop instanceof UnaryOp)
-		{
-			CNode cdata1 = tmp.get(hop.getInput().get(0).getHopID());
-			if( TemplateUtils.isColVector(cdata1) )
-				cdata1 = new CNodeUnary(cdata1, UnaryType.LOOKUP_R);
-			else if( cdata1 instanceof CNodeData && hop.getInput().get(0).getDataType().isMatrix() )
-				cdata1 = new CNodeUnary(cdata1, UnaryType.LOOKUP_RC);
-			
-			String primitiveOpName = ((UnaryOp)hop).getOp().toString();
-			out = new CNodeUnary(cdata1, UnaryType.valueOf(primitiveOpName));
-		}
-		else if(hop instanceof BinaryOp)
-		{
-			CNode cdata1 = tmp.get(hop.getInput().get(0).getHopID());
-			CNode cdata2 = tmp.get(hop.getInput().get(1).getHopID());
-			
-			// if one input is a matrix then we need to do vector by scalar operations
-			if(hop.getInput().get(0).getDim1() > 1 && hop.getInput().get(0).getDim2() > 1 )
-			{
-				if( HopRewriteUtils.isBinary(hop, SUPPORTED_VECT_BINARY) ) {
-					String opname = "VECT_"+((BinaryOp)hop).getOp().name()+"_SCALAR";
-					if( TemplateUtils.isColVector(cdata2) )
-						cdata2 = new CNodeUnary(cdata2, UnaryType.LOOKUP_R);
-					out = new CNodeBinary(cdata1, cdata2, BinType.valueOf(opname));
-				}
-				else 
-					throw new RuntimeException("Unsupported binary matrix "
-							+ "operation: " + ((BinaryOp)hop).getOp().name());
-			}
-			else //one input is a vector/scalar other is a scalar
-			{
-				String primitiveOpName = ((BinaryOp)hop).getOp().toString();
-				if( TemplateUtils.isColVector(cdata1) )
-					cdata1 = new CNodeUnary(cdata1, UnaryType.LOOKUP_R);
-				if( TemplateUtils.isColVector(cdata2) )
-					cdata2 = new CNodeUnary(cdata2, UnaryType.LOOKUP_R);
-				out = new CNodeBinary(cdata1, cdata2, BinType.valueOf(primitiveOpName));	
-			}
-		}
-		else if(hop instanceof TernaryOp) 
-		{
-			TernaryOp top = (TernaryOp) hop;
-			CNode cdata1 = tmp.get(hop.getInput().get(0).getHopID());
-			CNode cdata2 = tmp.get(hop.getInput().get(1).getHopID());
-			CNode cdata3 = tmp.get(hop.getInput().get(2).getHopID());
-			
-			//cdata1 is vector
-			if( TemplateUtils.isColVector(cdata1) )
-				cdata1 = new CNodeUnary(cdata1, UnaryType.LOOKUP_R);
-			else if( cdata1 instanceof CNodeData && hop.getInput().get(0).getDataType().isMatrix() )
-				cdata1 = new CNodeUnary(cdata1, UnaryType.LOOKUP_RC);
-			
-			//cdata3 is vector
-			if( TemplateUtils.isColVector(cdata3) )
-				cdata3 = new CNodeUnary(cdata3, UnaryType.LOOKUP_R);
-			else if( cdata3 instanceof CNodeData && hop.getInput().get(2).getDataType().isMatrix() )
-				cdata3 = new CNodeUnary(cdata3, UnaryType.LOOKUP_RC);
-			
-			//construct ternary cnode, primitive operation derived from OpOp3
-			out = new CNodeTernary(cdata1, cdata2, cdata3, 
-					TernaryType.valueOf(top.getOp().toString()));
-		}
-		else if( hop instanceof ParameterizedBuiltinOp ) 
-		{
-			CNode cdata1 = tmp.get(((ParameterizedBuiltinOp)hop).getTargetHop().getHopID());
-			if( TemplateUtils.isColVector(cdata1) )
-				cdata1 = new CNodeUnary(cdata1, UnaryType.LOOKUP_R);
-			else if( cdata1 instanceof CNodeData && hop.getInput().get(0).getDataType().isMatrix() )
-				cdata1 = new CNodeUnary(cdata1, UnaryType.LOOKUP_RC);
-			
-			CNode cdata2 = tmp.get(((ParameterizedBuiltinOp)hop).getParameterHop("pattern").getHopID());
-			CNode cdata3 = tmp.get(((ParameterizedBuiltinOp)hop).getParameterHop("replacement").getHopID());
-			TernaryType ttype = (cdata2.isLiteral() && cdata2.getVarname().equals("Double.NaN")) ? 
-					TernaryType.REPLACE_NAN : TernaryType.REPLACE;
-			out = new CNodeTernary(cdata1, cdata2, cdata3, ttype);
-		}
-		else if( hop instanceof IndexingOp ) 
-		{
-			CNode cdata1 = tmp.get(hop.getInput().get(0).getHopID());
-			out = new CNodeTernary(cdata1, 
-					TemplateUtils.createCNodeData(new LiteralOp(hop.getInput().get(0).getDim2()), true), 
-					TemplateUtils.createCNodeData(hop.getInput().get(4), true),
-					TernaryType.LOOKUP_RC1);
-		}
-		
-		if( out == null ) {
-			throw new RuntimeException(hop.getHopID()+" "+hop.getOpString());
-		}
-		
-		if( out.getDataType().isMatrix() ) {
-			out.setNumRows(hop.getDim1());
-			out.setNumCols(hop.getDim2());
-		}
-		
-		tmp.put(hop.getHopID(), out);
-	}
-	
-	/**
-	 * Comparator to order input hops of the row aggregate template. We try 
-	 * to order matrices-vectors-scalars via sorting by number of cells but 
-	 * we keep the given main input always at the first position.
-	 */
-	public static class HopInputComparator implements Comparator<Hop> 
-	{
-		private final Hop _X;
-		
-		public HopInputComparator(Hop X) {
-			_X = X;
-		}
-		
-		@Override
-		public int compare(Hop h1, Hop h2) {
-			long ncells1 = h1.getDataType()==DataType.SCALAR ? Long.MIN_VALUE : 
-				(h1==_X) ? Long.MAX_VALUE : 
-				h1.dimsKnown() ? h1.getDim1()*h1.getDim2() : Long.MAX_VALUE-1;
-			long ncells2 = h2.getDataType()==DataType.SCALAR ? Long.MIN_VALUE : 
-				(h2==_X) ? Long.MAX_VALUE : 
-				h2.dimsKnown() ? h2.getDim1()*h2.getDim2() : Long.MAX_VALUE-1;
-			return (ncells1 > ncells2) ? -1 : (ncells1 < ncells2) ? 1 : 0; 
-		}
-	}
-}

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/eeb4f270/src/main/java/org/apache/sysml/hops/codegen/template/TemplateUtils.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/hops/codegen/template/TemplateUtils.java b/src/main/java/org/apache/sysml/hops/codegen/template/TemplateUtils.java
index e8d2086..502e0ef 100644
--- a/src/main/java/org/apache/sysml/hops/codegen/template/TemplateUtils.java
+++ b/src/main/java/org/apache/sysml/hops/codegen/template/TemplateUtils.java
@@ -43,6 +43,7 @@ import org.apache.sysml.hops.codegen.cplan.CNodeData;
 import org.apache.sysml.hops.codegen.cplan.CNodeTernary;
 import org.apache.sysml.hops.codegen.cplan.CNodeUnary;
 import org.apache.sysml.hops.codegen.cplan.CNodeUnary.UnaryType;
+import org.apache.sysml.hops.codegen.template.CPlanMemoTable.MemoTableEntry;
 import org.apache.sysml.hops.codegen.template.TemplateBase.TemplateType;
 import org.apache.sysml.hops.rewrite.HopRewriteUtils;
 import org.apache.sysml.hops.codegen.cplan.CNodeTernary.TernaryType;
@@ -50,11 +51,12 @@ import org.apache.sysml.hops.codegen.cplan.CNodeTpl;
 import org.apache.sysml.parser.Expression.DataType;
 import org.apache.sysml.runtime.codegen.SpoofCellwise.CellType;
 import org.apache.sysml.runtime.codegen.SpoofOuterProduct.OutProdType;
+import org.apache.sysml.runtime.codegen.SpoofRowwise.RowType;
 import org.apache.sysml.runtime.util.UtilFunctions;
 
 public class TemplateUtils 
 {
-	public static final TemplateBase[] TEMPLATES = new TemplateBase[]{new TemplateRowAgg(), new TemplateCell(), new TemplateOuterProduct()};
+	public static final TemplateBase[] TEMPLATES = new TemplateBase[]{new TemplateRow(), new TemplateCell(), new TemplateOuterProduct()};
 	
 	public static boolean isVector(Hop hop) {
 		return (hop.getDataType() == DataType.MATRIX 
@@ -99,6 +101,13 @@ public class TemplateUtils
 			&& left.getDataType().isMatrix() && right.getDataType().isMatrix()
 			&& left.getDim2() > right.getDim2();
 	}
+	
+	public static boolean hasMatrixInput( Hop hop ) {
+		for( Hop c : hop.getInput() )
+			if( isMatrix(c) )
+				return true;
+		return false;
+	}
 
 	public static boolean isOperationSupported(Hop h) {
 		if(h instanceof  UnaryOp)
@@ -187,7 +196,7 @@ public class TemplateUtils
 		TemplateBase tpl = null;
 		switch( type ) {
 			case CellTpl: tpl = new TemplateCell(closed); break;
-			case RowAggTpl: tpl = new TemplateRowAgg(closed); break;
+			case RowTpl: tpl = new TemplateRow(closed); break;
 			case MultiAggTpl: tpl = new TemplateMultiAgg(closed); break;
 			case OuterProdTpl: tpl = new TemplateOuterProduct(closed); break;
 		}
@@ -197,8 +206,8 @@ public class TemplateUtils
 	public static TemplateBase[] createCompatibleTemplates(TemplateType type, boolean closed) {
 		TemplateBase[] tpl = null;
 		switch( type ) {
-			case CellTpl: tpl = new TemplateBase[]{new TemplateCell(closed), new TemplateRowAgg(closed)}; break;
-			case RowAggTpl: tpl = new TemplateBase[]{new TemplateRowAgg(closed)}; break;
+			case CellTpl: tpl = new TemplateBase[]{new TemplateCell(closed), new TemplateRow(closed)}; break;
+			case RowTpl: tpl = new TemplateBase[]{new TemplateRow(closed)}; break;
 			case MultiAggTpl: tpl = new TemplateBase[]{new TemplateMultiAgg(closed)}; break;
 			case OuterProdTpl: tpl = new TemplateBase[]{new TemplateOuterProduct(closed)}; break;
 		}
@@ -211,6 +220,17 @@ public class TemplateUtils
 			CellType.FULL_AGG : CellType.ROW_AGG) : CellType.NO_AGG;
 	}
 	
+	public static RowType getRowType(Hop output, Hop input) {
+		if( HopRewriteUtils.isEqualSize(output, input) )
+			return RowType.NO_AGG;
+		else if( output.getDim1()==input.getDim1() && output.getDim2()==1 )
+			return RowType.ROW_AGG;
+		else if( output.getDim1()==input.getDim2() && output.getDim2()==1 )
+			return RowType.COL_AGG_T;
+		else
+			return RowType.COL_AGG;
+	}
+	
 	public static AggOp getAggOp(Hop hop) {
 		return (hop instanceof AggUnaryOp) ? ((AggUnaryOp)hop).getOp() :
 			(hop instanceof AggBinaryOp) ? AggOp.SUM : null;
@@ -292,11 +312,36 @@ public class TemplateUtils
 		int ret = 0;
 		for( CNode c : node.getInput() )
 			ret += countVectorIntermediates(c);
-		return ret + ((node instanceof CNodeBinary 
+		int cntBin = ((node instanceof CNodeBinary 
 			&& ((CNodeBinary)node).getType().isVectorScalarPrimitive()) ? 1 : 0);
+		int cntUn = ((node instanceof CNodeUnary
+				&& ((CNodeUnary)node).getType().isVectorScalarPrimitive()) ? 1 : 0);
+		return ret + cntBin + cntUn;
 	}
 
 	public static boolean isType(TemplateType type, TemplateType... validTypes) {
 		return ArrayUtils.contains(validTypes, type);
 	}
+	
+	public static boolean hasCommonRowTemplateMatrixInput(Hop input1, Hop input2, CPlanMemoTable memo) {
+		//if second input has no row template, it's always true
+		if( !memo.contains(input2.getHopID(), TemplateType.RowTpl) )
+			return true;
+		//check for common row template input
+		return getRowTemplateMatrixInput(input1, memo)
+			== getRowTemplateMatrixInput(input2, memo);
+	}
+	
+	public static long getRowTemplateMatrixInput(Hop current, CPlanMemoTable memo) {
+		MemoTableEntry me = memo.getBest(current.getHopID(), TemplateType.RowTpl);
+		long ret = -1;
+		for( int i=0; ret<0 && i<current.getInput().size(); i++ ) {
+			Hop input = current.getInput().get(i);
+			if( me.isPlanRef(i) && memo.contains(input.getHopID(), TemplateType.RowTpl) )
+				ret = getRowTemplateMatrixInput(input, memo);
+			else if( !me.isPlanRef(i) && isMatrix(input) )
+				ret = input.getHopID();
+		}
+		return ret;
+	}
 }

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/eeb4f270/src/main/java/org/apache/sysml/hops/rewrite/HopRewriteUtils.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/hops/rewrite/HopRewriteUtils.java b/src/main/java/org/apache/sysml/hops/rewrite/HopRewriteUtils.java
index a4b6ec1..0bad2f6 100644
--- a/src/main/java/org/apache/sysml/hops/rewrite/HopRewriteUtils.java
+++ b/src/main/java/org/apache/sysml/hops/rewrite/HopRewriteUtils.java
@@ -792,13 +792,8 @@ public class HopRewriteUtils
 	}
 	
 	public static boolean isBinary(Hop hop, OpOp2... types) {
-		if( hop instanceof BinaryOp ) {
-			BinaryOp bop = (BinaryOp) hop;
-			for( OpOp2 type : types )
-				if( type == bop.getOp() )
-					return true;
-		}
-		return false;
+		return ( hop instanceof BinaryOp 
+			&& ArrayUtils.contains(types, ((BinaryOp) hop).getOp()));
 	}
 	
 	public static boolean isBinary(Hop hop, OpOp2 type, int maxParents) {
@@ -832,6 +827,11 @@ public class HopRewriteUtils
 		return isUnary(hop, type) && hop.getParent().size() <= maxParents;
 	}
 	
+	public static boolean isUnary(Hop hop, OpOp1... types) {
+		return ( hop instanceof UnaryOp 
+			&& ArrayUtils.contains(types, ((UnaryOp) hop).getOp()));
+	}
+	
 	public static boolean isMatrixMultiply(Hop hop) {
 		return hop instanceof AggBinaryOp && ((AggBinaryOp)hop).isMatrixMultiply();
 	}

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/eeb4f270/src/main/java/org/apache/sysml/runtime/codegen/LibSpoofPrimitives.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/codegen/LibSpoofPrimitives.java b/src/main/java/org/apache/sysml/runtime/codegen/LibSpoofPrimitives.java
index 6907b0b..9283c46 100644
--- a/src/main/java/org/apache/sysml/runtime/codegen/LibSpoofPrimitives.java
+++ b/src/main/java/org/apache/sysml/runtime/codegen/LibSpoofPrimitives.java
@@ -22,6 +22,7 @@ package org.apache.sysml.runtime.codegen;
 import java.util.Arrays;
 import java.util.LinkedList;
 
+import org.apache.commons.math3.util.FastMath;
 import org.apache.sysml.runtime.functionobjects.IntegerDivide;
 import org.apache.sysml.runtime.functionobjects.Modulus;
 import org.apache.sysml.runtime.matrix.data.LibMatrixMult;
@@ -73,6 +74,10 @@ public class LibSpoofPrimitives
 		LibMatrixMult.vectMultiplyAdd(bval, a, c, bix, bi, 0, len);
 		return c;
 	}
+	
+	public static void vectWrite(double[] a, double[] c, int ci, int len) {
+		System.arraycopy(a, 0, c, ci, len);
+	}
 
 	// custom vector sums, mins, maxs
 	
@@ -113,32 +118,14 @@ public class LibSpoofPrimitives
 	 * @return sum value
 	 */
 	public static double vectSum(double[] avals, int[] aix, int ai, int len) {
-		double val = 0;
-		final int bn = len%8;
-				
-		//compute rest
-		for( int i = ai; i < ai+bn; i++ )
-			val += avals[ aix[i] ];
-		
-		//unrolled 8-block (for better instruction-level parallelism)
-		for( int i = ai+bn; i < ai+len; i+=8 )
-		{
-			//read 64B of a via 'gather'
-			//compute cval' = sum(a) + cval
-			val += avals[ aix[i+0] ] + avals[ aix[i+1] ]
-			     + avals[ aix[i+2] ] + avals[ aix[i+3] ]
-			     + avals[ aix[i+4] ] + avals[ aix[i+5] ]
-			     + avals[ aix[i+6] ] + avals[ aix[i+7] ];
-		}
-		
-		//scalar result
-		return val; 
+		//forward to dense as column indexes not required here
+		return vectSum(avals, ai, len);
 	}
 	
 	public static double vectMin(double[] a, int ai, int len) { 
 		double val = Double.MAX_VALUE;
 		for( int i = ai; i < ai+len; i++ )
-			val = Math.min(a[ai], val);
+			val = Math.min(a[i], val);
 		return val; 
 	} 
 	
@@ -152,7 +139,7 @@ public class LibSpoofPrimitives
 	public static double vectMax(double[] a, int ai, int len) { 
 		double val = -Double.MAX_VALUE;
 		for( int i = ai; i < ai+len; i++ )
-			val = Math.max(a[ai], val);
+			val = Math.max(a[i], val);
 		return val; 
 	} 
 	
@@ -189,6 +176,84 @@ public class LibSpoofPrimitives
 		return c;
 	}
 	
+	//custom vector minus
+	
+	public static void vectMinusAdd(double[] a, double bval, double[] c, int ai, int ci, int len) {
+		for( int j = ai; j < ai+len; j++, ci++)
+			c[ci] +=  a[j] - bval;
+	} 
+
+	public static void vectMinusAdd(double[] a, double bval, double[] c, int[] aix, int ai, int ci, int len) {
+		for( int j = ai; j < ai+len; j++ )
+			c[ci + aix[j]] += a[j] - bval;
+	}
+	
+	public static double[] vectMinusWrite(double[] a, double bval, int ai, int len) {
+		double[] c = allocVector(len, false);
+		for( int j = 0; j < len; j++, ai++)
+			c[j] = a[ai] - bval;
+		return c;
+	}
+
+	public static double[] vectMinusWrite(double[] a, double bval, int[] aix, int ai, int len) {
+		double[] c = allocVector(len, true);
+		for( int j = ai; j < ai+len; j++ )
+			c[aix[j]] = a[j] - bval;
+		return c;
+	}
+
+	//custom exp
+	
+	public static void vectExpAdd(double[] a, double[] c, int ai, int ci, int len) {
+		for( int j = ai; j < ai+len; j++, ci++)
+			c[ci] +=  FastMath.exp(a[j]);
+	} 
+
+	public static void vectExpAdd(double[] a, double[] c, int[] aix, int ai, int ci, int len) {
+		for( int j = ai; j < ai+len; j++ )
+			c[ci + aix[j]] += FastMath.exp(a[j]);
+	}
+	
+	public static double[] vectExpWrite(double[] a, int ai, int len) {
+		double[] c = allocVector(len, false);
+		for( int j = 0; j < len; j++, ai++)
+			c[j] = FastMath.exp(a[ai]);
+		return c;
+	}
+
+	public static double[] vectExpWrite(double[] a, int[] aix, int ai, int len) {
+		double[] c = allocVector(len, true);
+		for( int j = ai; j < ai+len; j++ )
+			c[aix[j]] = FastMath.exp(a[j]);
+		return c;
+	}
+
+	//custom log
+	
+	public static void vectLogAdd(double[] a, double[] c, int ai, int ci, int len) {
+		for( int j = ai; j < ai+len; j++, ci++)
+			c[ci] +=  FastMath.log(a[j]);
+	} 
+
+	public static void vectLogAdd(double[] a, double[] c, int[] aix, int ai, int ci, int len) {
+		for( int j = ai; j < ai+len; j++ )
+			c[ci + aix[j]] += FastMath.log(a[j]);
+	}
+	
+	public static double[] vectLogWrite(double[] a, int ai, int len) {
+		double[] c = allocVector(len, false);
+		for( int j = 0; j < len; j++, ai++)
+			c[j] = FastMath.log(a[ai]);
+		return c;
+	}
+
+	public static double[] vectLogWrite(double[] a, int[] aix, int ai, int len) {
+		double[] c = allocVector(len, true);
+		for( int j = ai; j < ai+len; j++ )
+			c[aix[j]] = FastMath.log(a[j]);
+		return c;
+	}
+	
 	//custom vector equal
 	
 	public static void vectEqualAdd(double[] a, double bval, double[] c, int ai, int ci, int len) {

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/eeb4f270/src/main/java/org/apache/sysml/runtime/codegen/SpoofRowAggregate.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/codegen/SpoofRowAggregate.java b/src/main/java/org/apache/sysml/runtime/codegen/SpoofRowAggregate.java
deleted file mode 100644
index 0224b9a..0000000
--- a/src/main/java/org/apache/sysml/runtime/codegen/SpoofRowAggregate.java
+++ /dev/null
@@ -1,208 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- * 
- *   http://www.apache.org/licenses/LICENSE-2.0
- * 
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-package org.apache.sysml.runtime.codegen;
-
-import java.util.ArrayList;
-import java.util.List;
-import java.util.concurrent.Callable;
-import java.util.concurrent.ExecutorService;
-import java.util.concurrent.Executors;
-import java.util.concurrent.Future;
-
-import org.apache.sysml.runtime.DMLRuntimeException;
-import org.apache.sysml.runtime.instructions.cp.ScalarObject;
-import org.apache.sysml.runtime.matrix.data.LibMatrixMult;
-import org.apache.sysml.runtime.matrix.data.MatrixBlock;
-import org.apache.sysml.runtime.matrix.data.SparseBlock;
-import org.apache.sysml.runtime.util.UtilFunctions;
-
-
-public abstract class SpoofRowAggregate extends SpoofOperator
-{
-	private static final long serialVersionUID = 6242910797139642998L;
-	private static final long PAR_NUMCELL_THRESHOLD = 1024*1024;   //Min 1M elements
-	
-	protected final boolean _colVector;
-	protected final int _reqVectMem;
-	
-	public SpoofRowAggregate(boolean colVector, int reqVectMem) {
-		_colVector = colVector;
-		_reqVectMem = reqVectMem;
-	}
-
-	@Override
-	public String getSpoofType() {
-		return "RA" +  getClass().getName().split("\\.")[1];
-	}
-	
-	@Override
-	public void execute(ArrayList<MatrixBlock> inputs, ArrayList<ScalarObject> scalarObjects, MatrixBlock out)	
-		throws DMLRuntimeException
-	{
-		//sanity check
-		if( inputs==null || inputs.size() < 1 || out==null )
-			throw new RuntimeException("Invalid input arguments.");
-		
-		//result allocation and preparations
-		out.reset(_colVector ? inputs.get(0).getNumColumns() : 1, 
-			_colVector ? 1 : inputs.get(0).getNumColumns(), false);
-		out.allocateDenseBlock();
-		double[] c = out.getDenseBlock();
-		
-		//input preparation
-		double[][] b = prepInputMatrices(inputs);
-		double[] scalars = prepInputScalars(scalarObjects);
-		
-		//core sequential execute
-		final int m = inputs.get(0).getNumRows();
-		final int n = inputs.get(0).getNumColumns();		
-		LibSpoofPrimitives.setupThreadLocalMemory(_reqVectMem, out.getNumColumns());
-		if( !inputs.get(0).isInSparseFormat() )
-			executeDense(inputs.get(0).getDenseBlock(), b, scalars, c, n, 0, m);
-		else
-			executeSparse(inputs.get(0).getSparseBlock(), b, scalars, c, n, 0, m);
-	
-		//post-processing
-		LibSpoofPrimitives.cleanupThreadLocalMemory();
-		out.recomputeNonZeros();	
-	}
-	
-	@Override
-	public void execute(ArrayList<MatrixBlock> inputs, ArrayList<ScalarObject> scalarObjects, MatrixBlock out, int k)	
-		throws DMLRuntimeException
-	{
-		//redirect to serial execution
-		if( k <= 1 || (long)inputs.get(0).getNumRows()*inputs.get(0).getNumColumns()<PAR_NUMCELL_THRESHOLD ) {
-			execute(inputs, scalarObjects, out);
-			return;
-		}
-		
-		//sanity check
-		if( inputs==null || inputs.size() < 1 || out==null )
-			throw new RuntimeException("Invalid input arguments.");
-		
-		//result allocation and preparations
-		out.reset(_colVector ? inputs.get(0).getNumColumns() : 1, 
-			_colVector ? 1 : inputs.get(0).getNumColumns(), false);
-		out.allocateDenseBlock();
-		
-		//input preparation
-		double[][] b = prepInputMatrices(inputs);
-		double[] scalars = prepInputScalars(scalarObjects);
-		
-		//core parallel execute
-		final int m = inputs.get(0).getNumRows();
-		final int n = inputs.get(0).getNumColumns();		
-		try {
-			ExecutorService pool = Executors.newFixedThreadPool( k );
-			ArrayList<ParExecTask> tasks = new ArrayList<ParExecTask>();
-			int nk = UtilFunctions.roundToNext(Math.min(8*k,m/32), k);
-			int blklen = (int)(Math.ceil((double)m/nk));
-			for( int i=0; i<nk & i*blklen<m; i++ )
-				tasks.add(new ParExecTask(inputs.get(0), b, scalars, n, i*blklen, Math.min((i+1)*blklen, m)));
-			//execute tasks
-			List<Future<double[]>> taskret = pool.invokeAll(tasks);	
-			pool.shutdown();
-			//aggregate partial results
-			for( Future<double[]> task : taskret )
-				LibMatrixMult.vectAdd(task.get(), out.getDenseBlock(), 0, 0, n);
-		}
-		catch(Exception ex) {
-			throw new DMLRuntimeException(ex);
-		}
-		
-		//post-processing
-		out.recomputeNonZeros();	
-	}
-	
-	private void executeDense(double[] a, double[][] b, double[] scalars, double[] c, int n, int rl, int ru) 
-	{
-		if( a == null )
-			return;
-		
-		for( int i=rl, aix=rl*n; i<ru; i++, aix+=n ) {
-			//call generated method
-			genexecRowDense( a, aix, b, scalars, c, n, i );
-		}
-	}
-	
-	private void executeSparse(SparseBlock sblock, double[][] b, double[] scalars, double[] c, int n, int rl, int ru) 
-	{
-		if( sblock == null )
-			return;
-			
-		for( int i=rl; i<ru; i++ ) {
-			if( !sblock.isEmpty(i) ) {
-				double[] avals = sblock.values(i);
-				int[] aix = sblock.indexes(i);
-				int apos = sblock.pos(i);
-				int alen = sblock.size(i);
-				
-				//call generated method
-				genexecRowSparse(avals, aix, apos, b, scalars, c, alen, i);
-			}
-		}
-	}
-	
-	//methods to be implemented by generated operators of type SpoofRowAggrgate 
-	
-	protected abstract void genexecRowDense( double[] a, int ai, double[][] b, double[] scalars, double[] c, int len, int rowIndex );
-	
-	protected abstract void genexecRowSparse( double[] avals, int[] aix, int ai, double[][] b, double[] scalars, double[] c, int len, int rowIndex );
-
-	
-	/**
-	 * Task for multi-threaded operations.
-	 */
-	private class ParExecTask implements Callable<double[]> 
-	{
-		private final MatrixBlock _a;
-		private final double[][] _b;
-		private final double[] _scalars;
-		private final int _clen;
-		private final int _rl;
-		private final int _ru;
-
-		protected ParExecTask( MatrixBlock a, double[][] b, double[] scalars, int clen, int rl, int ru ) {
-			_a = a;
-			_b = b;
-			_scalars = scalars;
-			_clen = clen;
-			_rl = rl;
-			_ru = ru;
-		}
-		
-		@Override
-		public double[] call() throws DMLRuntimeException {
-			
-			//allocate vector intermediates and partial output
-			LibSpoofPrimitives.setupThreadLocalMemory(_reqVectMem, _clen);
-			double[] c = new double[_clen];
-			
-			if( !_a.isInSparseFormat() )
-				executeDense(_a.getDenseBlock(), _b, _scalars, c, _clen, _rl, _ru);
-			else
-				executeSparse(_a.getSparseBlock(), _b, _scalars, c, _clen, _rl, _ru);
-			
-			LibSpoofPrimitives.cleanupThreadLocalMemory();
-			return c;
-		}
-	}
-}

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/eeb4f270/src/main/java/org/apache/sysml/runtime/codegen/SpoofRowwise.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/codegen/SpoofRowwise.java b/src/main/java/org/apache/sysml/runtime/codegen/SpoofRowwise.java
new file mode 100644
index 0000000..b100a89
--- /dev/null
+++ b/src/main/java/org/apache/sysml/runtime/codegen/SpoofRowwise.java
@@ -0,0 +1,323 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ * 
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.sysml.runtime.codegen;
+
+import java.util.ArrayList;
+import java.util.List;
+import java.util.concurrent.Callable;
+import java.util.concurrent.ExecutorService;
+import java.util.concurrent.Executors;
+import java.util.concurrent.Future;
+
+import org.apache.sysml.runtime.DMLRuntimeException;
+import org.apache.sysml.runtime.instructions.cp.ScalarObject;
+import org.apache.sysml.runtime.matrix.data.LibMatrixMult;
+import org.apache.sysml.runtime.matrix.data.MatrixBlock;
+import org.apache.sysml.runtime.matrix.data.SparseBlock;
+import org.apache.sysml.runtime.util.UtilFunctions;
+
+
+/**
+ * Base class of generated rowwise operators. The first input matrix is
+ * scanned row by row and every row is passed to the generated method
+ * genexecRowDense or genexecRowSparse; the dimensions of the output are
+ * determined by the {@link RowType} of the operator.
+ */
+public abstract class SpoofRowwise extends SpoofOperator
+{
+	private static final long serialVersionUID = 6242910797139642998L;
+	private static final long PAR_NUMCELL_THRESHOLD = 1024*1024;   //Min 1M elements
+	
+	/**
+	 * Aggregation type of a rowwise operator, which defines the output
+	 * dimensions for an m x n main input.
+	 */
+	public enum RowType {
+		NO_AGG,    //no aggregation, m x n output
+		ROW_AGG,   //row aggregation (e.g., rowSums() or X %*% v), m x 1 output
+		COL_AGG,   //col aggregation (e.g., colSums() or t(y) %*% X), 1 x n output
+		COL_AGG_T; //transposed col aggregation (e.g., t(X) %*% y), n x 1 output
+		
+		public boolean isColumnAgg() {
+			return (this == COL_AGG || this == COL_AGG_T);
+		}
+	}
+	
+	protected final RowType _type;
+	protected final int _reqVectMem;
+	
+	/**
+	 * Creates a rowwise operator.
+	 * 
+	 * @param type aggregation type, determines the output dimensions
+	 * @param reqVectMem first argument of the thread-local memory setup in
+	 *        LibSpoofPrimitives (presumably the number of vector intermediates)
+	 */
+	public SpoofRowwise(RowType type, int reqVectMem) {
+		_type = type;
+		_reqVectMem = reqVectMem;
+	}
+	
+	public RowType getRowType() {
+		return _type;
+	}
+
+	@Override
+	public String getSpoofType() {
+		return "RA" +  getClass().getName().split("\\.")[1];
+	}
+	
+	/**
+	 * Single-threaded execution over all rows of the first input.
+	 * 
+	 * @param inputs input matrices, the first one is scanned row by row
+	 * @param scalarObjects scalar inputs
+	 * @param out output matrix, reset according to the row type and allocated dense
+	 * @throws DMLRuntimeException if the execution fails
+	 */
+	@Override
+	public void execute(ArrayList<MatrixBlock> inputs, ArrayList<ScalarObject> scalarObjects, MatrixBlock out)	
+		throws DMLRuntimeException
+	{
+		//sanity check
+		if( inputs==null || inputs.size() < 1 || out==null )
+			throw new RuntimeException("Invalid input arguments.");
+		
+		//result allocation and preparations
+		final int m = inputs.get(0).getNumRows();
+		final int n = inputs.get(0).getNumColumns();
+		allocateOutputMatrix(m, n, out);
+		double[] c = out.getDenseBlock();
+		
+		//input preparation
+		double[][] b = prepInputMatrices(inputs);
+		double[] scalars = prepInputScalars(scalarObjects);
+		
+		//core sequential execute
+		executeRows(inputs.get(0), b, scalars, c, n, 0, m);
+		
+		//post-processing
+		out.recomputeNonZeros();
+		out.examSparsity();
+	}
+	
+	/**
+	 * Multi-threaded execution over disjoint row partitions of the first
+	 * input. Falls back to the single-threaded execution for k<=1 or if the
+	 * first input has less than PAR_NUMCELL_THRESHOLD cells.
+	 * 
+	 * @param inputs input matrices, the first one is scanned row by row
+	 * @param scalarObjects scalar inputs
+	 * @param out output matrix, reset according to the row type and allocated dense
+	 * @param k size of the thread pool
+	 * @throws DMLRuntimeException if the execution fails
+	 */
+	@Override
+	public void execute(ArrayList<MatrixBlock> inputs, ArrayList<ScalarObject> scalarObjects, MatrixBlock out, int k)	
+		throws DMLRuntimeException
+	{
+		//sanity check (before the first access to the inputs)
+		if( inputs==null || inputs.size() < 1 || out==null )
+			throw new RuntimeException("Invalid input arguments.");
+		
+		//redirect to serial execution
+		if( k <= 1 || (long)inputs.get(0).getNumRows()*inputs.get(0).getNumColumns()<PAR_NUMCELL_THRESHOLD ) {
+			execute(inputs, scalarObjects, out);
+			return;
+		}
+		
+		//result allocation and preparations
+		final int m = inputs.get(0).getNumRows();
+		final int n = inputs.get(0).getNumColumns();
+		allocateOutputMatrix(m, n, out);
+		
+		//input preparation
+		double[][] b = prepInputMatrices(inputs);
+		double[] scalars = prepInputScalars(scalarObjects);
+		
+		//core parallel execute
+		int nk = UtilFunctions.roundToNext(Math.min(8*k,m/32), k);
+		int blklen = (int)(Math.ceil((double)m/nk));
+		ExecutorService pool = Executors.newFixedThreadPool( k );
+		try
+		{
+			if( _type.isColumnAgg() ) {
+				//execute tasks
+				ArrayList<ParColAggTask> tasks = new ArrayList<ParColAggTask>();
+				for( int i=0; i<nk && i*blklen<m; i++ )
+					tasks.add(new ParColAggTask(inputs.get(0), b, scalars, n, i*blklen, Math.min((i+1)*blklen, m)));
+				List<Future<double[]>> taskret = pool.invokeAll(tasks);	
+				//aggregate partial results
+				for( Future<double[]> task : taskret )
+					LibMatrixMult.vectAdd(task.get(), out.getDenseBlock(), 0, 0, n);
+				out.recomputeNonZeros();
+			}
+			else {
+				//execute tasks
+				ArrayList<ParExecTask> tasks = new ArrayList<ParExecTask>();
+				for( int i=0; i<nk && i*blklen<m; i++ )
+					tasks.add(new ParExecTask(inputs.get(0), b, out, scalars, n, i*blklen, Math.min((i+1)*blklen, m)));
+				List<Future<Long>> taskret = pool.invokeAll(tasks);
+				//aggregate nnz, no need to aggregate results
+				long nnz = 0;
+				for( Future<Long> task : taskret )
+					nnz += task.get();
+				out.setNonZeros(nnz);
+			}
+			out.examSparsity();
+		}
+		catch(Exception ex) {
+			throw new DMLRuntimeException(ex);
+		}
+		finally {
+			//release the worker threads on success and on failure
+			pool.shutdown();
+		}
+	}
+	
+	private void allocateOutputMatrix(int m, int n, MatrixBlock out) {
+		switch( _type ) {
+			case NO_AGG: out.reset(m, n, false); break;
+			case ROW_AGG: out.reset(m, 1, false); break;
+			case COL_AGG: out.reset(1, n, false); break;
+			case COL_AGG_T: out.reset(n, 1, false); break;
+		}
+		out.allocateDenseBlock();
+	}
+	
+	/**
+	 * Runs the rows [rl,ru) of the main input in the calling thread. The
+	 * thread-local vector memory is set up before the scan and always
+	 * cleaned up afterwards, even if a generated row method fails.
+	 */
+	private void executeRows(MatrixBlock a, double[][] b, double[] scalars, double[] c, int n, int rl, int ru)
+		throws DMLRuntimeException
+	{
+		LibSpoofPrimitives.setupThreadLocalMemory(_reqVectMem, n);
+		try {
+			if( !a.isInSparseFormat() )
+				executeDense(a.getDenseBlock(), b, scalars, c, n, rl, ru);
+			else
+				executeSparse(a.getSparseBlock(), b, scalars, c, n, rl, ru);
+		}
+		finally {
+			LibSpoofPrimitives.cleanupThreadLocalMemory();
+		}
+	}
+	
+	private void executeDense(double[] a, double[][] b, double[] scalars, double[] c, int n, int rl, int ru) 
+	{
+		if( a == null )
+			return;
+		
+		for( int i=rl, aix=rl*n; i<ru; i++, aix+=n ) {
+			//call generated method
+			genexecRowDense( a, aix, b, scalars, c, n, i );
+		}
+	}
+	
+	private void executeSparse(SparseBlock sblock, double[][] b, double[] scalars, double[] c, int n, int rl, int ru) 
+	{
+		if( sblock == null )
+			return;
+			
+		for( int i=rl; i<ru; i++ ) {
+			if( !sblock.isEmpty(i) ) {
+				double[] avals = sblock.values(i);
+				int[] aix = sblock.indexes(i);
+				int apos = sblock.pos(i);
+				int alen = sblock.size(i);
+				
+				//call generated method
+				genexecRowSparse(avals, aix, apos, b, scalars, c, alen, i);
+			}
+		}
+	}
+	
+	//methods to be implemented by generated operators of type SpoofRowwise
+	
+	protected abstract void genexecRowDense( double[] a, int ai, double[][] b, double[] scalars, double[] c, int len, int rowIndex );
+	
+	protected abstract void genexecRowSparse( double[] avals, int[] aix, int ai, double[][] b, double[] scalars, double[] c, int len, int rowIndex );
+
+	
+	/**
+	 * Task for multi-threaded column aggregation operations.
+	 */
+	private class ParColAggTask implements Callable<double[]> 
+	{
+		private final MatrixBlock _a;
+		private final double[][] _b;
+		private final double[] _scalars;
+		private final int _clen;
+		private final int _rl;
+		private final int _ru;
+
+		protected ParColAggTask( MatrixBlock a, double[][] b, double[] scalars, int clen, int rl, int ru ) {
+			_a = a;
+			_b = b;
+			_scalars = scalars;
+			_clen = clen;
+			_rl = rl;
+			_ru = ru;
+		}
+		
+		@Override
+		public double[] call() throws DMLRuntimeException {
+			//compute the partial output of this row partition
+			double[] c = new double[_clen];
+			executeRows(_a, _b, _scalars, c, _clen, _rl, _ru);
+			return c;
+		}
+	}
+	
+	/**
+	 * Task for multi-threaded execution with no or row aggregation.
+	 */
+	private class ParExecTask implements Callable<Long> 
+	{
+		private final MatrixBlock _a;
+		private final double[][] _b;
+		private final MatrixBlock _c;
+		private final double[] _scalars;
+		private final int _clen;
+		private final int _rl;
+		private final int _ru;
+
+		protected ParExecTask( MatrixBlock a, double[][] b, MatrixBlock c, double[] scalars, int clen, int rl, int ru ) {
+			_a = a;
+			_b = b;
+			_c = c;
+			_scalars = scalars;
+			_clen = clen;
+			_rl = rl;
+			_ru = ru;
+		}
+		
+		@Override
+		public Long call() throws DMLRuntimeException {
+			//write the rows of this partition directly into the shared output
+			executeRows(_a, _b, _scalars, _c.getDenseBlock(), _clen, _rl, _ru);
+			
+			//maintain nnz for row partition
+			return _c.recomputeNonZeros(_rl, _ru-1, 0, _c.getNumColumns()-1);
+		}
+	}
+}