You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@systemml.apache.org by mb...@apache.org on 2017/06/30 06:13:28 UTC

[2/2] systemml git commit: [SYSTEMML-1535] Codegen matrix-matrix multiplication support

[SYSTEMML-1535] Codegen matrix-matrix multiplication support

This patch generalizes the row-wise code generation template from
matrix-vector to matrix-matrix multiplications, which enables a broad
range of additional fusion opportunities. Examples are Mlogreg and
KMeans with multiple classes or centroids, respectively. The fusion of
matrix-matrix multiplications avoids unnecessary scans of X as well as
large intermediates of size nrow(X) x K. 

In a KMeans scenario with 1 run, 20 iterations, a 100M x 10 dense input,
and 5 centroids, this change improved the end-to-end runtime from
852s (1360s w/o codegen) to 463s. The major additional benefits come
from fusing (1) -2 * (X %*% t(C)) + t(rowSums(C ^ 2)), and (2) (t(P)
%*% X), which avoids the two large intermediates X %*% t(C) and t(P).

Furthermore, this patch also lays the foundations for more complex DAGs
with different vector sizes in row-wise templates.


Project: http://git-wip-us.apache.org/repos/asf/systemml/repo
Commit: http://git-wip-us.apache.org/repos/asf/systemml/commit/6b25b3bf
Tree: http://git-wip-us.apache.org/repos/asf/systemml/tree/6b25b3bf
Diff: http://git-wip-us.apache.org/repos/asf/systemml/diff/6b25b3bf

Branch: refs/heads/master
Commit: 6b25b3bf2621f13d97c6a3bf3a66a333af834db7
Parents: 6a4aa1d
Author: Matthias Boehm <mb...@gmail.com>
Authored: Thu Jun 29 22:38:03 2017 -0700
Committer: Matthias Boehm <mb...@gmail.com>
Committed: Thu Jun 29 23:14:05 2017 -0700

----------------------------------------------------------------------
 .../sysml/hops/codegen/SpoofCompiler.java       |   2 +-
 .../apache/sysml/hops/codegen/SpoofFusedOp.java |  15 ++-
 .../apache/sysml/hops/codegen/cplan/CNode.java  |  10 ++
 .../sysml/hops/codegen/cplan/CNodeBinary.java   |  79 +++++++++---
 .../hops/codegen/cplan/CNodeOuterProduct.java   |   6 +-
 .../sysml/hops/codegen/cplan/CNodeRow.java      |  58 +++++----
 .../sysml/hops/codegen/cplan/CNodeUnary.java    |   4 +
 .../hops/codegen/template/TemplateRow.java      |  84 +++++++++---
 .../hops/codegen/template/TemplateUtils.java    | 127 +++++++------------
 .../runtime/codegen/LibSpoofPrimitives.java     |  83 ++++++++++--
 .../sysml/runtime/codegen/SpoofCellwise.java    |   4 +-
 .../runtime/codegen/SpoofMultiAggregate.java    |   2 +-
 .../sysml/runtime/codegen/SpoofOperator.java    | 105 +++++++--------
 .../runtime/codegen/SpoofOuterProduct.java      |  16 +--
 .../sysml/runtime/codegen/SpoofRowwise.java     |  77 ++++++-----
 .../instructions/spark/SpoofSPInstruction.java  |   3 +-
 .../spark/data/PartitionedBroadcast.java        |   8 ++
 .../runtime/matrix/data/LibMatrixMult.java      |  33 ++---
 .../functions/codegen/RowAggTmplTest.java       |  66 +++++++++-
 .../scripts/functions/codegen/rowAggPattern24.R |  33 +++++
 .../functions/codegen/rowAggPattern24.dml       |  30 +++++
 .../scripts/functions/codegen/rowAggPattern25.R |  32 +++++
 .../functions/codegen/rowAggPattern25.dml       |  29 +++++
 .../scripts/functions/codegen/rowAggPattern26.R |  32 +++++
 .../functions/codegen/rowAggPattern26.dml       |  28 ++++
 .../scripts/functions/codegen/rowAggPattern27.R |  32 +++++
 .../functions/codegen/rowAggPattern27.dml       |  29 +++++
 27 files changed, 747 insertions(+), 280 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/systemml/blob/6b25b3bf/src/main/java/org/apache/sysml/hops/codegen/SpoofCompiler.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/hops/codegen/SpoofCompiler.java b/src/main/java/org/apache/sysml/hops/codegen/SpoofCompiler.java
index fede282..5342c09 100644
--- a/src/main/java/org/apache/sysml/hops/codegen/SpoofCompiler.java
+++ b/src/main/java/org/apache/sysml/hops/codegen/SpoofCompiler.java
@@ -96,7 +96,7 @@ import org.apache.sysml.runtime.matrix.data.Pair;
 import org.apache.sysml.utils.Explain;
 import org.apache.sysml.utils.Statistics;
 
-public class SpoofCompiler 
+public class SpoofCompiler
 {
 	private static final Log LOG = LogFactory.getLog(SpoofCompiler.class.getName());
 	

http://git-wip-us.apache.org/repos/asf/systemml/blob/6b25b3bf/src/main/java/org/apache/sysml/hops/codegen/SpoofFusedOp.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/hops/codegen/SpoofFusedOp.java b/src/main/java/org/apache/sysml/hops/codegen/SpoofFusedOp.java
index 06be99b..0d4b8db 100644
--- a/src/main/java/org/apache/sysml/hops/codegen/SpoofFusedOp.java
+++ b/src/main/java/org/apache/sysml/hops/codegen/SpoofFusedOp.java
@@ -44,8 +44,9 @@ public class SpoofFusedOp extends Hop implements MultiThreadedHop
 		COLUMN_DIMS_COLS,
 		SCALAR,
 		MULTI_SCALAR,
-		ROW_RANK_DIMS, // right wdivmm 
-		COLUMN_RANK_DIMS  // left wdivmm
+		ROW_RANK_DIMS, // right wdivmm, row mm
+		COLUMN_RANK_DIMS,  // left wdivmm, row mm
+		COLUMN_RANK_DIMS_T;
 	}
 	
 	private Class<?> _class = null;
@@ -182,6 +183,12 @@ public class SpoofFusedOp extends Hop implements MultiThreadedHop
 						ret = new long[]{mc.getCols(), mc2.getCols(), -1};
 					break;
 				}
+				case COLUMN_RANK_DIMS_T: {
+					MatrixCharacteristics mc2 = memo.getAllInputStats(getInput().get(1));
+					if( mc2.dimsKnown() )
+						ret = new long[]{mc2.getCols(), mc.getCols(), -1};
+					break;
+				}
 				default:
 					throw new RuntimeException("Failed to infer worst-case size information "
 							+ "for type: "+_dimsType.toString());
@@ -231,6 +238,10 @@ public class SpoofFusedOp extends Hop implements MultiThreadedHop
 				setDim1(getInput().get(0).getDim2());
 				setDim2(getInput().get(1).getDim2());
 				break;
+			case COLUMN_RANK_DIMS_T:
+				setDim1(getInput().get(1).getDim2());
+				setDim2(getInput().get(0).getDim2());
+				break;	
 			default:
 				throw new RuntimeException("Failed to refresh size information "
 						+ "for type: "+_dimsType.toString());

http://git-wip-us.apache.org/repos/asf/systemml/blob/6b25b3bf/src/main/java/org/apache/sysml/hops/codegen/cplan/CNode.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/hops/codegen/cplan/CNode.java b/src/main/java/org/apache/sysml/hops/codegen/cplan/CNode.java
index efe468e..1f91697 100644
--- a/src/main/java/org/apache/sysml/hops/codegen/cplan/CNode.java
+++ b/src/main/java/org/apache/sysml/hops/codegen/cplan/CNode.java
@@ -83,6 +83,16 @@ public abstract class CNode
 		return _genVar;
 	}
 	
+	public String getVectorLength() {
+		if( getVarname().startsWith("a") )
+			return "len";
+		else if( getVarname().startsWith("b") )
+			return getVarname()+".clen";
+		else if( _dataType==DataType.MATRIX )
+			return getVarname()+".length";
+		return "";
+	}
+	
 	public String getClassname() {
 		return getVarname();
 	}

http://git-wip-us.apache.org/repos/asf/systemml/blob/6b25b3bf/src/main/java/org/apache/sysml/hops/codegen/cplan/CNodeBinary.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/hops/codegen/cplan/CNodeBinary.java b/src/main/java/org/apache/sysml/hops/codegen/cplan/CNodeBinary.java
index 8d67f26..4bbf205 100644
--- a/src/main/java/org/apache/sysml/hops/codegen/cplan/CNodeBinary.java
+++ b/src/main/java/org/apache/sysml/hops/codegen/cplan/CNodeBinary.java
@@ -28,7 +28,8 @@ import org.apache.sysml.runtime.util.UtilFunctions;
 public class CNodeBinary extends CNode
 {
 	public enum BinType {
-		DOT_PRODUCT,
+		//matrix multiplication operations
+		DOT_PRODUCT, VECT_MATRIXMULT, VECT_OUTERMULT_ADD,
 		//vector-scalar-add operations
 		VECT_MULT_ADD, VECT_DIV_ADD, VECT_MINUS_ADD, VECT_PLUS_ADD,
 		VECT_POW_ADD, VECT_MIN_ADD, VECT_MAX_ADD,
@@ -71,6 +72,12 @@ public class CNodeBinary extends CNode
 				case DOT_PRODUCT:   
 					return sparse ? "    double %TMP% = LibSpoofPrimitives.dotProduct(%IN1v%, %IN2%, %IN1i%, %POS1%, %POS2%, alen);\n" :
 									"    double %TMP% = LibSpoofPrimitives.dotProduct(%IN1%, %IN2%, %POS1%, %POS2%, %LEN%);\n";
+				case VECT_MATRIXMULT:   
+					return sparse ? "    double[] %TMP% = LibSpoofPrimitives.vectMatrixMult(%IN1v%, %IN2%, %IN1i%, %POS1%, %POS2%, alen, len);\n" :
+									"    double[] %TMP% = LibSpoofPrimitives.vectMatrixMult(%IN1%, %IN2%, %POS1%, %POS2%, %LEN%);\n";
+				case VECT_OUTERMULT_ADD:   
+					return sparse ? "    LibSpoofPrimitives.vectOuterMultAdd(%IN1v%, %IN2%, %OUT%, %IN1i%, %POS1%, %POS2%, %POSOUT%, alen, %LEN1%, %LEN2%);\n" :
+									"    LibSpoofPrimitives.vectOuterMultAdd(%IN1%, %IN2%, %OUT%, %POS1%, %POS2%, %POSOUT%, %LEN1%, %LEN2%);\n";
 				
 				//vector-scalar-add operations
 				case VECT_MULT_ADD:
@@ -88,10 +95,10 @@ public class CNodeBinary extends CNode
 				case VECT_GREATEREQUAL_ADD: {
 					String vectName = getVectorPrimitiveName();
 					if( scalarVector )
-						return sparse ? "    LibSpoofPrimitives.vect"+vectName+"Add(%IN1%, %IN2v%, %OUT%, %IN2i%, %POS2%, %POSOUT%, alen, len);\n" : 
+						return sparse ? "    LibSpoofPrimitives.vect"+vectName+"Add(%IN1%, %IN2v%, %OUT%, %IN2i%, %POS2%, %POSOUT%, alen, %LEN%);\n" : 
 										"    LibSpoofPrimitives.vect"+vectName+"Add(%IN1%, %IN2%, %OUT%, %POS2%, %POSOUT%, %LEN%);\n";
 					else	
-						return sparse ? "    LibSpoofPrimitives.vect"+vectName+"Add(%IN1v%, %IN2%, %OUT%, %IN1i%, %POS1%, %POSOUT%, alen, len);\n" : 
+						return sparse ? "    LibSpoofPrimitives.vect"+vectName+"Add(%IN1v%, %IN2%, %OUT%, %IN1i%, %POS1%, %POSOUT%, alen, %LEN%);\n" : 
 										"    LibSpoofPrimitives.vect"+vectName+"Add(%IN1%, %IN2%, %OUT%, %POS1%, %POSOUT%, %LEN%);\n";
 				}
 				
@@ -111,10 +118,10 @@ public class CNodeBinary extends CNode
 				case VECT_GREATEREQUAL_SCALAR: {
 					String vectName = getVectorPrimitiveName();
 					if( scalarVector )
-						return sparse ? "    double[] %TMP% = LibSpoofPrimitives.vect"+vectName+"Write(%IN1%, %IN2v%, %IN2i%, %POS2%, alen, len);\n" : 
+						return sparse ? "    double[] %TMP% = LibSpoofPrimitives.vect"+vectName+"Write(%IN1%, %IN2v%, %IN2i%, %POS2%, alen, %LEN%);\n" : 
 										"    double[] %TMP% = LibSpoofPrimitives.vect"+vectName+"Write(%IN1%, %IN2%, %POS2%, %LEN%);\n";
 					else	
-						return sparse ? "    double[] %TMP% = LibSpoofPrimitives.vect"+vectName+"Write(%IN1v%, %IN2%, %IN1i%, %POS1%, alen, len);\n" : 
+						return sparse ? "    double[] %TMP% = LibSpoofPrimitives.vect"+vectName+"Write(%IN1v%, %IN2%, %IN1i%, %POS1%, alen, %LEN%);\n" : 
 										"    double[] %TMP% = LibSpoofPrimitives.vect"+vectName+"Write(%IN1%, %IN2%, %POS1%, %LEN%);\n";
 				}
 				
@@ -133,7 +140,7 @@ public class CNodeBinary extends CNode
 				case VECT_GREATEREQUAL: {
 					String vectName = getVectorPrimitiveName();
 					return sparse ? 
-						"    double[] %TMP% = LibSpoofPrimitives.vect"+vectName+"Write(%IN1v%, %IN2%, %IN1i%, %POS1%, %POS2%, alen, len);\n" : 
+						"    double[] %TMP% = LibSpoofPrimitives.vect"+vectName+"Write(%IN1v%, %IN2%, %IN1i%, %POS1%, %POS2%, alen, %LEN%);\n" : 
 						"    double[] %TMP% = LibSpoofPrimitives.vect"+vectName+"Write(%IN1%, %IN2%, %POS1%, %POS2%, %LEN%);\n";
 				}
 				
@@ -185,7 +192,8 @@ public class CNodeBinary extends CNode
 		}
 		public boolean isVectorPrimitive() {
 			return isVectorScalarPrimitive() 
-				|| isVectorVectorPrimitive();
+				|| isVectorVectorPrimitive()
+				|| isVectorMatrixPrimitive();
 		}
 		public boolean isVectorScalarPrimitive() {
 			return this == VECT_DIV_SCALAR || this == VECT_MULT_SCALAR 
@@ -204,6 +212,10 @@ public class CNodeBinary extends CNode
 				|| this == VECT_LESS || this == VECT_LESSEQUAL
 				|| this == VECT_GREATER || this == VECT_GREATEREQUAL;
 		}
+		public boolean isVectorMatrixPrimitive() {
+			return this == VECT_MATRIXMULT
+				|| this == VECT_OUTERMULT_ADD;
+		}
 		public BinType getVectorAddPrimitive() {
 			return BinType.valueOf("VECT_"+getVectorPrimitiveName().toUpperCase()+"_ADD");
 		}
@@ -257,19 +269,32 @@ public class CNodeBinary extends CNode
 		tmp = tmp.replace("%TMP%", var);
 		
 		//replace input references and start indexes
-		for( int j=1; j<=2; j++ ) {
-			String varj = _inputs.get(j-1).getVarname();
+		for( int j=0; j<2; j++ ) {
+			String varj = _inputs.get(j).getVarname();
 			
 			//replace sparse and dense inputs
-			tmp = tmp.replace("%IN"+j+"v%", varj+"vals");
-			tmp = tmp.replace("%IN"+j+"i%", varj+"ix");
-			tmp = tmp.replace("%IN"+j+"%", varj );
+			tmp = tmp.replace("%IN"+(j+1)+"v%", varj+"vals");
+			tmp = tmp.replace("%IN"+(j+1)+"i%", varj+"ix");
+			tmp = tmp.replace("%IN"+(j+1)+"%", 
+				varj.startsWith("b") ? varj + ".ddat" : varj );
 			
 			//replace start position of main input
-			tmp = tmp.replace("%POS"+j+"%", (_inputs.get(j-1) instanceof CNodeData 
-				&& _inputs.get(j-1).getDataType().isMatrix()) ? (!varj.startsWith("b")) ? 
-				varj+"i" : TemplateUtils.isMatrix(_inputs.get(j-1)) ? "rowIndex*len" : "0" : "0");
+			tmp = tmp.replace("%POS"+(j+1)+"%", (_inputs.get(j) instanceof CNodeData 
+				&& _inputs.get(j).getDataType().isMatrix()) ? (!varj.startsWith("b")) ? varj+"i" : 
+				(TemplateUtils.isMatrix(_inputs.get(j)) && _type!=BinType.VECT_MATRIXMULT) ? 
+				"rowIndex*"+((_type==BinType.VECT_OUTERMULT_ADD)?"%LEN"+(j+1)+"%":"%LEN%") : "0" : "0");
+		}
+		//replace length information (e.g., after matrix mult)
+		if( _type == BinType.VECT_OUTERMULT_ADD ) {
+			for( int j=0; j<2; j++ )
+				tmp = tmp.replace("%LEN"+(j+1)+"%", _inputs.get(j).getVectorLength());
+		}
+		else { //general case 
+			CNode mInput = getIntermediateInputVector();
+			if( mInput != null )
+				tmp = tmp.replace("%LEN%", mInput.getVectorLength());
 		}
+		
 		sb.append(tmp);
 		
 		//mark as generated
@@ -278,10 +303,19 @@ public class CNodeBinary extends CNode
 		return sb.toString();
 	}
 	
+	private CNode getIntermediateInputVector() {
+		for( int i=0; i<2; i++ )
+			if( getInput().get(i).getDataType().isMatrix() )
+				return getInput().get(i);
+		return null;
+	} 
+	
 	@Override
 	public String toString() {
 		switch(_type) {
 			case DOT_PRODUCT:              return "b(dot)";
+			case VECT_MATRIXMULT:          return "b(vmm)";
+			case VECT_OUTERMULT_ADD:       return "b(voma)";
 			case VECT_MULT_ADD:            return "b(vma)";
 			case VECT_DIV_ADD:             return "b(vda)";
 			case VECT_MINUS_ADD:           return "b(vmia)";
@@ -362,7 +396,13 @@ public class CNodeBinary extends CNode
 				boolean vectorScalar = _inputs.get(1).getDataType()==DataType.SCALAR;
 				_rows = _inputs.get(vectorScalar ? 0 : 1)._rows;
 				_cols = _inputs.get(vectorScalar ? 0 : 1)._cols;
-				_dataType= DataType.MATRIX;
+				_dataType = DataType.MATRIX;
+				break;
+			
+			case VECT_OUTERMULT_ADD:
+				_rows = _inputs.get(0)._cols;
+				_cols = _inputs.get(1)._cols;
+				_dataType = DataType.MATRIX;
 				break;
 				
 			case VECT_DIV_SCALAR: 	
@@ -396,8 +436,13 @@ public class CNodeBinary extends CNode
 				_cols = _inputs.get(scalarVector ? 1 : 0)._cols;
 				_dataType= DataType.MATRIX;
 				break;
+			
+			case VECT_MATRIXMULT:
+				_rows = _inputs.get(0)._rows;
+				_cols = _inputs.get(1)._cols;
+				_dataType = DataType.MATRIX;
+				break;
 				
-		
 			case DOT_PRODUCT: 
 			
 			//SCALAR Arithmetic

http://git-wip-us.apache.org/repos/asf/systemml/blob/6b25b3bf/src/main/java/org/apache/sysml/hops/codegen/cplan/CNodeOuterProduct.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/hops/codegen/cplan/CNodeOuterProduct.java b/src/main/java/org/apache/sysml/hops/codegen/cplan/CNodeOuterProduct.java
index d6a1d34..01ca08e 100644
--- a/src/main/java/org/apache/sysml/hops/codegen/cplan/CNodeOuterProduct.java
+++ b/src/main/java/org/apache/sysml/hops/codegen/cplan/CNodeOuterProduct.java
@@ -39,10 +39,10 @@ public class CNodeOuterProduct extends CNodeTpl
 			+ "  public %TMP%() {\n"
 			+ "    _outerProductType = OutProdType.%TYPE%;\n"
 			+ "  }\n"
-			+ "  protected void genexecDense(double a, double[] a1, int a1i, double[] a2, int a2i, double[][] b, double[] scalars, double[] c, int ci, int m, int n, int k, int rowIndex, int colIndex) { \n"
+			+ "  protected void genexecDense(double a, double[] a1, int a1i, double[] a2, int a2i, double[][] b, double[] scalars, double[] c, int ci, int m, int n, int len, int rowIndex, int colIndex) { \n"
 			+ "%BODY_dense%"
 			+ "  }\n"
-			+ "  protected double genexecCellwise(double a, double[] a1, int a1i, double[] a2, int a2i, double[][] b, double[] scalars, int m, int n, int k, int rowIndex, int colIndex) { \n"
+			+ "  protected double genexecCellwise(double a, double[] a1, int a1i, double[] a2, int a2i, double[][] b, double[] scalars, int m, int n, int len, int rowIndex, int colIndex) { \n"
 			+ "%BODY_cellwise%"
 			+ "    return %OUT_cellwise%;\n"
 			+ "  }\n"			
@@ -86,7 +86,7 @@ public class CNodeOuterProduct extends CNodeTpl
 			tmp = tmp.replace("%OUT_cellwise%", getCurrentVarName());
 		}
 		//replace size information
-		tmp = tmp.replace("%LEN%", "k");
+		tmp = tmp.replace("%LEN%", "len");
 		
 		tmp = tmp.replace("%POSOUT%", "ci");
 		

http://git-wip-us.apache.org/repos/asf/systemml/blob/6b25b3bf/src/main/java/org/apache/sysml/hops/codegen/cplan/CNodeRow.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/hops/codegen/cplan/CNodeRow.java b/src/main/java/org/apache/sysml/hops/codegen/cplan/CNodeRow.java
index 7cba5f7..b74b79d 100644
--- a/src/main/java/org/apache/sysml/hops/codegen/cplan/CNodeRow.java
+++ b/src/main/java/org/apache/sysml/hops/codegen/cplan/CNodeRow.java
@@ -22,6 +22,7 @@ package org.apache.sysml.hops.codegen.cplan;
 import java.util.ArrayList;
 
 import org.apache.sysml.hops.codegen.SpoofFusedOp.SpoofOutputDimsType;
+import org.apache.sysml.hops.codegen.cplan.CNodeBinary.BinType;
 import org.apache.sysml.hops.codegen.cplan.CNodeUnary.UnaryType;
 import org.apache.sysml.hops.codegen.template.TemplateUtils;
 import org.apache.sysml.runtime.codegen.SpoofRowwise.RowType;
@@ -32,25 +33,26 @@ public class CNodeRow extends CNodeTpl
 	private static final String TEMPLATE = 
 			  "package codegen;\n"
 			+ "import org.apache.sysml.runtime.codegen.LibSpoofPrimitives;\n"
+			+ "import org.apache.sysml.runtime.codegen.SpoofOperator.SideInput;\n"
 			+ "import org.apache.sysml.runtime.codegen.SpoofRowwise;\n"
 			+ "import org.apache.sysml.runtime.codegen.SpoofRowwise.RowType;\n"
 			+ "import org.apache.commons.math3.util.FastMath;\n"
 			+ "\n"
 			+ "public final class %TMP% extends SpoofRowwise { \n"
 			+ "  public %TMP%() {\n"
-			+ "    super(RowType.%TYPE%, %CBIND0%, %VECT_MEM%);\n"
+			+ "    super(RowType.%TYPE%, %CBIND0%, %TB1%, %VECT_MEM%);\n"
 			+ "  }\n"
-			+ "  protected void genexec(double[] a, int ai, double[][] b, double[] scalars, double[] c, int len, int rowIndex) { \n"
+			+ "  protected void genexec(double[] a, int ai, SideInput[] b, double[] scalars, double[] c, int len, int rowIndex) { \n"
 			+ "%BODY_dense%"
 			+ "  }\n"
-			+ "  protected void genexec(double[] avals, int[] aix, int ai, double[][] b, double[] scalars, double[] c, int alen, int len, int rowIndex) { \n"
+			+ "  protected void genexec(double[] avals, int[] aix, int ai, SideInput[] b, double[] scalars, double[] c, int alen, int len, int rowIndex) { \n"
 			+ "%BODY_sparse%"
 			+ "  }\n"			
 			+ "}\n";
 
 	private static final String TEMPLATE_ROWAGG_OUT  = "    c[rowIndex] = %IN%;\n";
 	private static final String TEMPLATE_FULLAGG_OUT = "    c[0] += %IN%;\n";
-	private static final String TEMPLATE_NOAGG_OUT   = "    LibSpoofPrimitives.vectWrite(%IN%, c, rowIndex*len, len);\n";
+	private static final String TEMPLATE_NOAGG_OUT   = "    LibSpoofPrimitives.vectWrite(%IN%, c, rowIndex*%LEN%, %LEN%);\n";
 	
 	public CNodeRow(ArrayList<CNode> inputs, CNode output ) {
 		super(inputs, output);
@@ -59,14 +61,6 @@ public class CNodeRow extends CNodeTpl
 	private RowType _type = null; //access pattern 
 	private int _numVectors = -1; //number of intermediate vectors
 	
-	public void setNumVectorIntermediates(int num) {
-		_numVectors = num;
-	}
-	
-	public int getNumVectorIntermediates() {
-		return _numVectors;
-	}
-	
 	public void setRowType(RowType type) {
 		_type = type;
 		_hash = 0;
@@ -76,6 +70,15 @@ public class CNodeRow extends CNodeTpl
 		return _type;
 	}
 	
+	public void setNumVectorIntermediates(int num) {
+		_numVectors = num;
+		_hash = 0;
+	}
+	
+	public int getNumVectorIntermediates() {
+		return _numVectors;
+	}
+	
 	@Override
 	public void renameInputs() {
 		rRenameDataNode(_output, _inputs.get(0), "a"); // input matrix
@@ -108,18 +111,26 @@ public class CNodeRow extends CNodeTpl
 		tmp = tmp.replace("%TYPE%", _type.name());
 		tmp = tmp.replace("%CBIND0%", String.valueOf(
 			TemplateUtils.isUnary(_output, UnaryType.CBIND0)));
+		tmp = tmp.replace("%TB1%", String.valueOf(
+			TemplateUtils.containsBinary(_output, BinType.VECT_MATRIXMULT)));
 		tmp = tmp.replace("%VECT_MEM%", String.valueOf(_numVectors));
 		
 		return tmp;
 	}
 	
 	private String getOutputStatement(String varName) {
-		if( !_type.isColumnAgg() ) {
-			String tmp = (_type==RowType.NO_AGG) ? TEMPLATE_NOAGG_OUT : 
-				(_type==RowType.FULL_AGG) ? TEMPLATE_FULLAGG_OUT : TEMPLATE_ROWAGG_OUT;
-			return tmp.replace("%IN%", varName);
+		switch( _type ) {
+			case NO_AGG:
+			case NO_AGG_B1:
+				return TEMPLATE_NOAGG_OUT.replace("%IN%", varName)
+					.replace("%LEN%", _output.getVarname()+".length");
+			case FULL_AGG:
+				return TEMPLATE_FULLAGG_OUT.replace("%IN%", varName);
+			case ROW_AGG:
+				return TEMPLATE_ROWAGG_OUT.replace("%IN%", varName);
+			default:
+				return ""; //_type.isColumnAgg()
 		}
-		return "";
 	}
 
 	@Override
@@ -131,12 +142,15 @@ public class CNodeRow extends CNodeTpl
 	@Override
 	public SpoofOutputDimsType getOutputDimType() {
 		switch( _type ) {
-			case NO_AGG: return SpoofOutputDimsType.INPUT_DIMS;
-			case FULL_AGG: return SpoofOutputDimsType.SCALAR;
-			case ROW_AGG: return TemplateUtils.isUnary(_output, UnaryType.CBIND0) ?
-				SpoofOutputDimsType.ROW_DIMS2 : SpoofOutputDimsType.ROW_DIMS;
-			case COL_AGG: return SpoofOutputDimsType.COLUMN_DIMS_COLS; //row vector
+			case NO_AGG:    return SpoofOutputDimsType.INPUT_DIMS;
+			case NO_AGG_B1: return SpoofOutputDimsType.ROW_RANK_DIMS;
+			case FULL_AGG:  return SpoofOutputDimsType.SCALAR;
+			case ROW_AGG:   return TemplateUtils.isUnary(_output, UnaryType.CBIND0) ?
+						SpoofOutputDimsType.ROW_DIMS2 : SpoofOutputDimsType.ROW_DIMS;
+			case COL_AGG:   return SpoofOutputDimsType.COLUMN_DIMS_COLS; //row vector
 			case COL_AGG_T: return SpoofOutputDimsType.COLUMN_DIMS_ROWS; //column vector
+			case COL_AGG_B1:   return SpoofOutputDimsType.COLUMN_RANK_DIMS; 
+			case COL_AGG_B1_T: return SpoofOutputDimsType.COLUMN_RANK_DIMS_T; 
 			default:
 				throw new RuntimeException("Unsupported row type: "+_type.toString());
 		}

http://git-wip-us.apache.org/repos/asf/systemml/blob/6b25b3bf/src/main/java/org/apache/sysml/hops/codegen/cplan/CNodeUnary.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/hops/codegen/cplan/CNodeUnary.java b/src/main/java/org/apache/sysml/hops/codegen/cplan/CNodeUnary.java
index 500b309..85800b8 100644
--- a/src/main/java/org/apache/sysml/hops/codegen/cplan/CNodeUnary.java
+++ b/src/main/java/org/apache/sysml/hops/codegen/cplan/CNodeUnary.java
@@ -185,6 +185,10 @@ public class CNodeUnary extends CNode
 		tmp = tmp.replace("%POS1%", spos);
 		tmp = tmp.replace("%POS2%", spos);
 		
+		//replace length
+		if( _inputs.get(0).getDataType().isMatrix() )
+			tmp = tmp.replace("%LEN%", _inputs.get(0).getVectorLength());
+		
 		sb.append(tmp);
 		
 		//mark as generated

http://git-wip-us.apache.org/repos/asf/systemml/blob/6b25b3bf/src/main/java/org/apache/sysml/hops/codegen/template/TemplateRow.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/hops/codegen/template/TemplateRow.java b/src/main/java/org/apache/sysml/hops/codegen/template/TemplateRow.java
index 601d664..c0c8c4e 100644
--- a/src/main/java/org/apache/sysml/hops/codegen/template/TemplateRow.java
+++ b/src/main/java/org/apache/sysml/hops/codegen/template/TemplateRow.java
@@ -50,6 +50,7 @@ import org.apache.sysml.hops.Hop.Direction;
 import org.apache.sysml.hops.Hop.OpOp1;
 import org.apache.sysml.hops.Hop.OpOp2;
 import org.apache.sysml.parser.Expression.DataType;
+import org.apache.sysml.runtime.matrix.data.LibMatrixMult;
 import org.apache.sysml.runtime.matrix.data.Pair;
 
 public class TemplateRow extends TemplateBase 
@@ -73,8 +74,17 @@ public class TemplateRow extends TemplateBase
 	public boolean open(Hop hop) {
 		return (hop instanceof BinaryOp && hop.dimsKnown() && isValidBinaryOperation(hop)
 				&& hop.getInput().get(0).getDim1()>1 && hop.getInput().get(0).getDim2()>1)
-			|| (hop instanceof AggBinaryOp && hop.dimsKnown() && hop.getDim2()==1
+			|| (hop instanceof AggBinaryOp && hop.dimsKnown() && hop.getDim2()==1 //MV
 				&& hop.getInput().get(0).getDim1()>1 && hop.getInput().get(0).getDim2()>1)
+			|| (hop instanceof AggBinaryOp && hop.dimsKnown() && LibMatrixMult.isSkinnyRightHandSide(
+				hop.getInput().get(0).getDim1(), hop.getInput().get(0).getDim2(), //MM
+				hop.getInput().get(1).getDim1(), hop.getInput().get(1).getDim2())
+				&& hop.getInput().get(0).getDim1()>1 && hop.getInput().get(0).getDim2()>1
+				&& !HopRewriteUtils.isOuterProductLikeMM(hop))
+			|| (HopRewriteUtils.isTransposeOperation(hop) && hop.getParent().size()==1
+				&& hop.getParent().get(0) instanceof AggBinaryOp && hop.getParent().get(0).dimsKnown()
+				&& hop.getParent().get(0).getInput().indexOf(hop) == 0
+				&& isFuseSkinnyMatrixMult(hop.getParent().get(0)))
 			|| (hop instanceof AggUnaryOp && ((AggUnaryOp)hop).getDirection()!=Direction.RowCol 
 				&& hop.getInput().get(0).getDim1()>1 && hop.getInput().get(0).getDim2()>1
 				&& HopRewriteUtils.isAggUnaryOp(hop, SUPPORTED_ROW_AGG));
@@ -88,20 +98,24 @@ public class TemplateRow extends TemplateBase
 				&& input.getDim2()==1 && hop.getInput().get(1).getDim2()==1
 				&& HopRewriteUtils.isEmpty(hop.getInput().get(1)))
 			|| ((hop instanceof UnaryOp || hop instanceof ParameterizedBuiltinOp) 
-					&& TemplateCell.isValidOperation(hop))		
+					&& TemplateCell.isValidOperation(hop))
 			|| (hop instanceof AggUnaryOp && ((AggUnaryOp)hop).getDirection()!=Direction.RowCol
 				&& HopRewriteUtils.isAggUnaryOp(hop, SUPPORTED_ROW_AGG))
 			|| (hop instanceof AggUnaryOp && ((AggUnaryOp)hop).getDirection() == Direction.RowCol 
 				&& ((AggUnaryOp)hop).getOp() == AggOp.SUM )
-			|| (hop instanceof AggBinaryOp && hop.getDim1()>1 && hop.getDim2()==1
-				&& HopRewriteUtils.isTransposeOperation(hop.getInput().get(0))));
+			|| (hop instanceof AggBinaryOp && hop.getDim1()>1 && hop.getDim2()==1 //MV
+				&& HopRewriteUtils.isTransposeOperation(hop.getInput().get(0)))
+			|| (hop instanceof AggBinaryOp && hop.dimsKnown() && isFuseSkinnyMatrixMult(hop) //MM
+				&& HopRewriteUtils.isTransposeOperation(hop.getInput().get(0))
+				&& hop.getInput().get(0).getDim1()>1 && hop.getInput().get(0).getDim2()>1));
 	}
 
 	@Override
 	public boolean merge(Hop hop, Hop input) {
 		//merge rowagg tpl with cell tpl if input is a vector
 		return !isClosed() &&
-			((hop instanceof BinaryOp && isValidBinaryOperation(hop))
+			((hop instanceof BinaryOp && isValidBinaryOperation(hop)
+				&& hop.getDim1() > 1 && input.getDim1()>1) 
 			 ||(hop instanceof AggBinaryOp && input.getDim2()==1
 				&& HopRewriteUtils.isTransposeOperation(hop.getInput().get(0))));
 	}
@@ -117,12 +131,18 @@ public class TemplateRow extends TemplateBase
 			return CloseType.OPEN;
 	}
 	
-	private boolean isValidBinaryOperation(Hop hop) {
-		//exclude unsupported and matrix-rowvector ops
-		return TemplateUtils.isOperationSupported(hop)
-			&& (HopRewriteUtils.isBinaryMatrixScalarOperation(hop)
-			|| HopRewriteUtils.isBinaryMatrixColVectorOperation(hop)
-			|| HopRewriteUtils.isBinaryMatrixMatrixOperation(hop));
+	private static boolean isValidBinaryOperation(Hop hop) {
+		//support for matrix-scalar, matrix-col_vector,
+		//matrix-row_vector, and matrix-matrix
+		return TemplateUtils.isOperationSupported(hop);
+	}
+	
+	private static boolean isFuseSkinnyMatrixMult(Hop hop) {
+		//check for fusable but not opening matrix multiply (vect_outer-mult)
+		Hop in1 = hop.getInput().get(0); //transpose
+		Hop in2 = hop.getInput().get(1);
+		return LibMatrixMult.isSkinnyRightHandSide(in1.getDim2(), in1.getDim1(), hop.getDim1(), hop.getDim2())
+			|| LibMatrixMult.isSkinnyRightHandSide(in2.getDim1(), in2.getDim2(), hop.getDim2(), hop.getDim1());
 	}
 
 	@Override
@@ -138,7 +158,7 @@ public class TemplateRow extends TemplateBase
 		//reorder inputs (ensure matrix is first input, and other inputs ordered by size)
 		Hop[] sinHops = inHops.stream()
 			.filter(h -> !(h.getDataType().isScalar() && tmp.get(h.getHopID()).isLiteral()))
-			.sorted(new HopInputComparator(inHops2.get("X"))).toArray(Hop[]::new);
+			.sorted(new HopInputComparator(inHops2.get("X"),inHops2.get("B1"))).toArray(Hop[]::new);
 		
 		//construct template node
 		ArrayList<CNode> inputs = new ArrayList<CNode>();
@@ -146,7 +166,8 @@ public class TemplateRow extends TemplateBase
 			inputs.add(tmp.get(in.getHopID()));
 		CNode output = tmp.get(hop.getHopID());
 		CNodeRow tpl = new CNodeRow(inputs, output);
-		tpl.setRowType(TemplateUtils.getRowType(hop, sinHops[0]));
+		tpl.setRowType(TemplateUtils.getRowType(hop, 
+			inHops2.get("X"), inHops2.get("B1")));
 		tpl.setNumVectorIntermediates(TemplateUtils
 			.determineMinVectorIntermediates(output));
 		tpl.getOutput().resetVisitStatus();
@@ -217,7 +238,13 @@ public class TemplateRow extends TemplateBase
 				inHops.add(hop.getInput().get(0).getInput().get(0));
 				
 				//note: vectorMultAdd applicable to vector-scalar, and vector-vector
-				out = new CNodeBinary(cdata1, cdata2, BinType.VECT_MULT_ADD);
+				if( hop.getInput().get(1).getDim2() == 1 )
+					out = new CNodeBinary(cdata1, cdata2, BinType.VECT_MULT_ADD);
+				else {
+					out = new CNodeBinary(cdata1, cdata2, BinType.VECT_OUTERMULT_ADD);
+					if( !inHops2.containsKey("B1") )
+						inHops2.put("B1", hop.getInput().get(1));
+				}
 				inHops2.put("X", hop.getInput().get(0).getInput().get(0));
 			}
 			else
@@ -225,12 +252,24 @@ public class TemplateRow extends TemplateBase
 				if(hop.getInput().get(0).getDim2()==1 && hop.getInput().get(1).getDim2()==1)
 					out = new CNodeBinary((cdata1.getDataType()==DataType.SCALAR)? cdata1 : new CNodeUnary(cdata1, UnaryType.LOOKUP0),
 						(cdata2.getDataType()==DataType.SCALAR)? cdata2 : new CNodeUnary(cdata2, UnaryType.LOOKUP0), BinType.MULT);
-				else {
+				else if( hop.getInput().get(1).getDim2()==1 ) {
 					out = new CNodeBinary(cdata1, cdata2, BinType.DOT_PRODUCT);
 					inHops2.put("X", hop.getInput().get(0));
 				}
+				else {
+					out = new CNodeBinary(cdata1, cdata2, BinType.VECT_MATRIXMULT);
+					inHops2.put("X", hop.getInput().get(0));
+					inHops2.put("B1", hop.getInput().get(1));
+				}
 			}
 		}
+		else if( HopRewriteUtils.isTransposeOperation(hop) ) 
+		{
+			out = TemplateUtils.skipTranspose(tmp.get(hop.getHopID()), 
+				hop, tmp, compileLiterals);
+			if( out instanceof CNodeData && !inHops.contains(hop.getInput().get(0)) )
+				inHops.add(hop.getInput().get(0));
+		}
 		else if(hop instanceof UnaryOp)
 		{
 			CNode cdata1 = tmp.get(hop.getInput().get(0).getHopID());
@@ -272,7 +311,8 @@ public class TemplateRow extends TemplateBase
 				|| (hop.getInput().get(1).getDim1() > 1 && hop.getInput().get(1).getDim2() > 1))
 			{
 				if( HopRewriteUtils.isBinary(hop, SUPPORTED_VECT_BINARY) ) {
-					if( TemplateUtils.isMatrix(cdata1) && TemplateUtils.isMatrix(cdata2) ) {
+					if( TemplateUtils.isMatrix(cdata1) && (TemplateUtils.isMatrix(cdata2) 
+							|| TemplateUtils.isRowVector(cdata2)) ) {
 						String opname = "VECT_"+((BinaryOp)hop).getOp().name();
 						out = new CNodeBinary(cdata1, cdata2, BinType.valueOf(opname));
 					}
@@ -360,19 +400,21 @@ public class TemplateRow extends TemplateBase
 	public static class HopInputComparator implements Comparator<Hop> 
 	{
 		private final Hop _X;
+		private final Hop _B1;
 		
-		public HopInputComparator(Hop X) {
+		public HopInputComparator(Hop X, Hop B1) {
 			_X = X;
+			_B1 = B1;
 		}
 		
 		@Override
 		public int compare(Hop h1, Hop h2) {
 			long ncells1 = h1.getDataType()==DataType.SCALAR ? Long.MIN_VALUE : 
-				(h1==_X) ? Long.MAX_VALUE : 
-				h1.dimsKnown() ? h1.getDim1()*h1.getDim2() : Long.MAX_VALUE-1;
+				(h1==_X) ? Long.MAX_VALUE : (h1==_B1) ? Long.MAX_VALUE-1 : 
+				h1.dimsKnown() ? h1.getDim1()*h1.getDim2() : Long.MAX_VALUE-2;
 			long ncells2 = h2.getDataType()==DataType.SCALAR ? Long.MIN_VALUE : 
-				(h2==_X) ? Long.MAX_VALUE : 
-				h2.dimsKnown() ? h2.getDim1()*h2.getDim2() : Long.MAX_VALUE-1;
+				(h2==_X) ? Long.MAX_VALUE : (h2==_B1) ? Long.MAX_VALUE-1 : 
+				h2.dimsKnown() ? h2.getDim1()*h2.getDim2() : Long.MAX_VALUE-2;
 			return (ncells1 > ncells2) ? -1 : (ncells1 < ncells2) ? 1 : 0; 
 		}
 	}

http://git-wip-us.apache.org/repos/asf/systemml/blob/6b25b3bf/src/main/java/org/apache/sysml/hops/codegen/template/TemplateUtils.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/hops/codegen/template/TemplateUtils.java b/src/main/java/org/apache/sysml/hops/codegen/template/TemplateUtils.java
index da803cd..4bd5bf1 100644
--- a/src/main/java/org/apache/sysml/hops/codegen/template/TemplateUtils.java
+++ b/src/main/java/org/apache/sysml/hops/codegen/template/TemplateUtils.java
@@ -19,11 +19,7 @@
 
 package org.apache.sysml.hops.codegen.template;
 
-import java.util.ArrayList;
 import java.util.HashMap;
-import java.util.HashSet;
-import java.util.Iterator;
-import java.util.LinkedHashSet;
 
 import org.apache.commons.lang.ArrayUtils;
 import org.apache.sysml.hops.AggBinaryOp;
@@ -142,74 +138,7 @@ public class TemplateUtils
 			return TernaryType.contains(((ParameterizedBuiltinOp)h).getOp().name());
 		return false;
 	}
-
-	private static void rfindChildren(Hop hop, HashSet<Hop> children ) {		
-		if( hop instanceof UnaryOp || (hop instanceof BinaryOp && hop.getInput().get(0).getDataType() == DataType.MATRIX  &&  TemplateUtils.isVectorOrScalar( hop.getInput().get(1))) || (hop instanceof BinaryOp && TemplateUtils.isVectorOrScalar( hop.getInput().get(0))  &&  hop.getInput().get(1).getDataType() == DataType.MATRIX)    //unary operation or binary operaiton with one matrix and a scalar
-					&& 	hop.getDataType() == DataType.MATRIX )
-		{	
-			if(!children.contains(hop))
-				children.add(hop);
-			Hop matrix = TemplateUtils.isMatrix(hop.getInput().get(0)) ? hop.getInput().get(0) : hop.getInput().get(1);
-			rfindChildren(matrix,children);
-		}
-		else 
-			children.add(hop);
-	}
 	
-	private static Hop findCommonChild(Hop hop1, Hop hop2) {
-		//this method assumes that each two nodes have at most one common child 
-		LinkedHashSet<Hop> children1 = new LinkedHashSet<Hop>();
-		LinkedHashSet<Hop> children2 = new LinkedHashSet<Hop>();
-		
-		rfindChildren(hop1, children1 );
-		rfindChildren(hop2, children2 );
-		
-		//iterate on one set and find the first common child in the other set
-		Iterator<Hop> iter = children1.iterator();
-		while (iter.hasNext()) {
-			Hop candidate = iter.next();
-			if(children2.contains(candidate))
-				return candidate;
-		}
-		return null;
-	}
-	
-	public static Hop commonChild(ArrayList<Hop> _adddedMatrices, Hop input) {
-		Hop currentChild = null;
-		//loop on every added matrix and find its common child with the input, if all of them have the same common child then return it, otherwise null 
-		for(Hop addedMatrix : _adddedMatrices)
-		{
-			Hop child = findCommonChild(addedMatrix,input);
-			if(child == null)  // did not find a common child
-				return null;
-			if(currentChild == null) // first common child to be seen
-				currentChild = child;
-			else if(child.getHopID() != currentChild.getHopID())
-				return null;
-		}
-		return currentChild;
-	}
-
-	public static HashSet<Long> rGetInputHopIDs( CNode node, HashSet<Long> ids ) {
-		if( node instanceof CNodeData && !node.isLiteral() )
-			ids.add(((CNodeData)node).getHopID());
-		
-		for( CNode c : node.getInput() )
-			rGetInputHopIDs(c, ids);
-			
-		return ids;
-	}
-	
-	public static Hop[] mergeDistinct(HashSet<Long> ids, Hop[] input1, Hop[] input2) {
-		Hop[] ret = new Hop[ids.size()];
-		int pos = 0;
-		for( Hop[] input : new Hop[][]{input1, input2} )
-			for( Hop c : input )
-				if( ids.contains(c.getHopID()) )
-					ret[pos++] = c; 
-		return ret;
-	}
-
 	public static TemplateBase createTemplate(TemplateType type) {
 		return createTemplate(type, false);
 	}
@@ -242,21 +171,31 @@ public class TemplateUtils
 			CellType.FULL_AGG : CellType.ROW_AGG) : CellType.NO_AGG;
 	}
 	
-	public static RowType getRowType(Hop output, Hop input) {
-		if( HopRewriteUtils.isEqualSize(output, input) )
+	public static RowType getRowType(Hop output, Hop... inputs) {
+		Hop X = inputs[0];
+		Hop B1 = (inputs.length>1) ? inputs[1] : null;
+		if( HopRewriteUtils.isEqualSize(output, X) )
 			return RowType.NO_AGG;
-		else if( output.getDim1()==input.getDim1() && (output.getDim2()==1 
+		else if( B1 != null && output.getDim1()==X.getDim1() && output.getDim2()==B1.getDim2() )
+			return RowType.NO_AGG_B1;
+		else if( output.getDim1()==X.getDim1() && (output.getDim2()==1 
 				|| HopRewriteUtils.isBinary(output, OpOp2.CBIND)) 
 			&& !(output instanceof AggBinaryOp && HopRewriteUtils
-				.isTransposeOfItself(output.getInput().get(0),input)))
+				.isTransposeOfItself(output.getInput().get(0),X)))
 			return RowType.ROW_AGG;
 		else if( output instanceof AggUnaryOp 
 			&& ((AggUnaryOp)output).getDirection()==Direction.RowCol )
 			return RowType.FULL_AGG;
-		else if( output.getDim1()==input.getDim2() && output.getDim2()==1 )
+		else if( output.getDim1()==X.getDim2() && output.getDim2()==1 )
 			return RowType.COL_AGG_T;
-		else
+		else if( output.getDim1()==1 && output.getDim2()==X.getDim2() )
 			return RowType.COL_AGG;
+		else if( B1 != null && output.getDim1()==X.getDim2() && output.getDim2()==B1.getDim2() )
+			return RowType.COL_AGG_B1_T;
+		else if( B1 != null && output.getDim1()==B1.getDim2() && output.getDim2()==X.getDim2())
+			return RowType.COL_AGG_B1;
+		else
+			throw new RuntimeException("Unknown row type.");
 	}
 	
 	public static AggOp getAggOp(Hop hop) {
@@ -293,6 +232,11 @@ public class TemplateUtils
 			&& ArrayUtils.contains(types, ((CNodeUnary)node).getType());
 	}
 	
+	public static boolean isBinary(CNode node, BinType...types) { //true iff node is a CNodeBinary whose type is one of the given types
+		return node instanceof CNodeBinary //instanceof is false for null, so a null node yields false
+			&& ArrayUtils.contains(types, ((CNodeBinary)node).getType());
+	}
+	
 	public static boolean isTernary(CNode node, TernaryType...types) {
 		return node instanceof CNodeTernary
 			&& ArrayUtils.contains(types, ((CNodeTernary)node).getType());
@@ -333,7 +277,8 @@ public class TemplateUtils
 		CNode output = tpl.getOutput();
 		return ((output instanceof CNodeUnary 
 				&& !TemplateUtils.isUnary(output, UnaryType.EXP, UnaryType.LOG)) 
-			|| output instanceof CNodeBinary) 
+			|| (output instanceof CNodeBinary
+				&& !TemplateUtils.isBinary(output, BinType.VECT_OUTERMULT_ADD))) 
 			&& hasOnlyDataNodeOrLookupInputs(output);
 	}
 	
@@ -365,8 +310,7 @@ public class TemplateUtils
 	public static boolean isUnaryOperatorPipeline(CNode node) {
 		if( node.isVisited() ) {
 			//second reference to vector intermediate invalidates a unary pipeline
-			return !((node instanceof CNodeBinary && ((CNodeBinary)node).getType().isVectorPrimitive())
-				|| (node instanceof CNodeUnary && ((CNodeUnary)node).getType().isVectorScalarPrimitive()));
+			return !(node instanceof CNodeBinary && ((CNodeBinary)node).getType().isVectorPrimitive());
 		}
 		boolean ret = true;
 		for( CNode input : node.getInput() )
@@ -382,8 +326,9 @@ public class TemplateUtils
 		for( CNode input : node.getInput() )
 			max = Math.max(max, getMaxVectorIntermediates(input));
 		max = Math.max(max, (node instanceof CNodeBinary)? 
-			((CNodeBinary)node).getType().isVectorVectorPrimitive() ? 3 :
-			((CNodeBinary)node).getType().isVectorScalarPrimitive() ? 2 : 0 : 0);
+			(((CNodeBinary)node).getType().isVectorVectorPrimitive() ? 3 :
+			((CNodeBinary)node).getType().isVectorScalarPrimitive() ? 2 :
+			((CNodeBinary)node).getType().isVectorMatrixPrimitive() ? 1 : 0) : 0);
 		max = Math.max(max, (node instanceof CNodeUnary 
 			&& ((CNodeUnary)node).getType().isVectorScalarPrimitive()) ? 2 : 0);
 		node.setVisited();
@@ -432,4 +377,22 @@ public class TemplateUtils
 		}
 		return ret;
 	}
+	
+	public static boolean containsBinary(CNode node, BinType type) { //checks if node or any node reachable via getInput() is a binary of the given type
+		node.resetVisitStatus(); //reset before the traversal because rContainsBinary skips nodes already marked visited
+		boolean ret = rContainsBinary(node, type);
+		node.resetVisitStatus(); //rContainsBinary marks the traversed nodes as visited, so reset again afterwards
+		return ret;
+	}
+	
+	public static boolean rContainsBinary(CNode node, BinType type) { //recursive helper of containsBinary, marks every traversed node as visited
+		if( node.isVisited() ) //already visited (e.g., a shared input reached via another path), nothing to add
+			return false;
+		boolean ret = false;
+		for( CNode input : node.getInput() )
+			ret |= rContainsBinary(input, type); //|= never short-circuits, so all inputs are traversed and marked
+		ret |= isBinary(node, type); //finally check the node itself
+		node.setVisited();
+		return ret;
+	}
 }

http://git-wip-us.apache.org/repos/asf/systemml/blob/6b25b3bf/src/main/java/org/apache/sysml/runtime/codegen/LibSpoofPrimitives.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/codegen/LibSpoofPrimitives.java b/src/main/java/org/apache/sysml/runtime/codegen/LibSpoofPrimitives.java
index ad2530d..1108c08 100644
--- a/src/main/java/org/apache/sysml/runtime/codegen/LibSpoofPrimitives.java
+++ b/src/main/java/org/apache/sysml/runtime/codegen/LibSpoofPrimitives.java
@@ -20,6 +20,7 @@
 package org.apache.sysml.runtime.codegen;
 
 import java.util.Arrays;
+import java.util.Iterator;
 import java.util.LinkedList;
 
 import org.apache.commons.math3.util.FastMath;
@@ -57,6 +58,50 @@ public class LibSpoofPrimitives
 		return LibMatrixMult.dotProduct(a, b, aix, ai, bi, len);
 	}
 	
+	public static double[] vectMatrixMult(double[] a, double[] b, int ai, int bi, int len) { //dense vector-matrix multiply: one dot product of length len per output entry
+		//note: assumption b is already transposed for efficient dot products
+		int m2clen = b.length / len; //number of outputs, derived from the full length of b (bi is not subtracted)
+		double[] c = allocVector(m2clen, false); //reset flag false, every entry of c is assigned in the loop below
+		for( int j = 0, bix = bi; j < m2clen; j++, bix+=len ) //bix advances by len, i.e., to the next len-sized segment of b
+			c[j] = LibMatrixMult.dotProduct(a, b, ai, bix, len);
+		return c;
+	}
+	
+	public static double[] vectMatrixMult(double[] a, double[] b, int[] aix, int ai, int bi, int alen, int len) { //sparse-row variant; presumably a/aix hold alen values/indexes from offset ai - TODO confirm against LibMatrixMult.dotProduct
+		//note: assumption b is already transposed for efficient dot products
+		int m2clen = b.length / len; //number of outputs, derived from the full length of b (bi is not subtracted)
+		double[] c = allocVector(m2clen, false); //reset flag false, every entry of c is assigned in the loop below
+		for( int j = 0, bix = bi; j < m2clen; j++, bix+=len ) //segments of b are still len apart, only alen entries of a are passed on
+			c[j] = LibMatrixMult.dotProduct(a, b, aix, ai, bix, alen);
+		return c;
+	}
+	
+	public static void vectOuterMultAdd(double[] a, double[] b, double[] c, int ai, int bi, int ci, int len1, int len2) { //dense outer product added into c: row i of c (stride len2, from ci) is updated with a[ai+i] times b[bi..bi+len2)
+		//rest, not aligned to 4-blocks
+		final int bn = len1%4; //these leading rows are handled one at a time so that the remaining row count is a multiple of 4
+		for( int i=0, cix=ci; i < bn; i++, cix+=len2 )
+			if( a[ai+i] != 0 ) //skip zero multipliers (only done here, the unrolled loop below has no such check)
+				LibMatrixMult.vectMultiplyAdd(a[ai+i], b, c, bi, cix, len2); //presumably c[cix+j] += a[ai+i]*b[bi+j] for j<len2 - TODO confirm against LibMatrixMult
+		
+		//unrolled 4-block (for fewer L1-dcache loads)
+		for( int i=bn, cix=ci+bn*len2; i < len1; i+=4, cix+=4*len2 ) { //continues right after the bn remainder rows
+			final int cix1=cix, cix2=cix+len2, cix3=cix+2*len2, cix4=cix+3*len2; //start offsets of the four rows of c updated together
+			final double aval1=a[ai+i], aval2=a[ai+i+1], aval3=a[ai+i+2], aval4=a[ai+i+3];
+			for( int j=0; j<len2; j++ ) {
+				final double bval = b[bi+j]; //read once and reused for all four rows
+				c[cix1 + j] += aval1 * bval;
+				c[cix2 + j] += aval2 * bval;
+				c[cix3 + j] += aval3 * bval;
+				c[cix4 + j] += aval4 * bval;
+			}
+		}	
+	}
+	
+	public static void vectOuterMultAdd(double[] a, double[] b, double[] c, int[] aix, int ai, int bi, int ci, int alen, int len1, int len2) { //sparse variant: processes only the alen entries a[ai..ai+alen); len1 is not used in this body
+		for( int i=0; i < alen; i++ )
+			LibMatrixMult.vectMultiplyAdd(a[ai+i], b, c, bi, ci+aix[ai+i]*len2, len2); //aix[ai+i] selects the target row of c (row stride len2)
+	}
+	
 	public static void vectMultAdd(double[] a, double bval, double[] c, int bi, int ci, int len) {
 		if( a == null || bval == 0 ) return;
 		LibMatrixMult.vectMultiplyAdd(bval, a, c, bi, ci, len);
@@ -1227,7 +1272,14 @@ public class LibSpoofPrimitives
 	//dynamic memory management
 	
 	public static void setupThreadLocalMemory(int numVectors, int len) {
+		setupThreadLocalMemory(numVectors, len, -1);
+	}
+	
+	public static void setupThreadLocalMemory(int numVectors, int len, int len2) {
 		LinkedList<double[]> list = new LinkedList<double[]>();
+		if( len2 >= 0 ) 
+			for( int i=0; i<numVectors; i++ )
+				list.addLast(new double[len2]);
 		for( int i=0; i<numVectors; i++ )
 			list.addLast(new double[len]);
 		memPool.set(list);
@@ -1242,24 +1294,29 @@ public class LibSpoofPrimitives
 	}
 	
 	private static double[] allocVector(int len, boolean reset, double resetVal) {
-		LinkedList<double[]> list = memPool.get();
+		LinkedList<double[]> list = memPool.get(); 
 		
-		//sanity check for missing setup
-		if( list.isEmpty() ) {
-			double[] tmp = new double[len];
-			if( reset && resetVal != 0 )
-				Arrays.fill(tmp, resetVal);
-			return tmp;
+		//find and remove vector with matching len 
+		double[] vect = null;
+		Iterator<double[]> iter = list.iterator();
+		while( iter.hasNext() ) {
+			double[] tmp = iter.next();
+			if( tmp.length == len ) {
+				vect = tmp;
+				iter.remove();
+				break;
+			}
 		}
 		
-		//get and re-queue first entry
-		double[] tmp = list.removeFirst();
-		list.addLast(tmp);
+		//allocate new vector or re-queue if required
+		if( vect == null )
+			vect = new double[len];
+		else 
+			list.addLast(vect);
 		
 		//reset vector if required
 		if( reset )
-			Arrays.fill(tmp, resetVal);
-		return tmp;
+			Arrays.fill(vect, resetVal);
+		return vect;
 	}
 }
-

http://git-wip-us.apache.org/repos/asf/systemml/blob/6b25b3bf/src/main/java/org/apache/sysml/runtime/codegen/SpoofCellwise.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/codegen/SpoofCellwise.java b/src/main/java/org/apache/sysml/runtime/codegen/SpoofCellwise.java
index cc8ef69..15de508 100644
--- a/src/main/java/org/apache/sysml/runtime/codegen/SpoofCellwise.java
+++ b/src/main/java/org/apache/sysml/runtime/codegen/SpoofCellwise.java
@@ -118,7 +118,7 @@ public abstract class SpoofCellwise extends SpoofOperator implements Serializabl
 		}
 		
 		//input preparation
-		SideInput[] b = prepInputMatricesAbstract(inputs);
+		SideInput[] b = prepInputMatrices(inputs);
 		double[] scalars = prepInputScalars(scalarObjects);
 		final int m = inputs.get(0).getNumRows();
 		final int n = inputs.get(0).getNumColumns();
@@ -198,7 +198,7 @@ public abstract class SpoofCellwise extends SpoofOperator implements Serializabl
 		
 		//input preparation
 		MatrixBlock a = inputs.get(0);
-		SideInput[] b = prepInputMatricesAbstract(inputs);
+		SideInput[] b = prepInputMatrices(inputs);
 		double[] scalars = prepInputScalars(scalarObjects);
 		final int m = a.getNumRows();
 		final int n = a.getNumColumns();

http://git-wip-us.apache.org/repos/asf/systemml/blob/6b25b3bf/src/main/java/org/apache/sysml/runtime/codegen/SpoofMultiAggregate.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/codegen/SpoofMultiAggregate.java b/src/main/java/org/apache/sysml/runtime/codegen/SpoofMultiAggregate.java
index e7e3b54..c3755d4 100644
--- a/src/main/java/org/apache/sysml/runtime/codegen/SpoofMultiAggregate.java
+++ b/src/main/java/org/apache/sysml/runtime/codegen/SpoofMultiAggregate.java
@@ -91,7 +91,7 @@ public abstract class SpoofMultiAggregate extends SpoofOperator implements Seria
 		setInitialOutputValues(c);
 		
 		//input preparation
-		SideInput[] b = prepInputMatricesAbstract(inputs);
+		SideInput[] b = prepInputMatrices(inputs);
 		double[] scalars = prepInputScalars(scalarObjects);
 		final int m = inputs.get(0).getNumRows();
 		final int n = inputs.get(0).getNumColumns();

http://git-wip-us.apache.org/repos/asf/systemml/blob/6b25b3bf/src/main/java/org/apache/sysml/runtime/codegen/SpoofOperator.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/codegen/SpoofOperator.java b/src/main/java/org/apache/sysml/runtime/codegen/SpoofOperator.java
index d3bf410..9561fcb 100644
--- a/src/main/java/org/apache/sysml/runtime/codegen/SpoofOperator.java
+++ b/src/main/java/org/apache/sysml/runtime/codegen/SpoofOperator.java
@@ -27,6 +27,7 @@ import org.apache.commons.logging.LogFactory;
 import org.apache.sysml.runtime.DMLRuntimeException;
 import org.apache.sysml.runtime.compress.CompressedMatrixBlock;
 import org.apache.sysml.runtime.instructions.cp.ScalarObject;
+import org.apache.sysml.runtime.matrix.data.LibMatrixReorg;
 import org.apache.sysml.runtime.matrix.data.MatrixBlock;
 import org.apache.sysml.runtime.util.DataConverter;
 import org.apache.sysml.runtime.util.UtilFunctions;
@@ -59,72 +60,62 @@ public abstract class SpoofOperator implements Serializable
 		return execute(inputs, scalars);
 	}
 	
-	protected double[][] prepInputMatricesDense(ArrayList<MatrixBlock> inputs) 
-		throws DMLRuntimeException 
-	{
-		return prepInputMatricesDense(inputs, 1, inputs.size()-1);
+	protected SideInput[] prepInputMatrices(ArrayList<MatrixBlock> inputs) throws DMLRuntimeException {
+		return prepInputMatrices(inputs, 1, inputs.size()-1, false, false);
 	}
 	
-	protected double[][] prepInputMatricesDense(ArrayList<MatrixBlock> inputs, int offset) 
-		throws DMLRuntimeException 
-	{
-		return prepInputMatricesDense(inputs, offset, inputs.size()-offset);
+	protected SideInput[] prepInputMatrices(ArrayList<MatrixBlock> inputs, boolean denseOnly) throws DMLRuntimeException {
+		return prepInputMatrices(inputs, 1, inputs.size()-1, denseOnly, false);
 	}
 	
-	protected double[][] prepInputMatricesDense(ArrayList<MatrixBlock> inputs, int offset, int len) 
-		throws DMLRuntimeException 
-	{
-		double[][] b = new double[len][]; 
-		for(int i=offset; i<offset+len; i++) {
-			if( inputs.get(i) instanceof CompressedMatrixBlock ) 
-				inputs.set(i, ((CompressedMatrixBlock)inputs.get(i)).decompress());
-			
-			//convert empty or sparse to dense temporary block (note: we don't do
-			//this in place because this block might be used by multiple threads)
-			if( inputs.get(i).isInSparseFormat() && inputs.get(i).isAllocated() ) {
-				MatrixBlock tmp = inputs.get(i);
-				b[i-offset] = DataConverter.convertToDoubleVector(tmp);
-				LOG.warn(getClass().getName()+": Converted "+tmp.getNumRows()+"x"+tmp.getNumColumns()+
-						", nnz="+tmp.getNonZeros()+" sideways input matrix from sparse to dense.");
-			}
-			//use existing dense block
-			else {
-				b[i-offset] = inputs.get(i).getDenseBlock();
-			}
-		}
-		
-		return b;
+	protected SideInput[] prepInputMatrices(ArrayList<MatrixBlock> inputs, int offset, boolean denseOnly) throws DMLRuntimeException {
+		return prepInputMatrices(inputs, offset, inputs.size()-offset, denseOnly, false);
 	}
 	
-	protected SideInput[] prepInputMatricesAbstract(ArrayList<MatrixBlock> inputs) 
-		throws DMLRuntimeException 
-	{
-		return prepInputMatricesAbstract(inputs, 1, inputs.size()-1);
-	}
-	
-	protected SideInput[] prepInputMatricesAbstract(ArrayList<MatrixBlock> inputs, int offset) 
-		throws DMLRuntimeException 
-	{
-		return prepInputMatricesAbstract(inputs, offset, inputs.size()-offset);
+	protected SideInput[] prepInputMatrices(ArrayList<MatrixBlock> inputs, boolean denseOnly, boolean tB1) throws DMLRuntimeException {
+		return prepInputMatrices(inputs, 1, inputs.size()-1, denseOnly, tB1);
 	}
 	
-	protected SideInput[] prepInputMatricesAbstract(ArrayList<MatrixBlock> inputs, int offset, int len) 
+	protected SideInput[] prepInputMatrices(ArrayList<MatrixBlock> inputs, int offset, int len, boolean denseOnly, boolean tB1) 
 		throws DMLRuntimeException 
 	{
 		SideInput[] b = new SideInput[len]; 
 		for(int i=offset; i<offset+len; i++) {
+			//decompress if necessary
 			if( inputs.get(i) instanceof CompressedMatrixBlock ) 
 				inputs.set(i, ((CompressedMatrixBlock)inputs.get(i)).decompress());
+			//transpose if necessary
+			int clen = inputs.get(i).getNumColumns();
+			MatrixBlock in = (tB1 && i==1 ) ? LibMatrixReorg.transpose(inputs.get(i), 
+				new MatrixBlock(clen, inputs.get(i).getNumRows(), false)) : inputs.get(i);
 			
-			if( inputs.get(i).isInSparseFormat() && inputs.get(i).isAllocated() )
-				b[i-offset] = new SideInput(null, inputs.get(i));
-			else
-				b[i-offset] = new SideInput(inputs.get(i).getDenseBlock(), null);
+			//create side input
+			if( denseOnly && (in.isInSparseFormat() || !in.isAllocated()) ) {
+				//convert empty or sparse to dense temporary block (note: we don't do
+				//this in place because this block might be used by multiple threads)
+				b[i-offset] = new SideInput(DataConverter.convertToDoubleVector(in), null, clen);
+				LOG.warn(getClass().getName()+": Converted "+in.getNumRows()+"x"+in.getNumColumns()+
+					", nnz="+in.getNonZeros()+" sideways input matrix from sparse to dense.");	
+			}
+			else if( in.isInSparseFormat() && in.isAllocated() ) {
+				b[i-offset] = new SideInput(null, in, clen);
+			}
+			else {
+				b[i-offset] = new SideInput(
+					in.getDenseBlock(), null, clen);
+			}
 		}
 		
 		return b;
 	}
 	
+	public double[][] getDenseMatrices(SideInput[] inputs) { //returns the ddat array of every side input, in input order
+		double[][] ret = new double[inputs.length][];
+		for( int i=0; i<inputs.length; i++ )
+			ret[i] = inputs[i].ddat; //by reference (no copy); null if the side input was created without a dense array
+		return ret;
+	}
+	
 	protected double[] prepInputScalars(ArrayList<ScalarObject> scalarObjects) {
 		double[] scalars = new double[scalarObjects.size()]; 
 		for(int i=0; i < scalarObjects.size(); i++)
@@ -161,8 +152,8 @@ public abstract class SpoofOperator implements Serializable
 	
 	protected static double getValue(SideInput data, int rowIndex) {
 		//note: wrapper sideinput guaranteed to exist
-		return (data.dBlock!=null) ? data.dBlock[rowIndex] : 
-			(data.mBlock!=null) ? data.mBlock.quickGetValue(rowIndex, 0) : 0;
+		return (data.ddat!=null) ? data.ddat[rowIndex] : 
+			(data.mdat!=null) ? data.mdat.quickGetValue(rowIndex, 0) : 0;
 	}
 	
 	protected static double getValue(SideInput data, int n, double rowIndex, double colIndex) {
@@ -173,17 +164,19 @@ public abstract class SpoofOperator implements Serializable
 	
 	protected static double getValue(SideInput data, int n, int rowIndex, int colIndex) {
 		//note: wrapper sideinput guaranteed to exist
-		return (data.dBlock!=null) ? data.dBlock[rowIndex*n+colIndex] : 
-			(data.mBlock!=null) ? data.mBlock.quickGetValue(rowIndex, colIndex) : 0;
+		return (data.ddat!=null) ? data.ddat[rowIndex*n+colIndex] : 
+			(data.mdat!=null) ? data.mdat.quickGetValue(rowIndex, colIndex) : 0;
 	}
 	
 	public static class SideInput {
-		private final double[] dBlock;
-		private final MatrixBlock mBlock;
-	
-		public SideInput(double[] ddata, MatrixBlock mdata) {
-			dBlock = ddata;
-			mBlock = mdata;
+		public final double[] ddat;
+		public final MatrixBlock mdat;
+		public final int clen;
+	
+		public SideInput(double[] ddata, MatrixBlock mdata, int clength) {
+			ddat = ddata;
+			mdat = mdata;
+			clen = clength;
 		}
 	}
 }

http://git-wip-us.apache.org/repos/asf/systemml/blob/6b25b3bf/src/main/java/org/apache/sysml/runtime/codegen/SpoofOuterProduct.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/codegen/SpoofOuterProduct.java b/src/main/java/org/apache/sysml/runtime/codegen/SpoofOuterProduct.java
index c66d065..90c7507 100644
--- a/src/main/java/org/apache/sysml/runtime/codegen/SpoofOuterProduct.java
+++ b/src/main/java/org/apache/sysml/runtime/codegen/SpoofOuterProduct.java
@@ -79,8 +79,8 @@ public abstract class SpoofOuterProduct extends SpoofOperator
 			return new DoubleObject(0);
 		
 		//input preparation
-		double[][] ab = prepInputMatricesDense(inputs, 1, 2);
-		double[][] b = prepInputMatricesDense(inputs, 3);
+		double[][] ab = getDenseMatrices(prepInputMatrices(inputs, 1, 2, true, false));
+		double[][] b = getDenseMatrices(prepInputMatrices(inputs, 3, true));
 		double[] scalars = prepInputScalars(scalarObjects);
 		
 		//core sequential execute
@@ -112,8 +112,8 @@ public abstract class SpoofOuterProduct extends SpoofOperator
 			return new DoubleObject(0);
 		
 		//input preparation
-		double[][] ab = prepInputMatricesDense(inputs, 1, 2);
-		double[][] b = prepInputMatricesDense(inputs, 3);
+		double[][] ab = getDenseMatrices(prepInputMatrices(inputs, 1, 2, true, false));
+		double[][] b = getDenseMatrices(prepInputMatrices(inputs, 3, true));
 		double[] scalars = prepInputScalars(scalarObjects);
 		
 		//core sequential execute
@@ -179,8 +179,8 @@ public abstract class SpoofOuterProduct extends SpoofOperator
 		out.allocateDenseOrSparseBlock();
 		
 		//input preparation
-		double[][] ab = prepInputMatricesDense(inputs, 1, 2);
-		double[][] b = prepInputMatricesDense(inputs, 3);
+		double[][] ab = getDenseMatrices(prepInputMatrices(inputs, 1, 2, true, false));
+		double[][] b = getDenseMatrices(prepInputMatrices(inputs, 3, true));
 		double[] scalars = prepInputScalars(scalarObjects);
 				
 		//core sequential execute
@@ -257,8 +257,8 @@ public abstract class SpoofOuterProduct extends SpoofOperator
 		}	
 		
 		//input preparation
-		double[][] ab = prepInputMatricesDense(inputs, 1, 2);
-		double[][] b = prepInputMatricesDense(inputs, 3);
+		double[][] ab = getDenseMatrices(prepInputMatrices(inputs, 1, 2, true, false));
+		double[][] b = getDenseMatrices(prepInputMatrices(inputs, 3, true));
 		double[] scalars = prepInputScalars(scalarObjects);
 		
 		//core sequential execute

http://git-wip-us.apache.org/repos/asf/systemml/blob/6b25b3bf/src/main/java/org/apache/sysml/runtime/codegen/SpoofRowwise.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/codegen/SpoofRowwise.java b/src/main/java/org/apache/sysml/runtime/codegen/SpoofRowwise.java
index 611e4ad..13536d3 100644
--- a/src/main/java/org/apache/sysml/runtime/codegen/SpoofRowwise.java
+++ b/src/main/java/org/apache/sysml/runtime/codegen/SpoofRowwise.java
@@ -46,23 +46,32 @@ public abstract class SpoofRowwise extends SpoofOperator
 	
 	public enum RowType {
 		NO_AGG,    //no aggregation
+		NO_AGG_B1, //no aggregation w/ matrix mult B1
 		FULL_AGG,  //full row/col aggregation
 		ROW_AGG,   //row aggregation (e.g., rowSums() or X %*% v)
 		COL_AGG,   //col aggregation (e.g., colSums() or t(y) %*% X)
-		COL_AGG_T; //transposed col aggregation (e.g., t(X) %*% y)
+		COL_AGG_T, //transposed col aggregation (e.g., t(X) %*% y)
+		COL_AGG_B1,   //col aggregation w/ matrix mult B1
+		COL_AGG_B1_T; //transposed col aggregation w/ matrix mult B1
 		
 		public boolean isColumnAgg() {
-			return (this == COL_AGG || this == COL_AGG_T);
+			return (this == COL_AGG || this == COL_AGG_T)
+				|| (this == COL_AGG_B1) || (this == COL_AGG_B1_T);
 		}
+		public boolean isRowTypeB1() { //true for the three row types declared as "w/ matrix mult B1"
+			return (this == NO_AGG_B1) || (this == COL_AGG_B1) || (this == COL_AGG_B1_T);
+		} 
 	}
 	
 	protected final RowType _type;
 	protected final boolean _cbind0;
+	protected final boolean _tB1;
 	protected final int _reqVectMem;
 	
-	public SpoofRowwise(RowType type, boolean cbind0, int reqVectMem) {
+	public SpoofRowwise(RowType type, boolean cbind0, boolean tB1, int reqVectMem) {
 		_type = type;
 		_cbind0 = cbind0;
+		_tB1 = tB1;
 		_reqVectMem = reqVectMem;
 	}
 	
@@ -112,17 +121,18 @@ public abstract class SpoofRowwise extends SpoofOperator
 		//result allocation and preparations
 		final int m = inputs.get(0).getNumRows();
 		final int n = inputs.get(0).getNumColumns();
+		final int n2 = _type.isRowTypeB1() ? inputs.get(1).getNumColumns() : -1;
 		if( !aggIncr || !out.isAllocated() )
-			allocateOutputMatrix(m, n, out);
+			allocateOutputMatrix(m, n, n2, out);
 		double[] c = out.getDenseBlock();
 		
 		//input preparation
-		double[][] b = prepInputMatricesDense(inputs);
+		SideInput[] b = prepInputMatrices(inputs, 1, inputs.size()-1, true, _tB1);
 		double[] scalars = prepInputScalars(scalarObjects);
 		
 		//setup thread-local memory if necessary
 		if( allocTmp )
-			LibSpoofPrimitives.setupThreadLocalMemory(_reqVectMem, n);
+			LibSpoofPrimitives.setupThreadLocalMemory(_reqVectMem, n, n2);
 		
 		//core sequential execute
 		MatrixBlock a = inputs.get(0);
@@ -157,10 +167,11 @@ public abstract class SpoofRowwise extends SpoofOperator
 		//result allocation and preparations
 		final int m = inputs.get(0).getNumRows();
 		final int n = inputs.get(0).getNumColumns();
-		allocateOutputMatrix(m, n, out);
+		final int n2 = _type.isRowTypeB1() ? inputs.get(1).getNumColumns() : -1;
+		allocateOutputMatrix(m, n, n2, out);
 		
 		//input preparation
-		double[][] b = prepInputMatricesDense(inputs);
+		SideInput[] b = prepInputMatrices(inputs, 1, inputs.size()-1, true, _tB1);
 		double[] scalars = prepInputScalars(scalarObjects);
 		
 		//core parallel execute
@@ -173,10 +184,10 @@ public abstract class SpoofRowwise extends SpoofOperator
 				//execute tasks
 				ArrayList<ParColAggTask> tasks = new ArrayList<ParColAggTask>();
 				for( int i=0; i<nk & i*blklen<m; i++ )
-					tasks.add(new ParColAggTask(inputs.get(0), b, scalars, n, i*blklen, Math.min((i+1)*blklen, m)));
+					tasks.add(new ParColAggTask(inputs.get(0), b, scalars, n, n2, i*blklen, Math.min((i+1)*blklen, m)));
 				List<Future<double[]>> taskret = pool.invokeAll(tasks);	
 				//aggregate partial results
-				int len = _type.isColumnAgg() ? n : 1;
+				int len = _type.isColumnAgg() ? out.getNumRows()*out.getNumColumns() : 1;
 				for( Future<double[]> task : taskret )
 					LibMatrixMult.vectAdd(task.get(), out.getDenseBlock(), 0, 0, len);
 				out.recomputeNonZeros();
@@ -185,7 +196,7 @@ public abstract class SpoofRowwise extends SpoofOperator
 				//execute tasks
 				ArrayList<ParExecTask> tasks = new ArrayList<ParExecTask>();
 				for( int i=0; i<nk & i*blklen<m; i++ )
-					tasks.add(new ParExecTask(inputs.get(0), b, out, scalars, n, i*blklen, Math.min((i+1)*blklen, m)));
+					tasks.add(new ParExecTask(inputs.get(0), b, out, scalars, n, n2, i*blklen, Math.min((i+1)*blklen, m)));
 				List<Future<Long>> taskret = pool.invokeAll(tasks);
 				//aggregate nnz, no need to aggregate results
 				long nnz = 0;
@@ -202,18 +213,22 @@ public abstract class SpoofRowwise extends SpoofOperator
 		}
 	}
 	
-	private void allocateOutputMatrix(int m, int n, MatrixBlock out) {
+	private void allocateOutputMatrix(int m, int n, int n2, MatrixBlock out) {
 		switch( _type ) {
-			case NO_AGG: out.reset(m, n, false); break;
-			case FULL_AGG: out.reset(1, 1, false); break;
-			case ROW_AGG: out.reset(m, 1+(_cbind0?1:0), false); break;
-			case COL_AGG: out.reset(1, n, false); break;
-			case COL_AGG_T: out.reset(n, 1, false); break;
+			case NO_AGG:       out.reset(m, n, false); break;
+			case NO_AGG_B1:    out.reset(m, n2, false); break;
+			case FULL_AGG:     out.reset(1, 1, false); break;
+			case ROW_AGG:      out.reset(m, 1+(_cbind0?1:0), false); break;
+			case COL_AGG:      out.reset(1, n, false); break;
+			case COL_AGG_T:    out.reset(n, 1, false); break;
+			case COL_AGG_B1:   out.reset(n2, n, false); break;
+			case COL_AGG_B1_T: out.reset(n, n2, false); break;
+			
 		}
 		out.allocateDenseBlock();
 	}
 	
-	private void executeDense(double[] a, double[][] b, double[] scalars, double[] c, int n, int rl, int ru) 
+	private void executeDense(double[] a, SideInput[] b, double[] scalars, double[] c, int n, int rl, int ru) 
 	{
 		if( a == null )
 			return;
@@ -224,7 +239,7 @@ public abstract class SpoofRowwise extends SpoofOperator
 		}
 	}
 	
-	private void executeSparse(SparseBlock sblock, double[][] b, double[] scalars, double[] c, int n, int rl, int ru) 
+	private void executeSparse(SparseBlock sblock, SideInput[] b, double[] scalars, double[] c, int n, int rl, int ru) 
 	{
 		SparseRow empty = new SparseRowVector(1);
 		for( int i=rl; i<ru; i++ ) {
@@ -243,7 +258,7 @@ public abstract class SpoofRowwise extends SpoofOperator
 		}
 	}
 	
-	private void executeCompressed(CompressedMatrixBlock a, double[][] b, double[] scalars, double[] c, int n, int rl, int ru) 
+	private void executeCompressed(CompressedMatrixBlock a, SideInput[] b, double[] scalars, double[] c, int n, int rl, int ru) 
 	{
 		if( a.isEmptyBlock(false) )
 			return;
@@ -272,10 +287,10 @@ public abstract class SpoofRowwise extends SpoofOperator
 	//methods to be implemented by generated operators of type SpoofRowAggrgate 
 	
 	protected abstract void genexec(double[] a, int ai, 
-		double[][] b, double[] scalars, double[] c, int len, int rowIndex);
+		SideInput[] b, double[] scalars, double[] c, int len, int rowIndex);
 	
 	protected abstract void genexec(double[] avals, int[] aix, int ai, 
-		double[][] b, double[] scalars, double[] c, int alen, int n, int rowIndex);
+		SideInput[] b, double[] scalars, double[] c, int alen, int n, int rowIndex);
 
 	
 	/**
@@ -284,17 +299,19 @@ public abstract class SpoofRowwise extends SpoofOperator
 	private class ParColAggTask implements Callable<double[]> 
 	{
 		private final MatrixBlock _a;
-		private final double[][] _b;
+		private final SideInput[] _b;
 		private final double[] _scalars;
 		private final int _clen;
+		private final int _clen2;
 		private final int _rl;
 		private final int _ru;
 
-		protected ParColAggTask( MatrixBlock a, double[][] b, double[] scalars, int clen, int rl, int ru ) {
+		protected ParColAggTask( MatrixBlock a, SideInput[] b, double[] scalars, int clen, int clen2, int rl, int ru ) {
 			_a = a;
 			_b = b;
 			_scalars = scalars;
 			_clen = clen;
+			_clen2 = clen2;
 			_rl = rl;
 			_ru = ru;
 		}
@@ -303,8 +320,8 @@ public abstract class SpoofRowwise extends SpoofOperator
 		public double[] call() throws DMLRuntimeException {
 			
 			//allocate vector intermediates and partial output
-			LibSpoofPrimitives.setupThreadLocalMemory(_reqVectMem, _clen);
-			double[] c = new double[_clen];
+			LibSpoofPrimitives.setupThreadLocalMemory(_reqVectMem, _clen, _clen2);
+			double[] c = new double[(_clen2>0)?_clen*_clen2 : _clen];
 			
 			if( _a instanceof CompressedMatrixBlock )
 				executeCompressed((CompressedMatrixBlock)_a, _b, _scalars, c, _clen, _rl, _ru);
@@ -324,19 +341,21 @@ public abstract class SpoofRowwise extends SpoofOperator
 	private class ParExecTask implements Callable<Long> 
 	{
 		private final MatrixBlock _a;
-		private final double[][] _b;
+		private final SideInput[] _b;
 		private final MatrixBlock _c;
 		private final double[] _scalars;
 		private final int _clen;
+		private final int _clen2;
 		private final int _rl;
 		private final int _ru;
 
-		protected ParExecTask( MatrixBlock a, double[][] b, MatrixBlock c, double[] scalars, int clen, int rl, int ru ) {
+		protected ParExecTask( MatrixBlock a, SideInput[] b, MatrixBlock c, double[] scalars, int clen, int clen2, int rl, int ru ) {
 			_a = a;
 			_b = b;
 			_c = c;
 			_scalars = scalars;
 			_clen = clen;
+			_clen2 = clen2;
 			_rl = rl;
 			_ru = ru;
 		}
@@ -344,7 +363,7 @@ public abstract class SpoofRowwise extends SpoofOperator
 		@Override
 		public Long call() throws DMLRuntimeException {
 			//allocate vector intermediates
-			LibSpoofPrimitives.setupThreadLocalMemory(_reqVectMem, _clen);
+			LibSpoofPrimitives.setupThreadLocalMemory(_reqVectMem, _clen, _clen2);
 			
 			if( _a instanceof CompressedMatrixBlock )
 				executeCompressed((CompressedMatrixBlock)_a, _b, _scalars, _c.getDenseBlock(), _clen, _rl, _ru);

http://git-wip-us.apache.org/repos/asf/systemml/blob/6b25b3bf/src/main/java/org/apache/sysml/runtime/instructions/spark/SpoofSPInstruction.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/instructions/spark/SpoofSPInstruction.java b/src/main/java/org/apache/sysml/runtime/instructions/spark/SpoofSPInstruction.java
index 622944d..663e269 100644
--- a/src/main/java/org/apache/sysml/runtime/instructions/spark/SpoofSPInstruction.java
+++ b/src/main/java/org/apache/sysml/runtime/instructions/spark/SpoofSPInstruction.java
@@ -313,7 +313,8 @@ public class SpoofSPInstruction extends SPInstruction
 			}
 			
 			//setup local memory for reuse
-			LibSpoofPrimitives.setupThreadLocalMemory(_op.getNumIntermediates(), _clen);
+			int clen2 = (int) (_op.getRowType().isRowTypeB1() ? _vectors.get(0).getNumCols() : -1);
+			LibSpoofPrimitives.setupThreadLocalMemory(_op.getNumIntermediates(), _clen, clen2);
 			
 			ArrayList<Tuple2<MatrixIndexes,MatrixBlock>> ret = new ArrayList<Tuple2<MatrixIndexes,MatrixBlock>>();
 			boolean aggIncr = (_op.getRowType().isColumnAgg() //aggregate entire partition

http://git-wip-us.apache.org/repos/asf/systemml/blob/6b25b3bf/src/main/java/org/apache/sysml/runtime/instructions/spark/data/PartitionedBroadcast.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/instructions/spark/data/PartitionedBroadcast.java b/src/main/java/org/apache/sysml/runtime/instructions/spark/data/PartitionedBroadcast.java
index 1a7aeb3..c58eb91 100644
--- a/src/main/java/org/apache/sysml/runtime/instructions/spark/data/PartitionedBroadcast.java
+++ b/src/main/java/org/apache/sysml/runtime/instructions/spark/data/PartitionedBroadcast.java
@@ -54,6 +54,14 @@ public class PartitionedBroadcast<T extends CacheBlock> implements Serializable
 	public Broadcast<PartitionedBlock<T>>[] getBroadcasts() {
 		return _pbc;
 	}
+	
+	public long getNumRows() {
+		return _pbc[0].value().getNumRows();
+	}
+	
+	public long getNumCols() {
+		return _pbc[0].value().getNumCols();
+	}
 
 	public int getNumRowBlocks() {
 		return _pbc[0].value().getNumRowBlocks();

http://git-wip-us.apache.org/repos/asf/systemml/blob/6b25b3bf/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixMult.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixMult.java b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixMult.java
index 0ed0090..8159dc9 100644
--- a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixMult.java
+++ b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixMult.java
@@ -42,27 +42,14 @@ import org.apache.sysml.runtime.matrix.operators.ReorgOperator;
 import org.apache.sysml.runtime.util.UtilFunctions;
 
 /**
- * MB:
- * Library for matrix multiplications including MM, MV, VV for all
+ * MB: Library for matrix multiplications including MM, MV, VV for all
  * combinations of dense, sparse, ultrasparse representations and special
  * operations such as transpose-self matrix multiplication.
- * 
+ * <p>
  * In general all implementations use internally dense outputs
  * for direct access, but change the final result to sparse if necessary.
  * The only exceptions are ultra-sparse matrix mult, wsloss and wsigmoid.  
- * 
- * NOTES on BLAS:
- * * Experiments in 04/2013 showed that even on dense-dense this implementation 
- *   is 3x faster than f2j-BLAS-DGEMM, 2x faster than f2c-BLAS-DGEMM, and
- *   level (+10% after JIT) with a native C implementation. 
- * * Calling native BLAS would loose platform independence and would require 
- *   JNI calls incl data transfer. Furthermore, BLAS does not support sparse 
- *   matrices (except Sparse BLAS, with dedicated function calls and matrix formats) 
- *   and would be an external dependency. 
- * * Experiments in 02/2014 showed that on dense-dense this implementation now achieves
- *   almost 30% peak FP performance. Compared to Intel MKL 11.1 (dgemm, N=1000) it is
- *   just 3.2x (sparsity=1.0) and 1.9x (sparsity=0.5) slower, respectively.  
- *  
+ * <p> 
  */
 public class LibMatrixMult 
 {
@@ -3065,7 +3052,7 @@ public class LibMatrixMult
 			c[ ci+7 ] += aval1 * b[ bi1+7 ] + aval2 * b[ bi2+7 ] + aval3 * b[ bi3+7 ] + aval4 * b[ bi4+7 ];	
 		}
 	}
-
+	
 	@SuppressWarnings("unused")
 	private static void vectMultiplyAdd( final double aval, double[] b, double[] c, int[] bix, final int ci, final int len )
 	{
@@ -3492,12 +3479,16 @@ public class LibMatrixMult
 		return ret;
 	}
 
-	private static boolean checkPrepMatrixMultRightInput( MatrixBlock m1, MatrixBlock m2 )
-	{
+	private static boolean checkPrepMatrixMultRightInput( MatrixBlock m1, MatrixBlock m2 ) {
 		//transpose if dense-dense, skinny rhs matrix (not vector), and memory guarded by output 
 		return (LOW_LEVEL_OPTIMIZATION && !m1.sparse && !m2.sparse 
-				&& m1.rlen > m2.clen && m2.rlen > 64 && m2.clen > 1 && m2.clen < 64
-				&& 8*m2.rlen*m2.clen < 256*1024 ); //rhs fits in L2 cache
+			&& isSkinnyRightHandSide(m1.rlen, m1.clen, m2.rlen, m2.clen));
+	}
+	
+	//note: public for use by codegen for consistency
+	public static boolean isSkinnyRightHandSide(long m1rlen, long m1clen, long m2rlen, long m2clen) {
+		return m1rlen > m2clen && m2rlen > m2clen && m2clen > 1 
+			&& m2clen < 64 && 8*m2rlen*m2clen < L2_CACHESIZE;
 	}
 
 	private static boolean checkParMatrixMultRightInputRows( MatrixBlock m1, MatrixBlock m2, int k ) {

http://git-wip-us.apache.org/repos/asf/systemml/blob/6b25b3bf/src/test/java/org/apache/sysml/test/integration/functions/codegen/RowAggTmplTest.java
----------------------------------------------------------------------
diff --git a/src/test/java/org/apache/sysml/test/integration/functions/codegen/RowAggTmplTest.java b/src/test/java/org/apache/sysml/test/integration/functions/codegen/RowAggTmplTest.java
index 182adf4..e32056a 100644
--- a/src/test/java/org/apache/sysml/test/integration/functions/codegen/RowAggTmplTest.java
+++ b/src/test/java/org/apache/sysml/test/integration/functions/codegen/RowAggTmplTest.java
@@ -59,6 +59,10 @@ public class RowAggTmplTest extends AutomatedTestBase
 	private static final String TEST_NAME21 = TEST_NAME+"21"; //sum(X/rowSums(X))
 	private static final String TEST_NAME22 = TEST_NAME+"22"; //((7+X)+(X-7)+exp(X))/(rowMins(X)+0.5) 
 	private static final String TEST_NAME23 = TEST_NAME+"23"; //L2SVM outer loop 
+	private static final String TEST_NAME24 = TEST_NAME+"24"; //t(X)%*%(w*(X%*%v)), w/ mm 
+	private static final String TEST_NAME25 = TEST_NAME+"25"; //-2*(X%*%t(C))+t(rowSums(C^2)), w/ mm
+	private static final String TEST_NAME26 = TEST_NAME+"26"; //t(P)%*%X, w/ mm
+	private static final String TEST_NAME27 = TEST_NAME+"27"; //t(X)%*%(X%*%v), w/ mm 
 	
 	private static final String TEST_DIR = "functions/codegen/";
 	private static final String TEST_CLASS_DIR = TEST_DIR + RowAggTmplTest.class.getSimpleName() + "/";
@@ -70,7 +74,7 @@ public class RowAggTmplTest extends AutomatedTestBase
 	@Override
 	public void setUp() {
 		TestUtils.clearAssertionInformation();
-		for(int i=1; i<=23; i++)
+		for(int i=1; i<=27; i++)
 			addTestConfiguration( TEST_NAME+i, new TestConfiguration(TEST_CLASS_DIR, TEST_NAME+i, new String[] { String.valueOf(i) }) );
 	}
 	
@@ -419,6 +423,66 @@ public class RowAggTmplTest extends AutomatedTestBase
 		testCodegenIntegration( TEST_NAME23, false, ExecType.SPARK );
 	}
 	
+	@Test	
+	public void testCodegenRowAggRewrite24CP() {
+		testCodegenIntegration( TEST_NAME24, true, ExecType.CP );
+	}
+	
+	@Test
+	public void testCodegenRowAgg24CP() {
+		testCodegenIntegration( TEST_NAME24, false, ExecType.CP );
+	}
+	
+	@Test
+	public void testCodegenRowAgg24SP() {
+		testCodegenIntegration( TEST_NAME24, false, ExecType.SPARK );
+	}
+	
+	@Test	
+	public void testCodegenRowAggRewrite25CP() {
+		testCodegenIntegration( TEST_NAME25, true, ExecType.CP );
+	}
+	
+	@Test
+	public void testCodegenRowAgg25CP() {
+		testCodegenIntegration( TEST_NAME25, false, ExecType.CP );
+	}
+	
+	@Test
+	public void testCodegenRowAgg25SP() {
+		testCodegenIntegration( TEST_NAME25, false, ExecType.SPARK );
+	}
+	
+	@Test	
+	public void testCodegenRowAggRewrite26CP() {
+		testCodegenIntegration( TEST_NAME26, true, ExecType.CP );
+	}
+	
+	@Test
+	public void testCodegenRowAgg26CP() {
+		testCodegenIntegration( TEST_NAME26, false, ExecType.CP );
+	}
+	
+	@Test
+	public void testCodegenRowAgg26SP() {
+		testCodegenIntegration( TEST_NAME26, false, ExecType.SPARK );
+	}
+	
+	@Test	
+	public void testCodegenRowAggRewrite27CP() {
+		testCodegenIntegration( TEST_NAME27, true, ExecType.CP );
+	}
+	
+	@Test
+	public void testCodegenRowAgg27CP() {
+		testCodegenIntegration( TEST_NAME27, false, ExecType.CP );
+	}
+	
+	@Test
+	public void testCodegenRowAgg27SP() {
+		testCodegenIntegration( TEST_NAME27, false, ExecType.SPARK );
+	}
+	
 	private void testCodegenIntegration( String testname, boolean rewrites, ExecType instType )
 	{	
 		boolean oldFlag = OptimizerUtils.ALLOW_ALGEBRAIC_SIMPLIFICATION;

http://git-wip-us.apache.org/repos/asf/systemml/blob/6b25b3bf/src/test/scripts/functions/codegen/rowAggPattern24.R
----------------------------------------------------------------------
diff --git a/src/test/scripts/functions/codegen/rowAggPattern24.R b/src/test/scripts/functions/codegen/rowAggPattern24.R
new file mode 100644
index 0000000..5510437
--- /dev/null
+++ b/src/test/scripts/functions/codegen/rowAggPattern24.R
@@ -0,0 +1,33 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+# 
+#   http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+args<-commandArgs(TRUE)
+options(digits=22)
+library("Matrix")
+library("matrixStats")
+
+X = matrix(seq(1,6000)/6000, 600, 10, byrow=TRUE);
+w = matrix(seq(1,2400)/2400, 600, 4, byrow=TRUE);
+v = matrix(seq(1,40)/40, 10, 4, byrow=TRUE);
+
+R = t(X) %*% (w * (X %*% v));
+
+writeMM(as(R, "CsparseMatrix"), paste(args[2], "S", sep="")); 

http://git-wip-us.apache.org/repos/asf/systemml/blob/6b25b3bf/src/test/scripts/functions/codegen/rowAggPattern24.dml
----------------------------------------------------------------------
diff --git a/src/test/scripts/functions/codegen/rowAggPattern24.dml b/src/test/scripts/functions/codegen/rowAggPattern24.dml
new file mode 100644
index 0000000..200d552
--- /dev/null
+++ b/src/test/scripts/functions/codegen/rowAggPattern24.dml
@@ -0,0 +1,30 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+# 
+#   http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+
+X = matrix(seq(1,6000)/6000, 600, 10);
+w = matrix(seq(1,2400)/2400, 600, 4);
+v = matrix(seq(1,40)/40, 10, 4);
+if(1==1){}
+
+R = t(X) %*% (w * (X %*% v));
+
+write(R, $1)

http://git-wip-us.apache.org/repos/asf/systemml/blob/6b25b3bf/src/test/scripts/functions/codegen/rowAggPattern25.R
----------------------------------------------------------------------
diff --git a/src/test/scripts/functions/codegen/rowAggPattern25.R b/src/test/scripts/functions/codegen/rowAggPattern25.R
new file mode 100644
index 0000000..0e881bc
--- /dev/null
+++ b/src/test/scripts/functions/codegen/rowAggPattern25.R
@@ -0,0 +1,32 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+# 
+#   http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+args<-commandArgs(TRUE)
+options(digits=22)
+library("Matrix")
+library("matrixStats")
+
+X = matrix(seq(1,6000), 600, 10, byrow=TRUE);
+C = matrix(seq(1,40), 4, 10, byrow=TRUE);
+
+R = -2 * (X %*% t(C)) + matrix(1,nrow(X),1) %*% t(rowSums(C^2))
+
+writeMM(as(R, "CsparseMatrix"), paste(args[2], "S", sep="")); 

http://git-wip-us.apache.org/repos/asf/systemml/blob/6b25b3bf/src/test/scripts/functions/codegen/rowAggPattern25.dml
----------------------------------------------------------------------
diff --git a/src/test/scripts/functions/codegen/rowAggPattern25.dml b/src/test/scripts/functions/codegen/rowAggPattern25.dml
new file mode 100644
index 0000000..fa8775e
--- /dev/null
+++ b/src/test/scripts/functions/codegen/rowAggPattern25.dml
@@ -0,0 +1,29 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+# 
+#   http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+
+X = matrix(seq(1,6000), 600, 10);
+C = matrix(seq(1,40), 4, 10);
+if(1==1){}
+
+R = -2 * (X %*% t(C)) + t(rowSums(C^2))
+
+write(R, $1)

http://git-wip-us.apache.org/repos/asf/systemml/blob/6b25b3bf/src/test/scripts/functions/codegen/rowAggPattern26.R
----------------------------------------------------------------------
diff --git a/src/test/scripts/functions/codegen/rowAggPattern26.R b/src/test/scripts/functions/codegen/rowAggPattern26.R
new file mode 100644
index 0000000..736c376
--- /dev/null
+++ b/src/test/scripts/functions/codegen/rowAggPattern26.R
@@ -0,0 +1,32 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+# 
+#   http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+args<-commandArgs(TRUE)
+options(digits=22)
+library("Matrix")
+library("matrixStats")
+
+X = matrix(seq(1,6000), 600, 10, byrow=TRUE);
+P = matrix(seq(1,3000), 600, 5, byrow=TRUE);
+
+R = t(P) %*% X;
+
+writeMM(as(R, "CsparseMatrix"), paste(args[2], "S", sep="")); 

http://git-wip-us.apache.org/repos/asf/systemml/blob/6b25b3bf/src/test/scripts/functions/codegen/rowAggPattern26.dml
----------------------------------------------------------------------
diff --git a/src/test/scripts/functions/codegen/rowAggPattern26.dml b/src/test/scripts/functions/codegen/rowAggPattern26.dml
new file mode 100644
index 0000000..f84b556
--- /dev/null
+++ b/src/test/scripts/functions/codegen/rowAggPattern26.dml
@@ -0,0 +1,28 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+# 
+#   http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+X = matrix(seq(1,6000), 600, 10);
+P = matrix(seq(1,3000), 600, 5)
+if(1==1){}
+
+R = t(P) %*% X;
+
+write(R, $1)

http://git-wip-us.apache.org/repos/asf/systemml/blob/6b25b3bf/src/test/scripts/functions/codegen/rowAggPattern27.R
----------------------------------------------------------------------
diff --git a/src/test/scripts/functions/codegen/rowAggPattern27.R b/src/test/scripts/functions/codegen/rowAggPattern27.R
new file mode 100644
index 0000000..4909732
--- /dev/null
+++ b/src/test/scripts/functions/codegen/rowAggPattern27.R
@@ -0,0 +1,32 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+# 
+#   http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+args<-commandArgs(TRUE)
+options(digits=22)
+library("Matrix")
+library("matrixStats")
+
+X = matrix(seq(1,6000)/6000, 600, 10, byrow=TRUE);
+v = matrix(seq(1,40)/40, 10, 4, byrow=TRUE);
+
+R = t(X) %*% (X %*% v);
+
+writeMM(as(R, "CsparseMatrix"), paste(args[2], "S", sep="")); 

http://git-wip-us.apache.org/repos/asf/systemml/blob/6b25b3bf/src/test/scripts/functions/codegen/rowAggPattern27.dml
----------------------------------------------------------------------
diff --git a/src/test/scripts/functions/codegen/rowAggPattern27.dml b/src/test/scripts/functions/codegen/rowAggPattern27.dml
new file mode 100644
index 0000000..c5254c2
--- /dev/null
+++ b/src/test/scripts/functions/codegen/rowAggPattern27.dml
@@ -0,0 +1,29 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+# 
+#   http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+
+X = matrix(seq(1,6000)/6000, 600, 10);
+v = matrix(seq(1,40)/40, 10, 4);
+if(1==1){}
+
+R = t(X) %*% (X %*% v);
+
+write(R, $1)