You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@systemml.apache.org by mb...@apache.org on 2017/06/30 06:13:27 UTC

[1/2] systemml git commit: [SYSTEMML-1294] Improved codegen compiler (dot products, single-ops)

Repository: systemml
Updated Branches:
  refs/heads/master 5e7e57774 -> 6b25b3bf2


[SYSTEMML-1294] Improved codegen compiler (dot products, single-ops)

This patch makes two improvements to the existing codegen compiler in
order to avoid unnecessary performance degradation in the default
optimization level 2, where rewrites and existing fused operators are
already applied to HOP DAGs before the codegen compiler is invoked.

(1) Handling of dot products: So far, we did not include the transpose
operations of dot products in partial fusion plans, which led to wrong
cost estimates and thus suboptimal plan choices.

(2) Handling of special single-operation fused operators: We now allow
fused operators that consist of a single ternary or expensive unary
operation (such as exp or log), because these are automatically
multi-threaded, whereas our default unary or binary operations are not.


Project: http://git-wip-us.apache.org/repos/asf/systemml/repo
Commit: http://git-wip-us.apache.org/repos/asf/systemml/commit/6a4aa1d6
Tree: http://git-wip-us.apache.org/repos/asf/systemml/tree/6a4aa1d6
Diff: http://git-wip-us.apache.org/repos/asf/systemml/diff/6a4aa1d6

Branch: refs/heads/master
Commit: 6a4aa1d6a07c09b00fe87a775d7bd40993a91214
Parents: 5e7e577
Author: Matthias Boehm <mb...@gmail.com>
Authored: Tue Jun 27 19:13:38 2017 -0700
Committer: Matthias Boehm <mb...@gmail.com>
Committed: Thu Jun 29 23:14:04 2017 -0700

----------------------------------------------------------------------
 .../sysml/hops/codegen/SpoofCompiler.java       |  2 +-
 .../hops/codegen/template/TemplateCell.java     | 11 ++++--
 .../hops/codegen/template/TemplateUtils.java    |  6 ++-
 .../functions/codegen/RowAggTmplTest.java       | 20 +++++++++-
 .../scripts/functions/codegen/rowAggPattern23.R | 40 ++++++++++++++++++++
 .../functions/codegen/rowAggPattern23.dml       | 37 ++++++++++++++++++
 6 files changed, 108 insertions(+), 8 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/systemml/blob/6a4aa1d6/src/main/java/org/apache/sysml/hops/codegen/SpoofCompiler.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/hops/codegen/SpoofCompiler.java b/src/main/java/org/apache/sysml/hops/codegen/SpoofCompiler.java
index fc3ecde..fede282 100644
--- a/src/main/java/org/apache/sysml/hops/codegen/SpoofCompiler.java
+++ b/src/main/java/org/apache/sysml/hops/codegen/SpoofCompiler.java
@@ -717,7 +717,7 @@ public class SpoofCompiler
 			
 			//remove cplan w/ single op and w/o agg
 			if( (tpl instanceof CNodeCell && ((((CNodeCell)tpl).getCellType()==CellType.NO_AGG
-				&& TemplateUtils.hasSingleOperation(tpl))|| TemplateUtils.hasNoOperation(tpl)))
+				&& TemplateUtils.hasSingleOperation(tpl)) || TemplateUtils.hasNoOperation(tpl)))
 				|| tpl instanceof CNodeRow && TemplateUtils.hasSingleOperation(tpl)) 
 				cplans2.remove(e.getKey());
 				

http://git-wip-us.apache.org/repos/asf/systemml/blob/6a4aa1d6/src/main/java/org/apache/sysml/hops/codegen/template/TemplateCell.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/hops/codegen/template/TemplateCell.java b/src/main/java/org/apache/sysml/hops/codegen/template/TemplateCell.java
index e94d9a5..c73216e 100644
--- a/src/main/java/org/apache/sysml/hops/codegen/template/TemplateCell.java
+++ b/src/main/java/org/apache/sysml/hops/codegen/template/TemplateCell.java
@@ -84,14 +84,19 @@ public class TemplateCell extends TemplateBase
 		return !isClosed() && (isValidOperation(hop) 
 			|| (HopRewriteUtils.isAggUnaryOp(hop, SUPPORTED_AGG) 
 				&& ((AggUnaryOp) hop).getDirection()!= Direction.Col)
-			|| (HopRewriteUtils.isMatrixMultiply(hop) && hop.getDim1()==1 && hop.getDim2()==1)
-				&& HopRewriteUtils.isTransposeOperation(hop.getInput().get(0)));
+			|| (HopRewriteUtils.isMatrixMultiply(hop)
+				&& hop.getDim1()==1 && hop.getDim2()==1)
+				&& HopRewriteUtils.isTransposeOperation(hop.getInput().get(0))
+			|| (HopRewriteUtils.isTransposeOperation(hop) 
+				&& hop.getDim1()==1 && hop.getDim2()>1));
 	}
 
 	@Override
 	public boolean merge(Hop hop, Hop input) {
 		//merge of other cell tpl possible
-		return (!isClosed() && isValidOperation(hop));
+		return (!isClosed() && (isValidOperation(hop) 
+			|| (hop instanceof AggBinaryOp && hop.getInput().indexOf(input)==0 
+				&& HopRewriteUtils.isTransposeOperation(input))));
 	}
 
 	@Override

http://git-wip-us.apache.org/repos/asf/systemml/blob/6a4aa1d6/src/main/java/org/apache/sysml/hops/codegen/template/TemplateUtils.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/hops/codegen/template/TemplateUtils.java b/src/main/java/org/apache/sysml/hops/codegen/template/TemplateUtils.java
index fca203d..da803cd 100644
--- a/src/main/java/org/apache/sysml/hops/codegen/template/TemplateUtils.java
+++ b/src/main/java/org/apache/sysml/hops/codegen/template/TemplateUtils.java
@@ -331,8 +331,10 @@ public class TemplateUtils
 
 	public static boolean hasSingleOperation(CNodeTpl tpl) {
 		CNode output = tpl.getOutput();
-		return (output instanceof CNodeUnary || output instanceof CNodeBinary
-				|| output instanceof CNodeTernary) && hasOnlyDataNodeOrLookupInputs(output);
+		return ((output instanceof CNodeUnary 
+				&& !TemplateUtils.isUnary(output, UnaryType.EXP, UnaryType.LOG)) 
+			|| output instanceof CNodeBinary) 
+			&& hasOnlyDataNodeOrLookupInputs(output);
 	}
 	
 	public static boolean hasNoOperation(CNodeTpl tpl) {

http://git-wip-us.apache.org/repos/asf/systemml/blob/6a4aa1d6/src/test/java/org/apache/sysml/test/integration/functions/codegen/RowAggTmplTest.java
----------------------------------------------------------------------
diff --git a/src/test/java/org/apache/sysml/test/integration/functions/codegen/RowAggTmplTest.java b/src/test/java/org/apache/sysml/test/integration/functions/codegen/RowAggTmplTest.java
index 614d6e0..182adf4 100644
--- a/src/test/java/org/apache/sysml/test/integration/functions/codegen/RowAggTmplTest.java
+++ b/src/test/java/org/apache/sysml/test/integration/functions/codegen/RowAggTmplTest.java
@@ -58,6 +58,7 @@ public class RowAggTmplTest extends AutomatedTestBase
 	private static final String TEST_NAME20 = TEST_NAME+"20"; //1 / (1 - (A / rowSums(A)))
 	private static final String TEST_NAME21 = TEST_NAME+"21"; //sum(X/rowSums(X))
 	private static final String TEST_NAME22 = TEST_NAME+"22"; //((7+X)+(X-7)+exp(X))/(rowMins(X)+0.5) 
+	private static final String TEST_NAME23 = TEST_NAME+"23"; //L2SVM outer loop 
 	
 	private static final String TEST_DIR = "functions/codegen/";
 	private static final String TEST_CLASS_DIR = TEST_DIR + RowAggTmplTest.class.getSimpleName() + "/";
@@ -69,7 +70,7 @@ public class RowAggTmplTest extends AutomatedTestBase
 	@Override
 	public void setUp() {
 		TestUtils.clearAssertionInformation();
-		for(int i=1; i<=22; i++)
+		for(int i=1; i<=23; i++)
 			addTestConfiguration( TEST_NAME+i, new TestConfiguration(TEST_CLASS_DIR, TEST_NAME+i, new String[] { String.valueOf(i) }) );
 	}
 	
@@ -403,6 +404,21 @@ public class RowAggTmplTest extends AutomatedTestBase
 		testCodegenIntegration( TEST_NAME22, false, ExecType.SPARK );
 	}
 	
+	@Test	
+	public void testCodegenRowAggRewrite23CP() {
+		testCodegenIntegration( TEST_NAME23, true, ExecType.CP );
+	}
+	
+	@Test
+	public void testCodegenRowAgg23CP() {
+		testCodegenIntegration( TEST_NAME23, false, ExecType.CP );
+	}
+	
+	@Test
+	public void testCodegenRowAgg23SP() {
+		testCodegenIntegration( TEST_NAME23, false, ExecType.SPARK );
+	}
+	
 	private void testCodegenIntegration( String testname, boolean rewrites, ExecType instType )
 	{	
 		boolean oldFlag = OptimizerUtils.ALLOW_ALGEBRAIC_SIMPLIFICATION;
@@ -430,7 +446,7 @@ public class RowAggTmplTest extends AutomatedTestBase
 			rCmd = getRCmd(inputDir(), expectedDir());			
 
 			OptimizerUtils.ALLOW_ALGEBRAIC_SIMPLIFICATION = rewrites;
-
+			
 			runTest(true, false, null, -1); 
 			runRScript(true); 
 			

http://git-wip-us.apache.org/repos/asf/systemml/blob/6a4aa1d6/src/test/scripts/functions/codegen/rowAggPattern23.R
----------------------------------------------------------------------
diff --git a/src/test/scripts/functions/codegen/rowAggPattern23.R b/src/test/scripts/functions/codegen/rowAggPattern23.R
new file mode 100644
index 0000000..c0319a6
--- /dev/null
+++ b/src/test/scripts/functions/codegen/rowAggPattern23.R
@@ -0,0 +1,40 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+# 
+#   http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+args<-commandArgs(TRUE)
+options(digits=22)
+library("Matrix")
+library("matrixStats")
+
+X = matrix(seq(1,3000), 60, 50);
+Y = seq(1,60);
+Xw = seq(2,61);
+lambda = 7;
+
+out = 1 - Y * Xw
+sv = (out > 0)
+out = sv * out
+obj = 0.5 * sum(out * out)
+g_new = t(X) %*% (out * Y)
+
+R = as.matrix(obj + sum(g_new));
+
+writeMM(as(R, "CsparseMatrix"), paste(args[2], "S", sep="")); 

http://git-wip-us.apache.org/repos/asf/systemml/blob/6a4aa1d6/src/test/scripts/functions/codegen/rowAggPattern23.dml
----------------------------------------------------------------------
diff --git a/src/test/scripts/functions/codegen/rowAggPattern23.dml b/src/test/scripts/functions/codegen/rowAggPattern23.dml
new file mode 100644
index 0000000..4aafd0f
--- /dev/null
+++ b/src/test/scripts/functions/codegen/rowAggPattern23.dml
@@ -0,0 +1,37 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+# 
+#   http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+
+X = matrix(seq(1,3000), 60, 50);
+Y = seq(1,60);
+Xw = seq(2,61);
+if(1==1){}
+
+out = 1 - Y * Xw
+sv = (out > 0)
+out = sv * out
+obj = 0.5 * sum(out * out)
+g_new = t(X) %*% (out * Y)
+
+if(1==1){}
+R = as.matrix(obj + sum(g_new));
+
+write(R, $1)


[2/2] systemml git commit: [SYSTEMML-1535] Codegen matrix-matrix multiplication support

Posted by mb...@apache.org.
[SYSTEMML-1535] Codegen matrix-matrix multiplication support

This patch generalizes the row-wise code generation template from
matrix-vector to matrix-matrix multiplications, which enables a broad
range of additional fusion opportunities. Examples are Mlogreg and
KMeans with multiple classes or centroids, respectively. The fusion of
matrix-matrix multiplications avoids unnecessary scans of X as well as
large intermediates of size nrow(X) x K. 

In a scenario of KMeans w/ 1 run, 20 iterations, a 100M x 10 dense input,
and 5 centroids, this change reduced the end-to-end runtime from
852s (1360s w/o codegen) to 463s. The major additional benefits come
from fusing (1) -2 * (X %*% t(C)) + t(rowSums(C ^ 2)), and
(2) (t(P) %*% X), which avoids two large intermediates for X %*% t(C)
and t(P).

Furthermore, this patch also lays the foundations for more complex DAGs
with different vector sizes in row-wise templates.


Project: http://git-wip-us.apache.org/repos/asf/systemml/repo
Commit: http://git-wip-us.apache.org/repos/asf/systemml/commit/6b25b3bf
Tree: http://git-wip-us.apache.org/repos/asf/systemml/tree/6b25b3bf
Diff: http://git-wip-us.apache.org/repos/asf/systemml/diff/6b25b3bf

Branch: refs/heads/master
Commit: 6b25b3bf2621f13d97c6a3bf3a66a333af834db7
Parents: 6a4aa1d
Author: Matthias Boehm <mb...@gmail.com>
Authored: Thu Jun 29 22:38:03 2017 -0700
Committer: Matthias Boehm <mb...@gmail.com>
Committed: Thu Jun 29 23:14:05 2017 -0700

----------------------------------------------------------------------
 .../sysml/hops/codegen/SpoofCompiler.java       |   2 +-
 .../apache/sysml/hops/codegen/SpoofFusedOp.java |  15 ++-
 .../apache/sysml/hops/codegen/cplan/CNode.java  |  10 ++
 .../sysml/hops/codegen/cplan/CNodeBinary.java   |  79 +++++++++---
 .../hops/codegen/cplan/CNodeOuterProduct.java   |   6 +-
 .../sysml/hops/codegen/cplan/CNodeRow.java      |  58 +++++----
 .../sysml/hops/codegen/cplan/CNodeUnary.java    |   4 +
 .../hops/codegen/template/TemplateRow.java      |  84 +++++++++---
 .../hops/codegen/template/TemplateUtils.java    | 127 +++++++------------
 .../runtime/codegen/LibSpoofPrimitives.java     |  83 ++++++++++--
 .../sysml/runtime/codegen/SpoofCellwise.java    |   4 +-
 .../runtime/codegen/SpoofMultiAggregate.java    |   2 +-
 .../sysml/runtime/codegen/SpoofOperator.java    | 105 +++++++--------
 .../runtime/codegen/SpoofOuterProduct.java      |  16 +--
 .../sysml/runtime/codegen/SpoofRowwise.java     |  77 ++++++-----
 .../instructions/spark/SpoofSPInstruction.java  |   3 +-
 .../spark/data/PartitionedBroadcast.java        |   8 ++
 .../runtime/matrix/data/LibMatrixMult.java      |  33 ++---
 .../functions/codegen/RowAggTmplTest.java       |  66 +++++++++-
 .../scripts/functions/codegen/rowAggPattern24.R |  33 +++++
 .../functions/codegen/rowAggPattern24.dml       |  30 +++++
 .../scripts/functions/codegen/rowAggPattern25.R |  32 +++++
 .../functions/codegen/rowAggPattern25.dml       |  29 +++++
 .../scripts/functions/codegen/rowAggPattern26.R |  32 +++++
 .../functions/codegen/rowAggPattern26.dml       |  28 ++++
 .../scripts/functions/codegen/rowAggPattern27.R |  32 +++++
 .../functions/codegen/rowAggPattern27.dml       |  29 +++++
 27 files changed, 747 insertions(+), 280 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/systemml/blob/6b25b3bf/src/main/java/org/apache/sysml/hops/codegen/SpoofCompiler.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/hops/codegen/SpoofCompiler.java b/src/main/java/org/apache/sysml/hops/codegen/SpoofCompiler.java
index fede282..5342c09 100644
--- a/src/main/java/org/apache/sysml/hops/codegen/SpoofCompiler.java
+++ b/src/main/java/org/apache/sysml/hops/codegen/SpoofCompiler.java
@@ -96,7 +96,7 @@ import org.apache.sysml.runtime.matrix.data.Pair;
 import org.apache.sysml.utils.Explain;
 import org.apache.sysml.utils.Statistics;
 
-public class SpoofCompiler 
+public class SpoofCompiler
 {
 	private static final Log LOG = LogFactory.getLog(SpoofCompiler.class.getName());
 	

http://git-wip-us.apache.org/repos/asf/systemml/blob/6b25b3bf/src/main/java/org/apache/sysml/hops/codegen/SpoofFusedOp.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/hops/codegen/SpoofFusedOp.java b/src/main/java/org/apache/sysml/hops/codegen/SpoofFusedOp.java
index 06be99b..0d4b8db 100644
--- a/src/main/java/org/apache/sysml/hops/codegen/SpoofFusedOp.java
+++ b/src/main/java/org/apache/sysml/hops/codegen/SpoofFusedOp.java
@@ -44,8 +44,9 @@ public class SpoofFusedOp extends Hop implements MultiThreadedHop
 		COLUMN_DIMS_COLS,
 		SCALAR,
 		MULTI_SCALAR,
-		ROW_RANK_DIMS, // right wdivmm 
-		COLUMN_RANK_DIMS  // left wdivmm
+		ROW_RANK_DIMS, // right wdivmm, row mm
+		COLUMN_RANK_DIMS,  // left wdivmm, row mm
+		COLUMN_RANK_DIMS_T;
 	}
 	
 	private Class<?> _class = null;
@@ -182,6 +183,12 @@ public class SpoofFusedOp extends Hop implements MultiThreadedHop
 						ret = new long[]{mc.getCols(), mc2.getCols(), -1};
 					break;
 				}
+				case COLUMN_RANK_DIMS_T: {
+					MatrixCharacteristics mc2 = memo.getAllInputStats(getInput().get(1));
+					if( mc2.dimsKnown() )
+						ret = new long[]{mc2.getCols(), mc.getCols(), -1};
+					break;
+				}
 				default:
 					throw new RuntimeException("Failed to infer worst-case size information "
 							+ "for type: "+_dimsType.toString());
@@ -231,6 +238,10 @@ public class SpoofFusedOp extends Hop implements MultiThreadedHop
 				setDim1(getInput().get(0).getDim2());
 				setDim2(getInput().get(1).getDim2());
 				break;
+			case COLUMN_RANK_DIMS_T:
+				setDim1(getInput().get(1).getDim2());
+				setDim2(getInput().get(0).getDim2());
+				break;	
 			default:
 				throw new RuntimeException("Failed to refresh size information "
 						+ "for type: "+_dimsType.toString());

http://git-wip-us.apache.org/repos/asf/systemml/blob/6b25b3bf/src/main/java/org/apache/sysml/hops/codegen/cplan/CNode.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/hops/codegen/cplan/CNode.java b/src/main/java/org/apache/sysml/hops/codegen/cplan/CNode.java
index efe468e..1f91697 100644
--- a/src/main/java/org/apache/sysml/hops/codegen/cplan/CNode.java
+++ b/src/main/java/org/apache/sysml/hops/codegen/cplan/CNode.java
@@ -83,6 +83,16 @@ public abstract class CNode
 		return _genVar;
 	}
 	
+	public String getVectorLength() {
+		if( getVarname().startsWith("a") )
+			return "len";
+		else if( getVarname().startsWith("b") )
+			return getVarname()+".clen";
+		else if( _dataType==DataType.MATRIX )
+			return getVarname()+".length";
+		return "";
+	}
+	
 	public String getClassname() {
 		return getVarname();
 	}

http://git-wip-us.apache.org/repos/asf/systemml/blob/6b25b3bf/src/main/java/org/apache/sysml/hops/codegen/cplan/CNodeBinary.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/hops/codegen/cplan/CNodeBinary.java b/src/main/java/org/apache/sysml/hops/codegen/cplan/CNodeBinary.java
index 8d67f26..4bbf205 100644
--- a/src/main/java/org/apache/sysml/hops/codegen/cplan/CNodeBinary.java
+++ b/src/main/java/org/apache/sysml/hops/codegen/cplan/CNodeBinary.java
@@ -28,7 +28,8 @@ import org.apache.sysml.runtime.util.UtilFunctions;
 public class CNodeBinary extends CNode
 {
 	public enum BinType {
-		DOT_PRODUCT,
+		//matrix multiplication operations
+		DOT_PRODUCT, VECT_MATRIXMULT, VECT_OUTERMULT_ADD,
 		//vector-scalar-add operations
 		VECT_MULT_ADD, VECT_DIV_ADD, VECT_MINUS_ADD, VECT_PLUS_ADD,
 		VECT_POW_ADD, VECT_MIN_ADD, VECT_MAX_ADD,
@@ -71,6 +72,12 @@ public class CNodeBinary extends CNode
 				case DOT_PRODUCT:   
 					return sparse ? "    double %TMP% = LibSpoofPrimitives.dotProduct(%IN1v%, %IN2%, %IN1i%, %POS1%, %POS2%, alen);\n" :
 									"    double %TMP% = LibSpoofPrimitives.dotProduct(%IN1%, %IN2%, %POS1%, %POS2%, %LEN%);\n";
+				case VECT_MATRIXMULT:   
+					return sparse ? "    double[] %TMP% = LibSpoofPrimitives.vectMatrixMult(%IN1v%, %IN2%, %IN1i%, %POS1%, %POS2%, alen, len);\n" :
+									"    double[] %TMP% = LibSpoofPrimitives.vectMatrixMult(%IN1%, %IN2%, %POS1%, %POS2%, %LEN%);\n";
+				case VECT_OUTERMULT_ADD:   
+					return sparse ? "    LibSpoofPrimitives.vectOuterMultAdd(%IN1v%, %IN2%, %OUT%, %IN1i%, %POS1%, %POS2%, %POSOUT%, alen, %LEN1%, %LEN2%);\n" :
+									"    LibSpoofPrimitives.vectOuterMultAdd(%IN1%, %IN2%, %OUT%, %POS1%, %POS2%, %POSOUT%, %LEN1%, %LEN2%);\n";
 				
 				//vector-scalar-add operations
 				case VECT_MULT_ADD:
@@ -88,10 +95,10 @@ public class CNodeBinary extends CNode
 				case VECT_GREATEREQUAL_ADD: {
 					String vectName = getVectorPrimitiveName();
 					if( scalarVector )
-						return sparse ? "    LibSpoofPrimitives.vect"+vectName+"Add(%IN1%, %IN2v%, %OUT%, %IN2i%, %POS2%, %POSOUT%, alen, len);\n" : 
+						return sparse ? "    LibSpoofPrimitives.vect"+vectName+"Add(%IN1%, %IN2v%, %OUT%, %IN2i%, %POS2%, %POSOUT%, alen, %LEN%);\n" : 
 										"    LibSpoofPrimitives.vect"+vectName+"Add(%IN1%, %IN2%, %OUT%, %POS2%, %POSOUT%, %LEN%);\n";
 					else	
-						return sparse ? "    LibSpoofPrimitives.vect"+vectName+"Add(%IN1v%, %IN2%, %OUT%, %IN1i%, %POS1%, %POSOUT%, alen, len);\n" : 
+						return sparse ? "    LibSpoofPrimitives.vect"+vectName+"Add(%IN1v%, %IN2%, %OUT%, %IN1i%, %POS1%, %POSOUT%, alen, %LEN%);\n" : 
 										"    LibSpoofPrimitives.vect"+vectName+"Add(%IN1%, %IN2%, %OUT%, %POS1%, %POSOUT%, %LEN%);\n";
 				}
 				
@@ -111,10 +118,10 @@ public class CNodeBinary extends CNode
 				case VECT_GREATEREQUAL_SCALAR: {
 					String vectName = getVectorPrimitiveName();
 					if( scalarVector )
-						return sparse ? "    double[] %TMP% = LibSpoofPrimitives.vect"+vectName+"Write(%IN1%, %IN2v%, %IN2i%, %POS2%, alen, len);\n" : 
+						return sparse ? "    double[] %TMP% = LibSpoofPrimitives.vect"+vectName+"Write(%IN1%, %IN2v%, %IN2i%, %POS2%, alen, %LEN%);\n" : 
 										"    double[] %TMP% = LibSpoofPrimitives.vect"+vectName+"Write(%IN1%, %IN2%, %POS2%, %LEN%);\n";
 					else	
-						return sparse ? "    double[] %TMP% = LibSpoofPrimitives.vect"+vectName+"Write(%IN1v%, %IN2%, %IN1i%, %POS1%, alen, len);\n" : 
+						return sparse ? "    double[] %TMP% = LibSpoofPrimitives.vect"+vectName+"Write(%IN1v%, %IN2%, %IN1i%, %POS1%, alen, %LEN%);\n" : 
 										"    double[] %TMP% = LibSpoofPrimitives.vect"+vectName+"Write(%IN1%, %IN2%, %POS1%, %LEN%);\n";
 				}
 				
@@ -133,7 +140,7 @@ public class CNodeBinary extends CNode
 				case VECT_GREATEREQUAL: {
 					String vectName = getVectorPrimitiveName();
 					return sparse ? 
-						"    double[] %TMP% = LibSpoofPrimitives.vect"+vectName+"Write(%IN1v%, %IN2%, %IN1i%, %POS1%, %POS2%, alen, len);\n" : 
+						"    double[] %TMP% = LibSpoofPrimitives.vect"+vectName+"Write(%IN1v%, %IN2%, %IN1i%, %POS1%, %POS2%, alen, %LEN%);\n" : 
 						"    double[] %TMP% = LibSpoofPrimitives.vect"+vectName+"Write(%IN1%, %IN2%, %POS1%, %POS2%, %LEN%);\n";
 				}
 				
@@ -185,7 +192,8 @@ public class CNodeBinary extends CNode
 		}
 		public boolean isVectorPrimitive() {
 			return isVectorScalarPrimitive() 
-				|| isVectorVectorPrimitive();
+				|| isVectorVectorPrimitive()
+				|| isVectorMatrixPrimitive();
 		}
 		public boolean isVectorScalarPrimitive() {
 			return this == VECT_DIV_SCALAR || this == VECT_MULT_SCALAR 
@@ -204,6 +212,10 @@ public class CNodeBinary extends CNode
 				|| this == VECT_LESS || this == VECT_LESSEQUAL
 				|| this == VECT_GREATER || this == VECT_GREATEREQUAL;
 		}
+		public boolean isVectorMatrixPrimitive() {
+			return this == VECT_MATRIXMULT
+				|| this == VECT_OUTERMULT_ADD;
+		}
 		public BinType getVectorAddPrimitive() {
 			return BinType.valueOf("VECT_"+getVectorPrimitiveName().toUpperCase()+"_ADD");
 		}
@@ -257,19 +269,32 @@ public class CNodeBinary extends CNode
 		tmp = tmp.replace("%TMP%", var);
 		
 		//replace input references and start indexes
-		for( int j=1; j<=2; j++ ) {
-			String varj = _inputs.get(j-1).getVarname();
+		for( int j=0; j<2; j++ ) {
+			String varj = _inputs.get(j).getVarname();
 			
 			//replace sparse and dense inputs
-			tmp = tmp.replace("%IN"+j+"v%", varj+"vals");
-			tmp = tmp.replace("%IN"+j+"i%", varj+"ix");
-			tmp = tmp.replace("%IN"+j+"%", varj );
+			tmp = tmp.replace("%IN"+(j+1)+"v%", varj+"vals");
+			tmp = tmp.replace("%IN"+(j+1)+"i%", varj+"ix");
+			tmp = tmp.replace("%IN"+(j+1)+"%", 
+				varj.startsWith("b") ? varj + ".ddat" : varj );
 			
 			//replace start position of main input
-			tmp = tmp.replace("%POS"+j+"%", (_inputs.get(j-1) instanceof CNodeData 
-				&& _inputs.get(j-1).getDataType().isMatrix()) ? (!varj.startsWith("b")) ? 
-				varj+"i" : TemplateUtils.isMatrix(_inputs.get(j-1)) ? "rowIndex*len" : "0" : "0");
+			tmp = tmp.replace("%POS"+(j+1)+"%", (_inputs.get(j) instanceof CNodeData 
+				&& _inputs.get(j).getDataType().isMatrix()) ? (!varj.startsWith("b")) ? varj+"i" : 
+				(TemplateUtils.isMatrix(_inputs.get(j)) && _type!=BinType.VECT_MATRIXMULT) ? 
+				"rowIndex*"+((_type==BinType.VECT_OUTERMULT_ADD)?"%LEN"+(j+1)+"%":"%LEN%") : "0" : "0");
+		}
+		//replace length information (e.g., after matrix mult)
+		if( _type == BinType.VECT_OUTERMULT_ADD ) {
+			for( int j=0; j<2; j++ )
+				tmp = tmp.replace("%LEN"+(j+1)+"%", _inputs.get(j).getVectorLength());
+		}
+		else { //general case 
+			CNode mInput = getIntermediateInputVector();
+			if( mInput != null )
+				tmp = tmp.replace("%LEN%", mInput.getVectorLength());
 		}
+		
 		sb.append(tmp);
 		
 		//mark as generated
@@ -278,10 +303,19 @@ public class CNodeBinary extends CNode
 		return sb.toString();
 	}
 	
+	private CNode getIntermediateInputVector() {
+		for( int i=0; i<2; i++ )
+			if( getInput().get(i).getDataType().isMatrix() )
+				return getInput().get(i);
+		return null;
+	} 
+	
 	@Override
 	public String toString() {
 		switch(_type) {
 			case DOT_PRODUCT:              return "b(dot)";
+			case VECT_MATRIXMULT:          return "b(vmm)";
+			case VECT_OUTERMULT_ADD:       return "b(voma)";
 			case VECT_MULT_ADD:            return "b(vma)";
 			case VECT_DIV_ADD:             return "b(vda)";
 			case VECT_MINUS_ADD:           return "b(vmia)";
@@ -362,7 +396,13 @@ public class CNodeBinary extends CNode
 				boolean vectorScalar = _inputs.get(1).getDataType()==DataType.SCALAR;
 				_rows = _inputs.get(vectorScalar ? 0 : 1)._rows;
 				_cols = _inputs.get(vectorScalar ? 0 : 1)._cols;
-				_dataType= DataType.MATRIX;
+				_dataType = DataType.MATRIX;
+				break;
+			
+			case VECT_OUTERMULT_ADD:
+				_rows = _inputs.get(0)._cols;
+				_cols = _inputs.get(1)._cols;
+				_dataType = DataType.MATRIX;
 				break;
 				
 			case VECT_DIV_SCALAR: 	
@@ -396,8 +436,13 @@ public class CNodeBinary extends CNode
 				_cols = _inputs.get(scalarVector ? 1 : 0)._cols;
 				_dataType= DataType.MATRIX;
 				break;
+			
+			case VECT_MATRIXMULT:
+				_rows = _inputs.get(0)._rows;
+				_cols = _inputs.get(1)._cols;
+				_dataType = DataType.MATRIX;
+				break;
 				
-		
 			case DOT_PRODUCT: 
 			
 			//SCALAR Arithmetic

http://git-wip-us.apache.org/repos/asf/systemml/blob/6b25b3bf/src/main/java/org/apache/sysml/hops/codegen/cplan/CNodeOuterProduct.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/hops/codegen/cplan/CNodeOuterProduct.java b/src/main/java/org/apache/sysml/hops/codegen/cplan/CNodeOuterProduct.java
index d6a1d34..01ca08e 100644
--- a/src/main/java/org/apache/sysml/hops/codegen/cplan/CNodeOuterProduct.java
+++ b/src/main/java/org/apache/sysml/hops/codegen/cplan/CNodeOuterProduct.java
@@ -39,10 +39,10 @@ public class CNodeOuterProduct extends CNodeTpl
 			+ "  public %TMP%() {\n"
 			+ "    _outerProductType = OutProdType.%TYPE%;\n"
 			+ "  }\n"
-			+ "  protected void genexecDense(double a, double[] a1, int a1i, double[] a2, int a2i, double[][] b, double[] scalars, double[] c, int ci, int m, int n, int k, int rowIndex, int colIndex) { \n"
+			+ "  protected void genexecDense(double a, double[] a1, int a1i, double[] a2, int a2i, double[][] b, double[] scalars, double[] c, int ci, int m, int n, int len, int rowIndex, int colIndex) { \n"
 			+ "%BODY_dense%"
 			+ "  }\n"
-			+ "  protected double genexecCellwise(double a, double[] a1, int a1i, double[] a2, int a2i, double[][] b, double[] scalars, int m, int n, int k, int rowIndex, int colIndex) { \n"
+			+ "  protected double genexecCellwise(double a, double[] a1, int a1i, double[] a2, int a2i, double[][] b, double[] scalars, int m, int n, int len, int rowIndex, int colIndex) { \n"
 			+ "%BODY_cellwise%"
 			+ "    return %OUT_cellwise%;\n"
 			+ "  }\n"			
@@ -86,7 +86,7 @@ public class CNodeOuterProduct extends CNodeTpl
 			tmp = tmp.replace("%OUT_cellwise%", getCurrentVarName());
 		}
 		//replace size information
-		tmp = tmp.replace("%LEN%", "k");
+		tmp = tmp.replace("%LEN%", "len");
 		
 		tmp = tmp.replace("%POSOUT%", "ci");
 		

http://git-wip-us.apache.org/repos/asf/systemml/blob/6b25b3bf/src/main/java/org/apache/sysml/hops/codegen/cplan/CNodeRow.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/hops/codegen/cplan/CNodeRow.java b/src/main/java/org/apache/sysml/hops/codegen/cplan/CNodeRow.java
index 7cba5f7..b74b79d 100644
--- a/src/main/java/org/apache/sysml/hops/codegen/cplan/CNodeRow.java
+++ b/src/main/java/org/apache/sysml/hops/codegen/cplan/CNodeRow.java
@@ -22,6 +22,7 @@ package org.apache.sysml.hops.codegen.cplan;
 import java.util.ArrayList;
 
 import org.apache.sysml.hops.codegen.SpoofFusedOp.SpoofOutputDimsType;
+import org.apache.sysml.hops.codegen.cplan.CNodeBinary.BinType;
 import org.apache.sysml.hops.codegen.cplan.CNodeUnary.UnaryType;
 import org.apache.sysml.hops.codegen.template.TemplateUtils;
 import org.apache.sysml.runtime.codegen.SpoofRowwise.RowType;
@@ -32,25 +33,26 @@ public class CNodeRow extends CNodeTpl
 	private static final String TEMPLATE = 
 			  "package codegen;\n"
 			+ "import org.apache.sysml.runtime.codegen.LibSpoofPrimitives;\n"
+			+ "import org.apache.sysml.runtime.codegen.SpoofOperator.SideInput;\n"
 			+ "import org.apache.sysml.runtime.codegen.SpoofRowwise;\n"
 			+ "import org.apache.sysml.runtime.codegen.SpoofRowwise.RowType;\n"
 			+ "import org.apache.commons.math3.util.FastMath;\n"
 			+ "\n"
 			+ "public final class %TMP% extends SpoofRowwise { \n"
 			+ "  public %TMP%() {\n"
-			+ "    super(RowType.%TYPE%, %CBIND0%, %VECT_MEM%);\n"
+			+ "    super(RowType.%TYPE%, %CBIND0%, %TB1%, %VECT_MEM%);\n"
 			+ "  }\n"
-			+ "  protected void genexec(double[] a, int ai, double[][] b, double[] scalars, double[] c, int len, int rowIndex) { \n"
+			+ "  protected void genexec(double[] a, int ai, SideInput[] b, double[] scalars, double[] c, int len, int rowIndex) { \n"
 			+ "%BODY_dense%"
 			+ "  }\n"
-			+ "  protected void genexec(double[] avals, int[] aix, int ai, double[][] b, double[] scalars, double[] c, int alen, int len, int rowIndex) { \n"
+			+ "  protected void genexec(double[] avals, int[] aix, int ai, SideInput[] b, double[] scalars, double[] c, int alen, int len, int rowIndex) { \n"
 			+ "%BODY_sparse%"
 			+ "  }\n"			
 			+ "}\n";
 
 	private static final String TEMPLATE_ROWAGG_OUT  = "    c[rowIndex] = %IN%;\n";
 	private static final String TEMPLATE_FULLAGG_OUT = "    c[0] += %IN%;\n";
-	private static final String TEMPLATE_NOAGG_OUT   = "    LibSpoofPrimitives.vectWrite(%IN%, c, rowIndex*len, len);\n";
+	private static final String TEMPLATE_NOAGG_OUT   = "    LibSpoofPrimitives.vectWrite(%IN%, c, rowIndex*%LEN%, %LEN%);\n";
 	
 	public CNodeRow(ArrayList<CNode> inputs, CNode output ) {
 		super(inputs, output);
@@ -59,14 +61,6 @@ public class CNodeRow extends CNodeTpl
 	private RowType _type = null; //access pattern 
 	private int _numVectors = -1; //number of intermediate vectors
 	
-	public void setNumVectorIntermediates(int num) {
-		_numVectors = num;
-	}
-	
-	public int getNumVectorIntermediates() {
-		return _numVectors;
-	}
-	
 	public void setRowType(RowType type) {
 		_type = type;
 		_hash = 0;
@@ -76,6 +70,15 @@ public class CNodeRow extends CNodeTpl
 		return _type;
 	}
 	
+	public void setNumVectorIntermediates(int num) {
+		_numVectors = num;
+		_hash = 0;
+	}
+	
+	public int getNumVectorIntermediates() {
+		return _numVectors;
+	}
+	
 	@Override
 	public void renameInputs() {
 		rRenameDataNode(_output, _inputs.get(0), "a"); // input matrix
@@ -108,18 +111,26 @@ public class CNodeRow extends CNodeTpl
 		tmp = tmp.replace("%TYPE%", _type.name());
 		tmp = tmp.replace("%CBIND0%", String.valueOf(
 			TemplateUtils.isUnary(_output, UnaryType.CBIND0)));
+		tmp = tmp.replace("%TB1%", String.valueOf(
+			TemplateUtils.containsBinary(_output, BinType.VECT_MATRIXMULT)));
 		tmp = tmp.replace("%VECT_MEM%", String.valueOf(_numVectors));
 		
 		return tmp;
 	}
 	
 	private String getOutputStatement(String varName) {
-		if( !_type.isColumnAgg() ) {
-			String tmp = (_type==RowType.NO_AGG) ? TEMPLATE_NOAGG_OUT : 
-				(_type==RowType.FULL_AGG) ? TEMPLATE_FULLAGG_OUT : TEMPLATE_ROWAGG_OUT;
-			return tmp.replace("%IN%", varName);
+		switch( _type ) {
+			case NO_AGG:
+			case NO_AGG_B1:
+				return TEMPLATE_NOAGG_OUT.replace("%IN%", varName)
+					.replace("%LEN%", _output.getVarname()+".length");
+			case FULL_AGG:
+				return TEMPLATE_FULLAGG_OUT.replace("%IN%", varName);
+			case ROW_AGG:
+				return TEMPLATE_ROWAGG_OUT.replace("%IN%", varName);
+			default:
+				return ""; //_type.isColumnAgg()
 		}
-		return "";
 	}
 
 	@Override
@@ -131,12 +142,15 @@ public class CNodeRow extends CNodeTpl
 	@Override
 	public SpoofOutputDimsType getOutputDimType() {
 		switch( _type ) {
-			case NO_AGG: return SpoofOutputDimsType.INPUT_DIMS;
-			case FULL_AGG: return SpoofOutputDimsType.SCALAR;
-			case ROW_AGG: return TemplateUtils.isUnary(_output, UnaryType.CBIND0) ?
-				SpoofOutputDimsType.ROW_DIMS2 : SpoofOutputDimsType.ROW_DIMS;
-			case COL_AGG: return SpoofOutputDimsType.COLUMN_DIMS_COLS; //row vector
+			case NO_AGG:    return SpoofOutputDimsType.INPUT_DIMS;
+			case NO_AGG_B1: return SpoofOutputDimsType.ROW_RANK_DIMS;
+			case FULL_AGG:  return SpoofOutputDimsType.SCALAR;
+			case ROW_AGG:   return TemplateUtils.isUnary(_output, UnaryType.CBIND0) ?
+						SpoofOutputDimsType.ROW_DIMS2 : SpoofOutputDimsType.ROW_DIMS;
+			case COL_AGG:   return SpoofOutputDimsType.COLUMN_DIMS_COLS; //row vector
 			case COL_AGG_T: return SpoofOutputDimsType.COLUMN_DIMS_ROWS; //column vector
+			case COL_AGG_B1:   return SpoofOutputDimsType.COLUMN_RANK_DIMS; 
+			case COL_AGG_B1_T: return SpoofOutputDimsType.COLUMN_RANK_DIMS_T; 
 			default:
 				throw new RuntimeException("Unsupported row type: "+_type.toString());
 		}

http://git-wip-us.apache.org/repos/asf/systemml/blob/6b25b3bf/src/main/java/org/apache/sysml/hops/codegen/cplan/CNodeUnary.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/hops/codegen/cplan/CNodeUnary.java b/src/main/java/org/apache/sysml/hops/codegen/cplan/CNodeUnary.java
index 500b309..85800b8 100644
--- a/src/main/java/org/apache/sysml/hops/codegen/cplan/CNodeUnary.java
+++ b/src/main/java/org/apache/sysml/hops/codegen/cplan/CNodeUnary.java
@@ -185,6 +185,10 @@ public class CNodeUnary extends CNode
 		tmp = tmp.replace("%POS1%", spos);
 		tmp = tmp.replace("%POS2%", spos);
 		
+		//replace length
+		if( _inputs.get(0).getDataType().isMatrix() )
+			tmp = tmp.replace("%LEN%", _inputs.get(0).getVectorLength());
+		
 		sb.append(tmp);
 		
 		//mark as generated

http://git-wip-us.apache.org/repos/asf/systemml/blob/6b25b3bf/src/main/java/org/apache/sysml/hops/codegen/template/TemplateRow.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/hops/codegen/template/TemplateRow.java b/src/main/java/org/apache/sysml/hops/codegen/template/TemplateRow.java
index 601d664..c0c8c4e 100644
--- a/src/main/java/org/apache/sysml/hops/codegen/template/TemplateRow.java
+++ b/src/main/java/org/apache/sysml/hops/codegen/template/TemplateRow.java
@@ -50,6 +50,7 @@ import org.apache.sysml.hops.Hop.Direction;
 import org.apache.sysml.hops.Hop.OpOp1;
 import org.apache.sysml.hops.Hop.OpOp2;
 import org.apache.sysml.parser.Expression.DataType;
+import org.apache.sysml.runtime.matrix.data.LibMatrixMult;
 import org.apache.sysml.runtime.matrix.data.Pair;
 
 public class TemplateRow extends TemplateBase 
@@ -73,8 +74,17 @@ public class TemplateRow extends TemplateBase
 	public boolean open(Hop hop) {
 		return (hop instanceof BinaryOp && hop.dimsKnown() && isValidBinaryOperation(hop)
 				&& hop.getInput().get(0).getDim1()>1 && hop.getInput().get(0).getDim2()>1)
-			|| (hop instanceof AggBinaryOp && hop.dimsKnown() && hop.getDim2()==1
+			|| (hop instanceof AggBinaryOp && hop.dimsKnown() && hop.getDim2()==1 //MV
 				&& hop.getInput().get(0).getDim1()>1 && hop.getInput().get(0).getDim2()>1)
+			|| (hop instanceof AggBinaryOp && hop.dimsKnown() && LibMatrixMult.isSkinnyRightHandSide(
+				hop.getInput().get(0).getDim1(), hop.getInput().get(0).getDim2(), //MM
+				hop.getInput().get(1).getDim1(), hop.getInput().get(1).getDim2())
+				&& hop.getInput().get(0).getDim1()>1 && hop.getInput().get(0).getDim2()>1
+				&& !HopRewriteUtils.isOuterProductLikeMM(hop))
+			|| (HopRewriteUtils.isTransposeOperation(hop) && hop.getParent().size()==1
+				&& hop.getParent().get(0) instanceof AggBinaryOp && hop.getParent().get(0).dimsKnown()
+				&& hop.getParent().get(0).getInput().indexOf(hop) == 0
+				&& isFuseSkinnyMatrixMult(hop.getParent().get(0)))
 			|| (hop instanceof AggUnaryOp && ((AggUnaryOp)hop).getDirection()!=Direction.RowCol 
 				&& hop.getInput().get(0).getDim1()>1 && hop.getInput().get(0).getDim2()>1
 				&& HopRewriteUtils.isAggUnaryOp(hop, SUPPORTED_ROW_AGG));
@@ -88,20 +98,24 @@ public class TemplateRow extends TemplateBase
 				&& input.getDim2()==1 && hop.getInput().get(1).getDim2()==1
 				&& HopRewriteUtils.isEmpty(hop.getInput().get(1)))
 			|| ((hop instanceof UnaryOp || hop instanceof ParameterizedBuiltinOp) 
-					&& TemplateCell.isValidOperation(hop))		
+					&& TemplateCell.isValidOperation(hop))
 			|| (hop instanceof AggUnaryOp && ((AggUnaryOp)hop).getDirection()!=Direction.RowCol
 				&& HopRewriteUtils.isAggUnaryOp(hop, SUPPORTED_ROW_AGG))
 			|| (hop instanceof AggUnaryOp && ((AggUnaryOp)hop).getDirection() == Direction.RowCol 
 				&& ((AggUnaryOp)hop).getOp() == AggOp.SUM )
-			|| (hop instanceof AggBinaryOp && hop.getDim1()>1 && hop.getDim2()==1
-				&& HopRewriteUtils.isTransposeOperation(hop.getInput().get(0))));
+			|| (hop instanceof AggBinaryOp && hop.getDim1()>1 && hop.getDim2()==1 //MV
+				&& HopRewriteUtils.isTransposeOperation(hop.getInput().get(0)))
+			|| (hop instanceof AggBinaryOp && hop.dimsKnown() && isFuseSkinnyMatrixMult(hop) //MM
+				&& HopRewriteUtils.isTransposeOperation(hop.getInput().get(0))
+				&& hop.getInput().get(0).getDim1()>1 && hop.getInput().get(0).getDim2()>1));
 	}
 
 	@Override
 	public boolean merge(Hop hop, Hop input) {
 		//merge rowagg tpl with cell tpl if input is a vector
 		return !isClosed() &&
-			((hop instanceof BinaryOp && isValidBinaryOperation(hop))
+			((hop instanceof BinaryOp && isValidBinaryOperation(hop)
+				&& hop.getDim1() > 1 && input.getDim1()>1) 
 			 ||(hop instanceof AggBinaryOp && input.getDim2()==1
 				&& HopRewriteUtils.isTransposeOperation(hop.getInput().get(0))));
 	}
@@ -117,12 +131,18 @@ public class TemplateRow extends TemplateBase
 			return CloseType.OPEN;
 	}
 	
-	private boolean isValidBinaryOperation(Hop hop) {
-		//exclude unsupported and matrix-rowvector ops
-		return TemplateUtils.isOperationSupported(hop)
-			&& (HopRewriteUtils.isBinaryMatrixScalarOperation(hop)
-			|| HopRewriteUtils.isBinaryMatrixColVectorOperation(hop)
-			|| HopRewriteUtils.isBinaryMatrixMatrixOperation(hop));
+	private static boolean isValidBinaryOperation(Hop hop) {
+		//support for matrix-scalar, matrix-col_vector,
+		//matrix-row_vector, and matrix-matrix
+		return TemplateUtils.isOperationSupported(hop);
+	}
+	
+	private static boolean isFuseSkinnyMatrixMult(Hop hop) {
+		//check for fusible but not opening matrix multiply (vect_outer-mult)
+		Hop in1 = hop.getInput().get(0); //transpose
+		Hop in2 = hop.getInput().get(1);
+		return LibMatrixMult.isSkinnyRightHandSide(in1.getDim2(), in1.getDim1(), hop.getDim1(), hop.getDim2())
+			|| LibMatrixMult.isSkinnyRightHandSide(in2.getDim1(), in2.getDim2(), hop.getDim2(), hop.getDim1());
 	}
 
 	@Override
@@ -138,7 +158,7 @@ public class TemplateRow extends TemplateBase
 		//reorder inputs (ensure matrix is first input, and other inputs ordered by size)
 		Hop[] sinHops = inHops.stream()
 			.filter(h -> !(h.getDataType().isScalar() && tmp.get(h.getHopID()).isLiteral()))
-			.sorted(new HopInputComparator(inHops2.get("X"))).toArray(Hop[]::new);
+			.sorted(new HopInputComparator(inHops2.get("X"),inHops2.get("B1"))).toArray(Hop[]::new);
 		
 		//construct template node
 		ArrayList<CNode> inputs = new ArrayList<CNode>();
@@ -146,7 +166,8 @@ public class TemplateRow extends TemplateBase
 			inputs.add(tmp.get(in.getHopID()));
 		CNode output = tmp.get(hop.getHopID());
 		CNodeRow tpl = new CNodeRow(inputs, output);
-		tpl.setRowType(TemplateUtils.getRowType(hop, sinHops[0]));
+		tpl.setRowType(TemplateUtils.getRowType(hop, 
+			inHops2.get("X"), inHops2.get("B1")));
 		tpl.setNumVectorIntermediates(TemplateUtils
 			.determineMinVectorIntermediates(output));
 		tpl.getOutput().resetVisitStatus();
@@ -217,7 +238,13 @@ public class TemplateRow extends TemplateBase
 				inHops.add(hop.getInput().get(0).getInput().get(0));
 				
 				//note: vectorMultAdd applicable to vector-scalar, and vector-vector
-				out = new CNodeBinary(cdata1, cdata2, BinType.VECT_MULT_ADD);
+				if( hop.getInput().get(1).getDim2() == 1 )
+					out = new CNodeBinary(cdata1, cdata2, BinType.VECT_MULT_ADD);
+				else {
+					out = new CNodeBinary(cdata1, cdata2, BinType.VECT_OUTERMULT_ADD);
+					if( !inHops2.containsKey("B1") )
+						inHops2.put("B1", hop.getInput().get(1));
+				}
 				inHops2.put("X", hop.getInput().get(0).getInput().get(0));
 			}
 			else
@@ -225,12 +252,24 @@ public class TemplateRow extends TemplateBase
 				if(hop.getInput().get(0).getDim2()==1 && hop.getInput().get(1).getDim2()==1)
 					out = new CNodeBinary((cdata1.getDataType()==DataType.SCALAR)? cdata1 : new CNodeUnary(cdata1, UnaryType.LOOKUP0),
 						(cdata2.getDataType()==DataType.SCALAR)? cdata2 : new CNodeUnary(cdata2, UnaryType.LOOKUP0), BinType.MULT);
-				else {
+				else if( hop.getInput().get(1).getDim2()==1 ) {
 					out = new CNodeBinary(cdata1, cdata2, BinType.DOT_PRODUCT);
 					inHops2.put("X", hop.getInput().get(0));
 				}
+				else {
+					out = new CNodeBinary(cdata1, cdata2, BinType.VECT_MATRIXMULT);
+					inHops2.put("X", hop.getInput().get(0));
+					inHops2.put("B1", hop.getInput().get(1));
+				}
 			}
 		}
+		else if( HopRewriteUtils.isTransposeOperation(hop) ) 
+		{
+			out = TemplateUtils.skipTranspose(tmp.get(hop.getHopID()), 
+				hop, tmp, compileLiterals);
+			if( out instanceof CNodeData && !inHops.contains(hop.getInput().get(0)) )
+				inHops.add(hop.getInput().get(0));
+		}
 		else if(hop instanceof UnaryOp)
 		{
 			CNode cdata1 = tmp.get(hop.getInput().get(0).getHopID());
@@ -272,7 +311,8 @@ public class TemplateRow extends TemplateBase
 				|| (hop.getInput().get(1).getDim1() > 1 && hop.getInput().get(1).getDim2() > 1))
 			{
 				if( HopRewriteUtils.isBinary(hop, SUPPORTED_VECT_BINARY) ) {
-					if( TemplateUtils.isMatrix(cdata1) && TemplateUtils.isMatrix(cdata2) ) {
+					if( TemplateUtils.isMatrix(cdata1) && (TemplateUtils.isMatrix(cdata2) 
+							|| TemplateUtils.isRowVector(cdata2)) ) {
 						String opname = "VECT_"+((BinaryOp)hop).getOp().name();
 						out = new CNodeBinary(cdata1, cdata2, BinType.valueOf(opname));
 					}
@@ -360,19 +400,21 @@ public class TemplateRow extends TemplateBase
 	public static class HopInputComparator implements Comparator<Hop> 
 	{
 		private final Hop _X;
+		private final Hop _B1;
 		
-		public HopInputComparator(Hop X) {
+		public HopInputComparator(Hop X, Hop B1) {
 			_X = X;
+			_B1 = B1;
 		}
 		
 		@Override
 		public int compare(Hop h1, Hop h2) {
 			long ncells1 = h1.getDataType()==DataType.SCALAR ? Long.MIN_VALUE : 
-				(h1==_X) ? Long.MAX_VALUE : 
-				h1.dimsKnown() ? h1.getDim1()*h1.getDim2() : Long.MAX_VALUE-1;
+				(h1==_X) ? Long.MAX_VALUE : (h1==_B1) ? Long.MAX_VALUE-1 : 
+				h1.dimsKnown() ? h1.getDim1()*h1.getDim2() : Long.MAX_VALUE-2;
 			long ncells2 = h2.getDataType()==DataType.SCALAR ? Long.MIN_VALUE : 
-				(h2==_X) ? Long.MAX_VALUE : 
-				h2.dimsKnown() ? h2.getDim1()*h2.getDim2() : Long.MAX_VALUE-1;
+				(h2==_X) ? Long.MAX_VALUE : (h2==_B1) ? Long.MAX_VALUE-1 : 
+				h2.dimsKnown() ? h2.getDim1()*h2.getDim2() : Long.MAX_VALUE-2;
 			return (ncells1 > ncells2) ? -1 : (ncells1 < ncells2) ? 1 : 0; 
 		}
 	}

http://git-wip-us.apache.org/repos/asf/systemml/blob/6b25b3bf/src/main/java/org/apache/sysml/hops/codegen/template/TemplateUtils.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/hops/codegen/template/TemplateUtils.java b/src/main/java/org/apache/sysml/hops/codegen/template/TemplateUtils.java
index da803cd..4bd5bf1 100644
--- a/src/main/java/org/apache/sysml/hops/codegen/template/TemplateUtils.java
+++ b/src/main/java/org/apache/sysml/hops/codegen/template/TemplateUtils.java
@@ -19,11 +19,7 @@
 
 package org.apache.sysml.hops.codegen.template;
 
-import java.util.ArrayList;
 import java.util.HashMap;
-import java.util.HashSet;
-import java.util.Iterator;
-import java.util.LinkedHashSet;
 
 import org.apache.commons.lang.ArrayUtils;
 import org.apache.sysml.hops.AggBinaryOp;
@@ -142,74 +138,7 @@ public class TemplateUtils
 			return TernaryType.contains(((ParameterizedBuiltinOp)h).getOp().name());
 		return false;
 	}
-
-	private static void rfindChildren(Hop hop, HashSet<Hop> children ) {		
-		if( hop instanceof UnaryOp || (hop instanceof BinaryOp && hop.getInput().get(0).getDataType() == DataType.MATRIX  &&  TemplateUtils.isVectorOrScalar( hop.getInput().get(1))) || (hop instanceof BinaryOp && TemplateUtils.isVectorOrScalar( hop.getInput().get(0))  &&  hop.getInput().get(1).getDataType() == DataType.MATRIX)    //unary operation or binary operaiton with one matrix and a scalar
-					&& 	hop.getDataType() == DataType.MATRIX )
-		{	
-			if(!children.contains(hop))
-				children.add(hop);
-			Hop matrix = TemplateUtils.isMatrix(hop.getInput().get(0)) ? hop.getInput().get(0) : hop.getInput().get(1);
-			rfindChildren(matrix,children);
-		}
-		else 
-			children.add(hop);
-	}
 	
-	private static Hop findCommonChild(Hop hop1, Hop hop2) {
-		//this method assumes that each two nodes have at most one common child 
-		LinkedHashSet<Hop> children1 = new LinkedHashSet<Hop>();
-		LinkedHashSet<Hop> children2 = new LinkedHashSet<Hop>();
-		
-		rfindChildren(hop1, children1 );
-		rfindChildren(hop2, children2 );
-		
-		//iterate on one set and find the first common child in the other set
-		Iterator<Hop> iter = children1.iterator();
-		while (iter.hasNext()) {
-			Hop candidate = iter.next();
-			if(children2.contains(candidate))
-				return candidate;
-		}
-		return null;
-	}
-	
-	public static Hop commonChild(ArrayList<Hop> _adddedMatrices, Hop input) {
-		Hop currentChild = null;
-		//loop on every added matrix and find its common child with the input, if all of them have the same common child then return it, otherwise null 
-		for(Hop addedMatrix : _adddedMatrices)
-		{
-			Hop child = findCommonChild(addedMatrix,input);
-			if(child == null)  // did not find a common child
-				return null;
-			if(currentChild == null) // first common child to be seen
-				currentChild = child;
-			else if(child.getHopID() != currentChild.getHopID())
-				return null;
-		}
-		return currentChild;
-	}
-
-	public static HashSet<Long> rGetInputHopIDs( CNode node, HashSet<Long> ids ) {
-		if( node instanceof CNodeData && !node.isLiteral() )
-			ids.add(((CNodeData)node).getHopID());
-		
-		for( CNode c : node.getInput() )
-			rGetInputHopIDs(c, ids);
-			
-		return ids;
-	}
-	
-	public static Hop[] mergeDistinct(HashSet<Long> ids, Hop[] input1, Hop[] input2) {
-		Hop[] ret = new Hop[ids.size()];
-		int pos = 0;
-		for( Hop[] input : new Hop[][]{input1, input2} )
-			for( Hop c : input )
-				if( ids.contains(c.getHopID()) )
-					ret[pos++] = c; 
-		return ret;
-	}
-
 	public static TemplateBase createTemplate(TemplateType type) {
 		return createTemplate(type, false);
 	}
@@ -242,21 +171,31 @@ public class TemplateUtils
 			CellType.FULL_AGG : CellType.ROW_AGG) : CellType.NO_AGG;
 	}
 	
-	public static RowType getRowType(Hop output, Hop input) {
-		if( HopRewriteUtils.isEqualSize(output, input) )
+	public static RowType getRowType(Hop output, Hop... inputs) {
+		Hop X = inputs[0];
+		Hop B1 = (inputs.length>1) ? inputs[1] : null;
+		if( HopRewriteUtils.isEqualSize(output, X) )
 			return RowType.NO_AGG;
-		else if( output.getDim1()==input.getDim1() && (output.getDim2()==1 
+		else if( B1 != null && output.getDim1()==X.getDim1() && output.getDim2()==B1.getDim2() )
+			return RowType.NO_AGG_B1;
+		else if( output.getDim1()==X.getDim1() && (output.getDim2()==1 
 				|| HopRewriteUtils.isBinary(output, OpOp2.CBIND)) 
 			&& !(output instanceof AggBinaryOp && HopRewriteUtils
-				.isTransposeOfItself(output.getInput().get(0),input)))
+				.isTransposeOfItself(output.getInput().get(0),X)))
 			return RowType.ROW_AGG;
 		else if( output instanceof AggUnaryOp 
 			&& ((AggUnaryOp)output).getDirection()==Direction.RowCol )
 			return RowType.FULL_AGG;
-		else if( output.getDim1()==input.getDim2() && output.getDim2()==1 )
+		else if( output.getDim1()==X.getDim2() && output.getDim2()==1 )
 			return RowType.COL_AGG_T;
-		else
+		else if( output.getDim1()==1 && output.getDim2()==X.getDim2() )
 			return RowType.COL_AGG;
+		else if( B1 != null && output.getDim1()==X.getDim2() && output.getDim2()==B1.getDim2() )
+			return RowType.COL_AGG_B1_T;
+		else if( B1 != null && output.getDim1()==B1.getDim2() && output.getDim2()==X.getDim2())
+			return RowType.COL_AGG_B1;
+		else
+			throw new RuntimeException("Unknown row type.");
 	}
 	
 	public static AggOp getAggOp(Hop hop) {
@@ -293,6 +232,11 @@ public class TemplateUtils
 			&& ArrayUtils.contains(types, ((CNodeUnary)node).getType());
 	}
 	
+	public static boolean isBinary(CNode node, BinType...types) {
+		return node instanceof CNodeBinary
+			&& ArrayUtils.contains(types, ((CNodeBinary)node).getType());
+	}
+	
 	public static boolean isTernary(CNode node, TernaryType...types) {
 		return node instanceof CNodeTernary
 			&& ArrayUtils.contains(types, ((CNodeTernary)node).getType());
@@ -333,7 +277,8 @@ public class TemplateUtils
 		CNode output = tpl.getOutput();
 		return ((output instanceof CNodeUnary 
 				&& !TemplateUtils.isUnary(output, UnaryType.EXP, UnaryType.LOG)) 
-			|| output instanceof CNodeBinary) 
+			|| (output instanceof CNodeBinary
+				&& !TemplateUtils.isBinary(output, BinType.VECT_OUTERMULT_ADD))) 
 			&& hasOnlyDataNodeOrLookupInputs(output);
 	}
 	
@@ -365,8 +310,7 @@ public class TemplateUtils
 	public static boolean isUnaryOperatorPipeline(CNode node) {
 		if( node.isVisited() ) {
 			//second reference to vector intermediate invalidates a unary pipeline
-			return !((node instanceof CNodeBinary && ((CNodeBinary)node).getType().isVectorPrimitive())
-				|| (node instanceof CNodeUnary && ((CNodeUnary)node).getType().isVectorScalarPrimitive()));
+			return !(node instanceof CNodeBinary && ((CNodeBinary)node).getType().isVectorPrimitive());
 		}
 		boolean ret = true;
 		for( CNode input : node.getInput() )
@@ -382,8 +326,9 @@ public class TemplateUtils
 		for( CNode input : node.getInput() )
 			max = Math.max(max, getMaxVectorIntermediates(input));
 		max = Math.max(max, (node instanceof CNodeBinary)? 
-			((CNodeBinary)node).getType().isVectorVectorPrimitive() ? 3 :
-			((CNodeBinary)node).getType().isVectorScalarPrimitive() ? 2 : 0 : 0);
+			(((CNodeBinary)node).getType().isVectorVectorPrimitive() ? 3 :
+			((CNodeBinary)node).getType().isVectorScalarPrimitive() ? 2 :
+			((CNodeBinary)node).getType().isVectorMatrixPrimitive() ? 1 : 0) : 0);
 		max = Math.max(max, (node instanceof CNodeUnary 
 			&& ((CNodeUnary)node).getType().isVectorScalarPrimitive()) ? 2 : 0);
 		node.setVisited();
@@ -432,4 +377,22 @@ public class TemplateUtils
 		}
 		return ret;
 	}
+	
+	public static boolean containsBinary(CNode node, BinType type) {
+		node.resetVisitStatus();
+		boolean ret = rContainsBinary(node, type);
+		node.resetVisitStatus();
+		return ret;
+	}
+	
+	public static boolean rContainsBinary(CNode node, BinType type) {
+		if( node.isVisited() )
+			return false;
+		boolean ret = false;
+		for( CNode input : node.getInput() )
+			ret |= rContainsBinary(input, type);
+		ret |= isBinary(node, type);
+		node.setVisited();
+		return ret;
+	}
 }

http://git-wip-us.apache.org/repos/asf/systemml/blob/6b25b3bf/src/main/java/org/apache/sysml/runtime/codegen/LibSpoofPrimitives.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/codegen/LibSpoofPrimitives.java b/src/main/java/org/apache/sysml/runtime/codegen/LibSpoofPrimitives.java
index ad2530d..1108c08 100644
--- a/src/main/java/org/apache/sysml/runtime/codegen/LibSpoofPrimitives.java
+++ b/src/main/java/org/apache/sysml/runtime/codegen/LibSpoofPrimitives.java
@@ -20,6 +20,7 @@
 package org.apache.sysml.runtime.codegen;
 
 import java.util.Arrays;
+import java.util.Iterator;
 import java.util.LinkedList;
 
 import org.apache.commons.math3.util.FastMath;
@@ -57,6 +58,50 @@ public class LibSpoofPrimitives
 		return LibMatrixMult.dotProduct(a, b, aix, ai, bi, len);
 	}
 	
+	public static double[] vectMatrixMult(double[] a, double[] b, int ai, int bi, int len) {
+		//note: assumes b is already transposed for efficient dot products
+		int m2clen = b.length / len;
+		double[] c = allocVector(m2clen, false);
+		for( int j = 0, bix = bi; j < m2clen; j++, bix+=len )
+			c[j] = LibMatrixMult.dotProduct(a, b, ai, bix, len);
+		return c;
+	}
+	
+	public static double[] vectMatrixMult(double[] a, double[] b, int[] aix, int ai, int bi, int alen, int len) {
+		//note: assumes b is already transposed for efficient dot products
+		int m2clen = b.length / len;
+		double[] c = allocVector(m2clen, false);
+		for( int j = 0, bix = bi; j < m2clen; j++, bix+=len )
+			c[j] = LibMatrixMult.dotProduct(a, b, aix, ai, bix, alen);
+		return c;
+	}
+	
+	public static void vectOuterMultAdd(double[] a, double[] b, double[] c, int ai, int bi, int ci, int len1, int len2) {
+		//remainder (first len1%4 entries), not aligned to 4-blocks
+		final int bn = len1%4;
+		for( int i=0, cix=ci; i < bn; i++, cix+=len2 )
+			if( a[ai+i] != 0 )
+				LibMatrixMult.vectMultiplyAdd(a[ai+i], b, c, bi, cix, len2);
+		
+		//unrolled 4-block (for fewer L1-dcache loads)
+		for( int i=bn, cix=ci+bn*len2; i < len1; i+=4, cix+=4*len2 ) {
+			final int cix1=cix, cix2=cix+len2, cix3=cix+2*len2, cix4=cix+3*len2;
+			final double aval1=a[ai+i], aval2=a[ai+i+1], aval3=a[ai+i+2], aval4=a[ai+i+3];
+			for( int j=0; j<len2; j++ ) {
+				final double bval = b[bi+j];
+				c[cix1 + j] += aval1 * bval;
+				c[cix2 + j] += aval2 * bval;
+				c[cix3 + j] += aval3 * bval;
+				c[cix4 + j] += aval4 * bval;
+			}
+		}	
+	}
+	
+	public static void vectOuterMultAdd(double[] a, double[] b, double[] c, int[] aix, int ai, int bi, int ci, int alen, int len1, int len2) {
+		for( int i=0; i < alen; i++ )
+			LibMatrixMult.vectMultiplyAdd(a[ai+i], b, c, bi, ci+aix[ai+i]*len2, len2);
+	}
+	
 	public static void vectMultAdd(double[] a, double bval, double[] c, int bi, int ci, int len) {
 		if( a == null || bval == 0 ) return;
 		LibMatrixMult.vectMultiplyAdd(bval, a, c, bi, ci, len);
@@ -1227,7 +1272,14 @@ public class LibSpoofPrimitives
 	//dynamic memory management
 	
 	public static void setupThreadLocalMemory(int numVectors, int len) {
+		setupThreadLocalMemory(numVectors, len, -1);
+	}
+	
+	public static void setupThreadLocalMemory(int numVectors, int len, int len2) {
 		LinkedList<double[]> list = new LinkedList<double[]>();
+		if( len2 >= 0 ) 
+			for( int i=0; i<numVectors; i++ )
+				list.addLast(new double[len2]);
 		for( int i=0; i<numVectors; i++ )
 			list.addLast(new double[len]);
 		memPool.set(list);
@@ -1242,24 +1294,29 @@ public class LibSpoofPrimitives
 	}
 	
 	private static double[] allocVector(int len, boolean reset, double resetVal) {
-		LinkedList<double[]> list = memPool.get();
+		LinkedList<double[]> list = memPool.get(); 
 		
-		//sanity check for missing setup
-		if( list.isEmpty() ) {
-			double[] tmp = new double[len];
-			if( reset && resetVal != 0 )
-				Arrays.fill(tmp, resetVal);
-			return tmp;
+		//find and remove vector with matching len 
+		double[] vect = null;
+		Iterator<double[]> iter = list.iterator();
+		while( iter.hasNext() ) {
+			double[] tmp = iter.next();
+			if( tmp.length == len ) {
+				vect = tmp;
+				iter.remove();
+				break;
+			}
 		}
 		
-		//get and re-queue first entry
-		double[] tmp = list.removeFirst();
-		list.addLast(tmp);
+		//allocate new vector if no match was found, otherwise re-queue the reused one
+		if( vect == null )
+			vect = new double[len];
+		else 
+			list.addLast(vect);
 		
 		//reset vector if required
 		if( reset )
-			Arrays.fill(tmp, resetVal);
-		return tmp;
+			Arrays.fill(vect, resetVal);
+		return vect;
 	}
 }
-

http://git-wip-us.apache.org/repos/asf/systemml/blob/6b25b3bf/src/main/java/org/apache/sysml/runtime/codegen/SpoofCellwise.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/codegen/SpoofCellwise.java b/src/main/java/org/apache/sysml/runtime/codegen/SpoofCellwise.java
index cc8ef69..15de508 100644
--- a/src/main/java/org/apache/sysml/runtime/codegen/SpoofCellwise.java
+++ b/src/main/java/org/apache/sysml/runtime/codegen/SpoofCellwise.java
@@ -118,7 +118,7 @@ public abstract class SpoofCellwise extends SpoofOperator implements Serializabl
 		}
 		
 		//input preparation
-		SideInput[] b = prepInputMatricesAbstract(inputs);
+		SideInput[] b = prepInputMatrices(inputs);
 		double[] scalars = prepInputScalars(scalarObjects);
 		final int m = inputs.get(0).getNumRows();
 		final int n = inputs.get(0).getNumColumns();
@@ -198,7 +198,7 @@ public abstract class SpoofCellwise extends SpoofOperator implements Serializabl
 		
 		//input preparation
 		MatrixBlock a = inputs.get(0);
-		SideInput[] b = prepInputMatricesAbstract(inputs);
+		SideInput[] b = prepInputMatrices(inputs);
 		double[] scalars = prepInputScalars(scalarObjects);
 		final int m = a.getNumRows();
 		final int n = a.getNumColumns();

http://git-wip-us.apache.org/repos/asf/systemml/blob/6b25b3bf/src/main/java/org/apache/sysml/runtime/codegen/SpoofMultiAggregate.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/codegen/SpoofMultiAggregate.java b/src/main/java/org/apache/sysml/runtime/codegen/SpoofMultiAggregate.java
index e7e3b54..c3755d4 100644
--- a/src/main/java/org/apache/sysml/runtime/codegen/SpoofMultiAggregate.java
+++ b/src/main/java/org/apache/sysml/runtime/codegen/SpoofMultiAggregate.java
@@ -91,7 +91,7 @@ public abstract class SpoofMultiAggregate extends SpoofOperator implements Seria
 		setInitialOutputValues(c);
 		
 		//input preparation
-		SideInput[] b = prepInputMatricesAbstract(inputs);
+		SideInput[] b = prepInputMatrices(inputs);
 		double[] scalars = prepInputScalars(scalarObjects);
 		final int m = inputs.get(0).getNumRows();
 		final int n = inputs.get(0).getNumColumns();

http://git-wip-us.apache.org/repos/asf/systemml/blob/6b25b3bf/src/main/java/org/apache/sysml/runtime/codegen/SpoofOperator.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/codegen/SpoofOperator.java b/src/main/java/org/apache/sysml/runtime/codegen/SpoofOperator.java
index d3bf410..9561fcb 100644
--- a/src/main/java/org/apache/sysml/runtime/codegen/SpoofOperator.java
+++ b/src/main/java/org/apache/sysml/runtime/codegen/SpoofOperator.java
@@ -27,6 +27,7 @@ import org.apache.commons.logging.LogFactory;
 import org.apache.sysml.runtime.DMLRuntimeException;
 import org.apache.sysml.runtime.compress.CompressedMatrixBlock;
 import org.apache.sysml.runtime.instructions.cp.ScalarObject;
+import org.apache.sysml.runtime.matrix.data.LibMatrixReorg;
 import org.apache.sysml.runtime.matrix.data.MatrixBlock;
 import org.apache.sysml.runtime.util.DataConverter;
 import org.apache.sysml.runtime.util.UtilFunctions;
@@ -59,72 +60,62 @@ public abstract class SpoofOperator implements Serializable
 		return execute(inputs, scalars);
 	}
 	
-	protected double[][] prepInputMatricesDense(ArrayList<MatrixBlock> inputs) 
-		throws DMLRuntimeException 
-	{
-		return prepInputMatricesDense(inputs, 1, inputs.size()-1);
+	protected SideInput[] prepInputMatrices(ArrayList<MatrixBlock> inputs) throws DMLRuntimeException {
+		return prepInputMatrices(inputs, 1, inputs.size()-1, false, false);
 	}
 	
-	protected double[][] prepInputMatricesDense(ArrayList<MatrixBlock> inputs, int offset) 
-		throws DMLRuntimeException 
-	{
-		return prepInputMatricesDense(inputs, offset, inputs.size()-offset);
+	protected SideInput[] prepInputMatrices(ArrayList<MatrixBlock> inputs, boolean denseOnly) throws DMLRuntimeException {
+		return prepInputMatrices(inputs, 1, inputs.size()-1, denseOnly, false);
 	}
 	
-	protected double[][] prepInputMatricesDense(ArrayList<MatrixBlock> inputs, int offset, int len) 
-		throws DMLRuntimeException 
-	{
-		double[][] b = new double[len][]; 
-		for(int i=offset; i<offset+len; i++) {
-			if( inputs.get(i) instanceof CompressedMatrixBlock ) 
-				inputs.set(i, ((CompressedMatrixBlock)inputs.get(i)).decompress());
-			
-			//convert empty or sparse to dense temporary block (note: we don't do
-			//this in place because this block might be used by multiple threads)
-			if( inputs.get(i).isInSparseFormat() && inputs.get(i).isAllocated() ) {
-				MatrixBlock tmp = inputs.get(i);
-				b[i-offset] = DataConverter.convertToDoubleVector(tmp);
-				LOG.warn(getClass().getName()+": Converted "+tmp.getNumRows()+"x"+tmp.getNumColumns()+
-						", nnz="+tmp.getNonZeros()+" sideways input matrix from sparse to dense.");
-			}
-			//use existing dense block
-			else {
-				b[i-offset] = inputs.get(i).getDenseBlock();
-			}
-		}
-		
-		return b;
+	protected SideInput[] prepInputMatrices(ArrayList<MatrixBlock> inputs, int offset, boolean denseOnly) throws DMLRuntimeException {
+		return prepInputMatrices(inputs, offset, inputs.size()-offset, denseOnly, false);
 	}
 	
-	protected SideInput[] prepInputMatricesAbstract(ArrayList<MatrixBlock> inputs) 
-		throws DMLRuntimeException 
-	{
-		return prepInputMatricesAbstract(inputs, 1, inputs.size()-1);
-	}
-	
-	protected SideInput[] prepInputMatricesAbstract(ArrayList<MatrixBlock> inputs, int offset) 
-		throws DMLRuntimeException 
-	{
-		return prepInputMatricesAbstract(inputs, offset, inputs.size()-offset);
+	protected SideInput[] prepInputMatrices(ArrayList<MatrixBlock> inputs, boolean denseOnly, boolean tB1) throws DMLRuntimeException {
+		return prepInputMatrices(inputs, 1, inputs.size()-1, denseOnly, tB1);
 	}
 	
-	protected SideInput[] prepInputMatricesAbstract(ArrayList<MatrixBlock> inputs, int offset, int len) 
+	protected SideInput[] prepInputMatrices(ArrayList<MatrixBlock> inputs, int offset, int len, boolean denseOnly, boolean tB1) 
 		throws DMLRuntimeException 
 	{
 		SideInput[] b = new SideInput[len]; 
 		for(int i=offset; i<offset+len; i++) {
+			//decompress if necessary
 			if( inputs.get(i) instanceof CompressedMatrixBlock ) 
 				inputs.set(i, ((CompressedMatrixBlock)inputs.get(i)).decompress());
+			//transpose if necessary
+			int clen = inputs.get(i).getNumColumns();
+			MatrixBlock in = (tB1 && i==1 ) ? LibMatrixReorg.transpose(inputs.get(i), 
+				new MatrixBlock(clen, inputs.get(i).getNumRows(), false)) : inputs.get(i);
 			
-			if( inputs.get(i).isInSparseFormat() && inputs.get(i).isAllocated() )
-				b[i-offset] = new SideInput(null, inputs.get(i));
-			else
-				b[i-offset] = new SideInput(inputs.get(i).getDenseBlock(), null);
+			//create side input
+			if( denseOnly && (in.isInSparseFormat() || !in.isAllocated()) ) {
+				//convert empty or sparse to dense temporary block (note: we don't do
+				//this in place because this block might be used by multiple threads)
+				b[i-offset] = new SideInput(DataConverter.convertToDoubleVector(in), null, clen);
+				LOG.warn(getClass().getName()+": Converted "+in.getNumRows()+"x"+in.getNumColumns()+
+					", nnz="+in.getNonZeros()+" sideways input matrix from sparse to dense.");	
+			}
+			else if( in.isInSparseFormat() && in.isAllocated() ) {
+				b[i-offset] = new SideInput(null, in, clen);
+			}
+			else {
+				b[i-offset] = new SideInput(
+					in.getDenseBlock(), null, clen);
+			}
 		}
 		
 		return b;
 	}
 	
+	public double[][] getDenseMatrices(SideInput[] inputs) {
+		double[][] ret = new double[inputs.length][];
+		for( int i=0; i<inputs.length; i++ )
+			ret[i] = inputs[i].ddat;
+		return ret;
+	}
+	
 	protected double[] prepInputScalars(ArrayList<ScalarObject> scalarObjects) {
 		double[] scalars = new double[scalarObjects.size()]; 
 		for(int i=0; i < scalarObjects.size(); i++)
@@ -161,8 +152,8 @@ public abstract class SpoofOperator implements Serializable
 	
 	protected static double getValue(SideInput data, int rowIndex) {
 		//note: wrapper sideinput guaranteed to exist
-		return (data.dBlock!=null) ? data.dBlock[rowIndex] : 
-			(data.mBlock!=null) ? data.mBlock.quickGetValue(rowIndex, 0) : 0;
+		return (data.ddat!=null) ? data.ddat[rowIndex] : 
+			(data.mdat!=null) ? data.mdat.quickGetValue(rowIndex, 0) : 0;
 	}
 	
 	protected static double getValue(SideInput data, int n, double rowIndex, double colIndex) {
@@ -173,17 +164,19 @@ public abstract class SpoofOperator implements Serializable
 	
 	protected static double getValue(SideInput data, int n, int rowIndex, int colIndex) {
 		//note: wrapper sideinput guaranteed to exist
-		return (data.dBlock!=null) ? data.dBlock[rowIndex*n+colIndex] : 
-			(data.mBlock!=null) ? data.mBlock.quickGetValue(rowIndex, colIndex) : 0;
+		return (data.ddat!=null) ? data.ddat[rowIndex*n+colIndex] : 
+			(data.mdat!=null) ? data.mdat.quickGetValue(rowIndex, colIndex) : 0;
 	}
 	
 	public static class SideInput {
-		private final double[] dBlock;
-		private final MatrixBlock mBlock;
-	
-		public SideInput(double[] ddata, MatrixBlock mdata) {
-			dBlock = ddata;
-			mBlock = mdata;
+		public final double[] ddat;
+		public final MatrixBlock mdat;
+		public final int clen;
+	
+		public SideInput(double[] ddata, MatrixBlock mdata, int clength) {
+			ddat = ddata;
+			mdat = mdata;
+			clen = clength;
 		}
 	}
 }

http://git-wip-us.apache.org/repos/asf/systemml/blob/6b25b3bf/src/main/java/org/apache/sysml/runtime/codegen/SpoofOuterProduct.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/codegen/SpoofOuterProduct.java b/src/main/java/org/apache/sysml/runtime/codegen/SpoofOuterProduct.java
index c66d065..90c7507 100644
--- a/src/main/java/org/apache/sysml/runtime/codegen/SpoofOuterProduct.java
+++ b/src/main/java/org/apache/sysml/runtime/codegen/SpoofOuterProduct.java
@@ -79,8 +79,8 @@ public abstract class SpoofOuterProduct extends SpoofOperator
 			return new DoubleObject(0);
 		
 		//input preparation
-		double[][] ab = prepInputMatricesDense(inputs, 1, 2);
-		double[][] b = prepInputMatricesDense(inputs, 3);
+		double[][] ab = getDenseMatrices(prepInputMatrices(inputs, 1, 2, true, false));
+		double[][] b = getDenseMatrices(prepInputMatrices(inputs, 3, true));
 		double[] scalars = prepInputScalars(scalarObjects);
 		
 		//core sequential execute
@@ -112,8 +112,8 @@ public abstract class SpoofOuterProduct extends SpoofOperator
 			return new DoubleObject(0);
 		
 		//input preparation
-		double[][] ab = prepInputMatricesDense(inputs, 1, 2);
-		double[][] b = prepInputMatricesDense(inputs, 3);
+		double[][] ab = getDenseMatrices(prepInputMatrices(inputs, 1, 2, true, false));
+		double[][] b = getDenseMatrices(prepInputMatrices(inputs, 3, true));
 		double[] scalars = prepInputScalars(scalarObjects);
 		
 		//core sequential execute
@@ -179,8 +179,8 @@ public abstract class SpoofOuterProduct extends SpoofOperator
 		out.allocateDenseOrSparseBlock();
 		
 		//input preparation
-		double[][] ab = prepInputMatricesDense(inputs, 1, 2);
-		double[][] b = prepInputMatricesDense(inputs, 3);
+		double[][] ab = getDenseMatrices(prepInputMatrices(inputs, 1, 2, true, false));
+		double[][] b = getDenseMatrices(prepInputMatrices(inputs, 3, true));
 		double[] scalars = prepInputScalars(scalarObjects);
 				
 		//core sequential execute
@@ -257,8 +257,8 @@ public abstract class SpoofOuterProduct extends SpoofOperator
 		}	
 		
 		//input preparation
-		double[][] ab = prepInputMatricesDense(inputs, 1, 2);
-		double[][] b = prepInputMatricesDense(inputs, 3);
+		double[][] ab = getDenseMatrices(prepInputMatrices(inputs, 1, 2, true, false));
+		double[][] b = getDenseMatrices(prepInputMatrices(inputs, 3, true));
 		double[] scalars = prepInputScalars(scalarObjects);
 		
 		//core sequential execute

http://git-wip-us.apache.org/repos/asf/systemml/blob/6b25b3bf/src/main/java/org/apache/sysml/runtime/codegen/SpoofRowwise.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/codegen/SpoofRowwise.java b/src/main/java/org/apache/sysml/runtime/codegen/SpoofRowwise.java
index 611e4ad..13536d3 100644
--- a/src/main/java/org/apache/sysml/runtime/codegen/SpoofRowwise.java
+++ b/src/main/java/org/apache/sysml/runtime/codegen/SpoofRowwise.java
@@ -46,23 +46,32 @@ public abstract class SpoofRowwise extends SpoofOperator
 	
 	public enum RowType {
 		NO_AGG,    //no aggregation
+		NO_AGG_B1, //no aggregation w/ matrix mult B1
 		FULL_AGG,  //full row/col aggregation
 		ROW_AGG,   //row aggregation (e.g., rowSums() or X %*% v)
 		COL_AGG,   //col aggregation (e.g., colSums() or t(y) %*% X)
-		COL_AGG_T; //transposed col aggregation (e.g., t(X) %*% y)
+		COL_AGG_T, //transposed col aggregation (e.g., t(X) %*% y)
+		COL_AGG_B1,   //col aggregation w/ matrix mult B1
+		COL_AGG_B1_T; //transposed col aggregation w/ matrix mult B1
 		
 		public boolean isColumnAgg() {
-			return (this == COL_AGG || this == COL_AGG_T);
+			return (this == COL_AGG || this == COL_AGG_T)
+				|| (this == COL_AGG_B1) || (this == COL_AGG_B1_T);
 		}
+		public boolean isRowTypeB1() {
+			return (this == NO_AGG_B1) || (this == COL_AGG_B1) || (this == COL_AGG_B1_T);
+		} 
 	}
 	
 	protected final RowType _type;
 	protected final boolean _cbind0;
+	protected final boolean _tB1;
 	protected final int _reqVectMem;
 	
-	public SpoofRowwise(RowType type, boolean cbind0, int reqVectMem) {
+	public SpoofRowwise(RowType type, boolean cbind0, boolean tB1, int reqVectMem) {
 		_type = type;
 		_cbind0 = cbind0;
+		_tB1 = tB1;
 		_reqVectMem = reqVectMem;
 	}
 	
@@ -112,17 +121,18 @@ public abstract class SpoofRowwise extends SpoofOperator
 		//result allocation and preparations
 		final int m = inputs.get(0).getNumRows();
 		final int n = inputs.get(0).getNumColumns();
+		final int n2 = _type.isRowTypeB1() ? inputs.get(1).getNumColumns() : -1;
 		if( !aggIncr || !out.isAllocated() )
-			allocateOutputMatrix(m, n, out);
+			allocateOutputMatrix(m, n, n2, out);
 		double[] c = out.getDenseBlock();
 		
 		//input preparation
-		double[][] b = prepInputMatricesDense(inputs);
+		SideInput[] b = prepInputMatrices(inputs, 1, inputs.size()-1, true, _tB1);
 		double[] scalars = prepInputScalars(scalarObjects);
 		
 		//setup thread-local memory if necessary
 		if( allocTmp )
-			LibSpoofPrimitives.setupThreadLocalMemory(_reqVectMem, n);
+			LibSpoofPrimitives.setupThreadLocalMemory(_reqVectMem, n, n2);
 		
 		//core sequential execute
 		MatrixBlock a = inputs.get(0);
@@ -157,10 +167,11 @@ public abstract class SpoofRowwise extends SpoofOperator
 		//result allocation and preparations
 		final int m = inputs.get(0).getNumRows();
 		final int n = inputs.get(0).getNumColumns();
-		allocateOutputMatrix(m, n, out);
+		final int n2 = _type.isRowTypeB1() ? inputs.get(1).getNumColumns() : -1;
+		allocateOutputMatrix(m, n, n2, out);
 		
 		//input preparation
-		double[][] b = prepInputMatricesDense(inputs);
+		SideInput[] b = prepInputMatrices(inputs, 1, inputs.size()-1, true, _tB1);
 		double[] scalars = prepInputScalars(scalarObjects);
 		
 		//core parallel execute
@@ -173,10 +184,10 @@ public abstract class SpoofRowwise extends SpoofOperator
 				//execute tasks
 				ArrayList<ParColAggTask> tasks = new ArrayList<ParColAggTask>();
 				for( int i=0; i<nk & i*blklen<m; i++ )
-					tasks.add(new ParColAggTask(inputs.get(0), b, scalars, n, i*blklen, Math.min((i+1)*blklen, m)));
+					tasks.add(new ParColAggTask(inputs.get(0), b, scalars, n, n2, i*blklen, Math.min((i+1)*blklen, m)));
 				List<Future<double[]>> taskret = pool.invokeAll(tasks);	
 				//aggregate partial results
-				int len = _type.isColumnAgg() ? n : 1;
+				int len = _type.isColumnAgg() ? out.getNumRows()*out.getNumColumns() : 1;
 				for( Future<double[]> task : taskret )
 					LibMatrixMult.vectAdd(task.get(), out.getDenseBlock(), 0, 0, len);
 				out.recomputeNonZeros();
@@ -185,7 +196,7 @@ public abstract class SpoofRowwise extends SpoofOperator
 				//execute tasks
 				ArrayList<ParExecTask> tasks = new ArrayList<ParExecTask>();
 				for( int i=0; i<nk & i*blklen<m; i++ )
-					tasks.add(new ParExecTask(inputs.get(0), b, out, scalars, n, i*blklen, Math.min((i+1)*blklen, m)));
+					tasks.add(new ParExecTask(inputs.get(0), b, out, scalars, n, n2, i*blklen, Math.min((i+1)*blklen, m)));
 				List<Future<Long>> taskret = pool.invokeAll(tasks);
 				//aggregate nnz, no need to aggregate results
 				long nnz = 0;
@@ -202,18 +213,22 @@ public abstract class SpoofRowwise extends SpoofOperator
 		}
 	}
 	
-	private void allocateOutputMatrix(int m, int n, MatrixBlock out) {
+	private void allocateOutputMatrix(int m, int n, int n2, MatrixBlock out) {
 		switch( _type ) {
-			case NO_AGG: out.reset(m, n, false); break;
-			case FULL_AGG: out.reset(1, 1, false); break;
-			case ROW_AGG: out.reset(m, 1+(_cbind0?1:0), false); break;
-			case COL_AGG: out.reset(1, n, false); break;
-			case COL_AGG_T: out.reset(n, 1, false); break;
+			case NO_AGG:       out.reset(m, n, false); break;
+			case NO_AGG_B1:    out.reset(m, n2, false); break;
+			case FULL_AGG:     out.reset(1, 1, false); break;
+			case ROW_AGG:      out.reset(m, 1+(_cbind0?1:0), false); break;
+			case COL_AGG:      out.reset(1, n, false); break;
+			case COL_AGG_T:    out.reset(n, 1, false); break;
+			case COL_AGG_B1:   out.reset(n2, n, false); break;
+			case COL_AGG_B1_T: out.reset(n, n2, false); break;
+			
 		}
 		out.allocateDenseBlock();
 	}
 	
-	private void executeDense(double[] a, double[][] b, double[] scalars, double[] c, int n, int rl, int ru) 
+	private void executeDense(double[] a, SideInput[] b, double[] scalars, double[] c, int n, int rl, int ru) 
 	{
 		if( a == null )
 			return;
@@ -224,7 +239,7 @@ public abstract class SpoofRowwise extends SpoofOperator
 		}
 	}
 	
-	private void executeSparse(SparseBlock sblock, double[][] b, double[] scalars, double[] c, int n, int rl, int ru) 
+	private void executeSparse(SparseBlock sblock, SideInput[] b, double[] scalars, double[] c, int n, int rl, int ru) 
 	{
 		SparseRow empty = new SparseRowVector(1);
 		for( int i=rl; i<ru; i++ ) {
@@ -243,7 +258,7 @@ public abstract class SpoofRowwise extends SpoofOperator
 		}
 	}
 	
-	private void executeCompressed(CompressedMatrixBlock a, double[][] b, double[] scalars, double[] c, int n, int rl, int ru) 
+	private void executeCompressed(CompressedMatrixBlock a, SideInput[] b, double[] scalars, double[] c, int n, int rl, int ru) 
 	{
 		if( a.isEmptyBlock(false) )
 			return;
@@ -272,10 +287,10 @@ public abstract class SpoofRowwise extends SpoofOperator
 	//methods to be implemented by generated operators of type SpoofRowAggregate 
 	
 	protected abstract void genexec(double[] a, int ai, 
-		double[][] b, double[] scalars, double[] c, int len, int rowIndex);
+		SideInput[] b, double[] scalars, double[] c, int len, int rowIndex);
 	
 	protected abstract void genexec(double[] avals, int[] aix, int ai, 
-		double[][] b, double[] scalars, double[] c, int alen, int n, int rowIndex);
+		SideInput[] b, double[] scalars, double[] c, int alen, int n, int rowIndex);
 
 	
 	/**
@@ -284,17 +299,19 @@ public abstract class SpoofRowwise extends SpoofOperator
 	private class ParColAggTask implements Callable<double[]> 
 	{
 		private final MatrixBlock _a;
-		private final double[][] _b;
+		private final SideInput[] _b;
 		private final double[] _scalars;
 		private final int _clen;
+		private final int _clen2;
 		private final int _rl;
 		private final int _ru;
 
-		protected ParColAggTask( MatrixBlock a, double[][] b, double[] scalars, int clen, int rl, int ru ) {
+		protected ParColAggTask( MatrixBlock a, SideInput[] b, double[] scalars, int clen, int clen2, int rl, int ru ) {
 			_a = a;
 			_b = b;
 			_scalars = scalars;
 			_clen = clen;
+			_clen2 = clen2;
 			_rl = rl;
 			_ru = ru;
 		}
@@ -303,8 +320,8 @@ public abstract class SpoofRowwise extends SpoofOperator
 		public double[] call() throws DMLRuntimeException {
 			
 			//allocate vector intermediates and partial output
-			LibSpoofPrimitives.setupThreadLocalMemory(_reqVectMem, _clen);
-			double[] c = new double[_clen];
+			LibSpoofPrimitives.setupThreadLocalMemory(_reqVectMem, _clen, _clen2);
+			double[] c = new double[(_clen2>0)?_clen*_clen2 : _clen];
 			
 			if( _a instanceof CompressedMatrixBlock )
 				executeCompressed((CompressedMatrixBlock)_a, _b, _scalars, c, _clen, _rl, _ru);
@@ -324,19 +341,21 @@ public abstract class SpoofRowwise extends SpoofOperator
 	private class ParExecTask implements Callable<Long> 
 	{
 		private final MatrixBlock _a;
-		private final double[][] _b;
+		private final SideInput[] _b;
 		private final MatrixBlock _c;
 		private final double[] _scalars;
 		private final int _clen;
+		private final int _clen2;
 		private final int _rl;
 		private final int _ru;
 
-		protected ParExecTask( MatrixBlock a, double[][] b, MatrixBlock c, double[] scalars, int clen, int rl, int ru ) {
+		protected ParExecTask( MatrixBlock a, SideInput[] b, MatrixBlock c, double[] scalars, int clen, int clen2, int rl, int ru ) {
 			_a = a;
 			_b = b;
 			_c = c;
 			_scalars = scalars;
 			_clen = clen;
+			_clen2 = clen2;
 			_rl = rl;
 			_ru = ru;
 		}
@@ -344,7 +363,7 @@ public abstract class SpoofRowwise extends SpoofOperator
 		@Override
 		public Long call() throws DMLRuntimeException {
 			//allocate vector intermediates
-			LibSpoofPrimitives.setupThreadLocalMemory(_reqVectMem, _clen);
+			LibSpoofPrimitives.setupThreadLocalMemory(_reqVectMem, _clen, _clen2);
 			
 			if( _a instanceof CompressedMatrixBlock )
 				executeCompressed((CompressedMatrixBlock)_a, _b, _scalars, _c.getDenseBlock(), _clen, _rl, _ru);

http://git-wip-us.apache.org/repos/asf/systemml/blob/6b25b3bf/src/main/java/org/apache/sysml/runtime/instructions/spark/SpoofSPInstruction.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/instructions/spark/SpoofSPInstruction.java b/src/main/java/org/apache/sysml/runtime/instructions/spark/SpoofSPInstruction.java
index 622944d..663e269 100644
--- a/src/main/java/org/apache/sysml/runtime/instructions/spark/SpoofSPInstruction.java
+++ b/src/main/java/org/apache/sysml/runtime/instructions/spark/SpoofSPInstruction.java
@@ -313,7 +313,8 @@ public class SpoofSPInstruction extends SPInstruction
 			}
 			
 			//setup local memory for reuse
-			LibSpoofPrimitives.setupThreadLocalMemory(_op.getNumIntermediates(), _clen);
+			int clen2 = (int) (_op.getRowType().isRowTypeB1() ? _vectors.get(0).getNumCols() : -1);
+			LibSpoofPrimitives.setupThreadLocalMemory(_op.getNumIntermediates(), _clen, clen2);
 			
 			ArrayList<Tuple2<MatrixIndexes,MatrixBlock>> ret = new ArrayList<Tuple2<MatrixIndexes,MatrixBlock>>();
 			boolean aggIncr = (_op.getRowType().isColumnAgg() //aggregate entire partition

http://git-wip-us.apache.org/repos/asf/systemml/blob/6b25b3bf/src/main/java/org/apache/sysml/runtime/instructions/spark/data/PartitionedBroadcast.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/instructions/spark/data/PartitionedBroadcast.java b/src/main/java/org/apache/sysml/runtime/instructions/spark/data/PartitionedBroadcast.java
index 1a7aeb3..c58eb91 100644
--- a/src/main/java/org/apache/sysml/runtime/instructions/spark/data/PartitionedBroadcast.java
+++ b/src/main/java/org/apache/sysml/runtime/instructions/spark/data/PartitionedBroadcast.java
@@ -54,6 +54,14 @@ public class PartitionedBroadcast<T extends CacheBlock> implements Serializable
 	public Broadcast<PartitionedBlock<T>>[] getBroadcasts() {
 		return _pbc;
 	}
+	
+	public long getNumRows() {
+		return _pbc[0].value().getNumRows();
+	}
+	
+	public long getNumCols() {
+		return _pbc[0].value().getNumCols();
+	}
 
 	public int getNumRowBlocks() {
 		return _pbc[0].value().getNumRowBlocks();

http://git-wip-us.apache.org/repos/asf/systemml/blob/6b25b3bf/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixMult.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixMult.java b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixMult.java
index 0ed0090..8159dc9 100644
--- a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixMult.java
+++ b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixMult.java
@@ -42,27 +42,14 @@ import org.apache.sysml.runtime.matrix.operators.ReorgOperator;
 import org.apache.sysml.runtime.util.UtilFunctions;
 
 /**
- * MB:
- * Library for matrix multiplications including MM, MV, VV for all
+ * MB: Library for matrix multiplications including MM, MV, VV for all
  * combinations of dense, sparse, ultrasparse representations and special
  * operations such as transpose-self matrix multiplication.
- * 
+ * <p>
  * In general all implementations use internally dense outputs
  * for direct access, but change the final result to sparse if necessary.
  * The only exceptions are ultra-sparse matrix mult, wsloss and wsigmoid.  
- * 
- * NOTES on BLAS:
- * * Experiments in 04/2013 showed that even on dense-dense this implementation 
- *   is 3x faster than f2j-BLAS-DGEMM, 2x faster than f2c-BLAS-DGEMM, and
- *   level (+10% after JIT) with a native C implementation. 
- * * Calling native BLAS would loose platform independence and would require 
- *   JNI calls incl data transfer. Furthermore, BLAS does not support sparse 
- *   matrices (except Sparse BLAS, with dedicated function calls and matrix formats) 
- *   and would be an external dependency. 
- * * Experiments in 02/2014 showed that on dense-dense this implementation now achieves
- *   almost 30% peak FP performance. Compared to Intel MKL 11.1 (dgemm, N=1000) it is
- *   just 3.2x (sparsity=1.0) and 1.9x (sparsity=0.5) slower, respectively.  
- *  
+ *
  */
 public class LibMatrixMult 
 {
@@ -3065,7 +3052,7 @@ public class LibMatrixMult
 			c[ ci+7 ] += aval1 * b[ bi1+7 ] + aval2 * b[ bi2+7 ] + aval3 * b[ bi3+7 ] + aval4 * b[ bi4+7 ];	
 		}
 	}
-
+	
 	@SuppressWarnings("unused")
 	private static void vectMultiplyAdd( final double aval, double[] b, double[] c, int[] bix, final int ci, final int len )
 	{
@@ -3492,12 +3479,16 @@ public class LibMatrixMult
 		return ret;
 	}
 
-	private static boolean checkPrepMatrixMultRightInput( MatrixBlock m1, MatrixBlock m2 )
-	{
+	private static boolean checkPrepMatrixMultRightInput( MatrixBlock m1, MatrixBlock m2 ) {
 		//transpose if dense-dense, skinny rhs matrix (not vector), and memory guarded by output 
 		return (LOW_LEVEL_OPTIMIZATION && !m1.sparse && !m2.sparse 
-				&& m1.rlen > m2.clen && m2.rlen > 64 && m2.clen > 1 && m2.clen < 64
-				&& 8*m2.rlen*m2.clen < 256*1024 ); //rhs fits in L2 cache
+			&& isSkinnyRightHandSide(m1.rlen, m1.clen, m2.rlen, m2.clen));
+	}
+	
+	//note: public for use by codegen for consistency
+	public static boolean isSkinnyRightHandSide(long m1rlen, long m1clen, long m2rlen, long m2clen) {
+		return m1rlen > m2clen && m2rlen > m2clen && m2clen > 1 
+			&& m2clen < 64 && 8*m2rlen*m2clen < L2_CACHESIZE;
 	}
 
 	private static boolean checkParMatrixMultRightInputRows( MatrixBlock m1, MatrixBlock m2, int k ) {

http://git-wip-us.apache.org/repos/asf/systemml/blob/6b25b3bf/src/test/java/org/apache/sysml/test/integration/functions/codegen/RowAggTmplTest.java
----------------------------------------------------------------------
diff --git a/src/test/java/org/apache/sysml/test/integration/functions/codegen/RowAggTmplTest.java b/src/test/java/org/apache/sysml/test/integration/functions/codegen/RowAggTmplTest.java
index 182adf4..e32056a 100644
--- a/src/test/java/org/apache/sysml/test/integration/functions/codegen/RowAggTmplTest.java
+++ b/src/test/java/org/apache/sysml/test/integration/functions/codegen/RowAggTmplTest.java
@@ -59,6 +59,10 @@ public class RowAggTmplTest extends AutomatedTestBase
 	private static final String TEST_NAME21 = TEST_NAME+"21"; //sum(X/rowSums(X))
 	private static final String TEST_NAME22 = TEST_NAME+"22"; //((7+X)+(X-7)+exp(X))/(rowMins(X)+0.5) 
 	private static final String TEST_NAME23 = TEST_NAME+"23"; //L2SVM outer loop 
+	private static final String TEST_NAME24 = TEST_NAME+"24"; //t(X)%*%(w*(X%*%v)), w/ mm 
+	private static final String TEST_NAME25 = TEST_NAME+"25"; //-2*(X%*%t(C))+t(rowSums(C^2)), w/ mm
+	private static final String TEST_NAME26 = TEST_NAME+"26"; //t(P)%*%X, w/ mm
+	private static final String TEST_NAME27 = TEST_NAME+"27"; //t(X)%*%(X%*%v), w/ mm 
 	
 	private static final String TEST_DIR = "functions/codegen/";
 	private static final String TEST_CLASS_DIR = TEST_DIR + RowAggTmplTest.class.getSimpleName() + "/";
@@ -70,7 +74,7 @@ public class RowAggTmplTest extends AutomatedTestBase
 	@Override
 	public void setUp() {
 		TestUtils.clearAssertionInformation();
-		for(int i=1; i<=23; i++)
+		for(int i=1; i<=27; i++)
 			addTestConfiguration( TEST_NAME+i, new TestConfiguration(TEST_CLASS_DIR, TEST_NAME+i, new String[] { String.valueOf(i) }) );
 	}
 	
@@ -419,6 +423,66 @@ public class RowAggTmplTest extends AutomatedTestBase
 		testCodegenIntegration( TEST_NAME23, false, ExecType.SPARK );
 	}
 	
+	@Test	
+	public void testCodegenRowAggRewrite24CP() {
+		testCodegenIntegration( TEST_NAME24, true, ExecType.CP );
+	}
+	
+	@Test
+	public void testCodegenRowAgg24CP() {
+		testCodegenIntegration( TEST_NAME24, false, ExecType.CP );
+	}
+	
+	@Test
+	public void testCodegenRowAgg24SP() {
+		testCodegenIntegration( TEST_NAME24, false, ExecType.SPARK );
+	}
+	
+	@Test	
+	public void testCodegenRowAggRewrite25CP() {
+		testCodegenIntegration( TEST_NAME25, true, ExecType.CP );
+	}
+	
+	@Test
+	public void testCodegenRowAgg25CP() {
+		testCodegenIntegration( TEST_NAME25, false, ExecType.CP );
+	}
+	
+	@Test
+	public void testCodegenRowAgg25SP() {
+		testCodegenIntegration( TEST_NAME25, false, ExecType.SPARK );
+	}
+	
+	@Test	
+	public void testCodegenRowAggRewrite26CP() {
+		testCodegenIntegration( TEST_NAME26, true, ExecType.CP );
+	}
+	
+	@Test
+	public void testCodegenRowAgg26CP() {
+		testCodegenIntegration( TEST_NAME26, false, ExecType.CP );
+	}
+	
+	@Test
+	public void testCodegenRowAgg26SP() {
+		testCodegenIntegration( TEST_NAME26, false, ExecType.SPARK );
+	}
+	
+	@Test	
+	public void testCodegenRowAggRewrite27CP() {
+		testCodegenIntegration( TEST_NAME27, true, ExecType.CP );
+	}
+	
+	@Test
+	public void testCodegenRowAgg27CP() {
+		testCodegenIntegration( TEST_NAME27, false, ExecType.CP );
+	}
+	
+	@Test
+	public void testCodegenRowAgg27SP() {
+		testCodegenIntegration( TEST_NAME27, false, ExecType.SPARK );
+	}
+	
 	private void testCodegenIntegration( String testname, boolean rewrites, ExecType instType )
 	{	
 		boolean oldFlag = OptimizerUtils.ALLOW_ALGEBRAIC_SIMPLIFICATION;

http://git-wip-us.apache.org/repos/asf/systemml/blob/6b25b3bf/src/test/scripts/functions/codegen/rowAggPattern24.R
----------------------------------------------------------------------
diff --git a/src/test/scripts/functions/codegen/rowAggPattern24.R b/src/test/scripts/functions/codegen/rowAggPattern24.R
new file mode 100644
index 0000000..5510437
--- /dev/null
+++ b/src/test/scripts/functions/codegen/rowAggPattern24.R
@@ -0,0 +1,33 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+# 
+#   http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+args<-commandArgs(TRUE)
+options(digits=22)
+library("Matrix")
+library("matrixStats")
+
+X = matrix(seq(1,6000)/6000, 600, 10, byrow=TRUE);
+w = matrix(seq(1,2400)/2400, 600, 4, byrow=TRUE);
+v = matrix(seq(1,40)/40, 10, 4, byrow=TRUE);
+
+R = t(X) %*% (w * (X %*% v));
+
+writeMM(as(R, "CsparseMatrix"), paste(args[2], "S", sep="")); 

http://git-wip-us.apache.org/repos/asf/systemml/blob/6b25b3bf/src/test/scripts/functions/codegen/rowAggPattern24.dml
----------------------------------------------------------------------
diff --git a/src/test/scripts/functions/codegen/rowAggPattern24.dml b/src/test/scripts/functions/codegen/rowAggPattern24.dml
new file mode 100644
index 0000000..200d552
--- /dev/null
+++ b/src/test/scripts/functions/codegen/rowAggPattern24.dml
@@ -0,0 +1,30 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+# 
+#   http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+
+X = matrix(seq(1,6000)/6000, 600, 10);
+w = matrix(seq(1,2400)/2400, 600, 4);
+v = matrix(seq(1,40)/40, 10, 4);
+if(1==1){}
+
+R = t(X) %*% (w * (X %*% v));
+
+write(R, $1)

http://git-wip-us.apache.org/repos/asf/systemml/blob/6b25b3bf/src/test/scripts/functions/codegen/rowAggPattern25.R
----------------------------------------------------------------------
diff --git a/src/test/scripts/functions/codegen/rowAggPattern25.R b/src/test/scripts/functions/codegen/rowAggPattern25.R
new file mode 100644
index 0000000..0e881bc
--- /dev/null
+++ b/src/test/scripts/functions/codegen/rowAggPattern25.R
@@ -0,0 +1,32 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+# 
+#   http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+args<-commandArgs(TRUE)
+options(digits=22)
+library("Matrix")
+library("matrixStats")
+
+X = matrix(seq(1,6000), 600, 10, byrow=TRUE);
+C = matrix(seq(1,40), 4, 10, byrow=TRUE);
+
+R = -2 * (X %*% t(C)) + matrix(1,nrow(X),1) %*% t(rowSums(C^2))
+
+writeMM(as(R, "CsparseMatrix"), paste(args[2], "S", sep="")); 

http://git-wip-us.apache.org/repos/asf/systemml/blob/6b25b3bf/src/test/scripts/functions/codegen/rowAggPattern25.dml
----------------------------------------------------------------------
diff --git a/src/test/scripts/functions/codegen/rowAggPattern25.dml b/src/test/scripts/functions/codegen/rowAggPattern25.dml
new file mode 100644
index 0000000..fa8775e
--- /dev/null
+++ b/src/test/scripts/functions/codegen/rowAggPattern25.dml
@@ -0,0 +1,29 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+# 
+#   http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+
+X = matrix(seq(1,6000), 600, 10);
+C = matrix(seq(1,40), 4, 10);
+if(1==1){}
+
+R = -2 * (X %*% t(C)) + t(rowSums(C^2))
+
+write(R, $1)

http://git-wip-us.apache.org/repos/asf/systemml/blob/6b25b3bf/src/test/scripts/functions/codegen/rowAggPattern26.R
----------------------------------------------------------------------
diff --git a/src/test/scripts/functions/codegen/rowAggPattern26.R b/src/test/scripts/functions/codegen/rowAggPattern26.R
new file mode 100644
index 0000000..736c376
--- /dev/null
+++ b/src/test/scripts/functions/codegen/rowAggPattern26.R
@@ -0,0 +1,32 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+# 
+#   http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+args<-commandArgs(TRUE)
+options(digits=22)
+library("Matrix")
+library("matrixStats")
+
+X = matrix(seq(1,6000), 600, 10, byrow=TRUE);
+P = matrix(seq(1,3000), 600, 5, byrow=TRUE);
+
+R = t(P) %*% X;
+
+writeMM(as(R, "CsparseMatrix"), paste(args[2], "S", sep="")); 

http://git-wip-us.apache.org/repos/asf/systemml/blob/6b25b3bf/src/test/scripts/functions/codegen/rowAggPattern26.dml
----------------------------------------------------------------------
diff --git a/src/test/scripts/functions/codegen/rowAggPattern26.dml b/src/test/scripts/functions/codegen/rowAggPattern26.dml
new file mode 100644
index 0000000..f84b556
--- /dev/null
+++ b/src/test/scripts/functions/codegen/rowAggPattern26.dml
@@ -0,0 +1,28 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+# 
+#   http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+X = matrix(seq(1,6000), 600, 10);  # 600x10 from 1..6000; presumably filled row-wise (DML default) to match byrow=TRUE in the R script - confirm
+P = matrix(seq(1,3000), 600, 5)  # 600x5 from 1..3000; same fill-order assumption as X
+if(1==1){}  # NOTE(review): empty always-true branch; presumably forces a statement-block cut so the inputs are compiled separately from the product - confirm
+# Pattern under test: matrix product with a transposed left-hand side.
+R = t(P) %*% X;  # (5x600) %*% (600x10) -> 5x10
+# Write the result to the path passed as the first script argument.
+write(R, $1)

http://git-wip-us.apache.org/repos/asf/systemml/blob/6b25b3bf/src/test/scripts/functions/codegen/rowAggPattern27.R
----------------------------------------------------------------------
diff --git a/src/test/scripts/functions/codegen/rowAggPattern27.R b/src/test/scripts/functions/codegen/rowAggPattern27.R
new file mode 100644
index 0000000..4909732
--- /dev/null
+++ b/src/test/scripts/functions/codegen/rowAggPattern27.R
@@ -0,0 +1,32 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+# 
+#   http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+args<-commandArgs(TRUE)  # trailing command-line arguments only; args[2] is used as the output prefix below
+options(digits=22)  # print numbers with up to 22 significant digits
+library("Matrix")  # provides writeMM and the "CsparseMatrix" class used below
+library("matrixStats")  # loaded, but none of its functions are called in this script
+# Inputs are generated deterministically from scaled sequences; no input files are read.
+X = matrix(seq(1,6000)/6000, 600, 10, byrow=TRUE);  # 600x10, values k/6000 for k=1..6000, filled row by row
+v = matrix(seq(1,40)/40, 10, 4, byrow=TRUE);  # 10x4, values k/40 for k=1..40, filled row by row
+# Reference computation: chained product t(X) %*% (X %*% v) with a four-column right-hand side.
+R = t(X) %*% (X %*% v);  # X %*% v is 600x4; t(X) %*% (600x4) -> 10x4
+# Write the result in MatrixMarket format.
+writeMM(as(R, "CsparseMatrix"), paste(args[2], "S", sep="")); # output path is args[2] with "S" appended

http://git-wip-us.apache.org/repos/asf/systemml/blob/6b25b3bf/src/test/scripts/functions/codegen/rowAggPattern27.dml
----------------------------------------------------------------------
diff --git a/src/test/scripts/functions/codegen/rowAggPattern27.dml b/src/test/scripts/functions/codegen/rowAggPattern27.dml
new file mode 100644
index 0000000..c5254c2
--- /dev/null
+++ b/src/test/scripts/functions/codegen/rowAggPattern27.dml
@@ -0,0 +1,29 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+# 
+#   http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+
+X = matrix(seq(1,6000)/6000, 600, 10);  # 600x10 from k/6000, k=1..6000; presumably filled row-wise (DML default) to match byrow=TRUE in the R script - confirm
+v = matrix(seq(1,40)/40, 10, 4);  # 10x4 from k/40, k=1..40; same fill-order assumption as X
+if(1==1){}  # NOTE(review): empty always-true branch; presumably forces a statement-block cut so the inputs are compiled separately from the product chain - confirm
+# Pattern under test: chained product t(X) %*% (X %*% v) with a four-column right-hand side.
+R = t(X) %*% (X %*% v);  # X %*% v is 600x4; t(X) %*% (600x4) -> 10x4
+# Write the result to the path passed as the first script argument.
+write(R, $1)