You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@systemml.apache.org by mb...@apache.org on 2017/04/18 07:15:46 UTC

incubator-systemml git commit: [SYSTEMML-1509] Extended codegen compiler integration (runtime plans)

Repository: incubator-systemml
Updated Branches:
  refs/heads/master cd49f224a -> 2bf61b476


[SYSTEMML-1509] Extended codegen compiler integration (runtime plans)

This patch extends the current hop-level compiler integration of codegen
with a runtime-level integration, which leaves the original HOP DAGs
unaltered in order to preserve the full fusion potential, because partial
fusion can limit that potential during dynamic recompilation.

Furthermore, this also includes a number of smaller fixes:
* Fix multi-aggregate compilation (constraint on equal input sizes)
* Fix compilation of unnecessary rowwise templates w/ a single operation
* Fix constant compilation in predicate hop dags during initial compile
* Fix construction of invalid cell templates w/ row template references
* Fix the reset of hop dag visit status (which can cause corruption)
* Fix the validity check that probes for common template inputs


Project: http://git-wip-us.apache.org/repos/asf/incubator-systemml/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-systemml/commit/2bf61b47
Tree: http://git-wip-us.apache.org/repos/asf/incubator-systemml/tree/2bf61b47
Diff: http://git-wip-us.apache.org/repos/asf/incubator-systemml/diff/2bf61b47

Branch: refs/heads/master
Commit: 2bf61b47673ee9ddd68d69dac15db88539bac135
Parents: cd49f22
Author: Matthias Boehm <mb...@gmail.com>
Authored: Tue Apr 18 00:11:06 2017 -0700
Committer: Matthias Boehm <mb...@gmail.com>
Committed: Tue Apr 18 00:11:06 2017 -0700

----------------------------------------------------------------------
 .../java/org/apache/sysml/api/DMLScript.java    |  12 +-
 .../sysml/hops/codegen/SpoofCompiler.java       | 146 ++++++++++++++++---
 .../sysml/hops/codegen/cplan/CNodeTpl.java      |   5 +-
 .../template/PlanSelectionFuseCostBased.java    |  19 ++-
 .../hops/codegen/template/TemplateCell.java     |   9 +-
 .../sysml/hops/rewrite/HopRewriteUtils.java     |   8 +-
 .../org/apache/sysml/parser/DMLTranslator.java  |  11 +-
 .../runtime/codegen/SpoofMultiAggregate.java    |   2 +-
 8 files changed, 174 insertions(+), 38 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/2bf61b47/src/main/java/org/apache/sysml/api/DMLScript.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/api/DMLScript.java b/src/main/java/org/apache/sysml/api/DMLScript.java
index dce9053..84bbf77 100644
--- a/src/main/java/org/apache/sysml/api/DMLScript.java
+++ b/src/main/java/org/apache/sysml/api/DMLScript.java
@@ -57,6 +57,7 @@ import org.apache.sysml.hops.HopsException;
 import org.apache.sysml.hops.OptimizerUtils;
 import org.apache.sysml.hops.OptimizerUtils.OptimizationLevel;
 import org.apache.sysml.hops.codegen.SpoofCompiler;
+import org.apache.sysml.hops.codegen.SpoofCompiler.IntegrationType;
 import org.apache.sysml.hops.codegen.SpoofCompiler.PlanCachePolicy;
 import org.apache.sysml.hops.globalopt.GlobalOptimizerWrapper;
 import org.apache.sysml.lops.Lop;
@@ -596,13 +597,14 @@ public class DMLScript
 		//Step 5: rewrite HOP DAGs (incl IPA and memory estimates)
 		dmlt.rewriteHopsDAG(prog);
 
-		//Step 5.1: Generate code for the rewrited Hop dags 
+		//Step 5.1: Generate code for the rewritten Hop dags 
 		if( dmlconf.getBooleanValue(DMLConfig.CODEGEN) ){
 			SpoofCompiler.PLAN_CACHE_POLICY = PlanCachePolicy.get(
 					dmlconf.getBooleanValue(DMLConfig.CODEGEN_PLANCACHE),
 					dmlconf.getIntValue(DMLConfig.CODEGEN_LITERALS)==2);
 			SpoofCompiler.setExecTypeSpecificJavaCompiler();
-			dmlt.codgenHopsDAG(prog);
+			if( SpoofCompiler.INTEGRATION==IntegrationType.HOPS )
+				dmlt.codgenHopsDAG(prog);
 		}
 		
 		//Step 6: construct lops (incl exec type and op selection)
@@ -617,6 +619,12 @@ public class DMLScript
 		//Step 7: generate runtime program
 		Program rtprog = prog.getRuntimeProgram(dmlconf);
 
+		//Step 7.1: Generate code for the rewritten Hop dags w/o modify
+		if( dmlconf.getBooleanValue(DMLConfig.CODEGEN) 
+			&& SpoofCompiler.INTEGRATION==IntegrationType.RUNTIME ){
+			dmlt.codgenHopsDAG(rtprog);
+		}
+		
 		//Step 8: [optional global data flow optimization]
 		if(OptimizerUtils.isOptLevel(OptimizationLevel.O4_GLOBAL_TIME_MEMORY) ) 
 		{

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/2bf61b47/src/main/java/org/apache/sysml/hops/codegen/SpoofCompiler.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/hops/codegen/SpoofCompiler.java b/src/main/java/org/apache/sysml/hops/codegen/SpoofCompiler.java
index 43b88b0..816b7ae 100644
--- a/src/main/java/org/apache/sysml/hops/codegen/SpoofCompiler.java
+++ b/src/main/java/org/apache/sysml/hops/codegen/SpoofCompiler.java
@@ -19,6 +19,7 @@
 
 package org.apache.sysml.hops.codegen;
 
+import java.io.IOException;
 import java.util.ArrayList;
 import java.util.Arrays;
 import java.util.HashMap;
@@ -38,6 +39,7 @@ import org.apache.sysml.hops.codegen.cplan.CNodeCell;
 import org.apache.sysml.hops.codegen.cplan.CNodeData;
 import org.apache.sysml.hops.codegen.cplan.CNodeMultiAgg;
 import org.apache.sysml.hops.codegen.cplan.CNodeOuterProduct;
+import org.apache.sysml.hops.codegen.cplan.CNodeRow;
 import org.apache.sysml.hops.codegen.cplan.CNodeTernary;
 import org.apache.sysml.hops.codegen.cplan.CNodeTernary.TernaryType;
 import org.apache.sysml.hops.codegen.cplan.CNodeTpl;
@@ -52,6 +54,8 @@ import org.apache.sysml.hops.codegen.template.PlanSelectionFuseNoRedundancy;
 import org.apache.sysml.hops.codegen.template.CPlanMemoTable.MemoTableEntry;
 import org.apache.sysml.hops.codegen.template.CPlanMemoTable.MemoTableEntrySet;
 import org.apache.sysml.hops.codegen.template.TemplateUtils;
+import org.apache.sysml.hops.recompile.RecompileStatus;
+import org.apache.sysml.hops.recompile.Recompiler;
 import org.apache.sysml.hops.Hop;
 import org.apache.sysml.hops.Hop.OpOp1;
 import org.apache.sysml.hops.HopsException;
@@ -61,6 +65,7 @@ import org.apache.sysml.hops.rewrite.ProgramRewriteStatus;
 import org.apache.sysml.hops.rewrite.ProgramRewriter;
 import org.apache.sysml.hops.rewrite.RewriteCommonSubexpressionElimination;
 import org.apache.sysml.hops.rewrite.RewriteRemoveUnnecessaryCasts;
+import org.apache.sysml.lops.LopsException;
 import org.apache.sysml.parser.DMLProgram;
 import org.apache.sysml.parser.ForStatement;
 import org.apache.sysml.parser.ForStatementBlock;
@@ -76,6 +81,14 @@ import org.apache.sysml.parser.Expression.DataType;
 import org.apache.sysml.runtime.DMLRuntimeException;
 import org.apache.sysml.runtime.codegen.CodegenUtils;
 import org.apache.sysml.runtime.codegen.SpoofCellwise.CellType;
+import org.apache.sysml.runtime.controlprogram.ForProgramBlock;
+import org.apache.sysml.runtime.controlprogram.FunctionProgramBlock;
+import org.apache.sysml.runtime.controlprogram.IfProgramBlock;
+import org.apache.sysml.runtime.controlprogram.LocalVariableMap;
+import org.apache.sysml.runtime.controlprogram.Program;
+import org.apache.sysml.runtime.controlprogram.ProgramBlock;
+import org.apache.sysml.runtime.controlprogram.WhileProgramBlock;
+import org.apache.sysml.runtime.instructions.Instruction;
 import org.apache.sysml.runtime.matrix.data.Pair;
 import org.apache.sysml.utils.Explain;
 import org.apache.sysml.utils.Statistics;
@@ -87,6 +100,7 @@ public class SpoofCompiler
 	//internal configuration flags
 	public static boolean LDEBUG                      = false;
 	public static CompilerType JAVA_COMPILER          = CompilerType.JANINO; 
+	public static IntegrationType INTEGRATION         = IntegrationType.HOPS;
 	public static final boolean RECOMPILE_CODEGEN     = true;
 	public static final boolean PRUNE_REDUNDANT_PLANS = true;
 	public static PlanCachePolicy PLAN_CACHE_POLICY   = PlanCachePolicy.CSLH;
@@ -98,6 +112,11 @@ public class SpoofCompiler
 		JANINO,
 	}
 	
+	public enum IntegrationType {
+		HOPS,
+		RUNTIME,
+	}
+	
 	public enum PlanSelector {
 		FUSE_ALL,             //maximal fusion, possible w/ redundant compute
 		FUSE_NO_REDUNDANCY,   //fusion without redundant compute 
@@ -131,24 +150,36 @@ public class SpoofCompiler
 			new RewriteCommonSubexpressionElimination(true),
 			new RewriteRemoveUnnecessaryCasts());
 	
-	public static void generateCode(DMLProgram dmlp) 
+	public static void generateCode(DMLProgram dmlprog) 
 		throws LanguageException, HopsException, DMLRuntimeException
 	{
 		// for each namespace, handle function statement blocks
-		for (String namespaceKey : dmlp.getNamespaces().keySet()) {
-			for (String fname : dmlp.getFunctionStatementBlocks(namespaceKey).keySet()) {
-				FunctionStatementBlock fsblock = dmlp.getFunctionStatementBlock(namespaceKey,fname);
+		for (String namespaceKey : dmlprog.getNamespaces().keySet()) {
+			for (String fname : dmlprog.getFunctionStatementBlocks(namespaceKey).keySet()) {
+				FunctionStatementBlock fsblock = dmlprog.getFunctionStatementBlock(namespaceKey,fname);
 				generateCodeFromStatementBlock(fsblock);
 			}
 		}
 		
 		// handle regular statement blocks in "main" method
-		for (int i = 0; i < dmlp.getNumStatementBlocks(); i++) {
-			StatementBlock current = dmlp.getStatementBlock(i);
+		for (int i = 0; i < dmlprog.getNumStatementBlocks(); i++) {
+			StatementBlock current = dmlprog.getStatementBlock(i);
 			generateCodeFromStatementBlock(current);
 		}
 	}
 	
+	public static void generateCode(Program rtprog) 
+		throws LanguageException, HopsException, DMLRuntimeException, LopsException, IOException
+	{
+		// handle all function program blocks
+		for( FunctionProgramBlock pb : rtprog.getFunctionProgramBlocks().values() )
+			generateCodeFromProgramBlock(pb);
+		
+		// handle regular program blocks in "main" method
+		for( ProgramBlock pb : rtprog.getProgramBlocks() )
+			generateCodeFromProgramBlock(pb);
+	}
+	
 	public static void generateCodeFromStatementBlock(StatementBlock current)
 		throws HopsException, DMLRuntimeException
 	{		
@@ -163,7 +194,7 @@ public class SpoofCompiler
 		{
 			WhileStatementBlock wsb = (WhileStatementBlock) current;
 			WhileStatement wstmt = (WhileStatement)wsb.getStatement(0);
-			wsb.setPredicateHops(optimize(wsb.getPredicateHops(), true));
+			wsb.setPredicateHops(optimize(wsb.getPredicateHops(), false));
 			for (StatementBlock sb : wstmt.getBody())
 				generateCodeFromStatementBlock(sb);
 		}	
@@ -171,7 +202,7 @@ public class SpoofCompiler
 		{
 			IfStatementBlock isb = (IfStatementBlock) current;
 			IfStatement istmt = (IfStatement)isb.getStatement(0);
-			isb.setPredicateHops(optimize(isb.getPredicateHops(), true));
+			isb.setPredicateHops(optimize(isb.getPredicateHops(), false));
 			for (StatementBlock sb : istmt.getIfBody())
 				generateCodeFromStatementBlock(sb);
 			for (StatementBlock sb : istmt.getElseBody())
@@ -181,9 +212,9 @@ public class SpoofCompiler
 		{
 			ForStatementBlock fsb = (ForStatementBlock) current;
 			ForStatement fstmt = (ForStatement)fsb.getStatement(0);
-			fsb.setFromHops(optimize(fsb.getFromHops(), true));
-			fsb.setToHops(optimize(fsb.getToHops(), true));
-			fsb.setIncrementHops(optimize(fsb.getIncrementHops(), true));
+			fsb.setFromHops(optimize(fsb.getFromHops(), false));
+			fsb.setToHops(optimize(fsb.getToHops(), false));
+			fsb.setIncrementHops(optimize(fsb.getIncrementHops(), false));
 			for (StatementBlock sb : fstmt.getBody())
 				generateCodeFromStatementBlock(sb);
 		}
@@ -193,6 +224,56 @@ public class SpoofCompiler
 			current.updateRecompilationFlag();
 		}
 	}
+	
+	public static void generateCodeFromProgramBlock(ProgramBlock current)
+		throws HopsException, DMLRuntimeException, LopsException, IOException
+	{		
+		if (current instanceof FunctionProgramBlock)
+		{
+			FunctionProgramBlock fsb = (FunctionProgramBlock)current;
+			for (ProgramBlock pb : fsb.getChildBlocks())
+				generateCodeFromProgramBlock(pb);
+		}
+		else if (current instanceof WhileProgramBlock)
+		{
+			WhileProgramBlock wpb = (WhileProgramBlock) current;
+			WhileStatementBlock wsb = (WhileStatementBlock)wpb.getStatementBlock();
+			
+			if( wsb!=null && wsb.getPredicateHops()!=null )
+				wpb.setPredicate(generateCodeFromHopDAGsToInst(wsb.getPredicateHops()));
+			for (ProgramBlock sb : wpb.getChildBlocks())
+				generateCodeFromProgramBlock(sb);
+		}
+		else if (current instanceof IfProgramBlock)
+		{
+			IfProgramBlock ipb = (IfProgramBlock) current;
+			IfStatementBlock isb = (IfStatementBlock) ipb.getStatementBlock();
+			if( isb!=null && isb.getPredicateHops()!=null )
+				ipb.setPredicate(generateCodeFromHopDAGsToInst(isb.getPredicateHops()));
+			for (ProgramBlock pb : ipb.getChildBlocksIfBody())
+				generateCodeFromProgramBlock(pb);
+			for (ProgramBlock pb : ipb.getChildBlocksElseBody())
+				generateCodeFromProgramBlock(pb);
+		}
+		else if (current instanceof ForProgramBlock) //incl parfor
+		{
+			ForProgramBlock fpb = (ForProgramBlock) current;
+			ForStatementBlock fsb = (ForStatementBlock) fpb.getStatementBlock();
+			if( fsb!=null && fsb.getFromHops()!=null )
+				fpb.setFromInstructions(generateCodeFromHopDAGsToInst(fsb.getFromHops()));
+			if( fsb!=null && fsb.getToHops()!=null )
+				fpb.setToInstructions(generateCodeFromHopDAGsToInst(fsb.getToHops()));
+			if( fsb!=null && fsb.getIncrementHops()!=null )
+				fpb.setIncrementInstructions(generateCodeFromHopDAGsToInst(fsb.getIncrementHops()));
+			for (ProgramBlock pb : fpb.getChildBlocks())
+				generateCodeFromProgramBlock(pb);
+		}
+		else //generic (last-level)
+		{
+			StatementBlock sb = current.getStatementBlock();
+			current.setInstructions( generateCodeFromHopDAGsToInst(sb, sb.get_hops()) );
+		}
+	}
 
 	public static ArrayList<Hop> generateCodeFromHopDAGs(ArrayList<Hop> roots) 
 		throws HopsException, DMLRuntimeException
@@ -207,6 +288,22 @@ public class SpoofCompiler
 		return optimized;
 	}
 	
+	public static ArrayList<Instruction> generateCodeFromHopDAGsToInst(StatementBlock sb, ArrayList<Hop> roots) 
+		throws DMLRuntimeException, HopsException, LopsException, IOException 
+	{
+		//create copy of hop dag, call codegen, and generate instructions
+		return Recompiler.recompileHopsDag(sb, roots, 
+			new LocalVariableMap(), new RecompileStatus(), false, 0);
+	}
+	
+	public static ArrayList<Instruction> generateCodeFromHopDAGsToInst(Hop root) 
+		throws DMLRuntimeException, HopsException, LopsException, IOException 
+	{
+		//create copy of hop dag, call codegen, and generate instructions
+		return Recompiler.recompileHopsDag(root, 
+			new LocalVariableMap(), new RecompileStatus(), false, 0);
+	}
+	
 	
 	/**
 	 * Main interface of sum-product optimizer, predicate dag.
@@ -321,6 +418,8 @@ public class SpoofCompiler
 			Statistics.incrementCodegenDAGCompile();
 			Statistics.incrementCodegenCompileTime(System.nanoTime()-t0);
 		}
+		
+		Hop.resetVisitStatus(roots);
 			
 		return ret;
 	}
@@ -370,11 +469,12 @@ public class SpoofCompiler
 		memo.pruneSuboptimal(roots);
 		
 		//construct actual cplan representations
+		//note: we do not use the hop visit status due to jumps over fused operators which would
+		//corrupt subsequent resets, leaving partial hops dags in visited status
 		LinkedHashMap<Long, Pair<Hop[],CNodeTpl>> ret = new LinkedHashMap<Long, Pair<Hop[],CNodeTpl>>();
-		Hop.resetVisitStatus(roots);
+		HashSet<Long> visited = new HashSet<Long>();
 		for( Hop hop : roots )
-			rConstructCPlans(hop, memo, ret, compileLiterals);
-		Hop.resetVisitStatus(roots);
+			rConstructCPlans(hop, memo, ret, compileLiterals, visited);
 		
 		return ret;
 	}
@@ -447,11 +547,11 @@ public class SpoofCompiler
 		return P;
 	}
 	
-	private static void rConstructCPlans(Hop hop, CPlanMemoTable memo, HashMap<Long, Pair<Hop[],CNodeTpl>> cplans, boolean compileLiterals) 
+	private static void rConstructCPlans(Hop hop, CPlanMemoTable memo, HashMap<Long, Pair<Hop[],CNodeTpl>> cplans, boolean compileLiterals, HashSet<Long> visited) 
 		throws DMLException
 	{		
 		//top-down memoization of processed dag nodes
-		if( hop == null || hop.isVisited() )
+		if( hop == null || visited.contains(hop.getHopID()) )
 			return;
 		
 		//generate cplan for existing memo table entry
@@ -466,14 +566,14 @@ public class SpoofCompiler
 		//process children recursively, but skip compiled operator
 		if( cplans.containsKey(hop.getHopID()) ) {
 			for( Hop c : cplans.get(hop.getHopID()).getKey() )
-				rConstructCPlans(c, memo, cplans, compileLiterals);
+				rConstructCPlans(c, memo, cplans, compileLiterals, visited);
 		}
 		else {
 			for( Hop c : hop.getInput() )
-				rConstructCPlans(c, memo, cplans, compileLiterals);	
+				rConstructCPlans(c, memo, cplans, compileLiterals, visited);	
 		}
 		
-		hop.setVisited();
+		visited.add(hop.getHopID());
 	}
 	
 	////////////////////
@@ -536,7 +636,8 @@ public class SpoofCompiler
 				hnew = HopRewriteUtils.createUnary(hnew, OpOp1.CAST_AS_MATRIX);
 			}
 			
-			HopRewriteUtils.rewireAllParentChildReferences(hop, hnew);
+			if( !(tmpCNode instanceof CNodeMultiAgg) )
+				HopRewriteUtils.rewireAllParentChildReferences(hop, hnew);
 			memo.add(hnew.getHopID());
 		}
 		
@@ -613,8 +714,9 @@ public class SpoofCompiler
 			}
 			
 			//remove cplan w/ single op and w/o agg
-			if( tpl instanceof CNodeCell && ((((CNodeCell)tpl).getCellType()==CellType.NO_AGG
-				&& TemplateUtils.hasSingleOperation(tpl))|| TemplateUtils.hasNoOperation(tpl)) ) 
+			if( (tpl instanceof CNodeCell && ((((CNodeCell)tpl).getCellType()==CellType.NO_AGG
+				&& TemplateUtils.hasSingleOperation(tpl))|| TemplateUtils.hasNoOperation(tpl)))
+				|| tpl instanceof CNodeRow && TemplateUtils.hasSingleOperation(tpl)) 
 				cplans2.remove(e.getKey());
 				
 			//remove cplan if empty

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/2bf61b47/src/main/java/org/apache/sysml/hops/codegen/cplan/CNodeTpl.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/hops/codegen/cplan/CNodeTpl.java b/src/main/java/org/apache/sysml/hops/codegen/cplan/CNodeTpl.java
index e6da944..673ab10 100644
--- a/src/main/java/org/apache/sysml/hops/codegen/cplan/CNodeTpl.java
+++ b/src/main/java/org/apache/sysml/hops/codegen/cplan/CNodeTpl.java
@@ -240,7 +240,7 @@ public abstract class CNodeTpl extends CNode implements Cloneable
 	}
 	
 	protected static boolean equalInputReferences(CNode current1, CNode current2, ArrayList<CNode> input1, ArrayList<CNode> input2) {
-		boolean ret = (input1.size() == input2.size());
+		boolean ret = (current1.getInput().size() == current2.getInput().size());
 		
 		//process childs recursively
 		for( int i=0; ret && i<current1.getInput().size(); i++ )
@@ -248,7 +248,8 @@ public abstract class CNodeTpl extends CNode implements Cloneable
 					current1.getInput().get(i), current2.getInput().get(i), input1, input2);
 		
 		if( ret && current1 instanceof CNodeData ) {
-			ret &= indexOf(input1, (CNodeData)current1)
+			ret &= current2 instanceof CNodeData
+				&& indexOf(input1, (CNodeData)current1)
 				== indexOf(input2, (CNodeData)current2);
 		}
 		

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/2bf61b47/src/main/java/org/apache/sysml/hops/codegen/template/PlanSelectionFuseCostBased.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/hops/codegen/template/PlanSelectionFuseCostBased.java b/src/main/java/org/apache/sysml/hops/codegen/template/PlanSelectionFuseCostBased.java
index 8ba2490..ae2b076 100644
--- a/src/main/java/org/apache/sysml/hops/codegen/template/PlanSelectionFuseCostBased.java
+++ b/src/main/java/org/apache/sysml/hops/codegen/template/PlanSelectionFuseCostBased.java
@@ -281,6 +281,7 @@ public class PlanSelectionFuseCostBased extends PlanSelection
 		Hop.resetVisitStatus(roots);
 		for( Hop hop : roots )
 			rCollectFullAggregates(hop, fullAggs);
+		Hop.resetVisitStatus(roots);
 
 		//remove operators with assigned multi-agg plans
 		Iterator<Long> iter = fullAggs.iterator();
@@ -365,10 +366,18 @@ public class PlanSelectionFuseCostBased extends PlanSelection
 	}
 	
 	private static boolean isValidMultiAggregate(CPlanMemoTable memo, MemoTableEntry me) {
+		//ensure input consistent sizes (otherwise potential for incorrect results)
+		boolean ret = true;
+		Hop refSize = memo._hopRefs.get(me.input1).getInput().get(0);
+		for( int i=1; ret && i<3; i++ ) {
+			if( me.isPlanRef(i) )
+				ret &= HopRewriteUtils.isEqualSize(refSize, 
+					memo._hopRefs.get(me.input(i)).getInput().get(0));
+		}
+		
 		//ensure that aggregates are independent of each other, i.e.,
 		//they to not have potentially transitive parent child references
-		boolean ret = true;
-		for( int i=0; i<3; i++ ) 
+		for( int i=0; ret && i<3; i++ ) 
 			if( me.isPlanRef(i) ) {
 				HashSet<Long> probe = new HashSet<Long>();
 				for( int j=0; j<3; j++ )
@@ -891,8 +900,12 @@ public class PlanSelectionFuseCostBased extends PlanSelection
 			for( Long hopID : _aggregates.keySet() )
 				ret &= !that._inputAggs.contains(hopID);
 			//check partial shared reads
-			return ret && !CollectionUtils.intersection(
+			ret &= !CollectionUtils.intersection(
 				_fusedInputs, that._fusedInputs).isEmpty();
+			//check consistent sizes (result correctness)
+			return ret && HopRewriteUtils.isEqualSize(
+				_aggregates.values().iterator().next().getInput().get(0),
+				that._aggregates.values().iterator().next().getInput().get(0));
 		}
 		public AggregateInfo merge(AggregateInfo that) {
 			_aggregates.putAll(that._aggregates);

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/2bf61b47/src/main/java/org/apache/sysml/hops/codegen/template/TemplateCell.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/hops/codegen/template/TemplateCell.java b/src/main/java/org/apache/sysml/hops/codegen/template/TemplateCell.java
index d5ac99c..434fa59 100644
--- a/src/main/java/org/apache/sysml/hops/codegen/template/TemplateCell.java
+++ b/src/main/java/org/apache/sysml/hops/codegen/template/TemplateCell.java
@@ -146,8 +146,15 @@ public class TemplateCell extends TemplateBase
 		if( tmp.containsKey(hop.getHopID()) )
 			return;
 		
-		//recursively process required childs
 		MemoTableEntry me = memo.getBest(hop.getHopID(), TemplateType.CellTpl);
+		
+		//recursively process required childs
+		if( me!=null && (me.type == TemplateType.RowTpl || me.type == TemplateType.OuterProdTpl) ) {
+			CNodeData cdata = TemplateUtils.createCNodeData(hop, compileLiterals);	
+			tmp.put(hop.getHopID(), cdata);
+			inHops.add(hop);
+			return;
+		}
 		for( int i=0; i<hop.getInput().size(); i++ ) {
 			Hop c = hop.getInput().get(i);
 			if( me!=null && me.isPlanRef(i) && !(c instanceof DataOp)

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/2bf61b47/src/main/java/org/apache/sysml/hops/rewrite/HopRewriteUtils.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/hops/rewrite/HopRewriteUtils.java b/src/main/java/org/apache/sysml/hops/rewrite/HopRewriteUtils.java
index 0bad2f6..ba5d1ab 100644
--- a/src/main/java/org/apache/sysml/hops/rewrite/HopRewriteUtils.java
+++ b/src/main/java/org/apache/sysml/hops/rewrite/HopRewriteUtils.java
@@ -224,10 +224,8 @@ public class HopRewriteUtils
 	
 	
 
-	public static int getChildReferencePos( Hop parent, Hop child )
-	{
-		ArrayList<Hop> childs = parent.getInput();
-		return childs.indexOf(child);
+	public static int getChildReferencePos( Hop parent, Hop child ) {
+		return parent.getInput().indexOf(child);
 	}
 	
 	public static void removeChildReference( Hop parent, Hop child ) {
@@ -263,7 +261,7 @@ public class HopRewriteUtils
 	public static void rewireAllParentChildReferences( Hop hold, Hop hnew ) {
 		ArrayList<Hop> parents = new ArrayList<Hop>(hold.getParent());
 		for( Hop lparent : parents )
-			HopRewriteUtils.replaceChildReference(lparent, hold, hnew);	
+			HopRewriteUtils.replaceChildReference(lparent, hold, hnew);
 	}
 	
 	public static void replaceChildReference( Hop parent, Hop inOld, Hop inNew ) {

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/2bf61b47/src/main/java/org/apache/sysml/parser/DMLTranslator.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/parser/DMLTranslator.java b/src/main/java/org/apache/sysml/parser/DMLTranslator.java
index 2b8128b..6e4db6e 100644
--- a/src/main/java/org/apache/sysml/parser/DMLTranslator.java
+++ b/src/main/java/org/apache/sysml/parser/DMLTranslator.java
@@ -71,6 +71,7 @@ import org.apache.sysml.parser.Expression.ParameterizedBuiltinFunctionOp;
 import org.apache.sysml.parser.Expression.ValueType;
 import org.apache.sysml.parser.PrintStatement.PRINTTYPE;
 import org.apache.sysml.runtime.DMLRuntimeException;
+import org.apache.sysml.runtime.controlprogram.Program;
 
 
 public class DMLTranslator 
@@ -280,10 +281,16 @@ public class DMLTranslator
 		resetHopsDAGVisitStatus(dmlp);
 	}
 	
-	public void codgenHopsDAG(DMLProgram dmlp) 
+	public void codgenHopsDAG(DMLProgram dmlp)
 		throws LanguageException, HopsException, DMLRuntimeException 
 	{
-		SpoofCompiler.generateCode(dmlp);	
+		SpoofCompiler.generateCode(dmlp);
+	}
+	
+	public void codgenHopsDAG(Program rtprog)
+		throws LanguageException, HopsException, DMLRuntimeException, LopsException, IOException 
+	{
+		SpoofCompiler.generateCode(rtprog);
 	}
 	
 	public void constructLops(DMLProgram dmlp) throws ParseException, LanguageException, HopsException, LopsException {

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/2bf61b47/src/main/java/org/apache/sysml/runtime/codegen/SpoofMultiAggregate.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/codegen/SpoofMultiAggregate.java b/src/main/java/org/apache/sysml/runtime/codegen/SpoofMultiAggregate.java
index 0de4916..fd72631 100644
--- a/src/main/java/org/apache/sysml/runtime/codegen/SpoofMultiAggregate.java
+++ b/src/main/java/org/apache/sysml/runtime/codegen/SpoofMultiAggregate.java
@@ -125,7 +125,7 @@ public abstract class SpoofMultiAggregate extends SpoofOperator implements Seria
 		}
 	
 		//post-processing
-		out.recomputeNonZeros();	
+		out.recomputeNonZeros();
 		out.examSparsity();	
 	}