You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@systemml.apache.org by mb...@apache.org on 2017/05/30 05:36:29 UTC

[2/2] incubator-systemml git commit: [SYSTEMML-1289] Codegen outer product ops over compressed matrices

[SYSTEMML-1289] Codegen outer product ops over compressed matrices

This patch extends the codegen outer product template with support for
compressed matrices. The basic approach is to use sparse-safe column
iterators to process all relevant values without decompression. For left
outer product operations, we parallelize over column group partitions
and extend the column group iterator accordingly.

Furthermore, this patch also includes a variety of smaller bug fixes:

1) Fix ColGroupOffset iterator (OLE, RLE) to handle the case of
iterators without row indexes in a specified row range.
2) Fix result correctness of sparse cell-wise outer products (output
indexing).
3) Fix outer product handling of empty input matrices.
4) Fix the unnecessary casting of the number of non-zeros to int, which
could lead to truncation/overflows because it's represented as long.


Project: http://git-wip-us.apache.org/repos/asf/incubator-systemml/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-systemml/commit/9dad2fe8
Tree: http://git-wip-us.apache.org/repos/asf/incubator-systemml/tree/9dad2fe8
Diff: http://git-wip-us.apache.org/repos/asf/incubator-systemml/diff/9dad2fe8

Branch: refs/heads/master
Commit: 9dad2fe8b39d698b6b289955e249e27dc4e2b8a1
Parents: 8738121
Author: Matthias Boehm <mb...@gmail.com>
Authored: Mon May 29 22:24:11 2017 -0700
Committer: Matthias Boehm <mb...@gmail.com>
Committed: Mon May 29 22:36:13 2017 -0700

----------------------------------------------------------------------
 .../sysml/hops/codegen/SpoofCompiler.java       |   2 +
 .../hops/codegen/cplan/CNodeOuterProduct.java   |   4 +-
 .../runtime/codegen/SpoofOuterProduct.java      | 214 +++++++++++----
 .../sysml/runtime/compress/ColGroupOLE.java     |   2 +
 .../sysml/runtime/compress/ColGroupOffset.java  |  35 +--
 .../sysml/runtime/compress/ColGroupRLE.java     |   2 +
 .../runtime/compress/CompressedMatrixBlock.java |  19 +-
 .../codegen/CompressedCellwiseTest.java         |   1 +
 .../codegen/CompressedMultiAggregateTest.java   |   1 +
 .../codegen/CompressedOuterProductTest.java     | 267 +++++++++++++++++++
 .../codegen/CompressedOuterProductMain.R        |  36 +++
 .../codegen/CompressedOuterProductMain.dml      |  33 +++
 .../functions/codegen/ZPackageSuite.java        |   1 +
 13 files changed, 535 insertions(+), 82 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/9dad2fe8/src/main/java/org/apache/sysml/hops/codegen/SpoofCompiler.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/hops/codegen/SpoofCompiler.java b/src/main/java/org/apache/sysml/hops/codegen/SpoofCompiler.java
index d87c107..d96dda1 100644
--- a/src/main/java/org/apache/sysml/hops/codegen/SpoofCompiler.java
+++ b/src/main/java/org/apache/sysml/hops/codegen/SpoofCompiler.java
@@ -411,6 +411,8 @@ public class SpoofCompiler
 			}
 		}
 		catch( Exception ex ) {
+			LOG.error("Codegen failed to optimize the following HOP DAG: \n" + 
+				Explain.explainHops(roots));
 			throw new DMLRuntimeException(ex);
 		}
 		

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/9dad2fe8/src/main/java/org/apache/sysml/hops/codegen/cplan/CNodeOuterProduct.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/hops/codegen/cplan/CNodeOuterProduct.java b/src/main/java/org/apache/sysml/hops/codegen/cplan/CNodeOuterProduct.java
index 0dd6d62..aaf6f20 100644
--- a/src/main/java/org/apache/sysml/hops/codegen/cplan/CNodeOuterProduct.java
+++ b/src/main/java/org/apache/sysml/hops/codegen/cplan/CNodeOuterProduct.java
@@ -39,10 +39,10 @@ public class CNodeOuterProduct extends CNodeTpl
 			+ "  public %TMP%() {\n"
 			+ "    _outerProductType = OutProdType.%TYPE%;\n"
 			+ "  }\n"
-			+ "  protected void genexecDense( double a, double[] a1, int a1i, double[] a2, int a2i, double[][] b, double[] scalars, double[] c, int ci, int n, int m, int k, int rowIndex, int colIndex) { \n"
+			+ "  protected void genexecDense(double a, double[] a1, int a1i, double[] a2, int a2i, double[][] b, double[] scalars, double[] c, int ci, int m, int n, int k, int rowIndex, int colIndex) { \n"
 			+ "%BODY_dense%"
 			+ "  }\n"
-			+ "  protected double genexecCellwise( double a, double[] a1, int a1i, double[] a2, int a2i, double[][] b, double[] scalars, int n, int m, int k, int rowIndex, int colIndex) { \n"
+			+ "  protected double genexecCellwise(double a, double[] a1, int a1i, double[] a2, int a2i, double[][] b, double[] scalars, int m, int n, int k, int rowIndex, int colIndex) { \n"
 			+ "%BODY_cellwise%"
 			+ "    return %OUT_cellwise%;\n"
 			+ "  }\n"			

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/9dad2fe8/src/main/java/org/apache/sysml/runtime/codegen/SpoofOuterProduct.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/codegen/SpoofOuterProduct.java b/src/main/java/org/apache/sysml/runtime/codegen/SpoofOuterProduct.java
index 64470ea..9d0203e 100644
--- a/src/main/java/org/apache/sysml/runtime/codegen/SpoofOuterProduct.java
+++ b/src/main/java/org/apache/sysml/runtime/codegen/SpoofOuterProduct.java
@@ -21,6 +21,7 @@ package org.apache.sysml.runtime.codegen;
 
 import java.util.ArrayList;
 import java.util.Arrays;
+import java.util.Iterator;
 import java.util.List;
 import java.util.concurrent.Callable;
 import java.util.concurrent.ExecutorService;
@@ -28,8 +29,10 @@ import java.util.concurrent.Executors;
 import java.util.concurrent.Future;
 
 import org.apache.sysml.runtime.DMLRuntimeException;
+import org.apache.sysml.runtime.compress.CompressedMatrixBlock;
 import org.apache.sysml.runtime.instructions.cp.DoubleObject;
 import org.apache.sysml.runtime.instructions.cp.ScalarObject;
+import org.apache.sysml.runtime.matrix.data.IJV;
 import org.apache.sysml.runtime.matrix.data.MatrixBlock;
 import org.apache.sysml.runtime.matrix.data.SparseBlock;
 
@@ -72,6 +75,8 @@ public abstract class SpoofOuterProduct extends SpoofOperator
 		//sanity check
 		if( inputs==null || inputs.size() < 3 )
 			throw new RuntimeException("Invalid input arguments.");
+		if( inputs.get(0).isEmptyBlock(false) )
+			return new DoubleObject(0);
 		
 		//input preparation
 		double[][] ab = prepInputMatricesDense(inputs, 1, 2);
@@ -87,10 +92,12 @@ public abstract class SpoofOuterProduct extends SpoofOperator
 		MatrixBlock out = new MatrixBlock(1, 1, false);
 		out.allocateDenseBlock();
 		
-		if(!a.isInSparseFormat())
-			executeCellwiseDense(a.getDenseBlock(), ab[0], ab[1], b, scalars, out.getDenseBlock(), n, m, k, _outerProductType, 0, m, 0, n);
+		if( a instanceof CompressedMatrixBlock )
+			executeCellwiseCompressed((CompressedMatrixBlock)a, ab[0], ab[1], b, scalars, out, m, n, k, _outerProductType, 0, m, 0, n);
+		else if( !a.isInSparseFormat() )
+			executeCellwiseDense(a.getDenseBlock(), ab[0], ab[1], b, scalars, out.getDenseBlock(), m, n, k, _outerProductType, 0, m, 0, n);
 		else
-			executeCellwiseSparse(a.getSparseBlock(), ab[0], ab[1], b, scalars, out, n, m, k, (int) a.getNonZeros(), _outerProductType, 0, m, 0, n);
+			executeCellwiseSparse(a.getSparseBlock(), ab[0], ab[1], b, scalars, out, m, n, k, a.getNonZeros(), _outerProductType, 0, m, 0, n);
 		return new DoubleObject(out.getDenseBlock()[0]);
 	}
 	
@@ -101,6 +108,8 @@ public abstract class SpoofOuterProduct extends SpoofOperator
 		//sanity check
 		if( inputs==null || inputs.size() < 3 )
 			throw new RuntimeException("Invalid input arguments.");
+		if( inputs.get(0).isEmptyBlock(false) )
+			return new DoubleObject(0);
 		
 		//input preparation
 		double[][] ab = prepInputMatricesDense(inputs, 1, 2);
@@ -114,14 +123,14 @@ public abstract class SpoofOuterProduct extends SpoofOperator
 		double sum = 0;
 		
 		try 
-		{			
+		{
 			ExecutorService pool = Executors.newFixedThreadPool(k);
 			ArrayList<ParOuterProdAggTask> tasks = new ArrayList<ParOuterProdAggTask>();			
 			//create tasks (for wdivmm-left, parallelization over columns;
 			//for wdivmm-right, parallelization over rows; both ensure disjoint results)
 			int blklen = (int)(Math.ceil((double)m/numThreads));
 			for( int i=0; i<numThreads & i*blklen<m; i++ )
-				tasks.add(new ParOuterProdAggTask(inputs.get(0), ab[0], ab[1], b, scalars, n, m, k, _outerProductType, i*blklen, Math.min((i+1)*blklen,m), 0, n));
+				tasks.add(new ParOuterProdAggTask(inputs.get(0), ab[0], ab[1], b, scalars, m, n, k, _outerProductType, i*blklen, Math.min((i+1)*blklen,m), 0, n));
 			//execute tasks
 			List<Future<Double>> taskret = pool.invokeAll(tasks);
 			pool.shutdown();
@@ -154,17 +163,20 @@ public abstract class SpoofOuterProduct extends SpoofOperator
 		if(_outerProductType == OutProdType.CELLWISE_OUTER_PRODUCT) {
 			//assign it to the time and sparse representation of the major input matrix
 			out.reset(inputs.get(0).getNumRows(), inputs.get(0).getNumColumns(), inputs.get(0).isInSparseFormat());
-			out.allocateDenseOrSparseBlock();
 		}		
 		else {	
 			//if left outerproduct gives a value of k*n instead of n*k, change it back to n*k and then transpose the output
 			//if(_outerProductType == OutProdType.LEFT_OUTER_PRODUCT &&  out.getNumRows() == inputs.get(2).getNumColumns() &&  out.getNumColumns() == inputs.get(2).getNumRows())
 			if(_outerProductType == OutProdType.LEFT_OUTER_PRODUCT )
-				out.reset(inputs.get(0).getNumColumns(),inputs.get(1).getNumColumns()); // n*k
+				out.reset(inputs.get(0).getNumColumns(), inputs.get(1).getNumColumns(), false); // n*k
 			else if(_outerProductType == OutProdType.RIGHT_OUTER_PRODUCT )
-				out.reset(inputs.get(0).getNumRows(),inputs.get(1).getNumColumns()); // m*k
-			out.allocateDenseBlock();
-		}			
+				out.reset(inputs.get(0).getNumRows(), inputs.get(1).getNumColumns(), false); // m*k
+		}
+		
+		//check for empty inputs; otherwise allocate result
+		if( inputs.get(0).isEmptyBlock(false) )
+			return;
+		out.allocateDenseOrSparseBlock();
 		
 		//input preparation
 		double[][] ab = prepInputMatricesDense(inputs, 1, 2);
@@ -181,17 +193,21 @@ public abstract class SpoofOuterProduct extends SpoofOperator
 		switch(_outerProductType) {
 			case LEFT_OUTER_PRODUCT:	
 			case RIGHT_OUTER_PRODUCT:
-				if( !a.isInSparseFormat() )
-					executeDense(a.getDenseBlock(), ab[0], ab[1], b, scalars, out.getDenseBlock(), n, m, k, _outerProductType, 0, m, 0, n);
+				if( a instanceof CompressedMatrixBlock )
+					executeCompressed((CompressedMatrixBlock)a, ab[0], ab[1], b, scalars, out.getDenseBlock(), m, n, k, _outerProductType, 0, m, 0, n);
+				else if( !a.isInSparseFormat() )
+					executeDense(a.getDenseBlock(), ab[0], ab[1], b, scalars, out.getDenseBlock(), m, n, k, _outerProductType, 0, m, 0, n);
 				else
-					executeSparse(a.getSparseBlock(), ab[0], ab[1], b, scalars, out.getDenseBlock(), n, m, k, (int) a.getNonZeros(), _outerProductType, 0, m, 0, n);
+					executeSparse(a.getSparseBlock(), ab[0], ab[1], b, scalars, out.getDenseBlock(), m, n, k, a.getNonZeros(), _outerProductType, 0, m, 0, n);
 				break;
 				
 			case CELLWISE_OUTER_PRODUCT:
-				if( !a.isInSparseFormat() )
-					executeCellwiseDense(a.getDenseBlock(), ab[0], ab[1], b, scalars, out.getDenseBlock(), n, m, k, _outerProductType, 0, m, 0, n);
+				if( a instanceof CompressedMatrixBlock )
+					executeCellwiseCompressed((CompressedMatrixBlock)a, ab[0], ab[1], b, scalars, out, m, n, k, _outerProductType, 0, m, 0, n);
+				else if( !a.isInSparseFormat() )
+					executeCellwiseDense(a.getDenseBlock(), ab[0], ab[1], b, scalars, out.getDenseBlock(), m, n, k, _outerProductType, 0, m, 0, n);
 				else 
-					executeCellwiseSparse(a.getSparseBlock(), ab[0], ab[1], b, scalars, out, n, m, k, (int) a.getNonZeros(), _outerProductType, 0, m, 0, n);
+					executeCellwiseSparse(a.getSparseBlock(), ab[0], ab[1], b, scalars, out, m, n, k, a.getNonZeros(), _outerProductType, 0, m, 0, n);
 				break;
 	
 			case AGG_OUTER_PRODUCT:
@@ -199,6 +215,9 @@ public abstract class SpoofOuterProduct extends SpoofOperator
 		}
 		
 		//post-processing
+		if( a instanceof CompressedMatrixBlock && out.isInSparseFormat()
+			&& _outerProductType == OutProdType.CELLWISE_OUTER_PRODUCT )
+			out.sortSparseRows();
 		out.recomputeNonZeros();
 		out.examSparsity();
 	}
@@ -230,10 +249,10 @@ public abstract class SpoofOuterProduct extends SpoofOperator
 		{
 			//if left outerproduct gives a value of k*n instead of n*k, change it back to n*k and then transpose the output
 			//if(_outerProductType == OutProdType.LEFT_OUTER_PRODUCT &&  out.getNumRows() == inputs.get(2).getNumColumns() &&  out.getNumColumns() == inputs.get(2).getNumRows())
-			if(_outerProductType == OutProdType.LEFT_OUTER_PRODUCT )
-				out.reset(inputs.get(0).getNumColumns(),inputs.get(1).getNumColumns()); // n*k
-			else if(_outerProductType == OutProdType.RIGHT_OUTER_PRODUCT )
-				out.reset(inputs.get(0).getNumRows(),inputs.get(1).getNumColumns()); // m*k
+			if( _outerProductType == OutProdType.LEFT_OUTER_PRODUCT )
+				out.reset(inputs.get(0).getNumColumns(),inputs.get(1).getNumColumns(), false); // n*k
+			else if( _outerProductType == OutProdType.RIGHT_OUTER_PRODUCT )
+				out.reset(inputs.get(0).getNumRows(),inputs.get(1).getNumColumns(), false); // m*k
 			out.allocateDenseBlock();
 		}	
 		
@@ -247,6 +266,8 @@ public abstract class SpoofOuterProduct extends SpoofOperator
 		final int n = inputs.get(0).getNumColumns();	
 		final int k = inputs.get(1).getNumColumns(); // rank
 		
+		MatrixBlock a = inputs.get(0);
+		
 		try 
 		{			
 			ExecutorService pool = Executors.newFixedThreadPool(numThreads);
@@ -255,14 +276,25 @@ public abstract class SpoofOuterProduct extends SpoofOperator
 			//for wdivmm-right, parallelization over rows; both ensure disjoint results)
 			
 			if( _outerProductType == OutProdType.LEFT_OUTER_PRODUCT ) {
-				int blklen = (int)(Math.ceil((double)n/numThreads));
-				for( int j=0; j<numThreads & j*blklen<n; j++ )
-					tasks.add(new ParExecTask(inputs.get(0), ab[0], ab[1], b, scalars, out, n, m, k, _outerProductType,  0, m, j*blklen, Math.min((j+1)*blklen, n)));
+				if( a instanceof CompressedMatrixBlock ) {
+					//parallelize over column groups
+					int numCG = ((CompressedMatrixBlock)a).getNumColGroups();
+					int blklen = (int)(Math.ceil((double)numCG/numThreads));
+					for( int j=0; j<numThreads & j*blklen<numCG; j++ )
+						tasks.add(new ParExecTask(a, ab[0], ab[1], b, scalars, out, m, n, k, _outerProductType,  0, m, j*blklen, Math.min((j+1)*blklen, numCG)));
+				}
+				else {
+					//parallelize over column partitions
+					int blklen = (int)(Math.ceil((double)n/numThreads));
+					for( int j=0; j<numThreads & j*blklen<n; j++ )
+						tasks.add(new ParExecTask(a, ab[0], ab[1], b, scalars, out, m, n, k, _outerProductType,  0, m, j*blklen, Math.min((j+1)*blklen, n)));
+				}
 			}
-			else { ///right // cellwise
+			else { //right or cell-wise
+				//parallelize over row partitions
 				int blklen = (int)(Math.ceil((double)m/numThreads));
 				for( int i=0; i<numThreads & i*blklen<m; i++ )
-					tasks.add(new ParExecTask(inputs.get(0), ab[0], ab[1], b, scalars, out, n, m, k, _outerProductType, i*blklen, Math.min((i+1)*blklen,m), 0, n));
+					tasks.add(new ParExecTask(a, ab[0], ab[1], b, scalars, out, m, n, k, _outerProductType, i*blklen, Math.min((i+1)*blklen,m), 0, n));
 			}
 			List<Future<Long>> taskret = pool.invokeAll(tasks);
 			pool.shutdown();
@@ -271,13 +303,20 @@ public abstract class SpoofOuterProduct extends SpoofOperator
 		} 
 		catch (Exception e) {
 			throw new DMLRuntimeException(e);
-		} 
+		}
 		
 		//post-processing
+		if( a instanceof CompressedMatrixBlock ) { 
+			if( out.isInSparseFormat() && _outerProductType == OutProdType.CELLWISE_OUTER_PRODUCT )
+				out.sortSparseRows();
+			else if( _outerProductType == OutProdType.LEFT_OUTER_PRODUCT )
+				out.recomputeNonZeros();
+		}
 		out.examSparsity();
 	}
 	
-	private void executeDense(double[] a, double[] u, double[] v, double[][] b, double[] scalars , double[] c, int n, int m, int k, OutProdType type, int rl, int ru, int cl, int cu ) 
+	private void executeDense(double[] a, double[] u, double[] v, double[][] b, double[] scalars, 
+		double[] c, int m, int n, int k, OutProdType type, int rl, int ru, int cl, int cu ) 
 	{
 		//approach: iterate over non-zeros of w, selective mm computation
 		//cache-conscious blocking: due to blocksize constraint (default 1000),
@@ -296,12 +335,13 @@ public abstract class SpoofOuterProduct extends SpoofOperator
 					for( int j=bj, vix=bj*k; j<bjmin; j++, vix+=k)
 						if( a[ix+j] != 0 ) {
 							cix = (type == OutProdType.LEFT_OUTER_PRODUCT) ? vix : uix;
-							genexecDense( a[ix+j], u, uix, v, vix, b, scalars, c, cix, n, m, k, i,j);//(ix+j)/n, (ix+j)%n ); 
+							genexecDense( a[ix+j], u, uix, v, vix, b, scalars, c, cix, m, n, k, i, j); 
 						}
 			}
 	}
 	
-	private void executeCellwiseDense(double[] a, double[] u, double[] v, double[][] b, double[] scalars , double[] c, int n, int m, int k, OutProdType type, int rl, int ru, int cl, int cu ) 
+	private void executeCellwiseDense(double[] a, double[] u, double[] v, double[][] b, double[] scalars, 
+		double[] c, int m, int n, int k, OutProdType type, int rl, int ru, int cl, int cu ) 
 	{
 		//approach: iterate over non-zeros of w, selective mm computation
 		//cache-conscious blocking: due to blocksize constraint (default 1000),
@@ -320,14 +360,15 @@ public abstract class SpoofOuterProduct extends SpoofOperator
 						if( a[ix+j] != 0 ) {
 							//int cix = (type == OutProdType.LEFT_OUTER_PRODUCT) ? vix : uix;
 							if(type == OutProdType.CELLWISE_OUTER_PRODUCT)
-								c[ix+j] = genexecCellwise( a[ix+j], u, uix, v, vix, b, scalars, n, m, k, i, j ); 
+								c[ix+j] = genexecCellwise( a[ix+j], u, uix, v, vix, b, scalars, m, n, k, i, j ); 
 							else
-								c[0]  += genexecCellwise( a[ix+j], u, uix, v, vix, b, scalars, n, m, k, i, j); // (ix+j)/n, (ix+j)%n ); 	
+								c[0]  += genexecCellwise( a[ix+j], u, uix, v, vix, b, scalars, m, n, k, i, j); 	
 						}
 			}
 	}
 	
-	private void executeSparse(SparseBlock sblock,  double[] u, double[] v, double[][] b, double[] scalars , double[] c, int n, int m, int k, int nnz, OutProdType type, int rl, int ru, int cl, int cu) 
+	private void executeSparse(SparseBlock sblock,  double[] u, double[] v, double[][] b, double[] scalars, 
+		double[] c, int m, int n, int k, long nnz, OutProdType type, int rl, int ru, int cl, int cu) 
 	{
 		boolean left = (_outerProductType== OutProdType.LEFT_OUTER_PRODUCT);
 		
@@ -364,7 +405,7 @@ public abstract class SpoofOuterProduct extends SpoofOperator
 					int index = wpos + curk[i-bi];
 					for( ; index<wpos+wlen && wix[index]<bjmin; index++ ) {
 						genexecDense( wval[index], u, uix, v, wix[index]*k, b, scalars, c, 
-								(left ? wix[index]*k : uix), n, m, k, i, wix[index] );
+								(left ? wix[index]*k : uix), m, n, k, i, wix[index] );
 					}
 					curk[i-bi] = index - wpos;
 				}
@@ -372,7 +413,8 @@ public abstract class SpoofOuterProduct extends SpoofOperator
 		}
 	}
 	
-	private void executeCellwiseSparse(SparseBlock sblock, double[] u, double[] v, double[][] b, double[] scalars , MatrixBlock out, int n, int m, int k, long nnz, OutProdType type, int rl, int ru, int cl, int cu ) 
+	private void executeCellwiseSparse(SparseBlock sblock, double[] u, double[] v, double[][] b, double[] scalars, 
+		MatrixBlock out, int m, int n, int k, long nnz, OutProdType type, int rl, int ru, int cl, int cu ) 
 	{
 		final int blocksizeIJ = (int) (8L*m*n/nnz); 
 		int[] curk = new int[blocksizeIJ];			
@@ -396,9 +438,9 @@ public abstract class SpoofOuterProduct extends SpoofOperator
 						int index = wpos + curk[i-bi];
 						for( ; index<wpos+wlen && wix[index]<bjmin; index++ ) {
 							if(type == OutProdType.CELLWISE_OUTER_PRODUCT)
-								c[index] = genexecCellwise( wval[index], u, uix, v, wix[index]*k, b, scalars, n, m, k, i, wix[index] ); 
+								c[wix[index]] = genexecCellwise( wval[index], u, uix, v, wix[index]*k, b, scalars, m, n, k, i, wix[index] ); 
 							else
-								c[0] += genexecCellwise( wval[index], u, uix, v, wix[index]*k, b, scalars, n, m, k, i, wix[index]); // (ix+j)/n, (ix+j)%n );
+								c[0] += genexecCellwise( wval[index], u, uix, v, wix[index]*k, b, scalars, m, n, k, i, wix[index]);
 						}
 						curk[i-bi] = index - wpos;
 					}
@@ -423,8 +465,8 @@ public abstract class SpoofOuterProduct extends SpoofOperator
 						double[] wval = sblock.values(i);
 						int index = wpos + curk[i-bi];
 						for( ; index<wpos+wlen && wix[index]<bjmin; index++ ) {
-							c.append(i, index, genexecCellwise( wval[index], u, uix, v, 
-									wix[index]*k, b, scalars, n, m, k, i, wix[index] )); 
+							c.append(i, wix[index], genexecCellwise( wval[index], u, uix, v, 
+									wix[index]*k, b, scalars, m, n, k, i, wix[index] )); 
 						}
 						curk[i-bi] = index - wpos;
 					}
@@ -432,10 +474,56 @@ public abstract class SpoofOuterProduct extends SpoofOperator
 			}
 		}
 	}
+	
+	private void executeCompressed(CompressedMatrixBlock a, double[] u, double[] v, double[][] b, double[] scalars, 
+			double[] c, int m, int n, int k, OutProdType type, int rl, int ru, int cl, int cu) 
+	{
+		boolean left = (_outerProductType==OutProdType.LEFT_OUTER_PRODUCT);
+		
+		Iterator<IJV> iter = !left ? a.getIterator(rl, ru, false) : 
+			a.getIterator(rl, ru, cl, cu, false); //cl/cu -> colgroups
+		while( iter.hasNext() ) {
+			IJV cell = iter.next();
+			int uix = cell.getI() * k;
+			int vix = cell.getJ() * k;
+			genexecDense(cell.getV(), u, uix, v, vix, b, scalars, c, 
+				left ? vix : uix, m, n, k, cell.getI(), cell.getJ());
+		}
+	}
+	
+	private void executeCellwiseCompressed(CompressedMatrixBlock a, double[] u, double[] v, double[][] b, double[] scalars, 
+		MatrixBlock out, int m, int n, int k, OutProdType type, int rl, int ru, int cl, int cu ) 
+	{			
+		double[] c = out.getDenseBlock();
+		SparseBlock csblock = out.getSparseBlock();
+		
+		Iterator<IJV> iter = a.getIterator(rl, ru, false);
+		while( iter.hasNext() ) {
+			IJV cell = iter.next();
+			int uix = cell.getI() * k;
+			int vix = cell.getJ() * k;
+			if( type == OutProdType.CELLWISE_OUTER_PRODUCT ) {
+				if( out.isInSparseFormat() ) {
+					csblock.allocate(cell.getI());
+					csblock.append(cell.getI(), cell.getJ(), 
+						genexecCellwise(cell.getV(), u, uix, v, vix, b, scalars, m, n, k, cell.getI(), cell.getJ()));
+				}
+				else {
+					c[cell.getI()*n+cell.getJ()] = 
+						genexecCellwise(cell.getV(), u, uix, v, vix, b, scalars, m, n, k, cell.getI(), cell.getJ());
+				}
+			}
+			else {
+				c[0] += genexecCellwise(cell.getV(), u, uix, v, vix, b, scalars, m, n, k, cell.getI(), cell.getJ());
+			}
+		}
+	}
 
-	protected abstract void genexecDense( double a, double[] u, int ui, double[] v, int vi, double[][] b, double[] scalars , double[] c, int ci, int n, int m, int k, int rowIndex, int colIndex );
+	protected abstract void genexecDense( double a, double[] u, int ui, double[] v, int vi, double[][] b, 
+		double[] scalars, double[] c, int ci, int m, int n, int k, int rowIndex, int colIndex);
 	
-	protected abstract double genexecCellwise( double a, double[] u, int ui, double[] v, int vi, double[][] b, double[] scalars , int n, int m, int k, int rowIndex, int colIndex);
+	protected abstract double genexecCellwise( double a, double[] u, int ui, double[] v, int vi, double[][] b, 
+		double[] scalars, int m, int n, int k, int rowIndex, int colIndex);
 
 	private class ParExecTask implements Callable<Long> 
 	{
@@ -454,15 +542,15 @@ public abstract class SpoofOuterProduct extends SpoofOperator
 		private final int _cl;
 		private final int _cu;
 		
-		protected ParExecTask( MatrixBlock a, double[] u, double[] v, double[][] b, double[] scalars , MatrixBlock c, int clen, int rlen, int k, OutProdType type, int rl, int ru, int cl, int cu ) {
+		protected ParExecTask( MatrixBlock a, double[] u, double[] v, double[][] b, double[] scalars , MatrixBlock c, int m, int n, int k, OutProdType type, int rl, int ru, int cl, int cu ) {
 			_a = a;
 			_u = u;
 			_v = v;
 			_b = b;
 			_c = c;
 			_scalars = scalars;
-			_clen = clen;
-			_rlen = rlen;
+			_rlen = m;
+			_clen = n;
 			_k = k;
 			_type = type;
 			_rl = rl;
@@ -477,24 +565,30 @@ public abstract class SpoofOuterProduct extends SpoofOperator
 			{
 				case LEFT_OUTER_PRODUCT:	
 				case RIGHT_OUTER_PRODUCT:
-					if( !_a.isInSparseFormat() )
-						executeDense(_a.getDenseBlock(), _u, _v, _b, _scalars, _c.getDenseBlock(), _clen, _rlen, _k, _type, _rl, _ru, _cl, _cu);
+					if( _a instanceof CompressedMatrixBlock )
+						executeCompressed((CompressedMatrixBlock)_a, _u, _v, _b, _scalars, _c.getDenseBlock(), _rlen, _clen, _k, _type, _rl, _ru, _cl, _cu);
+					else if( !_a.isInSparseFormat() )
+						executeDense(_a.getDenseBlock(), _u, _v, _b, _scalars, _c.getDenseBlock(), _rlen, _clen, _k, _type, _rl, _ru, _cl, _cu);
 					else
-						executeSparse(_a.getSparseBlock(), _u, _v, _b, _scalars, _c.getDenseBlock(), _clen, _rlen, _k, (int) _a.getNonZeros(), _type,  _rl, _ru, _cl, _cu);
+						executeSparse(_a.getSparseBlock(), _u, _v, _b, _scalars, _c.getDenseBlock(), _rlen, _clen, _k, _a.getNonZeros(), _type,  _rl, _ru, _cl, _cu);
 					break;
 				case CELLWISE_OUTER_PRODUCT:
-					if( !_c.isInSparseFormat() )
-						executeCellwiseDense(_a.getDenseBlock(), _u, _v, _b, _scalars, _c.getDenseBlock(), _clen, _rlen, _k, _type, _rl, _ru, _cl, _cu);
+					if( _a instanceof CompressedMatrixBlock )
+						executeCellwiseCompressed((CompressedMatrixBlock)_a, _u, _v, _b, _scalars, _c, _rlen, _clen, _k, _type, _rl, _ru, _cl, _cu);
+					else if( !_c.isInSparseFormat() )
+						executeCellwiseDense(_a.getDenseBlock(), _u, _v, _b, _scalars, _c.getDenseBlock(), _rlen, _clen, _k, _type, _rl, _ru, _cl, _cu);
 					else 
-						executeCellwiseSparse(_a.getSparseBlock(), _u, _v, _b, _scalars, _c, _clen, _rlen, _k, (int) _a.getNonZeros(), _type,  _rl, _ru, _cl, _cu);
+						executeCellwiseSparse(_a.getSparseBlock(), _u, _v, _b, _scalars, _c, _rlen, _clen, _k, _a.getNonZeros(), _type,  _rl, _ru, _cl, _cu);
 					break;			
 				case AGG_OUTER_PRODUCT:
 					throw new DMLRuntimeException("Wrong codepath for aggregate outer product.");
 			}
 			
-			int rl = _outerProductType == OutProdType.LEFT_OUTER_PRODUCT ? _cl : _rl;
-			int ru = _outerProductType == OutProdType.LEFT_OUTER_PRODUCT ? _cu : _ru;
-			return _c.recomputeNonZeros(rl, ru-1, 0, _c.getNumColumns()-1);
+			boolean left = (_outerProductType == OutProdType.LEFT_OUTER_PRODUCT);
+			int rl = left ? _cl : _rl;
+			int ru = left ? _cu : _ru;
+			return (_a instanceof CompressedMatrixBlock && left) ? -1 :
+				_c.recomputeNonZeros(rl, ru-1, 0, _c.getNumColumns()-1);
 		}
 	}
 	
@@ -505,8 +599,8 @@ public abstract class SpoofOuterProduct extends SpoofOperator
 		private final double[] _v;
 		private final double[][] _b;
 		private final double[] _scalars;
-		private final int _clen;
 		private final int _rlen;
+		private final int _clen;
 		private final int _k;
 		private final OutProdType _type;
 		private final int _rl;
@@ -514,14 +608,14 @@ public abstract class SpoofOuterProduct extends SpoofOperator
 		private final int _cl;
 		private final int _cu;
 		
-		protected ParOuterProdAggTask( MatrixBlock a, double[] u, double[] v, double[][] b, double[] scalars, int clen, int rlen, int k, OutProdType type, int rl, int ru, int cl, int cu ) {
+		protected ParOuterProdAggTask( MatrixBlock a, double[] u, double[] v, double[][] b, double[] scalars, int m, int n, int k, OutProdType type, int rl, int ru, int cl, int cu ) {
 			_a = a;
 			_u = u;
 			_v = v;
 			_b = b;
 			_scalars = scalars;
-			_clen = clen;
-			_rlen = rlen;
+			_rlen = m;
+			_clen = n;
 			_k = k;
 			_type = type;
 			_rl = rl;
@@ -534,10 +628,12 @@ public abstract class SpoofOuterProduct extends SpoofOperator
 		public Double call() throws DMLRuntimeException {
 			MatrixBlock out = new MatrixBlock(1, 1, false);
 			out.allocateDenseBlock();
-			if(!_a.isInSparseFormat())
-				executeCellwiseDense(_a.getDenseBlock(), _u, _v, _b, _scalars, out.getDenseBlock(), _clen, _rlen, _k, _type, _rl, _ru, _cl, _cu);
+			if( _a instanceof CompressedMatrixBlock )
+				executeCellwiseCompressed((CompressedMatrixBlock)_a, _u, _v, _b, _scalars, out, _rlen, _clen, _k, _type, _rl, _ru, _cl, _cu);
+			else if( !_a.isInSparseFormat() )
+				executeCellwiseDense(_a.getDenseBlock(), _u, _v, _b, _scalars, out.getDenseBlock(), _rlen, _clen, _k, _type, _rl, _ru, _cl, _cu);
 			else
-				executeCellwiseSparse(_a.getSparseBlock(), _u, _v, _b, _scalars, out, _clen, _rlen, _k, _a.getNonZeros(), _type, _rl, _ru, _cl, _cu);
+				executeCellwiseSparse(_a.getSparseBlock(), _u, _v, _b, _scalars, out, _rlen, _clen, _k, _a.getNonZeros(), _type, _rl, _ru, _cl, _cu);
 			return out.getDenseBlock()[0];
 		}
 	}

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/9dad2fe8/src/main/java/org/apache/sysml/runtime/compress/ColGroupOLE.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/compress/ColGroupOLE.java b/src/main/java/org/apache/sysml/runtime/compress/ColGroupOLE.java
index f100495..ac0b803 100644
--- a/src/main/java/org/apache/sysml/runtime/compress/ColGroupOLE.java
+++ b/src/main/java/org/apache/sysml/runtime/compress/ColGroupOLE.java
@@ -798,6 +798,8 @@ public class ColGroupOLE extends ColGroupOffset
 
 		@Override
 		public Integer next() {
+			if( !hasNext() )
+				throw new RuntimeException("No more OLE entries.");
 			int ret = _rpos;
 			nextRowOffset();
 			return ret;

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/9dad2fe8/src/main/java/org/apache/sysml/runtime/compress/ColGroupOffset.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/compress/ColGroupOffset.java b/src/main/java/org/apache/sysml/runtime/compress/ColGroupOffset.java
index 184c69a..5fc4a5a 100644
--- a/src/main/java/org/apache/sysml/runtime/compress/ColGroupOffset.java
+++ b/src/main/java/org/apache/sysml/runtime/compress/ColGroupOffset.java
@@ -451,7 +451,7 @@ public abstract class ColGroupOffset extends ColGroupValue
 			_rl = rl;
 			_ru = ru;
 			_inclZeros = inclZeros;
-			_vpos = 0;
+			_vpos = -1;
 			_rpos = -1;
 			_cpos = 0;
 			getNextValue();
@@ -461,7 +461,7 @@ public abstract class ColGroupOffset extends ColGroupValue
 		public boolean hasNext() {
 			return (_rpos < _ru);
 		}
-
+		
 		@Override
 		public IJV next() {
 			_buff.set(_rpos, _colIndexes[_cpos], (_vpos >= getNumValues()) ? 
@@ -472,27 +472,28 @@ public abstract class ColGroupOffset extends ColGroupValue
 		
 		private void getNextValue() {
 			//advance to next value iterator if required
-			if( _viter == null ) {
-				_viter = (getNumValues()>0) ? //first iterator
-					getIterator(_vpos, _rl, _ru) : new ZeroIterator(_rl, _ru);
-			}
-			else if( _viter instanceof ZeroIterator && !_viter.hasNext() ) {
+			if(_viter != null && _viter instanceof ZeroIterator && !_viter.hasNext() ) {
 				_rpos = _ru; //end after zero iterator
 				return;
 			}
-			else if( _cpos+1 >= getNumCols() && !_viter.hasNext() ) {
-				_vpos++; //
-				if( _vpos < getNumValues() )
-					_viter = getIterator(_vpos, _rl, _ru);
-				else if( _inclZeros && _zeros)
-					_viter = new ZeroIterator(_rl, _ru);
-				else
-					_rpos = _ru; //end w/o zero iterator
+			else if( _cpos+1 >= getNumCols() && !(_viter!=null && _viter.hasNext()) ) {
+				do {
+					_vpos++;
+					if( _vpos < getNumValues() )
+						_viter = getIterator(_vpos, _rl, _ru);
+					else if( _inclZeros && _zeros)
+						_viter = new ZeroIterator(_rl, _ru);
+					else {
+						_rpos = _ru; //end w/o zero iterator
+						return;
+					}
+				}
+				while(!_viter.hasNext());
 				_rpos = -1;
 			}
 			
-			//get next value
-			if( _rpos < 0 || _cpos+1 >= getNumCols()) {
+			//get next value from valid iterator
+			if( _rpos < 0 || _cpos+1 >= getNumCols() ) {
 				_rpos = _viter.next();
 				_cpos = 0;
 			}

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/9dad2fe8/src/main/java/org/apache/sysml/runtime/compress/ColGroupRLE.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/compress/ColGroupRLE.java b/src/main/java/org/apache/sysml/runtime/compress/ColGroupRLE.java
index 7f0dcf9..5b1f804 100644
--- a/src/main/java/org/apache/sysml/runtime/compress/ColGroupRLE.java
+++ b/src/main/java/org/apache/sysml/runtime/compress/ColGroupRLE.java
@@ -802,6 +802,8 @@ public class ColGroupRLE extends ColGroupOffset
 
 		@Override
 		public Integer next() {
+			if( !hasNext() )
+				throw new RuntimeException("No more RLE entries.");
 			int ret = _rpos;
 			nextRowOffset();
 			return ret;

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/9dad2fe8/src/main/java/org/apache/sysml/runtime/compress/CompressedMatrixBlock.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/compress/CompressedMatrixBlock.java b/src/main/java/org/apache/sysml/runtime/compress/CompressedMatrixBlock.java
index d3bdcca..0fbe608 100644
--- a/src/main/java/org/apache/sysml/runtime/compress/CompressedMatrixBlock.java
+++ b/src/main/java/org/apache/sysml/runtime/compress/CompressedMatrixBlock.java
@@ -169,6 +169,10 @@ public class CompressedMatrixBlock extends MatrixBlock implements Externalizable
 	public ArrayList<ColGroup> getColGroups() {
 		return _colGroups;
 	}
+	
+	public int getNumColGroups() {
+		return _colGroups.size();
+	}
 
 	/**
 	 * Obtain whether this block is in compressed form or not.
@@ -802,7 +806,11 @@ public class CompressedMatrixBlock extends MatrixBlock implements Externalizable
 	}
 	
 	public Iterator<IJV> getIterator(int rl, int ru, boolean inclZeros) {
-		return new ColumnGroupIterator(rl, ru, inclZeros);
+		return getIterator(rl, ru, 0, getNumColGroups(), inclZeros);
+	}
+	
+	public Iterator<IJV> getIterator(int rl, int ru, int cgl, int cgu, boolean inclZeros) {
+		return new ColumnGroupIterator(rl, ru, cgl, cgu, inclZeros);
 	}
 	
 	//////////////////////////////////////////
@@ -2229,6 +2237,7 @@ public class CompressedMatrixBlock extends MatrixBlock implements Externalizable
 		//iterator configuration 
 		private final int _rl;
 		private final int _ru;
+		private final int _cgu;
 		private final boolean _inclZeros;
 		
 		//iterator state
@@ -2236,10 +2245,12 @@ public class CompressedMatrixBlock extends MatrixBlock implements Externalizable
 		private Iterator<IJV> _iterColGroup = null;
 		private boolean _noNext = false;
 		
-		public ColumnGroupIterator(int rl, int ru, boolean inclZeros) {
+		public ColumnGroupIterator(int rl, int ru, int cgl, int cgu, boolean inclZeros) {
 			_rl = rl;
 			_ru = ru;
+			_cgu = cgu;
 			_inclZeros = inclZeros;
+			_posColGroup = cgl-1;
 			getNextIterator();
 		}
 
@@ -2252,14 +2263,14 @@ public class CompressedMatrixBlock extends MatrixBlock implements Externalizable
 		public IJV next() {
 			if( _noNext )
 				throw new RuntimeException("No more entries.");
-			IJV ret = _iterColGroup.next(); 
+			IJV ret = _iterColGroup.next();
 			if( !_iterColGroup.hasNext() )
 				getNextIterator();
 			return ret;
 		}
 		
 		private void getNextIterator() {
-			while( _posColGroup+1 < _colGroups.size() ) {
+			while( _posColGroup+1 < _cgu ) {
 				_posColGroup++;
 				_iterColGroup = _colGroups.get(_posColGroup)
 					.getIterator(_rl, _ru, _inclZeros);

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/9dad2fe8/src/test/java/org/apache/sysml/test/integration/functions/codegen/CompressedCellwiseTest.java
----------------------------------------------------------------------
diff --git a/src/test/java/org/apache/sysml/test/integration/functions/codegen/CompressedCellwiseTest.java b/src/test/java/org/apache/sysml/test/integration/functions/codegen/CompressedCellwiseTest.java
index a94cdb2..417905d 100644
--- a/src/test/java/org/apache/sysml/test/integration/functions/codegen/CompressedCellwiseTest.java
+++ b/src/test/java/org/apache/sysml/test/integration/functions/codegen/CompressedCellwiseTest.java
@@ -315,6 +315,7 @@ public class CompressedCellwiseTest extends AutomatedTestBase
 			OptimizerUtils.ALLOW_ALGEBRAIC_SIMPLIFICATION = oldRewrites;
 			OptimizerUtils.ALLOW_AUTO_VECTORIZATION = true;
 			OptimizerUtils.ALLOW_OPERATOR_FUSION = true;
+			CompressedMatrixBlock.ALLOW_DDC_ENCODING = true;
 		}
 	}	
 

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/9dad2fe8/src/test/java/org/apache/sysml/test/integration/functions/codegen/CompressedMultiAggregateTest.java
----------------------------------------------------------------------
diff --git a/src/test/java/org/apache/sysml/test/integration/functions/codegen/CompressedMultiAggregateTest.java b/src/test/java/org/apache/sysml/test/integration/functions/codegen/CompressedMultiAggregateTest.java
index 77b1294..4a16694 100644
--- a/src/test/java/org/apache/sysml/test/integration/functions/codegen/CompressedMultiAggregateTest.java
+++ b/src/test/java/org/apache/sysml/test/integration/functions/codegen/CompressedMultiAggregateTest.java
@@ -315,6 +315,7 @@ public class CompressedMultiAggregateTest extends AutomatedTestBase
 			OptimizerUtils.ALLOW_ALGEBRAIC_SIMPLIFICATION = oldRewrites;
 			OptimizerUtils.ALLOW_AUTO_VECTORIZATION = true;
 			OptimizerUtils.ALLOW_OPERATOR_FUSION = true;
+			CompressedMatrixBlock.ALLOW_DDC_ENCODING = true;
 		}
 	}	
 

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/9dad2fe8/src/test/java/org/apache/sysml/test/integration/functions/codegen/CompressedOuterProductTest.java
----------------------------------------------------------------------
diff --git a/src/test/java/org/apache/sysml/test/integration/functions/codegen/CompressedOuterProductTest.java b/src/test/java/org/apache/sysml/test/integration/functions/codegen/CompressedOuterProductTest.java
new file mode 100644
index 0000000..8c3a695
--- /dev/null
+++ b/src/test/java/org/apache/sysml/test/integration/functions/codegen/CompressedOuterProductTest.java
@@ -0,0 +1,267 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ * 
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.sysml.test.integration.functions.codegen;
+
+import java.io.File;
+import java.util.HashMap;
+
+import org.junit.Assert;
+import org.junit.Test;
+import org.apache.sysml.api.DMLScript;
+import org.apache.sysml.api.DMLScript.RUNTIME_PLATFORM;
+import org.apache.sysml.hops.OptimizerUtils;
+import org.apache.sysml.lops.LopProperties.ExecType;
+import org.apache.sysml.runtime.compress.CompressedMatrixBlock;
+import org.apache.sysml.runtime.matrix.data.MatrixValue.CellIndex;
+import org.apache.sysml.test.integration.AutomatedTestBase;
+import org.apache.sysml.test.integration.TestConfiguration;
+import org.apache.sysml.test.utils.TestUtils;
+
+public class CompressedOuterProductTest extends AutomatedTestBase 
+{	
+	private static final String TEST_NAME1 = "CompressedOuterProductMain";
+	private static final String TEST_DIR = "functions/codegen/";
+	private static final String TEST_CLASS_DIR = TEST_DIR + CompressedOuterProductTest.class.getSimpleName() + "/";
+	private final static String TEST_CONF = "SystemML-config-codegen-compress.xml";
+	private final static File   TEST_CONF_FILE = new File(SCRIPT_DIR + TEST_DIR, TEST_CONF);
+	
+	private static final int rows = 2023;
+	private static final int cols = 1987;
+	private static final double sparsity1 = 0.9;
+	private static final double sparsity2 = 0.1;
+	private static final double sparsity3 = 0.0;
+	private static final double eps = Math.pow(10, -6);
+	
+	public enum SparsityType {
+		DENSE,
+		SPARSE,
+		EMPTY,
+	}
+	
+	public enum ValueType {
+		RAND, //UC
+		CONST, //RLE
+		RAND_ROUND_OLE, //OLE
+		RAND_ROUND_DDC, //DDC
+	}
+	
+	@Override
+	public void setUp() {
+		TestUtils.clearAssertionInformation();
+		addTestConfiguration( TEST_NAME1, new TestConfiguration(TEST_CLASS_DIR, TEST_NAME1, new String[] { "R" }) );
+	}
+		
+	@Test
+	public void testCompressedCellwiseMainDenseConstCP() {
+		testCompressedCellwise( TEST_NAME1, SparsityType.DENSE, ValueType.CONST, ExecType.CP );
+	}
+	
+	@Test
+	public void testCompressedCellwiseMainDenseRandCP() {
+		testCompressedCellwise( TEST_NAME1, SparsityType.DENSE, ValueType.RAND, ExecType.CP );
+	}
+	
+	@Test
+	public void testCompressedCellwiseMainDenseRand2CP() {
+		testCompressedCellwise( TEST_NAME1, SparsityType.DENSE, ValueType.RAND_ROUND_DDC, ExecType.CP );
+	}
+	
+	@Test
+	public void testCompressedCellwiseMainDenseRand3CP() {
+		testCompressedCellwise( TEST_NAME1, SparsityType.DENSE, ValueType.RAND_ROUND_OLE, ExecType.CP );
+	}
+	
+	@Test
+	public void testCompressedCellwiseMainSparseConstCP() {
+		testCompressedCellwise( TEST_NAME1, SparsityType.SPARSE, ValueType.CONST, ExecType.CP );
+	}
+	
+	@Test
+	public void testCompressedCellwiseMainSparseRandCP() {
+		testCompressedCellwise( TEST_NAME1, SparsityType.SPARSE, ValueType.RAND, ExecType.CP );
+	}
+	
+	@Test
+	public void testCompressedCellwiseMainSparseRand2CP() {
+		testCompressedCellwise( TEST_NAME1, SparsityType.SPARSE, ValueType.RAND_ROUND_DDC, ExecType.CP );
+	}
+	
+	@Test
+	public void testCompressedCellwiseMainSparseRand3CP() {
+		testCompressedCellwise( TEST_NAME1, SparsityType.SPARSE, ValueType.RAND_ROUND_OLE, ExecType.CP );
+	}
+	
+	@Test
+	public void testCompressedCellwiseMainEmptyConstCP() {
+		testCompressedCellwise( TEST_NAME1, SparsityType.EMPTY, ValueType.CONST, ExecType.CP );
+	}
+	
+	@Test
+	public void testCompressedCellwiseMainEmptyRandCP() {
+		testCompressedCellwise( TEST_NAME1, SparsityType.EMPTY, ValueType.RAND, ExecType.CP );
+	}
+	
+	@Test
+	public void testCompressedCellwiseMainEmptyRand2CP() {
+		testCompressedCellwise( TEST_NAME1, SparsityType.EMPTY, ValueType.RAND_ROUND_DDC, ExecType.CP );
+	}
+	
+	@Test
+	public void testCompressedCellwiseMainEmptyRand3CP() {
+		testCompressedCellwise( TEST_NAME1, SparsityType.EMPTY, ValueType.RAND_ROUND_OLE, ExecType.CP );
+	}
+	
+	@Test
+	public void testCompressedCellwiseMainDenseConstSP() {
+		testCompressedCellwise( TEST_NAME1, SparsityType.DENSE, ValueType.CONST, ExecType.SPARK );
+	}
+	
+	@Test
+	public void testCompressedCellwiseMainDenseRandSP() {
+		testCompressedCellwise( TEST_NAME1, SparsityType.DENSE, ValueType.RAND, ExecType.SPARK );
+	}
+	
+	@Test
+	public void testCompressedCellwiseMainDenseRand2SP() {
+		testCompressedCellwise( TEST_NAME1, SparsityType.DENSE, ValueType.RAND_ROUND_DDC, ExecType.SPARK );
+	}
+	
+	@Test
+	public void testCompressedCellwiseMainDenseRand3SP() {
+		testCompressedCellwise( TEST_NAME1, SparsityType.DENSE, ValueType.RAND_ROUND_OLE, ExecType.SPARK );
+	}
+	
+	@Test
+	public void testCompressedCellwiseMainSparseConstSP() {
+		testCompressedCellwise( TEST_NAME1, SparsityType.SPARSE, ValueType.CONST, ExecType.SPARK );
+	}
+	
+	@Test
+	public void testCompressedCellwiseMainSparseRandSP() {
+		testCompressedCellwise( TEST_NAME1, SparsityType.SPARSE, ValueType.RAND, ExecType.SPARK );
+	}
+	
+	@Test
+	public void testCompressedCellwiseMainSparseRand2SP() {
+		testCompressedCellwise( TEST_NAME1, SparsityType.SPARSE, ValueType.RAND_ROUND_DDC, ExecType.SPARK );
+	}
+	
+	@Test
+	public void testCompressedCellwiseMainSparseRand3SP() {
+		testCompressedCellwise( TEST_NAME1, SparsityType.SPARSE, ValueType.RAND_ROUND_OLE, ExecType.SPARK );
+	}
+	
+	@Test
+	public void testCompressedCellwiseMainEmptyConstSP() {
+		testCompressedCellwise( TEST_NAME1, SparsityType.EMPTY, ValueType.CONST, ExecType.SPARK );
+	}
+	
+	@Test
+	public void testCompressedCellwiseMainEmptyRandSP() {
+		testCompressedCellwise( TEST_NAME1, SparsityType.EMPTY, ValueType.RAND, ExecType.SPARK );
+	}
+	
+	@Test
+	public void testCompressedCellwiseMainEmptyRand2SP() {
+		testCompressedCellwise( TEST_NAME1, SparsityType.EMPTY, ValueType.RAND_ROUND_DDC, ExecType.SPARK );
+	}
+	
+	@Test
+	public void testCompressedCellwiseMainEmptyRand3SP() {
+		testCompressedCellwise( TEST_NAME1, SparsityType.EMPTY, ValueType.RAND_ROUND_OLE, ExecType.SPARK );
+	}
+	
+	private void testCompressedCellwise(String testname, SparsityType stype, ValueType vtype, ExecType et)
+	{	
+		boolean oldRewrites = OptimizerUtils.ALLOW_ALGEBRAIC_SIMPLIFICATION;
+		RUNTIME_PLATFORM platformOld = rtplatform;
+		switch( et ){
+			case MR: rtplatform = RUNTIME_PLATFORM.HADOOP; break;
+			case SPARK: rtplatform = RUNTIME_PLATFORM.SPARK; break;
+			default: rtplatform = RUNTIME_PLATFORM.HYBRID_SPARK; break;
+		}
+	
+		boolean sparkConfigOld = DMLScript.USE_LOCAL_SPARK_CONFIG;
+		if( rtplatform == RUNTIME_PLATFORM.SPARK || rtplatform == RUNTIME_PLATFORM.HYBRID_SPARK )
+			DMLScript.USE_LOCAL_SPARK_CONFIG = true;
+		
+		try
+		{
+			OptimizerUtils.ALLOW_ALGEBRAIC_SIMPLIFICATION = true;
+			TestConfiguration config = getTestConfiguration(testname);
+			loadTestConfiguration(config);
+			
+			String HOME = SCRIPT_DIR + TEST_DIR;
+			fullDMLScriptName = HOME + testname + ".dml";
+			programArgs = new String[]{"-explain", "-stats", 
+					"-args", input("X"), output("R") };
+			
+			fullRScriptName = HOME + testname + ".R";
+			rCmd = getRCmd(inputDir(), expectedDir());			
+
+			//select input sparsity
+			double sparsity = -1;
+			switch( stype ){
+				case DENSE: sparsity = sparsity1; break;
+				case SPARSE: sparsity = sparsity2; break;
+				case EMPTY: sparsity = sparsity3; break;
+			}
+			
+			//generate input data
+			double min = (vtype==ValueType.CONST)? 10 : -10;
+			double[][] X = TestUtils.generateTestMatrix(rows, cols, min, 10, sparsity, 7);
+			if( vtype==ValueType.RAND_ROUND_OLE || vtype==ValueType.RAND_ROUND_DDC ) {
+				CompressedMatrixBlock.ALLOW_DDC_ENCODING = (vtype==ValueType.RAND_ROUND_DDC);
+				X = TestUtils.round(X);
+			}
+			writeInputMatrixWithMTD("X", X, true);
+			
+			//run tests
+			runTest(true, false, null, -1); 
+			runRScript(true); 
+			
+			//compare matrices 
+			HashMap<CellIndex, Double> dmlfile = readDMLMatrixFromHDFS("R");
+			HashMap<CellIndex, Double> rfile  = readRMatrixFromFS("R");	
+			TestUtils.compareMatrices(dmlfile, rfile, eps, "Stat-DML", "Stat-R");
+			Assert.assertTrue(heavyHittersContainsSubString("spoofOP") 
+				|| heavyHittersContainsSubString("sp_spoofOP"));
+		}
+		finally {
+			rtplatform = platformOld;
+			DMLScript.USE_LOCAL_SPARK_CONFIG = sparkConfigOld;
+			OptimizerUtils.ALLOW_ALGEBRAIC_SIMPLIFICATION = oldRewrites;
+			OptimizerUtils.ALLOW_AUTO_VECTORIZATION = true;
+			OptimizerUtils.ALLOW_OPERATOR_FUSION = true;
+			CompressedMatrixBlock.ALLOW_DDC_ENCODING = true;
+		}
+	}	
+
+	/**
+	 * Override default configuration with custom test configuration to ensure
+	 * scratch space and local temporary directory locations are also updated.
+	 */
+	@Override
+	protected File getConfigTemplateFile() {
+		// Instrumentation in this test's output log to show custom configuration file used for template.
+		System.out.println("This test case overrides default configuration with " + TEST_CONF_FILE.getPath());
+		return TEST_CONF_FILE;
+	}
+}

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/9dad2fe8/src/test/scripts/functions/codegen/CompressedOuterProductMain.R
----------------------------------------------------------------------
diff --git a/src/test/scripts/functions/codegen/CompressedOuterProductMain.R b/src/test/scripts/functions/codegen/CompressedOuterProductMain.R
new file mode 100644
index 0000000..e457645
--- /dev/null
+++ b/src/test/scripts/functions/codegen/CompressedOuterProductMain.R
@@ -0,0 +1,36 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+# 
+#   http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+args <- commandArgs(TRUE)
+library("Matrix")
+library("matrixStats")
+
+X = readMM(paste(args[1], "X.mtx", sep=""));
+
+U = matrix(0.0001, nrow(X), 10);
+V = matrix(0.0001, ncol(X), 10);
+
+R1 = X * (7 + (U %*% t(V) + 3));
+R2 = (X * (7 + (U %*% t(V) + 3))) %*% V;
+R3 = t(U) %*% (X * (7 + (U %*% t(V) + 3)));
+R = as.matrix(sum(R1 * (7 + (R2 %*% R3 + 3))/1e6));
+
+writeMM(as(R,"CsparseMatrix"), paste(args[2], "R", sep=""));

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/9dad2fe8/src/test/scripts/functions/codegen/CompressedOuterProductMain.dml
----------------------------------------------------------------------
diff --git a/src/test/scripts/functions/codegen/CompressedOuterProductMain.dml b/src/test/scripts/functions/codegen/CompressedOuterProductMain.dml
new file mode 100644
index 0000000..c7596e8
--- /dev/null
+++ b/src/test/scripts/functions/codegen/CompressedOuterProductMain.dml
@@ -0,0 +1,33 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+# 
+#   http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+X = read($1)
+
+U = matrix(0.0001, nrow(X), 10);
+V = matrix(0.0001, ncol(X), 10);
+
+R1 = X * (7 + (U %*% t(V) + 3));
+R2 = (X * (7 + (U %*% t(V) + 3))) %*% V;
+R3 = t(U) %*% (X * (7 + (U %*% t(V) + 3)));
+if(1==1) {}
+R = as.matrix(sum(R1 * (7 + (R2 %*% R3 + 3))/1e6));
+
+write(R, $2)

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/9dad2fe8/src/test_suites/java/org/apache/sysml/test/integration/functions/codegen/ZPackageSuite.java
----------------------------------------------------------------------
diff --git a/src/test_suites/java/org/apache/sysml/test/integration/functions/codegen/ZPackageSuite.java b/src/test_suites/java/org/apache/sysml/test/integration/functions/codegen/ZPackageSuite.java
index bc5c21f..ea8a1f1 100644
--- a/src/test_suites/java/org/apache/sysml/test/integration/functions/codegen/ZPackageSuite.java
+++ b/src/test_suites/java/org/apache/sysml/test/integration/functions/codegen/ZPackageSuite.java
@@ -35,6 +35,7 @@ import org.junit.runners.Suite;
 	CellwiseTmplTest.class,
 	CompressedCellwiseTest.class,
 	CompressedMultiAggregateTest.class,
+	CompressedOuterProductTest.class,
 	DAGCellwiseTmplTest.class,
 	MultiAggTmplTest.class,
 	OuterProdTmplTest.class,