You are viewing a plain text version of this content. The canonical link for it is here.

Posted to commits@systemml.apache.org by mb...@apache.org on 2017/02/27 18:36:03 UTC

[1/9] incubator-systemml git commit: [SYSTEMML-1285] New basic code generator for operator fusion

Repository: incubator-systemml
Updated Branches:
  refs/heads/master b78c12593 -> bbefe96b2


http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/d7fd5879/src/main/java/org/apache/sysml/runtime/codegen/LibSpoofPrimitives.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/codegen/LibSpoofPrimitives.java b/src/main/java/org/apache/sysml/runtime/codegen/LibSpoofPrimitives.java
new file mode 100644
index 0000000..c73456d
--- /dev/null
+++ b/src/main/java/org/apache/sysml/runtime/codegen/LibSpoofPrimitives.java
@@ -0,0 +1,257 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ * 
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.sysml.runtime.codegen;
+
+import org.apache.sysml.runtime.matrix.data.LibMatrixMult;
+
+/**
+ * This library contains all vector primitives that are used in 
+ * generated source code for fused operators. For primitives that
+ * exist in LibMatrixMult, these calls are simply forwarded to
+ * ensure consistency in performance and result correctness. 
+ *
+ */
+public class LibSpoofPrimitives 
+{
+	// forwarded calls to LibMatrixMult (thin wrappers: every argument is passed through unchanged)
+	
+	public static double dotProduct( double[] a, double[] b, int ai, int bi, final int len ) {
+		return LibMatrixMult.dotProduct(a, b, ai, bi, len);
+	}
+	
+	public static double dotProduct( double[] a, double[] b, int[] aix, int ai, final int bi, final int len ) {
+		return LibMatrixMult.dotProduct(a, b, aix, ai, bi, len);
+	}
+	
+	public static void vectMultiplyAdd( final double aval, double[] b, double[] c, int bi, int ci, final int len ) {
+		LibMatrixMult.vectMultiplyAdd(aval, b, c, bi, ci, len);
+	}
+	
+	public static void vectMultiplyAdd( final double aval, double[] b, double[] c, int[] bix, final int bi, final int ci, final int len ) {
+		LibMatrixMult.vectMultiplyAdd(aval, b, c, bix, bi, ci, len);
+	}
+	
+	public static void vectMultiplyWrite( final double aval, double[] b, double[] c, int bi, int ci, final int len ) {
+		LibMatrixMult.vectMultiplyWrite(aval, b, c, bi, ci, len);
+	}
+
+	// custom methods
+	
+	/**
+	 * Computes c = sum(A), where A is a dense vector. 
+	 * 
+	 * @param a dense input vector A
+	 * @param ai start position in A
+	 * @param len number of processed elements
+	 * @return sum value
+	 */
+	public static double vectSum( double[] a, int ai, final int len ) { 
+		double val = 0;
+		final int bn = len%8;
+				
+		//compute rest (the first len%8 elements, so the remaining count is a multiple of 8)
+		for( int i = 0; i < bn; i++, ai++ )
+			val += a[ ai ];
+		
+		//unrolled 8-block (for better instruction-level parallelism)
+		for( int i = bn; i < len; i+=8, ai+=8 ) {
+			//read 64B cacheline of a, compute val' = sum(a) + val
+			val += a[ ai+0 ] + a[ ai+1 ] + a[ ai+2 ] + a[ ai+3 ]
+			     + a[ ai+4 ] + a[ ai+5 ] + a[ ai+6 ] + a[ ai+7 ];
+		}
+		
+		//scalar result
+		return val; 
+	} 
+	
+	/**
+	 * Computes c = sum(A), where A is a sparse vector. 
+	 * 
+	 * @param avals sparse input vector A values, stored contiguously at avals[ai..ai+len)
+	 * @param aix sparse input vector A column indexes (not needed to compute the sum)
+	 * @param ai start position in A
+	 * @param len number of processed elements
+	 * @return sum value
+	 */
+	public static double vectSum( double[] avals, int[] aix, int ai, int len) {
+		double val = 0;
+		final int bn = len%8;
+				
+		//compute rest (index the values directly; aix holds columns, not value offsets)
+		for( int i = ai; i < ai+bn; i++ )
+			val += avals[ i ];
+		
+		//unrolled 8-block (for better instruction-level parallelism)
+		for( int i = ai+bn; i < ai+len; i+=8 )
+		{
+			//read 64B cacheline of avals (column indexes are irrelevant
+			//for a full sum), compute val' = sum(avals) + val
+			val += avals[ i+0 ] + avals[ i+1 ]
+			     + avals[ i+2 ] + avals[ i+3 ]
+			     + avals[ i+4 ] + avals[ i+5 ]
+			     + avals[ i+6 ] + avals[ i+7 ];
+		}
+		
+		//scalar result
+		return val; 
+	} 
+	
+	/**
+	 * Computes C += A / b, where C and A are dense vectors and b is a scalar. 
+	 * 
+	 * @param a dense input vector A
+	 * @param bval input scalar b 
+	 * @param c dense input-output vector C
+	 * @param ai start position in A
+	 * @param ci start position in C
+	 * @param len number of processed elements.
+	 */
+	public static void vectDivAdd( double[] a, final double bval, double[] c, int ai, int ci, final int len )
+	{
+		final int bn = len%8;
+		
+		//rest, not aligned to 8-blocks
+		for( int j = 0; j < bn; j++, ai++, ci++)
+			c[ ci ] +=  a[ ai ] / bval;
+		
+		//unrolled 8-block  (for better instruction-level parallelism)
+		for( int j = bn; j < len; j+=8, ai+=8, ci+=8) 
+		{
+			//read 64B cachelines of a and c
+			//compute c' = a / bval + c
+			//write back 64B cacheline of c = c'
+			c[ ci+0 ] += a[ ai+0 ] / bval;
+			c[ ci+1 ] += a[ ai+1 ] / bval;
+			c[ ci+2 ] += a[ ai+2 ] / bval;
+			c[ ci+3 ] += a[ ai+3 ] / bval;
+			c[ ci+4 ] += a[ ai+4 ] / bval;
+			c[ ci+5 ] += a[ ai+5 ] / bval;
+			c[ ci+6 ] += a[ ai+6 ] / bval;
+			c[ ci+7 ] += a[ ai+7 ] / bval;
+		}
+	} 
+	
+	/**
+	 * Computes C += A / b, where C is a dense vector, A is a sparse vector, and b is a scalar. 
+	 * 
+	 * @param a sparse input vector A values
+	 * @param bval input scalar b 
+	 * @param c dense input-output vector C
+	 * @param aix sparse input vector A column indexes
+	 * @param ai start position in A
+	 * @param ci start position in C
+	 * @param len number of processed elements.
+	 */
+	public static void vectDivAdd( double[] a, final double bval, double[] c, int[] aix, final int ai, final int ci, final int len )
+	{
+		final int bn = len%8;
+		
+		//rest, not aligned to 8-blocks
+		for( int j = ai; j < ai+bn; j++ )
+			c[ ci + aix[j] ] += a[ j ] / bval;
+		
+		//unrolled 8-block (for better instruction-level parallelism)
+		for( int j = ai+bn; j < ai+len; j+=8 )
+		{
+			//read 64B cacheline of a
+			//read 64B of c via 'gather'
+			//compute c' = a / bval + c
+			//write back 64B of c = c' via 'scatter'
+			c[ ci+aix[j+0] ] += a[ j+0 ] / bval;
+			c[ ci+aix[j+1] ] += a[ j+1 ] / bval;
+			c[ ci+aix[j+2] ] += a[ j+2 ] / bval;
+			c[ ci+aix[j+3] ] += a[ j+3 ] / bval;
+			c[ ci+aix[j+4] ] += a[ j+4 ] / bval;
+			c[ ci+aix[j+5] ] += a[ j+5 ] / bval;
+			c[ ci+aix[j+6] ] += a[ j+6 ] / bval;
+			c[ ci+aix[j+7] ] += a[ j+7 ] / bval;
+		}
+	}
+	
+	/**
+	 * Computes C = A / b, where C and A are dense vectors, and b is a scalar. 
+	 * 
+	 * @param a dense input vector A
+	 * @param bval input scalar b 
+	 * @param c dense output vector C (the processed range is overwritten)
+	 * @param ai start position in A
+	 * @param ci start position in C
+	 * @param len number of processed elements.
+	 */
+	public static void vectDivWrite( double[] a, final double bval, double[] c, int ai, int ci, final int len )
+	{
+		final int bn = len%8;
+		
+		//rest, not aligned to 8-blocks
+		for( int j = 0; j < bn; j++, ai++, ci++)
+			c[ ci ] = a[ ai ] / bval;
+		
+		//unrolled 8-block  (for better instruction-level parallelism)
+		for( int j = bn; j < len; j+=8, ai+=8, ci+=8) 
+		{
+			//read 64B cacheline of a
+			//compute c' = a / bval
+			//write back 64B cacheline of c = c'
+			c[ ci+0 ] = a[ ai+0 ] / bval;
+			c[ ci+1 ] = a[ ai+1 ] / bval;
+			c[ ci+2 ] = a[ ai+2 ] / bval;
+			c[ ci+3 ] = a[ ai+3 ] / bval;
+			c[ ci+4 ] = a[ ai+4 ] / bval;
+			c[ ci+5 ] = a[ ai+5 ] / bval;
+			c[ ci+6 ] = a[ ai+6 ] / bval;
+			c[ ci+7 ] = a[ ai+7 ] / bval;
+		}
+	}
+	
+	/**
+	 * Computes C = A / b, where C is a dense vector and A is a sparse vector, and b is a scalar. 
+	 * 
+	 * @param a sparse input vector A values
+	 * @param aix sparse input vector A column indexes
+	 * @param bval input scalar b 
+	 * @param c dense output vector C (only cells addressed by aix are overwritten)
+	 * @param ai start position in A
+	 * @param ci start position in C
+	 * @param len number of processed elements.
+	 */
+	public static void vectDivWrite( double[] a, int[] aix, final double bval, double[] c, final int ai, final int ci, final int len )
+	{
+		final int bn = len%8;
+		
+		//rest, not aligned to 8-blocks (plain write, consistent with the unrolled loop)
+		for( int j = ai; j < ai+bn; j++ )
+			c[ ci + aix[j] ] = a[ j ] / bval;
+		
+		//unrolled 8-block (for better instruction-level parallelism)
+		for( int j = ai+bn; j < ai+len; j+=8 )
+		{
+			//read 64B cachelines of a, compute c = a/b
+			//and write back c via 'scatter'
+			c[ ci+aix[j+0] ] = a[ j+0 ] / bval;
+			c[ ci+aix[j+1] ] = a[ j+1 ] / bval;
+			c[ ci+aix[j+2] ] = a[ j+2 ] / bval;
+			c[ ci+aix[j+3] ] = a[ j+3 ] / bval;
+			c[ ci+aix[j+4] ] = a[ j+4 ] / bval;
+			c[ ci+aix[j+5] ] = a[ j+5 ] / bval;
+			c[ ci+aix[j+6] ] = a[ j+6 ] / bval;
+			c[ ci+aix[j+7] ] = a[ j+7 ] / bval;
+		}
+	}
+}

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/d7fd5879/src/main/java/org/apache/sysml/runtime/codegen/SpoofCellwise.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/codegen/SpoofCellwise.java b/src/main/java/org/apache/sysml/runtime/codegen/SpoofCellwise.java
new file mode 100644
index 0000000..51c5164
--- /dev/null
+++ b/src/main/java/org/apache/sysml/runtime/codegen/SpoofCellwise.java
@@ -0,0 +1,430 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ * 
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.sysml.runtime.codegen;
+
+import java.io.Serializable;
+
+import java.util.ArrayList;
+import java.util.List;
+import java.util.concurrent.Callable;
+import java.util.concurrent.ExecutorService;
+import java.util.concurrent.Executors;
+import java.util.concurrent.Future;
+
+import org.apache.sysml.runtime.DMLRuntimeException;
+import org.apache.sysml.runtime.functionobjects.KahanPlus;
+import org.apache.sysml.runtime.instructions.cp.DoubleObject;
+import org.apache.sysml.runtime.instructions.cp.KahanObject;
+import org.apache.sysml.runtime.instructions.cp.ScalarObject;
+import org.apache.sysml.runtime.matrix.data.MatrixBlock;
+import org.apache.sysml.runtime.matrix.data.SparseBlock;
+import org.apache.sysml.runtime.util.UtilFunctions;
+
+public abstract class SpoofCellwise extends SpoofOperator implements Serializable
+{
+	private static final long serialVersionUID = 3442528770573293590L;
+	private static final long PAR_NUMCELL_THRESHOLD = 1024*1024;   //Min 1M elements
+	
+	public enum CellType {
+		NO_AGG,   //matrix output with the same number of columns as the first input
+		FULL_AGG, //NOTE(review): not referenced in this class; presumably the scalar-returning execute - confirm
+		ROW_AGG,  //single-column output holding one Kahan sum per input row
+	}
+	
+	protected CellType _type = CellType.NO_AGG;
+	
+	public SpoofCellwise() {
+		//no-op: _type keeps its NO_AGG default unless assigned elsewhere
+	}
+	
+	public CellType getCellType() {
+		return _type;
+	}
+	
+	@Override
+	public ScalarObject execute(ArrayList<MatrixBlock> inputs, ArrayList<ScalarObject> scalarObjects, int k) 
+		throws DMLRuntimeException 
+	{
+		//sanity check
+		if( inputs==null || inputs.size() < 1  )
+			throw new RuntimeException("Invalid input arguments.");
+		//cell count in 64-bit arithmetic (int*int could overflow before the comparison)
+		if( (long)inputs.get(0).getNumRows()*inputs.get(0).getNumColumns()<PAR_NUMCELL_THRESHOLD ) {
+			k = 1; //serial execution
+		}
+		
+		//input preparation (b: dense blocks of inputs 1..n, scalars: double values)
+		double[][] b = prepInputMatrices(inputs);
+		double[] scalars = prepInputScalars(scalarObjects);
+		
+		final int m = inputs.get(0).getNumRows();
+		final int n = inputs.get(0).getNumColumns();	
+		double sum = 0;
+		if( k <= 1 ) //SINGLE-THREADED
+		{
+			sum = ( !inputs.get(0).isInSparseFormat() ) ?
+				executeDenseAndAgg(inputs.get(0).getDenseBlock(), b, scalars, n, m, 0, m) :
+				executeSparseAndAgg(inputs.get(0).getSparseBlock(), b, scalars, n, m, 0, m);
+		}
+		else  //MULTI-THREADED
+		{
+			try {
+				ExecutorService pool = Executors.newFixedThreadPool( k );
+				ArrayList<ParAggTask> tasks = new ArrayList<ParAggTask>();
+				int nk = UtilFunctions.roundToNext(Math.min(8*k,m/32), k);
+				int blklen = (int)(Math.ceil((double)m/nk));
+				for( int i=0; i<nk && i*blklen<m; i++ ) //one task per row block [i*blklen, min((i+1)*blklen,m))
+					tasks.add(new ParAggTask(inputs.get(0), b, scalars, n, m,i*blklen, Math.min((i+1)*blklen, m))); 
+				//execute tasks
+				List<Future<Double>> taskret = pool.invokeAll(tasks);	
+				pool.shutdown();
+			
+				//aggregate partial results (Kahan summation; task.get() rethrows task failures)
+				KahanObject kbuff = new KahanObject(0, 0);
+				KahanPlus kplus = KahanPlus.getKahanPlusFnObject();
+				for( Future<Double> task : taskret )
+					kplus.execute2(kbuff, task.get());
+				sum = kbuff._sum;
+			}
+			catch(Exception ex) {
+				throw new DMLRuntimeException(ex);
+			}
+		}
+		return new DoubleObject(sum);
+	}
+
+	@Override
+	public void execute(ArrayList<MatrixBlock> inputs, ArrayList<ScalarObject> scalarObjects, MatrixBlock out) 
+		throws DMLRuntimeException
+	{
+		execute(inputs, scalarObjects, out, 1); //k=1 selects the single-threaded path of the k-variant
+	}
+	
+	@Override
+	public void execute(ArrayList<MatrixBlock> inputs, ArrayList<ScalarObject> scalarObjects, MatrixBlock out, int k)	
+		throws DMLRuntimeException
+	{
+		//sanity check
+		if( inputs==null || inputs.size() < 1 || out==null )
+			throw new RuntimeException("Invalid input arguments.");
+		//cell count in 64-bit arithmetic (int*int could overflow before the comparison)
+		if( (long)inputs.get(0).getNumRows()*inputs.get(0).getNumColumns()<PAR_NUMCELL_THRESHOLD ) {
+			k = 1; //serial execution
+		}
+		
+		//result allocation and preparations (NO_AGG: same shape as input, otherwise one column)
+		out.reset(inputs.get(0).getNumRows(), _type == CellType.NO_AGG ? 
+				inputs.get(0).getNumColumns() : 1, false);
+		out.allocateDenseBlock();
+		double[] c = out.getDenseBlock();
+		
+		//input preparation (b: dense blocks of inputs 1..n, scalars: double values)
+		double[][] b = prepInputMatrices(inputs);
+		double[] scalars = prepInputScalars(scalarObjects);
+		
+		//core sequential execute
+		final int m = inputs.get(0).getNumRows();
+		final int n = inputs.get(0).getNumColumns();		
+		
+		long lnnz = 0;
+		if( k <= 1 ) //SINGLE-THREADED
+		{
+			lnnz = (!inputs.get(0).isInSparseFormat()) ?
+				executeDense(inputs.get(0).getDenseBlock(), b, scalars, c, n, m, 0, m) :
+				executeSparse(inputs.get(0).getSparseBlock(), b, scalars, c, n, m, 0, m);
+		}
+		else  //MULTI-THREADED
+		{
+			try {
+				ExecutorService pool = Executors.newFixedThreadPool( k );
+				ArrayList<ParExecTask> tasks = new ArrayList<ParExecTask>();
+				int nk = UtilFunctions.roundToNext(Math.min(8*k,m/32), k);
+				int blklen = (int)(Math.ceil((double)m/nk));
+				for( int i=0; i<nk && i*blklen<m; i++ ) //one task per row block [i*blklen, min((i+1)*blklen,m))
+					tasks.add(new ParExecTask(inputs.get(0), b, scalars, c, 
+						n, m, i*blklen, Math.min((i+1)*blklen, m))); 
+				//execute tasks
+				List<Future<Long>> taskret = pool.invokeAll(tasks);	
+				pool.shutdown();
+				
+				//aggregate nnz and error handling (task.get() rethrows task failures)
+				for( Future<Long> task : taskret )
+					lnnz += task.get();
+			}
+			catch(Exception ex) {
+				throw new DMLRuntimeException(ex);
+			}
+		}
+		
+		//post-processing
+		out.setNonZeros(lnnz);	
+		out.examSparsity();	
+	}
+	
+	/** Kahan-sums the genexecDense results over all cells of rows [rl,ru) of a dense input.
+	 * @param a dense values of the first input, read at index row*n+col; null is treated as all zeros
+	 * @param b arrays passed through to genexecDense unchanged
+	 * @param scalars scalar values passed through to genexecDense unchanged
+	 * @param n number of columns iterated per row
+	 * @param m passed through to genexecDense (callers in this class pass the number of rows)
+	 * @param rl first row to process (inclusive)
+	 * @param ru last row to process (exclusive)
+	 * @return the Kahan-compensated sum (kbuff._sum) of all genexecDense results */
+	private double executeDenseAndAgg(double[] a, double[][] b, double[] scalars, int n, int m, int rl, int ru) 
+	{
+		KahanObject kbuff = new KahanObject(0, 0);
+		KahanPlus kplus = KahanPlus.getKahanPlusFnObject();
+
+		if( a == null ) { //empty
+			//note: we can't determine sparse-safeness by executing the operator once 
+			//as the output might change with different row indices
+			for( int i=rl; i<ru; i++ ) 
+				for( int j=0; j<n; j++ )
+					kplus.execute2(kbuff, genexecDense( 0, b, scalars, n, m, i, j ));
+		}
+		else { //general case
+			for( int i=rl, ix=rl*n; i<ru; i++ ) 
+				for( int j=0; j<n; j++, ix++ )
+					kplus.execute2(kbuff, genexecDense( a[ix], b, scalars, n, m, i, j ));
+		}
+		
+		return kbuff._sum;
+	}
+	
+	private long executeDense(double[] a, double[][] b,double[] scalars, double[] c, int n, int m, int rl, int ru) 
+	{
+		long lnnz = 0; //number of non-zero values written to c for rows [rl,ru)
+		
+		if( _type == CellType.NO_AGG ) //one output value per input cell, c indexed by row*n+col
+		{
+			if( a == null ) { //empty
+				//note: we can't determine sparse-safeness by executing the operator once 
+				//as the output might change with different row indices
+				for( int i=rl, ix=rl*n; i<ru; i++ ) 
+					for( int j=0; j<n; j++, ix++ ) {
+						c[ix] = genexecDense( 0, b, scalars, n, m, i, j ); 
+						lnnz += (c[ix]!=0) ? 1 : 0;
+					}
+			}
+			else { //general case
+				for( int i=rl, ix=rl*n; i<ru; i++ ) 
+					for( int j=0; j<n; j++, ix++ ) {
+						c[ix] = genexecDense( a[ix], b, scalars, n, m, i, j); 
+						lnnz += (c[ix]!=0) ? 1 : 0;
+					}
+			}
+		}
+		else if( _type == CellType.ROW_AGG ) //one Kahan-summed output value per row, c indexed by row
+		{
+			KahanObject kbuff = new KahanObject(0, 0);
+			KahanPlus kplus = KahanPlus.getKahanPlusFnObject();
+
+			if( a == null ) { //empty
+				//note: we can't determine sparse-safeness by executing the operator once 
+				//as the output might change with different row indices
+				for( int i=rl; i<ru; i++ ) { 
+					kbuff.set(0, 0);
+					for( int j=0; j<n; j++ )
+						kplus.execute2(kbuff, genexecDense( 0, b, scalars, n, m, i, j ));
+					c[i] = kbuff._sum;
+					lnnz += (c[i]!=0) ? 1 : 0;
+				}
+			}
+			else { //general case
+				for( int i=rl, ix=rl*n; i<ru; i++ ) {
+					kbuff.set(0, 0);
+					for( int j=0; j<n; j++, ix++ )
+						kplus.execute2(kbuff, genexecDense( a[ix], b, scalars, n, m, i, j ));
+					c[i] = kbuff._sum;
+					lnnz += (c[i]!=0) ? 1 : 0;
+				}
+			}
+		}
+		
+		return lnnz; //note: there is no branch for FULL_AGG; c is left untouched and 0 is returned
+	}
+	
+	private double executeSparseAndAgg(SparseBlock sblock, double[][] b, double[] scalars, int n, int m, int rl, int ru) 
+	{
+		KahanObject kbuff = new KahanObject(0, 0);
+		KahanPlus kplus = KahanPlus.getKahanPlusFnObject();
+		
+		//TODO rework sparse safe test (currently probes cell (0,0) with a zero input only)
+		double val = genexecDense( 0, b, scalars, n, m, 0, 0 );
+		
+		if(val == 0 && b.length==0) // sparse safe
+		{
+			if( sblock != null ) {
+				for( int i=rl; i<ru; i++ )
+					if( !sblock.isEmpty(i) ) {
+						int apos = sblock.pos(i);
+						int alen = sblock.size(i);
+						double[] avals = sblock.values(i);
+						for( int j=apos; j<apos+alen; j++ ) { //j is a position in avals, not a column
+							kplus.execute2( kbuff, genexecDense(avals[j], b, scalars, n, m, i, sblock.indexes(i)[j])); 
+						}
+					}	
+			}
+		}
+		else //sparse-unsafe
+		{
+			for(int i=rl; i<ru; i++)
+				for(int j=0; j<n; j++) {
+					double valij = (sblock != null) ? sblock.get(i, j) : 0;
+					kplus.execute2( kbuff, genexecDense(valij, b, scalars, n, m, i, j)); 
+				}
+		}
+		
+		return kbuff._sum;
+	}
+	
+	private long executeSparse(SparseBlock sblock, double[][] b, double[] scalars, double[] c, int n, int m, int rl, int ru) 
+	{
+		//TODO rework sparse safe test (currently probes cell (0,0) with a zero input only)
+		double val0 = genexecDense( 0, b, scalars, n, m, 0, 0 );
+		long lnnz = 0;
+		
+		if( _type == CellType.NO_AGG )
+		{
+			if(val0 == 0 && b.length == 0) // sparse safe
+			{
+				if( sblock != null ) {
+					for( int i=rl; i<ru; i++ )
+						if( !sblock.isEmpty(i) ) {
+							int apos = sblock.pos(i);
+							int alen = sblock.size(i);
+							double[] avals = sblock.values(i);
+							for( int j=apos; j<apos+alen; j++ ) { //j is a position in avals, not a column
+								double val = genexecDense(avals[j], b, scalars, n, m, i, sblock.indexes(i)[j]);
+								c[i*n+sblock.indexes(i)[j]] = val;
+								lnnz += (val!=0) ? 1 : 0;
+							}
+						}
+				}
+			}
+			else //sparse-unsafe
+			{
+				for(int i=rl, cix=rl*n; i<ru; i++, cix+=n)
+					for(int j=0; j<n; j++) {
+						double valij = (sblock != null) ? sblock.get(i, j) : 0;
+						c[cix+j] = genexecDense(valij, b, scalars, n, m, i, j); 
+						lnnz += (c[cix+j]!=0) ? 1 : 0;
+					}
+			}
+		}
+		else if( _type == CellType.ROW_AGG ) 
+		{
+			KahanObject kbuff = new KahanObject(0, 0);
+			KahanPlus kplus = KahanPlus.getKahanPlusFnObject();
+
+			if(val0 == 0 && b.length == 0) // sparse safe
+			{
+				if( sblock != null ) {
+					for( int i=rl; i<ru; i++ ) {
+						if( sblock.isEmpty(i) ) continue;
+						kbuff.set(0, 0);
+						int apos = sblock.pos(i);
+						int alen = sblock.size(i);
+						double[] avals = sblock.values(i);
+						for( int j=apos; j<apos+alen; j++ ) { //j is a position in avals, not a column
+							kplus.execute2(kbuff, genexecDense(avals[j], b, scalars, n, m, i, sblock.indexes(i)[j]));
+						}
+						c[i] = kbuff._sum; 
+						lnnz += (c[i]!=0) ? 1 : 0;	
+					}
+				}
+			}
+			else //sparse-unsafe
+			{
+				for(int i=rl; i<ru; i++) {
+					kbuff.set(0, 0);
+					for(int j=0; j<n; j++) {
+						double valij = (sblock != null) ? sblock.get(i, j) : 0;
+						kplus.execute2( kbuff, genexecDense(valij, b, scalars, n, m, i, j)); 
+					}
+					c[i] = kbuff._sum;
+					lnnz += (c[i]!=0) ? 1 : 0;
+				}
+			}
+		}
+		
+		return lnnz;
+	}
+
+	protected abstract double genexecDense( double a, double[][] b, double[] scalars, int n, int m, int rowIndex, int colIndex); //per-cell function implemented by subclasses; called by the execute* methods
+	
+	private class ParAggTask implements Callable<Double> //computes the aggregate over rows [_rl,_ru)
+	{
+		private final MatrixBlock _a;
+		private final double[][] _b;
+		private final double[] _scalars;
+		private final int _clen; //forwarded as parameter n
+		private final int _rlen; //forwarded as parameter m
+		private final int _rl;   //first row (inclusive)
+		private final int _ru;   //last row (exclusive)
+
+		protected ParAggTask( MatrixBlock a, double[][] b, double[] scalars, int clen, int rlen, int rl, int ru ) {
+			_a = a;
+			_b = b;
+			_scalars = scalars;
+			_clen = clen;
+			_rlen = rlen;
+			_rl = rl;
+			_ru = ru;
+		}
+		
+		@Override
+		public Double call() throws DMLRuntimeException {
+			return ( !_a.isInSparseFormat()) ?
+				executeDenseAndAgg(_a.getDenseBlock(), _b, _scalars, _clen, _rlen, _rl, _ru) :
+				executeSparseAndAgg(_a.getSparseBlock(), _b, _scalars, _clen, _rlen, _rl, _ru);
+		}
+	}
+
+	private class ParExecTask implements Callable<Long> //fills _c for rows [_rl,_ru) and returns their non-zero count
+	{
+		private final MatrixBlock _a;
+		private final double[][] _b;
+		private final double[] _scalars;
+		private final double[] _c; //output array handed to executeDense/executeSparse
+		private final int _clen;   //forwarded as parameter n
+		private final int _rlen;   //forwarded as parameter m
+		private final int _rl;     //first row (inclusive)
+		private final int _ru;     //last row (exclusive)
+
+		protected ParExecTask( MatrixBlock a, double[][] b, double[] scalars, double[] c, int clen, int rlen, int rl, int ru ) {
+			_a = a;
+			_b = b;
+			_scalars = scalars;
+			_c = c;
+			_clen = clen;
+			_rlen = rlen;
+			_rl = rl;
+			_ru = ru;
+		}
+		
+		@Override
+		public Long call() throws DMLRuntimeException {
+			return (!_a.isInSparseFormat()) ?
+					executeDense(_a.getDenseBlock(), _b, _scalars, _c, _clen, _rlen, _rl, _ru) :
+					executeSparse(_a.getSparseBlock(), _b, _scalars,  _c, _clen, _rlen, _rl, _ru);
+		}
+	}
+}

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/d7fd5879/src/main/java/org/apache/sysml/runtime/codegen/SpoofOperator.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/codegen/SpoofOperator.java b/src/main/java/org/apache/sysml/runtime/codegen/SpoofOperator.java
new file mode 100644
index 0000000..ddbf96d
--- /dev/null
+++ b/src/main/java/org/apache/sysml/runtime/codegen/SpoofOperator.java
@@ -0,0 +1,74 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ * 
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.sysml.runtime.codegen;
+
+import java.io.Serializable;
+import java.util.ArrayList;
+
+import org.apache.sysml.runtime.DMLRuntimeException;
+import org.apache.sysml.runtime.instructions.cp.ScalarObject;
+import org.apache.sysml.runtime.matrix.data.MatrixBlock;
+
+public abstract class SpoofOperator implements Serializable
+{
+	private static final long serialVersionUID = 3834006998853573319L;
+
+	public abstract void execute(ArrayList<MatrixBlock> inputs, ArrayList<ScalarObject> scalars, MatrixBlock out) 
+		throws DMLRuntimeException;
+	
+	public void execute(ArrayList<MatrixBlock> inputs, ArrayList<ScalarObject> scalars, MatrixBlock out, int k) 
+		throws DMLRuntimeException 
+	{
+		//default implementation: k is ignored and the serial variant is called
+		execute(inputs, scalars, out);
+	}
+	
+	public ScalarObject execute(ArrayList<MatrixBlock> inputs, ArrayList<ScalarObject> scalars) throws DMLRuntimeException {
+		throw new RuntimeException("Invalid invocation in base class."); //always throws unless overridden
+	}
+	
+	public ScalarObject execute(ArrayList<MatrixBlock> inputs, ArrayList<ScalarObject> scalars, int k) 
+		throws DMLRuntimeException 
+	{
+		//default implementation: k is ignored and the serial variant is called
+		return execute(inputs, scalars);
+	}
+	
+	protected double[][] prepInputMatrices(ArrayList<MatrixBlock> inputs) {
+		return prepInputMatrices(inputs, 1); //offset 1: the first input is not included in the result
+	}
+	
+	protected double[][] prepInputMatrices(ArrayList<MatrixBlock> inputs, int offset) {
+		double[][] b = new double[inputs.size()-offset][]; 
+		for(int i=offset; i < inputs.size(); i++) {
+			if( inputs.get(i).isEmptyBlock(false) && !inputs.get(i).isAllocated() )
+				inputs.get(i).allocateDenseBlock(); //NOTE(review): presumably so getDenseBlock() below is non-null - confirm
+			b[i-offset] = inputs.get(i).getDenseBlock();
+		}
+		return b;
+	}
+	
+	protected double[] prepInputScalars(ArrayList<ScalarObject> scalarObjects) {
+		double[] scalars = new double[scalarObjects.size()]; 
+		for(int i=0; i < scalarObjects.size(); i++)
+			scalars[i] = scalarObjects.get(i).getDoubleValue();
+		return scalars;
+	}
+}

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/d7fd5879/src/main/java/org/apache/sysml/runtime/codegen/SpoofOuterProduct.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/codegen/SpoofOuterProduct.java b/src/main/java/org/apache/sysml/runtime/codegen/SpoofOuterProduct.java
new file mode 100644
index 0000000..a23ea5a
--- /dev/null
+++ b/src/main/java/org/apache/sysml/runtime/codegen/SpoofOuterProduct.java
@@ -0,0 +1,541 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ * 
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.sysml.runtime.codegen;
+
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.List;
+import java.util.concurrent.Callable;
+import java.util.concurrent.ExecutorService;
+import java.util.concurrent.Executors;
+import java.util.concurrent.Future;
+
+import org.apache.sysml.runtime.DMLRuntimeException;
+import org.apache.sysml.runtime.instructions.cp.DoubleObject;
+import org.apache.sysml.runtime.instructions.cp.ScalarObject;
+import org.apache.sysml.runtime.matrix.data.MatrixBlock;
+import org.apache.sysml.runtime.matrix.data.SparseBlock;
+
+public abstract class SpoofOuterProduct extends SpoofOperator
+{
+	private static final long serialVersionUID = 2948612259863710279L;
+	
+	//assumed L2 cache size in bytes, used to bound the column block size
+	//of the sparse left outer product
+	private static final int L2_CACHESIZE = 256 * 1024; //256KB (common size)
+	
+	/**
+	 * Types of outer-product operators; the type determines the shape of
+	 * the result and which generated method is invoked per non-zero cell.
+	 */
+	public enum OutProdType {
+		LEFT_OUTER_PRODUCT,
+		RIGHT_OUTER_PRODUCT,
+		CELLWISE_OUTER_PRODUCT, // e.g., X*log(sigmoid(-(U%*%t(V))))
+		AGG_OUTER_PRODUCT		// e.g., sum(X*log(U%*%t(V)+eps))
+	}
+	
+	//type of this operator instance, set via setOuterProdType
+	protected OutProdType _outerProductType;
+	
+	public SpoofOuterProduct() {
+
+	}
+	
+	/**
+	 * Sets the outer-product type of this operator.
+	 * 
+	 * @param type outer-product type
+	 */
+	public void setOuterProdType(OutProdType type) {
+		_outerProductType = type;
+	}
+	
+	/**
+	 * Returns the outer-product type of this operator.
+	 * 
+	 * @return outer-product type, null if it has not been set
+	 */
+	public OutProdType getOuterProdType() {
+		return _outerProductType;
+	}
+	
+	/**
+	 * Single-threaded execution of an outer-product operator that
+	 * aggregates into a single scalar. The first three inputs are the
+	 * main matrix, U, and V; all further inputs are passed as side inputs.
+	 * 
+	 * @param inputs input matrices (at least three)
+	 * @param scalarObjects input scalars
+	 * @return aggregated value wrapped into a DoubleObject
+	 * @throws DMLRuntimeException if the execution fails
+	 */
+	@Override
+	public ScalarObject execute(ArrayList<MatrixBlock> inputs, ArrayList<ScalarObject> scalarObjects)	
+		throws DMLRuntimeException
+	{
+		//sanity check
+		if( inputs==null || inputs.size() < 3 )
+			throw new RuntimeException("Invalid input arguments.");
+		
+		//input preparation (side inputs start at position 3)
+		double[][] b = prepInputMatrices(inputs, 3);
+		double[] scalars = prepInputScalars(scalarObjects);
+		
+		//main inputs and dimensions
+		MatrixBlock a = inputs.get(0);
+		MatrixBlock u = inputs.get(1);
+		MatrixBlock v = inputs.get(2);
+		final int m = a.getNumRows();
+		final int n = a.getNumColumns();
+		final int k = u.getNumColumns(); // rank
+		
+		//1x1 dense block used as accumulator
+		MatrixBlock out = new MatrixBlock(1, 1, false);
+		out.allocateDenseBlock();
+		
+		//core sequential execute
+		if( !a.isInSparseFormat() )
+			executeCellwiseDense(a.getDenseBlock(), u.getDenseBlock(), v.getDenseBlock(), b, scalars, out.getDenseBlock(), n, m, k, _outerProductType, 0, m, 0, n);
+		else //pass nnz as long (no narrowing cast) as expected by the callee
+			executeCellwiseSparse(a.getSparseBlock(), u.getDenseBlock(), v.getDenseBlock(), b, scalars, out, n, m, k, a.getNonZeros(), _outerProductType, 0, m, 0, n);
+		return new DoubleObject(out.getDenseBlock()[0]);
+	}
+	
+	/**
+	 * Multi-threaded execution of an outer-product operator that
+	 * aggregates into a single scalar. The rows of the main input are
+	 * split into at most numThreads contiguous ranges; each task computes
+	 * a partial sum and the partial sums are added up.
+	 * 
+	 * @param inputs input matrices (at least three)
+	 * @param scalarObjects input scalars
+	 * @param numThreads degree of parallelism
+	 * @return aggregated value wrapped into a DoubleObject
+	 * @throws DMLRuntimeException if the parallel execution fails
+	 */
+	@Override
+	public ScalarObject execute(ArrayList<MatrixBlock> inputs, ArrayList<ScalarObject> scalarObjects, int numThreads)	
+		throws DMLRuntimeException
+	{
+		//sanity check
+		if( inputs==null || inputs.size() < 3 )
+			throw new RuntimeException("Invalid input arguments.");
+		
+		//input preparation (side inputs start at position 3)
+		double[][] b = prepInputMatrices(inputs, 3);
+		double[] scalars = prepInputScalars(scalarObjects);
+		
+		//dimensions
+		final int m = inputs.get(0).getNumRows();
+		final int n = inputs.get(0).getNumColumns();	
+		final int k = inputs.get(1).getNumColumns(); // rank
+		double sum = 0;
+		
+		ExecutorService pool = null;
+		try 
+		{
+			//thread pool sized by the degree of parallelism (not by the rank k)
+			pool = Executors.newFixedThreadPool(numThreads);
+			ArrayList<ParOuterProdAggTask> tasks = new ArrayList<ParOuterProdAggTask>();
+			//create tasks (parallelization over disjoint row ranges)
+			int blklen = (int)(Math.ceil((double)m/numThreads));
+			for( int i=0; i<numThreads && i*blklen<m; i++ )
+				tasks.add(new ParOuterProdAggTask(inputs.get(0), inputs.get(1).getDenseBlock(), inputs.get(2).getDenseBlock(), b, scalars, n, m, k, _outerProductType, i*blklen, Math.min((i+1)*blklen,m), 0, n));
+			//execute tasks and aggregate partial sums
+			List<Future<Double>> taskret = pool.invokeAll(tasks);
+			for( Future<Double> task : taskret )
+				sum += task.get();
+		} 
+		catch (Exception e) {
+			throw new DMLRuntimeException(e);
+		} 
+		finally {
+			//release the worker threads even if a task failed
+			if( pool != null )
+				pool.shutdown();
+		}
+		
+		return new DoubleObject(sum);	
+	}
+	
+	/**
+	 * Single-threaded execution of a left, right, or cell-wise outer-product
+	 * operator. The first three inputs are the main matrix, U, and V; all
+	 * further inputs are passed as side inputs. The output block is reset
+	 * to the result dimensions and filled by the generated code.
+	 * 
+	 * @param inputs input matrices (at least three)
+	 * @param scalarObjects input scalars
+	 * @param out output matrix block
+	 * @throws DMLRuntimeException if called for an aggregate outer product
+	 */
+	@Override
+	public void execute(ArrayList<MatrixBlock> inputs, ArrayList<ScalarObject> scalarObjects, MatrixBlock out)	
+		throws DMLRuntimeException
+	{
+		//sanity check
+		if( inputs==null || inputs.size() < 3 || out==null )
+			throw new RuntimeException("Invalid input arguments.");
+		
+		//check empty result
+		if(   (_outerProductType == OutProdType.LEFT_OUTER_PRODUCT && inputs.get(1).isEmptyBlock(false)) //U is empty
+				   || (_outerProductType == OutProdType.RIGHT_OUTER_PRODUCT &&  inputs.get(2).isEmptyBlock(false)) //V is empty
+				   || (_outerProductType == OutProdType.CELLWISE_OUTER_PRODUCT && inputs.get(0).isEmptyBlock(false))) {  //X is empty
+					out.examSparsity(); //turn empty dense into sparse
+					return; 
+		}
+		
+		//result allocation
+		if(_outerProductType == OutProdType.CELLWISE_OUTER_PRODUCT) {
+			//same size and sparse representation as the main input matrix
+			out.reset(inputs.get(0).getNumRows(), inputs.get(0).getNumColumns(), inputs.get(0).isInSparseFormat());
+			out.allocateDenseOrSparseBlock();
+		}		
+		else {	
+			if(_outerProductType == OutProdType.LEFT_OUTER_PRODUCT )
+				out.reset(inputs.get(0).getNumColumns(),inputs.get(1).getNumColumns()); // n*k
+			else if(_outerProductType == OutProdType.RIGHT_OUTER_PRODUCT )
+				out.reset(inputs.get(0).getNumRows(),inputs.get(1).getNumColumns()); // m*k
+			out.allocateDenseBlock();
+		}			
+		
+		//input preparation (side inputs start at position 3)
+		double[][] b = prepInputMatrices(inputs, 3);
+		double[] scalars = prepInputScalars(scalarObjects);
+				
+		//main inputs and dimensions
+		MatrixBlock a = inputs.get(0);
+		MatrixBlock u = inputs.get(1);
+		MatrixBlock v = inputs.get(2);
+		final int m = a.getNumRows();
+		final int n = a.getNumColumns();
+		final int k = u.getNumColumns(); // rank
+		
+		//core sequential execute
+		switch(_outerProductType) {
+			case LEFT_OUTER_PRODUCT:	
+			case RIGHT_OUTER_PRODUCT:
+				if( !a.isInSparseFormat() )
+					executeDense(a.getDenseBlock(), u.getDenseBlock(), v.getDenseBlock(), b, scalars, out.getDenseBlock(), n, m, k, _outerProductType, 0, m, 0, n);
+				else
+					executeSparse(a.getSparseBlock(), u.getDenseBlock(), v.getDenseBlock(), b, scalars, out.getDenseBlock(), n, m, k, (int) a.getNonZeros(), _outerProductType, 0, m, 0, n);
+				break;
+				
+			case CELLWISE_OUTER_PRODUCT:
+				if( !a.isInSparseFormat() )
+					executeCellwiseDense(a.getDenseBlock(), u.getDenseBlock(), v.getDenseBlock(), b, scalars, out.getDenseBlock(), n, m, k, _outerProductType, 0, m, 0, n);
+				else //pass nnz as long (no narrowing cast) as expected by the callee
+					executeCellwiseSparse(a.getSparseBlock(), u.getDenseBlock(), v.getDenseBlock(), b, scalars, out, n, m, k, a.getNonZeros(), _outerProductType, 0, m, 0, n);
+				break;
+	
+			case AGG_OUTER_PRODUCT:
+				throw new DMLRuntimeException("Wrong codepath for aggregate outer product.");	
+		}
+		
+		//post-processing
+		out.recomputeNonZeros();
+		out.examSparsity();
+	}
+	
+	/**
+	 * Multi-threaded execution of a left, right, or cell-wise outer-product
+	 * operator. For the left outer product the work is split over column
+	 * ranges of the main input, otherwise over row ranges. Each task
+	 * returns the number of non-zeros of its part of the output, which
+	 * are added up on the output block.
+	 * 
+	 * @param inputs input matrices (at least three)
+	 * @param scalarObjects input scalars
+	 * @param out output matrix block
+	 * @param numThreads degree of parallelism
+	 * @throws DMLRuntimeException if the parallel execution fails
+	 */
+	@Override
+	public void execute(ArrayList<MatrixBlock> inputs, ArrayList<ScalarObject> scalarObjects, MatrixBlock out, int numThreads)	
+		throws DMLRuntimeException
+	{
+		//sanity check
+		if( inputs==null || inputs.size() < 3 || out==null )
+			throw new RuntimeException("Invalid input arguments.");
+		
+		//check empty result
+		if(   (_outerProductType == OutProdType.LEFT_OUTER_PRODUCT && inputs.get(1).isEmptyBlock(false)) //U is empty
+				   || (_outerProductType == OutProdType.RIGHT_OUTER_PRODUCT &&  inputs.get(2).isEmptyBlock(false)) //V is empty
+				   || (_outerProductType == OutProdType.CELLWISE_OUTER_PRODUCT && inputs.get(0).isEmptyBlock(false))) {  //X is empty
+					out.examSparsity(); //turn empty dense into sparse
+					return; 
+		}
+		
+		//result allocation
+		if(_outerProductType == OutProdType.CELLWISE_OUTER_PRODUCT)
+		{
+			//same size and sparse representation as the main input matrix
+			out.reset(inputs.get(0).getNumRows(), inputs.get(0).getNumColumns(), inputs.get(0).isInSparseFormat());
+			out.allocateDenseOrSparseBlock();
+		}
+		else
+		{
+			if(_outerProductType == OutProdType.LEFT_OUTER_PRODUCT )
+				out.reset(inputs.get(0).getNumColumns(),inputs.get(1).getNumColumns()); // n*k
+			else if(_outerProductType == OutProdType.RIGHT_OUTER_PRODUCT )
+				out.reset(inputs.get(0).getNumRows(),inputs.get(1).getNumColumns()); // m*k
+			out.allocateDenseBlock();
+		}	
+		
+		//input preparation (side inputs start at position 3)
+		double[][] b = prepInputMatrices(inputs, 3);
+		double[] scalars = prepInputScalars(scalarObjects);
+		
+		//dimensions
+		final int m = inputs.get(0).getNumRows();
+		final int n = inputs.get(0).getNumColumns();	
+		final int k = inputs.get(1).getNumColumns(); // rank
+		
+		ExecutorService pool = null;
+		try 
+		{			
+			pool = Executors.newFixedThreadPool(numThreads);
+			ArrayList<ParExecTask> tasks = new ArrayList<ParExecTask>();			
+			//create tasks (for the left outer product, parallelization over columns;
+			//otherwise, parallelization over rows; both ensure disjoint results)
+			if( _outerProductType == OutProdType.LEFT_OUTER_PRODUCT ) {
+				int blklen = (int)(Math.ceil((double)n/numThreads));
+				for( int j=0; j<numThreads && j*blklen<n; j++ )
+					tasks.add(new ParExecTask(inputs.get(0), inputs.get(1).getDenseBlock(), inputs.get(2).getDenseBlock(), b, scalars, out, n, m, k, _outerProductType,  0, m, j*blklen, Math.min((j+1)*blklen, n)));
+			}
+			else { //right or cellwise
+				int blklen = (int)(Math.ceil((double)m/numThreads));
+				for( int i=0; i<numThreads && i*blklen<m; i++ )
+					tasks.add(new ParExecTask(inputs.get(0), inputs.get(1).getDenseBlock(), inputs.get(2).getDenseBlock(), b, scalars, out, n, m, k, _outerProductType, i*blklen, Math.min((i+1)*blklen,m), 0, n));
+			}
+			//execute tasks and aggregate the per-task non-zero counts
+			List<Future<Long>> taskret = pool.invokeAll(tasks);
+			for( Future<Long> task : taskret )
+				out.setNonZeros(out.getNonZeros() + task.get());
+		} 
+		catch (Exception e) {
+			throw new DMLRuntimeException(e);
+		} 
+		finally {
+			//release the worker threads even if a task failed
+			if( pool != null )
+				pool.shutdown();
+		}
+		
+		//post-processing
+		out.examSparsity();
+	}
+	
+	/**
+	 * Left/right outer product over a dense main input. Scans the cells
+	 * [rl,ru) x [cl,cu) of a in 16x16 tiles and invokes the generated
+	 * genexecDense for every non-zero cell. The output offset is the
+	 * V row offset for the left outer product and the U row offset otherwise.
+	 */
+	private void executeDense(double[] a, double[] u, double[] v, double[][] b, double[] scalars , double[] c, int n, int m, int k, OutProdType type, int rl, int ru, int cl, int cu ) 
+	{
+		final int tile = 16; //tile length in both dimensions
+		final boolean left = (type == OutProdType.LEFT_OUTER_PRODUCT);
+		
+		for( int rb = rl; rb < ru; rb += tile ) {
+			final int rbEnd = Math.min(ru, rb+tile);
+			for( int cb = cl; cb < cu; cb += tile ) {
+				final int cbEnd = Math.min(cu, cb+tile);
+				
+				//cells of the current tile
+				for( int i = rb; i < rbEnd; i++ ) {
+					final int aoff = i*n; //row offset in a
+					final int uoff = i*k; //row offset in u
+					for( int j = cb; j < cbEnd; j++ ) {
+						final double aval = a[aoff+j];
+						if( aval == 0 )
+							continue; //only non-zero cells are processed
+						final int voff = j*k; //row offset in v
+						genexecDense( aval, u, uoff, v, voff, b, scalars, c, 
+							(left ? voff : uoff), n, m, k, i, j );
+					}
+				}
+			}
+		}
+	}
+	
+	/**
+	 * Cell-wise or aggregating outer product over a dense main input.
+	 * Scans the cells [rl,ru) x [cl,cu) of a in 16x16 tiles and invokes the
+	 * generated genexecCellwise for every non-zero cell. For the cell-wise
+	 * type the value is stored at the same position in c; for all other
+	 * types it is added to c[0].
+	 */
+	private void executeCellwiseDense(double[] a, double[] u, double[] v, double[][] b, double[] scalars , double[] c, int n, int m, int k, OutProdType type, int rl, int ru, int cl, int cu ) 
+	{
+		final int tile = 16; //tile length in both dimensions
+		final boolean cellwise = (type == OutProdType.CELLWISE_OUTER_PRODUCT);
+		
+		for( int rb = rl; rb < ru; rb += tile ) {
+			final int rbEnd = Math.min(ru, rb+tile);
+			for( int cb = cl; cb < cu; cb += tile ) {
+				final int cbEnd = Math.min(cu, cb+tile);
+				
+				//cells of the current tile
+				for( int i = rb; i < rbEnd; i++ ) {
+					final int aoff = i*n; //row offset in a (and c)
+					final int uoff = i*k; //row offset in u
+					for( int j = cb; j < cbEnd; j++ ) {
+						final double aval = a[aoff+j];
+						if( aval == 0 )
+							continue; //only non-zero cells are processed
+						final int voff = j*k; //row offset in v
+						if( cellwise )
+							c[aoff+j] = genexecCellwise( aval, u, uoff, v, voff, b, scalars, n, m, k, i, j );
+						else
+							c[0] += genexecCellwise( aval, u, uoff, v, voff, b, scalars, n, m, k, i, j );
+					}
+				}
+			}
+		}
+	}
+	
+	/**
+	 * Left/right outer product over a sparse main input. Iterates over the
+	 * non-zeros of rows [rl,ru) with column indexes in [cl,cu), blocked over
+	 * rows and columns, and invokes the generated genexecDense for each of
+	 * them. Per row of the current row block, curk keeps the position (relative
+	 * to the row start) of the next non-zero to process, so that consecutive
+	 * column blocks continue where the previous one stopped.
+	 */
+	private void executeSparse(SparseBlock sblock,  double[] u, double[] v, double[][] b, double[] scalars , double[] c, int n, int m, int k, int nnz, OutProdType type, int rl, int ru, int cl, int cu) 
+	{
+		//note: derived from the operator field, not from the type parameter
+		boolean left = (_outerProductType== OutProdType.LEFT_OUTER_PRODUCT);
+		
+		//approach: iterate over non-zeros of the sparse input, selective computation
+		//blocked over ij, while maintaining a front of column positions, where the
+		//blocksize is chosen such that we reuse each Ui/Vj vector on average 8 times,
+		//with custom blocksizeJ for the left outer product to avoid LLC misses on output.
+		//NOTE(review): nnz==0 would divide by zero here, and very sparse inputs could
+		//overflow the int cast; callers are presumably expected to rule this out - confirm
+		final int blocksizeI = (int) (8L*m*n/nnz);
+		final int blocksizeJ = left ? Math.max(8,Math.min(L2_CACHESIZE/(k*8), blocksizeI)) : blocksizeI;
+		int[] curk = new int[blocksizeI];
+		
+		for( int bi = rl; bi < ru; bi+=blocksizeI ) 
+		{
+			int bimin = Math.min(ru, bi+blocksizeI);
+			//prepare starting positions for block row; if no position is
+			//found (negative result), n is used so that the row is skipped
+			for( int i=bi; i<bimin; i++ ) {
+				int index = (cl==0||sblock.isEmpty(i)) ? 0 : sblock.posFIndexGTE(i,cl);
+				curk[i-bi] = (index>=0) ? index : n;
+			}
+			
+			//blocked execution over column blocks
+			for( int bj = cl; bj < cu; bj+=blocksizeJ ) 
+			{
+				int bjmin = Math.min(cu, bj+blocksizeJ);
+				//core computation over the rows of the current block
+				for( int i=bi, uix=bi*k; i<bimin; i++, uix+=k ) {
+					if( sblock.isEmpty(i) ) continue;
+					
+					int wpos = sblock.pos(i);
+					int wlen = sblock.size(i);
+					int[] wix = sblock.indexes(i);
+					double[] wval = sblock.values(i);				
+
+					//continue at the stored position and stop at the column block end
+					int index = wpos + curk[i-bi];
+					for( ; index<wpos+wlen && wix[index]<bjmin; index++ ) {
+						genexecDense( wval[index], u, uix, v, wix[index]*k, b, scalars, c, 
+								(left ? wix[index]*k : uix), n, m, k, i, wix[index] );
+					}
+					//remember the position for the next column block
+					curk[i-bi] = index - wpos;
+				}
+			}
+		}
+	}
+	
+	/**
+	 * Cell-wise or aggregating outer product over a sparse main input.
+	 * Iterates over the non-zeros of rows [rl,ru), blocked over rows and
+	 * columns, and invokes the generated genexecCellwise for each of them.
+	 * With a dense output, the value is stored at cell (i, column) for the
+	 * cell-wise type and added to the first output cell otherwise. With a
+	 * sparse output, the value is appended at (i, column). Column blocking
+	 * always covers [0,n); cl and cu are not used.
+	 */
+	private void executeCellwiseSparse(SparseBlock sblock, double[] u, double[] v, double[][] b, double[] scalars , MatrixBlock out, int n, int m, int k, long nnz, OutProdType type, int rl, int ru, int cl, int cu ) 
+	{
+		//blocksize chosen for an average 8x reuse of each Ui/Vj vector; guarded 
+		//against nnz==0 (division by zero) and clamped to the matrix dimensions,
+		//which avoids int overflows without changing the iteration order
+		final int blocksizeIJ = (int) Math.max(1, Math.min(8L*m*n/Math.max(nnz,1), Math.max(m,n)));
+		//per row of the current row block: position (relative to the row start) 
+		//of the next non-zero to process
+		int[] curk = new int[blocksizeIJ];			
+		
+		if( !out.isInSparseFormat() ) //DENSE
+		{
+			double[] c = out.getDenseBlock();
+			for( int bi=rl; bi<ru; bi+=blocksizeIJ ) {
+				int bimin = Math.min(ru, bi+blocksizeIJ);
+				//prepare starting indexes for block row
+				Arrays.fill(curk, 0); 
+				//blocked execution over column blocks
+				for( int bj=0; bj<n; bj+=blocksizeIJ ) {
+					int bjmin = Math.min(n, bj+blocksizeIJ);
+					for( int i=bi, uix=bi*k; i<bimin; i++, uix+=k ) {
+						if( sblock.isEmpty(i) ) continue;
+						int wpos = sblock.pos(i);
+						int wlen = sblock.size(i);
+						int[] wix = sblock.indexes(i);
+						double[] wval = sblock.values(i);
+						int index = wpos + curk[i-bi];
+						for( ; index<wpos+wlen && wix[index]<bjmin; index++ ) {
+							if(type == OutProdType.CELLWISE_OUTER_PRODUCT) //output cell (i, column), not the array position
+								c[i*n+wix[index]] = genexecCellwise( wval[index], u, uix, v, wix[index]*k, b, scalars, n, m, k, i, wix[index] ); 
+							else
+								c[0] += genexecCellwise( wval[index], u, uix, v, wix[index]*k, b, scalars, n, m, k, i, wix[index]);
+						}
+						curk[i-bi] = index - wpos;
+					}
+				}
+			}
+		}
+		else //SPARSE
+		{
+			SparseBlock c = out.getSparseBlock();
+			for( int bi=rl; bi<ru; bi+=blocksizeIJ ) {
+				int bimin = Math.min(ru, bi+blocksizeIJ);
+				//prepare starting indexes for block row
+				Arrays.fill(curk, 0); 
+				//blocked execution over column blocks
+				for( int bj=0; bj<n; bj+=blocksizeIJ ) {
+					int bjmin = Math.min(n, bj+blocksizeIJ);
+					for( int i=bi, uix=bi*k; i<bimin; i++, uix+=k ) {
+						if( sblock.isEmpty(i) ) continue;
+						int wpos = sblock.pos(i);
+						int wlen = sblock.size(i);
+						int[] wix = sblock.indexes(i);
+						double[] wval = sblock.values(i);
+						int index = wpos + curk[i-bi];
+						for( ; index<wpos+wlen && wix[index]<bjmin; index++ ) {
+							//append at the column index of the non-zero, not at its array position
+							c.append(i, wix[index], genexecCellwise( wval[index], u, uix, v, 
+									wix[index]*k, b, scalars, n, m, k, i, wix[index] )); 
+						}
+						curk[i-bi] = index - wpos;
+					}
+				}
+			}
+		}
+	}
+
+	/**
+	 * Generated method for left/right outer products, invoked once per
+	 * non-zero cell of the main input.
+	 * 
+	 * @param a value of the current cell
+	 * @param u values of U
+	 * @param ui offset into u for the current row
+	 * @param v values of V
+	 * @param vi offset into v for the current column
+	 * @param b dense blocks of the side inputs
+	 * @param scalars scalar inputs
+	 * @param c output values
+	 * @param ci offset into c for the current cell
+	 * @param n number of columns of the main input
+	 * @param m number of rows of the main input
+	 * @param k number of columns of U (rank)
+	 * @param rowIndex row index of the current cell
+	 * @param colIndex column index of the current cell
+	 */
+	protected abstract void genexecDense( double a, double[] u, int ui, double[] v, int vi, double[][] b, double[] scalars , double[] c, int ci, int n, int m, int k, int rowIndex, int colIndex );
+	
+	/**
+	 * Generated method for cell-wise and aggregating outer products, invoked
+	 * once per non-zero cell of the main input.
+	 * 
+	 * @param a value of the current cell
+	 * @param u values of U
+	 * @param ui offset into u for the current row
+	 * @param v values of V
+	 * @param vi offset into v for the current column
+	 * @param b dense blocks of the side inputs
+	 * @param scalars scalar inputs
+	 * @param n number of columns of the main input
+	 * @param m number of rows of the main input
+	 * @param k number of columns of U (rank)
+	 * @param rowIndex row index of the current cell
+	 * @param colIndex column index of the current cell
+	 * @return computed value for the current cell
+	 */
+	protected abstract double genexecCellwise( double a, double[] u, int ui, double[] v, int vi, double[][] b, double[] scalars , int n, int m, int k, int rowIndex, int colIndex);
+
+	/**
+	 * Task for multi-threaded left, right, and cell-wise outer products.
+	 * Each task processes the cells [rl,ru) x [cl,cu) of the main input,
+	 * writes into the shared output block, and returns the number of
+	 * non-zeros of the output rows it is responsible for.
+	 */
+	private class ParExecTask implements Callable<Long> 
+	{
+		private final MatrixBlock _a;
+		private final double[] _u;
+		private final double[] _v;
+		private final double[][] _b;
+		private final double[] _scalars;
+		private final MatrixBlock _c;
+		private final int _clen;
+		private final int _rlen;
+		private final int _k;
+		private final OutProdType _type;
+		private final int _rl;
+		private final int _ru;
+		private final int _cl;
+		private final int _cu;
+		
+		protected ParExecTask( MatrixBlock a, double[] u, double[] v, double[][] b, double[] scalars , MatrixBlock c, int clen, int rlen, int k, OutProdType type, int rl, int ru, int cl, int cu ) {
+			_a = a;
+			_u = u;
+			_v = v;
+			_b = b;
+			_c = c;
+			_scalars = scalars;
+			_clen = clen;
+			_rlen = rlen;
+			_k = k;
+			_type = type;
+			_rl = rl;
+			_ru = ru;
+			_cl = cl;
+			_cu = cu;
+		}
+		
+		@Override
+		public Long call() throws DMLRuntimeException {
+			switch(_type)
+			{
+				case LEFT_OUTER_PRODUCT:	
+				case RIGHT_OUTER_PRODUCT:
+					if( !_a.isInSparseFormat() )
+						executeDense(_a.getDenseBlock(), _u, _v, _b, _scalars, _c.getDenseBlock(), _clen, _rlen, _k, _type, _rl, _ru, _cl, _cu);
+					else
+						executeSparse(_a.getSparseBlock(), _u, _v, _b, _scalars, _c.getDenseBlock(), _clen, _rlen, _k, (int) _a.getNonZeros(), _type,  _rl, _ru, _cl, _cu);
+					break;
+				case CELLWISE_OUTER_PRODUCT:
+					if( !_c.isInSparseFormat() )
+						executeCellwiseDense(_a.getDenseBlock(), _u, _v, _b, _scalars, _c.getDenseBlock(), _clen, _rlen, _k, _type, _rl, _ru, _cl, _cu);
+					else //pass nnz as long (no narrowing cast) as expected by the callee
+						executeCellwiseSparse(_a.getSparseBlock(), _u, _v, _b, _scalars, _c, _clen, _rlen, _k, _a.getNonZeros(), _type,  _rl, _ru, _cl, _cu);
+					break;			
+				case AGG_OUTER_PRODUCT:
+					throw new DMLRuntimeException("Wrong codepath for aggregate outer product.");
+			}
+			
+			//the left outer product is parallelized over input columns, which
+			//correspond to output rows; use the task's own type consistently
+			boolean left = (_type == OutProdType.LEFT_OUTER_PRODUCT);
+			int rl = left ? _cl : _rl;
+			int ru = left ? _cu : _ru;
+			return _c.recomputeNonZeros(rl, ru-1, 0, _c.getNumColumns()-1);
+		}
+	}
+	
+	/**
+	 * Task for multi-threaded aggregating outer products. Each task
+	 * processes the cells [rl,ru) x [cl,cu) of the main input with its
+	 * own 1x1 accumulator block and returns the accumulated value.
+	 */
+	private class ParOuterProdAggTask implements Callable<Double> 
+	{
+		//main input and dense U/V values
+		private final MatrixBlock _a;
+		private final double[] _u;
+		private final double[] _v;
+		//side inputs and scalars
+		private final double[][] _b;
+		private final double[] _scalars;
+		//dimensions and operator type
+		private final int _clen;
+		private final int _rlen;
+		private final int _k;
+		private final OutProdType _type;
+		//row and column range of this task
+		private final int _rl;
+		private final int _ru;
+		private final int _cl;
+		private final int _cu;
+		
+		protected ParOuterProdAggTask( MatrixBlock a, double[] u, double[] v, double[][] b, double[] scalars, int clen, int rlen, int k, OutProdType type, int rl, int ru, int cl, int cu ) {
+			_a = a;
+			_u = u;
+			_v = v;
+			_b = b;
+			_scalars = scalars;
+			_clen = clen;
+			_rlen = rlen;
+			_k = k;
+			_type = type;
+			_rl = rl;
+			_ru = ru;
+			_cl = cl;
+			_cu = cu;
+		}
+		
+		@Override
+		public Double call() throws DMLRuntimeException {
+			//task-local 1x1 accumulator
+			MatrixBlock acc = new MatrixBlock(1, 1, false);
+			acc.allocateDenseBlock();
+			
+			if( _a.isInSparseFormat() ) {
+				executeCellwiseSparse(_a.getSparseBlock(), _u, _v, _b, _scalars, acc, 
+					_clen, _rlen, _k, _a.getNonZeros(), _type, _rl, _ru, _cl, _cu);
+			}
+			else {
+				executeCellwiseDense(_a.getDenseBlock(), _u, _v, _b, _scalars, acc.getDenseBlock(), 
+					_clen, _rlen, _k, _type, _rl, _ru, _cl, _cu);
+			}
+			
+			return acc.getDenseBlock()[0];
+		}
+	}
+}

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/d7fd5879/src/main/java/org/apache/sysml/runtime/codegen/SpoofRowAggregate.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/codegen/SpoofRowAggregate.java b/src/main/java/org/apache/sysml/runtime/codegen/SpoofRowAggregate.java
new file mode 100644
index 0000000..c0b58a8
--- /dev/null
+++ b/src/main/java/org/apache/sysml/runtime/codegen/SpoofRowAggregate.java
@@ -0,0 +1,188 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ * 
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.sysml.runtime.codegen;
+
+import java.util.ArrayList;
+import java.util.List;
+import java.util.concurrent.Callable;
+import java.util.concurrent.ExecutorService;
+import java.util.concurrent.Executors;
+import java.util.concurrent.Future;
+
+import org.apache.sysml.runtime.DMLRuntimeException;
+import org.apache.sysml.runtime.instructions.cp.ScalarObject;
+import org.apache.sysml.runtime.matrix.data.LibMatrixMult;
+import org.apache.sysml.runtime.matrix.data.MatrixBlock;
+import org.apache.sysml.runtime.matrix.data.SparseBlock;
+import org.apache.sysml.runtime.util.UtilFunctions;
+
+
+public abstract class SpoofRowAggregate extends SpoofOperator
+{
+	private static final long serialVersionUID = 6242910797139642998L;
+	//minimum number of input cells (rows x columns) for multi-threaded
+	//execution; smaller inputs are redirected to the serial execute
+	private static final long PAR_NUMCELL_THRESHOLD = 1024*1024;   //Min 1M elements
+	
+	//if true, the output is allocated as n x 1 instead of 1 x n,
+	//where n is the number of columns of the first input
+	protected boolean _colVector = false;
+	
+	public SpoofRowAggregate() {
+
+	}
+
+	/**
+	 * Single-threaded execution of a row-aggregate operator. The output is
+	 * reset to a dense vector whose length is the number of columns of the
+	 * first input, and the generated row method is invoked for all rows.
+	 * 
+	 * @param inputs input matrices (at least one)
+	 * @param scalarObjects input scalars
+	 * @param out output matrix block
+	 * @throws DMLRuntimeException if the execution fails
+	 */
+	@Override
+	public void execute(ArrayList<MatrixBlock> inputs, ArrayList<ScalarObject> scalarObjects, MatrixBlock out)	
+		throws DMLRuntimeException
+	{
+		//sanity check
+		if( inputs==null || inputs.size() < 1 || out==null )
+			throw new RuntimeException("Invalid input arguments.");
+		
+		//result allocation: n x 1 column vector or 1 x n row vector
+		MatrixBlock a = inputs.get(0);
+		if( _colVector )
+			out.reset(a.getNumColumns(), 1, false);
+		else
+			out.reset(1, a.getNumColumns(), false);
+		out.allocateDenseBlock();
+		double[] c = out.getDenseBlock();
+		
+		//side inputs and scalars
+		double[][] b = prepInputMatrices(inputs);
+		double[] scalars = prepInputScalars(scalarObjects);
+		
+		//core sequential execute over all rows
+		final int m = a.getNumRows();
+		final int n = a.getNumColumns();
+		if( a.isInSparseFormat() )
+			executeSparse(a.getSparseBlock(), b, scalars, c, n, 0, m);
+		else
+			executeDense(a.getDenseBlock(), b, scalars, c, n, 0, m);
+		
+		//post-processing
+		out.recomputeNonZeros();
+	}
+	
+	/**
+	 * Multi-threaded execution of a row-aggregate operator. Falls back to
+	 * the serial execute for k&lt;=1 or inputs below the cell threshold.
+	 * Otherwise, the rows of the first input are split into contiguous
+	 * ranges; each task aggregates into its own vector and the partial
+	 * vectors are added into the output.
+	 * 
+	 * @param inputs input matrices (at least one)
+	 * @param scalarObjects input scalars
+	 * @param out output matrix block
+	 * @param k degree of parallelism
+	 * @throws DMLRuntimeException if the parallel execution fails
+	 */
+	@Override
+	public void execute(ArrayList<MatrixBlock> inputs, ArrayList<ScalarObject> scalarObjects, MatrixBlock out, int k)	
+		throws DMLRuntimeException
+	{
+		//sanity check (before the first access to the inputs)
+		if( inputs==null || inputs.size() < 1 || out==null )
+			throw new RuntimeException("Invalid input arguments.");
+		
+		//redirect to serial execution
+		if( k <= 1 || (long)inputs.get(0).getNumRows()*inputs.get(0).getNumColumns()<PAR_NUMCELL_THRESHOLD ) {
+			execute(inputs, scalarObjects, out);
+			return;
+		}
+		
+		//result allocation and preparations
+		out.reset(_colVector ? inputs.get(0).getNumColumns() : 1, 
+			_colVector ? 1 : inputs.get(0).getNumColumns(), false);
+		out.allocateDenseBlock();
+		
+		//input preparation
+		double[][] b = prepInputMatrices(inputs);
+		double[] scalars = prepInputScalars(scalarObjects);
+		
+		//core parallel execute
+		final int m = inputs.get(0).getNumRows();
+		final int n = inputs.get(0).getNumColumns();		
+		ExecutorService pool = null;
+		try {
+			pool = Executors.newFixedThreadPool( k );
+			ArrayList<ParExecTask> tasks = new ArrayList<ParExecTask>();
+			int nk = UtilFunctions.roundToNext(Math.min(8*k,m/32), k);
+			int blklen = (int)(Math.ceil((double)m/nk));
+			for( int i=0; i<nk && i*blklen<m; i++ )
+				tasks.add(new ParExecTask(inputs.get(0), b, scalars, n, i*blklen, Math.min((i+1)*blklen, m)));
+			//execute tasks
+			List<Future<double[]>> taskret = pool.invokeAll(tasks);	
+			//aggregate partial results
+			for( Future<double[]> task : taskret )
+				LibMatrixMult.vectAdd(task.get(), out.getDenseBlock(), 0, 0, n);
+		}
+		catch(Exception ex) {
+			throw new DMLRuntimeException(ex);
+		}
+		finally {
+			//release the worker threads even if a task failed
+			if( pool != null )
+				pool.shutdown();
+		}
+		
+		//post-processing
+		out.recomputeNonZeros();	
+	}
+	
+	/**
+	 * Invokes the generated dense row method for the rows [rl,ru) of a
+	 * dense input with n columns, passing the offset of each row in a.
+	 */
+	private void executeDense(double[] a, double[][] b, double[] scalars, double[] c, int n, int rl, int ru) 
+	{
+		int rowOffset = rl*n;
+		for( int row=rl; row<ru; row++ ) {
+			//call generated method
+			genexecRowDense( a, rowOffset, b, scalars, c, n, row );
+			rowOffset += n;
+		}
+	}
+	
+	/**
+	 * Invokes the generated sparse row method for all non-empty rows in
+	 * [rl,ru) of a sparse input, passing the values, column indexes, start
+	 * position, and length of each row.
+	 */
+	private void executeSparse(SparseBlock sblock, double[][] b, double[] scalars, double[] c, int n, int rl, int ru) 
+	{
+		for( int row=rl; row<ru; row++ ) {
+			//empty rows do not contribute
+			if( sblock.isEmpty(row) )
+				continue;
+			
+			double[] vals = sblock.values(row);
+			int[] cols = sblock.indexes(row);
+			int start = sblock.pos(row);
+			int len = sblock.size(row);
+			
+			//call generated method
+			genexecRowSparse(vals, cols, start, b, scalars, c, len, row);
+		}
+	}
+	
+	//methods to be implemented by generated operators of type SpoofRowAggregate 
+	
+	/**
+	 * Generated method invoked once per row of a dense input.
+	 * 
+	 * @param a dense values of the input
+	 * @param ai offset of the current row in a
+	 * @param b dense blocks of the side inputs
+	 * @param scalars scalar inputs
+	 * @param c output values
+	 * @param len number of columns of the input
+	 * @param rowIndex index of the current row
+	 */
+	protected abstract void genexecRowDense( double[] a, int ai, double[][] b, double[] scalars, double[] c, int len, int rowIndex );
+	
+	/**
+	 * Generated method invoked once per non-empty row of a sparse input.
+	 * 
+	 * @param avals values of the sparse row
+	 * @param aix column indexes of the sparse row
+	 * @param ai start position of the row in avals and aix
+	 * @param b dense blocks of the side inputs
+	 * @param scalars scalar inputs
+	 * @param c output values
+	 * @param len number of entries of the sparse row
+	 * @param rowIndex index of the current row
+	 */
+	protected abstract void genexecRowSparse( double[] avals, int[] aix, int ai, double[][] b, double[] scalars, double[] c, int len, int rowIndex );
+
+	
+	/**
+	 * Task for multi-threaded operations. Each task aggregates the rows
+	 * [rl,ru) of the input into its own result vector of length clen,
+	 * which is returned to the caller for the final aggregation.
+	 */
+	private class ParExecTask implements Callable<double[]> 
+	{
+		//main input, side inputs, and scalars
+		private final MatrixBlock _a;
+		private final double[][] _b;
+		private final double[] _scalars;
+		//number of columns and row range of this task
+		private final int _clen;
+		private final int _rl;
+		private final int _ru;
+
+		protected ParExecTask( MatrixBlock a, double[][] b, double[] scalars, int clen, int rl, int ru ) {
+			_a = a;
+			_b = b;
+			_scalars = scalars;
+			_clen = clen;
+			_rl = rl;
+			_ru = ru;
+		}
+		
+		@Override
+		public double[] call() throws DMLRuntimeException {
+			//task-local partial result
+			double[] partial = new double[_clen];
+			if( _a.isInSparseFormat() )
+				executeSparse(_a.getSparseBlock(), _b, _scalars, partial, _clen, _rl, _ru);
+			else
+				executeDense(_a.getDenseBlock(), _b, _scalars, partial, _clen, _rl, _ru);
+			return partial;
+		}
+	}

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/d7fd5879/src/main/java/org/apache/sysml/runtime/controlprogram/parfor/util/IDSequence.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/controlprogram/parfor/util/IDSequence.java b/src/main/java/org/apache/sysml/runtime/controlprogram/parfor/util/IDSequence.java
index 1d173bb..24056dc 100644
--- a/src/main/java/org/apache/sysml/runtime/controlprogram/parfor/util/IDSequence.java
+++ b/src/main/java/org/apache/sysml/runtime/controlprogram/parfor/util/IDSequence.java
@@ -25,8 +25,6 @@ package org.apache.sysml.runtime.controlprogram.parfor.util;
  */
 public class IDSequence 
 {
-
-	
 	private long _current = -1;
 	private boolean wrapAround = false;
 	
@@ -60,22 +58,11 @@ public class IDSequence
 		return _current;
 	}
 	
-	public synchronized void reset()
-	{
-		_current = 0;
-	}
-	
-	/*
-	private AtomicLong _seq = new AtomicLong(0);
-	
-	public long getNextID()
-	{
-		return _seq.getAndIncrement();
+	public synchronized long getCurrentID() {
+		return _current;
 	}
 	
-	public void reset()
-	{
-		_seq = new AtomicLong( 0 );
+	public synchronized void reset() {
+		_current = 0;
 	}
-	*/
 }

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/d7fd5879/src/main/java/org/apache/sysml/runtime/instructions/cp/RelationalBinaryCPInstruction.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/instructions/cp/RelationalBinaryCPInstruction.java b/src/main/java/org/apache/sysml/runtime/instructions/cp/RelationalBinaryCPInstruction.java
index c749764..dedfe56 100644
--- a/src/main/java/org/apache/sysml/runtime/instructions/cp/RelationalBinaryCPInstruction.java
+++ b/src/main/java/org/apache/sysml/runtime/instructions/cp/RelationalBinaryCPInstruction.java
@@ -28,9 +28,7 @@ import org.apache.sysml.runtime.matrix.operators.Operator;
 
 public abstract class RelationalBinaryCPInstruction extends BinaryCPInstruction 
 {
-	
-	public RelationalBinaryCPInstruction(Operator op, CPOperand in1, CPOperand in2, CPOperand out, String opcode, String istr )
-	{
+	public RelationalBinaryCPInstruction(Operator op, CPOperand in1, CPOperand in2, CPOperand out, String opcode, String istr ) {
 		super(op, in1, in2, out, opcode, istr);
 		_cptype = CPINSTRUCTION_TYPE.RelationalBinary;
 	}
@@ -42,34 +40,19 @@ public abstract class RelationalBinaryCPInstruction extends BinaryCPInstruction
 		CPOperand out = new CPOperand("", ValueType.UNKNOWN, DataType.UNKNOWN);
 		String opcode = parseBinaryInstruction(str, in1, in2, out);
 		
-		// TODO: Relational operations need not have value type checking
 		ValueType vt1 = in1.getValueType();
 		DataType dt1 = in1.getDataType();
-		ValueType vt2 = in2.getValueType();
 		DataType dt2 = in2.getDataType();
 		DataType dt3 = out.getDataType();
 		
-		//if ( vt3 != ValueType.BOOLEAN )
-		//	throw new DMLRuntimeException("Unexpected ValueType in RelationalCPInstruction: " + str);
-		
-		if ( vt1 == ValueType.BOOLEAN && !opcode.equalsIgnoreCase("==") && !opcode.equalsIgnoreCase("!=") ) 
+		if( vt1 == ValueType.BOOLEAN && !opcode.equalsIgnoreCase("==") && !opcode.equalsIgnoreCase("!=") ) 
 			throw new DMLRuntimeException("Operation " + opcode + " can not be applied on boolean values "
 					 					  + "(Instruction = " + str + ").");
 		
-		//prithvi TODO
-		//make sure these checks belong here
-		//if either input is a matrix, then output
-		//has to be a matrix
-		if((dt1 == DataType.MATRIX 
-			|| dt2 == DataType.MATRIX) 
-		   && dt3 != DataType.MATRIX)
-			throw new DMLRuntimeException("Element-wise matrix operations between variables "
-										  + in1.getName()
-										  + " and "
-										  + in2.getName()
-										  + " must produce a matrix, which "
-										  + out.getName()
-										  + " is not");
+		// check for valid data type of output
+		if((dt1 == DataType.MATRIX || dt2 == DataType.MATRIX) && dt3 != DataType.MATRIX)
+			throw new DMLRuntimeException("Element-wise matrix operations between variables " + in1.getName() + 
+					" and " + in2.getName() + " must produce a matrix, which " + out.getName() + " is not");
 		
 		Operator operator = (dt1 != dt2) ?
 					InstructionUtils.parseScalarBinaryOperator(opcode, (dt1 == DataType.SCALAR)) : 
@@ -77,24 +60,11 @@ public abstract class RelationalBinaryCPInstruction extends BinaryCPInstruction
 		
 		//for scalar relational operations we only allow boolean operands
 		//or when both operands are numeric (int or double)
-		if(dt1 == DataType.SCALAR && dt2 == DataType.SCALAR){
-			if (!(  (vt1 == ValueType.BOOLEAN && vt2 == ValueType.BOOLEAN)
-				  ||(vt1 == ValueType.STRING && vt2 == ValueType.STRING)
-				  ||( (vt1 == ValueType.DOUBLE || vt1 == ValueType.INT) && (vt2 == ValueType.DOUBLE || vt2 == ValueType.INT))))
-			{
-				throw new DMLRuntimeException("unexpected value-type in "
-											  + "Relational Binary Instruction "
-											  + "involving scalar operands.");
-			}
+		if(dt1 == DataType.SCALAR && dt2 == DataType.SCALAR)
 			return new ScalarScalarRelationalCPInstruction(operator, in1, in2, out, opcode, str);
-		
-		}else if (dt1 == DataType.MATRIX || dt2 == DataType.MATRIX){
-			if(dt1 == DataType.MATRIX && dt2 == DataType.MATRIX)
-				return new MatrixMatrixRelationalCPInstruction(operator, in1, in2, out, opcode, str);
-			else
-				return new ScalarMatrixRelationalCPInstruction(operator, in1, in2, out, opcode, str);
-		}
-		
-		return null;
+		else if(dt1 == DataType.MATRIX && dt2 == DataType.MATRIX)
+			return new MatrixMatrixRelationalCPInstruction(operator, in1, in2, out, opcode, str);
+		else
+			return new ScalarMatrixRelationalCPInstruction(operator, in1, in2, out, opcode, str);
 	}
 }

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/d7fd5879/src/main/java/org/apache/sysml/runtime/instructions/cp/ScalarScalarRelationalCPInstruction.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/instructions/cp/ScalarScalarRelationalCPInstruction.java b/src/main/java/org/apache/sysml/runtime/instructions/cp/ScalarScalarRelationalCPInstruction.java
index 68607fb..ca2b12e 100644
--- a/src/main/java/org/apache/sysml/runtime/instructions/cp/ScalarScalarRelationalCPInstruction.java
+++ b/src/main/java/org/apache/sysml/runtime/instructions/cp/ScalarScalarRelationalCPInstruction.java
@@ -47,7 +47,8 @@ public class ScalarScalarRelationalCPInstruction extends RelationalBinaryCPInstr
 		
 		BinaryOperator dop = (BinaryOperator) _optr;
 		
-		if ( so1 instanceof IntObject && so2 instanceof IntObject ) {
+		if ( (so1 instanceof IntObject || so1 instanceof BooleanObject) 
+				&& (so2 instanceof IntObject || so2 instanceof BooleanObject)  ) {
 			boolean rval = dop.fn.compare ( so1.getLongValue(), so2.getLongValue() );
 			sores = (ScalarObject) new BooleanObject(rval); 
 		}
@@ -55,14 +56,6 @@ public class ScalarScalarRelationalCPInstruction extends RelationalBinaryCPInstr
 			boolean rval = dop.fn.compare ( so1.getDoubleValue(), so2.getDoubleValue() );
 			sores = (ScalarObject) new BooleanObject(rval); 
 		}
-		else if ( so1 instanceof IntObject && so2 instanceof DoubleObject) {
-			boolean rval = dop.fn.compare ( so1.getLongValue(), so2.getDoubleValue() );
-			sores = (ScalarObject) new BooleanObject(rval); 
-		}
-		else if ( so1 instanceof DoubleObject && so2 instanceof IntObject ) {
-			boolean rval = dop.fn.compare ( so1.getDoubleValue(), so2.getLongValue() );
-			sores = (ScalarObject) new BooleanObject(rval); 
-		}
 		else if ( so1 instanceof BooleanObject && so2 instanceof BooleanObject ) {
 			boolean rval = dop.fn.compare ( so1.getBooleanValue(), so2.getBooleanValue() );
 			sores = (ScalarObject) new BooleanObject(rval); 
@@ -71,7 +64,16 @@ public class ScalarScalarRelationalCPInstruction extends RelationalBinaryCPInstr
 			boolean rval = dop.fn.compare ( so1.getStringValue(), so2.getStringValue() );
 			sores = (ScalarObject) new BooleanObject(rval); 
 		}
-		else throw new DMLRuntimeException("compare(): Invalid combination of value types.");
+		else if ( so1 instanceof IntObject && so2 instanceof DoubleObject) {
+			boolean rval = dop.fn.compare ( so1.getLongValue(), so2.getDoubleValue() );
+			sores = (ScalarObject) new BooleanObject(rval); 
+		}
+		else if ( so1 instanceof DoubleObject && so2 instanceof IntObject ) {
+			boolean rval = dop.fn.compare ( so1.getDoubleValue(), so2.getLongValue() );
+			sores = (ScalarObject) new BooleanObject(rval); 
+		}
+		else throw new DMLRuntimeException("compare(): Invalid combination of value types "
+				+ "(" + so1.getValueType() + ", " + so2.getValueType() + ").");
 		
 		ec.setScalarOutput(output.getName(), sores);
 	}

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/d7fd5879/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUContext.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUContext.java b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUContext.java
index 8ca00b2..86a891e 100644
--- a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUContext.java
+++ b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUContext.java
@@ -30,6 +30,8 @@ import org.apache.sysml.runtime.DMLRuntimeException;
 import org.apache.sysml.runtime.controlprogram.caching.MatrixObject;
 
 //FIXME merge JCudaContext into GPUContext as this context is anyway CUDA specific
+
+@SuppressWarnings("rawtypes")
 public abstract class GPUContext {
 
 	public static ArrayList<GPUObject> allocatedPointers = new ArrayList<GPUObject>();

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/d7fd5879/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUObject.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUObject.java b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUObject.java
index 9708fe8..215b38c 100644
--- a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUObject.java
+++ b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUObject.java
@@ -106,6 +106,7 @@ public abstract class GPUObject
 	 * @param GPUSize Desired size to be freed up on the GPU
 	 * @throws DMLRuntimeException If no blocks to free up or if not enough blocks with zero locks on them.	 
 	 */
+	@SuppressWarnings("rawtypes")
 	protected static void evict(final long GPUSize) throws DMLRuntimeException {
 		synchronized (GPUContext.syncObj) {
 			// Check for the completion of asynchronous cudaFree calls

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/d7fd5879/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/JCudaObject.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/JCudaObject.java b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/JCudaObject.java
index 24063b5..b9c9161 100644
--- a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/JCudaObject.java
+++ b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/JCudaObject.java
@@ -1004,6 +1004,7 @@ public class JCudaObject extends GPUObject {
 	 * @param lda		rows in input matrix
 	 * @param ldc		columns in output matrix
 	 * @return			transposed matrix
+	 * @throws DMLRuntimeException if operation failed
 	 */
 	public static Pointer transpose(Pointer densePtr, int m, int n, int lda, int ldc) throws DMLRuntimeException {
 		Pointer alpha = LibMatrixCUDA.pointerTo(1.0);
@@ -1146,6 +1147,7 @@ public class JCudaObject extends GPUObject {
 	 * @param toFree {@link Pointer} instance to be freed
 	 * @param synchronous true if to be done synchronously
 	 */
+	@SuppressWarnings("rawtypes")
 	public static void cudaFreeHelper(final Pointer toFree, boolean synchronous) {
 		if (synchronous) {
 			cudaFree(toFree);

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/d7fd5879/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixMult.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixMult.java b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixMult.java
index 923e618..65f3be1 100644
--- a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixMult.java
+++ b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixMult.java
@@ -2905,7 +2905,8 @@ public class LibMatrixMult
 		return val; 
 	}
 
-	private static double dotProduct( double[] a, double[] b, int ai, int bi, final int len )
+	//note: public for use by codegen for consistency
+	public static double dotProduct( double[] a, double[] b, int ai, int bi, final int len )
 	{
 		double val = 0;
 		final int bn = len%8;
@@ -2933,7 +2934,8 @@ public class LibMatrixMult
 		return val; 
 	}
 	
-	private static double dotProduct( double[] a, double[] b, int[] aix, int ai, final int bi, final int len )
+	//note: public for use by codegen for consistency
+	public static double dotProduct( double[] a, double[] b, int[] aix, int ai, final int bi, final int len )
 	{
 		double val = 0;
 		final int bn = len%8;
@@ -2962,7 +2964,8 @@ public class LibMatrixMult
 		return val; 
 	}
 
-	private static void vectMultiplyAdd( final double aval, double[] b, double[] c, int bi, int ci, final int len )
+	//note: public for use by codegen for consistency
+	public static void vectMultiplyAdd( final double aval, double[] b, double[] c, int bi, int ci, final int len )
 	{
 		final int bn = len%8;
 		
@@ -3089,7 +3092,8 @@ public class LibMatrixMult
 		}
 	}
 
-	private static void vectMultiplyAdd( final double aval, double[] b, double[] c, int[] bix, final int bi, final int ci, final int len )
+	//note: public for use by codegen for consistency
+	public static void vectMultiplyAdd( final double aval, double[] b, double[] c, int[] bix, final int bi, final int ci, final int len )
 	{
 		final int bn = len%8;
 		
@@ -3115,7 +3119,8 @@ public class LibMatrixMult
 		}
 	}
 
-	private static void vectMultiplyWrite( final double aval, double[] b, double[] c, int bi, int ci, final int len )
+	//note: public for use by codegen for consistency
+	public static void vectMultiplyWrite( final double aval, double[] b, double[] c, int bi, int ci, final int len )
 	{
 		final int bn = len%8;
 		
@@ -3191,7 +3196,8 @@ public class LibMatrixMult
 		}
 	}
 
-	private static void vectAdd( double[] a, double[] c, int ai, int ci, final int len )
+	//note: public for use by codegen for consistency
+	public static void vectAdd( double[] a, double[] c, int ai, int ci, final int len )
 	{
 		final int bn = len%8;
 		

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/d7fd5879/src/main/java/org/apache/sysml/runtime/util/LocalFileUtils.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/util/LocalFileUtils.java b/src/main/java/org/apache/sysml/runtime/util/LocalFileUtils.java
index 0086f8f..ec145cb 100644
--- a/src/main/java/org/apache/sysml/runtime/util/LocalFileUtils.java
+++ b/src/main/java/org/apache/sysml/runtime/util/LocalFileUtils.java
@@ -25,8 +25,10 @@ import java.io.DataInputStream;
 import java.io.File;
 import java.io.FileInputStream;
 import java.io.FileOutputStream;
+import java.io.FileWriter;
 import java.io.IOException;
 import java.io.InputStream;
+import java.io.Writer;
 import java.nio.ByteBuffer;
 import java.nio.channels.FileChannel;
 import java.nio.file.Path;
@@ -63,6 +65,7 @@ public class LocalFileUtils
 	public static final String CATEGORY_PARTITIONING = "partitioning";
 	public static final String CATEGORY_RESULTMERGE  = "resultmerge";
 	public static final String CATEGORY_WORK         = "work";
+	public static final String CATEGORY_CODEGEN      = "codegen";
 	
 	static {
 		_seq = new IDSequence();
@@ -463,4 +466,25 @@ public class LocalFileUtils
 		
 		return ret;
 	}
+	
+	/**
+	 * Writes a simple text file to local file system.
+	 * 
+	 * @param file output file
+	 * @param text content of text file 
+	 * @throws IOException if the file cannot be opened or written
+	 */
+	public static void writeTextFile( File file, String text ) 
+		throws IOException 
+	{
+		Writer writer = null;
+		try {
+			writer = new FileWriter(file);
+			writer.write(text);
+			writer.flush();
+		}
+		finally {
+			IOUtilFunctions.closeSilently(writer);
+		}
+	}
 }

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/d7fd5879/src/main/java/org/apache/sysml/utils/Statistics.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/utils/Statistics.java b/src/main/java/org/apache/sysml/utils/Statistics.java
index 87cb64f..08f0452 100644
--- a/src/main/java/org/apache/sysml/utils/Statistics.java
+++ b/src/main/java/org/apache/sysml/utils/Statistics.java
@@ -31,6 +31,8 @@ import java.util.Set;
 import java.util.concurrent.atomic.AtomicLong;
 
 import org.apache.sysml.api.DMLScript;
+import org.apache.sysml.conf.ConfigurationManager;
+import org.apache.sysml.conf.DMLConfig;
 import org.apache.sysml.hops.OptimizerUtils;
 import org.apache.sysml.runtime.controlprogram.caching.CacheStatistics;
 import org.apache.sysml.runtime.controlprogram.context.SparkExecutionContext;
@@ -72,6 +74,15 @@ public class Statistics
 	private static AtomicLong hopRecompilePred = new AtomicLong(0); //count
 	private static AtomicLong hopRecompileSB = new AtomicLong(0);   //count
 
+	//CODEGEN
+	private static AtomicLong codegenCompileTime = new AtomicLong(0); //in nano
+	private static AtomicLong codegenClassCompileTime = new AtomicLong(0); //in nano
+	private static AtomicLong codegenHopCompile = new AtomicLong(0); //count
+	private static AtomicLong codegenCPlanCompile = new AtomicLong(0); //count
+	private static AtomicLong codegenClassCompile = new AtomicLong(0); //count
+	private static AtomicLong codegenPlanCacheHits = new AtomicLong(0); //count
+	private static AtomicLong codegenPlanCacheTotal = new AtomicLong(0); //count
+	
 	//Function recompile stats 
 	private static AtomicLong funRecompileTime = new AtomicLong(0); //in nano sec
 	private static AtomicLong funRecompiles = new AtomicLong(0); //count
@@ -277,6 +288,62 @@ public class Statistics
 		//note: not synchronized due to use of atomics
 		hopRecompileSB.addAndGet(delta);
 	}
+	
+	public static void incrementCodegenDAGCompile() {
+		codegenHopCompile.incrementAndGet();
+	}
+	
+	public static void incrementCodegenCPlanCompile(long delta) {
+		codegenCPlanCompile.addAndGet(delta);
+	}
+	
+	public static void incrementCodegenClassCompile() {
+		codegenClassCompile.incrementAndGet();
+	}
+	
+	public static void incrementCodegenCompileTime(long delta) {
+		codegenCompileTime.addAndGet(delta);
+	}
+	
+	public static void incrementCodegenClassCompileTime(long delta) {
+		codegenClassCompileTime.addAndGet(delta);
+	}
+	
+	public static void incrementCodegenPlanCacheHits() {
+		codegenPlanCacheHits.incrementAndGet();
+	}
+	
+	public static void incrementCodegenPlanCacheTotal() {
+		codegenPlanCacheTotal.incrementAndGet();
+	}
+	
+	public static long getCodegenDAGCompile() {
+		return codegenHopCompile.get();
+	}
+	
+	public static long getCodegenCPlanCompile() {
+		return codegenCPlanCompile.get();
+	}
+	
+	public static long getCodegenClassCompile() {
+		return codegenClassCompile.get();
+	}
+	
+	public static long getCodegenCompileTime() {
+		return codegenCompileTime.get();
+	}
+	
+	public static long getCodegenClassCompileTime() {
+		return codegenClassCompileTime.get();
+	}
+	
+	public static long getCodegenPlanCacheHits() {
+		return codegenPlanCacheHits.get();
+	}
+	
+	public static long getCodegenPlanCacheTotal() {
+		return codegenPlanCacheTotal.get();
+	}
 
 	public static void incrementFunRecompileTime( long delta ) {
 		//note: not synchronized due to use of atomics
@@ -657,6 +724,12 @@ public class Statistics
 				sb.append("Functions recompiled:\t\t" + getFunRecompiles() + ".\n");
 				sb.append("Functions recompile time:\t" + String.format("%.3f", ((double)getFunRecompileTime())/1000000000) + " sec.\n");	
 			}
+			if( ConfigurationManager.getDMLConfig().getBooleanValue(DMLConfig.CODEGEN) ) {
+				sb.append("Codegen compile (DAG, CP, JC):\t" + getCodegenDAGCompile() + "/" + getCodegenCPlanCompile() + "/" + getCodegenClassCompile() + ".\n");
+				sb.append("Codegen compile times (DAG,JC):\t" + String.format("%.3f", (double)getCodegenCompileTime()/1000000000) + "/" + 
+						String.format("%.3f", (double)getCodegenClassCompileTime()/1000000000)  + " sec.\n");
+				sb.append("Codegen plan cache hits:\t" + getCodegenPlanCacheHits() + "/" + getCodegenPlanCacheTotal() + ".\n");
+			}
 			if( OptimizerUtils.isSparkExecutionMode() ){
 				String lazy = SparkExecutionContext.isLazySparkContextCreation() ? "(lazy)" : "(eager)";
 				sb.append("Spark ctx create time "+lazy+":\t"+

[8/9] incubator-systemml git commit: [SYSTEMML-1286] Code generator compiler integration, incl tests

Posted by mb...@apache.org.

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/bbefe96b/src/test/java/org/apache/sysml/test/integration/functions/codegen/OuterProdTmplTest.java
----------------------------------------------------------------------
diff --git a/src/test/java/org/apache/sysml/test/integration/functions/codegen/OuterProdTmplTest.java b/src/test/java/org/apache/sysml/test/integration/functions/codegen/OuterProdTmplTest.java
new file mode 100644
index 0000000..ad3575e
--- /dev/null
+++ b/src/test/java/org/apache/sysml/test/integration/functions/codegen/OuterProdTmplTest.java
@@ -0,0 +1,259 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ * 
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.sysml.test.integration.functions.codegen;
+
+import java.util.HashMap;
+
+import org.junit.Assert;
+import org.junit.Test;
+import org.apache.sysml.api.DMLScript;
+import org.apache.sysml.api.DMLScript.RUNTIME_PLATFORM;
+import org.apache.sysml.hops.OptimizerUtils;
+import org.apache.sysml.lops.LopProperties.ExecType;
+import org.apache.sysml.runtime.matrix.data.MatrixValue.CellIndex;
+import org.apache.sysml.test.integration.AutomatedTestBase;
+import org.apache.sysml.test.integration.TestConfiguration;
+import org.apache.sysml.test.utils.TestUtils;
+
+public class OuterProdTmplTest extends AutomatedTestBase 
+{	
+	private static final String TEST_NAME1 = "wdivmm";
+	private static final String TEST_NAME2 = "wdivmmRight";
+	private static final String TEST_NAME3 = "wsigmoid";
+	private static final String TEST_NAME4 = "wcemm";
+	private static final String TEST_NAME5 = "wdivmmRightNotranspose";
+	private static final String TEST_NAME6 = "wdivmmbasic";
+	private static final String TEST_NAME7 = "wdivmmTransposeOut";
+
+	private static final String TEST_DIR = "functions/codegen/";
+	private static final String TEST_CLASS_DIR = TEST_DIR + OuterProdTmplTest.class.getSimpleName() + "/";
+	private final static String TEST_CONF = "SystemML-config-codegen.xml";
+	
+	private static final double eps = Math.pow(10, -8);
+	
+	@Override
+	public void setUp() {
+		TestUtils.clearAssertionInformation();
+		addTestConfiguration( TEST_NAME1, new TestConfiguration(TEST_CLASS_DIR, TEST_NAME1, new String[] { "1" }) );
+		addTestConfiguration( TEST_NAME2, new TestConfiguration(TEST_CLASS_DIR, TEST_NAME2, new String[] { "2" }) );
+		addTestConfiguration( TEST_NAME3, new TestConfiguration(TEST_CLASS_DIR, TEST_NAME3, new String[] { "3" }) );
+		addTestConfiguration( TEST_NAME4, new TestConfiguration(TEST_CLASS_DIR, TEST_NAME4, new String[] { "4" }) );
+		addTestConfiguration( TEST_NAME5, new TestConfiguration(TEST_CLASS_DIR, TEST_NAME5, new String[] { "5" }) );
+		addTestConfiguration( TEST_NAME6, new TestConfiguration(TEST_CLASS_DIR, TEST_NAME6, new String[] { "6" }) );
+		addTestConfiguration( TEST_NAME7, new TestConfiguration(TEST_CLASS_DIR, TEST_NAME7, new String[] { "7" }) );
+	}
+		
+	@Test
+	public void testCodegenOuterProdRewrite1() {
+		testCodegenIntegrationWithInput( TEST_NAME1, true, ExecType.CP  );
+	}
+	
+	@Test
+	public void testCodegenOuterProdRewrite2()  {
+		testCodegenIntegration( TEST_NAME2, true, ExecType.CP  );
+	}
+	
+	@Test
+	public void testCodegenOuterProdRewrite3() {
+		testCodegenIntegration( TEST_NAME3, true, ExecType.CP  );
+	}
+	
+	@Test
+	public void testCodegenOuterProdRewrite4() {
+		testCodegenIntegrationWithInput( TEST_NAME4, true, ExecType.CP  );
+	}
+
+	@Test
+	public void testCodegenOuterProdRewrite5() {
+		testCodegenIntegration( TEST_NAME5, true, ExecType.CP  );
+	}
+	
+	@Test
+	public void testCodegenOuterProdRewrite6() {
+		testCodegenIntegration( TEST_NAME6, true, ExecType.CP  );
+	}
+	
+	@Test
+	public void testCodegenOuterProdRewrite7() {
+		testCodegenIntegration( TEST_NAME7, true, ExecType.CP );
+	}
+
+	@Test
+	public void testCodegenOuterProd1() {
+		testCodegenIntegrationWithInput( TEST_NAME1, false, ExecType.CP  );
+	}
+	
+	@Test
+	public void testCodegenOuterProd2()  {
+		testCodegenIntegration( TEST_NAME2, false, ExecType.CP  );
+	}
+	
+	@Test
+	public void testCodegenOuterProd3() {
+		testCodegenIntegration( TEST_NAME3, false, ExecType.CP  );
+	}
+	
+	@Test
+	public void testCodegenOuterProd4() {
+		testCodegenIntegrationWithInput( TEST_NAME4, false, ExecType.CP  );
+	}
+
+	@Test
+	public void testCodegenOuterProd5() {
+		testCodegenIntegration( TEST_NAME5, false, ExecType.CP  );
+	}
+	
+	@Test
+	public void testCodegenOuterProd6() {
+		testCodegenIntegration( TEST_NAME6, false, ExecType.CP  );
+	}
+	
+	@Test
+	public void testCodegenOuterProd7() {
+		testCodegenIntegration( TEST_NAME7, false, ExecType.CP );
+	}
+	
+	//TODO
+	
+	@Test
+	public void testCodegenOuterProdRewrite1_sp() {
+		testCodegenIntegrationWithInput( TEST_NAME1, true, ExecType.SPARK  );
+	}
+	
+	@Test
+	public void testCodegenOuterProdRewrite2_sp() {
+		testCodegenIntegration( TEST_NAME2, true, ExecType.SPARK  );
+	}
+	
+	@Test
+	public void testCodegenOuterProdRewrite3_sp() {
+		testCodegenIntegration( TEST_NAME3, true, ExecType.SPARK  );
+	}
+	
+	@Test
+	public void testCodegenOuterProdRewrite4_sp() {
+		testCodegenIntegrationWithInput( TEST_NAME4, true, ExecType.SPARK  );
+	}
+
+	
+	private void testCodegenIntegration( String testname, boolean rewrites, ExecType instType  )
+	{	
+		
+		boolean oldFlag = OptimizerUtils.ALLOW_ALGEBRAIC_SIMPLIFICATION;
+		switch( instType ){
+		case MR: rtplatform = RUNTIME_PLATFORM.HADOOP; break;
+		case SPARK: 
+			rtplatform = RUNTIME_PLATFORM.SPARK;
+			DMLScript.USE_LOCAL_SPARK_CONFIG = true; 
+			break;
+		default: rtplatform = RUNTIME_PLATFORM.HYBRID; break;
+	
+		}
+		
+		try
+		{
+			TestConfiguration config = getTestConfiguration(testname);
+			loadTestConfiguration(config);
+			
+			String HOME = SCRIPT_DIR + TEST_DIR;
+			fullDMLScriptName = HOME + testname + ".dml";
+			programArgs = new String[]{"-explain", "-stats", 
+					"-config=" + HOME + TEST_CONF, "-args", output("S")};
+			
+			fullRScriptName = HOME + testname + ".R";
+			rCmd = getRCmd(inputDir(), expectedDir());			
+
+			OptimizerUtils.ALLOW_ALGEBRAIC_SIMPLIFICATION = rewrites;
+
+			runTest(true, false, null, -1); 
+			runRScript(true); 
+			
+			//compare matrices 
+			HashMap<CellIndex, Double> dmlfile = readDMLMatrixFromHDFS("S");
+			HashMap<CellIndex, Double> rfile  = readRMatrixFromFS("S");
+			TestUtils.compareMatrices(dmlfile, rfile, eps, "Stat-DML", "Stat-R");
+			if( !rewrites )
+				Assert.assertTrue(heavyHittersContainsSubString("spoof") || heavyHittersContainsSubString("sp_spoof"));
+		}
+		finally {
+			OptimizerUtils.ALLOW_ALGEBRAIC_SIMPLIFICATION = oldFlag;
+			OptimizerUtils.ALLOW_AUTO_VECTORIZATION = true;
+			OptimizerUtils.ALLOW_OPERATOR_FUSION = true;
+		}
+		
+	}	
+
+	private void testCodegenIntegrationWithInput( String testname, boolean rewrites, ExecType instType )
+	{		
+		boolean oldFlag = OptimizerUtils.ALLOW_ALGEBRAIC_SIMPLIFICATION;
+		switch( instType ){
+			case MR: rtplatform = RUNTIME_PLATFORM.HADOOP; break;
+			case SPARK: 
+				rtplatform = RUNTIME_PLATFORM.SPARK;
+				DMLScript.USE_LOCAL_SPARK_CONFIG = true; 
+				break;
+			default: rtplatform = RUNTIME_PLATFORM.HYBRID; break;
+		}
+		
+		try
+		{
+			TestConfiguration config = getTestConfiguration(testname);
+			loadTestConfiguration(config);
+			
+			//generate actual dataset 
+			double[][] A = getRandomMatrix(2000, 2000, -0.05, 1, 0.1, 6); 
+			writeInputMatrixWithMTD("A", A, true);
+			
+			String HOME = SCRIPT_DIR + TEST_DIR;
+			fullDMLScriptName = HOME + testname + ".dml";
+			programArgs = new String[]{"-explain", "-stats", 
+				"-config=" + HOME + TEST_CONF, "-args", output("S"), input("A")};
+			
+			fullRScriptName = HOME + testname + ".R";
+			rCmd = getRCmd(inputDir(), expectedDir());			
+
+			OptimizerUtils.ALLOW_ALGEBRAIC_SIMPLIFICATION = rewrites;
+
+			runTest(true, false, null, -1); 
+			runRScript(true); 
+			if(testname.equals(TEST_NAME4)) //wcemm
+			{
+				//compare scalars 
+				HashMap<CellIndex, Double> dmlfile = readDMLScalarFromHDFS("S");
+				HashMap<CellIndex, Double> rfile  = readRScalarFromFS("S");
+				TestUtils.compareScalars((Double) dmlfile.values().toArray()[0], (Double) rfile.values().toArray()[0],0.0001);
+			}
+			else
+			{
+				//compare matrices 
+				HashMap<CellIndex, Double> dmlfile = readDMLMatrixFromHDFS("S");
+				HashMap<CellIndex, Double> rfile  = readRMatrixFromFS("S");
+				TestUtils.compareMatrices(dmlfile, rfile, eps, "Stat-DML", "Stat-R");
+				if( !rewrites )
+					Assert.assertTrue(heavyHittersContainsSubString("spoof") || heavyHittersContainsSubString("sp_spoof"));
+			}
+		}
+		finally {
+			OptimizerUtils.ALLOW_ALGEBRAIC_SIMPLIFICATION = oldFlag;
+			OptimizerUtils.ALLOW_AUTO_VECTORIZATION = true;
+			OptimizerUtils.ALLOW_OPERATOR_FUSION = true;
+		}
+	}	
+}

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/bbefe96b/src/test/java/org/apache/sysml/test/integration/functions/codegen/RowAggTmplTest.java
----------------------------------------------------------------------
diff --git a/src/test/java/org/apache/sysml/test/integration/functions/codegen/RowAggTmplTest.java b/src/test/java/org/apache/sysml/test/integration/functions/codegen/RowAggTmplTest.java
new file mode 100644
index 0000000..a62847c
--- /dev/null
+++ b/src/test/java/org/apache/sysml/test/integration/functions/codegen/RowAggTmplTest.java
@@ -0,0 +1,142 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ * 
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.sysml.test.integration.functions.codegen;
+
+import java.util.HashMap;
+
+import org.junit.Assert;
+import org.junit.Test;
+import org.apache.sysml.api.DMLScript;
+import org.apache.sysml.api.DMLScript.RUNTIME_PLATFORM;
+import org.apache.sysml.hops.OptimizerUtils;
+import org.apache.sysml.lops.LopProperties.ExecType;
+import org.apache.sysml.runtime.matrix.data.MatrixValue.CellIndex;
+import org.apache.sysml.test.integration.AutomatedTestBase;
+import org.apache.sysml.test.integration.TestConfiguration;
+import org.apache.sysml.test.utils.TestUtils;
+
+/**
+ * Integration tests for the row-aggregate code generation template. Every
+ * pattern script is executed with and without algebraic simplification
+ * rewrites; the DML result is compared against the result of the matching
+ * R script, and the run must report a fused (spoof) operator among the
+ * heavy hitters.
+ */
+public class RowAggTmplTest extends AutomatedTestBase 
+{
+	private static final String TEST_NAME1 = "rowAggPattern1";
+	private static final String TEST_NAME2 = "rowAggPattern2";
+	private static final String TEST_NAME3 = "rowAggPattern3";
+	private static final String TEST_NAME4 = "rowAggPattern4";
+
+	private static final String TEST_DIR = "functions/codegen/";
+	private static final String TEST_CLASS_DIR = TEST_DIR + RowAggTmplTest.class.getSimpleName() + "/";
+	private final static String TEST_CONF = "SystemML-config-codegen.xml";
+	
+	//tolerance used when comparing the DML and R result matrices
+	private static final double eps = Math.pow(10, -10);
+	
+	@Override
+	public void setUp() {
+		TestUtils.clearAssertionInformation();
+		addTestConfiguration( TEST_NAME1, new TestConfiguration(TEST_CLASS_DIR, TEST_NAME1, new String[] { "0" }) );
+		addTestConfiguration( TEST_NAME2, new TestConfiguration(TEST_CLASS_DIR, TEST_NAME2, new String[] { "1" }) );
+		addTestConfiguration( TEST_NAME3, new TestConfiguration(TEST_CLASS_DIR, TEST_NAME3, new String[] { "2" }) );
+		addTestConfiguration( TEST_NAME4, new TestConfiguration(TEST_CLASS_DIR, TEST_NAME4, new String[] { "3" }) );
+	}
+	
+	@Test	
+	public void testCodegenRowAggRewrite1() {
+		testCodegenIntegration( TEST_NAME1, true, ExecType.CP );
+	}
+	
+	@Test
+	public void testCodegenRowAggRewrite2() {
+		testCodegenIntegration( TEST_NAME2, true, ExecType.CP );
+	}
+	
+	@Test
+	public void testCodegenRowAggRewrite3() {
+		testCodegenIntegration( TEST_NAME3, true, ExecType.CP );
+	}
+	
+	@Test
+	public void testCodegenRowAggRewrite4() {
+		testCodegenIntegration( TEST_NAME4, true, ExecType.CP );	
+	}
+	
+	@Test	
+	public void testCodegenRowAgg1() {
+		testCodegenIntegration( TEST_NAME1, false, ExecType.CP );
+	}
+	
+	@Test
+	public void testCodegenRowAgg2() {
+		testCodegenIntegration( TEST_NAME2, false, ExecType.CP );
+	}
+	
+	@Test
+	public void testCodegenRowAgg3() {
+		testCodegenIntegration( TEST_NAME3, false, ExecType.CP );
+	}
+	
+	@Test
+	public void testCodegenRowAgg4() {
+		testCodegenIntegration( TEST_NAME4, false, ExecType.CP );	
+	}
+	
+	
+	/**
+	 * Runs one pattern script in DML and in R and compares the results.
+	 * 
+	 * @param testname name of the pattern script (without file extension)
+	 * @param rewrites value assigned to OptimizerUtils.ALLOW_ALGEBRAIC_SIMPLIFICATION for the run
+	 * @param instType execution type used to select the runtime platform
+	 */
+	private void testCodegenIntegration( String testname, boolean rewrites, ExecType instType )
+	{	
+		//remember all global settings modified below so they can be restored
+		boolean oldFlag = OptimizerUtils.ALLOW_ALGEBRAIC_SIMPLIFICATION;
+		boolean oldSparkConfig = DMLScript.USE_LOCAL_SPARK_CONFIG;
+		RUNTIME_PLATFORM oldPlatform = rtplatform;
+		switch( instType ){
+			case MR: rtplatform = RUNTIME_PLATFORM.HADOOP; break;
+			case SPARK: 
+				rtplatform = RUNTIME_PLATFORM.SPARK;
+				DMLScript.USE_LOCAL_SPARK_CONFIG = true; 
+				break;
+			default: rtplatform = RUNTIME_PLATFORM.HYBRID; break;
+		}
+		
+		try
+		{
+			TestConfiguration config = getTestConfiguration(testname);
+			loadTestConfiguration(config);
+			
+			String HOME = SCRIPT_DIR + TEST_DIR;
+			fullDMLScriptName = HOME + testname + ".dml";
+			programArgs = new String[]{"-explain", "runtime", "-stats", 
+					"-config=" + HOME + TEST_CONF, "-args", output("S") };
+			
+			fullRScriptName = HOME + testname + ".R";
+			rCmd = getRCmd(inputDir(), expectedDir());			
+
+			OptimizerUtils.ALLOW_ALGEBRAIC_SIMPLIFICATION = rewrites;
+
+			runTest(true, false, null, -1); 
+			runRScript(true); 
+			
+			//compare matrices 
+			HashMap<CellIndex, Double> dmlfile = readDMLMatrixFromHDFS("S");
+			HashMap<CellIndex, Double> rfile  = readRMatrixFromFS("S");
+			TestUtils.compareMatrices(dmlfile, rfile, eps, "Stat-DML", "Stat-R");
+			//a fused operator must have been executed (single-node or spark variant)
+			Assert.assertTrue(heavyHittersContainsSubString("spoof") || heavyHittersContainsSubString("sp_spoof"));
+		}
+		finally {
+			OptimizerUtils.ALLOW_ALGEBRAIC_SIMPLIFICATION = oldFlag;
+			OptimizerUtils.ALLOW_AUTO_VECTORIZATION = true;
+			OptimizerUtils.ALLOW_OPERATOR_FUSION = true;
+			//undo the spark-specific switch so later tests are not affected
+			DMLScript.USE_LOCAL_SPARK_CONFIG = oldSparkConfig;
+			rtplatform = oldPlatform;
+		}
+	}	
+}

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/bbefe96b/src/test/scripts/functions/codegen/Algorithm_GLM.R
----------------------------------------------------------------------
diff --git a/src/test/scripts/functions/codegen/Algorithm_GLM.R b/src/test/scripts/functions/codegen/Algorithm_GLM.R
new file mode 100644
index 0000000..a1fd302
--- /dev/null
+++ b/src/test/scripts/functions/codegen/Algorithm_GLM.R
@@ -0,0 +1,1081 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+# 
+#   http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+args <- commandArgs(TRUE)
+library("Matrix")
+
+
+
+
+# Checks whether the combination of response shape, distribution family and
+# link family is handled by this script, and prints the name of the detected
+# model (e.g. "Poisson.log") or a diagnostic message.
+#
+#   ncol_y     number of columns of the response matrix
+#   dist_type  distribution family code (1 = power distributions, 2 = binomial/Bernoulli)
+#   var_power  variance power, consulted only for dist_type 1
+#   link_type  link family code (1 = power, 2 = logit, 3 = probit, 4 = cloglog, 5 = cauchit)
+#   link_power link power, consulted only for link_type 1
+#
+# Returns 1 if the combination is supported and 0 otherwise.
+check_if_supported <- 
+    function (ncol_y, dist_type, var_power, link_type, link_power)
+{
+    is_supported = 0;
+    if (ncol_y == 1 & dist_type == 1 & link_type == 1)
+    { # POWER DISTRIBUTION
+        is_supported = 1;
+        if (var_power == 0.0 & link_power == -1.0) {print ("Gaussian.inverse");      } else {
+        if (var_power == 0.0 & link_power ==  0.0) {print ("Gaussian.log");          } else {
+        if (var_power == 0.0 & link_power ==  0.5) {print ("Gaussian.sqrt");         } else {
+        if (var_power == 0.0 & link_power ==  1.0) {print ("Gaussian.id");           } else {
+        if (var_power == 0.0                     ) {print ("Gaussian.power_nonlog"); } else {
+        if (var_power == 1.0 & link_power == -1.0) {print ("Poisson.inverse");       } else {
+        if (var_power == 1.0 & link_power ==  0.0) {print ("Poisson.log");           } else {
+        if (var_power == 1.0 & link_power ==  0.5) {print ("Poisson.sqrt");          } else {
+        if (var_power == 1.0 & link_power ==  1.0) {print ("Poisson.id");            } else {
+        if (var_power == 1.0                     ) {print ("Poisson.power_nonlog");  } else {
+        if (var_power == 2.0 & link_power == -1.0) {print ("Gamma.inverse");         } else {
+        if (var_power == 2.0 & link_power ==  0.0) {print ("Gamma.log");             } else {
+        if (var_power == 2.0 & link_power ==  0.5) {print ("Gamma.sqrt");            } else {
+        if (var_power == 2.0 & link_power ==  1.0) {print ("Gamma.id");              } else {
+        if (var_power == 2.0                     ) {print ("Gamma.power_nonlog");    } else {
+        if (var_power == 3.0 & link_power == -2.0) {print ("InvGaussian.1/mu^2");    } else {
+        if (var_power == 3.0 & link_power == -1.0) {print ("InvGaussian.inverse");   } else {
+        if (var_power == 3.0 & link_power ==  0.0) {print ("InvGaussian.log");       } else {
+        if (var_power == 3.0 & link_power ==  0.5) {print ("InvGaussian.sqrt");      } else {
+        if (var_power == 3.0 & link_power ==  1.0) {print ("InvGaussian.id");        } else {
+        if (var_power == 3.0                     ) {print ("InvGaussian.power_nonlog");}else{
+        if (                   link_power ==  0.0) {print ("PowerDist.log");         } else {
+                                                    print ("PowerDist.power_nonlog");
+    }   }}}}} }}}}} }}}}} }}}}} }}
+    if (ncol_y == 1 & dist_type == 2)
+    {
+        print ("Error: Bernoulli response matrix has not been converted into two-column format.");
+    }
+    if (ncol_y == 2 & dist_type == 2 & link_type >= 1 & link_type <= 5)
+    { # BINOMIAL/BERNOULLI DISTRIBUTION
+        is_supported = 1;
+        if (link_type == 1 & link_power == -1.0) {print ("Binomial.inverse");        } else {
+        if (link_type == 1 & link_power ==  0.0) {print ("Binomial.log");            } else {
+        if (link_type == 1 & link_power ==  0.5) {print ("Binomial.sqrt");           } else {
+        if (link_type == 1 & link_power ==  1.0) {print ("Binomial.id");             } else {
+        if (link_type == 1)                      {print ("Binomial.power_nonlog");   } else {
+        if (link_type == 2)                      {print ("Binomial.logit");          } else {
+        if (link_type == 3)                      {print ("Binomial.probit");         } else {
+        if (link_type == 4)                      {print ("Binomial.cloglog");        } else {
+        if (link_type == 5)                      {print ("Binomial.cauchit");        }
+    }   }}}}} }}}
+    if (is_supported == 0) {
+        # R has no "+" for character strings, so the message is assembled with paste0
+        print (paste0 ("Response matrix with ", ncol_y, " columns, distribution family (", dist_type, ", ", var_power,
+               ") and link family (", link_type, ", ", link_power, ") are NOT supported together."));
+    }
+    
+    return (is_supported)
+}
+
+# Computes the starting point of the GLM fit.
+#
+# The responses are mapped through the link function to obtain "saturated"
+# linear terms, whose log-likelihood is evaluated by glm_log_likelihood_part.
+# In addition an initial coefficient vector beta (ncol(X) x 1) is built: all
+# zeros, unless the link requires a non-zero target value "desired_eta", in
+# which case either the last coefficient is set to it (icept_status 1 or 2)
+# or beta is obtained from straightenX and scaled by desired_eta.
+#
+#   X            feature matrix
+#   Y            response matrix (one column for dist_type 1, two for dist_type 2)
+#   dist_type, var_power, link_type, link_power
+#                distribution / link selection, see check_if_supported
+#   icept_status intercept mode; 1 and 2 select the last-coefficient shortcut
+#   max_iter_CG  iteration limit forwarded to straightenX
+#
+# Returns c(beta, saturated_log_l, isNaN).
+glm_initialize <- function (X, Y, dist_type, var_power, link_type, link_power, icept_status, max_iter_CG)
+{
+    saturated_log_l = 0.0;
+    isNaN = 0;
+    # keep the first response column as an (n x 1) matrix: the likelihood
+    # helpers call nrow() on the linear terms, which is NULL for plain vectors
+    y_first = Y [, 1, drop = FALSE];
+    y_corr = y_first;
+    if (dist_type == 2) {
+        n_corr = rowSums (Y);
+        is_n_zero = (n_corr == 0.0);
+        # success ratio per row; rows without any trials are set to 0.5
+        y_corr = y_first / (n_corr + is_n_zero) + (0.5 - y_first) * is_n_zero;    
+    }
+    linear_terms = y_corr;
+    if (dist_type == 1 & link_type == 1) { # POWER DISTRIBUTION
+        if          (link_power ==  0.0) {
+            if (sum (y_corr < 0.0) == 0) {
+                is_zero_y_corr = (y_corr == 0.0);
+                # log with zero protection: zero responses end up at -Inf via 1/0
+                linear_terms = log (y_corr + is_zero_y_corr) - is_zero_y_corr / (1.0 - is_zero_y_corr);
+            } else { isNaN = 1; }
+        } else { if (link_power ==  1.0) {
+            linear_terms = y_corr;
+        } else { if (link_power == -1.0) {
+            linear_terms = 1.0 / y_corr;
+        } else { if (link_power ==  0.5) {
+            if (sum (y_corr < 0.0) == 0) {
+                linear_terms = sqrt (y_corr);
+            } else { isNaN = 1; }
+        } else { if (link_power >   0.0) {
+            if (sum ((y_corr < 0.0)) == 0) {
+                is_zero_y_corr = (y_corr == 0.0);
+                linear_terms = (y_corr + is_zero_y_corr) ^ link_power - is_zero_y_corr;
+            } else { isNaN = 1; }
+        } else {
+            if (sum ((y_corr <= 0.0)) == 0) {
+                linear_terms = y_corr ^ link_power;
+            } else { isNaN = 1; }
+        }}}}}
+    }
+    if (dist_type == 2 & link_type >= 1 & link_type <= 5)
+    { # BINOMIAL/BERNOULLI DISTRIBUTION
+        if          (link_type == 1 & link_power == 0.0)  { # Binomial.log
+            if (sum ((y_corr < 0.0)) == 0) {
+                is_zero_y_corr = (y_corr == 0.0);
+                linear_terms = log (y_corr + is_zero_y_corr) - is_zero_y_corr / (1.0 - is_zero_y_corr);
+            } else { isNaN = 1; }
+        } else { if (link_type == 1 & link_power >  0.0)  { # Binomial.power_nonlog pos
+            if (sum ((y_corr < 0.0)) == 0) {
+                is_zero_y_corr = (y_corr == 0.0);
+                linear_terms = (y_corr + is_zero_y_corr) ^ link_power - is_zero_y_corr;
+            } else { isNaN = 1; }
+        } else { if (link_type == 1)                      { # Binomial.power_nonlog neg
+            if (sum ((y_corr <= 0.0)) == 0) {
+                linear_terms = y_corr ^ link_power;
+            } else { isNaN = 1; }
+        } else { 
+            # probabilities of exactly 0 or 1 are replaced by 0.5 for the link
+            # evaluation and pushed to -Inf / +Inf afterwards via the 1/0 terms
+            is_zero_y_corr = (y_corr <= 0.0);
+            is_one_y_corr  = (y_corr >= 1.0);
+            y_corr = y_corr * (1.0 - is_zero_y_corr) * (1.0 - is_one_y_corr) + 0.5 * (is_zero_y_corr + is_one_y_corr);
+            if (link_type == 2)                           { # Binomial.logit
+                linear_terms = log (y_corr / (1.0 - y_corr)) 
+                    + is_one_y_corr / (1.0 - is_one_y_corr) - is_zero_y_corr / (1.0 - is_zero_y_corr);
+            } else { if (link_type == 3)                  { # Binomial.probit
+                y_below_half = y_corr + (1.0 - 2.0 * y_corr) * (y_corr > 0.5);
+                t = sqrt (- 2.0 * log (y_below_half));
+                approx_inv_Gauss_CDF = - t + (2.515517 + t * (0.802853 + t * 0.010328)) / (1.0 + t * (1.432788 + t * (0.189269 + t * 0.001308)));
+                linear_terms = approx_inv_Gauss_CDF * (1.0 - 2.0 * (y_corr > 0.5))
+                    + is_one_y_corr / (1.0 - is_one_y_corr) - is_zero_y_corr / (1.0 - is_zero_y_corr);
+            } else { if (link_type == 4)                  { # Binomial.cloglog
+                linear_terms = log (- log (1.0 - y_corr))
+                    - log (- log (0.5)) * (is_zero_y_corr + is_one_y_corr)
+                    + is_one_y_corr / (1.0 - is_one_y_corr) - is_zero_y_corr / (1.0 - is_zero_y_corr);
+            } else { if (link_type == 5)                  { # Binomial.cauchit
+                linear_terms = tan ((y_corr - 0.5) * 3.1415926535897932384626433832795)
+                    + is_one_y_corr / (1.0 - is_one_y_corr) - is_zero_y_corr / (1.0 - is_zero_y_corr);
+        }}  }}}}}
+    }
+    
+    if (isNaN == 0) {
+        tmp1 = glm_log_likelihood_part (linear_terms, Y, dist_type, var_power, link_type, link_power);
+        saturated_log_l = tmp1[1];
+        isNaN = tmp1[2];
+    }
+    
+    # target value of the linear predictor for the initial coefficients
+    if ((dist_type == 1 & link_type == 1 & link_power == 0.0) |
+        (dist_type == 2 & link_type >= 2))
+    {    
+        desired_eta = 0.0;
+    } else { if (link_type == 1 & link_power == 0.0) {
+        desired_eta = log (0.5);
+    } else { if (link_type == 1) {
+        desired_eta = 0.5 ^ link_power;
+    } else {
+        desired_eta = 0.5;
+    }}}
+    
+    beta = matrix (0.0, ncol(X), 1);
+    
+    if (desired_eta != 0.0) {
+        if (icept_status == 1 | icept_status == 2) {
+            beta [nrow(beta), 1] = desired_eta;
+        } else {
+            # We want: avg (X %*% ssX_transform %*% beta) = desired_eta
+            # Note that "ssX_transform" is trivial here, hence ignored
+            
+            beta = straightenX (X, 0.000001, max_iter_CG);  
+            beta = beta * desired_eta;
+}   }   
+
+  return (c(beta, saturated_log_l, isNaN))
+}
+
+
+# Computes, for the given linear terms, the two per-record vectors used by
+# the GLM iteration: g_Y and w (for the logit link the code labels them
+# y_residual and y_variance).
+#
+#   linear_terms (n x 1) matrix of linear predictor values
+#   Y            response matrix (one column for dist_type 1, two for dist_type 2)
+#   dist_type, var_power, link_type, link_power
+#                distribution / link selection, see check_if_supported
+#
+# Returns c(g_Y, w), i.e. both vectors concatenated. For unsupported
+# combinations both stay all-zero.
+glm_dist <- function (linear_terms, Y,
+                    dist_type, var_power, link_type, link_power)
+{
+    num_records = nrow (linear_terms);
+    zeros_r = matrix (0.0, num_records, 1);
+    ones_r = 1 + zeros_r;
+    g_Y  = zeros_r;
+    w  = zeros_r;
+
+    # Some constants
+
+    one_over_sqrt_two_pi = 0.39894228040143267793994605993438;
+    ones_2 = matrix (1.0, 1, 2);
+    p_one_m_one = ones_2;
+    p_one_m_one [1, 2] = -1.0;
+    m_one_p_one = ones_2;
+    m_one_p_one [1, 1] = -1.0;
+    zero_one = ones_2;
+    zero_one [1, 1] = 0.0;
+    one_zero = ones_2;
+    one_zero [1, 2] = 0.0;
+    flip_pos = matrix (0, 2, 2);
+    flip_neg = flip_pos;
+    flip_pos [1, 2] = 1;
+    flip_pos [2, 1] = 1;
+    flip_neg [1, 2] = -1;
+    flip_neg [2, 1] = 1;
+    
+    if (dist_type == 1 & link_type == 1) { # POWER DISTRIBUTION
+        y_mean = zeros_r;
+        if          (link_power ==  0.0) {
+            y_mean = exp (linear_terms);
+            y_mean_pow = y_mean ^ (1 - var_power);
+            w   = y_mean_pow * y_mean;
+            g_Y = y_mean_pow * (Y - y_mean);
+        } else { if (link_power ==  1.0) {
+            y_mean = linear_terms;
+            w   = y_mean ^ (- var_power);
+            g_Y = w * (Y - y_mean);
+        } else {
+            y_mean = linear_terms ^ (1.0 / link_power);
+            c1  = (1 - var_power) / link_power - 1;
+            c2  = (2 - var_power) / link_power - 2;
+            g_Y = (linear_terms ^ c1) * (Y - y_mean) / link_power;
+            w   = (linear_terms ^ c2) / (link_power ^ 2);
+    }   }}
+    if (dist_type == 2 & link_type >= 1 & link_type <= 5)
+    { # BINOMIAL/BERNOULLI DISTRIBUTION
+        if (link_type == 1) { # BINOMIAL.POWER LINKS
+            if (link_power == 0.0)  { # Binomial.log
+                vec1 = 1 / (exp (- linear_terms) - 1);
+                g_Y = Y [, 1] - Y [, 2] * vec1;
+                w   = rowSums (Y) * vec1;
+            } else {                  # Binomial.nonlog
+                vec1 = zeros_r;
+                if (link_power == 0.5)  {
+                    vec1 = 1 / (1 - linear_terms ^ 2);
+                } else { if (sum ((linear_terms < 0.0)) == 0) {
+                    vec1 = linear_terms ^ (- 2 + 1 / link_power) / (1 - linear_terms ^ (1 / link_power));
+                # NOTE(review): this flag is local and not part of the return value
+                } else {isNaN = 1;}}
+                # We want a "zero-protected" version of
+                #     vec2 = Y [, 1] / linear_terms;
+                is_y_0 = (Y [, 1] == 0.0);
+                vec2 = (Y [, 1] + is_y_0) / (linear_terms * (1 - is_y_0) + is_y_0) - is_y_0;
+                g_Y =  (vec2 - Y [, 2] * vec1 * linear_terms) / link_power;
+                w   =  rowSums (Y) * vec1 / link_power ^ 2;
+            }
+        } else {
+            is_LT_pos_infinite = (linear_terms ==  1.0/0.0);
+            is_LT_neg_infinite = (linear_terms == -1.0/0.0);
+            is_LT_infinite = is_LT_pos_infinite %*% one_zero + is_LT_neg_infinite %*% zero_one;
+            # zero out +/-Inf entries; base R replace() takes (x, index, values)
+            finite_linear_terms = replace (linear_terms, is_LT_pos_infinite | is_LT_neg_infinite, 0);
+            if (link_type == 2)                           { # Binomial.logit
+                Y_prob = exp (finite_linear_terms) %*% one_zero + ones_r %*% zero_one;
+                Y_prob = Y_prob / (rowSums (Y_prob) %*% ones_2);
+                # rows with infinite linear terms get probability exactly (1,0) or (0,1)
+                Y_prob = Y_prob * ((1.0 - rowSums (is_LT_infinite)) %*% ones_2) + is_LT_infinite;
+                g_Y = rowSums (Y * (Y_prob %*% flip_neg));           ### = y_residual;
+                w   = rowSums (Y * (Y_prob %*% flip_pos) * Y_prob);  ### = y_variance;
+            } else { if (link_type == 3)                  { # Binomial.probit
+                is_lt_pos = (linear_terms > 0.0);
+                t_gp = 1.0 / (1.0 + abs (finite_linear_terms) * 0.231641888);  # 0.231641888 = 0.3275911 / sqrt (2.0)
+                pt_gp = t_gp * ( 0.254829592 
+                      + t_gp * (-0.284496736 # "Handbook of Mathematical Functions", ed. by M. Abramowitz and I.A. Stegun,
+                      + t_gp * ( 1.421413741 # U.S. Nat-l Bureau of Standards, 10th print (Dec 1972), Sec. 7.1.26, p. 299
+                      + t_gp * (-1.453152027 
+                      + t_gp *   1.061405429))));
+                the_gauss_exp = exp (- (linear_terms ^ 2) / 2.0);
+                vec1 = 0.25 * pt_gp * (2 - the_gauss_exp * pt_gp);
+                vec2 = Y [, 1] - rowSums (Y) * is_lt_pos + the_gauss_exp * pt_gp * rowSums (Y) * (is_lt_pos - 0.5);
+                w   = the_gauss_exp * (one_over_sqrt_two_pi ^ 2) * rowSums (Y) / vec1;
+                g_Y = one_over_sqrt_two_pi * vec2 / vec1;
+            } else { if (link_type == 4)                  { # Binomial.cloglog
+                the_exp = exp (linear_terms)
+                the_exp_exp = exp (- the_exp);
+                # exp(eta) so small that it vanishes next to 1e7
+                is_too_small = ((10000000 + the_exp) == 10000000);
+                the_exp_ratio = (1 - is_too_small) * (1 - the_exp_exp) / (the_exp + is_too_small) + is_too_small * (1 - the_exp / 2);
+                g_Y =  (rowSums (Y) * the_exp_exp - Y [, 2]) / the_exp_ratio;
+                w   =  the_exp_exp * the_exp * rowSums (Y) / the_exp_ratio;
+            } else { if (link_type == 5)                  { # Binomial.cauchit
+                Y_prob = 0.5 + (atan (finite_linear_terms) %*% p_one_m_one) / 3.1415926535897932384626433832795;
+                Y_prob = Y_prob * ((1.0 - rowSums (is_LT_infinite)) %*% ones_2) + is_LT_infinite;
+                y_residual = Y [, 1] * Y_prob [, 2] - Y [, 2] * Y_prob [, 1];
+                var_function = rowSums (Y) * Y_prob [, 1] * Y_prob [, 2];
+                link_gradient_normalized = (1 + linear_terms ^ 2) * 3.1415926535897932384626433832795;
+                g_Y =  rowSums (Y) * y_residual / (var_function * link_gradient_normalized);
+                w   = (rowSums (Y) ^ 2) / (var_function * link_gradient_normalized ^ 2);
+            }}}}   
+        }
+    }
+    
+    return (c(g_Y, w))
+}
+
+
+# Evaluates the part of the log-likelihood that depends on the linear terms.
+#
+# For power distributions (dist_type 1, link_type 1) the value is
+# sum (Y * natural_parameters - b_cumulant); for binomial responses
+# (dist_type 2) it is sum (Y * log (Y_prob)) with the probabilities taken from
+# binomial_probability_two_column. Whenever the value is undefined, NaN or
+# infinite, the flag isNaN is raised and log_l is set to -Inf.
+#
+#   linear_terms (n x 1) matrix of linear predictor values
+#   Y            response matrix (one column for dist_type 1, two for dist_type 2)
+#   dist_type, var_power, link_type, link_power
+#                distribution / link selection, see check_if_supported
+#
+# Returns c(log_l, isNaN).
+glm_log_likelihood_part <- function (linear_terms, Y,
+        dist_type, var_power, link_type, link_power)
+{
+    isNaN = 0;
+    log_l = 0.0;
+    num_records = nrow (Y);
+    zeros_r = matrix (0.0, num_records, 1);
+    
+    if (dist_type == 1 & link_type == 1)
+    { # POWER DISTRIBUTION
+        b_cumulant = zeros_r;
+        natural_parameters = zeros_r;
+        is_natural_parameter_log_zero = zeros_r;
+        if          (var_power == 1.0 & link_power == 0.0)  { # Poisson.log
+            b_cumulant = exp (linear_terms);
+            is_natural_parameter_log_zero = (linear_terms == (-1.0/0.0));
+            # zero out -Inf entries; base R replace() takes (x, index, values)
+            natural_parameters = replace (linear_terms, is_natural_parameter_log_zero, 0);
+        } else { if (var_power == 1.0 & link_power == 1.0)  { # Poisson.id
+            if (sum ((linear_terms < 0.0)) == 0)  {
+                b_cumulant = linear_terms;
+                is_natural_parameter_log_zero = (linear_terms == 0.0);
+                natural_parameters = log (linear_terms + is_natural_parameter_log_zero);
+            } else {isNaN = 1;}
+        } else { if (var_power == 1.0 & link_power == 0.5)  { # Poisson.sqrt
+            if (sum ((linear_terms <0.0)) == 0)  {
+                b_cumulant = linear_terms ^ 2;
+                is_natural_parameter_log_zero = (linear_terms == 0.0);
+                natural_parameters = 2.0 * log (linear_terms + is_natural_parameter_log_zero);
+            } else {isNaN = 1;}
+        } else { if (var_power == 1.0 & link_power  > 0.0)  { # Poisson.power_nonlog, pos
+            if (sum ((linear_terms <0.0)) == 0)  {
+                is_natural_parameter_log_zero = (linear_terms == 0.0);
+                b_cumulant = (linear_terms + is_natural_parameter_log_zero) ^ (1.0 / link_power) - is_natural_parameter_log_zero;
+                natural_parameters = log (linear_terms + is_natural_parameter_log_zero) / link_power;
+            } else {isNaN = 1;}
+        } else { if (var_power == 1.0)                      { # Poisson.power_nonlog, neg
+            if (sum ((linear_terms <= 0.0)) == 0) {
+                b_cumulant = linear_terms ^ (1.0 / link_power);
+                natural_parameters = log (linear_terms) / link_power;
+            } else {isNaN = 1;}
+        } else { if (var_power == 2.0 & link_power == -1.0) { # Gamma.inverse
+            if (sum ((linear_terms <= 0.0)) == 0) {
+                b_cumulant = - log (linear_terms);
+                natural_parameters = - linear_terms;
+            } else {isNaN = 1;}
+        } else { if (var_power == 2.0 & link_power ==  1.0) { # Gamma.id
+            if (sum ((linear_terms <= 0.0)) == 0) {
+                b_cumulant = log (linear_terms);
+                natural_parameters = - 1.0 / linear_terms;
+            } else {isNaN = 1;}
+        } else { if (var_power == 2.0 & link_power ==  0.0) { # Gamma.log
+            b_cumulant = linear_terms;
+            natural_parameters = - exp (- linear_terms);
+        } else { if (var_power == 2.0)                      { # Gamma.power_nonlog
+            if (sum ((linear_terms <= 0.0)) == 0) {
+                b_cumulant = log (linear_terms) / link_power;
+                natural_parameters = - linear_terms ^ (- 1.0 / link_power);
+            } else {isNaN = 1;}
+        } else { if                    (link_power ==  0.0) { # PowerDist.log
+            natural_parameters = exp (linear_terms * (1.0 - var_power)) / (1.0 - var_power);
+            b_cumulant = exp (linear_terms * (2.0 - var_power)) / (2.0 - var_power);
+        } else {                                              # PowerDist.power_nonlog
+            # the exponents -2, -1, 1 and 2 are special-cased; only the general
+            # power requires strictly positive linear terms
+            if          (-2 * link_power == 1.0 - var_power) {
+                natural_parameters = 1.0 / (linear_terms ^ 2) / (1.0 - var_power);
+            } else { if (-1 * link_power == 1.0 - var_power) {
+                natural_parameters = 1.0 / linear_terms / (1.0 - var_power);
+            } else { if (     link_power == 1.0 - var_power) {
+                natural_parameters = linear_terms / (1.0 - var_power);
+            } else { if ( 2 * link_power == 1.0 - var_power) {
+                natural_parameters = linear_terms ^ 2 / (1.0 - var_power);
+            } else {
+                if (sum ((linear_terms <=0.0)) == 0) {
+                    power = (1.0 - var_power) / link_power;
+                    natural_parameters = (linear_terms ^ power) / (1.0 - var_power);
+                } else {isNaN = 1;}
+            }}}}
+            if          (-2 * link_power == 2.0 - var_power) {
+                b_cumulant = 1.0 / (linear_terms ^ 2) / (2.0 - var_power);
+            } else { if (-1 * link_power == 2.0 - var_power) {
+                b_cumulant = 1.0 / linear_terms / (2.0 - var_power);
+            } else { if (     link_power == 2.0 - var_power) {
+                b_cumulant = linear_terms / (2.0 - var_power);
+            } else { if ( 2 * link_power == 2.0 - var_power) {
+                b_cumulant = linear_terms ^ 2 / (2.0 - var_power);
+            } else {
+                if (sum ((linear_terms<= 0.0)) == 0) {
+                    power = (2.0 - var_power) / link_power;
+                    b_cumulant = (linear_terms ^ power) / (2.0 - var_power);
+                } else {isNaN = 1;}
+            }}}}
+        }}}}} }}}}}
+        # a non-zero response where the natural parameter is log(0) makes the
+        # likelihood degenerate
+        if (sum (is_natural_parameter_log_zero * abs (Y)) > 0.0) {
+            log_l = -1.0 / 0.0;
+            isNaN = 1;
+        }
+        if (isNaN == 0)
+        {
+            log_l = sum (Y * natural_parameters - b_cumulant);
+            # first test catches NaN, second one catches +/-Inf
+            if (log_l != log_l | (log_l == log_l + 1.0 & log_l == log_l * 2.0)) {
+                log_l = -1.0 / 0.0;
+                isNaN = 1;
+    }   }   }
+    
+    if (dist_type == 2 & link_type >= 1 & link_type <= 5)
+    { # BINOMIAL/BERNOULLI DISTRIBUTION
+    
+        # the helper returns c(Y_prob, isNaN): both probability columns
+        # flattened column by column, followed by the flag
+        tmp7 = binomial_probability_two_column (linear_terms, link_type, link_power);
+        n_prob = (length (tmp7) - 1) / 2;
+        Y_prob = matrix (tmp7 [1:(2 * n_prob)], n_prob, 2);
+        isNaN = tmp7 [length (tmp7)];
+        
+        if (isNaN == 0) {            
+            does_prob_contradict = (Y_prob <= 0.0);
+            if (sum (does_prob_contradict * abs (Y)) == 0.0) {
+                log_l = sum (Y * log (Y_prob * (1 - does_prob_contradict) + does_prob_contradict));
+                if (log_l != log_l | (log_l == log_l + 1.0 & log_l == log_l * 2.0)) {
+                    isNaN = 1;
+                }
+            } else {
+                log_l = -1.0 / 0.0;
+                isNaN = 1;
+    }   }   }
+    
+    if (isNaN == 1) {
+        log_l = - 1.0 / 0.0; 
+    }
+    
+    # callers read the result as tmp[1] = log_l and tmp[2] = isNaN
+    return (c(log_l, isNaN))
+}
+
+
+
+# Converts linear terms into a two-column matrix of binomial probabilities
+# (column 1: first outcome, column 2: its complement) for the selected link.
+#
+#   linear_terms (n x 1) matrix of linear predictor values
+#   link_type    link family code (1 = power, 2 = logit, 3 = probit, 4 = cloglog, 5 = cauchit)
+#   link_power   link power, consulted only for link_type 1
+#
+# Returns c(Y_prob, isNaN): the (n x 2) probability matrix flattened column by
+# column, followed by a flag that is 1 if the probabilities could not be
+# computed (negative linear terms for a general power link, or an unknown
+# link type).
+binomial_probability_two_column <- function (linear_terms, link_type, link_power)
+{
+    isNaN = 0;
+    num_records = nrow (linear_terms);
+
+    # Define some auxiliary matrices
+
+    ones_2 = matrix (1.0, 1, 2);
+    p_one_m_one = ones_2;
+    p_one_m_one [1, 2] = -1.0;
+    m_one_p_one = ones_2;
+    m_one_p_one [1, 1] = -1.0;
+    zero_one = ones_2;
+    zero_one [1, 1] = 0.0;
+    one_zero = ones_2;
+    one_zero [1, 2] = 0.0;
+
+    zeros_r = matrix (0.0, num_records, 1);
+    ones_r = 1.0 + zeros_r;
+
+    # Begin the function body
+
+    Y_prob = zeros_r %*% ones_2;
+    if (link_type == 1) { # Binomial.power
+        # "p %*% p_one_m_one + ones_r %*% zero_one" yields the row (p, 1 - p)
+        if          (link_power == 0.0) { # Binomial.log
+            Y_prob = exp (linear_terms) %*% p_one_m_one + ones_r %*% zero_one;    
+        } else { if (link_power == 0.5) { # Binomial.sqrt
+            Y_prob = (linear_terms ^ 2) %*% p_one_m_one + ones_r %*% zero_one;    
+        } else {                          # Binomial.power_nonlog
+            if (sum ((linear_terms < 0.0)) == 0) {
+                Y_prob = (linear_terms ^ (1.0 / link_power)) %*% p_one_m_one + ones_r %*% zero_one;    
+            } else {isNaN = 1;}
+        }}
+    } else {              # Binomial.non_power
+        is_LT_pos_infinite = (linear_terms ==  (1.0/0.0));
+        is_LT_neg_infinite = (linear_terms == (-1.0/0.0));
+        is_LT_infinite = is_LT_pos_infinite %*% one_zero + is_LT_neg_infinite %*% zero_one;
+        # zero out +/-Inf entries; base R replace() takes (x, index, values)
+        finite_linear_terms = replace (linear_terms, is_LT_pos_infinite | is_LT_neg_infinite, 0);
+        if (link_type == 2)             { # Binomial.logit
+            Y_prob = exp (finite_linear_terms) %*% one_zero + ones_r %*% zero_one;
+            Y_prob = Y_prob / (rowSums (Y_prob) %*% ones_2);
+        } else { if (link_type == 3)    { # Binomial.probit
+            lt_pos_neg = (finite_linear_terms >= 0.0) %*% p_one_m_one + ones_r %*% zero_one;
+            t_gp = 1.0 / (1.0 + abs (finite_linear_terms) * 0.231641888);  # 0.231641888 = 0.3275911 / sqrt (2.0)
+            pt_gp = t_gp * ( 0.254829592 
+                  + t_gp * (-0.284496736 # "Handbook of Mathematical Functions", ed. by M. Abramowitz and I.A. Stegun,
+                  + t_gp * ( 1.421413741 # U.S. Nat-l Bureau of Standards, 10th print (Dec 1972), Sec. 7.1.26, p. 299
+                  + t_gp * (-1.453152027 
+                  + t_gp *   1.061405429))));
+            the_gauss_exp = exp (- (finite_linear_terms ^ 2) / 2.0);
+            Y_prob = lt_pos_neg + ((the_gauss_exp * pt_gp) %*% ones_2) * (0.5 - lt_pos_neg);
+        } else { if (link_type == 4)    { # Binomial.cloglog
+            the_exp = exp (finite_linear_terms);
+            the_exp_exp = exp (- the_exp);
+            # exp(eta) so small that it vanishes next to 1e7
+            is_too_small = ((10000000 + the_exp)== 10000000);
+            Y_prob [, 1] = (1 - is_too_small) * (1 - the_exp_exp) + is_too_small * the_exp * (1 - the_exp / 2);
+            Y_prob [, 2] = the_exp_exp;
+        } else { if (link_type == 5)    { # Binomial.cauchit
+            Y_prob = 0.5 + (atan (finite_linear_terms) %*% p_one_m_one) / 3.1415926535897932384626433832795;
+        } else {
+            isNaN = 1;
+        }}}}
+        # rows with infinite linear terms get probability exactly (1,0) or (0,1)
+        Y_prob = Y_prob * ((1.0 - rowSums (is_LT_infinite)) %*% ones_2) + is_LT_infinite;
+}   
+
+   return (c(Y_prob, isNaN));
+}            
+
+
+# THE CG-STEIHAUG PROCEDURE SCRIPT
+
+# Apply Conjugate Gradient - Steihaug algorithm in order to approximately minimize
+# 0.5 z^T (X^T diag(w) X + diag (lambda)) z + (g + lambda * beta)^T z
+# under constraint:  ||z|| <= trust_delta.
+# See Alg. 7.2 on p. 171 of "Numerical Optimization" 2nd ed. by Nocedal and Wright
+# IN THE ABOVE, "X" IS UNDERSTOOD TO BE "X %*% (SHIFT/SCALE TRANSFORM)"; this transform
+# is given separately because sparse "X" may become dense after applying the transform.
+#
+get_CG_Steihaug_point <-
+    function (X, scale_X, shift_X, w, g, beta, lambda, trust_delta, max_iter_CG)
+{
+    # Runs the CG-Steihaug iteration for the trust-region subproblem.
+    #
+    # Arguments:
+    #   X                  feature matrix
+    #   scale_X, shift_X   column vectors encoding the shift/scale transform of X
+    #   w                  per-record weights
+    #   g                  gradient (column vector; its row count fixes the size)
+    #   beta, lambda       current coefficients and per-coefficient penalty
+    #   trust_delta        trust-region radius
+    #   max_iter_CG        CG iteration cap; values <= 0 mean "use nrow(g)"
+    #
+    # Returns the flat numeric vector
+    #   c(z, neg_log_l_change, i_CG, reached_trust_boundary)
+    # i.e. the first nrow(g) entries are the step z, followed by three scalars.
+
+    trust_delta_sq = trust_delta ^ 2;
+    size_CG = nrow (g);
+    # In R, diag() applied to an n x 1 matrix EXTRACTS its (length-1) diagonal
+    # instead of building a diagonal matrix, so "diag (scale_X) %*% A" is not
+    # conformable.  Multiplying by the plain vector scales the rows of A, which
+    # is what the diagonal-matrix product is meant to do.
+    scale_vec = as.vector (scale_X);
+    z = matrix (0.0, size_CG, 1);
+    neg_log_l_change = 0.0;
+    reached_trust_boundary = 0;
+    g_reg = g + lambda * beta;
+    r_CG = g_reg;
+    p_CG = -r_CG;
+    rr_CG = sum(r_CG * r_CG);
+    eps_CG = rr_CG * min (0.25, sqrt (rr_CG));
+    converged_CG = 0;
+    if (rr_CG < eps_CG) {
+        converged_CG = 1;
+    }
+    
+    max_iteration_CG = max_iter_CG;
+    if (max_iteration_CG <= 0) {
+        max_iteration_CG = size_CG;
+    }
+    i_CG = 0;
+    while (converged_CG == 0)
+    {
+        i_CG = i_CG + 1;
+        # ssX_p_CG = (SHIFT/SCALE TRANSFORM) %*% p_CG
+        ssX_p_CG = scale_vec * p_CG;
+        ssX_p_CG [size_CG, ] = ssX_p_CG [size_CG, ] + t(shift_X) %*% p_CG;
+        # as.matrix keeps every CG vector a base matrix even when X is a sparse
+        # Matrix object, so that c(...) below really yields a numeric vector.
+        temp_CG = as.matrix (t(X) %*% (w * (X %*% ssX_p_CG)));
+        # q_CG = lambda * p_CG + t(SHIFT/SCALE TRANSFORM) %*% temp_CG
+        q_CG = (lambda * p_CG) + scale_vec * temp_CG + shift_X %*% temp_CG [size_CG, ];
+        pq_CG = sum (p_CG * q_CG);
+        if (pq_CG <= 0) {
+            # Direction of non-positive curvature: move to the boundary if
+            # the direction is non-zero, otherwise stay at the current z.
+            pp_CG = sum (p_CG * p_CG);  
+            if (pp_CG > 0) {
+                tmp6 = get_trust_boundary_point (g_reg, z, p_CG, q_CG, r_CG, pp_CG, pq_CG, trust_delta_sq);
+                # tmp6 is the flat vector c(new_z, f_change): unpack by position
+                # (tmp6[1] alone would be just the first coordinate of new_z).
+                z = matrix (tmp6 [1 : size_CG], size_CG, 1);
+                neg_log_l_change = tmp6 [size_CG + 1];
+                reached_trust_boundary = 1;
+            } else {
+                neg_log_l_change = 0.5 * sum (z * (r_CG + g_reg));
+            }
+            converged_CG = 1;
+        }
+        if (converged_CG == 0) {
+            alpha_CG = rr_CG / pq_CG;
+            new_z = z + alpha_CG * p_CG;
+            if (sum(new_z * new_z) >= trust_delta_sq) {
+                # The full CG step would leave the trust region: clip it.
+                pp_CG = sum (p_CG * p_CG);  
+                tmp8 = get_trust_boundary_point (g_reg, z, p_CG, q_CG, r_CG, pp_CG, pq_CG, trust_delta_sq);
+                z = matrix (tmp8 [1 : size_CG], size_CG, 1);
+                neg_log_l_change = tmp8 [size_CG + 1];
+                reached_trust_boundary = 1;
+                converged_CG = 1;
+            }
+            if (converged_CG == 0) {
+                z = new_z;
+                old_rr_CG = rr_CG;
+                r_CG = r_CG + alpha_CG * q_CG;
+                rr_CG = sum(r_CG * r_CG);
+                if (i_CG == max_iteration_CG | rr_CG < eps_CG) {
+                    neg_log_l_change = 0.5 * sum (z * (r_CG + g_reg));
+                    reached_trust_boundary = 0;
+                    converged_CG = 1;
+                }
+                if (converged_CG == 0) {
+                    p_CG = -r_CG + (rr_CG / old_rr_CG) * p_CG;
+}   }   }   }   
+
+  return (c(z, neg_log_l_change, i_CG, reached_trust_boundary));
+}
+
+
+# An auxiliary function used twice inside the CG-STEIHAUG loop:
+get_trust_boundary_point <- 
+    function (g, z, p, q, r, pp, pq, trust_delta_sq)
+{
+    # Moves from z along direction p until the trust-region boundary
+    # ||z + tau * p||^2 = trust_delta_sq is hit, trying both roots of that
+    # quadratic in tau and keeping the one with the smaller predicted change.
+    #
+    # Returns the flat numeric vector c(new_z, f_change): the boundary point
+    # followed by the predicted objective change as the last entry.
+
+    pz = sum (p * z);
+    zz = sum (z * z);
+    zq = sum (z * q);
+    gp = sum (g * p);
+    f_extra = 0.5 * sum (z * (r + g));
+
+    # Roots of  pp * tau^2 + 2 * pz * tau + (zz - trust_delta_sq) = 0
+    sq_root_d = sqrt (pz * pz - pp * (zz - trust_delta_sq));
+    tau_1 = (- pz + sq_root_d) / pp;
+    tau_2 = (- pz - sq_root_d) / pp;
+
+    # Predicted change of the quadratic model for a step of length tau along p
+    predicted_change = function (tau) {
+        f_extra + (0.5 * tau * pq + zq + gp) * tau
+    };
+    f_change_1 = predicted_change (tau_1);
+    f_change_2 = predicted_change (tau_2);
+
+    if (f_change_1 < f_change_2) {
+        tau_best = tau_1;
+        f_change = f_change_1;
+    } else {
+        tau_best = tau_2;
+        f_change = f_change_2;
+    }
+    new_z = z + (tau_best * p);
+
+    return (c(new_z, f_change))
+}
+
+
+# Computes vector w such that  ||X %*% w - 1|| -> MIN  given  avg(X %*% w) = 1
+# We find z_LS such that ||X %*% z_LS - 1|| -> MIN unconditionally, then scale
+# it to compute  w = c * z_LS  such that  sum(X %*% w) = nrow(X).
+straightenX <- function (X, eps, max_iter_CG)
+{
+    # Solves the ridge-regularized least-squares problem
+    #   ||X %*% z_LS - 1||^2 + lambda_LS * ||z_LS||^2 -> MIN
+    # by conjugate gradients and rescales the solution so that
+    # sum (X %*% w) = nrow (X).
+    #
+    # Arguments:
+    #   X            input matrix
+    #   eps          tolerance; iteration stops once the squared residual
+    #                norm drops below eps * nrow (X)
+    #   max_iter_CG  cap on the number of CG iterations (also capped by ncol (X))
+    # Returns the column vector w.
+
+    num_cols = ncol (X);
+    w_X = t(t(colSums(X)));          # colSums as a column vector
+    lambda_LS = 0.000001 * sum(X ^ 2) / num_cols;
+    eps_LS = eps * nrow(X);
+
+    # BEGIN LEAST SQUARES
+    
+    r_LS = - w_X;
+    z_LS = matrix (0.0, num_cols, 1);
+    p_LS = - r_LS;
+    norm_r2_LS = sum (r_LS ^ 2);
+    i_LS = 0;
+    while (i_LS < max_iter_CG && i_LS < num_cols && norm_r2_LS >= eps_LS)
+    {
+        # Parenthesized so that only matrix-vector products are evaluated;
+        # "t(X) %*% X %*% p" would first build the ncol x ncol Gram matrix
+        # on every iteration because %*% associates to the left.
+        q_LS = t(X) %*% (X %*% p_LS);
+        q_LS = q_LS + lambda_LS * p_LS;
+        alpha_LS = norm_r2_LS / sum (p_LS * q_LS);
+        z_LS = z_LS + alpha_LS * p_LS;
+        old_norm_r2_LS = norm_r2_LS;
+        r_LS = r_LS + alpha_LS * q_LS;
+        norm_r2_LS = sum (r_LS ^ 2);
+        p_LS = -r_LS + (norm_r2_LS / old_norm_r2_LS) * p_LS;
+        i_LS = i_LS + 1;
+    }
+    
+    # END LEAST SQUARES
+    
+    w = (nrow(X) / sum (w_X * z_LS)) * z_LS;
+    return(w);
+}
+
+
+round_to_print <- function (x_to_truncate)
+{
+    # Splits a number into a mantissa rounded to four decimals and a decimal
+    # exponent, so that  x_to_truncate ~ mantissa * 10 ^ eee  with
+    # 1 <= |mantissa| < 10.
+    #
+    # Returns c(mantissa, eee).  Zero, infinite and NaN/NA inputs are passed
+    # through unchanged as the mantissa, with eee = 0.
+
+    mantissa = 1.0;
+    eee = 0;
+    x = abs (x_to_truncate);
+    # "x != x / 2" is true exactly for finite non-zero x, but in R it evaluates
+    # to NA for NaN and makes if() fail; test the same condition explicitly.
+    if (is.finite (x) && x != 0.0) {
+        log_ten = log (10.0);
+        d_eee = round (log (x) / log_ten - 0.5);
+        mantissa = round (x * exp (log_ten * (4.0 - d_eee))) / 10000;
+        if (mantissa == 10.0) {
+            # Rounding carried the mantissa up to the next power of ten
+            mantissa = 1.0;
+            d_eee = d_eee + 1;
+        }
+        if (x_to_truncate < 0.0) {
+            mantissa = - mantissa;
+        }
+        # d_eee is already integer-valued; no bit-by-bit reconstruction needed
+        eee = d_eee;
+    } else { mantissa = x_to_truncate; }
+    
+    return (c(mantissa, eee));
+}
+
+
+# ---- Script inputs ---------------------------------------------------------
+# args[1]  path prefix of the Matrix Market inputs "X.mtx" and "Y.mtx"
+# args[2]  intercept status
+# args[3]  convergence tolerance
+# args[4]  iteration limit (used for both the IRLS and the CG loops)
+# args[5]  distribution family, args[6] variance power, args[7] link type
+# args[8]  family dependent, see below
+X = readMM(paste(args[1], "X.mtx", sep=""));
+Y = readMM(paste(args[1], "Y.mtx", sep=""));
+
+# A single blank stands for "no output / no log file"
+fileO = " ";
+fileLog = " ";
+
+intercept_status = as.integer(args[2]);
+eps = as.double(args[3]);
+max_iteration_IRLS = as.integer(args[4]);
+max_iteration_CG = as.integer(args[4]);
+
+distribution_type = as.integer(args[5]);
+variance_as_power_of_the_mean = as.double(args[6]);
+link_type = as.integer(args[7]); 
+
+# args[8] carries the link power for the power family (distribution_type 1)
+# and the label that encodes "No" for the Bernoulli family.  The Bernoulli
+# label is only consumed when distribution_type == 2, so it must be read in
+# that branch; previously the two branches were swapped and the label read
+# from args[8] could never be used.
+# NOTE(review): confirm this matches the argument order used by the caller.
+if( distribution_type == 1 ) {
+  link_as_power_of_the_mean = as.double(args[8]);
+  bernoulli_No_label = 0.0;
+} else {
+  link_as_power_of_the_mean = 1.0;
+  bernoulli_No_label = as.double(args[8]); 
+}
+
+# Fixed settings of this script; all values above are already doubles or
+# integers, so no further conversion is needed.
+dispersion = 0.0;
+regularization = 0.001;
+
+
+# Output statistics start out as "not computed" (NaN) and are filled in only
+# when the corresponding part of the script runs.
+# NOTE(review): this also resets "dispersion", which was assigned 0.0 earlier
+# in the script, to NaN -- confirm that is intended.
+
+termination_code     = 0;
+min_beta             = NaN;
+i_min_beta           = NaN;
+max_beta             = NaN;
+i_max_beta           = NaN;
+intercept_value      = NaN;
+dispersion           = NaN;
+estimated_dispersion = NaN;
+deviance_nodisp      = NaN;
+deviance             = NaN;
+
+print("BEGIN GLM SCRIPT");
+
+num_records  = nrow (X);
+num_features = ncol (X);
+zeros_r = matrix (0, num_records, 1);
+ones_r = matrix (1, num_records, 1);
+
+# With an intercept (status 1 or 2) a column of ones is appended to X and the
+# intercept coefficient, stored last, is excluded from regularization.
+
+if (intercept_status == 1 | intercept_status == 2)
+{
+    X = cbind (X, ones_r);
+    num_features = ncol (X);
+    scale_lambda = matrix (1, num_features, 1);
+    scale_lambda [num_features, 1] = 0;
+} else {
+    scale_lambda = matrix (1, num_features, 1);
+}
+
+if (intercept_status != 2)
+{
+    # No rescaling requested: identity transform
+    scale_X = matrix (1, num_features, 1);
+    shift_X = matrix (0, num_features, 1);
+    rowSums_X_sq = rowSums (X ^ 2);
+} else {
+    # Scale-&-shift the X columns to mean 0, variance 1.
+    # Important assumption: X [, num_features] = ones_r
+    avg_X_cols = t(t(colSums(X))) / num_records;
+    var_X_cols = (t(t(colSums (X ^ 2))) - num_records * (avg_X_cols ^ 2)) / (num_records - 1);
+    # Columns without positive variance keep a scale factor of 1
+    is_unsafe = (var_X_cols <= 0.0);
+    scale_X = 1.0 / sqrt (var_X_cols * (1 - is_unsafe) + is_unsafe);
+    scale_X [num_features, 1] = 1;
+    shift_X = - avg_X_cols * scale_X;
+    shift_X [num_features, 1] = 0;
+    rowSums_X_sq = (X ^ 2) %*% (scale_X ^ 2) + X %*% (2 * scale_X * shift_X) + sum (shift_X ^ 2);
+}
+
+# Henceforth we replace "X" with "X %*% (SHIFT/SCALE TRANSFORM)" and rowSums(X ^ 2)
+# with "rowSums_X_sq" in order to preserve the sparsity of X under shift and scale.
+# The transform is then associatively applied to the other side of the expression,
+# and is rewritten via "scale_X" and "shift_X" as follows:
+#
+# ssX_A  = (SHIFT/SCALE TRANSFORM) %*% A    --- is rewritten as:
+# ssX_A  = diag (scale_X) %*% A;
+# ssX_A [num_features, ] = ssX_A [num_features, ] + t(shift_X) %*% A;
+#
+# tssX_A = t(SHIFT/SCALE TRANSFORM) %*% A   --- is rewritten as:
+# tssX_A = diag (scale_X) %*% A + shift_X %*% A [num_features, ];
+
+# Remaining input-dependent parameters: the per-coefficient penalty and the
+# CG iteration cap, where 0 means "one iteration per feature".
+
+lambda = scale_lambda * regularization;
+if (max_iteration_CG == 0) {
+    max_iteration_CG = num_features;
+}
+
+# In Bernoulli case, convert one-column "Y" into two-column:
+# column 1 flags the YES-labels, column 2 flags the NO-labels.
+
+if (distribution_type == 2 & ncol(Y) == 1)
+{
+    is_Y_negative = (Y == bernoulli_No_label);
+    # cbind, not append: in R append() concatenates its arguments into one
+    # long vector, which would leave Y without columns.
+    Y = cbind (1 - is_Y_negative, is_Y_negative);
+    count_Y_negative = sum (is_Y_negative);
+    # Both labels have to occur at least once
+    if (count_Y_negative == 0) {
+        stop ("GLM Input Error: all Y-values encode Bernoulli YES-label, none encode NO-label");
+    }
+    if (count_Y_negative == nrow(Y)) {
+        stop ("GLM Input Error: all Y-values encode Bernoulli NO-label, none encode YES-label");
+    }
+}
+
+# Link type 0 requests the canonical link, for which
+# Var(mu) * (d link / d mu) = const.  Family 1 then uses the power link with
+# exponent 1 - (variance power), family 2 uses link type 2; any other family
+# keeps link type 0.
+
+if (link_type == 0)
+{
+    if (distribution_type == 1) {
+        link_as_power_of_the_mean = 1.0 - variance_as_power_of_the_mean;
+        link_type = 1;
+    } else if (distribution_type == 2) {
+        link_type = 2;
+    }
+}
+
+# For power distributions and/or links, we use two constants,
+# "variance as power of the mean" and "link_as_power_of_the_mean",
+# to specify the variance and the link as arbitrary powers of the
+# mean.  However, the variance-powers of 1.0 (Poisson family) and
+# 2.0 (Gamma family) have to be treated as special cases, because
+# these values integrate into logarithms.  The link-power of 0.0
+# is also special as it represents the logarithm link.
+
+# Main driver: checks that the distribution/link combination is supported,
+# initializes the coefficients, runs the trust-region IRLS loop and computes
+# the summary statistics.  termination_code ends up as 1 (converged),
+# 2 (iteration limit reached), 3 (initialization reported NaN) or
+# 4 (unsupported distribution/link).
+num_response_columns = ncol (Y);
+
+is_supported = check_if_supported (num_response_columns, distribution_type, variance_as_power_of_the_mean, link_type, link_as_power_of_the_mean);
+if (is_supported == 1)
+{
+
+#####   INITIALIZE THE BETAS   #####
+
+# NOTE(review): tmp2, tmp3, tmp4 and tmp5 are indexed as if they were tuples.
+# get_CG_Steihaug_point in this file returns a flat c(...) vector, so tmp5[1]
+# is a single number rather than the whole step -- the other helpers are not
+# visible here; confirm their return layout as well.
+tmp2 = glm_initialize (X, Y, distribution_type, variance_as_power_of_the_mean, link_type, link_as_power_of_the_mean, intercept_status, max_iteration_CG);
+beta = tmp2[1];
+saturated_log_l = tmp2[2]
+isNaN = tmp2[3];
+
+
+if (isNaN == 0)
+{
+
+#####  START OF THE MAIN PART  #####
+
+sum_X_sq = sum (rowSums_X_sq);
+trust_delta = 0.5 * sqrt (num_features) / max (sqrt (rowSums_X_sq));
+###  max_trust_delta = trust_delta * 10000.0;
+log_l = 0.0;
+deviance_nodisp = 0.0;
+new_deviance_nodisp = 0.0;
+isNaN_log_l = 2;
+newbeta = beta;
+g = matrix (0.0, num_features, 1);
+g_norm = sqrt (sum ((g + lambda * beta) ^ 2));
+accept_new_beta = 1;
+reached_trust_boundary = 0;
+neg_log_l_change_predicted = 0.0;
+i_IRLS = 0;
+
+print ("BEGIN IRLS ITERATIONS...");
+
+# Linear terms for the initial coefficients, with the shift/scale transform
+# applied to the coefficient side.
+# NOTE(review): scale_X is an n x 1 matrix; in R diag() of a matrix extracts
+# its diagonal instead of building a diagonal matrix -- confirm that
+# "diag (scale_X) %*% ..." does what is intended here and below.
+ssX_newbeta = diag (scale_X) %*% newbeta;
+ssX_newbeta [num_features, ] = ssX_newbeta [num_features, ] + t(shift_X) %*% newbeta;
+all_linear_terms = X %*% ssX_newbeta;
+
+# NOTE(review): "DEBUG1"/"DEBUG2" look like leftover debugging output.
+print("DEBUG1")
+tmp4 = glm_log_likelihood_part(all_linear_terms, Y, distribution_type, variance_as_power_of_the_mean, link_type, link_as_power_of_the_mean);
+new_log_l = tmp4[1];
+isNaN_new_log_l = tmp4[2];
+
+if (isNaN_new_log_l == 0) {
+    new_deviance_nodisp = 2.0 * (saturated_log_l - new_log_l);
+    new_log_l = new_log_l - 0.5 * sum (lambda * newbeta ^ 2);
+}
+
+print("DEBUG2")
+
+
+# set w to avoid 'Initialization of w depends on if-else/while execution' warnings
+w = matrix (0.0, 1, 1);
+# Each pass decides whether to accept the previous candidate (newbeta),
+# refreshes gradient and weights if it was accepted, and computes the next
+# candidate step with CG-Steihaug.
+while (termination_code == 0)
+{
+    accept_new_beta = 1;
+    
+    if (i_IRLS > 0)
+    {
+        if (isNaN_log_l == 0) {
+            accept_new_beta = 0;
+        }
+
+# Decide whether to accept a new iteration point and update the trust region
+# See Alg. 4.1 on p. 69 of "Numerical Optimization" 2nd ed. by Nocedal and Wright
+
+        # rho = actual decrease / predicted decrease of the objective
+        rho = (- new_log_l + log_l) / neg_log_l_change_predicted;
+        if (rho < 0.25 | isNaN_new_log_l == 1) {
+            trust_delta = 0.25 * trust_delta;
+        }
+        if (rho > 0.75 & isNaN_new_log_l == 0 & reached_trust_boundary == 1) {
+            trust_delta = 2 * trust_delta;
+            
+### if (trust_delta > max_trust_delta) {
+###     trust_delta = max_trust_delta;
+### }
+
+        }
+        if (rho > 0.1 & isNaN_new_log_l == 0) {
+            accept_new_beta = 1;
+        }
+    }
+
+    # NOTE(review): fileLog is set to " " at the top of the script and not
+    # changed in the visible code, so the logging branches appear to be dead;
+    # log_str is not initialized anywhere in the visible code.
+    if (fileLog != " ") {
+        log_str = append (log_str, "IS_POINT_UPDATED," + i_IRLS + "," + accept_new_beta);
+        log_str = append (log_str, "TRUST_DELTA,"      + i_IRLS + "," + trust_delta);
+    }
+    if (accept_new_beta == 1)
+    {
+        beta = newbeta;  log_l = new_log_l;  deviance_nodisp = new_deviance_nodisp;  isNaN_log_l = isNaN_new_log_l;
+        
+        tmp3 = glm_dist (all_linear_terms, Y, distribution_type, variance_as_power_of_the_mean, link_type, link_as_power_of_the_mean);
+        g_Y = tmp3[1];
+        w = tmp3[2];
+        
+        # We introduced these variables to avoid roundoff errors:
+        #     g_Y = y_residual / (y_var * link_grad);
+        #     w   = 1.0 / (y_var * link_grad * link_grad);
+                      
+        # Gradient with the transposed shift/scale transform applied
+        gXY = - t(X) %*% g_Y;
+        g = diag (scale_X) %*% gXY + shift_X %*% gXY [num_features, ];
+        g_norm = sqrt (sum ((g + lambda * beta) ^ 2));
+        
+        if (fileLog != " ") {
+            log_str = append (log_str, "GRADIENT_NORM," + i_IRLS + "," + g_norm);
+        }
+    }
+    
+    tmp5 = get_CG_Steihaug_point (X, scale_X, shift_X, w, g, beta, lambda, trust_delta, max_iteration_CG);
+    z = tmp5[1];
+    neg_log_l_change_predicted = tmp5[2];
+    num_CG_iters  = tmp5[3];
+    reached_trust_boundary = tmp5[4];
+
+
+    newbeta = beta + z;
+    
+    ssX_newbeta = diag (scale_X) %*% newbeta;
+    ssX_newbeta [num_features, ] = ssX_newbeta [num_features, ] + t(shift_X) %*% newbeta;
+    all_linear_terms = X %*% ssX_newbeta;
+    
+    tmp4 = glm_log_likelihood_part(all_linear_terms, Y, distribution_type, variance_as_power_of_the_mean, link_type, link_as_power_of_the_mean);
+    new_log_l = tmp4[1];
+    isNaN_new_log_l = tmp4[2];
+
+    if (isNaN_new_log_l == 0) {
+        new_deviance_nodisp = 2.0 * (saturated_log_l - new_log_l);
+        new_log_l = new_log_l - 0.5 * sum (lambda * newbeta ^ 2);
+    }
+        
+    log_l_change = new_log_l - log_l;               # R's criterion for termination: |dev - devold|/(|dev| + 0.1) < eps
+
+    # Converged: the step stayed inside the trust region, the likelihood is
+    # valid, and its change is small in relative terms.
+    if (reached_trust_boundary == 0 & isNaN_new_log_l == 0 & 
+        (2.0 * abs (log_l_change) < eps * (deviance_nodisp + 0.1) | abs (log_l_change) < (abs (log_l) + abs (new_log_l)) * 0.00000000000001) )  
+    {
+        termination_code = 1;
+    }
+    rho = - log_l_change / neg_log_l_change_predicted;
+    z_norm = sqrt (sum (z * z));
+    
+    # Mantissa/exponent pairs for the progress message below
+    tmp9 = round_to_print (z_norm);
+    z_norm_m = tmp9[1];
+    z_norm_e = tmp9[2];
+    tmp9 = round_to_print (trust_delta);
+    trust_delta_m = tmp9[1];
+    trust_delta_e = tmp9[2];
+    tmp9 = round_to_print (rho);
+    rho_m = tmp9[1];
+    rho_e = tmp9[2];
+    tmp9 = round_to_print (new_log_l);
+    new_log_l_m = tmp9[1];
+    new_log_l_e = tmp9[2]; 
+    tmp9 = round_to_print (log_l_change);
+    log_l_change_m = tmp9[1];
+    log_l_change_e = tmp9[2];
+    tmp9 = round_to_print (g_norm);
+    g_norm_m = tmp9[1];
+    g_norm_e = tmp9[2];
+
+    i_IRLS = i_IRLS + 1;
+    # NOTE(review): "+" is not defined for character strings in base R, so
+    # this message (and the log_str lines) would need paste0 () -- confirm.
+    print ("Iter #" + i_IRLS + " completed"
+        + ", ||z|| = " + z_norm_m + "E" + z_norm_e
+        + ", trust_delta = " + trust_delta_m + "E" + trust_delta_e
+        + ", reached = " + reached_trust_boundary
+        + ", ||g|| = " + g_norm_m + "E" + g_norm_e
+        + ", new_log_l = " + new_log_l_m + "E" + new_log_l_e
+        + ", log_l_change = " + log_l_change_m + "E" + log_l_change_e
+        + ", rho = " + rho_m + "E" + rho_e);
+        
+    if (fileLog != " ") {
+        log_str = append (log_str, "NUM_CG_ITERS,"     + i_IRLS + "," + num_CG_iters);
+        log_str = append (log_str, "IS_TRUST_REACHED," + i_IRLS + "," + reached_trust_boundary);
+        log_str = append (log_str, "POINT_STEP_NORM,"  + i_IRLS + "," + z_norm);
+        log_str = append (log_str, "OBJECTIVE,"        + i_IRLS + "," + (- new_log_l));
+        log_str = append (log_str, "OBJ_DROP_REAL,"    + i_IRLS + "," + log_l_change);
+        log_str = append (log_str, "OBJ_DROP_PRED,"    + i_IRLS + "," + (- neg_log_l_change_predicted));
+        log_str = append (log_str, "OBJ_DROP_RATIO,"   + i_IRLS + "," + rho);
+        log_str = append (log_str, "LINEAR_TERM_MIN,"  + i_IRLS + "," + min (all_linear_terms));
+        log_str = append (log_str, "LINEAR_TERM_MAX,"  + i_IRLS + "," + max (all_linear_terms));
+    }
+        
+    # Give up once the iteration limit is reached
+    if (i_IRLS == max_iteration_IRLS) {
+        termination_code = 2;
+    }
+}
+
+# The last candidate becomes the final estimate
+beta = newbeta;
+log_l = new_log_l;
+deviance_nodisp = new_deviance_nodisp;
+
+if (termination_code == 1) {
+    print ("Converged in " + i_IRLS + " steps.");
+} else {
+    print ("Did not converge.");
+}
+
+# Coefficients expressed for the original (unscaled) X
+ssX_beta = diag (scale_X) %*% beta;
+ssX_beta [num_features, ] = ssX_beta [num_features, ] + t(shift_X) %*% beta;
+# NOTE(review): R's append () concatenates into one vector; if a two-column
+# result is intended here, cbind () would be needed -- confirm.
+if (intercept_status == 2) {
+    beta_out = append (ssX_beta, beta);
+} else {
+    beta_out = ssX_beta;
+}
+
+# args[9] is the path prefix under which the weights are written as "w"
+writeMM(as(w,"CsparseMatrix"), paste(args[9], "w", sep=""));
+
+# NOTE(review): as.scalar, rowIndexMin and rowIndexMax are not base R
+# functions; unless they are defined elsewhere these calls fail -- confirm.
+if (intercept_status == 1 | intercept_status == 2) {
+    intercept_value = as.scalar (beta_out [num_features, 1]);
+    beta_noicept = beta_out [1 : (num_features - 1), 1];
+} else {
+    beta_noicept = beta_out [1 : num_features, 1];
+}
+min_beta = min (beta_noicept);
+max_beta = max (beta_noicept);
+tmp_i_min_beta = rowIndexMin (t(beta_noicept))
+i_min_beta = as.scalar (tmp_i_min_beta [1, 1]);
+tmp_i_max_beta = rowIndexMax (t(beta_noicept))
+i_max_beta = as.scalar (tmp_i_max_beta [1, 1]);
+
+#####  OVER-DISPERSION PART  #####
+
+all_linear_terms = X %*% ssX_beta;
+tmp3 = glm_dist (all_linear_terms, Y, distribution_type, variance_as_power_of_the_mean, link_type, link_as_power_of_the_mean);
+g_Y = tmp3[1]
+w = tmp3[2];    
+    
+pearson_residual_sq = g_Y ^ 2 / w;
+# NOTE(review): base R's replace () takes (x, list, values); the named
+# arguments target/pattern/replacement are not accepted by it -- confirm.
+pearson_residual_sq = replace (target = pearson_residual_sq, pattern = 0.0/0.0, replacement = 0);
+# pearson_residual_sq = (y_residual ^ 2) / y_var;
+
+if (num_records > num_features) {
+    estimated_dispersion = sum (pearson_residual_sq) / (num_records - num_features);
+}
+# NOTE(review): dispersion was reset to 0.0 / 0.0 (NaN) with the default
+# statistics, so this comparison is made against NaN -- confirm intent.
+if (dispersion <= 0.0) {
+    dispersion = estimated_dispersion;
+}
+deviance = deviance_nodisp / dispersion;
+
+#####  END OF THE MAIN PART  #####
+
+} else { print ("Input matrices are out of range.  Terminating the DML."); termination_code = 3; }
+} else { print ("Distribution/Link not supported.  Terminating the DML."); termination_code = 4; }
+
+# Report the final statistics as "NAME,value" records, one per line.
+# The records are built with paste0 () because "+" is not defined for
+# character strings in R; append () collects them in a character vector.
+str = paste0 ("TERMINATION_CODE,", termination_code);
+str = append (str, paste0 ("BETA_MIN,", min_beta));
+str = append (str, paste0 ("BETA_MIN_INDEX,", i_min_beta));
+str = append (str, paste0 ("BETA_MAX,", max_beta));
+str = append (str, paste0 ("BETA_MAX_INDEX,", i_max_beta));
+str = append (str, paste0 ("INTERCEPT,", intercept_value));
+str = append (str, paste0 ("DISPERSION,", dispersion));
+str = append (str, paste0 ("DISPERSION_EST,", estimated_dispersion));
+str = append (str, paste0 ("DEVIANCE_UNSCALED,", deviance_nodisp));
+str = append (str, paste0 ("DEVIANCE_SCALED,", deviance));
+writeLines (str);
+
+

[2/9] incubator-systemml git commit: [SYSTEMML-1285] New basic code generator for operator fusion

Posted by mb...@apache.org.

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/d7fd5879/src/main/java/org/apache/sysml/hops/codegen/template/CplanRegister.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/hops/codegen/template/CplanRegister.java b/src/main/java/org/apache/sysml/hops/codegen/template/CplanRegister.java
new file mode 100644
index 0000000..a4bcffe
--- /dev/null
+++ b/src/main/java/org/apache/sysml/hops/codegen/template/CplanRegister.java
@@ -0,0 +1,168 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ * 
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.sysml.hops.codegen.template;
+
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.LinkedHashMap;
+import java.util.Map.Entry;
+
+import org.apache.sysml.api.DMLScript;
+import org.apache.sysml.hops.Hop;
+import org.apache.sysml.hops.codegen.cplan.CNode;
+import org.apache.sysml.hops.codegen.cplan.CNodeCell;
+import org.apache.sysml.hops.codegen.cplan.CNodeData;
+import org.apache.sysml.hops.codegen.cplan.CNodeRowAggVector;
+import org.apache.sysml.hops.codegen.cplan.CNodeTpl;
+import org.apache.sysml.hops.codegen.template.BaseTpl.TemplateType;
+import org.apache.sysml.parser.Expression.DataType;
+import org.apache.sysml.runtime.matrix.data.Pair;
+import org.apache.sysml.utils.Statistics;
+
+public class CplanRegister {
+	
+	//HashMap: key: TemplateType - Value: List of all the patterns fused by that template 
+	//LinkedHashMap: key: HopID of the original hop to be fused , Value: Input hops to the fused operation 
+	  	//Note: LinkedHashMap holds intermediate cplans as well (e.g, log(exp(round(X))) ) We store in the LinkedHashMao three keys 
+			    //for the three hops (log, exp and round). The key that was inserted last is the key of the hop to be fused
+		
+	private HashMap<TemplateType, ArrayList<LinkedHashMap<Long, Pair<Hop[],CNodeTpl>>>>  _cplans;
+	
+	public CplanRegister() {
+		_cplans = new HashMap<TemplateType, ArrayList<LinkedHashMap<Long, Pair<Hop[],CNodeTpl>>>>();
+	}
+	
+	public void insertCpplans(TemplateType type, LinkedHashMap<Long, Pair<Hop[],CNodeTpl>> cplans) {
+		if( !_cplans.containsKey(type) )
+			_cplans.put(type, new ArrayList<LinkedHashMap<Long, Pair<Hop[],CNodeTpl>>>());
+		
+		_cplans.get(type).add(cplans);
+		
+		if( DMLScript.STATISTICS )
+			Statistics.incrementCodegenCPlanCompile(1); 
+		//note: cplans.size() would also contain all subsets of cpplans
+	}
+
+	public boolean containsHop(TemplateType type, long hopID) {
+		if(!_cplans.containsKey(type))
+			return false;
+		for (LinkedHashMap<Long, Pair<Hop[],CNodeTpl>> cpplans : _cplans.get(type) )
+			if(cpplans.containsKey(hopID))
+				return true;
+		
+		return false;
+	}
+	
+	public LinkedHashMap<Long, Pair<Hop[],CNodeTpl>> getTopLevelCplans()
+	{
+		if( _cplans.isEmpty() )
+			return new LinkedHashMap<Long, Pair<Hop[],CNodeTpl>>();
+			
+		//resolve conflicts, i.e., overlap, between template types 
+		resolvePlanConflicts(); 
+		
+		//extract top level (subsuming) cplans per type and operator chain
+		LinkedHashMap<Long, Pair<Hop[],CNodeTpl>> ret = new LinkedHashMap<Long, Pair<Hop[],CNodeTpl>>();
+		for (TemplateType key : _cplans.keySet()) {
+			for (LinkedHashMap<Long, Pair<Hop[],CNodeTpl>> intermediateCplans : _cplans.get(key)) {
+				Entry<Long, Pair<Hop[],CNodeTpl>> cplan = TemplateUtils.getTopLevelCpplan(intermediateCplans);
+				if(cplan !=null)
+					ret.put(cplan.getKey(), cplan.getValue());			
+			}
+		}
+		
+		//merge top level plans if possible //TODO move to rowagg template
+		ret = mergeRowAggregateCellwisePlans(ret);
+		
+		return ret;
+	}
+	
+	/**
+	 * Resolves conflicts between overlapping cplans of different types.
+	 * 
+	 */
+	private void resolvePlanConflicts()
+	{
+		//get different plan categories
+		ArrayList<LinkedHashMap<Long, Pair<Hop[], CNodeTpl>>> cellwisePlans = _cplans.get(TemplateType.CellTpl);
+		ArrayList<LinkedHashMap<Long, Pair<Hop[], CNodeTpl>>> outerprodPlans = _cplans.get(TemplateType.OuterProductTpl);
+		ArrayList<LinkedHashMap<Long, Pair<Hop[], CNodeTpl>>> rowaggPlans = _cplans.get(TemplateType.RowAggTpl);
+		
+		//prefer outer product plans over cellwise plans -> remove overlap
+		if( cellwisePlans != null && outerprodPlans != null ) {
+			for( LinkedHashMap<Long, Pair<Hop[],CNodeTpl>> outerprodCplan : outerprodPlans ) {
+				for( LinkedHashMap<Long, Pair<Hop[],CNodeTpl>> map : cellwisePlans )
+					for( Long key : outerprodCplan.keySet() )
+						map.remove(key);
+			}		
+		}
+		
+		//prefer row aggregate plans over cellwise plans -> remove overlap
+		if( cellwisePlans != null && rowaggPlans != null ) {
+			for( LinkedHashMap<Long, Pair<Hop[],CNodeTpl>> rowaggCplan : rowaggPlans ) {
+				for( LinkedHashMap<Long, Pair<Hop[],CNodeTpl>> map : cellwisePlans )
+					for( Long key : rowaggCplan.keySet() )
+						map.remove(key);
+			}	
+		}
+	}
+	
+	private static LinkedHashMap<Long, Pair<Hop[],CNodeTpl>> mergeRowAggregateCellwisePlans(LinkedHashMap<Long, Pair<Hop[],CNodeTpl>> plans)
+	{
+		LinkedHashMap<Long, Pair<Hop[],CNodeTpl>> ret = new LinkedHashMap<Long, Pair<Hop[],CNodeTpl>>(plans);
+		
+		//extract row aggregate templates
+		HashMap<Long, Pair<Hop[],CNodeTpl>> rowaggPlans = new HashMap<Long, Pair<Hop[],CNodeTpl>>();
+		for( Entry<Long, Pair<Hop[],CNodeTpl>> e : plans.entrySet() )
+			if( e.getValue().getValue() instanceof CNodeRowAggVector )
+				rowaggPlans.put(e.getKey(), e.getValue());
+		
+		//probe and merge row aggregate secondary inputs (by definition vectors)
+		for( Entry<Long, Pair<Hop[],CNodeTpl>> e : rowaggPlans.entrySet() ) {
+			//check all inputs for existing cell plans
+			Hop[] inputs = e.getValue().getKey();
+			for( int i=1; i<inputs.length; i++ ) {
+				long inhopID = inputs[i].getHopID();
+				if( ret.containsKey(inhopID) && ret.get(inhopID).getValue() instanceof CNodeCell
+					&& !((CNodeCell)ret.get(inhopID).getValue()).hasMultipleConsumers() ) 
+				{
+					//merge row agg template
+					CNodeRowAggVector rowaggtpl = (CNodeRowAggVector) e.getValue().getValue();
+					CNodeCell celltpl = (CNodeCell)ret.get(inhopID).getValue();
+					celltpl.getInput().get(0).setDataType(DataType.MATRIX);
+					rowaggtpl.rReplaceDataNode(rowaggtpl.getOutput(), inhopID, celltpl.getOutput());
+					rowaggtpl.rInsertLookupNode(rowaggtpl.getOutput(), 
+						((CNodeData)celltpl.getInput().get(0)).getHopID(), new HashMap<Long, CNode>());
+					for( CNode input : celltpl.getInput() )
+						rowaggtpl.addInput(input);
+					HashSet<Long> inputIDs = TemplateUtils.rGetInputHopIDs(rowaggtpl.getOutput(), new HashSet<Long>());
+					Hop[] hops = TemplateUtils.mergeDistinct(inputIDs, inputs, ret.get(inhopID).getKey());
+					e.getValue().setKey(hops);
+					
+					//remove cell template 
+					ret.remove(inhopID);
+				}
+			}
+		}
+		
+		return ret;
+	}
+}

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/d7fd5879/src/main/java/org/apache/sysml/hops/codegen/template/OuterProductTpl.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/hops/codegen/template/OuterProductTpl.java b/src/main/java/org/apache/sysml/hops/codegen/template/OuterProductTpl.java
new file mode 100644
index 0000000..c202d3c
--- /dev/null
+++ b/src/main/java/org/apache/sysml/hops/codegen/template/OuterProductTpl.java
@@ -0,0 +1,489 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ * 
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.sysml.hops.codegen.template;
+
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.LinkedHashMap;
+import java.util.List;
+
+import org.apache.sysml.api.DMLException;
+import org.apache.sysml.hops.AggBinaryOp;
+import org.apache.sysml.hops.AggUnaryOp;
+import org.apache.sysml.hops.BinaryOp;
+import org.apache.sysml.hops.Hop;
+import org.apache.sysml.hops.Hop.OpOp2;
+import org.apache.sysml.hops.ReorgOp;
+import org.apache.sysml.hops.UnaryOp;
+import org.apache.sysml.hops.Hop.AggOp;
+import org.apache.sysml.hops.Hop.Direction;
+import org.apache.sysml.hops.Hop.ReOrgOp;
+import org.apache.sysml.hops.codegen.cplan.CNode;
+import org.apache.sysml.hops.codegen.cplan.CNodeBinary;
+import org.apache.sysml.hops.codegen.cplan.CNodeBinary.BinType;
+import org.apache.sysml.hops.codegen.cplan.CNodeData;
+import org.apache.sysml.hops.codegen.cplan.CNodeOuterProduct;
+import org.apache.sysml.hops.codegen.cplan.CNodeTpl;
+import org.apache.sysml.hops.codegen.cplan.CNodeUnary;
+import org.apache.sysml.hops.codegen.cplan.CNodeUnary.UnaryType;
+import org.apache.sysml.hops.rewrite.HopRewriteUtils;
+import org.apache.sysml.parser.Expression.DataType;
+import org.apache.sysml.runtime.codegen.SpoofOuterProduct.OutProdType;
+import org.apache.sysml.runtime.matrix.data.Pair;
+
+public class OuterProductTpl extends BaseTpl {
+	
+	/**
+	 * Creates the template and passes TemplateType.OuterProductTpl to the BaseTpl constructor.
+	 */
+	public OuterProductTpl() {
+		super(TemplateType.OuterProductTpl);
+	}
+	
+	//binary operations treated as sparse-safe (mult, div) when searching for the driver matrix X
+	private List<OpOp2> sparseDrivers = new ArrayList<OpOp2>(Arrays.asList(OpOp2.MULT, OpOp2.DIV));	
+	//outer product type found during the boundary search; while null, getOuterProductType infers it from dimensions
+	private OutProdType _outerProductType = null;
+	//set for a left outer product whose matrix multiply has no transpose parent; forwarded to the generated template
+	private boolean _transposeOutput = false;
+	//set when the right operand of the outer product is not a transpose, in which case V is wrapped into a transpose
+	private boolean _transposeInput = false;
+	
+	/**
+	 * Checks whether the given hop can open an outer-product template, i.e., whether it
+	 * is a matrix multiplication with known dimensions whose output dimensions are both
+	 * larger than the common dimension.
+	 * 
+	 * @param hop candidate hop
+	 * @return true if the hop is an outer-product-like matrix multiplication
+	 */
+	@Override
+	public boolean openTpl(Hop hop) {
+		//only matrix multiplications with known output dimensions qualify
+		if( !(hop instanceof AggBinaryOp) || !((AggBinaryOp)hop).isMatrixMultiply() || !hop.dimsKnown() )
+			return false;
+		
+		//both operands need known dimensions as well
+		Hop left = hop.getInput().get(0);
+		if( !left.dimsKnown() )
+			return false;
+		Hop right = hop.getInput().get(1);
+		if( !right.dimsKnown() )
+			return false;
+		
+		//outer product: output rows and columns both exceed the common dimension
+		return hop.getDim1() > left.getDim2() && hop.getDim2() > right.getDim1();
+	}
+
+	/**
+	 * Determines the boundaries of the outer-product template that ends at the given
+	 * matrix multiplication and collects the matrix inputs of the fused operator.
+	 * 
+	 * The left operand is registered as U and the right operand as V (or the input of
+	 * the right operand, if that operand is a transpose). The parents of h are then
+	 * searched via rfindOuterProduct for the driver matrix X and the top hop of the
+	 * pattern. On success, _matrixInputs holds the major input matrix, U, and V (in this
+	 * order), followed by additional matrices and the direct inputs of h.
+	 * 
+	 * @param h matrix multiplication that forms the outer product
+	 * @param cplanRegister register of existing plans (not referenced by this method)
+	 * @return true if X, U, and V were identified, an initial hop different from h was
+	 *         found, and all additional matrices share a common child with X
+	 */
+	@Override
+	public boolean findTplBoundaries(Hop h, CplanRegister cplanRegister) { 
+		_endHop = h;//outerProduct tpl starts with endHop
+		HashMap<String,Hop> uniqueMatrixInputs = new HashMap<String,Hop>();
+		uniqueMatrixInputs.put("U",  h.getInput().get(0));
+		if( h.getInput().get(1) instanceof ReorgOp && ((ReorgOp)h.getInput().get(1)).getOp() == ReOrgOp.TRANSPOSE )
+			uniqueMatrixInputs.put("V",  h.getInput().get(1).getInput().get(0));
+		else
+		{
+			_transposeInput = true; // we need to transpose V to be tall and skinny
+			uniqueMatrixInputs.put("V",  h.getInput().get(1));
+		}
+		//walk up the parents of the outer product to find X and the initial (top) hop
+		rfindOuterProduct(_endHop, _endHop, uniqueMatrixInputs, h.getDim1(), h.getDim2(), new HashSet<Long>());
+		
+		//size 3 means that X was found in addition to U and V
+		if(uniqueMatrixInputs.size() == 3 && _initialHop != null && _initialHop != _endHop )	//sanity check
+		{
+			//check if added matrices can be inferred from input matrices for example (X!=0) or abs(X) are not different from X
+			Hop commonChild = null;
+			if(! _adddedMatrices.isEmpty() ) {
+				//if addedMatrices does not have a common child with input X then do not compile
+				commonChild = TemplateUtils.commonChild(_adddedMatrices,uniqueMatrixInputs.get("X"));
+				if(commonChild == null ) // there are multiple matrices involved other than X
+						return false;
+			}
+			if(commonChild != null) {
+				_matrixInputs.add(commonChild); //add common child as the major input matrix
+				_adddedMatrices.add(uniqueMatrixInputs.get("X")); // put unique matrix as one of the additional matrices that is a chain of cell wise operations for the input matrix
+			}
+			else {
+				_matrixInputs.add(uniqueMatrixInputs.get("X")); //major matrix is the sparse driver
+			}
+			_matrixInputs.add(uniqueMatrixInputs.get("U"));
+			
+			if(_transposeInput) {
+				//V was not given as a transpose, so a new transpose hop over V becomes the input
+				ReorgOp transposeV = HopRewriteUtils.createTranspose(uniqueMatrixInputs.get("V"));
+				//ReorgOp transposeV = new ReorgOp("", uniqueMatrixInputs.get("V").getDataType(), uniqueMatrixInputs.get("V").getValueType(), ReOrgOp.TRANSPOSE, uniqueMatrixInputs.get("V"));
+				_matrixInputs.add(transposeV);
+			}
+			else {
+				_matrixInputs.add(uniqueMatrixInputs.get("V"));
+			}
+			
+			
+			//add also added matrices so that they can be interpreted as inputs
+			for(Hop addedMatrix : _adddedMatrices)
+				if(!_matrixInputs.contains(addedMatrix))
+					_matrixInputs.add(addedMatrix);
+		
+			//add the children of _endHop (this handles the case of right wdivmm, where both t(V) and V are added as inputs)
+			//note: these are added without a contains check, so entries may repeat
+			for (Hop hop: _endHop.getInput())
+				_matrixInputs.add(hop);
+			
+			return true;
+		}
+		else
+			return false;
+		
+	}	
+	/**
+	 * Recursively walks from the outer product upwards through the parents of each hop
+	 * in order to find the driver matrix X, the initial (top) hop of the template, and
+	 * the outer product type. Results are recorded in uniqueMatrixInputs (key "X", and
+	 * possibly a replaced "V"), _initialHop, _outerProductType, _transposeInput,
+	 * _transposeOutput, and _adddedMatrices.
+	 * 
+	 * @param child hop from which h was reached (for the first call, the outer product itself)
+	 * @param h hop that is currently inspected
+	 * @param uniqueMatrixInputs map of the named inputs "U", "V", and, once found, "X"
+	 * @param outerProductDim1 number of rows of the outer product
+	 * @param outerProductDim2 number of columns of the outer product
+	 * @param memo IDs of hops whose parents have already been processed
+	 */
+	private void rfindOuterProduct(Hop child, Hop h, HashMap<String,Hop> uniqueMatrixInputs, long outerProductDim1, long outerProductDim2, HashSet<Long> memo)
+	{
+		if(memo.contains(h.getHopID()))
+			return;
+		
+		if( ( h instanceof UnaryOp || h instanceof BinaryOp  )  //unary operation or binary operation
+				&& 	h.getDataType() == DataType.MATRIX			 // Output is a matrix
+				&& h.getDim1() == outerProductDim1 && h.getDim2() == outerProductDim2 // output is the same size as the matrix
+				&& TemplateUtils.isOperationSupported(h))  // operation is supported in codegen
+		{
+			if(h instanceof BinaryOp)
+			{
+				
+				// find the other child rather than the one that called the parent
+				Hop otherChild = h.getInput().get(0) !=  child ? h.getInput().get(0) : h.getInput().get(1);
+				
+				//if scalar or vector then we fuse it similar to the way we fuse celltpl,
+				if(TemplateUtils.isVectorOrScalar(otherChild))
+				{
+					_initialHop = h;
+					_outerProductType = OutProdType.CELLWISE_OUTER_PRODUCT;
+
+				}
+				// other child is a  matrix
+				else
+				{
+					//if the binary operation is sparse safe (mult, div)
+					if(sparseDrivers.contains(((BinaryOp)h).getOp()) ) 
+					{
+						//only the first sparse-safe matrix operand becomes X; later ones are ignored here
+						if(!uniqueMatrixInputs.containsKey("X"))
+						{
+							//extra sanity check
+							if(otherChild.getDim1() == outerProductDim1 && otherChild.getDim2() == outerProductDim2) {
+								uniqueMatrixInputs.put("X", otherChild);
+								_initialHop = h;
+							}
+							else { //matrix size does not match what is expected for X
+								return; 
+							}
+						}
+					}
+					else {
+						//matrix operand of a non-sparse-safe operation; checked later against X via a common child
+						_adddedMatrices.add(otherChild);
+					}
+				}
+			}
+		}
+		
+		if(  h instanceof AggBinaryOp && ((AggBinaryOp) h).isMatrixMultiply() && h != child) //make sure that the AggBinaryOp is not the same as the outerproduct that triggered this method
+		{
+			if(memo.contains(h.getInput().get(0).getHopID())) { // if current node is the parent for the left child then it is right matrix multiply
+			
+				if (h.getInput().get(1) == uniqueMatrixInputs.get("V") )//right operand is V
+				{
+					_initialHop = h;
+					_outerProductType = OutProdType.RIGHT_OUTER_PRODUCT;
+					return;
+				}
+				//right operand is t(V)
+				else if(h.getInput().get(1) instanceof ReorgOp && ((ReorgOp)h.getInput().get(1)).getOp() == ReOrgOp.TRANSPOSE && h.getInput().get(1).getInput().get(0) == uniqueMatrixInputs.get("V") )
+				{
+					//replace V with T(V)
+					uniqueMatrixInputs.put("V", h.getInput().get(1));
+					_transposeInput = false; //no need to transpose Input
+					_initialHop = h;
+					_outerProductType = OutProdType.RIGHT_OUTER_PRODUCT;
+					return;
+				}
+				else
+				{
+					//unrelated matrix multiply: the template ends below it
+					_initialHop = h.getInput().get(0); // set the child that was processed
+					return;	
+				}
+			}
+			else {//left matrix multiply
+				
+				//left is T(U) 
+				if (h.getInput().get(0) instanceof ReorgOp && ((ReorgOp)h.getInput().get(0)).getOp() == ReOrgOp.TRANSPOSE && h.getInput().get(0).getInput().get(0) == uniqueMatrixInputs.get("U") ) 
+				{
+					_initialHop = h;
+					_outerProductType = OutProdType.LEFT_OUTER_PRODUCT;
+					//T(T(U) %*% ..)
+					for(Hop hParent : h.getParent())
+						if(hParent instanceof ReorgOp && ((ReorgOp)hParent).getOp() == ReOrgOp.TRANSPOSE) {
+							_initialHop = hParent; // set the transpose hop
+							return;
+						}	
+					//no transpose parent found, so the output is flagged for transposition
+					_transposeOutput = true;
+					return;
+				}
+				else {
+					//unrelated matrix multiply: the template ends below it
+					_initialHop = h.getInput().get(1); // set the child that was processed
+					return;	
+				}
+			}
+		}
+		
+		//a full sum over rows and columns terminates the pattern as an aggregate outer product
+		if( h instanceof AggUnaryOp && ((AggUnaryOp) h).getOp() == AggOp.SUM 
+			&& ((AggUnaryOp) h).getDirection() == Direction.RowCol)
+		{
+			_initialHop = h;
+			_outerProductType = OutProdType.AGG_OUTER_PRODUCT;
+			return;
+		}
+		
+		//mark h as processed before visiting its parents (the matrix multiply check above relies on this)
+		memo.add(h.getHopID());
+		//process parents recursively
+		for( Hop parent : h.getParent())
+			rfindOuterProduct(h, parent,uniqueMatrixInputs, outerProductDim1,outerProductDim2, memo);
+	}
+	
+	////////////////Helper methods for finding boundaries 
+	/**
+	 * Returns the outer product type of this template. A type that was already
+	 * determined is returned as is; otherwise the type is inferred from the data type
+	 * and dimensions of the output, cached, and returned.
+	 * 
+	 * @param X driver matrix
+	 * @param U left factor
+	 * @param V right factor
+	 * @param out output hop of the template
+	 * @return the outer product type, or null if it could not be inferred
+	 */
+	private OutProdType getOuterProductType(Hop X, Hop U, Hop V, Hop out)
+	{
+		//reuse the type found during the boundary search, if any
+		if( _outerProductType != null )
+			return _outerProductType;
+		
+		//infer the type from the shape of the output
+		OutProdType inferred = null;
+		if( out.getDataType() == DataType.SCALAR ) {
+			//scalar output, i.e., a full sum
+			inferred = OutProdType.AGG_OUTER_PRODUCT;
+		}
+		else if( isDimsEqual(out,V) && out instanceof ReorgOp ) {
+			//the ReorgOp check disambiguates U and V, which have equal dimensions for square X
+			inferred = OutProdType.LEFT_OUTER_PRODUCT;
+		}
+		else if( isDimsEqual(out,U) ) {
+			inferred = OutProdType.RIGHT_OUTER_PRODUCT;
+		}
+		else if( isDimsEqual(out,X) ) {
+			inferred = OutProdType.CELLWISE_OUTER_PRODUCT;
+		}
+		
+		_outerProductType = inferred;
+		return inferred;
+	}
+	
+	/**
+	 * Compares the dimensions of two hops.
+	 * 
+	 * @param hop1 first hop
+	 * @param hop2 second hop
+	 * @return true if both hops have the same number of rows and columns
+	 */
+	private static boolean isDimsEqual(Hop hop1, Hop hop2)
+	{
+		return hop1.getDim1() == hop2.getDim1() 
+			&& hop1.getDim2() == hop2.getDim2();
+	}
+	
+	/**
+	 * Constructs the cplans of this template. The three main inputs (X, U, V) are
+	 * first registered as initial cnodes with the dimensions used in the generated
+	 * code, then the plan is built recursively starting at the initial hop.
+	 * 
+	 * @param compileLiterals flag that is passed through to the operand construction
+	 * @return map from hop ID to the pair of input hops and constructed template
+	 * @throws DMLException if the plan construction fails
+	 */
+	@Override
+	public LinkedHashMap<Long, Pair<Hop[],CNodeTpl>> constructTplCplan(boolean compileLiterals) throws DMLException {
+
+		//matrix X is accessed as a scalar in the generated code
+		Hop driverX = _matrixInputs.get(0);
+		_initialCnodes.add(new CNodeData(driverX, 1, 1, DataType.SCALAR));
+		
+		//matrix U is registered with one row and the column count of U
+		Hop factorU = _matrixInputs.get(1);
+		_initialCnodes.add(new CNodeData(factorU, 1, (int)factorU.getDim2(), DataType.MATRIX));
+		
+		//matrix V is registered with one row and the column count of V
+		Hop factorV = _matrixInputs.get(2);
+		_initialCnodes.add(new CNodeData(factorV, 1, (int)factorV.getDim2(), DataType.MATRIX));
+		
+		//build the plan bottom-up, with the initial hop as root
+		rConstructOuterProdCplan(_initialHop, _initialHop, new HashSet<Long>(), compileLiterals);
+		return _cpplans;		
+	}
+	
+	/**
+	 * Recursively constructs the cplan of the outer-product template. Inputs are
+	 * processed before the hop itself; code generation starts once _endHop (the
+	 * outer product) has been reached and all inputs of the current hop are available.
+	 * The resulting cnode is either used to create the first template in _cpplans or
+	 * wired into the existing templates via TemplateUtils.setOutputToExistingTemplate.
+	 * 
+	 * @param root top hop of the template (the initial hop)
+	 * @param hop hop that is currently processed
+	 * @param memo IDs of hops for which code has already been generated
+	 * @param compileLiterals flag that is passed through to TemplateUtils.fetchOperands
+	 * @throws DMLException if the plan construction fails
+	 */
+	private void rConstructOuterProdCplan(Hop root, Hop hop, HashSet<Long> memo, boolean compileLiterals) throws DMLException
+	{
+		if( memo.contains(hop.getHopID()) )
+			return;
+		//process childs recursively
+		for( Hop c : hop.getInput() )
+			rConstructOuterProdCplan(root, c, memo, compileLiterals);
+		
+		//organize the main inputs
+		Hop X, U, V;
+		X = _matrixInputs.get(0);
+		U = _matrixInputs.get(1);
+		V = _matrixInputs.get(2);
+		if(hop==_endHop)
+			_endHopReached = true;
+		
+		 // first hop to enter here should be _endHop
+		if(TemplateUtils.inputsAreGenerated(hop,_matrixInputs,_cpplans) && _endHopReached)  // if direct children are DataGenOps, literals, or already in the cpplans then we are ready to generate code
+		{
+			CNodeOuterProduct outerProdTmpl = null;
+			
+			//Fetch operands
+			CNode out = null;
+			ArrayList<CNode> addedCNodes = new ArrayList<CNode>();
+			ArrayList<Hop> addedHops = new ArrayList<Hop>();
+			ArrayList<CNode> cnodeData = TemplateUtils.fetchOperands(hop, _cpplans, addedCNodes, addedHops, _initialCnodes, compileLiterals);
+			
+			//if operands are scalar or independent from X 
+			//(never the case for the root, which always has to be generated)
+			boolean independentOperands = hop != root && (hop.getDataType() == DataType.SCALAR || TemplateUtils.isOperandsIndependent(cnodeData, addedHops, new String[]{_matrixInputs.get(0).getName(),_matrixInputs.get(1).getName(),_matrixInputs.get(2).getName()}));
+			if(!independentOperands)
+			{
+				if(hop instanceof UnaryOp)
+				{
+					CNode cdata1 = cnodeData.get(0);
+					
+					//Primitive Operation has the same name as Hop Type OpOp1
+					String primitiveOpName = ((UnaryOp)hop).getOp().toString();
+					out = new CNodeUnary(cdata1, UnaryType.valueOf(primitiveOpName));
+				}
+				else if(hop instanceof BinaryOp)
+				{
+					CNode cdata1 = cnodeData.get(0);
+					CNode cdata2 = cnodeData.get(1);
+					
+					//Primitive Operation has the same name as Hop Type OpOp2
+					String primitiveOpName = ((BinaryOp)hop).getOp().toString();
+					
+					//cdata1 is a vector: access it through a lookup
+					if( (cdata1.getNumRows() > 1 && cdata1.getNumCols() == 1) || (cdata1.getNumRows() == 1 && cdata1.getNumCols() > 1) )
+					{
+						//second argument is always the vector
+						cdata1 = new CNodeUnary(cdata1, UnaryType.LOOKUP);
+						//out = new CNodeBinary(tmp, cdata2, BinType.valueOf(primitiveOpName));
+					}
+					//cdata1 is a matrix 
+					else if ( (cdata1.getNumRows() > 1 && cdata1.getNumCols() > 1) )
+					{
+						CellTpl cellTpl = new CellTpl();
+						cdata1 = cellTpl.fuseCellWise(hop.getInput().get(0), _matrixInputs.get(0), compileLiterals); // second argument is always matrix X
+						//no cell-wise plan for this input: give up on this hop (it is not added to the memo)
+						if (cdata1 == null)
+							return;
+					}
+					//cdata2 is vector
+					//else if( cdata2 instanceof CNodeData && (((CNodeData)cdata2).getNumRows() > 1 && ((CNodeData)cdata2).getNumCols() == 1) || ( ((CNodeData)cdata2).getNumRows() == 1 && ((CNodeData)cdata2).getNumCols() > 1  ))
+					if( (cdata2.getNumRows() > 1 && cdata2.getNumCols() == 1) || (cdata2.getNumRows() == 1 && cdata2.getNumCols() > 1) )
+					{
+						cdata2 = new CNodeUnary(cdata2, UnaryType.LOOKUP);
+						//out = new CNodeBinary(cdata1, tmp, BinType.valueOf(primitiveOpName));
+					}
+					//cdata2 is a matrix 
+					else if ( (cdata2.getNumRows() > 1 && cdata2.getNumCols() > 1) )
+					{
+						CellTpl cellTpl = new CellTpl();
+						cdata2 = cellTpl.fuseCellWise(hop.getInput().get(1), _matrixInputs.get(0), compileLiterals); // second argument is always matrix X
+						//no cell-wise plan for this input: give up on this hop (it is not added to the memo)
+						if (cdata2 == null)
+							return;
+					}
+					out = new CNodeBinary(cdata1, cdata2, BinType.valueOf(primitiveOpName));
+				}
+				else if(hop instanceof AggBinaryOp)
+				{
+					CNode cdata1 = cnodeData.get(0);
+					CNode cdata2 = cnodeData.get(1); // remember that we already fetched what is under transpose
+					
+					//outer product U%*%t(V), for which V should have been passed in as the input
+					if(hop.getInput().get(0) == U && hop.getInput().get(1) instanceof ReorgOp && hop.getInput().get(1).getInput().get(0)  == V)
+					{
+						//re-assign cdata2 to read V instead of t(V)
+						cdata2 = _initialCnodes.get(2); // the initialCNodes holds V
+						out = new CNodeBinary(cdata1, cdata2, BinType.DOT_PRODUCT);
+					}
+					
+					//outer product U%*%V, for which the transpose of V should have been passed in as the input
+					else if(hop.getInput().get(0) == U &&  V instanceof ReorgOp && V.getInput().get(0)== hop.getInput().get(1))
+					{
+						//re-assign cdata2 to read t(V) instead of V
+						cdata2 = _initialCnodes.get(2); // the initialCNodes holds transpose of V
+						out = new CNodeBinary(cdata1, cdata2, BinType.DOT_PRODUCT);
+					}
+					//outerproduct U%*%V  but not right wdivmm so we did not pass T(V)
+					else if(hop.getInput().get(0) == U &&  hop.getInput().get(1) == V )
+					{
+						//re-assign cdata2 to read the third initial cnode
+						cdata2 = _initialCnodes.get(2); // third initial cnode, created from _matrixInputs.get(2)
+						out = new CNodeBinary(cdata1, cdata2, BinType.DOT_PRODUCT);
+					}
+					
+					//left outerproduct (i.e., left operand is T(U) )
+					else if(hop.getInput().get(0) instanceof ReorgOp && hop.getInput().get(0).getInput().get(0)  == U)
+					{
+						//scalar is cdata2
+						out = new CNodeBinary(cdata2, cdata1, BinType.VECT_MULT_ADD);
+					}
+					
+					//right outerproduct (i.e., right operand is V )
+					else if(hop.getInput().get(1) != U && hop.getInput().get(1) == V)
+					{
+						cdata2 = _initialCnodes.get(2);
+						out = new CNodeBinary(cdata1, cdata2, BinType.VECT_MULT_ADD);
+					}
+					
+					//right outerproduct (i.e., right operand is t(V) )
+					else if(hop.getInput().get(1) instanceof ReorgOp && hop.getInput().get(1).getInput().get(0)  == V)
+					{
+						cdata2 = _initialCnodes.get(2);
+						out = new CNodeBinary(cdata1, cdata2, BinType.VECT_MULT_ADD);
+					}
+				}
+				else if ( hop instanceof ReorgOp && ((ReorgOp)hop).getOp() == ReOrgOp.TRANSPOSE && root == hop) // if transpose, wire the input in T( T(U ...)
+				{
+					out =  cnodeData.get(0);
+				}
+				else if (hop instanceof AggUnaryOp && ((AggUnaryOp)hop).getOp() == AggOp.SUM && root == hop
+					&& ((AggUnaryOp)hop).getDirection() == Direction.RowCol )
+				{
+					//root sum: the input is passed through unchanged
+					out =  cnodeData.get(0);
+				}
+			}
+			// wire output to the template
+			if(out != null || independentOperands)
+			{
+				if(_cpplans.isEmpty())
+				{
+					//first initialization has to have the first variable as input
+					ArrayList<CNode> initialInputs = new ArrayList<CNode>();
+					
+					if(independentOperands) // pass the hop itself as an input instead of its children
+					{
+						CNode c =  new CNodeData(hop);
+						initialInputs.addAll(_initialCnodes);
+						initialInputs.add(c);
+						outerProdTmpl =  new CNodeOuterProduct(initialInputs, c); 
+						outerProdTmpl.setOutProdType(getOuterProductType(X, U, V, root));
+						outerProdTmpl.setTransposeOutput(_transposeOutput);
+						_cpplans.put(hop.getHopID(), new Pair<Hop[],CNodeTpl>(new Hop[] {X,U,V,hop} ,outerProdTmpl));
+					}
+					else
+					{
+						initialInputs.addAll(_initialCnodes);
+						initialInputs.addAll(cnodeData);
+						outerProdTmpl =  new CNodeOuterProduct(initialInputs, out); 
+						outerProdTmpl.setOutProdType(getOuterProductType(X, U, V, root));
+						outerProdTmpl.setTransposeOutput(_transposeOutput);
+								
+						//input hops of the template: X, U, V followed by the additionally fetched hops
+						Hop[] hopArray = new Hop[addedHops.size()+3];
+						hopArray[0] = X;
+						hopArray[1] = U;
+						hopArray[2] = V;
+						
+						System.arraycopy( addedHops.toArray(), 0, hopArray, 3, addedHops.size());
+						
+						_cpplans.put(hop.getHopID(), new Pair<Hop[],CNodeTpl>(hopArray,outerProdTmpl));
+					}
+				}
+				else
+				{
+					if(independentOperands)
+					{
+						CNode c =  new CNodeData(hop);
+						//clear Operands
+						addedCNodes.clear();
+						addedHops.clear();
+						
+						//added the current hop as the input
+						addedCNodes.add(c);
+						addedHops.add(hop);
+						out = c;
+					}
+					
+					//wire the output to existing or new template	
+					TemplateUtils.setOutputToExistingTemplate(hop, out, _cpplans, addedCNodes, addedHops);
+				}
+			}
+			memo.add(hop.getHopID());
+		}
+	}
+}

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/d7fd5879/src/main/java/org/apache/sysml/hops/codegen/template/RowAggTpl.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/hops/codegen/template/RowAggTpl.java b/src/main/java/org/apache/sysml/hops/codegen/template/RowAggTpl.java
new file mode 100644
index 0000000..0aff9ae
--- /dev/null
+++ b/src/main/java/org/apache/sysml/hops/codegen/template/RowAggTpl.java
@@ -0,0 +1,321 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ * 
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.sysml.hops.codegen.template;
+
+import java.util.ArrayList;
+import java.util.HashSet;
+import java.util.LinkedHashMap;
+
+import org.apache.sysml.api.DMLException;
+import org.apache.sysml.hops.AggBinaryOp;
+import org.apache.sysml.hops.AggUnaryOp;
+import org.apache.sysml.hops.BinaryOp;
+import org.apache.sysml.hops.DataGenOp;
+import org.apache.sysml.hops.DataOp;
+import org.apache.sysml.hops.Hop;
+import org.apache.sysml.hops.LiteralOp;
+import org.apache.sysml.hops.ReorgOp;
+import org.apache.sysml.hops.UnaryOp;
+import org.apache.sysml.hops.codegen.cplan.CNode;
+import org.apache.sysml.hops.codegen.cplan.CNodeBinary;
+import org.apache.sysml.hops.codegen.cplan.CNodeBinary.BinType;
+import org.apache.sysml.hops.codegen.cplan.CNodeData;
+import org.apache.sysml.hops.codegen.cplan.CNodeRowAggVector;
+import org.apache.sysml.hops.codegen.cplan.CNodeTpl;
+import org.apache.sysml.hops.codegen.cplan.CNodeUnary;
+import org.apache.sysml.hops.codegen.cplan.CNodeUnary.UnaryType;
+import org.apache.sysml.hops.Hop.AggOp;
+import org.apache.sysml.hops.Hop.Direction;
+import org.apache.sysml.hops.Hop.OpOp2;
+import org.apache.sysml.hops.Hop.ReOrgOp;
+import org.apache.sysml.parser.Expression.DataType;
+import org.apache.sysml.runtime.matrix.data.Pair;
+
+public class RowAggTpl extends BaseTpl {
+
+	/**
+	 * Creates the template and passes TemplateType.RowAggTpl to the BaseTpl constructor.
+	 */
+	public RowAggTpl() {
+		super(TemplateType.RowAggTpl);
+	}
+	
+	/**
+	 * Checks whether the given hop can open a row aggregate template, i.e., whether
+	 * it is an aggregate operation (binary or unary) with a vector output.
+	 * 
+	 * @param hop candidate hop
+	 * @return true if the hop is an aggregate that produces a row or column vector
+	 */
+	@Override
+	public boolean openTpl(Hop hop) {
+		//only aggregate operations qualify
+		if( !(hop instanceof AggBinaryOp || hop instanceof AggUnaryOp) )
+			return false;
+		
+		//the output has to be a vector: exactly one of the two dimensions equals one
+		boolean rowVector = hop.getDim1()==1 && hop.getDim2()!=1;
+		boolean colVector = hop.getDim1()!=1 && hop.getDim2()==1;
+		return rowVector || colVector;
+	}
+
+	/**
+	 * Determines the boundaries of the row aggregate template rooted at the given hop.
+	 * For a matrix multiplication root, the left operand has to be a transpose, whose
+	 * input is registered as the main matrix input. The inputs are then searched for
+	 * the end hop of the pattern.
+	 * 
+	 * @param initialHop root hop of the template
+	 * @param cplanRegister register of existing plans
+	 * @return true if an end hop was found and the register does not yet contain a
+	 *         row aggregate plan for the root
+	 */
+	@Override
+	public boolean findTplBoundaries(Hop initialHop, CplanRegister cplanRegister) {
+		_initialHop = initialHop;
+		
+		if( initialHop instanceof AggBinaryOp ) {
+			// for simplicity the first operand is assumed to be t(X); later on it could be W.T(X)
+			Hop left = initialHop.getInput().get(0);
+			boolean leftIsTranspose = left instanceof ReorgOp 
+				&& ((ReorgOp)left).getOp() == ReOrgOp.TRANSPOSE;
+			if( !leftIsTranspose )
+				return false;
+			_matrixInputs.add(left.getInput().get(0)); //add what is under the transpose
+		}
+		
+		//search the inputs for the end hop of the pattern
+		rFindRowAggPattern(initialHop, new HashSet<Long>());
+		
+		return !cplanRegister.containsHop(TemplateType.RowAggTpl, initialHop.getHopID())
+			&& _endHop != null;
+	}
+	
+	
+	/**
+	 * Recursively searches the inputs of the root aggregate for the end hop of the row
+	 * aggregate pattern and records it in _endHop. Two patterns are recognized:
+	 * a matrix multiply X %*% v below a root t(X) %*% ..., and a rowSums below a root
+	 * colSums (whose input is then added to _matrixInputs). The traversal continues
+	 * through the root itself and through supported unary operations and
+	 * matrix-vector/matrix-scalar binary operations with matrix output.
+	 * 
+	 * Scalars, data ops, data generators, and literals terminate the search.
+	 * 
+	 * @param h hop that is currently inspected
+	 * @param memo IDs of hops whose inputs have already been traversed
+	 */
+	private void rFindRowAggPattern(Hop h, HashSet<Long> memo)
+	{
+		if(memo.contains(h.getHopID()) || h.getDataType() == DataType.SCALAR 
+			|| h instanceof DataOp || h instanceof DataGenOp || h instanceof LiteralOp) {
+			return;
+		}
+		
+		//the root may be a matrix multiply (see openTpl), so its type has to be checked
+		//before casting; an unconditional cast fails for patterns like t(X) %*% rowSums(..)
+		boolean rootIsColSums = _initialHop instanceof AggUnaryOp
+			&& ((AggUnaryOp)_initialHop).getDirection() == Direction.Col 
+			&& ((AggUnaryOp)_initialHop).getOp() == AggOp.SUM;
+		
+		boolean continueTraversing = false;
+		if (h instanceof AggBinaryOp)
+		{
+			if(h != _initialHop) {
+				//T(X) % ..... X %*% v ,check that X is the same as what we saw previously under transpose
+				//(no X has been registered if the root is not a matrix multiply, hence the emptiness check)
+				if( !_matrixInputs.isEmpty() && h.getInput().get(0).equals(_matrixInputs.get(0)) 
+					&& TemplateUtils.isVector(h.getInput().get(1)) ) {
+					_endHop = h;
+				}
+			}
+			else {
+				continueTraversing = true;
+			}
+		}
+		// if initial hop is colSums continue
+		else if(h instanceof AggUnaryOp && rootIsColSums && h == _initialHop)
+		{
+			continueTraversing=true;
+		}
+		//rowSums(X)
+		else if(h instanceof AggUnaryOp && ((AggUnaryOp)h).getDirection() == Direction.Row && ((AggUnaryOp)h).getOp() == AggOp.SUM )
+		{
+			// check if root pattern is colsums
+			if( rootIsColSums )
+			{
+				//TODO Now the pattern is limited to finding rowSums 
+				_matrixInputs.add(h.getInput().get(0));
+				_endHop = h;
+			}
+		}
+		// unary operation || binary operation with first input as a matrix || binary operation with second input as a matrix 
+		else if( ( h instanceof UnaryOp || (h instanceof BinaryOp && h.getInput().get(0).getDataType() == DataType.MATRIX  &&  TemplateUtils.isVectorOrScalar(h.getInput().get(1))) || (h instanceof BinaryOp && TemplateUtils.isVectorOrScalar(h.getInput().get(0))  &&  h.getInput().get(1).getDataType() == DataType.MATRIX)  )  //unary operation or binary operation with one matrix and a scalar
+				&& 	h.getDataType() == DataType.MATRIX		 // Output is a matrix
+				&&  TemplateUtils.isOperationSupported(h) )	 //Operation is supported in codegen
+		{
+			continueTraversing = true;
+		}
+		
+		//check if we should continue traversing
+		if(!continueTraversing)
+		{
+			return; // stop traversing if conditions does not apply 
+		}
+		else
+		{
+			//process childs recursively
+			for( Hop in : h.getInput() )
+				rFindRowAggPattern(in,memo);
+		}
+	    memo.add(h.getHopID());
+	}
+	
+	/**
+	 * Constructs the cplans of this template. The main matrix input is registered as
+	 * the initial cnode, then the plan is built recursively starting at the initial hop.
+	 * 
+	 * @param compileLiterals flag that is passed through to the operand construction
+	 * @return map from hop ID to the pair of input hops and constructed template
+	 * @throws DMLException if the plan construction fails
+	 */
+	@Override
+	public LinkedHashMap<Long, Pair<Hop[],CNodeTpl>> constructTplCplan(boolean compileLiterals)
+		throws DMLException {
+		
+		//the main matrix input is always the first input of the template
+		Hop mainInput = _matrixInputs.get(0);
+		_initialCnodes.add(new CNodeData(mainInput));
+		
+		//build the plan bottom-up, with the initial hop as root
+		HashSet<Long> memo = new HashSet<Long>();
+		rConstructRowAggCplan(_initialHop, _initialHop, memo, compileLiterals);
+		return _cpplans;
+	}
+	
+	/**
+	 * Recursively constructs the cplan of the row aggregate template. Inputs are
+	 * processed before the hop itself; code generation starts once _endHop has been
+	 * reached and all inputs of the current hop are available. The resulting cnode is
+	 * either used to create the first template in _cpplans or wired into the existing
+	 * templates via TemplateUtils.setOutputToExistingTemplate.
+	 * 
+	 * Hops for which no cnode can be constructed (e.g., binary operations on a matrix
+	 * other than division) leave the output unset and are not wired into a template.
+	 * 
+	 * @param root top hop of the template (the initial hop)
+	 * @param hop hop that is currently processed
+	 * @param memo IDs of hops for which code has already been generated
+	 * @param compileLiterals flag that is passed through to TemplateUtils.fetchOperands
+	 * @throws DMLException if the plan construction fails
+	 */
+	private void rConstructRowAggCplan(Hop root, Hop hop, HashSet<Long> memo, boolean compileLiterals) throws DMLException
+	{
+		if( memo.contains(hop.getHopID()) )
+			return;
+		//process childs recursively
+		for( Hop c : hop.getInput() )
+			rConstructRowAggCplan(root, c, memo, compileLiterals);
+		if(hop == _endHop)
+			_endHopReached = true;
+		
+		 // first hop to enter here should be _endHop
+		if(TemplateUtils.inputsAreGenerated(hop,_matrixInputs,_cpplans) && _endHopReached)  // if direct children are DataGenOps, literals, or already in the cpplans then we are ready to generate code
+		{
+			CNodeRowAggVector rowTmpl = null;
+			
+			//Fetch operands
+			CNode out = null;
+			ArrayList<CNode> addedCNodes = new ArrayList<CNode>();
+			ArrayList<Hop> addedHops = new ArrayList<Hop>();
+			ArrayList<CNode> cnodeData = TemplateUtils.fetchOperands(hop, _cpplans, addedCNodes, addedHops, _initialCnodes, compileLiterals);
+			
+			//if operands are scalar or independent from X 
+			boolean independentOperands = hop.getDataType() == DataType.SCALAR 
+					|| TemplateUtils.isOperandsIndependent(cnodeData, addedHops, new String[] {_matrixInputs.get(0).getName()});
+			
+			if(!independentOperands)
+			{
+			
+				if(hop instanceof AggUnaryOp)
+				{
+					CNode cdata1 = cnodeData.get(0);
+					//set the out cnode based on the operation
+					if(  ((AggUnaryOp)hop).getDirection() == Direction.Row && ((AggUnaryOp)hop).getOp() == AggOp.SUM  ) //RowSums 
+					{
+						//rowSums over a single column degenerates to a value lookup
+						if(hop.getInput().get(0).getDim2()==1)
+							out = (cdata1.getDataType()==DataType.SCALAR) ? cdata1 : new CNodeUnary(cdata1,UnaryType.LOOKUP);
+						else
+							out = new CNodeUnary(cdata1, UnaryType.ROW_SUMS);
+					}
+					// if colsums is the root hop, wire the input to the out because colsums it is done automatically by the template
+					else  if (((AggUnaryOp)hop).getDirection() == Direction.Col && ((AggUnaryOp)hop).getOp() == AggOp.SUM && root == hop)
+					{
+						//vector div add without temporary copy
+						if(cdata1 instanceof CNodeBinary && ((CNodeBinary)cdata1).getType()==BinType.VECT_DIV_SCALAR)
+							out = new CNodeBinary(cdata1.getInput().get(0), cdata1.getInput().get(1), BinType.VECT_DIV_ADD);
+						else	
+							out = cdata1;
+					}
+				}
+				else if(hop instanceof AggBinaryOp)
+				{
+					//Fetch operands specific to the operation
+					CNode cdata1 = cnodeData.get(0);
+					CNode cdata2 = cnodeData.get(1);
+					
+					//choose the operation based on the transpose
+					if( hop.getInput().get(0) instanceof ReorgOp && ((ReorgOp)hop.getInput().get(0)).getOp()==ReOrgOp.TRANSPOSE )
+					{
+						//fetch the data inside the transpose
+						//cdata1 = new CNodeData(hop.getInput().get(0).getInput().get(0).getName(), (int)hop.getInput().get(0).getInput().get(0).getDim1(), (int)hop.getInput().get(0).getInput().get(0).getDim2());
+						out = new CNodeBinary(cdata2, cdata1, BinType.VECT_MULT_ADD);
+					}
+					else
+					{
+						//both inputs have a single column: plain multiplication of two values
+						if(hop.getInput().get(0).getDim2()==1 && hop.getInput().get(1).getDim2()==1)
+							out = new CNodeBinary((cdata1.getDataType()==DataType.SCALAR)? cdata1 : new CNodeUnary(cdata1, UnaryType.LOOKUP0),
+								(cdata2.getDataType()==DataType.SCALAR)? cdata2 : new CNodeUnary(cdata2, UnaryType.LOOKUP0), BinType.MULT);
+						else	
+							out = new CNodeBinary(cdata1, cdata2, BinType.DOT_PRODUCT);
+					}
+				}
+				else if(hop instanceof BinaryOp)
+				{
+					CNode cdata1 = cnodeData.get(0);
+					CNode cdata2 = cnodeData.get(1);
+					
+					// if one input is a matrix then we need to do vector by scalar operations
+					if(hop.getInput().get(0).getDim1() > 1 && hop.getInput().get(0).getDim2() > 1 )
+					{
+						//only division is handled here; for any other operation out remains null
+						if (((BinaryOp)hop).getOp()== OpOp2.DIV) {
+							//CNode generatedScalar = new CNodeData("1", 0, 0); // generate literal in order to rewrite the div to x * 1/y
+							//CNode outScalar = new CNodeBinary(generatedScalar, cdata2, BinType.SCALAR_DIVIDE);
+							//out = new CNodeBinary(outScalar, cdata1, BinType.VECT_MULT_ADD);
+							out = new CNodeBinary(cdata1, cdata2, BinType.VECT_DIV_SCALAR);
+						}
+					}
+					else //one input is a vector/scalar other is a scalar
+					{
+						//Primitive Operation has the same name as Hop Type OpOp2
+						String primitiveOpName = ((BinaryOp)hop).getOp().toString();
+						
+						if( (cdata1.getNumRows() > 1 && cdata1.getNumCols() == 1) || (cdata1.getNumRows() == 1 && cdata1.getNumCols() > 1) )
+						{
+							//second argument is always the vector
+							cdata1 = new CNodeUnary(cdata1, UnaryType.LOOKUP);
+							//out = new CNodeBinary(tmp, cdata2, BinType.valueOf(primitiveOpName));
+						}
+						//cdata2 is vector
+						//else if( cdata2 instanceof CNodeData && (((CNodeData)cdata2).getNumRows() > 1 && ((CNodeData)cdata2).getNumCols() == 1) || ( ((CNodeData)cdata2).getNumRows() == 1 && ((CNodeData)cdata2).getNumCols() > 1  ))
+						if( (cdata2.getNumRows() > 1 && cdata2.getNumCols() == 1) || (cdata2.getNumRows() == 1 && cdata2.getNumCols() > 1) )
+						{
+							cdata2 = new CNodeUnary(cdata2, UnaryType.LOOKUP);
+							//out = new CNodeBinary(cdata1, tmp, BinType.valueOf(primitiveOpName));
+						}
+						out = new CNodeBinary(cdata1, cdata2, BinType.valueOf(primitiveOpName));	
+					}
+					
+				}
+				
+				//out is still null for hops without a matching case above (e.g., unary operations),
+				//so guard the dimension update instead of failing with a NullPointerException
+				if( out != null && out.getDataType().isMatrix() ) {
+					out.setNumRows(hop.getDim1());
+					out.setNumCols(hop.getDim2());
+				}
+			}
+			// wire output to the template
+			if(out != null || independentOperands)
+			{
+				if(_cpplans.isEmpty())
+				{
+					//first initialization has to have the first variable as input
+					ArrayList<CNode> initialInputs = new ArrayList<CNode>();
+										
+					if(independentOperands) // pass the hop itself as an input instead of its children
+					{
+						CNode c =  new CNodeData(hop);
+						initialInputs.addAll(_initialCnodes);
+						initialInputs.add(c);
+						rowTmpl =  new CNodeRowAggVector(initialInputs, c); 
+						_cpplans.put(hop.getHopID(), new Pair<Hop[],CNodeTpl>(new Hop[] {_matrixInputs.get(0),hop} ,rowTmpl));
+					}
+					else
+					{
+						initialInputs.addAll(_initialCnodes);
+						initialInputs.addAll(cnodeData);
+						rowTmpl =  new CNodeRowAggVector(initialInputs, out); 
+						
+						//input hops of the template: the main matrix followed by the additionally fetched hops
+						//Hop[] hopArray = new Hop[hop.getInput().size()+1];
+						Hop[] hopArray = new Hop[addedHops.size()+1];
+						hopArray[0] = _matrixInputs.get(0);
+						
+						//System.arraycopy( hop.getInput().toArray(), 0, hopArray, 1, hop.getInput().size());
+						System.arraycopy( addedHops.toArray(), 0, hopArray, 1, addedHops.size());
+						
+						_cpplans.put(hop.getHopID(), new Pair<Hop[],CNodeTpl>(hopArray,rowTmpl));
+					}
+				}
+				else
+				{
+					if(independentOperands)
+					{
+						CNode c =  new CNodeData(hop);
+						//clear Operands
+						addedCNodes.clear();
+						addedHops.clear();
+						
+						//added the current hop as the input
+						addedCNodes.add(c);
+						addedHops.add(hop);
+						out = c;
+					}
+					//wire the output to existing or new template	
+					TemplateUtils.setOutputToExistingTemplate(hop, out, _cpplans, addedCNodes, addedHops);
+				}
+			}
+			memo.add(hop.getHopID());
+		}	
+	}
+}

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/d7fd5879/src/main/java/org/apache/sysml/hops/codegen/template/TemplateUtils.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/hops/codegen/template/TemplateUtils.java b/src/main/java/org/apache/sysml/hops/codegen/template/TemplateUtils.java
new file mode 100644
index 0000000..fd8a960
--- /dev/null
+++ b/src/main/java/org/apache/sysml/hops/codegen/template/TemplateUtils.java
@@ -0,0 +1,313 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ * 
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.sysml.hops.codegen.template;
+
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.Iterator;
+import java.util.LinkedHashSet;
+import java.util.Map.Entry;
+
+import org.apache.sysml.hops.AggUnaryOp;
+import org.apache.sysml.hops.BinaryOp;
+import org.apache.sysml.hops.DataGenOp;
+import org.apache.sysml.hops.DataOp;
+import org.apache.sysml.hops.Hop;
+import org.apache.sysml.hops.LiteralOp;
+import org.apache.sysml.hops.ReorgOp;
+import org.apache.sysml.hops.Hop.AggOp;
+import org.apache.sysml.hops.Hop.Direction;
+import org.apache.sysml.hops.Hop.ReOrgOp;
+import org.apache.sysml.hops.UnaryOp;
+import org.apache.sysml.hops.codegen.cplan.CNode;
+import org.apache.sysml.hops.codegen.cplan.CNodeBinary.BinType;
+import org.apache.sysml.hops.codegen.cplan.CNodeCell;
+import org.apache.sysml.hops.codegen.cplan.CNodeData;
+import org.apache.sysml.hops.codegen.cplan.CNodeOuterProduct;
+import org.apache.sysml.hops.codegen.cplan.CNodeTpl;
+import org.apache.sysml.hops.codegen.cplan.CNodeUnary;
+import org.apache.sysml.hops.codegen.cplan.CNodeUnary.UnaryType;
+import org.apache.sysml.parser.Expression.DataType;
+import org.apache.sysml.runtime.codegen.SpoofCellwise.CellType;
+import org.apache.sysml.runtime.matrix.data.Pair;
+import org.apache.sysml.runtime.util.UtilFunctions;
+
+public class TemplateUtils 
+{
+	public static boolean inputsAreGenerated(Hop parent, ArrayList<Hop> inputs, HashMap<Long, Pair<Hop[],CNodeTpl>> cpplans)
+	{		
+		if( parent instanceof DataOp || parent instanceof DataGenOp || parent instanceof LiteralOp || inputs.contains(parent) )
+			return false;
+	
+		for(Hop hop : parent.getInput() )
+			if(!inputs.contains(hop) && !(hop instanceof DataOp) && !(hop instanceof DataGenOp) && !(hop.getDataType()==DataType.SCALAR) && !isVector(hop) && !(cpplans.containsKey(hop.getHopID())) && !( hop instanceof ReorgOp && ((ReorgOp)hop).getOp() == ReOrgOp.TRANSPOSE && inputsAreGenerated(hop,inputs, cpplans) ))
+				return false;
+		return true;
+	}
+	
+	/**
+	 * Builds the list of operand cnodes for the inputs of the given hop, in 
+	 * input order. Newly created non-literal data nodes (and the hops they 
+	 * wrap) are appended to addedCNodes and addedHops.
+	 * 
+	 * @param hop hop whose inputs are converted
+	 * @param cpplans existing plans, keyed by hop ID
+	 * @param addedCNodes receives newly created data nodes
+	 * @param addedHops receives the hops of the newly created data nodes
+	 * @param initialCNodes data nodes that are already template inputs
+	 * @param compileLiterals if true, all literals are marked as literal nodes
+	 * @return one cnode per input of hop
+	 */
+	public static ArrayList<CNode> fetchOperands(Hop hop,  HashMap<Long, Pair<Hop[],CNodeTpl>> cpplans, ArrayList<CNode> addedCNodes, ArrayList<Hop> addedHops, ArrayList<CNodeData> initialCNodes, boolean compileLiterals)
+	{
+		ArrayList<CNode> operands = new ArrayList<CNode>();
+		for( Hop in : hop.getInput() )
+		{
+			//reuse a data node that is already a template input
+			CNode node = null;
+			for( CNodeData known : initialCNodes ) {
+				if( known.getHopID() == in.getHopID() ) {
+					node = known;
+					break;
+				}
+			}
+			
+			if( node == null )
+			{
+				if( cpplans.containsKey(in.getHopID()) )
+				{
+					//input already has a plan: use that plan's output
+					node = cpplans.get(in.getHopID()).getValue().getOutput();
+				}
+				else if( in instanceof ReorgOp && ((ReorgOp)in).getOp()==ReOrgOp.TRANSPOSE )
+				{
+					//look through the transpose and wrap its input
+					Hop below = in.getInput().get(0);
+					node = new CNodeData(below);
+					if( below instanceof DataOp || below instanceof DataGenOp ) {
+						addedCNodes.add(node);
+						addedHops.add(below);
+					}
+				}
+				else
+				{
+					//note: only compile literals if forced or integer literals (likely constants) 
+					//to increase reuse potential on literal replacement during recompilation
+					node = new CNodeData(in);
+					node.setLiteral(in instanceof LiteralOp && (compileLiterals 
+						|| UtilFunctions.isIntegerNumber(((LiteralOp)in).getStringValue())));
+					if( !node.isLiteral() ) {
+						addedCNodes.add(node);
+						addedHops.add(in);
+					}
+				}
+			}
+			
+			operands.add(node);
+		}
+		return operands;
+	}
+	
+	/**
+	 * Clones the template of the entry returned last by the iteration over 
+	 * cpplans, extends it by the given additional inputs, sets the given 
+	 * output, and registers the result under the ID of the given hop.
+	 * NOTE(review): which entry is "last" depends on the iteration order of 
+	 * the map passed by the caller - confirm an ordered map is used.
+	 * 
+	 * @param hop hop under whose ID the new plan is registered
+	 * @param out new template output
+	 * @param cpplans existing plans, keyed by hop ID; must not be empty
+	 * @param addedCNodes extra template inputs
+	 * @param addedHops extra input hops; hops that are already inputs of the 
+	 *        selected plan are removed from this list
+	 */
+	public static void setOutputToExistingTemplate(Hop hop, CNode out,  HashMap<Long, Pair<Hop[],CNodeTpl>> cpplans, ArrayList<CNode> addedCNodes, ArrayList<Hop> addedHops)
+	{
+		//select the entry that the map iteration returns last
+		Entry<Long, Pair<Hop[],CNodeTpl>> last = null;
+		for( Entry<Long, Pair<Hop[],CNodeTpl>> e : cpplans.entrySet() )
+			last = e;
+		
+		CNodeTpl prevTpl = last.getValue().getValue();
+		Hop[] prevInputs = last.getValue().getKey();
+		
+		//clone the template and carry over type-specific properties
+		CNodeTpl tmpl = prevTpl.clone();
+		tmpl.setDataType(hop.getDataType());
+		if( tmpl instanceof CNodeOuterProduct ) {
+			CNodeOuterProduct dst = (CNodeOuterProduct) tmpl;
+			CNodeOuterProduct src = (CNodeOuterProduct) prevTpl;
+			dst.setOutProdType(src.getOutProdType());
+			dst.setTransposeOutput(src.isTransposeOutput());
+		}
+		else if( tmpl instanceof CNodeCell ) {
+			CNodeCell cell = (CNodeCell) tmpl;
+			cell.setCellType(getCellType(hop));
+			cell.setMultipleConsumers(hop.getParent().size()>1);
+		}
+		
+		//append the extra inputs to the template
+		for( CNode extra : addedCNodes )
+			tmpl.addInput(extra);
+		
+		//drop added hops that the selected plan already has as inputs
+		for( Hop h : prevInputs )
+			addedHops.remove(h);
+		
+		//existing inputs first, remaining added hops appended in order
+		Hop[] allInputs = new Hop[prevInputs.length + addedHops.size()];
+		System.arraycopy(prevInputs, 0, allInputs, 0, prevInputs.length);
+		for( int i=0; i<addedHops.size(); i++ )
+			allInputs[prevInputs.length + i] = addedHops.get(i);
+		
+		//set the template output and register the new plan
+		tmpl.setOutput(out);
+		cpplans.put(hop.getHopID(), new Pair<Hop[],CNodeTpl>(allInputs, tmpl));
+	}
+
+	/**
+	 * Checks that every operand has a variable name and that none of these 
+	 * names equals one of the given variable names.
+	 * 
+	 * @param cnodeData operands to check
+	 * @param addedHops not used by this method
+	 * @param varNames variable names the operands must not match
+	 * @return true if all operands have a non-null name that differs from all varNames
+	 */
+	public static boolean isOperandsIndependent(ArrayList<CNode> cnodeData, ArrayList<Hop> addedHops, String[] varNames)
+	{
+		for( int i=0; i<cnodeData.size(); i++ ) 
+		{
+			CNode operand = cnodeData.get(i);
+			
+			//a missing name indicates a variable inside the cplan
+			//TODO needs to be modified because sometimes the varname is not null but the variable is in the cplan
+			if( operand.getVarname() == null )
+				return false;
+			
+			//an operand equal to any of the given names is dependent; this also 
+			//covers T(X) because fetchOperands fetches what is inside the transpose
+			for( int j=0; j<varNames.length; j++ )
+				if( operand.getVarname().equals(varNames[j]) )
+					return false;
+		}
+		return true;
+	}
+	
+	/**
+	 * Returns the entry seen last when iterating over the given plans, with 
+	 * one exception: once a template whose output is a unary EXP has been seen, 
+	 * the iteration stops at the first cell template with multiple consumers.
+	 * 
+	 * @param cplans plans, keyed by hop ID
+	 * @return selected entry, or null if cplans is empty
+	 */
+	public static Entry<Long, Pair<Hop[],CNodeTpl>> getTopLevelCpplan(HashMap<Long, Pair<Hop[],CNodeTpl>> cplans)
+	{
+		Entry<Long, Pair<Hop[],CNodeTpl>> selected = null;
+		boolean seenExp = false;
+		
+		for( Entry<Long, Pair<Hop[],CNodeTpl>> current : cplans.entrySet() ) 
+		{
+			//keep the last seen entry (most fused operators)
+			selected = current;
+			CNodeTpl tpl = current.getValue().getValue();
+			
+			//special handling of overlapping fused operators with exp
+			CNode output = tpl.getOutput();
+			if( output instanceof CNodeUnary 
+				&& ((CNodeUnary)output).getType()==UnaryType.EXP )
+				seenExp = true;
+			
+			if( seenExp && tpl instanceof CNodeCell 
+				&& ((CNodeCell)tpl).hasMultipleConsumers() )
+				break;
+		}
+		
+		return selected;
+	}
+	
+	public static boolean isVector(Hop hop) {
+		return (hop.getDataType() == DataType.MATRIX 
+			&& (hop.getDim1() != 1 && hop.getDim2() == 1 
+			  || hop.getDim1() == 1 && hop.getDim2() != 1 ) );
+	}
+	
+	/**
+	 * Indicates if the given node is a matrix with a single column and a 
+	 * number of rows different from 1.
+	 * 
+	 * @param hop cnode to check
+	 * @return true if the node is a column vector
+	 */
+	public static boolean isColVector(CNode hop) {
+		if( hop.getDataType() != DataType.MATRIX )
+			return false;
+		return hop.getNumRows() != 1 
+			&& hop.getNumCols() == 1;
+	}
+	
+	/**
+	 * Indicates if the given node is a matrix with a single row and a 
+	 * number of columns different from 1.
+	 * 
+	 * @param hop cnode to check
+	 * @return true if the node is a row vector
+	 */
+	public static boolean isRowVector(CNode hop) {
+		if( hop.getDataType() != DataType.MATRIX )
+			return false;
+		return hop.getNumRows() == 1 
+			&& hop.getNumCols() != 1;
+	}
+	
+	/**
+	 * Indicates if the given hop is a matrix for which neither dimension equals 1.
+	 * 
+	 * @param hop hop to check
+	 * @return true if hop is a matrix and not a vector or single cell
+	 */
+	public static boolean isMatrix(Hop hop) {
+		if( hop.getDataType() != DataType.MATRIX )
+			return false;
+		return hop.getDim1() != 1 
+			&& hop.getDim2() != 1;
+	}
+	
+	/**
+	 * Indicates if the given hop has known dimensions and is either a scalar 
+	 * or a vector.
+	 * 
+	 * @param hop hop to check
+	 * @return true if dimensions are known and hop is a scalar or vector
+	 */
+	public static boolean isVectorOrScalar(Hop hop) {
+		if( !hop.dimsKnown() )
+			return false;
+		return hop.getDataType() == DataType.SCALAR 
+			|| isVector(hop);
+	}
+	
+	/**
+	 * Indicates if the given hop is a binary operation over two matrices with 
+	 * known dimensions, where the left input has more rows than the right input.
+	 * 
+	 * @param hop hop to check
+	 * @return true if the described pattern applies
+	 */
+	public static boolean isBinaryMatrixRowVector(Hop hop) {
+		if( !(hop instanceof BinaryOp) )
+			return false;
+		
+		Hop left = hop.getInput().get(0);
+		Hop right = hop.getInput().get(1);
+		if( !left.dimsKnown() || !right.dimsKnown() )
+			return false;
+		if( !left.getDataType().isMatrix() || !right.getDataType().isMatrix() )
+			return false;
+		return left.getDim1() > right.getDim1();
+	}
+
+	public static boolean isOperationSupported(Hop h) {
+		if(h instanceof  UnaryOp)
+			return UnaryType.contains(((UnaryOp)h).getOp().toString());
+		else if(h instanceof BinaryOp)
+			return BinType.contains(((BinaryOp)h).getOp().toString());
+		else
+			return false;
+	}
+
+	/**
+	 * Recursively collects the chain of hops reachable from the given hop by 
+	 * following unary operations and binary operations with one matrix and one 
+	 * vector or scalar input. Every visited hop is added to the given set, 
+	 * including the hop at which the descent stops.
+	 * 
+	 * @param hop current hop
+	 * @param children set that receives all visited hops
+	 */
+	private static void rfindChildren(Hop hop, HashSet<Hop> children ) {
+		//every visited hop is recorded (adding to a set is idempotent)
+		children.add(hop);
+		
+		Hop next = null;
+		if( hop instanceof UnaryOp ) {
+			//a unary operation has a single input; always descend into it
+			//(probing input 1 here would fail with an index out of bounds)
+			next = hop.getInput().get(0);
+		}
+		else if( hop instanceof BinaryOp ) {
+			Hop left = hop.getInput().get(0);
+			Hop right = hop.getInput().get(1);
+			//binary operation with one matrix and a vector or scalar
+			boolean matrixLeft = left.getDataType() == DataType.MATRIX 
+				&& TemplateUtils.isVectorOrScalar(right);
+			//note: the matrix output check applies to this case only, 
+			//as given by the operator precedence of the original condition
+			boolean matrixRight = TemplateUtils.isVectorOrScalar(left) 
+				&& right.getDataType() == DataType.MATRIX 
+				&& hop.getDataType() == DataType.MATRIX;
+			if( matrixLeft || matrixRight )
+				next = TemplateUtils.isMatrix(left) ? left : right;
+		}
+		
+		if( next != null )
+			rfindChildren(next, children);
+	}
+	
+	/**
+	 * Finds the first hop, in collection order of the first hop's children, 
+	 * that was also collected for the second hop. This method assumes that 
+	 * each two nodes have at most one common child.
+	 * 
+	 * @param hop1 first hop
+	 * @param hop2 second hop
+	 * @return first common child or null if there is none
+	 */
+	private static Hop findCommonChild(Hop hop1, Hop hop2) {
+		LinkedHashSet<Hop> first = new LinkedHashSet<Hop>();
+		LinkedHashSet<Hop> second = new LinkedHashSet<Hop>();
+		rfindChildren(hop1, first);
+		rfindChildren(hop2, second);
+		
+		//probe the children of hop1 against the children of hop2
+		for( Hop candidate : first )
+			if( second.contains(candidate) )
+				return candidate;
+		
+		return null;
+	}
+	
+	/**
+	 * Determines the common child of the given input with every added matrix. 
+	 * 
+	 * @param _adddedMatrices matrices to compare against
+	 * @param input input hop
+	 * @return the common child if each added matrix shares a common child with 
+	 *         the input and all of these have the same hop ID; null otherwise, 
+	 *         including for an empty list
+	 */
+	public static Hop commonChild(ArrayList<Hop> _adddedMatrices, Hop input) {
+		Hop shared = null;
+		for( int i=0; i<_adddedMatrices.size(); i++ )
+		{
+			Hop child = findCommonChild(_adddedMatrices.get(i), input);
+			
+			//no common child for this matrix
+			if( child == null )
+				return null;
+			
+			if( shared == null ) {
+				//first common child seen
+				shared = child;
+			}
+			else if( shared.getHopID() != child.getHopID() ) {
+				//different common children
+				return null;
+			}
+		}
+		return shared;
+	}
+
+	public static HashSet<Long> rGetInputHopIDs( CNode node, HashSet<Long> ids ) {
+		if( node instanceof CNodeData && !node.isLiteral() )
+			ids.add(((CNodeData)node).getHopID());
+		
+		for( CNode c : node.getInput() )
+			rGetInputHopIDs(c, ids);
+			
+		return ids;
+	}
+	
+	/**
+	 * Merges the two input arrays into one array that contains, in order of 
+	 * first occurrence, the hops whose ID is in the given set. Each ID is 
+	 * emitted at most once, even if the hop appears in both arrays; otherwise 
+	 * shared inputs would be duplicated or overflow the result array, which 
+	 * has one slot per ID.
+	 * 
+	 * @param ids hop IDs to keep
+	 * @param input1 first array of hops
+	 * @param input2 second array of hops
+	 * @return array of length ids.size(); trailing slots remain null if an ID 
+	 *         matches no hop of either array
+	 */
+	public static Hop[] mergeDistinct(HashSet<Long> ids, Hop[] input1, Hop[] input2) {
+		Hop[] ret = new Hop[ids.size()];
+		HashSet<Long> seen = new HashSet<Long>();
+		int pos = 0;
+		for( Hop[] input : new Hop[][]{input1, input2} )
+			for( Hop c : input )
+				if( ids.contains(c.getHopID()) && seen.add(c.getHopID()) )
+					ret[pos++] = c; 
+		return ret;
+	}
+	
+	/**
+	 * Derives the cell type from the given hop: a sum aggregate maps to 
+	 * FULL_AGG for direction RowCol and to ROW_AGG for any other direction; 
+	 * all other hops map to NO_AGG.
+	 * 
+	 * @param hop hop to inspect
+	 * @return cell type
+	 */
+	private static CellType getCellType(Hop hop) {
+		if( !(hop instanceof AggUnaryOp) )
+			return CellType.NO_AGG;
+		
+		AggUnaryOp agg = (AggUnaryOp) hop;
+		if( agg.getOp() != AggOp.SUM )
+			return CellType.NO_AGG;
+		
+		return (agg.getDirection() == Direction.RowCol) ? 
+			CellType.FULL_AGG : CellType.ROW_AGG;
+	}
+}

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/d7fd5879/src/main/java/org/apache/sysml/hops/rewrite/HopRewriteUtils.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/hops/rewrite/HopRewriteUtils.java b/src/main/java/org/apache/sysml/hops/rewrite/HopRewriteUtils.java
index 7f65ddd..802a382 100644
--- a/src/main/java/org/apache/sysml/hops/rewrite/HopRewriteUtils.java
+++ b/src/main/java/org/apache/sysml/hops/rewrite/HopRewriteUtils.java
@@ -253,6 +253,12 @@ public class HopRewriteUtils
 		child.getParent().add( parent );
 	}
 	
+	public static void rewireAllParentChildReferences( Hop hold, Hop hnew ) {
+		ArrayList<Hop> parents = new ArrayList<Hop>(hold.getParent());
+		for( Hop lparent : parents )
+			HopRewriteUtils.replaceChildReference(lparent, hold, hnew);	
+	}
+	
 	public static void replaceChildReference( Hop parent, Hop inOld, Hop inNew ) {
 		int pos = getChildReferencePos(parent, inOld);
 		removeChildReferenceByPos(parent, inOld, pos);
@@ -491,10 +497,12 @@ public class HopRewriteUtils
 			input2.getDataType().isMatrix() ? input2 : input1;
 		BinaryOp bop = new BinaryOp(mainInput.getName(), mainInput.getDataType(), 
 			mainInput.getValueType(), op, input1, input2);
+		//cleanup value type for relational operations
+		if( bop.isPPredOperation() && bop.getDataType().isScalar() )
+			bop.setValueType(ValueType.BOOLEAN);
 		bop.setOutputBlocksizes(mainInput.getRowsInBlock(), mainInput.getColsInBlock());
 		copyLineNumbers(mainInput, bop);
 		bop.refreshSizeInformation();	
-		
 		return bop;
 	}
 	

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/d7fd5879/src/main/java/org/apache/sysml/lops/ConvolutionTransform.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/lops/ConvolutionTransform.java b/src/main/java/org/apache/sysml/lops/ConvolutionTransform.java
index 558deb3..cea2c93 100644
--- a/src/main/java/org/apache/sysml/lops/ConvolutionTransform.java
+++ b/src/main/java/org/apache/sysml/lops/ConvolutionTransform.java
@@ -166,48 +166,21 @@ public class ConvolutionTransform extends Lop
 		}
 	}
 	
-	// Used by maxpool
-	public String getInstructions(String input, String stride1, String stride2, String padding1, String padding2, 
-			String input_shape1, String input_shape2, String input_shape3, String input_shape4,
-			String filter_shape1, String filter_shape2, String filter_shape3, String filter_shape4,
-			String output) throws LopsException {
-		StringBuilder sb = new StringBuilder();
-		appendOpcode(sb);
-		sb.append( getInputs().get(0).prepInputOperand(input));
-		appendOperands(1, 13, output, sb);
-		return sb.toString();
-	}
-	
-	// Used by conv2d*, maxpool_bwd
-	public String getInstructions(String input, String dout, String stride1, String stride2, String padding1, String padding2, 
-			String input_shape1, String input_shape2, String input_shape3, String input_shape4,
-			String filter_shape1, String filter_shape2, String filter_shape3, String filter_shape4,
-			String output) throws LopsException {
-		StringBuilder sb = new StringBuilder();
-		appendOpcode(sb);
-		sb.append( getInputs().get(0).prepInputOperand(input));
-		sb.append( OPERAND_DELIMITOR );
-		sb.append( getInputs().get(1).prepInputOperand(dout));
-		appendOperands(2, 14, output, sb);
-		return sb.toString();
-	}
-	
-	// Used by fused conv2d+bias_add
-	public String getInstructions(String input, String bias, String filter, String stride1, String stride2, String padding1, String padding2, 
-			String input_shape1, String input_shape2, String input_shape3, String input_shape4,
-			String filter_shape1, String filter_shape2, String filter_shape3, String filter_shape4,
-			String output) throws LopsException {
+	/**
+	 * Builds the instruction string from an arbitrary number of inputs. All 
+	 * but the last 12 entries of inputs are appended as prepared input 
+	 * operands; the index range of the last 12 entries is passed on to 
+	 * appendOperands together with the output.
+	 * NOTE(review): the last 12 entries are presumably the stride, padding, 
+	 * input shape, and filter shape parameters of the removed fixed-arity 
+	 * overloads - confirm against appendOperands.
+	 * 
+	 * @param inputs labels of all inputs
+	 * @param output label of the output
+	 * @return instructions as string
+	 * @throws LopsException if LopsException occurs
+	 */
+	@Override
+	public String getInstructions(String[] inputs, String output) throws LopsException {
 		StringBuilder sb = new StringBuilder();
 		appendOpcode(sb);
-		sb.append( getInputs().get(0).prepInputOperand(input));
-		sb.append( OPERAND_DELIMITOR );
-		sb.append( getInputs().get(1).prepInputOperand(bias));
-		sb.append( OPERAND_DELIMITOR );
-		sb.append( getInputs().get(2).prepInputOperand(filter));
-		appendOperands(3, 15, output, sb);
+		
+		//leading inputs, separated by the operand delimiter
+		for( int i=0; i<inputs.length-12; i++ ) {
+			if( i > 0 )
+				sb.append( OPERAND_DELIMITOR );
+			sb.append( getInputs().get(i).prepInputOperand(inputs[i]));
+		}
+		appendOperands(inputs.length-12, inputs.length, output, sb);
+		
 		return sb.toString();
 	}
-	
+
 	public void appendOpcode(StringBuilder sb) {
 		sb.append( getExecType() );
 		sb.append( OPERAND_DELIMITOR );

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/d7fd5879/src/main/java/org/apache/sysml/lops/Lop.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/lops/Lop.java b/src/main/java/org/apache/sysml/lops/Lop.java
index 567b0be..24f7ba3 100644
--- a/src/main/java/org/apache/sysml/lops/Lop.java
+++ b/src/main/java/org/apache/sysml/lops/Lop.java
@@ -59,6 +59,7 @@ public abstract class Lop
 		SortKeys, PickValues,
 		Checkpoint, 										//Spark persist into storage level
 		PlusMult, MinusMult,								//CP
+		SpoofFused,											//CP/SP generated fused operator
 		/** CP operation on a variable number of operands */
 		MULTIPLE_CP
 	};
@@ -418,6 +419,40 @@ public abstract class Lop
 		return outParams;
 	}
 	
+
+	/** Method should be overridden if needed
+	 * 
+	 * @param output output
+	 * @return instructions as string
+	 * @throws LopsException if LopsException occurs
+	 */
+	public String getInstructions(String output) throws LopsException {
+		throw new LopsException(this.printErrorLocation() + "Should never be invoked in Baseclass");
+	}
+	
+	/** Method should be overridden if needed
+	 * 
+	 * @param input1 input 1
+	 * @param output output
+	 * @return instructions as string
+	 * @throws LopsException if LopsException occurs
+	 */
+	public String getInstructions(String input1, String output) throws LopsException {
+		throw new LopsException(this.printErrorLocation() + "Should never be invoked in Baseclass");
+	}
+
+	/** Method should be overridden if needed
+	 * 
+	 * @param input1 input 1
+	 * @param input2 input 2
+	 * @param output output
+	 * @return instructions as string
+	 * @throws LopsException if LopsException occurs
+	 */
+	public String getInstructions(String input1, String input2, String output) throws LopsException {
+		throw new LopsException(this.printErrorLocation() + "Should never be invoked in Baseclass");
+	}
+	
 	/**
 	 * Method should be overridden if needed
 	 * 
@@ -478,6 +513,15 @@ public abstract class Lop
 	public String getInstructions(String input1, String input2, String input3, String input4, String input5, String input6, String output) throws LopsException {
 		throw new LopsException(this.printErrorLocation() + "Should never be invoked in Baseclass");
 	}
+
+	/** Method should be overridden if needed
+	 * 
+	 * @param input1 input 1
+	 * @param input2 input 2
+	 * @param input3 input 3
+	 * @param input4 input 4
+	 * @param input5 input 5
+	 * @param input6 input 6
+	 * @param input7 input 7
+	 * @param output output
+	 * @return instructions as string
+	 * @throws LopsException always in this base implementation
+	 */
+	public String getInstructions(String input1, String input2, String input3, String input4, String input5, String input6, String input7, String output) throws LopsException {
+		throw new LopsException(this.printErrorLocation() + "Should never be invoked in Baseclass");
+	}
+	
+	/** Method should be overridden if needed
+	 * 
+	 * @param inputs array of inputs
+	 * @param outputs output (a single string despite the plural name)
+	 * @return instructions as string
+	 * @throws LopsException always in this base implementation
+	 */
+	public String getInstructions(String[] inputs, String outputs) throws LopsException {
+		throw new LopsException(this.printErrorLocation() + "Should never be invoked in Baseclass");
+	}
+	
 	
 	public String getInstructions(int output_index) throws LopsException {
 		throw new LopsException(this.printErrorLocation() + "Should never be invoked in Baseclass. Lop Type: " + this.getType());
@@ -541,38 +585,6 @@ public abstract class Lop
 		throw new LopsException(this.printErrorLocation() + "Should never be invoked in Baseclass");
 	}
 
-	/** Method should be overridden if needed
-	 * 
-	 * @param input1 input 1
-	 * @param input2 input 2
-	 * @param output output
-	 * @return instructions as string
-	 * @throws LopsException if LopsException occurs
-	 */
-	public String getInstructions(String input1, String input2, String output) throws LopsException {
-		throw new LopsException(this.printErrorLocation() + "Should never be invoked in Baseclass");
-	}
-
-	/** Method should be overridden if needed
-	 * 
-	 * @param input1 input 1
-	 * @param output output
-	 * @return instructions as string
-	 * @throws LopsException if LopsException occurs
-	 */
-	public String getInstructions(String input1, String output) throws LopsException {
-		throw new LopsException(this.printErrorLocation() + "Should never be invoked in Baseclass");
-	}
-
-	/** Method should be overridden if needed
-	 * 
-	 * @param output output
-	 * @return instructions as string
-	 * @throws LopsException if LopsException occurs
-	 */
-	public String getInstructions(String output) throws LopsException {
-		throw new LopsException(this.printErrorLocation() + "Should never be invoked in Baseclass");
-	}
 
 	/** Method should be overridden if needed
 	 * 
@@ -630,37 +642,6 @@ public abstract class Lop
 		return "ERROR: line " + _beginLine + ", column " + _beginColumn + " -- ";
 	}
 
-	//TODO: Leo This might get confused with Rand.getInstructions
-	public String getInstructions(String input, String rowl, String rowu,
-			String coll, String colu, String leftRowDim,
-			String leftColDim, String output) throws LopsException {
-		throw new LopsException(this.printErrorLocation() + "Should never be invoked in Baseclass");
-	}
-	
-	// stride1, stride2, padding1, padding2  
-	// input_shape1, input_shape2, input_shape3, input_shape4, 
-	// filter_shape1, filter_shape2, filter_shape3, filter_shape4,
-	public String getInstructions(String input, String stride1, String stride2, String padding1, String padding2, 
-			String input_shape1, String input_shape2, String input_shape3, String input_shape4,
-			String filter_shape1, String filter_shape2, String filter_shape3, String filter_shape4,
-			String output) throws LopsException {
-		throw new LopsException(this.printErrorLocation() + "Should never be invoked in Baseclass");
-	}
-	
-	public String getInstructions(String input, String dout, String stride1, String stride2, String padding1, String padding2, 
-			String input_shape1, String input_shape2, String input_shape3, String input_shape4,
-			String filter_shape1, String filter_shape2, String filter_shape3, String filter_shape4,
-			String output) throws LopsException {
-		throw new LopsException(this.printErrorLocation() + "Should never be invoked in Baseclass");
-	}
-	
-	public String getInstructions(String input, String bias, String dout, String stride1, String stride2, String padding1, String padding2, 
-			String input_shape1, String input_shape2, String input_shape3, String input_shape4,
-			String filter_shape1, String filter_shape2, String filter_shape3, String filter_shape4,
-			String output) throws LopsException {
-		throw new LopsException(this.printErrorLocation() + "Should never be invoked in Baseclass");
-	}
-	
 	public String getInstructions(int input, int rowl, int rowu,
 			int coll, int colu, int leftRowDim,
 			int leftColDim, int output) throws LopsException {

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/d7fd5879/src/main/java/org/apache/sysml/lops/SpoofFused.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/lops/SpoofFused.java b/src/main/java/org/apache/sysml/lops/SpoofFused.java
new file mode 100644
index 0000000..3f0ec59
--- /dev/null
+++ b/src/main/java/org/apache/sysml/lops/SpoofFused.java
@@ -0,0 +1,119 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ * 
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.sysml.lops;
+
+import java.util.ArrayList;
+
+import org.apache.sysml.lops.LopProperties.ExecLocation;
+import org.apache.sysml.lops.LopProperties.ExecType;
+import org.apache.sysml.lops.compile.JobType;
+import org.apache.sysml.parser.Expression.DataType;
+import org.apache.sysml.parser.Expression.ValueType;
+
+public class SpoofFused extends Lop
+{
+	private final Class<?> _class;
+	private final int _numThreads;
+	
+	public SpoofFused( ArrayList<Lop> inputs, DataType dt, ValueType vt, Class<?> cla, int k, ExecType etype) {
+		super(Type.SpoofFused, dt, vt);
+		_class = cla;
+		_numThreads = k;
+		
+		for( Lop lop : inputs ) {
+			addInput(lop);
+			lop.addOutput(this);
+		}
+		
+		lps.addCompatibility(JobType.INVALID);
+		lps.setProperties( inputs, etype, ExecLocation.ControlProgram, false, false, false );
+	}
+
+	@Override
+	public String toString() {
+		return "spoof("+_class.getSimpleName()+")";
+	}
+
+	@Override
+	public String getInstructions(String input1, String output) throws LopsException {
+		return getInstructions(new String[]{input1}, new String[]{output});
+	}
+	
+	@Override
+	public String getInstructions(String input1, String input2, String output) throws LopsException {
+		return getInstructions(new String[]{input1, input2}, new String[]{output});
+	}
+	
+	@Override
+	public String getInstructions(String input1, String input2, String input3, String output) throws LopsException {
+		return getInstructions(new String[]{input1, input2, input3}, new String[]{output});
+	}
+	
+	@Override
+	public String getInstructions(String input1, String input2, String input3, String input4, String output) throws LopsException {
+		return getInstructions(new String[]{input1, input2, input3, input4}, new String[]{output});
+	}
+	
+	@Override
+	public String getInstructions(String input1, String input2, String input3, String input4, String input5, String output) throws LopsException {
+		return getInstructions(new String[]{input1, input2, input3, input4, input5}, new String[]{output});
+	}
+	
+	@Override
+	public String getInstructions(String input1, String input2, String input3, String input4, String input5, String input6, String output) throws LopsException {
+		return getInstructions(new String[]{input1, input2, input3, input4, input5, input6}, new String[]{output});	
+	}
+	
+	@Override
+	public String getInstructions(String input1, String input2, String input3, String input4, String input5, String input6, String input7, String output) throws LopsException {
+		return getInstructions(new String[]{input1, input2, input3, input4, input5, input6, input7}, new String[]{output});
+	}
+	
+	@Override
+	public String getInstructions(String[] inputs, String output) throws LopsException {
+		return getInstructions(inputs, new String[]{output});
+	}
+	
+	@Override
+	public String getInstructions(String[] inputs, String[] outputs) 
+		throws LopsException
+	{
+		StringBuilder sb = new StringBuilder();
+		sb.append( getExecType() );
+		sb.append( OPERAND_DELIMITOR );
+		sb.append( "spoof" );
+
+		sb.append( OPERAND_DELIMITOR );
+		sb.append( _class.getName() );
+		
+		for(int i=0; i < inputs.length; i++) {
+			sb.append( OPERAND_DELIMITOR );
+			sb.append( getInputs().get(i).prepInputOperand(inputs[i]));
+		}
+		
+		sb.append( OPERAND_DELIMITOR );
+		sb.append( prepOutputOperand(outputs[0]) );
+	
+		sb.append( OPERAND_DELIMITOR );
+		sb.append( _numThreads );
+		
+		return sb.toString();
+	}
+}

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/d7fd5879/src/main/java/org/apache/sysml/lops/compile/Dag.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/lops/compile/Dag.java b/src/main/java/org/apache/sysml/lops/compile/Dag.java
index 898f4ec..b513951 100644
--- a/src/main/java/org/apache/sysml/lops/compile/Dag.java
+++ b/src/main/java/org/apache/sysml/lops/compile/Dag.java
@@ -1491,65 +1491,12 @@ public class Dag<N extends Lop>
 								node.getInputs().get(6).getOutputParameters().getLabel(),
 								node.getOutputParameters().getLabel());
 					}
-					else if (node.getInputs().size() == 13) {
-						 // Used for im2col and reshape_col
-						 inst_string = node.getInstructions(
-						 		node.getInputs().get(0).getOutputParameters().getLabel(),
-						 		node.getInputs().get(1).getOutputParameters().getLabel(),
-						 		node.getInputs().get(2).getOutputParameters().getLabel(),
-						 		node.getInputs().get(3).getOutputParameters().getLabel(),
-						 		node.getInputs().get(4).getOutputParameters().getLabel(),
-						 		node.getInputs().get(5).getOutputParameters().getLabel(),
-						 		node.getInputs().get(6).getOutputParameters().getLabel(),
-						 		node.getInputs().get(7).getOutputParameters().getLabel(),
-						 		node.getInputs().get(8).getOutputParameters().getLabel(),
-						 		node.getInputs().get(9).getOutputParameters().getLabel(),
-						 		node.getInputs().get(10).getOutputParameters().getLabel(),
-						 		node.getInputs().get(11).getOutputParameters().getLabel(),
-						 		node.getInputs().get(12).getOutputParameters().getLabel(),
-						 		node.getOutputParameters().getLabel());
-					}
-					else if (node.getInputs().size() == 14) {
-						 // Used for pooling_backward
-						 inst_string = node.getInstructions(
-						 		node.getInputs().get(0).getOutputParameters().getLabel(),
-						 		node.getInputs().get(1).getOutputParameters().getLabel(),
-						 		node.getInputs().get(2).getOutputParameters().getLabel(),
-						 		node.getInputs().get(3).getOutputParameters().getLabel(),
-						 		node.getInputs().get(4).getOutputParameters().getLabel(),
-						 		node.getInputs().get(5).getOutputParameters().getLabel(),
-						 		node.getInputs().get(6).getOutputParameters().getLabel(),
-						 		node.getInputs().get(7).getOutputParameters().getLabel(),
-						 		node.getInputs().get(8).getOutputParameters().getLabel(),
-						 		node.getInputs().get(9).getOutputParameters().getLabel(),
-						 		node.getInputs().get(10).getOutputParameters().getLabel(),
-						 		node.getInputs().get(11).getOutputParameters().getLabel(),
-						 		node.getInputs().get(12).getOutputParameters().getLabel(),
-						 		node.getInputs().get(13).getOutputParameters().getLabel(),
-						 		node.getOutputParameters().getLabel());
-					}
-					else if (node.getInputs().size() == 15) {
-						 // Used for fused conv2d_bias_add
-						 inst_string = node.getInstructions(
-						 		node.getInputs().get(0).getOutputParameters().getLabel(),
-						 		node.getInputs().get(1).getOutputParameters().getLabel(),
-						 		node.getInputs().get(2).getOutputParameters().getLabel(),
-						 		node.getInputs().get(3).getOutputParameters().getLabel(),
-						 		node.getInputs().get(4).getOutputParameters().getLabel(),
-						 		node.getInputs().get(5).getOutputParameters().getLabel(),
-						 		node.getInputs().get(6).getOutputParameters().getLabel(),
-						 		node.getInputs().get(7).getOutputParameters().getLabel(),
-						 		node.getInputs().get(8).getOutputParameters().getLabel(),
-						 		node.getInputs().get(9).getOutputParameters().getLabel(),
-						 		node.getInputs().get(10).getOutputParameters().getLabel(),
-						 		node.getInputs().get(11).getOutputParameters().getLabel(),
-						 		node.getInputs().get(12).getOutputParameters().getLabel(),
-						 		node.getInputs().get(13).getOutputParameters().getLabel(),
-						 		node.getInputs().get(14).getOutputParameters().getLabel(),
-						 		node.getOutputParameters().getLabel());
-					}
 					else {
-						throw new LopsException(node.printErrorLocation() + "Node with " + node.getInputs().size() + " inputs is not supported in CP yet! \n");
+						String[] inputs = new String[node.getInputs().size()];
+						for( int j=0; j<node.getInputs().size(); j++ )
+							inputs[j] = node.getInputs().get(j).getOutputParameters().getLabel();
+						inst_string = node.getInstructions(inputs,
+								node.getOutputParameters().getLabel());
 					}
 				}
 				

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/d7fd5879/src/main/java/org/apache/sysml/runtime/codegen/ByteClassLoader.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/codegen/ByteClassLoader.java b/src/main/java/org/apache/sysml/runtime/codegen/ByteClassLoader.java
new file mode 100644
index 0000000..27263d3
--- /dev/null
+++ b/src/main/java/org/apache/sysml/runtime/codegen/ByteClassLoader.java
@@ -0,0 +1,40 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ * 
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.sysml.runtime.codegen;
+
+import java.net.URL;
+import java.net.URLClassLoader;
+
+public class ByteClassLoader extends URLClassLoader 
+{
+	private final byte[] _classBytes;
+	
+	public ByteClassLoader(URL[] urls, ClassLoader parent, byte[] classBytes) {
+		super(urls, parent);
+		_classBytes = classBytes;
+	}
+	
+	@Override
+	public Class<?> findClass(String className) throws ClassNotFoundException {
+		if (_classBytes != null)
+			return defineClass(className, _classBytes, 0, _classBytes.length); 
+		return super.loadClass(className);
+	}
+}

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/d7fd5879/src/main/java/org/apache/sysml/runtime/codegen/CodegenUtils.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/codegen/CodegenUtils.java b/src/main/java/org/apache/sysml/runtime/codegen/CodegenUtils.java
new file mode 100644
index 0000000..fdad9bd
--- /dev/null
+++ b/src/main/java/org/apache/sysml/runtime/codegen/CodegenUtils.java
@@ -0,0 +1,268 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ * 
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.sysml.runtime.codegen;
+
+import java.io.ByteArrayOutputStream;
+import java.io.File;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.ObjectOutputStream;
+import java.net.URL;
+import java.net.URLClassLoader;
+import java.util.Arrays;
+import java.util.List;
+import java.util.concurrent.ConcurrentHashMap;
+
+import javax.tools.Diagnostic;
+import javax.tools.Diagnostic.Kind;
+import javax.tools.DiagnosticCollector;
+import javax.tools.JavaCompiler;
+import javax.tools.JavaCompiler.CompilationTask;
+import javax.tools.JavaFileObject;
+import javax.tools.StandardJavaFileManager;
+import javax.tools.ToolProvider;
+
+import org.apache.commons.io.IOUtils;
+import org.apache.sysml.api.DMLScript;
+import org.apache.sysml.runtime.DMLRuntimeException;
+import org.apache.sysml.runtime.io.IOUtilFunctions;
+import org.apache.sysml.runtime.util.LocalFileUtils;
+import org.apache.sysml.utils.Statistics;
+
+public class CodegenUtils 
+{
+	//cache to reuse compiled and loaded classes (this is also a workaround for classes
+	//compiled during initial compilation and loaded later, because the working directory
+	//is cleaned up just before the actual execution)
+	private static ConcurrentHashMap<String, Class<?>> _cache = new ConcurrentHashMap<String,Class<?>>();
+	private static String _workingDir = null;
+	
+	public static Class<?> compileClass(String name, String src) 
+		throws DMLRuntimeException
+	{
+		//reuse existing compiled class
+		Class<?> ret = _cache.get(name);
+		if( ret != null ) 
+			return ret;
+		
+		long t0 = DMLScript.STATISTICS ? System.nanoTime() : 0;
+		
+		try
+		{
+			//create working dir on demand
+			if( _workingDir == null )
+				createWorkingDir();
+			
+			//write input file (for debugging / classpath handling)
+			File ftmp = new File(_workingDir+"/codegen/"+name+".java");
+			if( !ftmp.getParentFile().exists() )
+				ftmp.getParentFile().mkdirs();
+			LocalFileUtils.writeTextFile(ftmp, src);
+			
+			//get system java compiler
+			JavaCompiler compiler = ToolProvider.getSystemJavaCompiler();
+			if( compiler == null )
+				throw new RuntimeException("Unable to obtain system java compiler.");
+		
+			//prepare file manager
+			DiagnosticCollector<JavaFileObject> diagnostics = new DiagnosticCollector<JavaFileObject>(); 
+			StandardJavaFileManager fileManager = compiler.getStandardFileManager(diagnostics, null, null);
+			
+			//prepare input source code
+			Iterable<? extends JavaFileObject> sources = fileManager
+					.getJavaFileObjectsFromFiles(Arrays.asList(ftmp));
+			
+			//prepare class path 
+			URL runDir = CodegenUtils.class.getProtectionDomain().getCodeSource().getLocation(); 
+			String classpath = System.getProperty("java.class.path") + 
+					File.pathSeparator + runDir.getPath();
+			List<String> options = Arrays.asList("-classpath",classpath);
+			
+			//compile source code
+			CompilationTask task = compiler.getTask(null, fileManager, diagnostics, options, null, sources);
+			Boolean success = task.call();
+			
+			//output diagnostics and error handling
+			for(Diagnostic<? extends JavaFileObject> tmp : diagnostics.getDiagnostics())
+				if( tmp.getKind()==Kind.ERROR )
+					System.err.println("ERROR: "+tmp.toString());				
+			if( success == null || !success )
+				throw new RuntimeException("Failed to compile class "+name);
+			
+			//dynamically load compiled class
+			URLClassLoader classLoader = new URLClassLoader(
+					new URL[]{new File(_workingDir).toURI().toURL(), runDir}, 
+					CodegenUtils.class.getClassLoader());
+			ret = classLoader.loadClass("codegen."+name);
+			classLoader.close();
+		}
+		catch(Exception ex) {
+			throw new DMLRuntimeException(ex);
+		}
+		
+		//keep compiled class for reuse
+		_cache.put(name, ret);
+		
+		if( DMLScript.STATISTICS ) {
+			Statistics.incrementCodegenClassCompile();
+			Statistics.incrementCodegenClassCompileTime(System.nanoTime()-t0);
+		}
+		
+		return ret;
+	}
+	
+	public static Class<?> loadClass(String name, byte[] classBytes) throws DMLRuntimeException {
+		//reuse existing compiled class
+		Class<?> ret = _cache.get(name);
+		if( ret != null ) 
+			return ret;
+		
+		//define class using the bytes
+		if(classBytes != null)
+		{
+			//ByteClassLoader byteLoader = new ByteClassLoader(classLoader.getURLs() , classLoader.getParent(), classBytes);
+			try {
+				ByteClassLoader byteLoader = new ByteClassLoader(new URL[]{} ,CodegenUtils.class.getClassLoader(), classBytes);
+				ret = byteLoader.findClass(name);
+				byteLoader.close();
+			} catch (Exception e) {
+				throw new DMLRuntimeException(e);
+			}
+		}
+		else
+		{
+			//dynamically load compiled class
+			URL runDir = CodegenUtils.class.getProtectionDomain().getCodeSource().getLocation(); 
+			URLClassLoader classLoader = null;
+			try {
+				classLoader = new URLClassLoader(
+						new URL[]{new File(_workingDir).toURI().toURL(), runDir}, 
+						CodegenUtils.class.getClassLoader());
+				ret = classLoader.loadClass(name);
+			} 
+			catch (Exception e) {
+				throw new DMLRuntimeException(e);
+			}
+			finally {
+				IOUtilFunctions.closeSilently(classLoader);
+			}
+		}
+		
+		//keep loaded class for reuse
+		_cache.put(name, ret);
+		return ret;
+	}
+	
+	public static Object createInstance(Class<?> cla) 
+		throws DMLRuntimeException 
+	{
+		Object ret = null;
+		
+		try {
+			ret = cla.newInstance();	
+		}
+		catch( Exception ex ) {
+			throw new DMLRuntimeException(ex);
+		}
+		
+		return ret;
+	}
+	
+	public static byte[] getClassAsByteArray(String name) 
+		throws DMLRuntimeException
+	{
+		//reuse existing compiled class
+		Class<?> cls = _cache.get(name);
+		if( cls != null ) 
+			return getClassAsByteArray(cls);
+		
+		
+		String classAsPath = name.replace('.', '/') + ".class";
+		
+		URLClassLoader classLoader = null;
+		byte[] ret = null;
+		
+		try {
+			//dynamically load compiled class
+			URL runDir = CodegenUtils.class.getProtectionDomain().getCodeSource().getLocation(); 
+			classLoader = new URLClassLoader(
+					new URL[]{new File(_workingDir).toURI().toURL(), runDir}, 
+					CodegenUtils.class.getClassLoader());
+			InputStream stream = classLoader.getResourceAsStream(classAsPath);
+			ret = IOUtils.toByteArray(stream);
+		} 
+		catch (IOException e) {
+			throw new DMLRuntimeException(e);
+		}
+		finally {
+			IOUtilFunctions.closeSilently(classLoader);
+		}
+		
+		return ret;
+	}
+
+
+	public static byte[] getClassAsByteArray(Class<?> cls) 
+		throws DMLRuntimeException 
+	{
+		ByteArrayOutputStream bos = new ByteArrayOutputStream();
+		try {
+			ObjectOutputStream oos = new ObjectOutputStream(bos);
+			oos.writeObject(cls);
+			oos.flush();
+			return bos.toByteArray();
+		} 
+		catch( IOException e ) {
+			throw new DMLRuntimeException(e);
+		} 
+		finally {
+			IOUtilFunctions.closeSilently(bos);
+		}
+	}
+	
+	private static void createWorkingDir() throws DMLRuntimeException  {
+		if( _workingDir != null )
+			return;
+		String tmp = LocalFileUtils.getWorkingDir(LocalFileUtils.CATEGORY_CODEGEN);
+		LocalFileUtils.createLocalFileIfNotExist(tmp);
+		_workingDir = tmp;
+	}
+	
+	public static URL[] getUrls() throws DMLRuntimeException {
+		try {
+			URL runDir = CodegenUtils.class.getProtectionDomain().getCodeSource().getLocation(); 
+			return new URL[]{new File(_workingDir).toURI().toURL(), runDir};
+		}
+		catch(Exception e) {
+			throw new DMLRuntimeException(e);
+		}				
+	}
+	
+	public static String getSpoofType(Class<?> cls) {
+		if(cls.getSuperclass() == SpoofCellwise.class)
+			return "Cell" +  cls.getName().split("\\.")[1];
+		else if(cls.getSuperclass() == SpoofOuterProduct.class)
+			return "OP" +  cls.getName().split("\\.")[1];
+		else if(cls.getSuperclass() == SpoofRowAggregate.class)
+			return "RA" +  cls.getName().split("\\.")[1];
+		else
+			return "UNKNOWN";
+	}
+}

[9/9] incubator-systemml git commit: [SYSTEMML-1286] Code generator compiler integration, incl tests

Posted by mb...@apache.org.

[SYSTEMML-1286] Code generator compiler integration, incl tests

This patch fully integrates the new code generator into SystemML's
compilation chain, including dynamic recompilation. Note that this does
not yet apply to MLContext, JMLC, and other replicated instances of our
compilation chain; however, SYSTEMML-1325 will consolidate these anyway.

Furthermore, this also introduces various function and application
tests, including algorithms that were not contained in our testsuite so
far (e.g., KMeans, Mlogreg, and PNMF).


Project: http://git-wip-us.apache.org/repos/asf/incubator-systemml/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-systemml/commit/bbefe96b
Tree: http://git-wip-us.apache.org/repos/asf/incubator-systemml/tree/bbefe96b
Diff: http://git-wip-us.apache.org/repos/asf/incubator-systemml/diff/bbefe96b

Branch: refs/heads/master
Commit: bbefe96b263f697eb3f4e0297379840930356c0d
Parents: 982ecb1
Author: Matthias Boehm <mb...@gmail.com>
Authored: Sun Feb 26 19:12:50 2017 -0800
Committer: Matthias Boehm <mb...@gmail.com>
Committed: Sun Feb 26 20:42:51 2017 -0800

----------------------------------------------------------------------
 conf/SystemML-config.xml.template               |   12 +
 .../java/org/apache/sysml/api/DMLScript.java    |   15 +
 .../java/org/apache/sysml/conf/DMLConfig.java   |   13 +-
 .../org/apache/sysml/hops/OptimizerUtils.java   |   17 +-
 .../apache/sysml/hops/recompile/Recompiler.java |   14 +
 .../RewriteAlgebraicSimplificationDynamic.java  |   25 +-
 .../RewriteAlgebraicSimplificationStatic.java   |   15 +-
 .../org/apache/sysml/parser/DMLTranslator.java  |    6 +
 .../test/integration/AutomatedTestBase.java     |    7 +
 .../functions/codegen/AlgorithmGLM.java         |  194 ++++
 .../functions/codegen/AlgorithmKMeans.java      |  192 ++++
 .../functions/codegen/AlgorithmL2SVM.java       |  157 +++
 .../functions/codegen/AlgorithmLinregCG.java    |  149 +++
 .../functions/codegen/AlgorithmMLogreg.java     |  197 ++++
 .../functions/codegen/AlgorithmMSVM.java        |  157 +++
 .../functions/codegen/AlgorithmPNMF.java        |  142 +++
 .../functions/codegen/CellwiseTmplTest.java     |  183 +++
 .../functions/codegen/DAGCellwiseTmplTest.java  |  161 +++
 .../functions/codegen/OuterProdTmplTest.java    |  259 +++++
 .../functions/codegen/RowAggTmplTest.java       |  142 +++
 .../scripts/functions/codegen/Algorithm_GLM.R   | 1081 ++++++++++++++++++
 .../scripts/functions/codegen/Algorithm_GLM.dml | 1053 +++++++++++++++++
 .../functions/codegen/Algorithm_Kmeans.dml      |  243 ++++
 .../scripts/functions/codegen/Algorithm_L2SVM.R |   98 ++
 .../functions/codegen/Algorithm_L2SVM.dml       |  106 ++
 .../functions/codegen/Algorithm_LinregCG.R      |   57 +
 .../functions/codegen/Algorithm_LinregCG.dml    |   56 +
 .../functions/codegen/Algorithm_MLogreg.R       |  278 +++++
 .../functions/codegen/Algorithm_MLogreg.dml     |  274 +++++
 .../scripts/functions/codegen/Algorithm_MSVM.R  |  133 +++
 .../functions/codegen/Algorithm_MSVM.dml        |  150 +++
 .../scripts/functions/codegen/Algorithm_PNMF.R  |   43 +
 .../functions/codegen/Algorithm_PNMF.dml        |   40 +
 .../functions/codegen/DAGcellwisetmpl1.R        |   36 +
 .../functions/codegen/DAGcellwisetmpl1.dml      |   31 +
 .../functions/codegen/DAGcellwisetmpl2.R        |   36 +
 .../functions/codegen/DAGcellwisetmpl2.dml      |   31 +
 .../functions/codegen/DAGcellwisetmpl3.R        |   36 +
 .../functions/codegen/DAGcellwisetmpl3.dml      |   31 +
 .../codegen/SystemML-config-codegen.xml         |   61 +
 .../scripts/functions/codegen/cellwisetmpl1.R   |   43 +
 .../scripts/functions/codegen/cellwisetmpl1.dml |   27 +
 .../scripts/functions/codegen/cellwisetmpl2.R   |   31 +
 .../scripts/functions/codegen/cellwisetmpl2.dml |   28 +
 .../scripts/functions/codegen/cellwisetmpl3.R   |   31 +
 .../scripts/functions/codegen/cellwisetmpl3.dml |   24 +
 .../scripts/functions/codegen/cellwisetmpl4.R   |   32 +
 .../scripts/functions/codegen/cellwisetmpl4.dml |   26 +
 .../scripts/functions/codegen/cellwisetmpl5.R   |   34 +
 .../scripts/functions/codegen/cellwisetmpl5.dml |   29 +
 .../scripts/functions/codegen/cellwisetmpl6.R   |   33 +
 .../scripts/functions/codegen/cellwisetmpl6.dml |   54 +
 .../functions/codegen/codegenIntegration.R      |   45 +
 .../functions/codegen/codegenIntegration.dml    |   67 ++
 .../scripts/functions/codegen/rowAggPattern1.R  |   29 +
 .../functions/codegen/rowAggPattern1.dml        |   26 +
 .../scripts/functions/codegen/rowAggPattern2.R  |   31 +
 .../functions/codegen/rowAggPattern2.dml        |   30 +
 .../scripts/functions/codegen/rowAggPattern3.R  |   31 +
 .../functions/codegen/rowAggPattern3.dml        |   30 +
 .../scripts/functions/codegen/rowAggPattern4.R  |   27 +
 .../functions/codegen/rowAggPattern4.dml        |   25 +
 src/test/scripts/functions/codegen/wcemm.R      |   35 +
 src/test/scripts/functions/codegen/wcemm.dml    |   30 +
 src/test/scripts/functions/codegen/wdivmm.R     |   32 +
 src/test/scripts/functions/codegen/wdivmm.dml   |   29 +
 .../scripts/functions/codegen/wdivmmRight.R     |   32 +
 .../scripts/functions/codegen/wdivmmRight.dml   |   32 +
 .../functions/codegen/wdivmmRightNotranspose.R  |   32 +
 .../codegen/wdivmmRightNotranspose.dml          |   31 +
 .../functions/codegen/wdivmmTransposeOut.R      |   32 +
 .../functions/codegen/wdivmmTransposeOut.dml    |   30 +
 .../scripts/functions/codegen/wdivmmbasic.R     |   32 +
 .../scripts/functions/codegen/wdivmmbasic.dml   |   30 +
 src/test/scripts/functions/codegen/wsigmoid.R   |   33 +
 src/test/scripts/functions/codegen/wsigmoid.dml |   30 +
 .../functions/codegen/ZPackageSuite.java        |   45 +
 77 files changed, 7131 insertions(+), 18 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/bbefe96b/conf/SystemML-config.xml.template
----------------------------------------------------------------------
diff --git a/conf/SystemML-config.xml.template b/conf/SystemML-config.xml.template
index 9fc2aef..da80039 100644
--- a/conf/SystemML-config.xml.template
+++ b/conf/SystemML-config.xml.template
@@ -53,4 +53,16 @@
    
    <!-- enables multi-threaded read/write of text formats in singlenode control program -->
    <cp.parallel.textio>true</cp.parallel.textio>
+   
+   <!-- enables compressed linear algebra, experimental feature -->
+   <compressed.linalg>false</compressed.linalg>
+   
+   <!-- enables operator fusion via code generation, experimental feature -->
+   <codegen.enabled>false</codegen.enabled>
+   
+   <!-- if codegen.enabled, enables source code caching of fused operators -->
+   <codegen.plancache>false</codegen.plancache>
+   
+   <!-- if codegen.enabled, compile literals as constants: 1..heuristic, 2..always -->
+   <codegen.literals>1</codegen.literals>
 </root>

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/bbefe96b/src/main/java/org/apache/sysml/api/DMLScript.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/api/DMLScript.java b/src/main/java/org/apache/sysml/api/DMLScript.java
index 83d0f5b..80c78c1 100644
--- a/src/main/java/org/apache/sysml/api/DMLScript.java
+++ b/src/main/java/org/apache/sysml/api/DMLScript.java
@@ -56,6 +56,7 @@ import org.apache.sysml.debug.DMLDebuggerProgramInfo;
 import org.apache.sysml.hops.HopsException;
 import org.apache.sysml.hops.OptimizerUtils;
 import org.apache.sysml.hops.OptimizerUtils.OptimizationLevel;
+import org.apache.sysml.hops.codegen.SpoofCompiler;
 import org.apache.sysml.hops.globalopt.GlobalOptimizerWrapper;
 import org.apache.sysml.lops.Lop;
 import org.apache.sysml.lops.LopsException;
@@ -606,6 +607,20 @@ public class DMLScript
 					 +"Memory Budget = " + ((double)OptimizerUtils.getLocalMemBudget()/1024/1024) + " MB" + "\n");
 		}
 
+		//Step 5.1: Generate code for the rewritten HOP DAGs
+		if( dmlconf.getBooleanValue(DMLConfig.CODEGEN) ){
+			SpoofCompiler.USE_PLAN_CACHE = dmlconf.getBooleanValue(DMLConfig.CODEGEN_PLANCACHE);
+			SpoofCompiler.ALWAYS_COMPILE_LITERALS = (dmlconf.getIntValue(DMLConfig.CODEGEN_LITERALS)==2);
+			
+			dmlt.codgenHopsDAG(prog);
+			
+			if (LOG.isDebugEnabled()) {
+				LOG.debug("\n********************** HOPS DAG (After Codegen) *******************");
+				dmlt.printHops(prog);
+				
+			}
+		}
+		
 		//Step 6: construct lops (incl exec type and op selection)
 		dmlt.constructLops(prog);
 

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/bbefe96b/src/main/java/org/apache/sysml/conf/DMLConfig.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/conf/DMLConfig.java b/src/main/java/org/apache/sysml/conf/DMLConfig.java
index 922ba82..3d0fb28 100644
--- a/src/main/java/org/apache/sysml/conf/DMLConfig.java
+++ b/src/main/java/org/apache/sysml/conf/DMLConfig.java
@@ -67,10 +67,14 @@ public class DMLConfig
 	public static final String YARN_APPMASTER       = "dml.yarn.appmaster"; 	
 	public static final String YARN_APPMASTERMEM    = "dml.yarn.appmaster.mem"; 
 	public static final String YARN_MAPREDUCEMEM    = "dml.yarn.mapreduce.mem"; 
-	public static final String YARN_APPQUEUE    	= "dml.yarn.app.queue"; 
+	public static final String YARN_APPQUEUE        = "dml.yarn.app.queue"; 
 	public static final String CP_PARALLEL_MATRIXMULT = "cp.parallel.matrixmult";
 	public static final String CP_PARALLEL_TEXTIO   = "cp.parallel.textio";
 	public static final String COMPRESSED_LINALG    = "compressed.linalg";
+	public static final String CODEGEN              = "codegen.enabled"; //boolean
+	public static final String CODEGEN_PLANCACHE    = "codegen.plancache"; //boolean
+	public static final String CODEGEN_LITERALS     = "codegen.literals"; //1..heuristic, 2..always
+
 	// Fraction of available memory to use. The available memory is computer when the JCudaContext is created
 	// to handle the tradeoff on calling cudaMemGetInfo too often.
 	public static final String GPU_MEMORY_UTILIZATION_FACTOR    = "gpu.memory.util.factor";
@@ -107,6 +111,10 @@ public class DMLConfig
 		_defaultVals.put(CP_PARALLEL_MATRIXMULT, "true" );
 		_defaultVals.put(CP_PARALLEL_TEXTIO,     "true" );
 		_defaultVals.put(COMPRESSED_LINALG,      "false" );
+		_defaultVals.put(CODEGEN,                "false" );
+		_defaultVals.put(CODEGEN_PLANCACHE,      "true" );
+		_defaultVals.put(CODEGEN_LITERALS,       "1" );
+		
 		_defaultVals.put(GPU_MEMORY_UTILIZATION_FACTOR,      "0.9" );
 		_defaultVals.put(REFRESH_AVAILABLE_MEMORY_EVERY_TIME,      "true" );
 	}
@@ -392,7 +400,8 @@ public class DMLConfig
 				LOCAL_TMP_DIR,SCRATCH_SPACE,OPTIMIZATION_LEVEL,
 				NUM_REDUCERS, DEFAULT_BLOCK_SIZE,
 				YARN_APPMASTER, YARN_APPMASTERMEM, YARN_MAPREDUCEMEM, 
-				CP_PARALLEL_MATRIXMULT, CP_PARALLEL_TEXTIO
+				CP_PARALLEL_MATRIXMULT, CP_PARALLEL_TEXTIO,
+				COMPRESSED_LINALG, CODEGEN, CODEGEN_LITERALS, CODEGEN_PLANCACHE,
 		}; 
 		
 		StringBuilder sb = new StringBuilder();

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/bbefe96b/src/main/java/org/apache/sysml/hops/OptimizerUtils.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/hops/OptimizerUtils.java b/src/main/java/org/apache/sysml/hops/OptimizerUtils.java
index 86b7968..6efd799 100644
--- a/src/main/java/org/apache/sysml/hops/OptimizerUtils.java
+++ b/src/main/java/org/apache/sysml/hops/OptimizerUtils.java
@@ -110,6 +110,7 @@ public class OptimizerUtils
 	public static boolean ALLOW_CONSTANT_FOLDING = true;
 	
 	public static boolean ALLOW_ALGEBRAIC_SIMPLIFICATION = true; 
+	public static boolean ALLOW_OPERATOR_FUSION = true; 
 	
 	/**
 	 * Enables if-else branch removal for constant predicates (original literals or 
@@ -272,7 +273,7 @@ public class OptimizerUtils
 
 		//handle optimization level
 		int optlevel = dmlconf.getIntValue(DMLConfig.OPTIMIZATION_LEVEL);
-		if( optlevel < 0 || optlevel > 5 )
+		if( optlevel < 0 || optlevel > 7 )
 			throw new DMLRuntimeException("Error: invalid optimization level '"+optlevel+"' (valid values: 0-5).");
 	
 		// This overrides any optimization level that is present in the configuration file.
@@ -336,6 +337,20 @@ public class OptimizerUtils
 				cconf.set(ConfigType.ALLOW_DYN_RECOMPILATION, false);
 				cconf.set(ConfigType.ALLOW_INDIVIDUAL_SB_SPECIFIC_OPS, false);
 				break;
+			
+			// opt level 6 and 7: SPOOF w/o fused operators, otherwise same as O2
+			// (hidden optimization levels not documented on purpose, as they will
+			// be removed once SPOOF is production ready)	
+			case 6:
+				cconf.set(ConfigType.OPT_LEVEL, OptimizationLevel.O2_LOCAL_MEMORY_DEFAULT.ordinal());
+				ALLOW_AUTO_VECTORIZATION = false;
+				break;
+			case 7:				
+				cconf.set(ConfigType.OPT_LEVEL, OptimizationLevel.O2_LOCAL_MEMORY_DEFAULT.ordinal());
+				ALLOW_OPERATOR_FUSION = false;
+				ALLOW_AUTO_VECTORIZATION = false;
+				ALLOW_SUM_PRODUCT_REWRITES = false;
+				break;	
 		}
 		
 		//handle parallel text io (incl awareness of thread contention in <jdk8)

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/bbefe96b/src/main/java/org/apache/sysml/hops/recompile/Recompiler.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/hops/recompile/Recompiler.java b/src/main/java/org/apache/sysml/hops/recompile/Recompiler.java
index 8b121d7..da13d0a 100644
--- a/src/main/java/org/apache/sysml/hops/recompile/Recompiler.java
+++ b/src/main/java/org/apache/sysml/hops/recompile/Recompiler.java
@@ -34,6 +34,7 @@ import org.apache.hadoop.mapred.JobConf;
 import org.apache.wink.json4j.JSONObject;
 import org.apache.sysml.api.DMLScript;
 import org.apache.sysml.conf.ConfigurationManager;
+import org.apache.sysml.conf.DMLConfig;
 import org.apache.sysml.conf.CompilerConfig.ConfigType;
 import org.apache.sysml.hops.DataGenOp;
 import org.apache.sysml.hops.DataOp;
@@ -52,6 +53,7 @@ import org.apache.sysml.hops.MemoTable;
 import org.apache.sysml.hops.OptimizerUtils;
 import org.apache.sysml.hops.ReorgOp;
 import org.apache.sysml.hops.UnaryOp;
+import org.apache.sysml.hops.codegen.SpoofCompiler;
 import org.apache.sysml.hops.rewrite.HopRewriteUtils;
 import org.apache.sysml.hops.rewrite.ProgramRewriter;
 import org.apache.sysml.lops.CSVReBlock;
@@ -210,6 +212,12 @@ public class Recompiler
 				hopRoot.refreshMemEstimates(memo); 
 			memo.extract(hops, status);
 			
+			// codegen if enabled
+			if( ConfigurationManager.getDMLConfig().getBooleanValue(DMLConfig.CODEGEN) && SpoofCompiler.RECOMPILE ) {
+				Hop.resetVisitStatus(hops);
+				hops = SpoofCompiler.optimize(hops, SpoofCompiler.ALWAYS_COMPILE_LITERALS);
+			}
+			
 			// construct lops			
 			Dag<Lop> dag = new Dag<Lop>();
 			for( Hop hopRoot : hops ){
@@ -304,6 +312,12 @@ public class Recompiler
 			hops.resetVisitStatus();
 			hops.refreshMemEstimates(memo); 		
 			
+			// codegen if enabled
+			if( ConfigurationManager.getDMLConfig().getBooleanValue(DMLConfig.CODEGEN) && SpoofCompiler.RECOMPILE ) {
+				hops.resetVisitStatus();
+				hops = SpoofCompiler.optimize(hops, false);
+			}
+			
 			// construct lops			
 			Dag<Lop> dag = new Dag<Lop>();
 			Lop lops = hops.constructLops();

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/bbefe96b/src/main/java/org/apache/sysml/hops/rewrite/RewriteAlgebraicSimplificationDynamic.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/hops/rewrite/RewriteAlgebraicSimplificationDynamic.java b/src/main/java/org/apache/sysml/hops/rewrite/RewriteAlgebraicSimplificationDynamic.java
index cc67cc1..20c1eeb 100644
--- a/src/main/java/org/apache/sysml/hops/rewrite/RewriteAlgebraicSimplificationDynamic.java
+++ b/src/main/java/org/apache/sysml/hops/rewrite/RewriteAlgebraicSimplificationDynamic.java
@@ -41,6 +41,7 @@ import org.apache.sysml.hops.HopsException;
 import org.apache.sysml.hops.IndexingOp;
 import org.apache.sysml.hops.LeftIndexingOp;
 import org.apache.sysml.hops.LiteralOp;
+import org.apache.sysml.hops.OptimizerUtils;
 import org.apache.sysml.hops.Hop.OpOp2;
 import org.apache.sysml.hops.ReorgOp;
 import org.apache.sysml.hops.UnaryOp;
@@ -149,11 +150,13 @@ public class RewriteAlgebraicSimplificationDynamic extends HopRewriteRule
 			hi = removeUnnecessaryRightIndexing(hop, hi, i);  //e.g., X[,1] -> X, if output == input size 
 			hi = removeEmptyLeftIndexing(hop, hi, i);         //e.g., X[,1]=Y -> matrix(0,nrow(X),ncol(X)), if nnz(X)==0 and nnz(Y)==0 
 			hi = removeUnnecessaryLeftIndexing(hop, hi, i);   //e.g., X[,1]=Y -> Y, if output == input dims 
-			hi = fuseLeftIndexingChainToAppend(hop, hi, i);   //e.g., X[,1]=A; X[,2]=B -> X=cbind(A,B), iff ncol(X)==2 and col1/2 lix
+			if(OptimizerUtils.ALLOW_OPERATOR_FUSION)
+				hi = fuseLeftIndexingChainToAppend(hop, hi, i);   //e.g., X[,1]=A; X[,2]=B -> X=cbind(A,B), iff ncol(X)==2 and col1/2 lix
 			hi = removeUnnecessaryCumulativeOp(hop, hi, i);   //e.g., cumsum(X) -> X, if nrow(X)==1;
 			hi = removeUnnecessaryReorgOperation(hop, hi, i); //e.g., matrix(X) -> X, if dims(in)==dims(out); r(X)->X, if 1x1 dims
 			hi = removeUnnecessaryOuterProduct(hop, hi, i);   //e.g., X*(Y%*%matrix(1,...) -> X*Y, if Y col vector
-			hi = fuseDatagenAndReorgOperation(hop, hi, i);    //e.g., t(rand(rows=10,cols=1)) -> rand(rows=1,cols=10), if one dim=1
+			if(OptimizerUtils.ALLOW_OPERATOR_FUSION)
+				hi = fuseDatagenAndReorgOperation(hop, hi, i);    //e.g., t(rand(rows=10,cols=1)) -> rand(rows=1,cols=10), if one dim=1
 			hi = simplifyColwiseAggregate(hop, hi, i);        //e.g., colsums(X) -> sum(X) or X, if col/row vector
 			hi = simplifyRowwiseAggregate(hop, hi, i);        //e.g., rowsums(X) -> sum(X) or X, if row/col vector
 			hi = simplifyColSumsMVMult(hop, hi, i);           //e.g., colSums(X*Y) -> t(Y) %*% X, if Y col vector
@@ -171,14 +174,16 @@ public class RewriteAlgebraicSimplificationDynamic extends HopRewriteRule
 			hi = simplifySumDiagToTrace(hi);                  //e.g., sum(diag(X)) -> trace(X); if col vector
 			hi = pushdownBinaryOperationOnDiag(hop, hi, i);   //e.g., diag(X)*7 -> diag(X*7); if col vector
 			hi = pushdownSumOnAdditiveBinary(hop, hi, i);     //e.g., sum(A+B) -> sum(A)+sum(B); if dims(A)==dims(B)
-			hi = simplifyWeightedSquaredLoss(hop, hi, i);     //e.g., sum(W * (X - U %*% t(V)) ^ 2) -> wsl(X, U, t(V), W, true), 
-			hi = simplifyWeightedSigmoidMMChains(hop, hi, i); //e.g., W * sigmoid(Y%*%t(X)) -> wsigmoid(W, Y, t(X), type)
-			hi = simplifyWeightedDivMM(hop, hi, i);           //e.g., t(U) %*% (X/(U%*%t(V))) -> wdivmm(X, U, t(V), left)
-			hi = simplifyWeightedCrossEntropy(hop, hi, i);    //e.g., sum(X*log(U%*%t(V))) -> wcemm(X, U, t(V))
-			hi = simplifyWeightedUnaryMM(hop, hi, i);         //e.g., X*exp(U%*%t(V)) -> wumm(X, U, t(V), exp)
-			hi = simplifyDotProductSum(hop, hi, i);           //e.g., sum(v^2) -> t(v)%*%v if ncol(v)==1 
-			hi = fuseSumSquared(hop, hi, i);                  //e.g., sum(X^2) -> sumSq(X), if ncol(X)>1
-			hi = fuseAxpyBinaryOperationChain(hop, hi, i);    //e.g., (X+s*Y) -> (X+*s Y), (X-s*Y) -> (X-*s Y) 	
+			if(OptimizerUtils.ALLOW_OPERATOR_FUSION) {
+				hi = simplifyWeightedSquaredLoss(hop, hi, i);     //e.g., sum(W * (X - U %*% t(V)) ^ 2) -> wsl(X, U, t(V), W, true), 
+				hi = simplifyWeightedSigmoidMMChains(hop, hi, i); //e.g., W * sigmoid(Y%*%t(X)) -> wsigmoid(W, Y, t(X), type)
+				hi = simplifyWeightedDivMM(hop, hi, i);           //e.g., t(U) %*% (X/(U%*%t(V))) -> wdivmm(X, U, t(V), left)
+				hi = simplifyWeightedCrossEntropy(hop, hi, i);    //e.g., sum(X*log(U%*%t(V))) -> wcemm(X, U, t(V))
+				hi = simplifyWeightedUnaryMM(hop, hi, i);         //e.g., X*exp(U%*%t(V)) -> wumm(X, U, t(V), exp)
+				hi = simplifyDotProductSum(hop, hi, i);           //e.g., sum(v^2) -> t(v)%*%v if ncol(v)==1 
+				hi = fuseSumSquared(hop, hi, i);                  //e.g., sum(X^2) -> sumSq(X), if ncol(X)>1
+				hi = fuseAxpyBinaryOperationChain(hop, hi, i);    //e.g., (X+s*Y) -> (X+*s Y), (X-s*Y) -> (X-*s Y) 	
+			}
 			hi = reorderMinusMatrixMult(hop, hi, i);          //e.g., (-t(X))%*%y->-(t(X)%*%y), TODO size
 			hi = simplifySumMatrixMult(hop, hi, i);           //e.g., sum(A%*%B) -> sum(t(colSums(A))*rowSums(B)), if not dot product / wsloss
 			hi = simplifyEmptyBinaryOperation(hop, hi, i);    //e.g., X*Y -> matrix(0,nrow(X), ncol(X)) / X+Y->X / X-Y -> X

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/bbefe96b/src/main/java/org/apache/sysml/hops/rewrite/RewriteAlgebraicSimplificationStatic.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/hops/rewrite/RewriteAlgebraicSimplificationStatic.java b/src/main/java/org/apache/sysml/hops/rewrite/RewriteAlgebraicSimplificationStatic.java
index 2ae27c8..5e97829 100644
--- a/src/main/java/org/apache/sysml/hops/rewrite/RewriteAlgebraicSimplificationStatic.java
+++ b/src/main/java/org/apache/sysml/hops/rewrite/RewriteAlgebraicSimplificationStatic.java
@@ -42,6 +42,7 @@ import org.apache.sysml.hops.Hop.ParamBuiltinOp;
 import org.apache.sysml.hops.Hop.ReOrgOp;
 import org.apache.sysml.hops.HopsException;
 import org.apache.sysml.hops.LiteralOp;
+import org.apache.sysml.hops.OptimizerUtils;
 import org.apache.sysml.hops.Hop.OpOp2;
 import org.apache.sysml.hops.ParameterizedBuiltinOp;
 import org.apache.sysml.hops.ReorgOp;
@@ -142,7 +143,8 @@ public class RewriteAlgebraicSimplificationStatic extends HopRewriteRule
  			hi = simplifyBinaryToUnaryOperation(hop, hi, i);     //e.g., X*X -> X^2 (pow2), X+X -> X*2, (X>0)-(X<0) -> sign(X)
  			hi = canonicalizeMatrixMultScalarAdd(hi);            //e.g., eps+U%*%t(V) -> U%*%t(V)+eps, U%*%t(V)-eps -> U%*%t(V)+(-eps) 
  			hi = simplifyReverseOperation(hop, hi, i);           //e.g., table(seq(1,nrow(X),1),seq(nrow(X),1,-1)) %*% X -> rev(X)
-			hi = simplifyMultiBinaryToBinaryOperation(hi);       //e.g., 1-X*Y -> X 1-* Y
+ 			if(OptimizerUtils.ALLOW_OPERATOR_FUSION)
+ 				hi = simplifyMultiBinaryToBinaryOperation(hi);       //e.g., 1-X*Y -> X 1-* Y
  			hi = simplifyDistributiveBinaryOperation(hop, hi, i);//e.g., (X-Y*X) -> (1-Y)*X
  			hi = simplifyBushyBinaryOperation(hop, hi, i);       //e.g., (X*(Y*(Z%*%v))) -> (X*Y)*(Z%*%v)
  			hi = simplifyUnaryAggReorgOperation(hop, hi, i);     //e.g., sum(t(X)) -> sum(X)
@@ -152,7 +154,8 @@ public class RewriteAlgebraicSimplificationStatic extends HopRewriteRule
  			hi = pushdownSumBinaryMult(hop, hi, i);              //e.g., sum(lamda*X) -> lamda*sum(X)
  			hi = simplifyUnaryPPredOperation(hop, hi, i);        //e.g., abs(ppred()) -> ppred(), others: round, ceil, floor
  			hi = simplifyTransposedAppend(hop, hi, i);           //e.g., t(cbind(t(A),t(B))) -> rbind(A,B);
- 			hi = fuseBinarySubDAGToUnaryOperation(hop, hi, i);   //e.g., X*(1-X)-> sprop(X) || 1/(1+exp(-X)) -> sigmoid(X) || X*(X>0) -> selp(X)
+ 			if(OptimizerUtils.ALLOW_OPERATOR_FUSION)
+ 				hi = fuseBinarySubDAGToUnaryOperation(hop, hi, i);   //e.g., X*(1-X)-> sprop(X) || 1/(1+exp(-X)) -> sigmoid(X) || X*(X>0) -> selp(X)
 			hi = simplifyTraceMatrixMult(hop, hi, i);            //e.g., trace(X%*%Y)->sum(X*t(Y));  
 			hi = simplifySlicedMatrixMult(hop, hi, i);           //e.g., (X%*%Y)[1,1] -> X[1,] %*% Y[,1];
 			hi = simplifyConstantSort(hop, hi, i);               //e.g., order(matrix())->matrix/seq; 
@@ -161,9 +164,11 @@ public class RewriteAlgebraicSimplificationStatic extends HopRewriteRule
 			hi = simplifyTransposeAggBinBinaryChains(hop, hi, i);//e.g., t(t(A)%*%t(B)+C) -> B%*%A+t(C)
 			hi = removeUnnecessaryMinus(hop, hi, i);             //e.g., -(-X)->X; potentially introduced by simplfiy binary or dyn rewrites
 			hi = simplifyGroupedAggregate(hi);          	     //e.g., aggregate(target=X,groups=y,fn="count") -> aggregate(target=y,groups=y,fn="count")
-			hi = fuseMinusNzBinaryOperation(hop, hi, i);         //e.g., X-mean*ppred(X,0,!=) -> X -nz mean
-			hi = fuseLogNzUnaryOperation(hop, hi, i);            //e.g., ppred(X,0,"!=")*log(X) -> log_nz(X)
-			hi = fuseLogNzBinaryOperation(hop, hi, i);           //e.g., ppred(X,0,"!=")*log(X,0.5) -> log_nz(X,0.5)
+			if(OptimizerUtils.ALLOW_OPERATOR_FUSION) {
+				hi = fuseMinusNzBinaryOperation(hop, hi, i);         //e.g., X-mean*ppred(X,0,!=) -> X -nz mean
+				hi = fuseLogNzUnaryOperation(hop, hi, i);            //e.g., ppred(X,0,"!=")*log(X) -> log_nz(X)
+				hi = fuseLogNzBinaryOperation(hop, hi, i);           //e.g., ppred(X,0,"!=")*log(X,0.5) -> log_nz(X,0.5)
+			}
 			hi = simplifyOuterSeqExpand(hop, hi, i);             //e.g., outer(v, seq(1,m), "==") -> rexpand(v, max=m, dir=row, ignore=true, cast=false)
 			hi = simplifyTableSeqExpand(hop, hi, i);             //e.g., table(seq(1,nrow(v)), v, nrow(v), m) -> rexpand(v, max=m, dir=row, ignore=false, cast=true)
 			//hi = removeUnecessaryPPred(hop, hi, i);            //e.g., ppred(X,X,"==")->matrix(1,rows=nrow(X),cols=ncol(X))

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/bbefe96b/src/main/java/org/apache/sysml/parser/DMLTranslator.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/parser/DMLTranslator.java b/src/main/java/org/apache/sysml/parser/DMLTranslator.java
index 0063997..61dff7c 100644
--- a/src/main/java/org/apache/sysml/parser/DMLTranslator.java
+++ b/src/main/java/org/apache/sysml/parser/DMLTranslator.java
@@ -56,6 +56,7 @@ import org.apache.sysml.hops.ParameterizedBuiltinOp;
 import org.apache.sysml.hops.ReorgOp;
 import org.apache.sysml.hops.TernaryOp;
 import org.apache.sysml.hops.UnaryOp;
+import org.apache.sysml.hops.codegen.SpoofCompiler;
 import org.apache.sysml.hops.ipa.InterProceduralAnalysis;
 import org.apache.sysml.hops.recompile.Recompiler;
 import org.apache.sysml.hops.rewrite.HopRewriteUtils;
@@ -278,6 +279,11 @@ public class DMLTranslator
 		resetHopsDAGVisitStatus(dmlp);
 	}
 	
+	public void codgenHopsDAG(DMLProgram dmlp) 
+		throws LanguageException, HopsException, DMLRuntimeException 
+	{
+		SpoofCompiler.generateCode(dmlp);	
+	}
 	
 	public void constructLops(DMLProgram dmlp) throws ParseException, LanguageException, HopsException, LopsException {
 

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/bbefe96b/src/test/java/org/apache/sysml/test/integration/AutomatedTestBase.java
----------------------------------------------------------------------
diff --git a/src/test/java/org/apache/sysml/test/integration/AutomatedTestBase.java b/src/test/java/org/apache/sysml/test/integration/AutomatedTestBase.java
index e5ed921..e6123ef 100644
--- a/src/test/java/org/apache/sysml/test/integration/AutomatedTestBase.java
+++ b/src/test/java/org/apache/sysml/test/integration/AutomatedTestBase.java
@@ -1774,4 +1774,11 @@ public abstract class AutomatedTestBase
 	{
 		return writeInputFrame(name, data, false, schema, oi);
 	}
+	
+	protected boolean heavyHittersContainsSubString(String str) {
+		for( String opcode : Statistics.getCPHeavyHitterOpCodes())
+			if(opcode.contains(str))
+				return true;
+		return false;		
+	}
 }

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/bbefe96b/src/test/java/org/apache/sysml/test/integration/functions/codegen/AlgorithmGLM.java
----------------------------------------------------------------------
diff --git a/src/test/java/org/apache/sysml/test/integration/functions/codegen/AlgorithmGLM.java b/src/test/java/org/apache/sysml/test/integration/functions/codegen/AlgorithmGLM.java
new file mode 100644
index 0000000..1dc8e1f
--- /dev/null
+++ b/src/test/java/org/apache/sysml/test/integration/functions/codegen/AlgorithmGLM.java
@@ -0,0 +1,194 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ * 
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.sysml.test.integration.functions.codegen;
+
+import org.junit.Assert;
+import org.junit.Test;
+
+import org.apache.sysml.api.DMLScript;
+import org.apache.sysml.api.DMLScript.RUNTIME_PLATFORM;
+import org.apache.sysml.hops.OptimizerUtils;
+import org.apache.sysml.lops.LopProperties.ExecType;
+import org.apache.sysml.test.integration.AutomatedTestBase;
+import org.apache.sysml.test.integration.TestConfiguration;
+import org.apache.sysml.test.utils.TestUtils;
+
+public class AlgorithmGLM extends AutomatedTestBase 
+{	
+	private final static String TEST_NAME1 = "Algorithm_GLM";
+	private final static String TEST_DIR = "functions/codegen/";
+	private final static String TEST_CLASS_DIR = TEST_DIR + AlgorithmGLM.class.getSimpleName() + "/";
+	private final static String TEST_CONF = "SystemML-config-codegen.xml";
+	
+	//private final static double eps = 1e-5;
+	
+	private final static int rows = 2468;
+	private final static int cols = 1007;
+		
+	private final static double sparsity1 = 0.7; //dense
+	private final static double sparsity2 = 0.1; //sparse
+	
+	private final static int intercept = 0;
+	private final static double epsilon = 0.000000001;
+	private final static double maxiter = 5; //inner/outer
+	
+	public enum GLMType {
+		POISSON_LOG,
+		GAMMA_LOG,
+		BINOMIAL_PROBIT,
+	}
+	
+	@Override
+	public void setUp() {
+		TestUtils.clearAssertionInformation();
+		addTestConfiguration(TEST_NAME1, new TestConfiguration(TEST_CLASS_DIR, TEST_NAME1, new String[] { "w" })); 
+	}
+
+	@Test
+	public void testGLMPoissonDenseRewritesCP() {
+		runGLMTest(GLMType.POISSON_LOG, true, false, ExecType.CP);
+	}
+	
+	@Test
+	public void testGLMPoissonSparseRewritesCP() {
+		runGLMTest(GLMType.POISSON_LOG, true, true, ExecType.CP);
+	}
+	
+	@Test
+	public void testGLMPoissonDenseCP() {
+		runGLMTest(GLMType.POISSON_LOG, false, false, ExecType.CP);
+	}
+	
+	@Test
+	public void testGLMPoissonSparseCP() {
+		runGLMTest(GLMType.POISSON_LOG, false, true, ExecType.CP);
+	}
+	
+	@Test
+	public void testGLMGammaDenseRewritesCP() {
+		runGLMTest(GLMType.GAMMA_LOG, true, false, ExecType.CP);
+	}
+	
+	@Test
+	public void testGLMGammaSparseRewritesCP() {
+		runGLMTest(GLMType.GAMMA_LOG, true, true, ExecType.CP);
+	}
+	
+	@Test
+	public void testGLMGammaDenseCP() {
+		runGLMTest(GLMType.GAMMA_LOG, false, false, ExecType.CP);
+	}
+	
+	@Test
+	public void testGLMGammaSparseCP() {
+		runGLMTest(GLMType.GAMMA_LOG, false, true, ExecType.CP);
+	}
+	
+	@Test
+	public void testGLMBinomialDenseRewritesCP() {
+		runGLMTest(GLMType.BINOMIAL_PROBIT, true, false, ExecType.CP);
+	}
+	
+	@Test
+	public void testGLMBinomialSparseRewritesCP() {
+		runGLMTest(GLMType.BINOMIAL_PROBIT, true, true, ExecType.CP);
+	}
+	
+	@Test
+	public void testGLMBinomialDenseCP() {
+		runGLMTest(GLMType.BINOMIAL_PROBIT, false, false, ExecType.CP);
+	}
+	
+	@Test
+	public void testGLMBinomialSparseCP() {
+		runGLMTest(GLMType.BINOMIAL_PROBIT, false, true, ExecType.CP);
+	}
+	
+	private void runGLMTest( GLMType type, boolean rewrites, boolean sparse, ExecType instType)
+	{
+		boolean oldFlag = OptimizerUtils.ALLOW_ALGEBRAIC_SIMPLIFICATION;
+		RUNTIME_PLATFORM platformOld = rtplatform;
+		switch( instType ){
+			case MR: rtplatform = RUNTIME_PLATFORM.HADOOP; break;
+			case SPARK: rtplatform = RUNTIME_PLATFORM.SPARK; break;
+			default: rtplatform = RUNTIME_PLATFORM.HYBRID_SPARK; break;
+		}
+	
+		boolean sparkConfigOld = DMLScript.USE_LOCAL_SPARK_CONFIG;
+		if( rtplatform == RUNTIME_PLATFORM.SPARK || rtplatform == RUNTIME_PLATFORM.HYBRID_SPARK )
+			DMLScript.USE_LOCAL_SPARK_CONFIG = true;
+
+		try
+		{
+			String TEST_NAME = TEST_NAME1;
+			TestConfiguration config = getTestConfiguration(TEST_NAME);
+			loadTestConfiguration(config);
+			
+			String[] addArgs = new String[4];
+			switch(type) {
+				case POISSON_LOG: //dfam, vpow, link, lpow
+					addArgs[0] = "1"; addArgs[1] = "1.0"; addArgs[2] = "1"; addArgs[3] = "0.0";
+					break;
+				case GAMMA_LOG:   //dfam, vpow, link, lpow
+					addArgs[0] = "1"; addArgs[1] = "2.0"; addArgs[2] = "1"; addArgs[3] = "0.0";
+					break;
+				case BINOMIAL_PROBIT: //dfam, vpow, link, yneg 
+					addArgs[0] = "2"; addArgs[1] = "0.0"; addArgs[2] = "3"; addArgs[3] = "2";
+					break;
+			}
+			
+			/* This is for running the junit test the new way, i.e., construct the arguments directly */
+			String HOME = SCRIPT_DIR + TEST_DIR;
+			fullDMLScriptName = HOME + TEST_NAME + ".dml";
+			programArgs = new String[]{ "-explain", "-stats",
+				"-config=" + HOME + TEST_CONF, "-args", input("X"), input("Y"),
+				String.valueOf(intercept), String.valueOf(epsilon), String.valueOf(maxiter), 
+				addArgs[0], addArgs[1], addArgs[2], addArgs[3], output("w")};
+
+			rCmd = getRCmd(inputDir(), String.valueOf(intercept),String.valueOf(epsilon),
+				String.valueOf(maxiter), addArgs[0], addArgs[1], addArgs[2], addArgs[3], expectedDir());
+
+			OptimizerUtils.ALLOW_ALGEBRAIC_SIMPLIFICATION = rewrites;
+			
+			//generate actual datasets
+			double[][] X = getRandomMatrix(rows, cols, 0, 1, sparse?sparsity2:sparsity1, 714);
+			writeInputMatrixWithMTD("X", X, true);
+			double[][] y = TestUtils.round(getRandomMatrix(rows, 1, 0, 1, 1.0, 136));
+			writeInputMatrixWithMTD("Y", y, true);
+			
+			runTest(true, false, null, -1); 
+			//TODO fix R glm script
+			//runRScript(true); 
+			
+			//compare matrices 
+			//HashMap<CellIndex, Double> dmlfile = readDMLMatrixFromHDFS("w");
+			//HashMap<CellIndex, Double> rfile  = readRMatrixFromFS("w");
+			//TestUtils.compareMatrices(dmlfile, rfile, eps, "Stat-DML", "Stat-R");
+			Assert.assertTrue(heavyHittersContainsSubString("spoof") || heavyHittersContainsSubString("sp_spoof"));
+		}
+		finally {
+			rtplatform = platformOld;
+			DMLScript.USE_LOCAL_SPARK_CONFIG = sparkConfigOld;
+			OptimizerUtils.ALLOW_ALGEBRAIC_SIMPLIFICATION = oldFlag;
+			OptimizerUtils.ALLOW_AUTO_VECTORIZATION = true;
+			OptimizerUtils.ALLOW_OPERATOR_FUSION = true;
+		}
+	}
+}

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/bbefe96b/src/test/java/org/apache/sysml/test/integration/functions/codegen/AlgorithmKMeans.java
----------------------------------------------------------------------
diff --git a/src/test/java/org/apache/sysml/test/integration/functions/codegen/AlgorithmKMeans.java b/src/test/java/org/apache/sysml/test/integration/functions/codegen/AlgorithmKMeans.java
new file mode 100644
index 0000000..907d0ca
--- /dev/null
+++ b/src/test/java/org/apache/sysml/test/integration/functions/codegen/AlgorithmKMeans.java
@@ -0,0 +1,192 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ * 
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.sysml.test.integration.functions.codegen;
+
+import org.junit.Assert;
+import org.junit.Test;
+
+import org.apache.sysml.api.DMLScript;
+import org.apache.sysml.api.DMLScript.RUNTIME_PLATFORM;
+import org.apache.sysml.hops.OptimizerUtils;
+import org.apache.sysml.lops.LopProperties.ExecType;
+import org.apache.sysml.test.integration.AutomatedTestBase;
+import org.apache.sysml.test.integration.TestConfiguration;
+import org.apache.sysml.test.utils.TestUtils;
+
+public class AlgorithmKMeans extends AutomatedTestBase 
+{	
+	private final static String TEST_NAME1 = "Algorithm_KMeans";
+	private final static String TEST_DIR = "functions/codegen/";
+	private final static String TEST_CLASS_DIR = TEST_DIR + AlgorithmKMeans.class.getSimpleName() + "/";
+	private final static String TEST_CONF = "SystemML-config-codegen.xml";
+	
+	//private final static double eps = 1e-5;
+	
+	private final static int rows = 3972;
+	private final static int cols = 972;
+		
+	private final static double sparsity1 = 0.7; //dense
+	private final static double sparsity2 = 0.1; //sparse
+	
+	private final static double epsilon = 0.000000001;
+	private final static double maxiter = 10;
+	
+	@Override
+	public void setUp() {
+		TestUtils.clearAssertionInformation();
+		addTestConfiguration(TEST_NAME1, new TestConfiguration(TEST_CLASS_DIR, TEST_NAME1, new String[] { "C" })); 
+	}
+
+	@Test
+	public void testKMeansDenseBinSingleRewritesCP() {
+		runKMeansTest(TEST_NAME1, true, false, 2, 1, ExecType.CP);
+	}
+	
+	@Test
+	public void testKMeansSparseBinSingleRewritesCP() {
+		runKMeansTest(TEST_NAME1, true, true, 2, 1, ExecType.CP);
+	}
+	
+	@Test
+	public void testKMeansDenseBinSingleCP() {
+		runKMeansTest(TEST_NAME1, false, false, 2, 1, ExecType.CP);
+	}
+	
+	@Test
+	public void testKMeansSparseBinSingleCP() {
+		runKMeansTest(TEST_NAME1, false, true, 2, 1, ExecType.CP);
+	}
+	
+	@Test
+	public void testKMeansDenseBinMultiRewritesCP() {
+		runKMeansTest(TEST_NAME1, true, false, 2, 10, ExecType.CP);
+	}
+	
+	@Test
+	public void testKMeansSparseBinMultiRewritesCP() {
+		runKMeansTest(TEST_NAME1, true, true, 2, 10, ExecType.CP);
+	}
+	
+	@Test
+	public void testKMeansDenseBinMultiCP() {
+		runKMeansTest(TEST_NAME1, false, false, 2, 10, ExecType.CP);
+	}
+	
+	@Test
+	public void testKMeansSparseBinMultiCP() {
+		runKMeansTest(TEST_NAME1, false, true, 2, 10, ExecType.CP);
+	}
+	
+	@Test
+	public void testKMeansDenseMulSingleRewritesCP() {
+		runKMeansTest(TEST_NAME1, true, false, 20, 1, ExecType.CP);
+	}
+	
+	@Test
+	public void testKMeansSparseMulSingleRewritesCP() {
+		runKMeansTest(TEST_NAME1, true, true, 20, 1, ExecType.CP);
+	}
+	
+	@Test
+	public void testKMeansDenseMulSingleCP() {
+		runKMeansTest(TEST_NAME1, false, false, 20, 1, ExecType.CP);
+	}
+	
+	@Test
+	public void testKMeansSparseMulSingleCP() {
+		runKMeansTest(TEST_NAME1, false, true, 20, 1, ExecType.CP);
+	}
+	
+	@Test
+	public void testKMeansDenseMulMultiRewritesCP() {
+		runKMeansTest(TEST_NAME1, true, false, 20, 10, ExecType.CP);
+	}
+	
+	@Test
+	public void testKMeansSparseMulMultiRewritesCP() {
+		runKMeansTest(TEST_NAME1, true, true, 20, 10, ExecType.CP);
+	}
+	
+	@Test
+	public void testKMeansDenseMulMultiCP() {
+		runKMeansTest(TEST_NAME1, false, false, 20, 10, ExecType.CP);
+	}
+	
+	@Test
+	public void testKMeansSparseMulMultiCP() {
+		runKMeansTest(TEST_NAME1, false, true, 20, 10, ExecType.CP);
+	}
+	
+	private void runKMeansTest( String testname, boolean rewrites, boolean sparse, int centroids, int runs, ExecType instType)
+	{
+		boolean oldFlag = OptimizerUtils.ALLOW_ALGEBRAIC_SIMPLIFICATION;
+		RUNTIME_PLATFORM platformOld = rtplatform;
+		switch( instType ){
+			case MR: rtplatform = RUNTIME_PLATFORM.HADOOP; break;
+			case SPARK: rtplatform = RUNTIME_PLATFORM.SPARK; break;
+			default: rtplatform = RUNTIME_PLATFORM.HYBRID_SPARK; break;
+		}
+	
+		boolean sparkConfigOld = DMLScript.USE_LOCAL_SPARK_CONFIG;
+		if( rtplatform == RUNTIME_PLATFORM.SPARK || rtplatform == RUNTIME_PLATFORM.HYBRID_SPARK )
+			DMLScript.USE_LOCAL_SPARK_CONFIG = true;
+
+		try
+		{
+			String TEST_NAME = testname;
+			TestConfiguration config = getTestConfiguration(TEST_NAME);
+			loadTestConfiguration(config);
+			
+			/* This is for running the junit test the new way, i.e., construct the arguments directly */
+			String HOME = SCRIPT_DIR + TEST_DIR;
+			fullDMLScriptName = HOME + TEST_NAME + ".dml";
+			programArgs = new String[]{ "-explain", "hops", "-stats",
+				"-config=" + HOME + TEST_CONF, "-args", input("X"), String.valueOf(centroids),
+				String.valueOf(runs), String.valueOf(epsilon), String.valueOf(maxiter), output("C")};
+
+			//rCmd = getRCmd(inputDir(), String.valueOf(intercept),String.valueOf(epsilon),
+			//	String.valueOf(maxiter), expectedDir());
+
+			OptimizerUtils.ALLOW_ALGEBRAIC_SIMPLIFICATION = rewrites;
+			
+			//generate actual datasets
+			double[][] X = getRandomMatrix(rows, cols, 0, 1, sparse?sparsity2:sparsity1, 714);
+			writeInputMatrixWithMTD("X", X, true);
+			
+			runTest(true, false, null, -1); 
+			
+			//no comparison with R due to randomized algorithm
+			//runRScript(true); 
+			//compare matrices 
+			//HashMap<CellIndex, Double> dmlfile = readDMLMatrixFromHDFS("C");
+			//HashMap<CellIndex, Double> rfile  = readRMatrixFromFS("C");
+			//TestUtils.compareMatrices(dmlfile, rfile, eps, "Stat-DML", "Stat-R");
+			
+			Assert.assertTrue(heavyHittersContainsSubString("spoof") || heavyHittersContainsSubString("sp_spoof"));
+		}
+		finally {
+			rtplatform = platformOld;
+			DMLScript.USE_LOCAL_SPARK_CONFIG = sparkConfigOld;
+			OptimizerUtils.ALLOW_ALGEBRAIC_SIMPLIFICATION = oldFlag;
+			OptimizerUtils.ALLOW_AUTO_VECTORIZATION = true;
+			OptimizerUtils.ALLOW_OPERATOR_FUSION = true;
+		}
+	}
+}

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/bbefe96b/src/test/java/org/apache/sysml/test/integration/functions/codegen/AlgorithmL2SVM.java
----------------------------------------------------------------------
diff --git a/src/test/java/org/apache/sysml/test/integration/functions/codegen/AlgorithmL2SVM.java b/src/test/java/org/apache/sysml/test/integration/functions/codegen/AlgorithmL2SVM.java
new file mode 100644
index 0000000..f93808b
--- /dev/null
+++ b/src/test/java/org/apache/sysml/test/integration/functions/codegen/AlgorithmL2SVM.java
@@ -0,0 +1,157 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ * 
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.sysml.test.integration.functions.codegen;
+
+import java.util.HashMap;
+
+import org.junit.Assert;
+import org.junit.Test;
+
+import org.apache.sysml.api.DMLScript;
+import org.apache.sysml.api.DMLScript.RUNTIME_PLATFORM;
+import org.apache.sysml.hops.OptimizerUtils;
+import org.apache.sysml.lops.LopProperties.ExecType;
+import org.apache.sysml.runtime.matrix.data.MatrixValue.CellIndex;
+import org.apache.sysml.test.integration.AutomatedTestBase;
+import org.apache.sysml.test.integration.TestConfiguration;
+import org.apache.sysml.test.utils.TestUtils;
+
+public class AlgorithmL2SVM extends AutomatedTestBase 
+{	
+	private final static String TEST_NAME1 = "Algorithm_L2SVM";
+	private final static String TEST_DIR = "functions/codegen/";
+	private final static String TEST_CLASS_DIR = TEST_DIR + AlgorithmL2SVM.class.getSimpleName() + "/";
+	private final static String TEST_CONF = "SystemML-config-codegen.xml";
+	
+	private final static double eps = 1e-5;
+	
+	private final static int rows = 1468;
+	private final static int cols = 1007;
+		
+	private final static double sparsity1 = 0.7; //dense
+	private final static double sparsity2 = 0.1; //sparse
+	
+	private final static int intercept = 0;
+	private final static double epsilon = 0.000000001;
+	private final static double maxiter = 10;
+	
+	@Override
+	public void setUp() {
+		TestUtils.clearAssertionInformation();
+		addTestConfiguration(TEST_NAME1, new TestConfiguration(TEST_CLASS_DIR, TEST_NAME1, new String[] { "w" })); 
+	}
+
+	@Test
+	public void testL2SVMDenseRewritesCP() {
+		runL2SVMTest(TEST_NAME1, true, false, ExecType.CP);
+	}
+	
+	@Test
+	public void testL2SVMSparseRewritesCP() {
+		runL2SVMTest(TEST_NAME1, true, true, ExecType.CP);
+	}
+	
+	@Test
+	public void testL2SVMDenseCP() {
+		runL2SVMTest(TEST_NAME1, false, false, ExecType.CP);
+	}
+	
+	@Test
+	public void testL2SVMSparseCP() {
+		runL2SVMTest(TEST_NAME1, false, true, ExecType.CP);
+	}
+
+	@Test
+	public void testL2SVMDenseRewritesSP() {
+		runL2SVMTest(TEST_NAME1, true, false, ExecType.SPARK);
+	}
+	
+	@Test
+	public void testL2SVMSparseRewritesSP() {
+		runL2SVMTest(TEST_NAME1, true, true, ExecType.SPARK);
+	}
+	
+	@Test
+	public void testL2SVMDenseSP() {
+		runL2SVMTest(TEST_NAME1, false, false, ExecType.SPARK);
+	}
+	
+	@Test
+	public void testL2SVMSparseSP() {
+		runL2SVMTest(TEST_NAME1, false, true, ExecType.SPARK);
+	}
+	
+	private void runL2SVMTest( String testname, boolean rewrites, boolean sparse, ExecType instType)
+	{
+		boolean oldFlag = OptimizerUtils.ALLOW_ALGEBRAIC_SIMPLIFICATION;
+		RUNTIME_PLATFORM platformOld = rtplatform;
+		switch( instType ){
+			case MR: rtplatform = RUNTIME_PLATFORM.HADOOP; break;
+			case SPARK: rtplatform = RUNTIME_PLATFORM.SPARK; break;
+			default: rtplatform = RUNTIME_PLATFORM.HYBRID_SPARK; break;
+		}
+	
+		boolean sparkConfigOld = DMLScript.USE_LOCAL_SPARK_CONFIG;
+		if( rtplatform == RUNTIME_PLATFORM.SPARK || rtplatform == RUNTIME_PLATFORM.HYBRID_SPARK )
+			DMLScript.USE_LOCAL_SPARK_CONFIG = true;
+
+		try
+		{
+			String TEST_NAME = testname;
+			TestConfiguration config = getTestConfiguration(TEST_NAME);
+			loadTestConfiguration(config);
+			
+			/* This is for running the junit test the new way, i.e., construct the arguments directly */
+			String HOME = SCRIPT_DIR + TEST_DIR;
+			fullDMLScriptName = HOME + TEST_NAME + ".dml";
+			programArgs = new String[]{ "-explain", "-stats",
+				"-config=" + HOME + TEST_CONF, "-args", input("X"), input("Y"),
+				String.valueOf(intercept), String.valueOf(epsilon),
+				String.valueOf(maxiter), output("w")};
+
+			rCmd = getRCmd(inputDir(), String.valueOf(intercept),String.valueOf(epsilon),
+				String.valueOf(maxiter), expectedDir());
+
+			OptimizerUtils.ALLOW_ALGEBRAIC_SIMPLIFICATION = rewrites;
+			
+			//generate actual datasets
+			double[][] X = getRandomMatrix(rows, cols, 0, 1, sparse?sparsity2:sparsity1, 714);
+			writeInputMatrixWithMTD("X", X, true);
+			double[][] y = TestUtils.round(getRandomMatrix(rows, 1, 0, 1, 1.0, 136));
+			writeInputMatrixWithMTD("Y", y, true);
+			
+			runTest(true, false, null, -1); 
+			runRScript(true); 
+			
+			//compare matrices 
+			HashMap<CellIndex, Double> dmlfile = readDMLMatrixFromHDFS("w");
+			HashMap<CellIndex, Double> rfile  = readRMatrixFromFS("w");
+			TestUtils.compareMatrices(dmlfile, rfile, eps, "Stat-DML", "Stat-R");
+			Assert.assertTrue(heavyHittersContainsSubString("spoof") || heavyHittersContainsSubString("sp_spoof"));
+		}
+		finally {
+			rtplatform = platformOld;
+			DMLScript.USE_LOCAL_SPARK_CONFIG = sparkConfigOld;
+			OptimizerUtils.ALLOW_ALGEBRAIC_SIMPLIFICATION = oldFlag;
+			OptimizerUtils.ALLOW_AUTO_VECTORIZATION = true;
+			OptimizerUtils.ALLOW_OPERATOR_FUSION = true;
+		}
+	}
+}

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/bbefe96b/src/test/java/org/apache/sysml/test/integration/functions/codegen/AlgorithmLinregCG.java
----------------------------------------------------------------------
diff --git a/src/test/java/org/apache/sysml/test/integration/functions/codegen/AlgorithmLinregCG.java b/src/test/java/org/apache/sysml/test/integration/functions/codegen/AlgorithmLinregCG.java
new file mode 100644
index 0000000..25c5f03
--- /dev/null
+++ b/src/test/java/org/apache/sysml/test/integration/functions/codegen/AlgorithmLinregCG.java
@@ -0,0 +1,149 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ * 
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.sysml.test.integration.functions.codegen;
+
+import java.util.HashMap;
+
+import org.junit.Assert;
+import org.junit.Test;
+
+import org.apache.sysml.api.DMLScript;
+import org.apache.sysml.api.DMLScript.RUNTIME_PLATFORM;
+import org.apache.sysml.hops.OptimizerUtils;
+import org.apache.sysml.lops.LopProperties.ExecType;
+import org.apache.sysml.runtime.matrix.data.MatrixValue.CellIndex;
+import org.apache.sysml.test.integration.AutomatedTestBase;
+import org.apache.sysml.test.integration.TestConfiguration;
+import org.apache.sysml.test.utils.TestUtils;
+
+public class AlgorithmLinregCG extends AutomatedTestBase 
+{	
+	private final static String TEST_NAME1 = "Algorithm_LinregCG";
+	private final static String TEST_DIR = "functions/codegen/";
+	private final static String TEST_CLASS_DIR = TEST_DIR + AlgorithmLinregCG.class.getSimpleName() + "/";
+	private final static String TEST_CONF = "SystemML-config-codegen.xml";
+	
+	private final static double eps = 1e-5;
+	
+	private final static int rows = 1468;
+	private final static int cols = 1007;
+		
+	private final static double sparsity1 = 0.7; //dense
+	private final static double sparsity2 = 0.1; //sparse
+	
+	private final static int intercept = 0;
+	private final static double epsilon = 0.000000001;
+	private final static double maxiter = 10;
+	
+	@Override
+	public void setUp() {
+		TestUtils.clearAssertionInformation();
+		addTestConfiguration(TEST_NAME1, new TestConfiguration(TEST_CLASS_DIR, TEST_NAME1, new String[] { "w" })); 
+	}
+
+	@Test
+	public void testLinregCGDenseRewritesCP() {
+		runLinregCGTest(TEST_NAME1, true, false, ExecType.CP);
+	}
+	
+	@Test
+	public void testLinregCGSparseRewritesCP() {
+		runLinregCGTest(TEST_NAME1, true, true, ExecType.CP);
+	}
+	
+	@Test
+	public void testLinregCGDenseCP() {
+		runLinregCGTest(TEST_NAME1, false, false, ExecType.CP);
+	}
+	
+	@Test
+	public void testLinregCGSparseCP() {
+		runLinregCGTest(TEST_NAME1, false, true, ExecType.CP);
+	}
+
+	/*
+	@Test
+	public void testLinregCGDenseSP() {
+		runLinregCGTest(TEST_NAME1, false, false, ExecType.SPARK);
+	}
+	
+	@Test
+	public void testLinregCGSparseSP() {
+		runLinregCGTest(TEST_NAME1, false, true, ExecType.SPARK);
+	}
+	*/
+	
+	private void runLinregCGTest( String testname, boolean rewrites, boolean sparse, ExecType instType)
+	{
+		boolean oldFlag = OptimizerUtils.ALLOW_ALGEBRAIC_SIMPLIFICATION;
+		RUNTIME_PLATFORM platformOld = rtplatform;
+		switch( instType ){
+			case MR: rtplatform = RUNTIME_PLATFORM.HADOOP; break;
+			case SPARK: rtplatform = RUNTIME_PLATFORM.SPARK; break;
+			default: rtplatform = RUNTIME_PLATFORM.HYBRID_SPARK; break;
+		}
+	
+		boolean sparkConfigOld = DMLScript.USE_LOCAL_SPARK_CONFIG;
+		if( rtplatform == RUNTIME_PLATFORM.SPARK || rtplatform == RUNTIME_PLATFORM.HYBRID_SPARK )
+			DMLScript.USE_LOCAL_SPARK_CONFIG = true;
+
+		try
+		{
+			String TEST_NAME = testname;
+			TestConfiguration config = getTestConfiguration(TEST_NAME);
+			loadTestConfiguration(config);
+			
+			/* This is for running the junit test the new way, i.e., construct the arguments directly */
+			String HOME = SCRIPT_DIR + TEST_DIR;
+			fullDMLScriptName = HOME + TEST_NAME + ".dml";
+			programArgs = new String[]{ "-explain", "-stats",
+				"-config=" + HOME + TEST_CONF, "-args", input("X"), input("y"),
+				String.valueOf(intercept), String.valueOf(epsilon),
+				String.valueOf(maxiter), output("w")};
+
+			rCmd = getRCmd(inputDir(), String.valueOf(intercept),String.valueOf(epsilon),
+				String.valueOf(maxiter), expectedDir());
+	
+			OptimizerUtils.ALLOW_ALGEBRAIC_SIMPLIFICATION = rewrites;
+			
+			//generate actual datasets
+			double[][] X = getRandomMatrix(rows, cols, 0, 1, sparse?sparsity2:sparsity1, 7);
+			writeInputMatrixWithMTD("X", X, true);
+			double[][] y = getRandomMatrix(rows, 1, 0, 10, 1.0, 3);
+			writeInputMatrixWithMTD("y", y, true);
+			
+			runTest(true, false, null, -1); 
+			runRScript(true); 
+			
+			//compare matrices 
+			HashMap<CellIndex, Double> dmlfile = readDMLMatrixFromHDFS("w");
+			HashMap<CellIndex, Double> rfile  = readRMatrixFromFS("w");
+			TestUtils.compareMatrices(dmlfile, rfile, eps, "Stat-DML", "Stat-R");
+			Assert.assertTrue(heavyHittersContainsSubString("spoof") || heavyHittersContainsSubString("sp_spoof"));
+		}
+		finally {
+			rtplatform = platformOld;
+			DMLScript.USE_LOCAL_SPARK_CONFIG = sparkConfigOld;
+			OptimizerUtils.ALLOW_ALGEBRAIC_SIMPLIFICATION = oldFlag;
+			OptimizerUtils.ALLOW_AUTO_VECTORIZATION = true;
+			OptimizerUtils.ALLOW_OPERATOR_FUSION = true;
+		}
+	}
+}

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/bbefe96b/src/test/java/org/apache/sysml/test/integration/functions/codegen/AlgorithmMLogreg.java
----------------------------------------------------------------------
diff --git a/src/test/java/org/apache/sysml/test/integration/functions/codegen/AlgorithmMLogreg.java b/src/test/java/org/apache/sysml/test/integration/functions/codegen/AlgorithmMLogreg.java
new file mode 100644
index 0000000..394902e
--- /dev/null
+++ b/src/test/java/org/apache/sysml/test/integration/functions/codegen/AlgorithmMLogreg.java
@@ -0,0 +1,197 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ * 
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.sysml.test.integration.functions.codegen;
+
+import java.util.HashMap;
+
+import org.junit.Assert;
+import org.junit.Test;
+
+import org.apache.sysml.api.DMLScript;
+import org.apache.sysml.api.DMLScript.RUNTIME_PLATFORM;
+import org.apache.sysml.hops.OptimizerUtils;
+import org.apache.sysml.lops.LopProperties.ExecType;
+import org.apache.sysml.runtime.matrix.data.MatrixValue.CellIndex;
+import org.apache.sysml.test.integration.AutomatedTestBase;
+import org.apache.sysml.test.integration.TestConfiguration;
+import org.apache.sysml.test.utils.TestUtils;
+
+/**
+ * Codegen integration test that runs Algorithm_MLogreg.dml and an R script on generated
+ * inputs X and Y, compares the resulting "w" matrices, and asserts that the heavy hitters
+ * contain the substring "spoof" or "sp_spoof".
+ */
+public class AlgorithmMLogreg extends AutomatedTestBase {
+	private final static String TEST_NAME1 = "Algorithm_MLogreg";
+	private final static String TEST_DIR = "functions/codegen/";
+	private final static String TEST_CLASS_DIR = TEST_DIR + AlgorithmMLogreg.class.getSimpleName() + "/";
+	private final static String TEST_CONF = "SystemML-config-codegen.xml";
+	private final static double eps = 1e-5; //tolerance for comparing DML and R results
+	private final static int rows = 3468;
+	private final static int cols = 327;
+	private final static double sparsity1 = 0.7; //dense
+	private final static double sparsity2 = 0.1; //sparse
+	private final static int intercept = 0;
+	private final static double epsilon = 0.000000001;
+	private final static double maxiter = 10;
+
+	@Override
+	public void setUp() {
+		TestUtils.clearAssertionInformation();
+		addTestConfiguration(TEST_NAME1, new TestConfiguration(TEST_CLASS_DIR, TEST_NAME1, new String[] { "w" })); 
+	}
+
+	@Test
+	public void testMlogregBinDenseRewritesCP() {
+		runMlogregTest(TEST_NAME1, 2, true, false, ExecType.CP);
+	}
+	
+	@Test
+	public void testMlogregBinSparseRewritesCP() {
+		runMlogregTest(TEST_NAME1, 2, true, true, ExecType.CP);
+	}
+	
+	@Test
+	public void testMlogregBinDenseCP() {
+		runMlogregTest(TEST_NAME1, 2, false, false, ExecType.CP);
+	}
+	
+	@Test
+	public void testMlogregBinSparseCP() {
+		runMlogregTest(TEST_NAME1, 2, false, true, ExecType.CP);
+	}
+	
+	@Test
+	public void testMlogregMulDenseRewritesCP() {
+		runMlogregTest(TEST_NAME1, 5, true, false, ExecType.CP);
+	}
+	
+	@Test
+	public void testMlogregMulSparseRewritesCP() {
+		runMlogregTest(TEST_NAME1, 5, true, true, ExecType.CP);
+	}
+	
+	@Test
+	public void testMlogregMulDenseCP() {
+		runMlogregTest(TEST_NAME1, 5, false, false, ExecType.CP);
+	}
+	
+	@Test
+	public void testMlogregMulSparseCP() {
+		runMlogregTest(TEST_NAME1, 5, false, true, ExecType.CP);
+	}
+
+	@Test
+	public void testMlogregBinDenseRewritesSP() {
+		runMlogregTest(TEST_NAME1, 2, true, false, ExecType.SPARK);
+	}
+	
+	@Test
+	public void testMlogregBinSparseRewritesSP() {
+		runMlogregTest(TEST_NAME1, 2, true, true, ExecType.SPARK);
+	}
+	
+	@Test
+	public void testMlogregBinDenseSP() {
+		runMlogregTest(TEST_NAME1, 2, false, false, ExecType.SPARK);
+	}
+	
+	@Test
+	public void testMlogregBinSparseSP() {
+		runMlogregTest(TEST_NAME1, 2, false, true, ExecType.SPARK);
+	}
+	
+	@Test
+	public void testMlogregMulDenseRewritesSP() {
+		runMlogregTest(TEST_NAME1, 5, true, false, ExecType.SPARK);
+	}
+	
+	@Test
+	public void testMlogregMulSparseRewritesSP() {
+		runMlogregTest(TEST_NAME1, 5, true, true, ExecType.SPARK);
+	}
+	
+	@Test
+	public void testMlogregMulDenseSP() {
+		runMlogregTest(TEST_NAME1, 5, false, false, ExecType.SPARK);
+	}
+	
+	@Test
+	public void testMlogregMulSparseSP() {
+		runMlogregTest(TEST_NAME1, 5, false, true, ExecType.SPARK);
+	}
+	/**
+	 * Executes the DML script and the R script on generated data and compares output "w".
+	 * @param testname test configuration name, also the DML script name without ".dml"
+	 * @param classes label bound: Y is generated with classes+0.49 and then rounded
+	 * @param rewrites value of ALLOW_ALGEBRAIC_SIMPLIFICATION while the test runs
+	 * @param sparse true generates X with sparsity2, false with sparsity1
+	 * @param instType execution type that selects the runtime platform
+	 */
+	private void runMlogregTest( String testname, int classes, boolean rewrites, boolean sparse, ExecType instType)
+	{
+		final boolean rewritesOld = OptimizerUtils.ALLOW_ALGEBRAIC_SIMPLIFICATION; //restored in finally
+		final RUNTIME_PLATFORM platformOld = rtplatform;
+		final boolean sparkConfigOld = DMLScript.USE_LOCAL_SPARK_CONFIG;
+		switch( instType ){
+			case MR: rtplatform = RUNTIME_PLATFORM.HADOOP; break;
+			case SPARK: rtplatform = RUNTIME_PLATFORM.SPARK; break;
+			default: rtplatform = RUNTIME_PLATFORM.HYBRID_SPARK; break;
+		}
+		if( rtplatform == RUNTIME_PLATFORM.SPARK || rtplatform == RUNTIME_PLATFORM.HYBRID_SPARK )
+			DMLScript.USE_LOCAL_SPARK_CONFIG = true;
+
+		try {
+			loadTestConfiguration(getTestConfiguration(testname));
+			final String scriptDir = SCRIPT_DIR + TEST_DIR;
+			final String icpt = String.valueOf(intercept);
+			final String tol = String.valueOf(epsilon);
+			final String moi = String.valueOf(maxiter);
+			fullDMLScriptName = scriptDir + testname + ".dml";
+			programArgs = new String[]{ "-explain", "-stats",
+				"-config=" + scriptDir + TEST_CONF, "-args", input("X"), input("Y"),
+				icpt, tol, moi, output("w")};
+			rCmd = getRCmd(inputDir(), icpt, tol, moi, expectedDir());
+			OptimizerUtils.ALLOW_ALGEBRAIC_SIMPLIFICATION = rewrites;
+			//generate actual datasets
+			final double sparsity = sparse ? sparsity2 : sparsity1;
+			double[][] X = getRandomMatrix(rows, cols, 0, 1, sparsity, 2384);
+			writeInputMatrixWithMTD("X", X, true);
+			double[][] Y = TestUtils.round(getRandomMatrix(rows, 1, 0.51, classes+0.49, 1.0, 9283));
+			writeInputMatrixWithMTD("Y", Y, true);
+
+			runTest(true, false, null, -1);
+			runRScript(true);
+
+			//compare the DML result against the R result
+			HashMap<CellIndex, Double> wDml = readDMLMatrixFromHDFS("w");
+			HashMap<CellIndex, Double> wR = readRMatrixFromFS("w");
+			TestUtils.compareMatrices(wDml, wR, eps, "Stat-DML", "Stat-R");
+			Assert.assertTrue(heavyHittersContainsSubString("spoof") || heavyHittersContainsSubString("sp_spoof"));
+		}
+		finally {
+			rtplatform = platformOld;
+			DMLScript.USE_LOCAL_SPARK_CONFIG = sparkConfigOld;
+			OptimizerUtils.ALLOW_ALGEBRAIC_SIMPLIFICATION = rewritesOld;
+			OptimizerUtils.ALLOW_AUTO_VECTORIZATION = true;
+			OptimizerUtils.ALLOW_OPERATOR_FUSION = true;
+		}
+	}
+}

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/bbefe96b/src/test/java/org/apache/sysml/test/integration/functions/codegen/AlgorithmMSVM.java
----------------------------------------------------------------------
diff --git a/src/test/java/org/apache/sysml/test/integration/functions/codegen/AlgorithmMSVM.java b/src/test/java/org/apache/sysml/test/integration/functions/codegen/AlgorithmMSVM.java
new file mode 100644
index 0000000..047ceb0
--- /dev/null
+++ b/src/test/java/org/apache/sysml/test/integration/functions/codegen/AlgorithmMSVM.java
@@ -0,0 +1,157 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ * 
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.sysml.test.integration.functions.codegen;
+
+import java.util.HashMap;
+
+import org.junit.Assert;
+import org.junit.Test;
+
+import org.apache.sysml.api.DMLScript;
+import org.apache.sysml.api.DMLScript.RUNTIME_PLATFORM;
+import org.apache.sysml.hops.OptimizerUtils;
+import org.apache.sysml.lops.LopProperties.ExecType;
+import org.apache.sysml.runtime.matrix.data.MatrixValue.CellIndex;
+import org.apache.sysml.test.integration.AutomatedTestBase;
+import org.apache.sysml.test.integration.TestConfiguration;
+import org.apache.sysml.test.utils.TestUtils;
+
+public class AlgorithmMSVM extends AutomatedTestBase 
+{	
+	private final static String TEST_NAME1 = "Algorithm_MSVM";
+	private final static String TEST_DIR = "functions/codegen/";
+	private final static String TEST_CLASS_DIR = TEST_DIR + AlgorithmMSVM.class.getSimpleName() + "/";
+	private final static String TEST_CONF = "SystemML-config-codegen.xml";
+	
+	private final static double eps = 1e-5;
+	
+	private final static int rows = 1468;
+	private final static int cols = 1007;
+		
+	private final static double sparsity1 = 0.7; //dense
+	private final static double sparsity2 = 0.1; //sparse
+	
+	private final static int intercept = 0;
+	private final static double epsilon = 0.000000001;
+	private final static double maxiter = 10;
+	
+	@Override
+	public void setUp() {
+		TestUtils.clearAssertionInformation();
+		addTestConfiguration(TEST_NAME1, new TestConfiguration(TEST_CLASS_DIR, TEST_NAME1, new String[] { "w" })); 
+	}
+
+	@Test
+	public void testMSVMDenseBinRewritesCP() {
+		runMSVMTest(TEST_NAME1, true, false, 2, ExecType.CP);
+	}
+	
+	@Test
+	public void testMSVMSparseBinRewritesCP() {
+		runMSVMTest(TEST_NAME1, true, true, 2, ExecType.CP);
+	}
+	
+	@Test
+	public void testMSVMDenseBinCP() {
+		runMSVMTest(TEST_NAME1, false, false, 2, ExecType.CP);
+	}
+	
+	@Test
+	public void testMSVMSparseBinCP() {
+		runMSVMTest(TEST_NAME1, false, true, 2, ExecType.CP);
+	}
+	
+	@Test
+	public void testMSVMDenseMulRewritesCP() {
+		runMSVMTest(TEST_NAME1, true, false, 4, ExecType.CP);
+	}
+	
+	@Test
+	public void testMSVMSparseMulRewritesCP() {
+		runMSVMTest(TEST_NAME1, true, true, 4, ExecType.CP);
+	}
+	
+	@Test
+	public void testMSVMDenseMulCP() {
+		runMSVMTest(TEST_NAME1, false, false, 4, ExecType.CP);
+	}
+	
+	@Test
+	public void testMSVMSparseMulCP() {
+		runMSVMTest(TEST_NAME1, false, true, 4, ExecType.CP);
+	}
+	
+	private void runMSVMTest( String testname, boolean rewrites, boolean sparse, int numClasses, ExecType instType)
+	{
+		boolean oldFlag = OptimizerUtils.ALLOW_ALGEBRAIC_SIMPLIFICATION;
+		RUNTIME_PLATFORM platformOld = rtplatform;
+		switch( instType ){
+			case MR: rtplatform = RUNTIME_PLATFORM.HADOOP; break;
+			case SPARK: rtplatform = RUNTIME_PLATFORM.SPARK; break;
+			default: rtplatform = RUNTIME_PLATFORM.HYBRID_SPARK; break;
+		}
+	
+		boolean sparkConfigOld = DMLScript.USE_LOCAL_SPARK_CONFIG;
+		if( rtplatform == RUNTIME_PLATFORM.SPARK || rtplatform == RUNTIME_PLATFORM.HYBRID_SPARK )
+			DMLScript.USE_LOCAL_SPARK_CONFIG = true;
+
+		try
+		{
+			String TEST_NAME = testname;
+			TestConfiguration config = getTestConfiguration(TEST_NAME);
+			loadTestConfiguration(config);
+			
+			/* This is for running the junit test the new way, i.e., construct the arguments directly */
+			String HOME = SCRIPT_DIR + TEST_DIR;
+			fullDMLScriptName = HOME + TEST_NAME + ".dml";
+			programArgs = new String[]{ "-explain", "-stats",
+				"-config=" + HOME + TEST_CONF, "-args", input("X"), input("Y"),
+				String.valueOf(intercept), String.valueOf(epsilon),
+				String.valueOf(maxiter), output("w")};
+
+			rCmd = getRCmd(inputDir(), String.valueOf(intercept),String.valueOf(epsilon),
+				String.valueOf(maxiter), expectedDir());
+
+			OptimizerUtils.ALLOW_ALGEBRAIC_SIMPLIFICATION = rewrites;
+			
+			//generate actual datasets
+			double[][] X = getRandomMatrix(rows, cols, 0, 1, sparse?sparsity2:sparsity1, 714);
+			writeInputMatrixWithMTD("X", X, true);
+			double[][] y = TestUtils.round(getRandomMatrix(rows, 1, 1, numClasses, 1.0, 136));
+			writeInputMatrixWithMTD("Y", y, true);
+			
+			runTest(true, false, null, -1); 
+			runRScript(true); 
+			
+			//compare matrices 
+			HashMap<CellIndex, Double> dmlfile = readDMLMatrixFromHDFS("w");
+			HashMap<CellIndex, Double> rfile  = readRMatrixFromFS("w");
+			TestUtils.compareMatrices(dmlfile, rfile, eps, "Stat-DML", "Stat-R");
+			Assert.assertTrue(heavyHittersContainsSubString("spoof") || heavyHittersContainsSubString("sp_spoof"));
+		}
+		finally {
+			rtplatform = platformOld;
+			DMLScript.USE_LOCAL_SPARK_CONFIG = sparkConfigOld;
+			OptimizerUtils.ALLOW_ALGEBRAIC_SIMPLIFICATION = oldFlag;
+			OptimizerUtils.ALLOW_AUTO_VECTORIZATION = true;
+			OptimizerUtils.ALLOW_OPERATOR_FUSION = true;
+		}
+	}
+}

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/bbefe96b/src/test/java/org/apache/sysml/test/integration/functions/codegen/AlgorithmPNMF.java
----------------------------------------------------------------------
diff --git a/src/test/java/org/apache/sysml/test/integration/functions/codegen/AlgorithmPNMF.java b/src/test/java/org/apache/sysml/test/integration/functions/codegen/AlgorithmPNMF.java
new file mode 100644
index 0000000..5d7e654
--- /dev/null
+++ b/src/test/java/org/apache/sysml/test/integration/functions/codegen/AlgorithmPNMF.java
@@ -0,0 +1,142 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ * 
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.sysml.test.integration.functions.codegen;
+
+import java.util.HashMap;
+
+import org.junit.Assert;
+import org.junit.Test;
+
+import org.apache.sysml.api.DMLScript;
+import org.apache.sysml.api.DMLScript.RUNTIME_PLATFORM;
+import org.apache.sysml.hops.OptimizerUtils;
+import org.apache.sysml.lops.LopProperties.ExecType;
+import org.apache.sysml.runtime.matrix.data.MatrixValue.CellIndex;
+import org.apache.sysml.test.integration.AutomatedTestBase;
+import org.apache.sysml.test.integration.TestConfiguration;
+import org.apache.sysml.test.utils.TestUtils;
+
+/**
+ * Codegen integration test that runs Algorithm_PNMF.dml and an R script on generated
+ * inputs X, W and H, compares the resulting "W" and "H" matrices, and asserts that the
+ * heavy hitters contain the substring "spoof" or "sp_spoof".
+ */
+public class AlgorithmPNMF extends AutomatedTestBase {
+	private final static String TEST_NAME1 = "Algorithm_PNMF";
+	private final static String TEST_DIR = "functions/codegen/";
+	private final static String TEST_CLASS_DIR = TEST_DIR + AlgorithmPNMF.class.getSimpleName() + "/";
+	private final static String TEST_CONF = "SystemML-config-codegen.xml";
+	private final static double eps = 1e-5; //tolerance for comparing DML and R results
+	private final static int rows = 1468;
+	private final static int cols = 1207;
+	private final static int rank = 20;
+	private final static double sparsity1 = 0.7; //dense
+	private final static double sparsity2 = 0.1; //sparse
+	private final static double epsilon = 0.000000001;
+	private final static double maxiter = 10;
+
+	@Override
+	public void setUp() {
+		TestUtils.clearAssertionInformation();
+		addTestConfiguration(TEST_NAME1, new TestConfiguration(TEST_CLASS_DIR, TEST_NAME1, new String[] { "w" })); 
+	}
+
+	@Test
+	public void testPNMFDenseCP() {
+		runPNMFTest(TEST_NAME1, false, false, ExecType.CP);
+	}
+	
+	@Test
+	public void testPNMFSparseCP() {
+		runPNMFTest(TEST_NAME1, false, true, ExecType.CP);
+	}
+	
+	@Test
+	public void testPNMFDenseSP() {
+		runPNMFTest(TEST_NAME1, false, false, ExecType.SPARK);
+	}
+	
+	@Test
+	public void testPNMFSparseSP() {
+		runPNMFTest(TEST_NAME1, false, true, ExecType.SPARK);
+	}
+	/**
+	 * Executes the DML and R scripts on generated X, W, H and compares outputs "W" and "H".
+	 * @param testname test configuration name, also the DML script name without ".dml"
+	 * @param rewrites value of ALLOW_ALGEBRAIC_SIMPLIFICATION while the test runs
+	 * @param sparse true generates X with sparsity2, false with sparsity1
+	 * @param instType execution type that selects the runtime platform
+	 */
+	private void runPNMFTest( String testname, boolean rewrites, boolean sparse, ExecType instType)
+	{
+		final boolean rewritesOld = OptimizerUtils.ALLOW_ALGEBRAIC_SIMPLIFICATION; //restored in finally
+		final RUNTIME_PLATFORM platformOld = rtplatform;
+		final boolean sparkConfigOld = DMLScript.USE_LOCAL_SPARK_CONFIG;
+		switch( instType ){
+			case MR: rtplatform = RUNTIME_PLATFORM.HADOOP; break;
+			case SPARK: rtplatform = RUNTIME_PLATFORM.SPARK; break;
+			default: rtplatform = RUNTIME_PLATFORM.HYBRID_SPARK; break;
+		}
+		if( rtplatform == RUNTIME_PLATFORM.SPARK || rtplatform == RUNTIME_PLATFORM.HYBRID_SPARK )
+			DMLScript.USE_LOCAL_SPARK_CONFIG = true;
+
+		try {
+			loadTestConfiguration(getTestConfiguration(testname));
+			final String scriptDir = SCRIPT_DIR + TEST_DIR;
+			final String rnk = String.valueOf(rank);
+			final String tol = String.valueOf(epsilon);
+			final String moi = String.valueOf(maxiter);
+			fullDMLScriptName = scriptDir + testname + ".dml";
+			programArgs = new String[]{ "-explain", "-stats",
+				"-config=" + scriptDir + TEST_CONF, "-args", input("X"), input("W"), input("H"),
+				rnk, tol, moi, output("W"), output("H")};
+			rCmd = getRCmd(inputDir(), rnk, tol, moi, expectedDir());
+			OptimizerUtils.ALLOW_ALGEBRAIC_SIMPLIFICATION = rewrites;
+
+			//generate actual datasets
+			final double sparsity = sparse ? sparsity2 : sparsity1;
+			double[][] X = getRandomMatrix(rows, cols, 0, 1, sparsity, 234);
+			writeInputMatrixWithMTD("X", X, true);
+			double[][] W = getRandomMatrix(rows, rank, 0, 0.025, 1.0, 3);
+			writeInputMatrixWithMTD("W", W, true);
+			double[][] H = getRandomMatrix(rank, cols, 0, 0.025, 1.0, 7);
+			writeInputMatrixWithMTD("H", H, true);
+
+			runTest(true, false, null, -1);
+			runRScript(true);
+
+			//compare both factor matrices against the R results
+			HashMap<CellIndex, Double> wDml = readDMLMatrixFromHDFS("W");
+			HashMap<CellIndex, Double> hDml = readDMLMatrixFromHDFS("H");
+			HashMap<CellIndex, Double> wR = readRMatrixFromFS("W");
+			HashMap<CellIndex, Double> hR = readRMatrixFromFS("H");
+			TestUtils.compareMatrices(wDml, wR, eps, "Stat-DML", "Stat-R");
+			TestUtils.compareMatrices(hDml, hR, eps, "Stat-DML", "Stat-R");
+			Assert.assertTrue(heavyHittersContainsSubString("spoof") || heavyHittersContainsSubString("sp_spoof"));
+		}
+		finally {
+			rtplatform = platformOld;
+			DMLScript.USE_LOCAL_SPARK_CONFIG = sparkConfigOld;
+			OptimizerUtils.ALLOW_ALGEBRAIC_SIMPLIFICATION = rewritesOld;
+			OptimizerUtils.ALLOW_AUTO_VECTORIZATION = true;
+			OptimizerUtils.ALLOW_OPERATOR_FUSION = true;
+		}
+	}
+}

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/bbefe96b/src/test/java/org/apache/sysml/test/integration/functions/codegen/CellwiseTmplTest.java
----------------------------------------------------------------------
diff --git a/src/test/java/org/apache/sysml/test/integration/functions/codegen/CellwiseTmplTest.java b/src/test/java/org/apache/sysml/test/integration/functions/codegen/CellwiseTmplTest.java
new file mode 100644
index 0000000..6313412
--- /dev/null
+++ b/src/test/java/org/apache/sysml/test/integration/functions/codegen/CellwiseTmplTest.java
@@ -0,0 +1,183 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ * 
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.sysml.test.integration.functions.codegen;
+
+import java.util.HashMap;
+
+import org.junit.Assert;
+import org.junit.Test;
+import org.apache.sysml.api.DMLScript;
+import org.apache.sysml.api.DMLScript.RUNTIME_PLATFORM;
+import org.apache.sysml.hops.OptimizerUtils;
+import org.apache.sysml.lops.LopProperties.ExecType;
+import org.apache.sysml.runtime.matrix.data.MatrixValue.CellIndex;
+import org.apache.sysml.test.integration.AutomatedTestBase;
+import org.apache.sysml.test.integration.TestConfiguration;
+import org.apache.sysml.test.utils.TestUtils;
+
+/**
+ * Codegen integration tests that run the cellwisetmpl1..6 DML scripts and their R scripts,
+ * compare output "S", and (except where noted) assert a "spoof" entry in the heavy hitters.
+ */
+public class CellwiseTmplTest extends AutomatedTestBase {
+	private static final String TEST_NAME1 = "cellwisetmpl1";
+	private static final String TEST_NAME2 = "cellwisetmpl2";
+	private static final String TEST_NAME3 = "cellwisetmpl3";
+	private static final String TEST_NAME4 = "cellwisetmpl4";
+	private static final String TEST_NAME5 = "cellwisetmpl5";
+	private static final String TEST_NAME6 = "cellwisetmpl6"; //sum
+	private static final String TEST_DIR = "functions/codegen/";
+	private static final String TEST_CLASS_DIR = TEST_DIR + CellwiseTmplTest.class.getSimpleName() + "/";
+	private final static String TEST_CONF = "SystemML-config-codegen.xml";
+	private static final double eps = Math.pow(10, -10); //tolerance for matrix comparison
+
+	@Override
+	public void setUp() {
+		TestUtils.clearAssertionInformation();
+		addTestConfiguration( TEST_NAME1, new TestConfiguration(TEST_CLASS_DIR, TEST_NAME1, new String[] { "1" }) );
+		addTestConfiguration( TEST_NAME2, new TestConfiguration(TEST_CLASS_DIR, TEST_NAME2, new String[] { "2" }) );
+		addTestConfiguration( TEST_NAME3, new TestConfiguration(TEST_CLASS_DIR, TEST_NAME3, new String[] { "3" }) );
+		addTestConfiguration( TEST_NAME4, new TestConfiguration(TEST_CLASS_DIR, TEST_NAME4, new String[] { "4" }) );
+		addTestConfiguration( TEST_NAME5, new TestConfiguration(TEST_CLASS_DIR, TEST_NAME5, new String[] { "5" }) );
+		addTestConfiguration( TEST_NAME6, new TestConfiguration(TEST_CLASS_DIR, TEST_NAME6, new String[] { "6" }) );
+	}
+
+	@Test
+	public void testCodegenCellwiseRewrite1() {
+		testCodegenIntegration( TEST_NAME1, true, ExecType.CP );
+	}
+
+	@Test
+	public void testCodegenCellwiseRewrite2() {
+		testCodegenIntegration( TEST_NAME2, true, ExecType.CP  );
+	}
+
+	@Test
+	public void testCodegenCellwiseRewrite3() {
+		testCodegenIntegration( TEST_NAME3, true, ExecType.CP  );
+	}
+
+	@Test
+	public void testCodegenCellwiseRewrite4() {
+		testCodegenIntegration( TEST_NAME4, true, ExecType.CP  );
+	}
+
+	@Test
+	public void testCodegenCellwiseRewrite5() {
+		testCodegenIntegration( TEST_NAME5, true, ExecType.CP  );
+	}
+
+	@Test
+	public void testCodegenCellwiseRewrite6() {
+		testCodegenIntegration( TEST_NAME6, true, ExecType.CP  );
+	}
+
+	@Test
+	public void testCodegenCellwise1() {
+		testCodegenIntegration( TEST_NAME1, false, ExecType.CP );
+	}
+
+	@Test
+	public void testCodegenCellwise2() {
+		testCodegenIntegration( TEST_NAME2, false, ExecType.CP  );
+	}
+
+	@Test
+	public void testCodegenCellwise3() {
+		testCodegenIntegration( TEST_NAME3, false, ExecType.CP  );
+	}
+
+	@Test
+	public void testCodegenCellwise4() {
+		testCodegenIntegration( TEST_NAME4, false, ExecType.CP  );
+	}
+
+	@Test
+	public void testCodegenCellwise5() {
+		testCodegenIntegration( TEST_NAME5, false, ExecType.CP  );
+	}
+
+	@Test
+	public void testCodegenCellwise6() {
+		testCodegenIntegration( TEST_NAME6, false, ExecType.CP  );
+	}
+
+	@Test
+	public void testCodegenCellwiseRewrite1_sp() {
+		testCodegenIntegration( TEST_NAME1, true, ExecType.SPARK );
+	}
+
+	/**
+	 * Runs testname.dml and testname.R and compares output "S"; the runtime platform, the
+	 * local spark config flag and the rewrite flag are restored afterwards.
+	 * @param testname test configuration name, also the base name of the DML and R scripts
+	 * @param rewrites value of ALLOW_ALGEBRAIC_SIMPLIFICATION while the test runs
+	 * @param instType execution type that selects the runtime platform
+	 */
+	private void testCodegenIntegration( String testname, boolean rewrites, ExecType instType )
+	{
+		boolean oldRewrites = OptimizerUtils.ALLOW_ALGEBRAIC_SIMPLIFICATION;
+		RUNTIME_PLATFORM platformOld = rtplatform; //saved so the switch below does not leak
+		boolean sparkConfigOld = DMLScript.USE_LOCAL_SPARK_CONFIG;
+		switch( instType ){
+			case MR: rtplatform = RUNTIME_PLATFORM.HADOOP; break;
+			case SPARK: rtplatform = RUNTIME_PLATFORM.SPARK; break;
+			default: rtplatform = RUNTIME_PLATFORM.HYBRID; break;
+		}
+		if( rtplatform == RUNTIME_PLATFORM.SPARK )
+			DMLScript.USE_LOCAL_SPARK_CONFIG = true;
+
+		try {
+			loadTestConfiguration(getTestConfiguration(testname));
+			String HOME = SCRIPT_DIR + TEST_DIR;
+			fullDMLScriptName = HOME + testname + ".dml";
+			programArgs = new String[]{"-explain", "runtime", "-stats", 
+					"-config=" + HOME + TEST_CONF, "-args", output("S") };
+			fullRScriptName = HOME + testname + ".R";
+			rCmd = getRCmd(inputDir(), expectedDir());
+			OptimizerUtils.ALLOW_ALGEBRAIC_SIMPLIFICATION = rewrites;
+
+			runTest(true, false, null, -1);
+			runRScript(true);
+
+			if(testname.equals(TEST_NAME6)) { //tak+
+				//compare scalars (exact match, tolerance 0); no heavy hitter check here
+				HashMap<CellIndex, Double> dmlfile = readDMLScalarFromHDFS("S");
+				HashMap<CellIndex, Double> rfile  = readRScalarFromFS("S");
+				TestUtils.compareScalars((Double) dmlfile.values().toArray()[0], (Double) rfile.values().toArray()[0],0);
+			}
+			else {
+				//compare matrices 
+				HashMap<CellIndex, Double> dmlfile = readDMLMatrixFromHDFS("S");
+				HashMap<CellIndex, Double> rfile  = readRMatrixFromFS("S");	
+				TestUtils.compareMatrices(dmlfile, rfile, eps, "Stat-DML", "Stat-R");
+				if( !(rewrites && testname.equals(TEST_NAME2)) ) //sigmoid
+					Assert.assertTrue(heavyHittersContainsSubString("spoof") || heavyHittersContainsSubString("sp_spoof"));
+			}
+		}
+		finally {
+			rtplatform = platformOld;
+			DMLScript.USE_LOCAL_SPARK_CONFIG = sparkConfigOld;
+			OptimizerUtils.ALLOW_ALGEBRAIC_SIMPLIFICATION = oldRewrites;
+			OptimizerUtils.ALLOW_AUTO_VECTORIZATION = true;
+			OptimizerUtils.ALLOW_OPERATOR_FUSION = true;
+		}
+	}
+}

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/bbefe96b/src/test/java/org/apache/sysml/test/integration/functions/codegen/DAGCellwiseTmplTest.java
----------------------------------------------------------------------
diff --git a/src/test/java/org/apache/sysml/test/integration/functions/codegen/DAGCellwiseTmplTest.java b/src/test/java/org/apache/sysml/test/integration/functions/codegen/DAGCellwiseTmplTest.java
new file mode 100644
index 0000000..65be916
--- /dev/null
+++ b/src/test/java/org/apache/sysml/test/integration/functions/codegen/DAGCellwiseTmplTest.java
@@ -0,0 +1,161 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ * 
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.sysml.test.integration.functions.codegen;
+
+import java.util.HashMap;
+
+import org.junit.Assert;
+import org.junit.Test;
+import org.apache.sysml.api.DMLScript;
+import org.apache.sysml.api.DMLScript.RUNTIME_PLATFORM;
+import org.apache.sysml.hops.OptimizerUtils;
+import org.apache.sysml.lops.LopProperties.ExecType;
+import org.apache.sysml.runtime.matrix.data.MatrixValue.CellIndex;
+import org.apache.sysml.test.integration.AutomatedTestBase;
+import org.apache.sysml.test.integration.TestConfiguration;
+import org.apache.sysml.test.utils.TestUtils;
+
+/**
+ * Codegen integration tests for the DAGcellwisetmpl1..3 scripts, run with 50 columns or 1.
+ */
+public class DAGCellwiseTmplTest extends AutomatedTestBase {
+	private static final String TEST_NAME1 = "DAGcellwisetmpl1";
+	private static final String TEST_NAME2 = "DAGcellwisetmpl2";
+	private static final String TEST_NAME3 = "DAGcellwisetmpl3";
+	private static final String TEST_DIR = "functions/codegen/";
+	private static final String TEST_CLASS_DIR = TEST_DIR + DAGCellwiseTmplTest.class.getSimpleName() + "/";
+	private final static String TEST_CONF = "SystemML-config-codegen.xml";
+	private static final double eps = Math.pow(10, -10); //tolerance for matrix comparison
+
+	@Override
+	public void setUp() {
+		TestUtils.clearAssertionInformation();
+		addTestConfiguration( TEST_NAME1, new TestConfiguration(TEST_CLASS_DIR, TEST_NAME1, new String[] { "S" }) );
+		addTestConfiguration( TEST_NAME2, new TestConfiguration(TEST_CLASS_DIR, TEST_NAME2, new String[] { "S" }) );
+		addTestConfiguration( TEST_NAME3, new TestConfiguration(TEST_CLASS_DIR, TEST_NAME3, new String[] { "S" }) );
+	}
+
+	@Test
+	public void testDAGMatrixCellwiseRewrite1() {
+		testCodegenIntegration( TEST_NAME1, true, false, ExecType.CP );
+	}
+
+	@Test
+	public void testDAGMatrixCellwiseRewrite2() {
+		testCodegenIntegration( TEST_NAME2, true, false, ExecType.CP  );
+	}
+
+	@Test
+	public void testDAGMatrixCellwiseRewrite3() {
+		testCodegenIntegration( TEST_NAME3, true, false, ExecType.CP  );
+	}
+
+	@Test
+	public void testDAGMatrixCellwise1() {
+		testCodegenIntegration( TEST_NAME1, false, false, ExecType.CP );
+	}
+
+	@Test
+	public void testDAGMatrixCellwise2() {
+		testCodegenIntegration( TEST_NAME2, false, false, ExecType.CP  );
+	}
+
+	@Test
+	public void testDAGMatrixCellwise3() {
+		testCodegenIntegration( TEST_NAME3, false, false, ExecType.CP  );
+	}
+
+	@Test
+	public void testDAGVectorCellwiseRewrite1() {
+		testCodegenIntegration( TEST_NAME1, true, true, ExecType.CP );
+	}
+
+	@Test
+	public void testDAGVectorCellwiseRewrite2() {
+		testCodegenIntegration( TEST_NAME2, true, true, ExecType.CP  );
+	}
+
+	@Test
+	public void testDAGVectorCellwiseRewrite3() {
+		testCodegenIntegration( TEST_NAME3, true, true, ExecType.CP  );
+	}
+
+	@Test
+	public void testDAGVectorCellwise1() {
+		testCodegenIntegration( TEST_NAME1, false, true, ExecType.CP );
+	}
+
+	@Test
+	public void testDAGVectorCellwise2() {
+		testCodegenIntegration( TEST_NAME2, false, true, ExecType.CP  );
+	}
+
+	@Test
+	public void testDAGVectorCellwise3() {
+		testCodegenIntegration( TEST_NAME3, false, true, ExecType.CP  );
+	}
+
+	/**
+	 * Runs testname.dml and testname.R, compares output "S"; restores global settings after.
+	 * @param testname test configuration name, also the base name of the DML and R scripts
+	 * @param rewrites value of ALLOW_ALGEBRAIC_SIMPLIFICATION while the test runs
+	 * @param vector true passes 1 as column count to both scripts, false passes 50
+	 * @param instType execution type that selects the runtime platform
+	 */
+	private void testCodegenIntegration( String testname, boolean rewrites, boolean vector, ExecType instType )
+	{
+		boolean oldRewrites = OptimizerUtils.ALLOW_ALGEBRAIC_SIMPLIFICATION;
+		RUNTIME_PLATFORM platformOld = rtplatform; //saved so the switch below does not leak
+		boolean sparkConfigOld = DMLScript.USE_LOCAL_SPARK_CONFIG;
+		switch( instType ){
+			case MR: rtplatform = RUNTIME_PLATFORM.HADOOP; break;
+			case SPARK: rtplatform = RUNTIME_PLATFORM.SPARK; break;
+			default: rtplatform = RUNTIME_PLATFORM.HYBRID; break;
+		}
+		if( rtplatform == RUNTIME_PLATFORM.SPARK )
+			DMLScript.USE_LOCAL_SPARK_CONFIG = true;
+
+		try {
+			loadTestConfiguration(getTestConfiguration(testname));
+			int cols = vector ? 1 : 50;
+			String HOME = SCRIPT_DIR + TEST_DIR;
+			fullDMLScriptName = HOME + testname + ".dml";
+			programArgs = new String[]{"-explain", "runtime", "-stats", 
+					"-config=" + HOME + TEST_CONF, "-args", String.valueOf(cols), output("S") };
+			fullRScriptName = HOME + testname + ".R";
+			rCmd = getRCmd(String.valueOf(cols), expectedDir());
+			OptimizerUtils.ALLOW_ALGEBRAIC_SIMPLIFICATION = rewrites;
+			runTest(true, false, null, -1);
+			runRScript(true);
+			//compare matrices 
+			HashMap<CellIndex, Double> dmlfile = readDMLMatrixFromHDFS("S");
+			HashMap<CellIndex, Double> rfile  = readRMatrixFromFS("S");	
+			TestUtils.compareMatrices(dmlfile, rfile, eps, "Stat-DML", "Stat-R");
+			Assert.assertTrue(heavyHittersContainsSubString("spoof") || heavyHittersContainsSubString("sp_spoof"));
+		}
+		finally {
+			rtplatform = platformOld;
+			DMLScript.USE_LOCAL_SPARK_CONFIG = sparkConfigOld;
+			OptimizerUtils.ALLOW_ALGEBRAIC_SIMPLIFICATION = oldRewrites;
+			OptimizerUtils.ALLOW_AUTO_VECTORIZATION = true;
+			OptimizerUtils.ALLOW_OPERATOR_FUSION = true;
+		}
+	}
+}

[7/9] incubator-systemml git commit: [SYSTEMML-1286] Code generator compiler integration, incl tests

Posted by mb...@apache.org.

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/bbefe96b/src/test/scripts/functions/codegen/Algorithm_GLM.dml
----------------------------------------------------------------------
diff --git a/src/test/scripts/functions/codegen/Algorithm_GLM.dml b/src/test/scripts/functions/codegen/Algorithm_GLM.dml
new file mode 100644
index 0000000..d8eb966
--- /dev/null
+++ b/src/test/scripts/functions/codegen/Algorithm_GLM.dml
@@ -0,0 +1,1053 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+# 
+#   http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+# Script inputs (positional): $1 = feature matrix X, $2 = response matrix Y;
+# $3..$9 are scalar settings read below.
+X = read ($1);
+Y = read ($2);
+
+# fileLog is fixed to " ", so every 'if (fileLog != " ")' logging branch further down is skipped.
+# NOTE(review): fileO appears to be unused -- confirm before removing.
+fileO = " ";
+fileLog = " ";
+
+intercept_status = $3
+eps = $4
+# The outer IRLS loop and the inner CG solver are given the same limit ($5).
+max_iteration_IRLS = $5;
+max_iteration_CG = $5;
+
+distribution_type = $6;
+variance_as_power_of_the_mean = $7;
+link_type = $8; 
+
+# $9 is taken as the link power when distribution_type != 1, otherwise as the Bernoulli "No" label.
+# NOTE(review): bernoulli_No_label is only read later under distribution_type == 2, i.e. in the
+# branch that fixes it to 0.0, so the value taken from $9 in the else-branch looks unused, and
+# the link power is pinned to 1.0 for distribution_type == 1 -- confirm against the test driver
+# that the two branches are not swapped.
+if( distribution_type != 1 ) {
+  link_as_power_of_the_mean = $9;
+  bernoulli_No_label = 0.0;
+} else {
+  link_as_power_of_the_mean = 1.0;
+  bernoulli_No_label = $9; 
+}
+
+# Settings hard-coded in this script rather than read from arguments.
+dispersion = 0.0;
+regularization = 0.001;
+
+
+# Convert the numeric parameters to double explicitly, whatever type they were given as.
+variance_as_power_of_the_mean = as.double (variance_as_power_of_the_mean);
+link_as_power_of_the_mean = as.double (link_as_power_of_the_mean);
+bernoulli_No_label = as.double (bernoulli_No_label);
+dispersion = as.double (dispersion);
+eps = as.double (eps);
+
+# Default values for output statistics:
+# (NaN, written as 0.0 / 0.0, marks a statistic that has not been computed.)
+
+termination_code     = 0;
+min_beta             = 0.0 / 0.0;
+i_min_beta           = 0.0 / 0.0;
+max_beta             = 0.0 / 0.0;
+i_max_beta           = 0.0 / 0.0;
+intercept_value      = 0.0 / 0.0;
+# "dispersion" is an input that was already assigned above, so it is NOT reset to NaN here:
+# with a NaN the later test "dispersion <= 0.0" is always false, the estimated dispersion is
+# never substituted, and the scaled deviance always comes out as NaN.
+estimated_dispersion = 0.0 / 0.0;
+deviance_nodisp      = 0.0 / 0.0;
+deviance             = 0.0 / 0.0;
+
+print("BEGIN GLM SCRIPT");
+
+num_records  = nrow (X);
+num_features = ncol (X);
+# Column vectors of length num_records filled with 0 and with 1.
+zeros_r = matrix (0, rows = num_records, cols = 1);
+ones_r = 1 + zeros_r;
+
+# Introduce the intercept, shift and rescale the columns of X if needed
+
+if (intercept_status == 1 | intercept_status == 2)  # add the intercept column
+{
+    X = append (X, ones_r);
+    num_features = ncol (X);
+}
+
+# scale_lambda is 1 for every feature and 0 for the intercept column (the last one, when present);
+# it is multiplied by the regularization constant further down.
+scale_lambda = matrix (1, rows = num_features, cols = 1);
+if (intercept_status == 1 | intercept_status == 2)
+{
+    scale_lambda [num_features, 1] = 0;
+}
+
+if (intercept_status == 2)  # scale-&-shift X columns to mean 0, variance 1
+{                           # Important assumption: X [, num_features] = ones_r
+    avg_X_cols = t(colSums(X)) / num_records;
+    var_X_cols = (t(colSums (X ^ 2)) - num_records * (avg_X_cols ^ 2)) / (num_records - 1);
+    # Columns with non-positive variance get a scale factor of 1 (the sqrt argument becomes 1).
+    is_unsafe = ppred (var_X_cols, 0.0, "<=");
+    scale_X = 1.0 / sqrt (var_X_cols * (1 - is_unsafe) + is_unsafe);
+    # The last (intercept) column is neither scaled nor shifted.
+    scale_X [num_features, 1] = 1;
+    shift_X = - avg_X_cols * scale_X;
+    shift_X [num_features, 1] = 0;
+    # Expansion of sum_j (X[i,j] * scale_X[j] + shift_X[j]) ^ 2 that never materializes the transformed X.
+    rowSums_X_sq = (X ^ 2) %*% (scale_X ^ 2) + X %*% (2 * scale_X * shift_X) + sum (shift_X ^ 2);
+} else {
+    # Identity transform: unit scale, zero shift.
+    scale_X = matrix (1, rows = num_features, cols = 1);
+    shift_X = matrix (0, rows = num_features, cols = 1);
+    rowSums_X_sq = rowSums (X ^ 2);
+}
+
+# Henceforth we replace "X" with "X %*% (SHIFT/SCALE TRANSFORM)" and rowSums(X ^ 2)
+# with "rowSums_X_sq" in order to preserve the sparsity of X under shift and scale.
+# The transform is then associatively applied to the other side of the expression,
+# and is rewritten via "scale_X" and "shift_X" as follows:
+#
+# ssX_A  = (SHIFT/SCALE TRANSFORM) %*% A    --- is rewritten as:
+# ssX_A  = diag (scale_X) %*% A;
+# ssX_A [num_features, ] = ssX_A [num_features, ] + t(shift_X) %*% A;
+#
+# tssX_A = t(SHIFT/SCALE TRANSFORM) %*% A   --- is rewritten as:
+# tssX_A = diag (scale_X) %*% A + shift_X %*% A [num_features, ];
+
+# Initialize other input-dependent parameters
+
+# Per-coefficient penalty weights: the regularization constant times scale_lambda.
+lambda = scale_lambda * regularization;
+# A CG iteration limit of 0 means "use the number of features".
+if (max_iteration_CG == 0) {
+    max_iteration_CG = num_features;
+}
+
+# In Bernoulli case, convert one-column "Y" into two-column
+
+if (distribution_type == 2 & ncol(Y) == 1)
+{
+    # Column 1 of the new Y flags the rows that differ from the "No" label, column 2 flags the "No" rows.
+    is_Y_negative = ppred (Y, bernoulli_No_label, "==");
+    Y = append (1 - is_Y_negative, is_Y_negative);
+    count_Y_negative = sum (is_Y_negative);
+    # Abort when only one of the two labels occurs in the data.
+    if (count_Y_negative == 0) {
+        stop ("GLM Input Error: all Y-values encode Bernoulli YES-label, none encode NO-label");
+    }
+    if (count_Y_negative == nrow(Y)) {
+        stop ("GLM Input Error: all Y-values encode Bernoulli NO-label, none encode YES-label");
+    }
+}
+
+# Resolve a request for the canonical link (link_type == 0) into a concrete link.
+# For the canonical link we have: Var(mu) * (d link / d mu) = const.
+
+if (link_type == 0) {
+    if (distribution_type == 1) {
+        # Power distribution: power link whose exponent is 1 minus the variance power.
+        link_type = 1;
+        link_as_power_of_the_mean = 1.0 - variance_as_power_of_the_mean;
+    } else {
+        if (distribution_type == 2) {
+            # Binomial/Bernoulli: link type 2 (reported as "Binomial.logit" by check_if_supported).
+            link_type = 2;
+        }
+    }
+}
+
+# For power distributions and/or links, we use two constants,
+# "variance_as_power_of_the_mean" and "link_as_power_of_the_mean",
+# to specify the variance and the link as arbitrary powers of the
+# mean.  However, the variance-powers of 1.0 (Poisson family) and
+# 2.0 (Gamma family) have to be treated as special cases, because
+# these values integrate into logarithms.  The link-power of 0.0
+# is also special as it represents the logarithm link.
+
+# Validate the (distribution, link) combination, initialize beta, then run the IRLS loop with a
+# trust region.  The two guards opened below ("is_supported == 1" and "isNaN == 0") are closed at
+# the very end of this section, where termination codes 3 and 4 are assigned.
+num_response_columns = ncol (Y);
+
+is_supported = check_if_supported (num_response_columns, distribution_type, variance_as_power_of_the_mean, link_type, link_as_power_of_the_mean);
+if (is_supported == 1)
+{
+
+#####   INITIALIZE THE BETAS   #####
+
+[beta, saturated_log_l, isNaN] = 
+    glm_initialize (X, Y, distribution_type, variance_as_power_of_the_mean, link_type, link_as_power_of_the_mean, intercept_status, max_iteration_CG);
+if (isNaN == 0)
+{
+
+#####  START OF THE MAIN PART  #####
+
+sum_X_sq = sum (rowSums_X_sq);
+# Initial trust-region radius.
+trust_delta = 0.5 * sqrt (num_features) / max (sqrt (rowSums_X_sq));
+###  max_trust_delta = trust_delta * 10000.0;
+log_l = 0.0;
+deviance_nodisp = 0.0;
+new_deviance_nodisp = 0.0;
+# Placeholder that is neither 0 nor 1; overwritten when the first point is accepted inside the loop.
+isNaN_log_l = 2;
+newbeta = beta;
+g = matrix (0.0, rows = num_features, cols = 1);
+g_norm = sqrt (sum ((g + lambda * beta) ^ 2));
+accept_new_beta = 1;
+reached_trust_boundary = 0;
+neg_log_l_change_predicted = 0.0;
+i_IRLS = 0;
+
+print ("BEGIN IRLS ITERATIONS...");
+
+# Linear predictor at the starting point, using the shift/scale rewrite described in the comments above.
+ssX_newbeta = diag (scale_X) %*% newbeta;
+ssX_newbeta [num_features, ] = ssX_newbeta [num_features, ] + t(shift_X) %*% newbeta;
+all_linear_terms = X %*% ssX_newbeta;
+
+[new_log_l, isNaN_new_log_l] = glm_log_likelihood_part
+    (all_linear_terms, Y, distribution_type, variance_as_power_of_the_mean, link_type, link_as_power_of_the_mean);
+
+# Deviance is taken before the penalty; the penalty is then subtracted from the log-likelihood.
+if (isNaN_new_log_l == 0) {
+    new_deviance_nodisp = 2.0 * (saturated_log_l - new_log_l);
+    new_log_l = new_log_l - 0.5 * sum (lambda * newbeta ^ 2);
+}
+
+if (fileLog != " ") {
+    log_str = "POINT_STEP_NORM," + i_IRLS + "," + sqrt (sum (beta ^ 2));
+    log_str = append (log_str, "OBJECTIVE," + i_IRLS + "," + (- new_log_l));
+    log_str = append (log_str, "LINEAR_TERM_MIN," + i_IRLS + "," + min (all_linear_terms));
+    log_str = append (log_str, "LINEAR_TERM_MAX," + i_IRLS + "," + max (all_linear_terms));
+} else {
+    log_str = " ";
+}
+
+# set w to avoid 'Initialization of w depends on if-else/while execution' warnings
+w = matrix (0.0, rows=1, cols=1);
+# Loop until termination_code becomes 1 (convergence test passed) or 2 (iteration limit reached).
+while (termination_code == 0)
+{
+    accept_new_beta = 1;
+    
+    if (i_IRLS > 0)
+    {
+        # When the current point has a valid log-likelihood, the candidate is rejected by default
+        # and only re-accepted by the rho test at the end of this branch.
+        if (isNaN_log_l == 0) {
+            accept_new_beta = 0;
+        }
+
+# Decide whether to accept a new iteration point and update the trust region
+# See Alg. 4.1 on p. 69 of "Numerical Optimization" 2nd ed. by Nocedal and Wright
+
+        # rho: observed change of the objective (-log_l) divided by the change returned by get_CG_Steihaug_point.
+        rho = (- new_log_l + log_l) / neg_log_l_change_predicted;
+        if (rho < 0.25 | isNaN_new_log_l == 1) {
+            trust_delta = 0.25 * trust_delta;
+        }
+        if (rho > 0.75 & isNaN_new_log_l == 0 & reached_trust_boundary == 1) {
+            trust_delta = 2 * trust_delta;
+            
+### if (trust_delta > max_trust_delta) {
+###     trust_delta = max_trust_delta;
+### }
+
+        }
+        if (rho > 0.1 & isNaN_new_log_l == 0) {
+            accept_new_beta = 1;
+        }
+    }
+
+    if (fileLog != " ") {
+        log_str = append (log_str, "IS_POINT_UPDATED," + i_IRLS + "," + accept_new_beta);
+        log_str = append (log_str, "TRUST_DELTA,"      + i_IRLS + "," + trust_delta);
+    }
+    if (accept_new_beta == 1)
+    {
+        # Adopt the candidate and recompute the gradient g and the weights w at the new point.
+        beta = newbeta;  log_l = new_log_l;  deviance_nodisp = new_deviance_nodisp;  isNaN_log_l = isNaN_new_log_l;
+        
+        [g_Y, w] = glm_dist (all_linear_terms, Y, distribution_type, variance_as_power_of_the_mean, link_type, link_as_power_of_the_mean);
+        
+        # We introduced these variables to avoid roundoff errors:
+        #     g_Y = y_residual / (y_var * link_grad);
+        #     w   = 1.0 / (y_var * link_grad * link_grad);
+                      
+        gXY = - t(X) %*% g_Y;
+        g = diag (scale_X) %*% gXY + shift_X %*% gXY [num_features, ];
+        g_norm = sqrt (sum ((g + lambda * beta) ^ 2));
+        
+        if (fileLog != " ") {
+            log_str = append (log_str, "GRADIENT_NORM," + i_IRLS + "," + g_norm);
+        }
+    }
+    
+    # Compute the next step z from the current beta (get_CG_Steihaug_point is defined elsewhere in this file).
+    [z, neg_log_l_change_predicted, num_CG_iters, reached_trust_boundary] = 
+        get_CG_Steihaug_point (X, scale_X, shift_X, w, g, beta, lambda, trust_delta, max_iteration_CG);
+
+    newbeta = beta + z;
+    
+    ssX_newbeta = diag (scale_X) %*% newbeta;
+    ssX_newbeta [num_features, ] = ssX_newbeta [num_features, ] + t(shift_X) %*% newbeta;
+    all_linear_terms = X %*% ssX_newbeta;
+    
+    [new_log_l, isNaN_new_log_l] = glm_log_likelihood_part
+        (all_linear_terms, Y, distribution_type, variance_as_power_of_the_mean, link_type, link_as_power_of_the_mean);
+
+    if (isNaN_new_log_l == 0) {
+        new_deviance_nodisp = 2.0 * (saturated_log_l - new_log_l);
+        new_log_l = new_log_l - 0.5 * sum (lambda * newbeta ^ 2);
+    }
+        
+    log_l_change = new_log_l - log_l;               # R's criterion for termination: |dev - devold|/(|dev| + 0.1) < eps
+
+    # Converged only if the step stayed inside the trust region, the new log-likelihood is valid,
+    # and the change is small relative to either the deviance or the log-likelihood magnitudes.
+    if (reached_trust_boundary == 0 & isNaN_new_log_l == 0 & 
+        (2.0 * abs (log_l_change) < eps * (deviance_nodisp + 0.1) | abs (log_l_change) < (abs (log_l) + abs (new_log_l)) * 0.00000000000001) )  
+    {
+        termination_code = 1;
+    }
+    rho = - log_l_change / neg_log_l_change_predicted;
+    z_norm = sqrt (sum (z * z));
+    
+    # round_to_print (defined elsewhere in this file) returns two values per number; they are
+    # printed below joined by "E".
+    [z_norm_m, z_norm_e] = round_to_print (z_norm);
+    [trust_delta_m, trust_delta_e] = round_to_print (trust_delta);
+    [rho_m, rho_e] = round_to_print (rho);
+    [new_log_l_m, new_log_l_e] = round_to_print (new_log_l);
+    [log_l_change_m, log_l_change_e] = round_to_print (log_l_change);
+    [g_norm_m, g_norm_e] = round_to_print (g_norm);
+
+    i_IRLS = i_IRLS + 1;
+    print ("Iter #" + i_IRLS + " completed"
+        + ", ||z|| = " + z_norm_m + "E" + z_norm_e
+        + ", trust_delta = " + trust_delta_m + "E" + trust_delta_e
+        + ", reached = " + reached_trust_boundary
+        + ", ||g|| = " + g_norm_m + "E" + g_norm_e
+        + ", new_log_l = " + new_log_l_m + "E" + new_log_l_e
+        + ", log_l_change = " + log_l_change_m + "E" + log_l_change_e
+        + ", rho = " + rho_m + "E" + rho_e);
+        
+    if (fileLog != " ") {
+        log_str = append (log_str, "NUM_CG_ITERS,"     + i_IRLS + "," + num_CG_iters);
+        log_str = append (log_str, "IS_TRUST_REACHED," + i_IRLS + "," + reached_trust_boundary);
+        log_str = append (log_str, "POINT_STEP_NORM,"  + i_IRLS + "," + z_norm);
+        log_str = append (log_str, "OBJECTIVE,"        + i_IRLS + "," + (- new_log_l));
+        log_str = append (log_str, "OBJ_DROP_REAL,"    + i_IRLS + "," + log_l_change);
+        log_str = append (log_str, "OBJ_DROP_PRED,"    + i_IRLS + "," + (- neg_log_l_change_predicted));
+        log_str = append (log_str, "OBJ_DROP_RATIO,"   + i_IRLS + "," + rho);
+        log_str = append (log_str, "LINEAR_TERM_MIN,"  + i_IRLS + "," + min (all_linear_terms));
+        log_str = append (log_str, "LINEAR_TERM_MAX,"  + i_IRLS + "," + max (all_linear_terms));
+    }
+        
+    # The iteration limit overrides a convergence code set earlier in the same iteration.
+    if (i_IRLS == max_iteration_IRLS) {
+        termination_code = 2;
+    }
+}
+
+# The last candidate point is adopted unconditionally, without re-running the acceptance test.
+beta = newbeta;
+log_l = new_log_l;
+deviance_nodisp = new_deviance_nodisp;
+
+if (termination_code == 1) {
+    print ("Converged in " + i_IRLS + " steps.");
+} else {
+    print ("Did not converge.");
+}
+
+# Apply the shift/scale rewrite to beta; with intercept_status == 2 the output has two columns:
+# the transformed coefficients followed by the untransformed beta.
+ssX_beta = diag (scale_X) %*% beta;
+ssX_beta [num_features, ] = ssX_beta [num_features, ] + t(shift_X) %*% beta;
+if (intercept_status == 2) {
+    beta_out = append (ssX_beta, beta);
+} else {
+    beta_out = ssX_beta;
+}
+
+write (beta_out, $10);
+
+# Min/max statistics are taken over the first column of beta_out, excluding the intercept row when present.
+if (intercept_status == 1 | intercept_status == 2) {
+    intercept_value = as.scalar (beta_out [num_features, 1]);
+    beta_noicept = beta_out [1 : (num_features - 1), 1];
+} else {
+    beta_noicept = beta_out [1 : num_features, 1];
+}
+min_beta = min (beta_noicept);
+max_beta = max (beta_noicept);
+tmp_i_min_beta = rowIndexMin (t(beta_noicept))
+i_min_beta = as.scalar (tmp_i_min_beta [1, 1]);
+tmp_i_max_beta = rowIndexMax (t(beta_noicept))
+i_max_beta = as.scalar (tmp_i_max_beta [1, 1]);
+
+#####  OVER-DISPERSION PART  #####
+
+all_linear_terms = X %*% ssX_beta;
+[g_Y, w] = glm_dist (all_linear_terms, Y, distribution_type, variance_as_power_of_the_mean, link_type, link_as_power_of_the_mean);
+    
+# NaN entries (pattern 0.0/0.0) of the squared Pearson residuals are replaced by 0.
+pearson_residual_sq = g_Y ^ 2 / w;
+pearson_residual_sq = replace (target = pearson_residual_sq, pattern = 0.0/0.0, replacement = 0);
+# pearson_residual_sq = (y_residual ^ 2) / y_var;
+
+if (num_records > num_features) {
+    estimated_dispersion = sum (pearson_residual_sq) / (num_records - num_features);
+}
+# A non-positive dispersion is replaced by the estimate.
+if (dispersion <= 0.0) {
+    dispersion = estimated_dispersion;
+}
+deviance = deviance_nodisp / dispersion;
+
+#####  END OF THE MAIN PART  #####
+
+# Closing branches of the two guards opened at the top of this section:
+# 3 = glm_initialize returned isNaN != 0, 4 = check_if_supported returned 0.
+} else { print ("Input matrices are out of range.  Terminating the DML."); termination_code = 3; }
+} else { print ("Distribution/Link not supported.  Terminating the DML."); termination_code = 4; }
+
+# Assemble the summary as "NAME,value" entries, one append() per statistic, and print it.
+# Statistics that were never computed still hold the defaults assigned near the top of the script.
+str = "TERMINATION_CODE," + termination_code;
+str = append (str, "BETA_MIN," + min_beta);
+str = append (str, "BETA_MIN_INDEX," + i_min_beta);
+str = append (str, "BETA_MAX," + max_beta);
+str = append (str, "BETA_MAX_INDEX," + i_max_beta);
+str = append (str, "INTERCEPT," + intercept_value);
+str = append (str, "DISPERSION," + dispersion);
+str = append (str, "DISPERSION_EST," + estimated_dispersion);
+str = append (str, "DEVIANCE_UNSCALED," + deviance_nodisp);
+str = append (str, "DEVIANCE_SCALED," + deviance);
+print (str);
+
+
+
+
+# Checks whether the response width, distribution family and link can be handled together, and
+# prints the name of the recognised model (or an explanatory message).
+#   ncol_y     : number of columns of the response matrix
+#   dist_type  : 1 = power distribution, 2 = binomial/Bernoulli
+#   var_power  : variance power (0.0, 1.0, 2.0, 3.0 are reported as Gaussian, Poisson, Gamma, InvGaussian)
+#   link_type  : 1 = power link; for dist_type 2 also 2 = logit, 3 = probit, 4 = cloglog, 5 = cauchit
+#   link_power : exponent of the power link (0.0 is reported as the log link)
+# Returns is_supported = 1 for a supported combination, 0 otherwise.
+check_if_supported = 
+    function (int ncol_y, int dist_type, double var_power, int link_type, double link_power)
+    return   (int is_supported)
+{
+    is_supported = 0;
+    # The nested if/else chains below only select which model name to print.
+    if (ncol_y == 1 & dist_type == 1 & link_type == 1)
+    { # POWER DISTRIBUTION
+        is_supported = 1;
+        if (var_power == 0.0 & link_power == -1.0) {print ("Gaussian.inverse");      } else {
+        if (var_power == 0.0 & link_power ==  0.0) {print ("Gaussian.log");          } else {
+        if (var_power == 0.0 & link_power ==  0.5) {print ("Gaussian.sqrt");         } else {
+        if (var_power == 0.0 & link_power ==  1.0) {print ("Gaussian.id");           } else {
+        if (var_power == 0.0                     ) {print ("Gaussian.power_nonlog"); } else {
+        if (var_power == 1.0 & link_power == -1.0) {print ("Poisson.inverse");       } else {
+        if (var_power == 1.0 & link_power ==  0.0) {print ("Poisson.log");           } else {
+        if (var_power == 1.0 & link_power ==  0.5) {print ("Poisson.sqrt");          } else {
+        if (var_power == 1.0 & link_power ==  1.0) {print ("Poisson.id");            } else {
+        if (var_power == 1.0                     ) {print ("Poisson.power_nonlog");  } else {
+        if (var_power == 2.0 & link_power == -1.0) {print ("Gamma.inverse");         } else {
+        if (var_power == 2.0 & link_power ==  0.0) {print ("Gamma.log");             } else {
+        if (var_power == 2.0 & link_power ==  0.5) {print ("Gamma.sqrt");            } else {
+        if (var_power == 2.0 & link_power ==  1.0) {print ("Gamma.id");              } else {
+        if (var_power == 2.0                     ) {print ("Gamma.power_nonlog");    } else {
+        if (var_power == 3.0 & link_power == -2.0) {print ("InvGaussian.1/mu^2");    } else {
+        if (var_power == 3.0 & link_power == -1.0) {print ("InvGaussian.inverse");   } else {
+        if (var_power == 3.0 & link_power ==  0.0) {print ("InvGaussian.log");       } else {
+        if (var_power == 3.0 & link_power ==  0.5) {print ("InvGaussian.sqrt");      } else {
+        if (var_power == 3.0 & link_power ==  1.0) {print ("InvGaussian.id");        } else {
+        if (var_power == 3.0                     ) {print ("InvGaussian.power_nonlog");}else{
+        if (                   link_power ==  0.0) {print ("PowerDist.log");         } else {
+                                                    print ("PowerDist.power_nonlog");
+    }   }}}}} }}}}} }}}}} }}}}} }}
+    # A one-column Bernoulli response only produces a message; is_supported stays 0.
+    if (ncol_y == 1 & dist_type == 2)
+    {
+        print ("Error: Bernoulli response matrix has not been converted into two-column format.");
+    }
+    if (ncol_y == 2 & dist_type == 2 & link_type >= 1 & link_type <= 5)
+    { # BINOMIAL/BERNOULLI DISTRIBUTION
+        is_supported = 1;
+        if (link_type == 1 & link_power == -1.0) {print ("Binomial.inverse");        } else {
+        if (link_type == 1 & link_power ==  0.0) {print ("Binomial.log");            } else {
+        if (link_type == 1 & link_power ==  0.5) {print ("Binomial.sqrt");           } else {
+        if (link_type == 1 & link_power ==  1.0) {print ("Binomial.id");             } else {
+        if (link_type == 1)                      {print ("Binomial.power_nonlog");   } else {
+        if (link_type == 2)                      {print ("Binomial.logit");          } else {
+        if (link_type == 3)                      {print ("Binomial.probit");         } else {
+        if (link_type == 4)                      {print ("Binomial.cloglog");        } else {
+        if (link_type == 5)                      {print ("Binomial.cauchit");        }
+    }   }}}}} }}}
+    if (is_supported == 0) {
+        print ("Response matrix with " + ncol_y + " columns, distribution family (" + dist_type + ", " + var_power
+             + ") and link family (" + link_type + ", " + link_power + ") are NOT supported together.");
+    }
+}
+
+# Computes the starting coefficients and the saturated log-likelihood.
+#   X, Y         : feature matrix and response matrix (two columns for dist_type 2)
+#   dist_type, var_power, link_type, link_power : distribution / link specification
+#   icept_status : 1 or 2 when X carries an intercept column as its last column
+#   max_iter_CG  : iteration limit passed on to straightenX
+# Returns
+#   beta            : initial coefficient vector with ncol(X) rows
+#   saturated_log_l : glm_log_likelihood_part evaluated at linear terms derived from Y itself
+#   isNaN           : 1 if Y lies outside the domain of the chosen link, otherwise the flag
+#                     returned by glm_log_likelihood_part
+glm_initialize = function (Matrix[double] X, Matrix[double] Y, int dist_type, double var_power, int link_type, double link_power, int icept_status, int max_iter_CG)
+return (Matrix[double] beta, double saturated_log_l, int isNaN)
+{
+    saturated_log_l = 0.0;
+    isNaN = 0;
+    y_corr = Y [, 1];
+    if (dist_type == 2) {
+        # Fraction of column-1 counts per row; rows whose total is zero get 0.5.
+        n_corr = rowSums (Y);
+        is_n_zero = ppred (n_corr, 0.0, "==");
+        y_corr = Y [, 1] / (n_corr + is_n_zero) + (0.5 - Y [, 1]) * is_n_zero;    
+    }
+    # linear_terms = link function applied to y_corr, with zero entries handled explicitly.
+    linear_terms = y_corr;
+    if (dist_type == 1 & link_type == 1) { # POWER DISTRIBUTION
+        if          (link_power ==  0.0) {
+            if (sum (ppred (y_corr, 0.0, "<")) == 0) {
+                # Zero entries: log (0 + 1) - 1 / 0 = -Infinity; all others: plain log (y_corr).
+                is_zero_y_corr = ppred (y_corr, 0.0, "==");
+                linear_terms = log (y_corr + is_zero_y_corr) - is_zero_y_corr / (1.0 - is_zero_y_corr);
+            } else { isNaN = 1; }
+        } else { if (link_power ==  1.0) {
+            linear_terms = y_corr;
+        } else { if (link_power == -1.0) {
+            linear_terms = 1.0 / y_corr;
+        } else { if (link_power ==  0.5) {
+            if (sum (ppred (y_corr, 0.0, "<")) == 0) {
+                linear_terms = sqrt (y_corr);
+            } else { isNaN = 1; }
+        } else { if (link_power >   0.0) {
+            if (sum (ppred (y_corr, 0.0, "<")) == 0) {
+                # Zero entries are mapped to 1 ^ link_power - 1 = 0.
+                is_zero_y_corr = ppred (y_corr, 0.0, "==");
+                linear_terms = (y_corr + is_zero_y_corr) ^ link_power - is_zero_y_corr;
+            } else { isNaN = 1; }
+        } else {
+            if (sum (ppred (y_corr, 0.0, "<=")) == 0) {
+                linear_terms = y_corr ^ link_power;
+            } else { isNaN = 1; }
+        }}}}}
+    }
+    if (dist_type == 2 & link_type >= 1 & link_type <= 5)
+    { # BINOMIAL/BERNOULLI DISTRIBUTION
+        if          (link_type == 1 & link_power == 0.0)  { # Binomial.log
+            if (sum (ppred (y_corr, 0.0, "<")) == 0) {
+                is_zero_y_corr = ppred (y_corr, 0.0, "==");
+                linear_terms = log (y_corr + is_zero_y_corr) - is_zero_y_corr / (1.0 - is_zero_y_corr);
+            } else { isNaN = 1; }
+        } else { if (link_type == 1 & link_power >  0.0)  { # Binomial.power_nonlog pos
+            if (sum (ppred (y_corr, 0.0, "<")) == 0) {
+                is_zero_y_corr = ppred (y_corr, 0.0, "==");
+                linear_terms = (y_corr + is_zero_y_corr) ^ link_power - is_zero_y_corr;
+            } else { isNaN = 1; }
+        } else { if (link_type == 1)                      { # Binomial.power_nonlog neg
+            if (sum (ppred (y_corr, 0.0, "<=")) == 0) {
+                linear_terms = y_corr ^ link_power;
+            } else { isNaN = 1; }
+        } else { 
+            # Entries <= 0 or >= 1 are replaced by 0.5 before the link is applied; the terms
+            # "is_one / (1 - is_one)" and "is_zero / (1 - is_zero)" then push them to +/-Infinity.
+            is_zero_y_corr = ppred (y_corr, 0.0, "<=");
+            is_one_y_corr  = ppred (y_corr, 1.0, ">=");
+            y_corr = y_corr * (1.0 - is_zero_y_corr) * (1.0 - is_one_y_corr) + 0.5 * (is_zero_y_corr + is_one_y_corr);
+            if (link_type == 2)                           { # Binomial.logit
+                linear_terms = log (y_corr / (1.0 - y_corr)) 
+                    + is_one_y_corr / (1.0 - is_one_y_corr) - is_zero_y_corr / (1.0 - is_zero_y_corr);
+            } else { if (link_type == 3)                  { # Binomial.probit
+                # Rational approximation evaluated on min (y, 1 - y); the sign is restored below.
+                # NOTE(review): the constants look like a textbook inverse-normal-CDF approximation -- confirm the source.
+                y_below_half = y_corr + (1.0 - 2.0 * y_corr) * ppred (y_corr, 0.5, ">");
+                t = sqrt (- 2.0 * log (y_below_half));
+                approx_inv_Gauss_CDF = - t + (2.515517 + t * (0.802853 + t * 0.010328)) / (1.0 + t * (1.432788 + t * (0.189269 + t * 0.001308)));
+                linear_terms = approx_inv_Gauss_CDF * (1.0 - 2.0 * ppred (y_corr, 0.5, ">"))
+                    + is_one_y_corr / (1.0 - is_one_y_corr) - is_zero_y_corr / (1.0 - is_zero_y_corr);
+            } else { if (link_type == 4)                  { # Binomial.cloglog
+                linear_terms = log (- log (1.0 - y_corr))
+                    - log (- log (0.5)) * (is_zero_y_corr + is_one_y_corr)
+                    + is_one_y_corr / (1.0 - is_one_y_corr) - is_zero_y_corr / (1.0 - is_zero_y_corr);
+            } else { if (link_type == 5)                  { # Binomial.cauchit
+                linear_terms = tan ((y_corr - 0.5) * 3.1415926535897932384626433832795)
+                    + is_one_y_corr / (1.0 - is_one_y_corr) - is_zero_y_corr / (1.0 - is_zero_y_corr);
+        }}  }}}}}
+    }
+    
+    if (isNaN == 0) {
+        [saturated_log_l, isNaN] = 
+            glm_log_likelihood_part (linear_terms, Y, dist_type, var_power, link_type, link_power);
+    }
+    
+    # desired_eta: the value of the linear predictor that the initial beta is set up to produce.
+    if ((dist_type == 1 & link_type == 1 & link_power == 0.0) |
+        (dist_type == 2 & link_type >= 2))
+    {    
+        desired_eta = 0.0;
+    } else { if (link_type == 1 & link_power == 0.0) {
+        desired_eta = log (0.5);
+    } else { if (link_type == 1) {
+        desired_eta = 0.5 ^ link_power;
+    } else {
+        desired_eta = 0.5;
+    }}}
+    
+    beta = matrix (0.0, rows = ncol(X), cols = 1);
+    
+    # With desired_eta == 0 the all-zero beta is kept as is.
+    if (desired_eta != 0.0) {
+        if (icept_status == 1 | icept_status == 2) {
+            # Put the whole target on the intercept coefficient (last entry of beta).
+            beta [nrow(beta), 1] = desired_eta;
+        } else {
+            # We want: avg (X %*% ssX_transform %*% beta) = desired_eta
+            # Note that "ssX_transform" is trivial here, hence ignored
+            
+            # straightenX is defined elsewhere in this file.
+            beta = straightenX (X, 0.000001, max_iter_CG);  
+            beta = beta * desired_eta;
+}   }   }
+
+
+# Computes, for the given linear predictor and response, the two vectors used by the IRLS step.
+#   linear_terms : linear predictor, one row per record
+#   Y            : response matrix (two columns for dist_type 2)
+#   dist_type, var_power, link_type, link_power : distribution / link specification
+# Returns g_Y and w as defined in the comment after the return clause; both stay all-zero
+# when no branch below matches the given distribution/link.
+glm_dist = function (Matrix[double] linear_terms, Matrix[double] Y,
+                     int dist_type, double var_power, int link_type, double link_power)
+    return (Matrix[double] g_Y, Matrix[double] w)
+    # ORIGINALLY we returned more meaningful vectors, namely:
+    # Matrix[double] y_residual    : y - y_mean, i.e. y observed - y predicted
+    # Matrix[double] link_gradient : derivative of the link function
+    # Matrix[double] var_function  : variance without dispersion, i.e. the V(mu) function
+    # BUT, this caused roundoff errors, so we had to compute "directly useful" vectors
+    # and skip over the "meaningful intermediaries".  Now we output these two variables:
+    #     g_Y = y_residual / (var_function * link_gradient);
+    #     w   = 1.0 / (var_function * link_gradient ^ 2);
+{
+    num_records = nrow (linear_terms);
+    zeros_r = matrix (0.0, rows = num_records, cols = 1);
+    ones_r = 1 + zeros_r;
+    g_Y  = zeros_r;
+    w  = zeros_r;
+
+    # Some constants
+
+    # Small helper matrices used to build two-column results via matrix products:
+    #   p_one_m_one = [1, -1]   m_one_p_one = [-1, 1]   zero_one = [0, 1]   one_zero = [1, 0]
+    #   flip_pos = [[0, 1], [1, 0]]   flip_neg = [[0, -1], [1, 0]]
+    # NOTE(review): m_one_p_one is not referenced anywhere in this function.
+    one_over_sqrt_two_pi = 0.39894228040143267793994605993438;
+    ones_2 = matrix (1.0, rows = 1, cols = 2);
+    p_one_m_one = ones_2;
+    p_one_m_one [1, 2] = -1.0;
+    m_one_p_one = ones_2;
+    m_one_p_one [1, 1] = -1.0;
+    zero_one = ones_2;
+    zero_one [1, 1] = 0.0;
+    one_zero = ones_2;
+    one_zero [1, 2] = 0.0;
+    flip_pos = matrix (0, rows = 2, cols = 2);
+    flip_neg = flip_pos;
+    flip_pos [1, 2] = 1;
+    flip_pos [2, 1] = 1;
+    flip_neg [1, 2] = -1;
+    flip_neg [2, 1] = 1;
+    
+    if (dist_type == 1 & link_type == 1) { # POWER DISTRIBUTION
+        y_mean = zeros_r;
+        if          (link_power ==  0.0) {
+            # Log link: mean = exp (eta).
+            y_mean = exp (linear_terms);
+            y_mean_pow = y_mean ^ (1 - var_power);
+            w   = y_mean_pow * y_mean;
+            g_Y = y_mean_pow * (Y - y_mean);
+        } else { if (link_power ==  1.0) {
+            # Identity link: mean = eta.
+            y_mean = linear_terms;
+            w   = y_mean ^ (- var_power);
+            g_Y = w * (Y - y_mean);
+        } else {
+            # General power link: mean = eta ^ (1 / link_power).
+            y_mean = linear_terms ^ (1.0 / link_power);
+            c1  = (1 - var_power) / link_power - 1;
+            c2  = (2 - var_power) / link_power - 2;
+            g_Y = (linear_terms ^ c1) * (Y - y_mean) / link_power;
+            w   = (linear_terms ^ c2) / (link_power ^ 2);
+    }   }}
+    if (dist_type == 2 & link_type >= 1 & link_type <= 5)
+    { # BINOMIAL/BERNOULLI DISTRIBUTION
+        if (link_type == 1) { # BINOMIAL.POWER LINKS
+            if (link_power == 0.0)  { # Binomial.log
+                vec1 = 1 / (exp (- linear_terms) - 1);
+                g_Y = Y [, 1] - Y [, 2] * vec1;
+                w   = rowSums (Y) * vec1;
+            } else {                  # Binomial.nonlog
+                vec1 = zeros_r;
+                # NOTE(review): isNaN assigned in the else-branch below is not among this function's
+                # return values, so the flag never reaches the caller; vec1 simply stays zero.
+                if (link_power == 0.5)  {
+                    vec1 = 1 / (1 - linear_terms ^ 2);
+                } else { if (sum (ppred (linear_terms, 0.0, "<")) == 0) {
+                    vec1 = linear_terms ^ (- 2 + 1 / link_power) / (1 - linear_terms ^ (1 / link_power));
+                } else {isNaN = 1;}}
+                # We want a "zero-protected" version of
+                #     vec2 = Y [, 1] / linear_terms;
+                is_y_0 = ppred (Y [, 1], 0.0, "==");
+                vec2 = (Y [, 1] + is_y_0) / (linear_terms * (1 - is_y_0) + is_y_0) - is_y_0;
+                g_Y =  (vec2 - Y [, 2] * vec1 * linear_terms) / link_power;
+                w   =  rowSums (Y) * vec1 / link_power ^ 2;
+            }
+        } else {
+            # Separate out infinite linear terms: is_LT_infinite has a 1 in column 1 for +Infinity
+            # and in column 2 for -Infinity; finite_linear_terms has both replaced by 0.
+            is_LT_pos_infinite = ppred (linear_terms,  1.0/0.0, "==");
+            is_LT_neg_infinite = ppred (linear_terms, -1.0/0.0, "==");
+            is_LT_infinite = is_LT_pos_infinite %*% one_zero + is_LT_neg_infinite %*% zero_one;
+            finite_linear_terms = replace (target =        linear_terms, pattern =  1.0/0.0, replacement = 0);
+            finite_linear_terms = replace (target = finite_linear_terms, pattern = -1.0/0.0, replacement = 0);
+            if (link_type == 2)                           { # Binomial.logit
+                # Y_prob = [exp (eta), 1] normalized per row; rows with infinite eta get the 0/1 pattern instead.
+                Y_prob = exp (finite_linear_terms) %*% one_zero + ones_r %*% zero_one;
+                Y_prob = Y_prob / (rowSums (Y_prob) %*% ones_2);
+                Y_prob = Y_prob * ((1.0 - rowSums (is_LT_infinite)) %*% ones_2) + is_LT_infinite;
+                g_Y = rowSums (Y * (Y_prob %*% flip_neg));           ### = y_residual;
+                w   = rowSums (Y * (Y_prob %*% flip_pos) * Y_prob);  ### = y_variance;
+            } else { if (link_type == 3)                  { # Binomial.probit
+                is_lt_pos = ppred (linear_terms, 0.0, ">=");
+                t_gp = 1.0 / (1.0 + abs (finite_linear_terms) * 0.231641888);  # 0.231641888 = 0.3275911 / sqrt (2.0)
+                pt_gp = t_gp * ( 0.254829592 
+                      + t_gp * (-0.284496736 # "Handbook of Mathematical Functions", ed. by M. Abramowitz and I.A. Stegun,
+                      + t_gp * ( 1.421413741 # U.S. Nat-l Bureau of Standards, 10th print (Dec 1972), Sec. 7.1.26, p. 299
+                      + t_gp * (-1.453152027 
+                      + t_gp *   1.061405429))));
+                the_gauss_exp = exp (- (linear_terms ^ 2) / 2.0);
+                vec1 = 0.25 * pt_gp * (2 - the_gauss_exp * pt_gp);
+                vec2 = Y [, 1] - rowSums (Y) * is_lt_pos + the_gauss_exp * pt_gp * rowSums (Y) * (is_lt_pos - 0.5);
+                w   = the_gauss_exp * (one_over_sqrt_two_pi ^ 2) * rowSums (Y) / vec1;
+                g_Y = one_over_sqrt_two_pi * vec2 / vec1;
+            } else { if (link_type == 4)                  { # Binomial.cloglog
+                the_exp = exp (linear_terms)
+                the_exp_exp = exp (- the_exp);
+                # Where exp (eta) vanishes next to 10000000, use (1 - exp (eta) / 2) instead of the ratio.
+                is_too_small = ppred (10000000 + the_exp, 10000000, "==");
+                the_exp_ratio = (1 - is_too_small) * (1 - the_exp_exp) / (the_exp + is_too_small) + is_too_small * (1 - the_exp / 2);
+                g_Y =  (rowSums (Y) * the_exp_exp - Y [, 2]) / the_exp_ratio;
+                w   =  the_exp_exp * the_exp * rowSums (Y) / the_exp_ratio;
+            } else { if (link_type == 5)                  { # Binomial.cauchit
+                Y_prob = 0.5 + (atan (finite_linear_terms) %*% p_one_m_one) / 3.1415926535897932384626433832795;
+                Y_prob = Y_prob * ((1.0 - rowSums (is_LT_infinite)) %*% ones_2) + is_LT_infinite;
+                y_residual = Y [, 1] * Y_prob [, 2] - Y [, 2] * Y_prob [, 1];
+                var_function = rowSums (Y) * Y_prob [, 1] * Y_prob [, 2];
+                link_gradient_normalized = (1 + linear_terms ^ 2) * 3.1415926535897932384626433832795;
+                g_Y =  rowSums (Y) * y_residual / (var_function * link_gradient_normalized);
+                w   = (rowSums (Y) ^ 2) / (var_function * link_gradient_normalized ^ 2);
+            }}}}   
+        }
+    }
+}
+
+
+# Computes the GLM log-likelihood of the responses Y given the linear terms.
+# For dist_type == 1 with link_type == 1 (power-variance distributions with power links) the result
+# is  sum (Y * natural_parameters - b_cumulant);  for dist_type == 2 with link_type in 1..5
+# (binomial/Bernoulli) it is  sum (Y * log (Y_prob)).  Any other combination leaves log_l = 0.
+# isNaN = 1 (together with log_l = -Infinity) signals that the linear terms fall outside the domain
+# of the selected link, that the likelihood is zero, or that the sum evaluated to NaN/Infinity.
+glm_log_likelihood_part = function (Matrix[double] linear_terms, Matrix[double] Y,
+        int dist_type, double var_power, int link_type, double link_power)
+    return (double log_l, int isNaN)
+{
+    isNaN = 0;
+    log_l = 0.0;
+    num_records = nrow (Y);
+    zeros_r = matrix (0.0, rows = num_records, cols = 1);
+    
+    if (dist_type == 1 & link_type == 1)
+    { # POWER DISTRIBUTION
+        # Defaults; each (var_power, link_power) case below overwrites what it needs
+        b_cumulant = zeros_r;
+        natural_parameters = zeros_r;
+        is_natural_parameter_log_zero = zeros_r;
+        if          (var_power == 1.0 & link_power == 0.0)  { # Poisson.log
+            b_cumulant = exp (linear_terms);
+            is_natural_parameter_log_zero = ppred (linear_terms, -1.0/0.0, "==");
+            natural_parameters = replace (target = linear_terms, pattern = -1.0/0.0, replacement = 0);
+        } else { if (var_power == 1.0 & link_power == 1.0)  { # Poisson.id
+            if (sum (ppred (linear_terms, 0.0, "<")) == 0)  {
+                b_cumulant = linear_terms;
+                is_natural_parameter_log_zero = ppred (linear_terms, 0.0, "==");
+                natural_parameters = log (linear_terms + is_natural_parameter_log_zero);
+            } else {isNaN = 1;}
+        } else { if (var_power == 1.0 & link_power == 0.5)  { # Poisson.sqrt
+            if (sum (ppred (linear_terms, 0.0, "<")) == 0)  {
+                b_cumulant = linear_terms ^ 2;
+                is_natural_parameter_log_zero = ppred (linear_terms, 0.0, "==");
+                natural_parameters = 2.0 * log (linear_terms + is_natural_parameter_log_zero);
+            } else {isNaN = 1;}
+        } else { if (var_power == 1.0 & link_power  > 0.0)  { # Poisson.power_nonlog, pos
+            if (sum (ppred (linear_terms, 0.0, "<")) == 0)  {
+                is_natural_parameter_log_zero = ppred (linear_terms, 0.0, "==");
+                b_cumulant = (linear_terms + is_natural_parameter_log_zero) ^ (1.0 / link_power) - is_natural_parameter_log_zero;
+                natural_parameters = log (linear_terms + is_natural_parameter_log_zero) / link_power;
+            } else {isNaN = 1;}
+        } else { if (var_power == 1.0)                      { # Poisson.power_nonlog, neg
+            if (sum (ppred (linear_terms, 0.0, "<=")) == 0) {
+                b_cumulant = linear_terms ^ (1.0 / link_power);
+                natural_parameters = log (linear_terms) / link_power;
+            } else {isNaN = 1;}
+        } else { if (var_power == 2.0 & link_power == -1.0) { # Gamma.inverse
+            if (sum (ppred (linear_terms, 0.0, "<=")) == 0) {
+                b_cumulant = - log (linear_terms);
+                natural_parameters = - linear_terms;
+            } else {isNaN = 1;}
+        } else { if (var_power == 2.0 & link_power ==  1.0) { # Gamma.id
+            if (sum (ppred (linear_terms, 0.0, "<=")) == 0) {
+                b_cumulant = log (linear_terms);
+                natural_parameters = - 1.0 / linear_terms;
+            } else {isNaN = 1;}
+        } else { if (var_power == 2.0 & link_power ==  0.0) { # Gamma.log
+            b_cumulant = linear_terms;
+            natural_parameters = - exp (- linear_terms);
+        } else { if (var_power == 2.0)                      { # Gamma.power_nonlog
+            if (sum (ppred (linear_terms, 0.0, "<=")) == 0) {
+                b_cumulant = log (linear_terms) / link_power;
+                natural_parameters = - linear_terms ^ (- 1.0 / link_power);
+            } else {isNaN = 1;}
+        } else { if                    (link_power ==  0.0) { # PowerDist.log
+            natural_parameters = exp (linear_terms * (1.0 - var_power)) / (1.0 - var_power);
+            b_cumulant = exp (linear_terms * (2.0 - var_power)) / (2.0 - var_power);
+        } else {                                              # PowerDist.power_nonlog
+            # Exponents -2, -1, 1 and 2 are special-cased so that non-positive linear terms
+            # are only rejected when a general (possibly fractional) power is required
+            if          (-2 * link_power == 1.0 - var_power) {
+                natural_parameters = 1.0 / (linear_terms ^ 2) / (1.0 - var_power);
+            } else { if (-1 * link_power == 1.0 - var_power) {
+                natural_parameters = 1.0 / linear_terms / (1.0 - var_power);
+            } else { if (     link_power == 1.0 - var_power) {
+                natural_parameters = linear_terms / (1.0 - var_power);
+            } else { if ( 2 * link_power == 1.0 - var_power) {
+                natural_parameters = linear_terms ^ 2 / (1.0 - var_power);
+            } else {
+                if (sum (ppred (linear_terms, 0.0, "<=")) == 0) {
+                    power = (1.0 - var_power) / link_power;
+                    natural_parameters = (linear_terms ^ power) / (1.0 - var_power);
+                } else {isNaN = 1;}
+            }}}}
+            if          (-2 * link_power == 2.0 - var_power) {
+                b_cumulant = 1.0 / (linear_terms ^ 2) / (2.0 - var_power);
+            } else { if (-1 * link_power == 2.0 - var_power) {
+                b_cumulant = 1.0 / linear_terms / (2.0 - var_power);
+            } else { if (     link_power == 2.0 - var_power) {
+                b_cumulant = linear_terms / (2.0 - var_power);
+            } else { if ( 2 * link_power == 2.0 - var_power) {
+                b_cumulant = linear_terms ^ 2 / (2.0 - var_power);
+            } else {
+                if (sum (ppred (linear_terms, 0.0, "<=")) == 0) {
+                    power = (2.0 - var_power) / link_power;
+                    b_cumulant = (linear_terms ^ power) / (2.0 - var_power);
+                } else {isNaN = 1;}
+            }}}}
+        }}}}} }}}}}
+        # A record flagged as "log of zero" with a non-zero response makes the likelihood zero
+        if (sum (is_natural_parameter_log_zero * abs (Y)) > 0.0) {
+            log_l = -1.0 / 0.0;
+            isNaN = 1;
+        }
+        if (isNaN == 0)
+        {
+            log_l = sum (Y * natural_parameters - b_cumulant);
+            # (log_l != log_l) detects NaN; the second test holds only for +/-Infinity
+            if (log_l != log_l | (log_l == log_l + 1.0 & log_l == log_l * 2.0)) {
+                log_l = -1.0 / 0.0;
+                isNaN = 1;
+    }   }   }
+    
+    if (dist_type == 2 & link_type >= 1 & link_type <= 5)
+    { # BINOMIAL/BERNOULLI DISTRIBUTION
+    
+        [Y_prob, isNaN] = binomial_probability_two_column (linear_terms, link_type, link_power);
+        
+        if (isNaN == 0) {            
+            # Non-positive probabilities are acceptable only where the matching Y entry is zero
+            does_prob_contradict = ppred (Y_prob, 0.0, "<=");
+            if (sum (does_prob_contradict * abs (Y)) == 0.0) {
+                # Flagged entries are replaced by 1 so that their log contributes 0 to the sum
+                log_l = sum (Y * log (Y_prob * (1 - does_prob_contradict) + does_prob_contradict));
+                if (log_l != log_l | (log_l == log_l + 1.0 & log_l == log_l * 2.0)) {
+                    isNaN = 1;
+                }
+            } else {
+                log_l = -1.0 / 0.0;
+                isNaN = 1;
+    }   }   }
+    
+    # Every failure path reports the same sentinel value
+    if (isNaN == 1) {
+        log_l = - 1.0 / 0.0; 
+    }
+}
+
+
+
+# Maps the linear terms to a (nrow (linear_terms) x 2) matrix Y_prob holding the probabilities of
+# the two binomial outcomes under the requested link.
+# link_type: 1 = power link (link_power 0.0 is log, 0.5 is sqrt), 2 = logit, 3 = probit,
+# 4 = cloglog, 5 = cauchit.  isNaN is set to 1 for any other link_type, or when a general power
+# link meets negative linear terms.
+binomial_probability_two_column =
+    function (Matrix[double] linear_terms, int link_type, double link_power)
+    return   (Matrix[double] Y_prob, int isNaN)
+{
+    isNaN = 0;
+    num_records = nrow (linear_terms);
+
+    # Define some auxiliary matrices
+
+    ones_2 = matrix (1.0, rows = 1, cols = 2);    # [ 1,  1]
+    p_one_m_one = ones_2;
+    p_one_m_one [1, 2] = -1.0;                    # [ 1, -1]
+    m_one_p_one = ones_2;
+    m_one_p_one [1, 1] = -1.0;                    # [-1,  1]
+    zero_one = ones_2;
+    zero_one [1, 1] = 0.0;                        # [ 0,  1]
+    one_zero = ones_2;
+    one_zero [1, 2] = 0.0;                        # [ 1,  0]
+
+    zeros_r = matrix (0.0, rows = num_records, cols = 1);
+    ones_r = 1.0 + zeros_r;
+
+    # Begin the function body
+
+    Y_prob = zeros_r %*% ones_2;
+    if (link_type == 1) { # Binomial.power
+        # In every power case the result is [p, 1 - p] with p given by the inverse link
+        if          (link_power == 0.0) { # Binomial.log
+            Y_prob = exp (linear_terms) %*% p_one_m_one + ones_r %*% zero_one;    
+        } else { if (link_power == 0.5) { # Binomial.sqrt
+            Y_prob = (linear_terms ^ 2) %*% p_one_m_one + ones_r %*% zero_one;    
+        } else {                          # Binomial.power_nonlog
+            if (sum (ppred (linear_terms, 0.0, "<")) == 0) {
+                Y_prob = (linear_terms ^ (1.0 / link_power)) %*% p_one_m_one + ones_r %*% zero_one;    
+            } else {isNaN = 1;}
+        }}
+    } else {              # Binomial.non_power
+        # Infinite linear terms are replaced by 0 for the formulas below; their rows are
+        # overwritten at the end with [1, 0] for +Infinity and [0, 1] for -Infinity
+        is_LT_pos_infinite = ppred (linear_terms,  1.0/0.0, "==");
+        is_LT_neg_infinite = ppred (linear_terms, -1.0/0.0, "==");
+        is_LT_infinite = is_LT_pos_infinite %*% one_zero + is_LT_neg_infinite %*% zero_one;
+        finite_linear_terms = replace (target =        linear_terms, pattern =  1.0/0.0, replacement = 0);
+        finite_linear_terms = replace (target = finite_linear_terms, pattern = -1.0/0.0, replacement = 0);
+        if (link_type == 2)             { # Binomial.logit
+            # [exp (t), 1] normalized by its row sum
+            Y_prob = exp (finite_linear_terms) %*% one_zero + ones_r %*% zero_one;
+            Y_prob = Y_prob / (rowSums (Y_prob) %*% ones_2);
+        } else { if (link_type == 3)    { # Binomial.probit
+            lt_pos_neg = ppred (finite_linear_terms, 0.0, ">=") %*% p_one_m_one + ones_r %*% zero_one;
+            t_gp = 1.0 / (1.0 + abs (finite_linear_terms) * 0.231641888);  # 0.231641888 = 0.3275911 / sqrt (2.0)
+            pt_gp = t_gp * ( 0.254829592 
+                  + t_gp * (-0.284496736 # "Handbook of Mathematical Functions", ed. by M. Abramowitz and I.A. Stegun,
+                  + t_gp * ( 1.421413741 # U.S. Nat-l Bureau of Standards, 10th print (Dec 1972), Sec. 7.1.26, p. 299
+                  + t_gp * (-1.453152027 
+                  + t_gp *   1.061405429))));
+            the_gauss_exp = exp (- (finite_linear_terms ^ 2) / 2.0);
+            Y_prob = lt_pos_neg + ((the_gauss_exp * pt_gp) %*% ones_2) * (0.5 - lt_pos_neg);
+        } else { if (link_type == 4)    { # Binomial.cloglog
+            the_exp = exp (finite_linear_terms);
+            the_exp_exp = exp (- the_exp);
+            # Flags values of the_exp too small to change 10000000 when added to it; for those,
+            # column 1 uses  the_exp * (1 - the_exp / 2)  instead of  1 - exp (- the_exp)
+            is_too_small = ppred (10000000 + the_exp, 10000000, "==");
+            Y_prob [, 1] = (1 - is_too_small) * (1 - the_exp_exp) + is_too_small * the_exp * (1 - the_exp / 2);
+            Y_prob [, 2] = the_exp_exp;
+        } else { if (link_type == 5)    { # Binomial.cauchit
+            Y_prob = 0.5 + (atan (finite_linear_terms) %*% p_one_m_one) / 3.1415926535897932384626433832795;
+        } else {
+            isNaN = 1;
+        }}}}
+        # Zero out the rows of infinite linear terms and substitute their limiting probabilities
+        Y_prob = Y_prob * ((1.0 - rowSums (is_LT_infinite)) %*% ones_2) + is_LT_infinite;
+}   }            
+
+
+# THE CG-STEIHAUG PROCEDURE SCRIPT
+
+# Apply Conjugate Gradient - Steihaug algorithm in order to approximately minimize
+# 0.5 z^T (X^T diag(w) X + diag (lambda)) z + (g + lambda * beta)^T z
+# under constraint:  ||z|| <= trust_delta.
+# See Alg. 7.2 on p. 171 of "Numerical Optimization" 2nd ed. by Nocedal and Wright
+# IN THE ABOVE, "X" IS UNDERSTOOD TO BE "X %*% (SHIFT/SCALE TRANSFORM)"; this transform
+# is given separately because sparse "X" may become dense after applying the transform.
+#
+# Returns: z - the approximate minimizer, neg_log_l_change - the objective change at z,
+#   i_CG - the number of CG iterations started, reached_trust_boundary - 1 if z was placed on
+#   the trust-region boundary by get_trust_boundary_point, 0 otherwise.
+get_CG_Steihaug_point =
+    function (Matrix[double] X, Matrix[double] scale_X, Matrix[double] shift_X, Matrix[double] w,
+    Matrix[double] g, Matrix[double] beta, Matrix[double] lambda, double trust_delta, int max_iter_CG)
+    return (Matrix[double] z, double neg_log_l_change, int i_CG, int reached_trust_boundary)
+{
+    trust_delta_sq = trust_delta ^ 2;
+    size_CG = nrow (g);
+    z = matrix (0.0, rows = size_CG, cols = 1);
+    neg_log_l_change = 0.0;
+    reached_trust_boundary = 0;
+    g_reg = g + lambda * beta;          # gradient including the regularization term
+    r_CG = g_reg;                       # residual
+    p_CG = -r_CG;                       # search direction
+    rr_CG = sum(r_CG * r_CG);
+    # Stop once the squared residual drops below this fraction of its initial value
+    eps_CG = rr_CG * min (0.25, sqrt (rr_CG));
+    converged_CG = 0;
+    if (rr_CG < eps_CG) {
+        converged_CG = 1;
+    }
+    
+    # A non-positive max_iter_CG means "use the problem size as the iteration limit"
+    max_iteration_CG = max_iter_CG;
+    if (max_iteration_CG <= 0) {
+        max_iteration_CG = size_CG;
+    }
+    i_CG = 0;
+    while (converged_CG == 0)
+    {
+        i_CG = i_CG + 1;
+        # q_CG = (transformed X)^T diag(w) (transformed X) p_CG + lambda * p_CG, where the
+        # shift/scale transform is applied to the vectors rather than to X itself
+        ssX_p_CG = diag (scale_X) %*% p_CG;
+        ssX_p_CG [size_CG, ] = ssX_p_CG [size_CG, ] + t(shift_X) %*% p_CG;
+        temp_CG = t(X) %*% (w * (X %*% ssX_p_CG));
+        q_CG = (lambda * p_CG) + diag (scale_X) %*% temp_CG + shift_X %*% temp_CG [size_CG, ];
+        pq_CG = sum (p_CG * q_CG);
+        if (pq_CG <= 0) {
+            # Non-positive curvature along p_CG: move to the trust boundary (if p_CG != 0) and stop
+            pp_CG = sum (p_CG * p_CG);  
+            if (pp_CG > 0) {
+                [z, neg_log_l_change] = 
+                    get_trust_boundary_point (g_reg, z, p_CG, q_CG, r_CG, pp_CG, pq_CG, trust_delta_sq);
+                reached_trust_boundary = 1;
+            } else {
+                neg_log_l_change = 0.5 * sum (z * (r_CG + g_reg));
+            }
+            converged_CG = 1;
+        }
+        if (converged_CG == 0) {
+            alpha_CG = rr_CG / pq_CG;
+            new_z = z + alpha_CG * p_CG;
+            if (sum(new_z * new_z) >= trust_delta_sq) {
+                # The full CG step would leave the trust region: stop on its boundary instead
+                pp_CG = sum (p_CG * p_CG);  
+                [z, neg_log_l_change] = 
+                    get_trust_boundary_point (g_reg, z, p_CG, q_CG, r_CG, pp_CG, pq_CG, trust_delta_sq);
+                reached_trust_boundary = 1;
+                converged_CG = 1;
+            }
+            if (converged_CG == 0) {
+                # Accept the step and update the residual
+                z = new_z;
+                old_rr_CG = rr_CG;
+                r_CG = r_CG + alpha_CG * q_CG;
+                rr_CG = sum(r_CG * r_CG);
+                if (i_CG == max_iteration_CG | rr_CG < eps_CG) {
+                    neg_log_l_change = 0.5 * sum (z * (r_CG + g_reg));
+                    reached_trust_boundary = 0;
+                    converged_CG = 1;
+                }
+                if (converged_CG == 0) {
+                    # New search direction from the new residual and the previous direction
+                    p_CG = -r_CG + (rr_CG / old_rr_CG) * p_CG;
+}   }   }   }   }
+
+
+# An auxiliary function used twice inside the CG-STEIHAUG loop:
+get_trust_boundary_point = 
+    function (Matrix[double] g, Matrix[double] z, Matrix[double] p, 
+              Matrix[double] q, Matrix[double] r, double pp, double pq, 
+              double trust_delta_sq)
+    return (Matrix[double] new_z, double f_change)
+{
+    # Both roots tau of the quadratic  pp * tau^2 + 2 * pz * tau + (zz - trust_delta_sq) = 0
+    z_dot_z = sum (z * z);
+    p_dot_z = sum (p * z);
+    root_discr = sqrt (p_dot_z * p_dot_z - pp * (z_dot_z - trust_delta_sq));
+    tau_plus  = (- p_dot_z + root_discr) / pp;
+    tau_minus = (- p_dot_z - root_discr) / pp;
+
+    # Objective change at each root; the tau-independent part is shared
+    z_dot_q = sum (z * q);
+    g_dot_p = sum (g * p);
+    f_base = 0.5 * sum (z * (r + g));
+    f_plus  = f_base + (0.5 * tau_plus  * pq + z_dot_q + g_dot_p) * tau_plus;
+    f_minus = f_base + (0.5 * tau_minus * pq + z_dot_q + g_dot_p) * tau_minus;
+
+    # Keep the root with the strictly smaller objective change (ties go to tau_minus)
+    if (f_plus < f_minus) {
+        tau_best = tau_plus;
+        f_change = f_plus;
+    } else {
+        tau_best = tau_minus;
+        f_change = f_minus;
+    }
+    new_z = z + (tau_best * p);
+}
+
+
+# Computes vector w such that  ||X %*% w - 1|| -> MIN  given  avg(X %*% w) = 1
+# We find z_LS such that ||X %*% z_LS - 1|| -> MIN unconditionally, then scale
+# it to compute  w = c * z_LS  such that  sum(X %*% w) = nrow(X).
+straightenX =
+    function (Matrix[double] X, double eps, int max_iter_CG)
+    return   (Matrix[double] w)
+{
+    col_totals = t(colSums(X));
+    ridge = 0.000001 * sum(X ^ 2) / ncol(X);
+    tolerance = eps * nrow(X);
+
+    # Conjugate-gradient iterations on  (t(X) %*% X + ridge * I) z = col_totals,  from z = 0
+
+    residual = - col_totals;
+    solution = matrix (0.0, rows = ncol(X), cols = 1);
+    direction = - residual;
+    resid_sq = sum (residual ^ 2);
+    n_done = 0;
+    while (n_done < max_iter_CG & n_done < ncol(X) & resid_sq >= tolerance)
+    {
+        A_dir = t(X) %*% X %*% direction + ridge * direction;
+        step_len = resid_sq / sum (direction * A_dir);
+        solution = solution + step_len * direction;
+        residual = residual + step_len * A_dir;
+        prev_resid_sq = resid_sq;
+        resid_sq = sum (residual ^ 2);
+        direction = -residual + (resid_sq / prev_resid_sq) * direction;
+        n_done = n_done + 1;
+    }
+
+    # Rescale so that  sum (X %*% w) = nrow (X)
+
+    w = (nrow(X) / sum (col_totals * solution)) * solution;
+}
+
+
+# Splits a number into a printable decimal mantissa, rounded to 4 digits after the point, and an
+# integer base-10 exponent, so that  x_to_truncate ~ mantissa * 10 ^ eee.
+# Inputs with x == x / 2 (zero and +/-Infinity) are returned unchanged in "mantissa" with eee = 0.
+round_to_print = function (double x_to_truncate)
+return (double mantissa, int eee)
+{
+    mantissa = 1.0;
+    eee = 0;
+    x = abs (x_to_truncate);
+    if (x != x / 2.0) {
+        log_ten = log (10.0);
+        # Decimal exponent of x, as a double
+        d_eee = round (log (x) / log_ten - 0.5);
+        mantissa = round (x * exp (log_ten * (4.0 - d_eee))) / 10000;
+        # Rounding may have pushed the mantissa up to 10; renormalize it to 1 * 10 ^ (d_eee + 1)
+        if (mantissa == 10.0) {
+            mantissa = 1.0;
+            d_eee = d_eee + 1;
+        }
+        if (x_to_truncate < 0.0) {
+            mantissa = - mantissa;
+        }
+        # Rebuild |d_eee| as the int "eee" one binary digit at a time: each pass halves the
+        # remainder (rounding down) and adds the current power of two if a bit was dropped
+        pow_two = 1;
+        res_eee = abs (d_eee);
+        while (res_eee != 0.0) {
+            new_res_eee = round (res_eee / 2.0 - 0.3);
+            if (new_res_eee * 2.0 < res_eee) {
+                eee = eee + pow_two;
+            }
+            res_eee = new_res_eee;
+            pow_two = 2 * pow_two;
+        }
+        if (d_eee < 0.0) {
+            eee = - eee;
+        }
+    } else { mantissa = x_to_truncate; }
+}

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/bbefe96b/src/test/scripts/functions/codegen/Algorithm_Kmeans.dml
----------------------------------------------------------------------
diff --git a/src/test/scripts/functions/codegen/Algorithm_Kmeans.dml b/src/test/scripts/functions/codegen/Algorithm_Kmeans.dml
new file mode 100644
index 0000000..9bdfab0
--- /dev/null
+++ b/src/test/scripts/functions/codegen/Algorithm_Kmeans.dml
@@ -0,0 +1,243 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+# 
+#   http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+# Script arguments: $1 = input matrix X (one record per row), $2 = number of centroids,
+# $3 = number of independent runs, $4 = relative WCSS tolerance used by the convergence test,
+# $5 = maximum number of iterations per run, $6 = output location for the best centroids.
+X = read($1)
+num_centroids = $2
+num_runs = $3
+eps = $4;
+max_iter = $5;
+
+is_write_Y = 0;                      # not referenced in the rest of this script
+is_verbose = 0;                      # set to 1 to print the WCSS at every iteration
+avg_sample_size_per_centroid = 50;   # initialization sample size = this * num_centroids
+
+print ("BEGIN K-MEANS SCRIPT");
+print ("Reading X...");
+num_records   = nrow (X);
+num_features  = ncol (X);
+
+# Constant part of the WCSS; added back to the centroid-dependent distances in STEP 2
+sumXsq = sum (X ^ 2);
+# Remark - A useful rewrite: sum (A %*% B) = sum (t(colSums(A)) * rowSums(B))
+# Remark - A useful rewrite: sum (A %*% B) = sum (t(colSums(A)) * rowSums(B))
+
+# STEP 1: INITIALIZE CENTROIDS FOR ALL RUNS FROM DATA SAMPLES:
+
+print ("Taking data samples for initialization...");
+
+# One sample block of "sample_block_size" row positions per run, stacked vertically
+[sample_maps, samples_vs_runs_map, sample_block_size] = 
+    get_sample_maps (num_records, num_runs, num_centroids * avg_sample_size_per_centroid);
+
+is_row_in_samples = rowSums (sample_maps);   # 1 for positions that hold a record, else 0
+X_samples = sample_maps %*% X;               # sampled records (zero rows for unused positions)
+X_samples_sq_norms = rowSums (X_samples ^ 2);
+
+print ("Initializing the centroids for all runs...");
+All_Centroids = matrix (0, rows = (num_runs * num_centroids), cols = num_features);
+
+# We select centroids according to the k-Means++ heuristic applied to a sample of X
+# Loop invariant: min_distances ~ sq.distances from X_sample rows to nearest centroids,
+# with the out-of-range X_sample positions in min_distances set to 0.0
+
+min_distances = is_row_in_samples;  # Pick the 1-st centroids uniformly at random
+
+for (i in 1 : num_centroids)
+{
+    # "Matricize" and prefix-sum to compute the cumulative distribution function:
+    min_distances_matrix_form = 
+        matrix (min_distances, rows = sample_block_size, cols = num_runs, byrow = FALSE);
+    cdf_min_distances = cumsum (min_distances_matrix_form);
+    
+    # Select the i-th centroid in each sample as a random sample row id with
+    # probability ~ min_distances:
+    random_row = Rand (rows = 1, cols = num_runs, min = 0.0, max = 1.0);  
+    threshold_matrix = random_row * cdf_min_distances [sample_block_size, ];
+    # Row id = 1 + number of cumulative sums strictly below the threshold, per run (column)
+    centroid_ids = t(colSums (cdf_min_distances < threshold_matrix)) + 1;
+    
+    # Place the selected centroids together, one per run, into a matrix:
+    centroid_placer = matrix (0, rows = num_runs, cols = (sample_block_size * num_runs));
+    centroid_placer_raw = 
+        table (seq (1, num_runs, 1), sample_block_size * seq (0, num_runs - 1, 1) + centroid_ids);
+    # table() may return fewer columns than needed, so copy it into the zero-padded matrix
+    centroid_placer [, 1 : ncol (centroid_placer_raw)] = centroid_placer_raw;
+    centroids = centroid_placer %*% X_samples;
+    
+    # Place the selected centroids into their appropriate slots in All_Centroids:
+    centroid_placer = matrix (0, rows = nrow (All_Centroids), cols = num_runs);
+    centroid_placer_raw = 
+        table (seq (i, num_centroids * (num_runs - 1) + i, num_centroids), seq (1, num_runs, 1));
+    centroid_placer [1 : nrow (centroid_placer_raw), ] = centroid_placer_raw;
+    All_Centroids = All_Centroids + centroid_placer %*% centroids;
+    
+    # Update min_distances to preserve the loop invariant:
+    # ||x - c||^2 = ||x||^2 + ||c||^2 - 2 * <x, c>, with c the new centroid of the row's own run
+    distances = X_samples_sq_norms + samples_vs_runs_map %*% rowSums (centroids ^ 2)
+              - 2 * rowSums (X_samples * (samples_vs_runs_map %*% centroids));
+    if (i == 1) {
+        min_distances = is_row_in_samples * distances;
+    } else {
+        min_distances = min (min_distances, distances);
+}   }
+
+# STEP 2: PERFORM K-MEANS ITERATIONS FOR ALL RUNS:
+
+# Per-run results, filled in at the end of each parfor iteration
+termination_code = matrix (0, rows = num_runs, cols = 1);
+final_wcss = matrix (0, rows = num_runs, cols = 1);
+num_iterations = matrix (0, rows = num_runs, cols = 1);
+
+print ("Performing k-means iterations for all runs...");
+
+# NOTE(review): "check = 0" presumably switches off the parfor dependency check - confirm
+# against the SystemML parfor documentation
+parfor (run_index in 1 : num_runs, check = 0)
+{
+    # Centroids of this run occupy a contiguous block of num_centroids rows
+    C = All_Centroids [(num_centroids * (run_index - 1) + 1) : (num_centroids * run_index), ];
+    C_old = C;
+    iter_count = 0;
+    term_code = 0;   # 0 = running, 1 = converged, 2 = max_iter reached, 3 = lost centroid
+    wcss = 0;
+
+    while (term_code == 0)
+    {
+        # Compute Euclidean squared distances from records (X rows) to centroids (C rows)
+        # without the C-independent term, then take the minimum for each record
+        D = -2 * (X %*% t(C)) + t(rowSums (C ^ 2));
+        minD = rowMins (D);
+        # Compute the current centroid-based within-cluster sum of squares (WCSS)
+        wcss_old = wcss;
+        wcss = sumXsq + sum (minD);
+        if (is_verbose == 1) {
+            if (iter_count == 0) {
+                print ("Run " + run_index + ", At Start-Up:  Centroid WCSS = " + wcss);
+            } else {
+                print ("Run " + run_index + ", Iteration " + iter_count + ":  Centroid WCSS = " + wcss
+                    + ";  Centroid change (avg.sq.dist.) = " + (sum ((C - C_old) ^ 2) / num_centroids));
+        }   }
+        # Check if convergence or maximum iteration has been reached
+        # (the convergence test is skipped at start-up, when wcss_old is still the initial 0)
+        if (wcss_old - wcss < eps * wcss & iter_count > 0) {
+            term_code = 1;  # Convergence is reached
+        } else {
+            if (iter_count >= max_iter) {
+                term_code = 2;  # Maximum iteration is reached
+            } else {
+                iter_count = iter_count + 1;
+                # Find the closest centroid for each record
+                P = (D <= minD);
+                # If some records belong to multiple centroids, share them equally
+                P = P / rowSums (P);
+                # Compute the column normalization factor for P
+                P_denom = colSums (P);
+                if (sum (P_denom <= 0.0) > 0) {
+                    term_code = 3;  # There is a "runaway" centroid with 0.0 denominator
+                } else {
+                    C_old = C;
+                    # Compute new centroids as weighted averages over the records
+                    C = (t(P) %*% X) / t(P_denom);
+    }   }   }   }
+    print ("Run " + run_index + ", Iteration " + iter_count + ":  Terminated with code = " + term_code + ",  Centroid WCSS = " + wcss);
+    # Write this run's results back into its own rows of the shared result matrices
+    All_Centroids [(num_centroids * (run_index - 1) + 1) : (num_centroids * run_index), ] = C;
+    final_wcss [run_index, 1] = wcss;
+    termination_code [run_index, 1] = term_code;
+    num_iterations [run_index, 1] = iter_count;
+}
+
+# STEP 3: SELECT THE RUN WITH BEST CENTROID-WCSS AND OUTPUT ITS CENTROIDS:
+
+# One row per run, one column per termination code (1 = converged, 2 = max iterations,
+# 3 = lost centroid); table() output is copied into a zero matrix to guarantee 3 columns
+termination_bitmap = matrix (0, rows = num_runs, cols = 3);
+termination_bitmap_raw = table (seq (1, num_runs, 1), termination_code);
+termination_bitmap [, 1 : ncol(termination_bitmap_raw)] = termination_bitmap_raw;
+termination_stats = colSums (termination_bitmap);
+print ("Number of successful runs = " + as.integer (as.scalar (termination_stats [1, 1])));
+print ("Number of incomplete runs = " + as.integer (as.scalar (termination_stats [1, 2])));
+print ("Number of failed runs (with lost centroids) = " + as.integer (as.scalar (termination_stats [1, 3])));
+
+num_successful_runs = as.scalar (termination_stats [1, 1]);
+if (num_successful_runs > 0) {
+    # WCSS of converged runs only (other runs contribute 0)
+    final_wcss_successful = final_wcss * termination_bitmap [, 1];
+    worst_wcss = max (final_wcss_successful);
+    # Non-converged runs get a penalty above worst_wcss so that they cannot win the minimum
+    best_wcss = min (final_wcss_successful + (10 * worst_wcss + 10) * (1 - termination_bitmap [, 1]));
+    avg_wcss = sum (final_wcss_successful) / num_successful_runs;
+    best_index_vector = (final_wcss_successful == best_wcss);
+    # Index of the first match = 1 + number of leading rows whose running count is still 0
+    aggr_best_index_vector = cumsum (best_index_vector);
+    best_index = as.integer (sum (aggr_best_index_vector == 0) + 1);
+    print ("Successful runs:  Best run is " + best_index + " with Centroid WCSS = " + best_wcss 
+        + ";  Avg WCSS = " + avg_wcss + ";  Worst WCSS = " + worst_wcss);
+    C = All_Centroids [(num_centroids * (best_index - 1) + 1) : (num_centroids * best_index), ];
+    print ("Writing out the best-WCSS centroids...");
+    write (C, $6, format="text");
+    print ("DONE.");
+} else {
+    stop ("No output is produced.  Try increasing the number of iterations and/or runs.");
+}
+
+
+
+# Draws "num_samples" random samples of record ids from 1..num_records, all at once.
+# Returns:
+#   sample_maps       - 0/1 matrix with (sample_block_size * num_samples) rows and num_records
+#                       columns; row i has a 1 in the column of the record held at position i
+#                       and is all-zero for unused (out-of-range) positions;
+#   sample_col_map    - 0/1 matrix with one column per sample that marks the row positions
+#                       belonging to that sample;
+#   sample_block_size - number of row positions reserved for each sample.
+# If approx_sample_size >= num_records, every sample is the complete record set.
+get_sample_maps = function (int num_records, int num_samples, int approx_sample_size)
+    return (Matrix[double] sample_maps, Matrix[double] sample_col_map, int sample_block_size)
+{
+    if (approx_sample_size < num_records) {
+        # Input value "approx_sample_size" is the average sample size; increase it by ~10 std.dev's
+        # to get the sample block size (to allocate space):
+        sample_block_size = as.integer (approx_sample_size + round (10 * sqrt (approx_sample_size)));
+        num_rows = sample_block_size * num_samples;
+        
+        # Generate all samples in parallel by converting uniform random values into random
+        # integer skip-ahead intervals and prefix-summing them:
+        sample_rec_ids = Rand (rows = sample_block_size, cols = num_samples, min = 0.0, max = 1.0);
+        sample_rec_ids = round (log (sample_rec_ids) / log (1.0 - approx_sample_size / num_records) + 0.5);
+        # Prob [k-1 < log(uniform)/log(1-p) < k] = p*(1-p)^(k-1) = Prob [k-1 zeros before a one]
+        sample_rec_ids = cumsum (sample_rec_ids);  #  (skip to next one) --> (skip to i-th one)
+        
+        # Replace all sample record ids over "num_records" (i.e. out of range) by "num_records + 1":
+        is_sample_rec_id_within_range = (sample_rec_ids <= num_records);
+        sample_rec_ids = sample_rec_ids * is_sample_rec_id_within_range 
+                       + (num_records + 1) * (1 - is_sample_rec_id_within_range);
+        
+        # Rearrange all samples (and their out-of-range indicators) into one column-vector:
+        sample_rec_ids = 
+            matrix (sample_rec_ids, rows = num_rows, cols = 1, byrow = FALSE);
+        is_row_in_samples = 
+            matrix (is_sample_rec_id_within_range, rows = num_rows, cols = 1, byrow = FALSE);
+
+        # Use contingency table to create the "sample_maps" matrix that is a vertical concatenation
+        # of 0-1-matrices, one per sample, each with 1s at (i, sample_record[i]) and 0s elsewhere:
+        sample_maps_raw = table (seq (1, num_rows), sample_rec_ids);
+        max_rec_id = ncol (sample_maps_raw);
+        # Bring the table to exactly num_records columns: cut off the out-of-range column
+        # "num_records + 1" if it is present, otherwise pad with zero columns on the right
+        if (max_rec_id >= num_records) {
+            sample_maps = sample_maps_raw [, 1 : num_records];
+        } else {
+            sample_maps = matrix (0, rows = num_rows, cols = num_records);        
+            sample_maps [, 1 : max_rec_id] = sample_maps_raw;
+        }
+        
+        # Create a 0-1-matrix that maps each sample column ID into all row positions of the
+        # corresponding sample; map out-of-sample-range positions to row id = num_rows + 1:
+        sample_positions = (num_rows + 1) - is_row_in_samples * seq (num_rows, 1, -1);
+        # Column ID positions = 1, 1, ..., 1, 2, 2, ..., 2, . . . , n_c, n_c, ..., n_c:
+        col_positions = round (0.5 + seq (0, num_rows - 1, 1) / sample_block_size);
+        sample_col_map = table (sample_positions, col_positions);
+        # Remove the out-of-sample-range positions by cutting off the last row:
+        sample_col_map = sample_col_map [1 : (num_rows), ];
+        
+    } else {
+        # No real sampling: each sample block is an identity map over all records
+        one_per_record = matrix (1, rows = num_records, cols = 1);
+        sample_block_size = num_records;
+        sample_maps    = matrix (0, rows = (num_records * num_samples), cols = num_records);
+        sample_col_map = matrix (0, rows = (num_records * num_samples), cols = num_samples);
+        for (i in 1:num_samples) {
+            sample_maps    [(num_records * (i - 1) + 1) : (num_records * i),  ] = diag (one_per_record);
+            sample_col_map [(num_records * (i - 1) + 1) : (num_records * i), i] = one_per_record;
+}   }   }
+

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/bbefe96b/src/test/scripts/functions/codegen/Algorithm_L2SVM.R
----------------------------------------------------------------------
diff --git a/src/test/scripts/functions/codegen/Algorithm_L2SVM.R b/src/test/scripts/functions/codegen/Algorithm_L2SVM.R
new file mode 100644
index 0000000..36e844e
--- /dev/null
+++ b/src/test/scripts/functions/codegen/Algorithm_L2SVM.R
@@ -0,0 +1,98 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+# 
+#   http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+# L2-regularized squared-hinge-loss SVM, trained by nonlinear conjugate gradient with a Newton
+# line search.
+# Arguments: args[1] = input directory prefix (reads "X.mtx" and "Y.mtx"), args[2] = intercept
+# flag (1 appends a column of ones to X), args[3] = convergence tolerance epsilon,
+# args[4] = maximum number of outer iterations, args[5] = output prefix (writes "w").
+args <- commandArgs(TRUE)
+library("Matrix")
+
+X = readMM(paste(args[1], "X.mtx", sep=""));
+Y = readMM(paste(args[1], "Y.mtx", sep=""));
+intercept = as.integer(args[2]);
+epsilon = as.double(args[3]);
+lambda = 0.001;
+maxiterations = as.integer(args[4]);
+
+# Y must hold exactly two distinct labels; they are mapped linearly onto -1 / +1 if needed
+check_min = min(Y)
+check_max = max(Y)
+num_min = sum(Y == check_min)
+num_max = sum(Y == check_max)
+if(num_min + num_max != nrow(Y)){ 
+	print("please check Y, it should contain only 2 labels") 
+}else{
+	if(check_min != -1 | check_max != +1) 
+		Y = 2/(check_max - check_min)*Y - (check_min + check_max)/(check_max - check_min)
+}
+
+N = nrow(X)
+D = ncol(X)
+
+if (intercept == 1) {
+	ones  = matrix(1,N,1)
+	X = cbind(X, ones);
+}
+
+num_rows_in_w = D
+if(intercept == 1){
+	num_rows_in_w = num_rows_in_w + 1
+}
+w = matrix(0, num_rows_in_w, 1)
+
+# Initial gradient / search direction at w = 0
+g_old = t(X) %*% Y
+s = g_old
+
+Xw = matrix(0,nrow(X),1)
+iter = 0
+continue = TRUE
+while(continue && iter < maxiterations){
+	# Newton line search for the step size t along direction s
+	t = 0
+	Xd = X %*% s
+	wd = lambda * sum(w * s)
+	dd = lambda * sum(s * s)
+	continue1 = TRUE
+	while(continue1){
+		tmp_Xw = Xw + t*Xd
+		out = 1 - Y * (tmp_Xw)
+		sv = which(out > 0)
+		# g and h: first and second derivative of the objective along s at step t
+		g = wd + t*dd - sum(out[sv] * Y[sv] * Xd[sv])
+		h = dd + sum(Xd[sv] * Xd[sv])
+		t = t - g/h
+		continue1 = (g*g/h >= 1e-10)
+	}
+	
+	w = w + t*s
+	Xw = Xw + t*Xd
+		
+	out = 1 - Y * (X %*% w)
+	sv = which(out > 0)
+	obj = 0.5 * sum(out[sv] * out[sv]) + lambda/2 * sum(w * w)
+	# drop = FALSE keeps X[sv, ] two-dimensional when exactly one row is selected; without it
+	# the row collapses to a plain vector and the product below becomes non-conformable
+	g_new = t(X[sv, , drop = FALSE]) %*% (out[sv] * Y[sv]) - lambda * w
+	
+	print(paste("OBJ : ", obj))
+
+	continue = (t*sum(s * g_old) >= epsilon*obj)
+	
+	# Conjugate-gradient update of the search direction
+	be = sum(g_new * g_new)/sum(g_old * g_old)
+	s = be * s + g_new
+	g_old = g_new
+	
+	iter = iter + 1
+}
+
+writeMM(as(w,"CsparseMatrix"), paste(args[5], "w", sep=""));

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/bbefe96b/src/test/scripts/functions/codegen/Algorithm_L2SVM.dml
----------------------------------------------------------------------
diff --git a/src/test/scripts/functions/codegen/Algorithm_L2SVM.dml b/src/test/scripts/functions/codegen/Algorithm_L2SVM.dml
new file mode 100644
index 0000000..9a6a631
--- /dev/null
+++ b/src/test/scripts/functions/codegen/Algorithm_L2SVM.dml
@@ -0,0 +1,106 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+# 
+#   http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+# Setup for the two-class SVM script: read inputs, normalize labels to -1/+1, initialize the model.
+X = read($1)
+Y = read($2)
+intercept = $3;
+eps = $4;
+maxiter = $5;
+
+check_min = min(Y)
+check_max = max(Y)
+num_min = sum(ppred(Y, check_min, "=="))
+num_max = sum(ppred(Y, check_max, "=="))
+if(num_min + num_max != nrow(Y)) print("please check Y, it should contain only 2 labels")  # only prints; execution continues
+else{
+	if(check_min != -1 | check_max != +1) 
+		Y = 2/(check_max - check_min)*Y - (check_min + check_max)/(check_max - check_min)  # affine map: check_min -> -1, check_max -> +1
+}
+
+epsilon = eps
+lambda = 0.001  # regularization weight used in obj and the gradient below
+maxiterations = maxiter
+num_samples = nrow(X)
+dimensions = ncol(X)
+
+if (intercept == 1) {
+	ones  = matrix(1, rows=num_samples, cols=1)
+	X = append(X, ones);  # adds a constant-1 column for the intercept
+}
+
+num_rows_in_w = dimensions
+if(intercept == 1){
+	num_rows_in_w = num_rows_in_w + 1  # one extra weight for the intercept column
+}
+w = matrix(0, rows=num_rows_in_w, cols=1)
+# With w = 0 every margin term is active, so the loop's g_new formula reduces to t(X) %*% Y.
+g_old = t(X) %*% Y
+s = g_old  # initial search direction
+
+Xw = matrix(0, rows=nrow(X), cols=1)  # cached X %*% w, kept in sync with w inside the loop
+debug_str = "# Iter, Obj"  # not read anywhere else in this script
+iter = 0
+continue = 1
+while(continue == 1 & iter < maxiterations)  {  # stops on the relative-decrease test below or after maxiterations passes
+	# minimizing primal obj along direction s
+	step_sz = 0
+	Xd = X %*% s
+	wd = lambda * sum(w * s)
+	dd = lambda * sum(s * s)
+	continue1 = 1
+	while(continue1 == 1){  # 1-D Newton iteration on the step size
+		tmp_Xw = Xw + step_sz*Xd
+		out = 1 - Y * (tmp_Xw)
+		sv = ppred(out, 0, ">")  # 0/1 mask of rows with positive margin violation
+		out = out * sv
+		g = wd + step_sz*dd - sum(out * Y * Xd)  # first derivative of obj w.r.t. step_sz
+		h = dd + sum(Xd * sv * Xd)  # second derivative of obj w.r.t. step_sz
+		step_sz = step_sz - g/h
+		if (g*g/h < 0.0000000001){
+			continue1 = 0
+		}
+	}
+	
+	#update weights
+	w = w + step_sz*s
+	Xw = Xw + step_sz*Xd
+	
+	out = 1 - Y * Xw
+	sv = ppred(out, 0, ">")
+	out = sv * out
+	obj = 0.5 * sum(out * out) + lambda/2 * sum(w * w)  # squared positive margin violations plus L2 penalty
+	g_new = t(X) %*% (out * Y) - lambda * w  # negative gradient of obj w.r.t. w
+	
+	print("OBJ = " + obj)	
+	tmp = sum(s * g_old)
+	if(step_sz*tmp < epsilon*obj){  # stop once step_sz * <s, g_old> falls below epsilon * obj
+		continue = 0
+	}
+	
+	#non-linear CG step
+	be = sum(g_new * g_new)/sum(g_old * g_old)  # ratio of squared norms of new and old g
+	s = be * s + g_new
+	g_old = g_new
+
+	iter = iter + 1
+}
+
+write(w, $6, format="text")

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/bbefe96b/src/test/scripts/functions/codegen/Algorithm_LinregCG.R
----------------------------------------------------------------------
diff --git a/src/test/scripts/functions/codegen/Algorithm_LinregCG.R b/src/test/scripts/functions/codegen/Algorithm_LinregCG.R
new file mode 100644
index 0000000..5dcad95
--- /dev/null
+++ b/src/test/scripts/functions/codegen/Algorithm_LinregCG.R
@@ -0,0 +1,57 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+# 
+#   http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+# R reference script: conjugate-gradient iteration for (t(X) %*% X + eps*I) w = t(X) %*% y, run for a fixed number of steps.
+args <- commandArgs(TRUE)
+options(digits=22)
+library("Matrix")
+
+X = readMM(paste(args[1], "X.mtx", sep=""))
+y = readMM(paste(args[1], "y.mtx", sep=""))
+
+intercept = as.integer(args[2]);
+eps = as.double(args[3]);  # added as eps * p in q below, i.e. a diagonal (ridge) term
+maxiter = as.double(args[4]);
+
+if( intercept == 1 ){
+   ones = matrix(1, nrow(X), 1); 
+   X = cbind(X, ones);  # constant-1 column for the intercept
+}
+
+r = -(t(X) %*% y);  # residual at w = 0
+p = -r;  # initial search direction
+norm_r2 = sum(r * r);
+w = matrix(0, ncol(X), 1);
+
+i = 0;
+while(i < maxiter) {  # no tolerance check: always runs maxiter iterations
+	q = ((t(X) %*% (X %*% p)) + eps  * p);  # (t(X) X + eps*I) %*% p without forming t(X) X
+	alpha = norm_r2 / ((t(p) %*% q)[1:1]);  # [1:1] extracts the single element of the 1x1 product
+	w = w + alpha * p;
+	old_norm_r2 = norm_r2;
+	r = r + alpha * q;
+	norm_r2 = sum(r * r);
+	beta = norm_r2 / old_norm_r2;
+	p = -r + beta * p;
+	i = i + 1;
+}
+
+writeMM(as(w,"CsparseMatrix"), paste(args[5], "w", sep=""))

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/bbefe96b/src/test/scripts/functions/codegen/Algorithm_LinregCG.dml
----------------------------------------------------------------------
diff --git a/src/test/scripts/functions/codegen/Algorithm_LinregCG.dml b/src/test/scripts/functions/codegen/Algorithm_LinregCG.dml
new file mode 100644
index 0000000..92f15d7
--- /dev/null
+++ b/src/test/scripts/functions/codegen/Algorithm_LinregCG.dml
@@ -0,0 +1,56 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+# 
+#   http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+# DML script: conjugate-gradient iteration for (t(X) %*% X + eps*I) w = t(X) %*% y, run for a fixed number of steps.
+X = read($1);
+y = read($2);
+intercept = $3;
+eps = $4;  # added as eps * p in q below, i.e. a diagonal (ridge) term
+maxiter = $5;
+
+if( intercept == 1 ){
+   ones = matrix(1, nrow(X), 1); 
+   X = append(X, ones);  # constant-1 column for the intercept
+}
+
+r = -(t(X) %*% y);  # residual at w = 0
+p = -r;  # initial search direction
+norm_r2 = sum(r * r);
+w = matrix(0, rows = ncol(X), cols = 1);
+
+i = 0;
+while(i < maxiter) {  # no tolerance check: always runs maxiter iterations
+	q = ((t(X) %*% (X %*% p)) + eps  * p);  # (t(X) X + eps*I) %*% p without forming t(X) X
+	alpha = norm_r2 / as.scalar(t(p) %*% q);
+	w = w + alpha * p;
+	old_norm_r2 = norm_r2;
+	r = r + alpha * q;
+	norm_r2 = sum(r * r);
+	beta = norm_r2 / old_norm_r2;
+	p = -r + beta * p;
+	i = i + 1;
+}
+
+write(w, $6);
+
+
+
+

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/bbefe96b/src/test/scripts/functions/codegen/Algorithm_MLogreg.R
----------------------------------------------------------------------
diff --git a/src/test/scripts/functions/codegen/Algorithm_MLogreg.R b/src/test/scripts/functions/codegen/Algorithm_MLogreg.R
new file mode 100644
index 0000000..121aba7
--- /dev/null
+++ b/src/test/scripts/functions/codegen/Algorithm_MLogreg.R
@@ -0,0 +1,278 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+# 
+#   http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+# R reference script setup: read inputs, fix constants, add the intercept column and derive column scaling.
+args <- commandArgs(TRUE)
+library("Matrix")
+library("matrixStats")
+
+X = readMM(paste(args[1], "X.mtx", sep=""));
+Y_vec = readMM(paste(args[1], "Y.mtx", sep=""));
+intercept = as.integer(args[2]);
+tol = as.double(args[3]);
+maxiter = as.integer(args[4]);
+
+intercept_status = intercept;  # 0 = none, 1 = add intercept column, 2 = add it and shift/scale X (see branches below)
+regularization = 0.001;
+maxinneriter = 0;  # 0 is replaced by D * K before the main loop
+
+print ("BEGIN MULTINOMIAL LOGISTIC REGRESSION SCRIPT");
+
+eta0 = 0.0001;  # a step is accepted only if rho > eta0
+eta1 = 0.25;  # eta1, eta2: rho thresholds that select the delta update rule
+eta2 = 0.75;
+sigma1 = 0.25;  # sigma1..sigma3: factors used when shrinking or growing delta
+sigma2 = 0.5;
+sigma3 = 4.0;
+psi = 0.1;  # inner loop stops when the residual norm is <= psi * norm_Grad
+
+N = nrow (X);
+D = ncol (X);
+
+# Introduce the intercept, shift and rescale the columns of X if needed
+if (intercept_status == 1 | intercept_status == 2)  # add the intercept column
+{
+    X = cbind (X, matrix (1, N, 1));
+    D = ncol (X);
+}
+
+scale_lambda = matrix (1, D, 1);
+if (intercept_status == 1 | intercept_status == 2)
+{
+    scale_lambda [D, 1] = 0;  # zero weight for row D (the intercept) when lambda is built from scale_lambda
+}
+
+if (intercept_status == 2)  # scale-&-shift X columns to mean 0, variance 1
+{                           # Important assumption: X [, D] = matrix (1, rows = N, cols = 1)
+    avg_X_cols = t(colSums(X)) / N;
+    var_X_cols = (t(colSums (X ^ 2)) - N * (avg_X_cols ^ 2)) / (N - 1);
+    is_unsafe = (var_X_cols <= 0.0);
+    scale_X = 1.0 / sqrt (var_X_cols * (1 - is_unsafe) + is_unsafe);  # columns flagged unsafe get scale 1
+    scale_X [D, 1] = 1;
+    shift_X = - avg_X_cols * scale_X;
+    shift_X [D, 1] = 0;
+    rowSums_X_sq = (X ^ 2) %*% (scale_X ^ 2) + X %*% (2 * scale_X * shift_X) + sum (shift_X ^ 2);
+} else {
+    scale_X = matrix (1, D, 1);
+    shift_X = matrix (0, D, 1);
+    rowSums_X_sq = rowSums (X ^ 2);
+}
+
+# Henceforth we replace "X" with "X %*% (SHIFT/SCALE TRANSFORM)" and rowSums(X ^ 2)
+# with "rowSums_X_sq" in order to preserve the sparsity of X under shift and scale.
+# The transform is then associatively applied to the other side of the expression,
+# and is rewritten via "scale_X" and "shift_X" as follows:
+#
+# ssX_A  = (SHIFT/SCALE TRANSFORM) %*% A    --- is rewritten as:
+# ssX_A  = diag (scale_X) %*% A;
+# ssX_A [D, ] = ssX_A [D, ] + t(shift_X) %*% A;
+#
+# tssX_A = t(SHIFT/SCALE TRANSFORM) %*% A   --- is rewritten as:
+# tssX_A = diag (scale_X) %*% A + shift_X %*% A [D, ];
+
+# Convert "Y_vec" into an indicator matrix:
+if (min (Y_vec) <= 0) { 
+    # Category labels "0", "-1" etc. (all values <= 0) are mapped to a new largest label, max_y + 1
+    max_y = max (Y_vec);
+    Y_vec  = Y_vec  + (- Y_vec  + max_y + 1) * (Y_vec <= 0.0);
+}
+Y = table (seq (1, N, 1), as.vector(Y_vec));
+Y = as.matrix(as.data.frame.matrix(Y)) #this is required due to different table semantics
+
+K = ncol (Y) - 1;   # The number of  non-baseline categories
+
+lambda = (scale_lambda %*% matrix (1, 1, K)) * regularization;
+delta = 0.5 * sqrt (D) / max (sqrt (rowSums_X_sq));  # initial trust-region radius
+
+B = matrix (0, D, K);     ### LT = X %*% (SHIFT/SCALE TRANSFORM) %*% B;
+                                        ### LT = append (LT, matrix (0, rows = N, cols = 1));
+                                        ### LT = LT - rowMaxs (LT) %*% matrix (1, rows = 1, cols = K+1);
+P = matrix (1, N, K+1);   ### exp_LT = exp (LT);
+P = P / (K + 1);                        ### P =  exp_LT / (rowSums (exp_LT) %*% matrix (1, rows = 1, cols = K+1));
+obj = N * log (K + 1);                  ### obj = - sum (Y * LT) + sum (log (rowSums (exp_LT))) + 0.5 * sum (lambda * (B_new ^ 2));
+
+Grad = t(X) %*% (P [, 1:K] - Y [, 1:K]);
+if (intercept_status == 2) {
+    Grad = diag (scale_X) %*% Grad + shift_X %*% Grad [D, ];
+}
+Grad = Grad + lambda * B;
+norm_Grad = sqrt (sum (Grad ^ 2));
+norm_Grad_initial = norm_Grad;  # reference value for the relative gradient test inside the loop
+
+if (maxinneriter == 0) {
+    maxinneriter = D * K;
+}
+iter = 1;
+
+# boolean for convergence check
+converge = (norm_Grad < tol) | (iter > maxiter);
+
+print (paste("-- Initially:  Objective = ", obj, ",  Gradient Norm = ", norm_Grad , ",  Trust Delta = " , delta));
+# Outer loop: each pass computes a step S within radius delta, then accepts or rejects it and resizes delta.
+while (! converge)
+{
+	# SOLVE TRUST REGION SUB-PROBLEM
+	S = matrix (0, D, K);  # accumulated step, starts at zero
+	R = - Grad;  # CG residual
+	V = R;  # CG search direction
+	delta2 = delta ^ 2;
+	inneriter = 1;
+	norm_R2 = sum (R ^ 2);
+	innerconverge = (sqrt (norm_R2) <= psi * norm_Grad);
+	is_trust_boundary_reached = 0;
+
+	while (! innerconverge)
+	{
+	    if (intercept_status == 2) {
+	        ssX_V = diag (scale_X) %*% V;
+	        ssX_V [D, ] = ssX_V [D, ] + t(shift_X) %*% V;
+	    } else {
+	        ssX_V = V;
+	    }
+        Q = P [, 1:K] * (X %*% ssX_V);
+        HV = t(X) %*% (Q - P [, 1:K] * (rowSums (Q) %*% matrix (1, 1, K)));
+        if (intercept_status == 2) {
+            HV = diag (scale_X) %*% HV + shift_X %*% HV [D, ];
+        }
+        HV = HV + lambda * V;  # matrix-vector product used by the CG recurrences below
+		alpha = norm_R2 / sum (V * HV);
+		Snew = S + alpha * V;
+		norm_Snew2 = sum (Snew ^ 2);
+		if (norm_Snew2 <= delta2)  # candidate step stays within the radius
+		{
+			S = Snew;
+			R = R - alpha * HV;
+			old_norm_R2 = norm_R2 
+			norm_R2 = sum (R ^ 2);
+			V = R + (norm_R2 / old_norm_R2) * V;
+			innerconverge = (sqrt (norm_R2) <= psi * norm_Grad);
+		} else {
+	        is_trust_boundary_reached = 1;
+			sv = sum (S * V);
+			v2 = sum (V ^ 2);
+			s2 = sum (S ^ 2);
+			rad = sqrt (sv ^ 2 + v2 * (delta2 - s2));
+			if (sv >= 0) {  # both branches give the same root of sum((S + alpha*V)^2) = delta2, written in two forms
+				alpha = (delta2 - s2) / (sv + rad);
+			} else {
+				alpha = (rad - sv) / v2;
+			}
+			S = S + alpha * V;
+			R = R - alpha * HV;
+			innerconverge = TRUE;
+		}
+	    inneriter = inneriter + 1;
+	    innerconverge = innerconverge | (inneriter > maxinneriter);
+	}  
+	
+	# END TRUST REGION SUB-PROBLEM
+	
+	# compute rho, update B, obtain delta
+	gs = sum (S * Grad);
+	qk = - 0.5 * (gs - sum (S * R));  # printed below as the "Predicted" objective reduction
+	B_new = B + S;
+	if (intercept_status == 2) {
+	    ssX_B_new = diag (scale_X) %*% B_new;
+	    ssX_B_new [D, ] = ssX_B_new [D, ] + t(shift_X) %*% B_new;
+    } else {
+        ssX_B_new = B_new;
+    }
+    
+    LT = as.matrix(cbind ((X %*% ssX_B_new), matrix (0, N, 1)));  # zero column appended as column K+1
+    LT = LT - rowMaxs (LT) %*% matrix (1, 1, K+1);  # subtract each row's max so every exp() argument is <= 0
+    exp_LT = exp (LT);
+    P_new  = exp_LT / (rowSums (exp_LT) %*% matrix (1, 1, K+1));
+    obj_new = - sum (Y * LT) + sum (log (rowSums (exp_LT))) + 0.5 * sum (lambda * (B_new ^ 2));
+    	
+	# Consider updating LT in the inner loop
+	# Consider the big "obj" and "obj_new" rounding-off their small difference below:
+
+	actred = (obj - obj_new);  # printed below as the "Actual" objective reduction
+	
+	rho = actred / qk;
+	is_rho_accepted = (rho > eta0);
+	snorm = sqrt (sum (S ^ 2));
+
+	if (iter == 1) {
+	   delta = min (delta, snorm);
+	}
+
+	alpha2 = obj_new - obj - gs;
+	if (alpha2 <= 0) {
+	   alpha = sigma3;
+	} 
+	else {
+	   alpha = max (sigma1, -0.5 * gs / alpha2);
+	}
+	
+	if (rho < eta0) {  # the rule for the new delta depends on which [eta] interval rho falls into
+		delta = min (max (alpha, sigma1) * snorm, sigma2 * delta);
+	}
+	else {
+		if (rho < eta1) {
+			delta = max (sigma1 * delta, min (alpha * snorm, sigma2 * delta));
+		}
+		else { 
+			if (rho < eta2) {
+				delta = max (sigma1 * delta, min (alpha * snorm, sigma3 * delta));
+			}
+			else {
+				delta = max (delta, min (alpha * snorm, sigma3 * delta));
+			}
+		}
+	} 
+	
+	if (is_trust_boundary_reached == 1)
+	{
+	    print (paste("-- Outer Iteration " , iter , ": Had " , (inneriter - 1) , " CG iterations, trust bound REACHED"));
+	} else {
+	    print (paste("-- Outer Iteration " , iter , ": Had " , (inneriter - 1) , " CG iterations"));
+	}
+	print (paste("   -- Obj.Reduction:  Actual = " , actred , ",  Predicted = " , qk , 
+	       "  (A/P: " , (round (10000.0 * rho) / 10000.0) , "),  Trust Delta = " , delta));
+	       
+	if (is_rho_accepted)  # B, P, Grad and obj change only when the step is accepted
+	{
+		B = B_new;
+		P = P_new;
+		Grad = t(X) %*% (P [, 1:K] - Y [, 1:K]);
+		if (intercept_status == 2) {
+		    Grad = diag (scale_X) %*% Grad + shift_X %*% Grad [D, ];
+		}
+		Grad = Grad + lambda * B;
+		norm_Grad = sqrt (sum (Grad ^ 2));
+		obj = obj_new;
+	    print (paste("   -- New Objective = " , obj , ",  Beta Change Norm = " , snorm , ",  Gradient Norm = " , norm_Grad));
+	} 
+	
+	iter = iter + 1;
+	converge = ((norm_Grad < (tol * norm_Grad_initial)) | (iter > maxiter) |
+	    ((is_trust_boundary_reached == 0) & (abs (actred) < (abs (obj) + abs (obj_new)) * 0.00000000000001)));
+    if (converge) { print ("Termination / Convergence condition satisfied."); } else { print (" "); }
+} 
+# Apply the shift/scale transform to B (only when intercept_status == 2) and write the result.
+if (intercept_status == 2) {
+    B_out = diag (scale_X) %*% B;
+    B_out [D, ] = B_out [D, ] + t(shift_X) %*% B;
+} else {
+    B_out = B;
+}
+
+writeMM(as(B_out,"CsparseMatrix"), paste(args[5], "w", sep=""));

[6/9] incubator-systemml git commit: [SYSTEMML-1286] Code generator compiler integration, incl tests

Posted by mb...@apache.org.

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/bbefe96b/src/test/scripts/functions/codegen/Algorithm_MLogreg.dml
----------------------------------------------------------------------
diff --git a/src/test/scripts/functions/codegen/Algorithm_MLogreg.dml b/src/test/scripts/functions/codegen/Algorithm_MLogreg.dml
new file mode 100644
index 0000000..88c05d9
--- /dev/null
+++ b/src/test/scripts/functions/codegen/Algorithm_MLogreg.dml
@@ -0,0 +1,274 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+# 
+#   http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+# DML script setup: read inputs, fix constants, add the intercept column and derive column scaling.
+X = read($1)
+Y_vec = read($2)
+intercept = $3;
+tol = $4;
+maxiter = $5;
+
+intercept_status = intercept;  # 0 = none, 1 = add intercept column, 2 = add it and shift/scale X (see branches below)
+regularization = 0.001;
+maxinneriter = 0;  # 0 is replaced by D * K before the main loop
+
+print ("BEGIN MULTINOMIAL LOGISTIC REGRESSION SCRIPT");
+
+eta0 = 0.0001;  # a step is accepted only if rho > eta0
+eta1 = 0.25;  # eta1, eta2: rho thresholds that select the delta update rule
+eta2 = 0.75;
+sigma1 = 0.25;  # sigma1..sigma3: factors used when shrinking or growing delta
+sigma2 = 0.5;
+sigma3 = 4.0;
+psi = 0.1;  # inner loop stops when the residual norm is <= psi * norm_Grad
+
+N = nrow (X);
+D = ncol (X);
+
+# Introduce the intercept, shift and rescale the columns of X if needed
+if (intercept_status == 1 | intercept_status == 2)  # add the intercept column
+{
+    X = append (X, matrix (1, rows = N, cols = 1));
+    D = ncol (X);
+}
+
+scale_lambda = matrix (1, rows = D, cols = 1);
+if (intercept_status == 1 | intercept_status == 2)
+{
+    scale_lambda [D, 1] = 0;  # zero weight for row D (the intercept) when lambda is built from scale_lambda
+}
+
+if (intercept_status == 2)  # scale-&-shift X columns to mean 0, variance 1
+{                           # Important assumption: X [, D] = matrix (1, rows = N, cols = 1)
+    avg_X_cols = t(colSums(X)) / N;
+    var_X_cols = (t(colSums (X ^ 2)) - N * (avg_X_cols ^ 2)) / (N - 1);
+    is_unsafe = ppred (var_X_cols, 0.0, "<=");
+    scale_X = 1.0 / sqrt (var_X_cols * (1 - is_unsafe) + is_unsafe);  # columns flagged unsafe get scale 1
+    scale_X [D, 1] = 1;
+    shift_X = - avg_X_cols * scale_X;
+    shift_X [D, 1] = 0;
+    rowSums_X_sq = (X ^ 2) %*% (scale_X ^ 2) + X %*% (2 * scale_X * shift_X) + sum (shift_X ^ 2);
+} else {
+    scale_X = matrix (1, rows = D, cols = 1);
+    shift_X = matrix (0, rows = D, cols = 1);
+    rowSums_X_sq = rowSums (X ^ 2);
+}
+
+# Henceforth we replace "X" with "X %*% (SHIFT/SCALE TRANSFORM)" and rowSums(X ^ 2)
+# with "rowSums_X_sq" in order to preserve the sparsity of X under shift and scale.
+# The transform is then associatively applied to the other side of the expression,
+# and is rewritten via "scale_X" and "shift_X" as follows:
+#
+# ssX_A  = (SHIFT/SCALE TRANSFORM) %*% A    --- is rewritten as:
+# ssX_A  = diag (scale_X) %*% A;
+# ssX_A [D, ] = ssX_A [D, ] + t(shift_X) %*% A;
+#
+# tssX_A = t(SHIFT/SCALE TRANSFORM) %*% A   --- is rewritten as:
+# tssX_A = diag (scale_X) %*% A + shift_X %*% A [D, ];
+
+# Convert "Y_vec" into an indicator matrix:
+max_y = max (Y_vec);
+if (min (Y_vec) <= 0) { 
+    # Category labels "0", "-1" etc. (all values <= 0) are mapped to a new largest label, max_y + 1
+    Y_vec  = Y_vec  + (- Y_vec  + max_y + 1) * (Y_vec <= 0);
+    max_y = max_y + 1;
+}
+Y = table (seq (1, N, 1), Y_vec, N, max_y);
+K = ncol (Y) - 1;   # The number of  non-baseline categories
+
+
+lambda = (scale_lambda %*% matrix (1, rows = 1, cols = K)) * regularization;
+delta = 0.5 * sqrt (D) / max (sqrt (rowSums_X_sq));  # initial trust-region radius
+
+B = matrix (0, rows = D, cols = K);     ### LT = X %*% (SHIFT/SCALE TRANSFORM) %*% B;
+                                        ### LT = append (LT, matrix (0, rows = N, cols = 1));
+                                        ### LT = LT - rowMaxs (LT) %*% matrix (1, rows = 1, cols = K+1);
+P = matrix (1, rows = N, cols = K+1);   ### exp_LT = exp (LT);
+P = P / (K + 1);                        ### P =  exp_LT / (rowSums (exp_LT) %*% matrix (1, rows = 1, cols = K+1));
+obj = N * log (K + 1);                  ### obj = - sum (Y * LT) + sum (log (rowSums (exp_LT))) + 0.5 * sum (lambda * (B_new ^ 2));
+
+Grad = t(X) %*% (P [, 1:K] - Y [, 1:K]);
+if (intercept_status == 2) {
+    Grad = diag (scale_X) %*% Grad + shift_X %*% Grad [D, ];
+}
+Grad = Grad + lambda * B;
+norm_Grad = sqrt (sum (Grad ^ 2));
+norm_Grad_initial = norm_Grad;  # reference value for the relative gradient test inside the loop
+
+if (maxinneriter == 0) {
+    maxinneriter = D * K;
+}
+iter = 1;
+
+# boolean for convergence check
+converge = (norm_Grad < tol) | (iter > maxiter);
+
+print ("-- Initially:  Objective = " + obj + ",  Gradient Norm = " + norm_Grad + ",  Trust Delta = " + delta);
+# Outer loop: each pass computes a step S within radius delta, then accepts or rejects it and resizes delta.
+while (! converge)
+{
+	# SOLVE TRUST REGION SUB-PROBLEM
+	S = matrix (0, rows = D, cols = K);  # accumulated step, starts at zero
+	R = - Grad;  # CG residual
+	V = R;  # CG search direction
+	delta2 = delta ^ 2;
+	inneriter = 1;
+	norm_R2 = sum (R ^ 2);
+	innerconverge = (sqrt (norm_R2) <= psi * norm_Grad);
+	is_trust_boundary_reached = 0;
+
+	while (! innerconverge)
+	{
+	    if (intercept_status == 2) {
+	        ssX_V = diag (scale_X) %*% V;
+	        ssX_V [D, ] = ssX_V [D, ] + t(shift_X) %*% V;
+	    } else {
+	        ssX_V = V;
+	    }
+        Q = P [, 1:K] * (X %*% ssX_V);
+        HV = t(X) %*% (Q - P [, 1:K] * (rowSums (Q) %*% matrix (1, rows = 1, cols = K)));
+        if (intercept_status == 2) {
+            HV = diag (scale_X) %*% HV + shift_X %*% HV [D, ];
+        }
+        HV = HV + lambda * V;  # matrix-vector product used by the CG recurrences below
+		alpha = norm_R2 / sum (V * HV);
+		Snew = S + alpha * V;
+		norm_Snew2 = sum (Snew ^ 2);
+		if (norm_Snew2 <= delta2)  # candidate step stays within the radius
+		{
+			S = Snew;
+			R = R - alpha * HV;
+			old_norm_R2 = norm_R2 
+			norm_R2 = sum (R ^ 2);
+			V = R + (norm_R2 / old_norm_R2) * V;
+			innerconverge = (sqrt (norm_R2) <= psi * norm_Grad);
+		} else {
+	        is_trust_boundary_reached = 1;
+			sv = sum (S * V);
+			v2 = sum (V ^ 2);
+			s2 = sum (S ^ 2);
+			rad = sqrt (sv ^ 2 + v2 * (delta2 - s2));
+			if (sv >= 0) {  # both branches give the same root of sum((S + alpha*V)^2) = delta2, written in two forms
+				alpha = (delta2 - s2) / (sv + rad);
+			} else {
+				alpha = (rad - sv) / v2;
+			}
+			S = S + alpha * V;
+			R = R - alpha * HV;
+			innerconverge = TRUE;
+		}
+	    inneriter = inneriter + 1;
+	    innerconverge = innerconverge | (inneriter > maxinneriter);
+	}  
+	
+	# END TRUST REGION SUB-PROBLEM
+	
+	# compute rho, update B, obtain delta
+	gs = sum (S * Grad);
+	qk = - 0.5 * (gs - sum (S * R));  # printed below as the "Predicted" objective reduction
+	B_new = B + S;
+	if (intercept_status == 2) {
+	    ssX_B_new = diag (scale_X) %*% B_new;
+	    ssX_B_new [D, ] = ssX_B_new [D, ] + t(shift_X) %*% B_new;
+    } else {
+        ssX_B_new = B_new;
+    }
+    
+    LT = append ((X %*% ssX_B_new), matrix (0, rows = N, cols = 1));  # zero column appended as column K+1
+    LT = LT - rowMaxs (LT) %*% matrix (1, rows = 1, cols = K+1);  # subtract each row's max so every exp() argument is <= 0
+    exp_LT = exp (LT);
+    P_new  = exp_LT / (rowSums (exp_LT) %*% matrix (1, rows = 1, cols = K+1));
+    obj_new = - sum (Y * LT) + sum (log (rowSums (exp_LT))) + 0.5 * sum (lambda * (B_new ^ 2));
+    	
+	# Consider updating LT in the inner loop
+	# Consider the big "obj" and "obj_new" rounding-off their small difference below:
+
+	actred = (obj - obj_new);  # printed below as the "Actual" objective reduction
+	
+	rho = actred / qk;
+	is_rho_accepted = (rho > eta0);
+	snorm = sqrt (sum (S ^ 2));
+
+	if (iter == 1) {
+	   delta = min (delta, snorm);
+	}
+
+	alpha2 = obj_new - obj - gs;
+	if (alpha2 <= 0) {
+	   alpha = sigma3;
+	} 
+	else {
+	   alpha = max (sigma1, -0.5 * gs / alpha2);
+	}
+	
+	if (rho < eta0) {  # the rule for the new delta depends on which [eta] interval rho falls into
+		delta = min (max (alpha, sigma1) * snorm, sigma2 * delta);
+	}
+	else {
+		if (rho < eta1) {
+			delta = max (sigma1 * delta, min (alpha * snorm, sigma2 * delta));
+		}
+		else { 
+			if (rho < eta2) {
+				delta = max (sigma1 * delta, min (alpha * snorm, sigma3 * delta));
+			}
+			else {
+				delta = max (delta, min (alpha * snorm, sigma3 * delta));
+			}
+		}
+	} 
+	
+	if (is_trust_boundary_reached == 1)
+	{
+	    print ("-- Outer Iteration " + iter + ": Had " + (inneriter - 1) + " CG iterations, trust bound REACHED");
+	} else {
+	    print ("-- Outer Iteration " + iter + ": Had " + (inneriter - 1) + " CG iterations");
+	}
+	print ("   -- Obj.Reduction:  Actual = " + actred + ",  Predicted = " + qk + 
+	       "  (A/P: " + (round (10000.0 * rho) / 10000.0) + "),  Trust Delta = " + delta);
+	       
+	if (is_rho_accepted)  # B, P, Grad and obj change only when the step is accepted
+	{
+		B = B_new;
+		P = P_new;
+		Grad = t(X) %*% (P [, 1:K] - Y [, 1:K]);
+		if (intercept_status == 2) {
+		    Grad = diag (scale_X) %*% Grad + shift_X %*% Grad [D, ];
+		}
+		Grad = Grad + lambda * B;
+		norm_Grad = sqrt (sum (Grad ^ 2));
+		obj = obj_new;
+	    print ("   -- New Objective = " + obj + ",  Beta Change Norm = " + snorm + ",  Gradient Norm = " + norm_Grad);
+	} 
+	
+	iter = iter + 1;
+	converge = ((norm_Grad < (tol * norm_Grad_initial)) | (iter > maxiter) |
+	    ((is_trust_boundary_reached == 0) & (abs (actred) < (abs (obj) + abs (obj_new)) * 0.00000000000001)));
+    if (converge) { print ("Termination / Convergence condition satisfied."); } else { print (" "); }
+} 
+# Apply the shift/scale transform to B (only when intercept_status == 2) and write the result.
+if (intercept_status == 2) {
+    B_out = diag (scale_X) %*% B;
+    B_out [D, ] = B_out [D, ] + t(shift_X) %*% B;
+} else {
+    B_out = B;
+}
+write (B_out, $6);
+

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/bbefe96b/src/test/scripts/functions/codegen/Algorithm_MSVM.R
----------------------------------------------------------------------
diff --git a/src/test/scripts/functions/codegen/Algorithm_MSVM.R b/src/test/scripts/functions/codegen/Algorithm_MSVM.R
new file mode 100644
index 0000000..52a898b
--- /dev/null
+++ b/src/test/scripts/functions/codegen/Algorithm_MSVM.R
@@ -0,0 +1,133 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+# 
+#   http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+# R reference script setup: read inputs and derive problem sizes for the per-class SVM training below.
+args <- commandArgs(TRUE)
+library("Matrix")
+
+X = readMM(paste(args[1], "X.mtx", sep=""));
+Y = readMM(paste(args[1], "Y.mtx", sep=""));
+intercept = as.integer(args[2]);
+epsilon = as.double(args[3]);
+lambda = 0.001;
+max_iterations = as.integer(args[4]);
+
+
+if(nrow(X) < 2)
+	stop("Stopping due to invalid inputs: Not possible to learn a classifier without at least 2 rows")
+
+lambda = 0.001  # repeats the assignment above with the same value
+num_samples = nrow(X)
+dimensions = nrow(X)  # NOTE(review): same value as num_samples; stored in the model's extra parameter rows at the end - confirm nrow is intended
+num_features = ncol(X)
+
+min_y = min(Y)  # not read again in this script
+num_classes = max(Y)  # largest label value is used as the class count
+mod1 = Y %% 1
+mod1_should_be_nrow = sum(abs(mod1==0))  # count of integral labels; not read again in this script
+	
+# Append the intercept column, allocate the model, then train one -1/+1 SVM per class label.
+if (intercept == 1) {
+	ones  = matrix(1, num_samples, 1);
+	X = cbind(X, ones);  # base append() concatenates vectors and would flatten X; cbind adds the column
+}
+
+num_rows_in_w = num_features
+if(intercept == 1){
+	num_rows_in_w = num_rows_in_w + 1  # one extra weight for the intercept column
+}
+w = matrix(0, num_rows_in_w, num_classes)  # one weight column per class
+
+debug_mat = matrix(-1, max_iterations, num_classes)  # objective per (iteration, class); -1 where never written
+for(iter_class in 1:num_classes){
+	Y_local = 2 * (Y == iter_class) - 1  # +1 for rows of the current class, -1 for all others
+	w_class = matrix(0, num_features, 1)
+	if (intercept == 1) {
+		zero_matrix = matrix(0, 1, 1);
+		w_class = rbind(w_class, zero_matrix);  # stays a column vector with one extra row for the intercept
+	}
+	# With w_class = 0 every margin term is active, so the g_new formula below reduces to t(X) %*% Y_local.
+	g_old = t(X) %*% Y_local
+	s = g_old  # initial search direction
+
+	Xw = matrix(0, nrow(X), 1)  # cached X %*% w_class, kept in sync with w_class
+	iter = 0
+	continue = 1
+	while(continue == 1)  {
+		# minimizing primal obj along direction s
+		step_sz = 0
+		Xd = X %*% s
+		wd = lambda * sum(w_class * s)
+		dd = lambda * sum(s * s)
+		continue1 = 1
+		while(continue1 == 1){  # 1-D Newton iteration on the step size
+			tmp_Xw = Xw + step_sz*Xd
+			out = 1 - Y_local * (tmp_Xw)
+			sv = (out > 0)  # mask of rows with positive margin violation
+			out = out * sv
+			g = wd + step_sz*dd - sum(out * Y_local * Xd)  # first derivative of obj w.r.t. step_sz
+			h = dd + sum(Xd * sv * Xd)  # second derivative of obj w.r.t. step_sz
+			step_sz = step_sz - g/h
+			if (g*g/h < 0.0000000001){
+				continue1 = 0
+			}
+		}
+
+		#update weights
+		w_class = w_class + step_sz*s
+		Xw = Xw + step_sz*Xd
+
+		out = 1 - Y_local * Xw
+		sv = (out > 0)
+		out = sv * out
+		obj = 0.5 * sum(out * out) + lambda/2 * sum(w_class * w_class)
+		g_new = t(X) %*% (out * Y_local) - lambda * w_class  # negative gradient of obj w.r.t. w_class
+
+		tmp = sum(s * g_old)
+
+		train_acc = sum( (Y_local*(X%*%w_class))>= 0)/num_samples*100
+		print(paste("For class " , iter_class , " iteration " , iter , " training accuracy: " , train_acc))
+		debug_mat[iter+1,iter_class] = obj
+
+		if((step_sz*tmp < epsilon*obj) | (iter >= max_iterations-1)){
+			continue = 0
+		}
+
+		#non-linear CG step
+		be = sum(g_new * g_new)/sum(g_old * g_old)  # ratio of squared norms of new and old g
+		s = be * s + g_new
+		g_old = g_new
+
+		if(sum(s^2) == 0){  # zero search direction: nothing left to optimize for this class
+			continue = 0
+		}
+
+		iter = iter + 1
+	}
+
+	w[,iter_class] = as.matrix(w_class)
+}
+# Append two extra rows below w (intercept flag and "dimensions" in column 1, zeros elsewhere) and write the model.
+extra_model_params = matrix(0, 2, ncol(w))
+extra_model_params[1, 1] = intercept
+extra_model_params[2, 1] = dimensions
+w = t(cbind(t(w), t(extra_model_params)))  # equivalent to stacking extra_model_params under w
+
+writeMM(as(w,"CsparseMatrix"), paste(args[5], "w", sep=""));

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/bbefe96b/src/test/scripts/functions/codegen/Algorithm_MSVM.dml
----------------------------------------------------------------------
diff --git a/src/test/scripts/functions/codegen/Algorithm_MSVM.dml b/src/test/scripts/functions/codegen/Algorithm_MSVM.dml
new file mode 100644
index 0000000..0ab739f
--- /dev/null
+++ b/src/test/scripts/functions/codegen/Algorithm_MSVM.dml
@@ -0,0 +1,150 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+# 
+#   http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+X = read($1)  # feature matrix, one sample per row
+Y = read($2)  # label vector; must hold integral class ids >= 1 (checked below)
+intercept = $3;
+eps = $4;
+maxiter = $5;
+# A classifier cannot be trained on fewer than two rows.
+if(nrow(X) < 2)
+	stop("Stopping due to invalid inputs: Not possible to learn a classifier without at least 2 rows")
+
+epsilon = eps
+lambda = 0.001  # regularization constant, fixed in this script
+max_iterations = maxiter
+num_samples = nrow(X)
+dimensions = nrow(X)  # NOTE(review): a row count, same value as num_samples; stored in the model below - confirm ncol(X) was not intended
+num_features = ncol(X)
+# Input validation: matching row counts, intercept flag, label encoding, and parameter ranges.
+
+if(nrow(X) != nrow(Y))
+	stop("Stopping due to invalid argument: Numbers of rows in X and Y must match")
+
+if(intercept != 0 & intercept != 1)
+	stop("Stopping due to invalid argument: Currently supported intercept options are 0 and 1")
+
+min_y = min(Y)
+if(min_y < 1)
+	stop("Stopping due to invalid argument: Label vector (Y) must be recoded")
+num_classes = max(Y)
+if(num_classes == 1)
+	stop("Stopping due to invalid argument: Maximum label value is 1, need more than one class to learn a multi-class classifier")	
+mod1 = Y %% 1  # fractional part of each label
+mod1_should_be_nrow = sum(abs(ppred(mod1, 0, "==")))  # number of labels whose fractional part is zero
+if(mod1_should_be_nrow != nrow(Y))
+	stop("Stopping due to invalid argument: Please ensure that Y contains (positive) integral labels")
+	
+if(epsilon < 0)
+	stop("Stopping due to invalid argument: Tolerance (tol) must be non-negative")
+
+if(lambda < 0)
+	stop("Stopping due to invalid argument: Regularization constant (reg) must be non-negative")
+
+if(max_iterations < 1)
+	stop("Stopping due to invalid argument: Maximum iterations should be a positive integer")
+# With an intercept, append a column of ones to X so that the last weight row acts as the bias term.
+if (intercept == 1) {
+	ones  = matrix(1, rows=num_samples, cols=1);
+	X = append(X, ones);
+}
+# w holds one weight column per class, with one extra row when the intercept is used.
+num_rows_in_w = num_features
+if(intercept == 1){
+	num_rows_in_w = num_rows_in_w + 1
+}
+w = matrix(0, rows=num_rows_in_w, cols=num_classes)
+# Objective value per (iteration, class); cells keep -1 where no iteration ran.
+debug_mat = matrix(-1, rows=max_iterations, cols=num_classes)
+parfor(iter_class in 1:num_classes){		  # one independent binary problem per class
+	Y_local = 2 * ppred(Y, iter_class, "==") - 1  # +1 for rows of this class, -1 otherwise
+	w_class = matrix(0, rows=num_features, cols=1)
+	if (intercept == 1) {
+		zero_matrix = matrix(0, rows=1, cols=1);
+		w_class = t(append(t(w_class), zero_matrix));
+	}
+	# g_old equals the g_new expression below evaluated at w_class = 0; it is also the first direction s.
+	g_old = t(X) %*% Y_local
+	s = g_old
+	# Xw caches X %*% w_class and is updated incrementally after every step.
+	Xw = matrix(0, rows=nrow(X), cols=1)
+	iter = 0
+	continue = 1
+	while(continue == 1)  {
+		# minimize the primal objective along direction s using Newton iterations on step_sz
+ 		step_sz = 0
+ 		Xd = X %*% s
+ 		wd = lambda * sum(w_class * s)
+		dd = lambda * sum(s * s)
+		continue1 = 1
+		while(continue1 == 1){
+ 			tmp_Xw = Xw + step_sz*Xd
+ 			out = 1 - Y_local * (tmp_Xw)
+ 			sv = ppred(out, 0, ">")
+ 			out = out * sv
+ 			g = wd + step_sz*dd - sum(out * Y_local * Xd)
+ 			h = dd + sum(Xd * sv * Xd)
+ 			step_sz = step_sz - g/h  # Newton update: g is the first and h the second derivative w.r.t. step_sz
+ 			if (g*g/h < 0.0000000001){
+			continue1 = 0
+		}  # closes the if above (indentation is misleading)
+	}  # closes the inner while (line search)
+ 
+		# update the weights and the cached X %*% w_class
+		w_class = w_class + step_sz*s
+		Xw = Xw + step_sz*Xd
+		# squared hinge loss plus L2 penalty; only rows with out > 0 contribute to the loss
+		out = 1 - Y_local * Xw
+		sv = ppred(out, 0, ">")
+		out = sv * out
+		obj = 0.5 * sum(out * out) + lambda/2 * sum(w_class * w_class)
+  		g_new = t(X) %*% (out * Y_local) - lambda * w_class
+
+  		tmp = sum(s * g_old)
+		# share of samples with a non-negative margin, in percent
+  		train_acc = sum(ppred(Y_local*(X%*%w_class), 0, ">="))/num_samples*100
+  		print("For class " + iter_class + " iteration " + iter + " training accuracy: " + train_acc)
+  		debug_mat[iter+1,iter_class] = obj	   
+		# stop when step_sz*tmp falls below epsilon*obj or the iteration limit is reached
+  		if((step_sz*tmp < epsilon*obj) | (iter >= max_iterations-1)){
+   			continue = 0
+  		}
+ 
+  		# non-linear CG step: scale the previous direction by be and add the new g
+  		be = sum(g_new * g_new)/sum(g_old * g_old)
+  		s = be * s + g_new
+  		g_old = g_new
+		# an all-zero direction also ends the loop
+		if(sum(s^2) == 0){
+	    	continue = 0
+		}
+
+  		iter = iter + 1
+ 	}
+	# store this class's weights as column iter_class of w
+	w[,iter_class] = w_class
+}
+
+extra_model_params = matrix(0, rows=2, cols=ncol(w))  # two metadata rows; only column 1 is set
+extra_model_params[1, 1] = intercept
+extra_model_params[2, 1] = dimensions
+w = t(append(t(w), t(extra_model_params)))  # stacks the two metadata rows below the weight rows
+write(w, $6, format="text")

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/bbefe96b/src/test/scripts/functions/codegen/Algorithm_PNMF.R
----------------------------------------------------------------------
diff --git a/src/test/scripts/functions/codegen/Algorithm_PNMF.R b/src/test/scripts/functions/codegen/Algorithm_PNMF.R
new file mode 100644
index 0000000..a2fbb57
--- /dev/null
+++ b/src/test/scripts/functions/codegen/Algorithm_PNMF.R
@@ -0,0 +1,43 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+# 
+#   http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+args <- commandArgs(TRUE)
+library("Matrix")
+# Load X and the initial factors W and H from the input prefix args[1].
+X <- readMM(paste(args[1], "X.mtx", sep = ""))
+W <- readMM(paste(args[1], "W.mtx", sep = ""))
+H <- readMM(paste(args[1], "H.mtx", sep = ""))
+# Scalar parameters; k is parsed but not referenced below.
+k <- as.integer(args[2])
+eps <- as.double(args[3])
+max_iter <- as.integer(args[4])
+iter <- 1
+# Multiplicative updates of H and then W; the body runs while iter < max_iter.
+while (iter < max_iter) {
+   H <- (H * (t(W) %*% (X / (W %*% H + eps)))) / (colSums(W) %*% matrix(1, 1, ncol(H)))
+   W <- (W * ((X / (W %*% H + eps)) %*% t(H))) / (matrix(1, nrow(W), 1) %*% t(rowSums(H)))
+   obj <- sum(W %*% H) - sum(X * log(W %*% H + eps))
+   print(paste("obj=", obj))
+   iter <- iter + 1
+}
+# Write both factors as MatrixMarket files under the output prefix args[5].
+writeMM(as(W, "CsparseMatrix"), paste(args[5], "W", sep = ""))
+writeMM(as(H, "CsparseMatrix"), paste(args[5], "H", sep = ""))

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/bbefe96b/src/test/scripts/functions/codegen/Algorithm_PNMF.dml
----------------------------------------------------------------------
diff --git a/src/test/scripts/functions/codegen/Algorithm_PNMF.dml b/src/test/scripts/functions/codegen/Algorithm_PNMF.dml
new file mode 100644
index 0000000..641cc09
--- /dev/null
+++ b/src/test/scripts/functions/codegen/Algorithm_PNMF.dml
@@ -0,0 +1,40 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+# 
+#   http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+X = read($1);
+W = read($2);
+H = read($3);
+# Scalar parameters; k is read but not referenced below.
+k = $4; 
+eps = $5; 
+max_iter = $6;
+iter = 1;
+# Multiplicative updates of H and then W; the body runs while iter < max_iter.
+while( iter < max_iter ) {
+   H = (H*(t(W)%*%(X/(W%*%H+eps)))) / t(colSums(W));
+   W = (W*((X/(W%*%H+eps))%*%t(H))) / t(rowSums(H));
+   obj = sum(W%*%H) - sum(X*log(W%*%H+eps));
+   print("iter=" + iter + " obj=" + obj);
+   iter = iter + 1;
+}
+# Write the two factors to the paths given as arguments 7 and 8.
+write(W, $7);
+write(H, $8);

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/bbefe96b/src/test/scripts/functions/codegen/DAGcellwisetmpl1.R
----------------------------------------------------------------------
diff --git a/src/test/scripts/functions/codegen/DAGcellwisetmpl1.R b/src/test/scripts/functions/codegen/DAGcellwisetmpl1.R
new file mode 100644
index 0000000..21c70b5
--- /dev/null
+++ b/src/test/scripts/functions/codegen/DAGcellwisetmpl1.R
@@ -0,0 +1,36 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+# 
+#   http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+args <- commandArgs(TRUE)
+options(digits=22)
+library("Matrix")
+# Input: n_rows x n_cols matrix holding 1..n_rows*n_cols, filled row by row.
+n_rows <- 2000
+n_cols <- as.integer(args[1])
+X <- matrix(seq(1, n_rows * n_cols), n_rows, n_cols, byrow = TRUE)
+# One shared intermediate feeds two consumers whose results are added cell by cell.
+shared <- (X * 7 + 6) * 5 + 4
+plus_one <- shared + 1
+plus_two <- shared + 2
+S <- plus_one + plus_two
+# Write the result as MatrixMarket file "S" under the output prefix args[2].
+writeMM(as(S, "CsparseMatrix"), paste(args[2], "S", sep = ""))
+ 
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/bbefe96b/src/test/scripts/functions/codegen/DAGcellwisetmpl1.dml
----------------------------------------------------------------------
diff --git a/src/test/scripts/functions/codegen/DAGcellwisetmpl1.dml b/src/test/scripts/functions/codegen/DAGcellwisetmpl1.dml
new file mode 100644
index 0000000..0a9062e
--- /dev/null
+++ b/src/test/scripts/functions/codegen/DAGcellwisetmpl1.dml
@@ -0,0 +1,31 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+# 
+#   http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+N = 2000;
+M = $1;
+X = matrix( seq(1,N*M), rows=N, cols=M)
+# A is consumed by both B and C, so the expression graph is a DAG rather than a chain.
+A = (X * 7 + 6) * 5 + 4;
+B = A + 1;
+C = A + 2;
+S = B + C;
+# Write the cellwise sum to the path given as the second argument.
+write(S, $2)

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/bbefe96b/src/test/scripts/functions/codegen/DAGcellwisetmpl2.R
----------------------------------------------------------------------
diff --git a/src/test/scripts/functions/codegen/DAGcellwisetmpl2.R b/src/test/scripts/functions/codegen/DAGcellwisetmpl2.R
new file mode 100644
index 0000000..90d4d96
--- /dev/null
+++ b/src/test/scripts/functions/codegen/DAGcellwisetmpl2.R
@@ -0,0 +1,36 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+# 
+#   http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+args <- commandArgs(TRUE)
+options(digits=22)
+library("Matrix")
+# Input: n_rows x n_cols matrix holding 1..n_rows*n_cols, filled row by row.
+n_rows <- 2000
+n_cols <- as.integer(args[1])
+X <- matrix(seq(1, n_rows * n_cols), n_rows, n_cols, byrow = TRUE)
+# One shared intermediate feeds two consumers; their cellwise product is summed to a 1 x 1 matrix.
+shared <- (X * 7 + 6) * 5 + 4
+plus_one <- shared + 1
+plus_two <- shared + 2
+S <- as.matrix(sum(plus_one * plus_two))
+# Write the result as MatrixMarket file "S" under the output prefix args[2].
+writeMM(as(S, "CsparseMatrix"), paste(args[2], "S", sep = ""))
+ 
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/bbefe96b/src/test/scripts/functions/codegen/DAGcellwisetmpl2.dml
----------------------------------------------------------------------
diff --git a/src/test/scripts/functions/codegen/DAGcellwisetmpl2.dml b/src/test/scripts/functions/codegen/DAGcellwisetmpl2.dml
new file mode 100644
index 0000000..8bcc462
--- /dev/null
+++ b/src/test/scripts/functions/codegen/DAGcellwisetmpl2.dml
@@ -0,0 +1,31 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+# 
+#   http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+N = 2000;
+M = $1;
+X = matrix( seq(1,N*M), rows=N, cols=M)
+# A is consumed by both B and C; their cellwise product is then reduced by a full sum.
+A = (X * 7 + 6) * 5 + 4;
+B = A + 1;
+C = A + 2;
+S = as.matrix(sum(B * C));
+# Write the 1 x 1 result to the path given as the second argument.
+write(S, $2)

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/bbefe96b/src/test/scripts/functions/codegen/DAGcellwisetmpl3.R
----------------------------------------------------------------------
diff --git a/src/test/scripts/functions/codegen/DAGcellwisetmpl3.R b/src/test/scripts/functions/codegen/DAGcellwisetmpl3.R
new file mode 100644
index 0000000..d052e3e
--- /dev/null
+++ b/src/test/scripts/functions/codegen/DAGcellwisetmpl3.R
@@ -0,0 +1,36 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+# 
+#   http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+args <- commandArgs(TRUE)
+options(digits=22)
+library("Matrix")
+# Input: n_rows x n_cols matrix holding 1..n_rows*n_cols, filled row by row.
+n_rows <- 2000
+n_cols <- as.integer(args[1])
+X <- matrix(seq(1, n_rows * n_cols), n_rows, n_cols, byrow = TRUE)
+# One shared intermediate feeds two consumers; their cellwise product is summed per row.
+shared <- (X * 7 + 6) * 5 + 4
+plus_one <- shared + 1
+plus_two <- shared + 2
+S <- rowSums(plus_one * plus_two)
+# Write the result as MatrixMarket file "S" under the output prefix args[2].
+writeMM(as(S, "CsparseMatrix"), paste(args[2], "S", sep = ""))
+ 
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/bbefe96b/src/test/scripts/functions/codegen/DAGcellwisetmpl3.dml
----------------------------------------------------------------------
diff --git a/src/test/scripts/functions/codegen/DAGcellwisetmpl3.dml b/src/test/scripts/functions/codegen/DAGcellwisetmpl3.dml
new file mode 100644
index 0000000..287abe7
--- /dev/null
+++ b/src/test/scripts/functions/codegen/DAGcellwisetmpl3.dml
@@ -0,0 +1,31 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+# 
+#   http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+N = 2000;
+M = $1;
+X = matrix( seq(1,N*M), rows=N, cols=M)
+# A is consumed by both B and C; their cellwise product is then reduced by row sums.
+A = (X * 7 + 6) * 5 + 4;
+B = A + 1;
+C = A + 2;
+S = rowSums(B * C);
+# Write the row sums to the path given as the second argument.
+write(S, $2)

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/bbefe96b/src/test/scripts/functions/codegen/SystemML-config-codegen.xml
----------------------------------------------------------------------
diff --git a/src/test/scripts/functions/codegen/SystemML-config-codegen.xml b/src/test/scripts/functions/codegen/SystemML-config-codegen.xml
new file mode 100644
index 0000000..5d623ae
--- /dev/null
+++ b/src/test/scripts/functions/codegen/SystemML-config-codegen.xml
@@ -0,0 +1,61 @@
+<!--
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ * 
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+-->
+
+<root>
+   <!-- local fs tmp working directory-->
+   <localtmpdir>/tmp/systemml</localtmpdir>
+
+   <!-- hdfs tmp working directory--> 
+   <scratch>scratch_space</scratch> 
+
+   <!-- compiler optimization level, documented values: 0 | 1 | 2 | 3 | 4, default: 2. NOTE(review): 7 is outside that range, presumably a codegen-specific level; confirm -->
+   <optlevel>7</optlevel>  
+
+   <!-- default number of reduce tasks per MR job, default: 2 x number of nodes -->
+   <numreducers>10</numreducers> 
+   
+   <!-- override jvm reuse flag for specific MR jobs, valid values: true | false  -->
+   <jvmreuse>false</jvmreuse> 
+
+   <!-- default block dim for binary block files -->
+   <defaultblocksize>1000</defaultblocksize> 
+
+   <!-- run systemml control program as yarn appmaster, in case of MR1 always falls back to client, please disable for debug mode -->
+   <dml.yarn.appmaster>false</dml.yarn.appmaster>
+
+   <!-- maximum jvm heap size of the dml yarn appmaster in MB, the requested memory is 1.5x this parameter -->
+   <dml.yarn.appmaster.mem>2048</dml.yarn.appmaster.mem>
+
+   <!-- maximum jvm heap size of the map/reduce tasks in MB, the requested memory is 1.5x this parameter, negative values ignored  -->
+   <dml.yarn.mapreduce.mem>2048</dml.yarn.mapreduce.mem>
+
+   <!-- yarn application submission queue, relevant for default capacity scheduler -->
+   <dml.yarn.app.queue>default</dml.yarn.app.queue>
+   
+   <!-- enables multi-threaded matrix multiplications in singlenode control program -->
+   <cp.parallel.matrixmult>true</cp.parallel.matrixmult>
+   
+   <!-- enables multi-threaded read/write of text formats in singlenode control program -->
+   <cp.parallel.textio>true</cp.parallel.textio>
+   
+   <!-- automatic code generation: enable flag, plan cache, and literal handling -->
+   <codegen.enabled>true</codegen.enabled>
+   <codegen.plancache>true</codegen.plancache>
+   <codegen.literals>1</codegen.literals>
+</root>
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/bbefe96b/src/test/scripts/functions/codegen/cellwisetmpl1.R
----------------------------------------------------------------------
diff --git a/src/test/scripts/functions/codegen/cellwisetmpl1.R b/src/test/scripts/functions/codegen/cellwisetmpl1.R
new file mode 100644
index 0000000..1c306b0
--- /dev/null
+++ b/src/test/scripts/functions/codegen/cellwisetmpl1.R
@@ -0,0 +1,43 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+# 
+#   http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+args<-commandArgs(TRUE)
+options(digits=22)
+library("Matrix")
+
+# Reference result for the cellwise template test: S = round(abs(X + lamda)) + 5.
+# The output prefix is taken from args[2]; args[1] is not read by this script.
+
+# 2000 x 2000 input holding 1..4000000, filled row by row.
+X = matrix(seq(1, 4000000), 2000, 2000, byrow=TRUE)
+
+# Column of twos whose sum (4000) is the scalar added to every cell of X;
+# the paired cellwisetmpl1.dml derives lamda from the same 2000 x 1 matrix.
+Y = matrix(2, 2000, 1)
+lamda = sum(Y)
+
+# Round the absolute value of each shifted cell, then offset by 5.
+S = round(abs(X + lamda)) + 5
+
+# Write S as MatrixMarket file "S" under the output prefix.
+
+writeMM(as(S, "CsparseMatrix"), paste(args[2], "S", sep=""))
+ 
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/bbefe96b/src/test/scripts/functions/codegen/cellwisetmpl1.dml
----------------------------------------------------------------------
diff --git a/src/test/scripts/functions/codegen/cellwisetmpl1.dml b/src/test/scripts/functions/codegen/cellwisetmpl1.dml
new file mode 100644
index 0000000..f646c15
--- /dev/null
+++ b/src/test/scripts/functions/codegen/cellwisetmpl1.dml
@@ -0,0 +1,27 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+# 
+#   http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+X= matrix( seq(1,4000000), rows=2000, cols=2000)
+Y= matrix( 2, rows=2000, cols=1)
+# lamda is a scalar computed at runtime (sum of 2000 twos = 4000) and used inside the cellwise expression.
+lamda = sum(Y)
+S=round(abs(X+lamda))+5
+write(S,$1)

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/bbefe96b/src/test/scripts/functions/codegen/cellwisetmpl2.R
----------------------------------------------------------------------
diff --git a/src/test/scripts/functions/codegen/cellwisetmpl2.R b/src/test/scripts/functions/codegen/cellwisetmpl2.R
new file mode 100644
index 0000000..f48d9e6
--- /dev/null
+++ b/src/test/scripts/functions/codegen/cellwisetmpl2.R
@@ -0,0 +1,31 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+# 
+#   http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+args <- commandArgs(TRUE)
+options(digits=22)
+library("Matrix")
+# Reference result for the cellwise template test: logistic function of a
+# 10 x 10 matrix of ones, written as MatrixMarket file "S" under args[2].
+X <- matrix(1, nrow = 10, ncol = 10)
+S <- 1 / (1 + exp(-X))
+
+writeMM(as(S, "CsparseMatrix"), paste(args[2], "S", sep = ""))
+

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/bbefe96b/src/test/scripts/functions/codegen/cellwisetmpl2.dml
----------------------------------------------------------------------
diff --git a/src/test/scripts/functions/codegen/cellwisetmpl2.dml b/src/test/scripts/functions/codegen/cellwisetmpl2.dml
new file mode 100644
index 0000000..c84a987
--- /dev/null
+++ b/src/test/scripts/functions/codegen/cellwisetmpl2.dml
@@ -0,0 +1,28 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+# 
+#   http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+X= matrix( 1, rows=10, cols=10)  # 10 x 10 matrix of ones
+S= 1/(1+exp(-X))  # cellwise logistic function of X
+write(S,$1)
+
+
+
+

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/bbefe96b/src/test/scripts/functions/codegen/cellwisetmpl3.R
----------------------------------------------------------------------
diff --git a/src/test/scripts/functions/codegen/cellwisetmpl3.R b/src/test/scripts/functions/codegen/cellwisetmpl3.R
new file mode 100644
index 0000000..43253aa
--- /dev/null
+++ b/src/test/scripts/functions/codegen/cellwisetmpl3.R
@@ -0,0 +1,31 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+# 
+#   http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+args <- commandArgs(TRUE)
+options(digits=22)
+library("Matrix")
+# 3 x 3 input holding 1..9, filled row by row.
+cells <- c(1, 2, 3, 4, 5, 6, 7, 8, 9)
+X <- matrix(cells, nrow = 3, ncol = 3, byrow = TRUE)
+# Chain of cellwise operations: exp, reciprocal, +7, abs, round, floor, +10.
+S <- 10 + floor(round(abs(7 + (1 / exp(X)))))
+# Write S as MatrixMarket file "S" under the output prefix args[2].
+writeMM(as(S, "CsparseMatrix"), paste(args[2], "S", sep = ""))
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/bbefe96b/src/test/scripts/functions/codegen/cellwisetmpl3.dml
----------------------------------------------------------------------
diff --git a/src/test/scripts/functions/codegen/cellwisetmpl3.dml b/src/test/scripts/functions/codegen/cellwisetmpl3.dml
new file mode 100644
index 0000000..4aa30eb
--- /dev/null
+++ b/src/test/scripts/functions/codegen/cellwisetmpl3.dml
@@ -0,0 +1,24 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+# 
+#   http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+X= matrix( "1 2 3 4 5 6 7 8 9", rows=3, cols=3)  # 3 x 3 matrix built from a string of cell values
+S=10 + floor(round(abs(7 + (1 / exp(X) ))))  # chain of cellwise operations: exp, reciprocal, +7, abs, round, floor, +10
+write(S,$1)

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/bbefe96b/src/test/scripts/functions/codegen/cellwisetmpl4.R
----------------------------------------------------------------------
diff --git a/src/test/scripts/functions/codegen/cellwisetmpl4.R b/src/test/scripts/functions/codegen/cellwisetmpl4.R
new file mode 100644
index 0000000..803904c
--- /dev/null
+++ b/src/test/scripts/functions/codegen/cellwisetmpl4.R
@@ -0,0 +1,32 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+# 
+#   http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+args<-commandArgs(TRUE)  # args[2] is used below as the prefix of the output path
+options(digits=22)
+library("Matrix")
+# R reference computation: S = 10 + floor(round(abs((X+w)*z))) on fixed 3x3 inputs.
+X = matrix(  c(1,2,3,4,5,6,7,8,9), nrow=3, ncol=3, byrow = TRUE)
+w=matrix(  c(3,3,3,3,3,3,3,3,3), nrow=3, ncol=3, byrow = TRUE)
+z=matrix(  c(5,5,5,5,5,5,5,5,5), nrow=3, ncol=3, byrow = TRUE)
+# w and z are spelled out as full 3x3 constant matrices, so X+w and (X+w)*z are
+# plain element-wise operations between equally sized matrices.
+S = 10 + floor(round(abs((X+w)*z)))
+writeMM(as(S, "CsparseMatrix"), paste(args[2], "S", sep="")); # MatrixMarket file named args[2] followed by "S"
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/bbefe96b/src/test/scripts/functions/codegen/cellwisetmpl4.dml
----------------------------------------------------------------------
diff --git a/src/test/scripts/functions/codegen/cellwisetmpl4.dml b/src/test/scripts/functions/codegen/cellwisetmpl4.dml
new file mode 100644
index 0000000..58b0b58
--- /dev/null
+++ b/src/test/scripts/functions/codegen/cellwisetmpl4.dml
@@ -0,0 +1,26 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+# 
+#   http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+X= matrix( "1 2 3 4 5 6 7 8 9", rows=3, cols=3)  # fixed 3x3 input
+w=matrix( "3 3 3", rows=3, cols=1)  # 3x1 column vector of 3s
+z=matrix( "5 5 5", rows=3, cols=1)  # 3x1 column vector of 5s
+S=10 + floor(round(abs((X+w)*z)))  # 3x3 X combined with 3x1 vectors; NOTE(review): the R counterpart uses full 3x3 constants, so this presumably relies on column-vector broadcasting - confirm
+write(S,$1)  # output location is the first script argument

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/bbefe96b/src/test/scripts/functions/codegen/cellwisetmpl5.R
----------------------------------------------------------------------
diff --git a/src/test/scripts/functions/codegen/cellwisetmpl5.R b/src/test/scripts/functions/codegen/cellwisetmpl5.R
new file mode 100644
index 0000000..ae95111
--- /dev/null
+++ b/src/test/scripts/functions/codegen/cellwisetmpl5.R
@@ -0,0 +1,34 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+# 
+#   http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+args<-commandArgs(TRUE)  # args[2] is used below as the prefix of the output path
+options(digits=22)
+library("Matrix")
+# R reference computation: S = abs(exp(X)) + (10 + floor(round(abs((X/w)+z)))) on fixed 3x3 inputs.
+X = matrix(  c(1,2,3,4,5,6,7,8,9), nrow=3, ncol=3, byrow = TRUE)
+w=matrix(  c(1,1,1,2,2,2,3,3,3), nrow=3, ncol=3, byrow = TRUE)
+z=matrix(  c(3,3,3,3,3,3,3,3,3), nrow=3, ncol=3, byrow = TRUE)
+# With byrow = TRUE, row i of w holds the constant i, so X/w divides row i of X by i;
+# z is the constant 3 in every cell.
+G = abs(exp(X))
+Y=10 + floor(round(abs((X/w)+z)))
+S = G + Y
+writeMM(as(S, "CsparseMatrix"), paste(args[2], "S", sep="")); # MatrixMarket file named args[2] followed by "S"
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/bbefe96b/src/test/scripts/functions/codegen/cellwisetmpl5.dml
----------------------------------------------------------------------
diff --git a/src/test/scripts/functions/codegen/cellwisetmpl5.dml b/src/test/scripts/functions/codegen/cellwisetmpl5.dml
new file mode 100644
index 0000000..c9f30ef
--- /dev/null
+++ b/src/test/scripts/functions/codegen/cellwisetmpl5.dml
@@ -0,0 +1,29 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+# 
+#   http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+X= matrix( "1 2 3 4 5 6 7 8 9", rows=3, cols=3)  # fixed 3x3 input
+w=matrix( "1 2 3", rows=3, cols=1)  # 3x1 column vector
+z=matrix( "3 3 3", rows=3, cols=1)  # 3x1 column vector of 3s
+
+G = abs(exp(X))  # first element-wise expression over X
+Y=10 + floor(round(abs((X/w)+z)))  # second element-wise expression over X, also using w and z
+S = G + Y  # both intermediates feed the single written result
+write(S,$1)  # output location is the first script argument

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/bbefe96b/src/test/scripts/functions/codegen/cellwisetmpl6.R
----------------------------------------------------------------------
diff --git a/src/test/scripts/functions/codegen/cellwisetmpl6.R b/src/test/scripts/functions/codegen/cellwisetmpl6.R
new file mode 100644
index 0000000..669e76f
--- /dev/null
+++ b/src/test/scripts/functions/codegen/cellwisetmpl6.R
@@ -0,0 +1,33 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+# 
+#   http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+args<-commandArgs(TRUE)  # args[2] is used below as the prefix of the output path
+options(digits=22)
+library("Matrix")
+# R reference computation: scalar S = sum(X*y*z) over three 3x1 column vectors.
+X = matrix(  c(1,2,3), nrow=3, ncol=1, byrow = TRUE)
+y=matrix(  c(1,1,1), nrow=3, ncol=1, byrow = TRUE)
+z=matrix(  c(3,3,3), nrow=3, ncol=1, byrow = TRUE)
+# S is a single number (1*1*3 + 2*1*3 + 3*1*3 = 18), so it is written with
+# base write() instead of writeMM().
+S=sum(X*y*z)
+print(S)
+write(S,paste(args[2],"S",sep=""))

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/bbefe96b/src/test/scripts/functions/codegen/cellwisetmpl6.dml
----------------------------------------------------------------------
diff --git a/src/test/scripts/functions/codegen/cellwisetmpl6.dml b/src/test/scripts/functions/codegen/cellwisetmpl6.dml
new file mode 100644
index 0000000..7ff5124
--- /dev/null
+++ b/src/test/scripts/functions/codegen/cellwisetmpl6.dml
@@ -0,0 +1,54 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+# 
+#   http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+# 
+#   http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+X= matrix( "1 2 3", rows=3, cols=1)  # 3x1 column vector
+y=matrix( "1 1 1", rows=3, cols=1)  # 3x1 column vector of 1s
+z=matrix( "3 3 3", rows=3, cols=1)  # 3x1 column vector of 3s
+
+
+S = sum(X*y*z)  # element-wise product of the three vectors, reduced to one scalar
+print(S)
+write(S,$1)  # output location is the first script argument
+# Disabled leftovers (not executed): S=10 + floor(round(abs((X+w)+z)))
+#   G = abs(exp(X))
+
+#   print(sum(G))

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/bbefe96b/src/test/scripts/functions/codegen/codegenIntegration.R
----------------------------------------------------------------------
diff --git a/src/test/scripts/functions/codegen/codegenIntegration.R b/src/test/scripts/functions/codegen/codegenIntegration.R
new file mode 100644
index 0000000..7456c87
--- /dev/null
+++ b/src/test/scripts/functions/codegen/codegenIntegration.R
@@ -0,0 +1,45 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+# 
+#   http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+args<-commandArgs(TRUE)  # args[2] is used below as the prefix of the output path
+options(digits=22)
+library("Matrix")
+X = matrix(  c(1,2,3,4,5,6,7,8,9,10,11,12,13,14,15), nrow=5, ncol=3, byrow = TRUE)
+v=matrix(1,3,1)  # 3x1 vector of ones; only referenced by the disabled variants below
+
+###############Test0 (disabled)
+#lamda = sum(X)
+#S=t(X)%*%X%*%(lamda*v)
+#writeMM(as(S, "CsparseMatrix"), paste(args[2], "S", sep="")); 
+
+###############Test1 (disabled)
+#lamda=sum(X)
+#S=t(X)%*%(lamda*(X%*%v))
+#writeMM(as(S, "CsparseMatrix"), paste(args[2], "S", sep="")); 
+
+###############Test2 (disabled)
+#w=matrix(  c(1,2,3,4,5), nrow=5, ncol=1, byrow = TRUE)
+#S=t(X)%*%(w*(X%*%v))
+#writeMM(as(S, "CsparseMatrix"), paste(args[2], "S", sep="")); 
+
+###############Test3 (active)
+S=t(colSums(X/rowSums(X)))  # colSums() returns a plain vector in R; t() turns it into a 1x3 row matrix, as rowAggPattern4.R does for the same expression
+writeMM(as(S, "CsparseMatrix"), paste(args[2], "S", sep="")); # MatrixMarket file named args[2] followed by "S"
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/bbefe96b/src/test/scripts/functions/codegen/codegenIntegration.dml
----------------------------------------------------------------------
diff --git a/src/test/scripts/functions/codegen/codegenIntegration.dml b/src/test/scripts/functions/codegen/codegenIntegration.dml
new file mode 100644
index 0000000..e312fe0
--- /dev/null
+++ b/src/test/scripts/functions/codegen/codegenIntegration.dml
@@ -0,0 +1,67 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+# 
+#   http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+# 
+#   http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+X= matrix( "1 2 3 4 5 6 7 8 9 10 11 12 13 14 15", rows=5, cols=3)  # fixed 5x3 input
+v=matrix(1,rows=3,cols=1)  # 3x1 vector of ones; only referenced by the disabled variants below
+###############Test 0 (disabled)
+
+#lamda = sum(X)
+#S=t(X)%*%(X%*%(lamda*v))
+#write(S,$1)
+
+###############Test 1 (disabled)
+#Y= matrix( "1 1 1 4 5 6 7 8 9 10 11 12 13 14 15", rows=5, cols=3)
+#lamda=sum(Y)
+#S=t(X)%*%(lamda*(X%*%v))
+#write(S,$1)
+
+###############Test 2 (disabled; need to update the current template)
+#w=matrix( "1 2 3 4 5", rows=5, cols=1)
+#z=matrix( "3 3 3 3 3", rows=5, cols=1)
+#S=t(X)%*%(w*(X%*%v))
+#write(S,$1)
+
+###############Test 3 (active)
+X= matrix( "1 2 3 4 5 6 7 8 9 10 11 12 13 14 15", rows=5, cols=3)  # same literal as the X defined at the top of the script
+S=colSums(X/rowSums(X))  # divide X by its row sums, then sum each column
+write(S,$1)  # output location is the first script argument
+

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/bbefe96b/src/test/scripts/functions/codegen/rowAggPattern1.R
----------------------------------------------------------------------
diff --git a/src/test/scripts/functions/codegen/rowAggPattern1.R b/src/test/scripts/functions/codegen/rowAggPattern1.R
new file mode 100644
index 0000000..3657e4a
--- /dev/null
+++ b/src/test/scripts/functions/codegen/rowAggPattern1.R
@@ -0,0 +1,29 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+# 
+#   http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+args<-commandArgs(TRUE)  # args[2] is used below as the prefix of the output path
+options(digits=22)
+library("Matrix")
+X = matrix(  c(1,2,3,4,5,6,7,8,9,10,11,12,13,14,15), nrow=5, ncol=3, byrow = TRUE)  # fixed 5x3 input
+v=matrix(1,3,1)  # 3x1 vector of ones
+lamda = sum(X)  # scalar: sum of all cells of X
+S=t(X)%*%X%*%(lamda*v)  # %*% groups left to right here, i.e. (t(X)%*%X)%*%(lamda*v); rowAggPattern1.dml parenthesizes it as t(X)%*%(X%*%(lamda*v))
+writeMM(as(S, "CsparseMatrix"), paste(args[2], "S", sep="")); # MatrixMarket file named args[2] followed by "S"

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/bbefe96b/src/test/scripts/functions/codegen/rowAggPattern1.dml
----------------------------------------------------------------------
diff --git a/src/test/scripts/functions/codegen/rowAggPattern1.dml b/src/test/scripts/functions/codegen/rowAggPattern1.dml
new file mode 100644
index 0000000..1d43211
--- /dev/null
+++ b/src/test/scripts/functions/codegen/rowAggPattern1.dml
@@ -0,0 +1,26 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+# 
+#   http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+X= matrix( "1 2 3 4 5 6 7 8 9 10 11 12 13 14 15", rows=5, cols=3)  # fixed 5x3 input
+v=matrix(1,rows=3,cols=1)  # 3x1 vector of ones
+lamda = sum(X)  # scalar: sum of all cells of X
+S=t(X)%*%(X%*%(lamda*v))  # scaled vector, then X %*% vector, then t(X) %*% vector
+write(S,$1)  # output location is the first script argument

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/bbefe96b/src/test/scripts/functions/codegen/rowAggPattern2.R
----------------------------------------------------------------------
diff --git a/src/test/scripts/functions/codegen/rowAggPattern2.R b/src/test/scripts/functions/codegen/rowAggPattern2.R
new file mode 100644
index 0000000..1689593
--- /dev/null
+++ b/src/test/scripts/functions/codegen/rowAggPattern2.R
@@ -0,0 +1,31 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+# 
+#   http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+args<-commandArgs(TRUE)  # args[2] is used below as the prefix of the output path
+options(digits=22)
+library("Matrix")
+X = matrix(  c(1,2,3,4,5,6,7,8,9,10,11,12,13,14,15), nrow=5, ncol=3, byrow = TRUE)  # fixed 5x3 input
+v=matrix(1,3,1)  # 3x1 vector of ones
+Y= matrix( c(1,1,1,4,5,6,7,8,9,10,11,12,13,14,15), nrow=5, ncol=3, byrow = TRUE)  # second 5x3 matrix; only its sum is used
+lamda=sum(Y)  # scalar taken from Y, not from X
+S=t(X)%*%(lamda*(X%*%v))  # scalar applied to the 5x1 product X%*%v before multiplying by t(X)
+writeMM(as(S, "CsparseMatrix"), paste(args[2], "S", sep="")); # MatrixMarket file named args[2] followed by "S"
+

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/bbefe96b/src/test/scripts/functions/codegen/rowAggPattern2.dml
----------------------------------------------------------------------
diff --git a/src/test/scripts/functions/codegen/rowAggPattern2.dml b/src/test/scripts/functions/codegen/rowAggPattern2.dml
new file mode 100644
index 0000000..4007ae9
--- /dev/null
+++ b/src/test/scripts/functions/codegen/rowAggPattern2.dml
@@ -0,0 +1,30 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+# 
+#   http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+X= matrix( "1 2 3 4 5 6 7 8 9 10 11 12 13 14 15", rows=5, cols=3)  # fixed 5x3 input
+v=matrix(1,rows=3,cols=1)  # 3x1 vector of ones
+Y= matrix( "1 1 1 4 5 6 7 8 9 10 11 12 13 14 15", rows=5, cols=3)  # second 5x3 matrix; only its sum is used
+lamda=sum(Y)  # scalar taken from Y, not from X
+# Disabled alternative with a constant scalar: lamda=2
+S=t(X)%*%(lamda*(X%*%v))  # scalar applied to the product X%*%v before multiplying by t(X)
+write(S,$1)  # output location is the first script argument
+
+

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/bbefe96b/src/test/scripts/functions/codegen/rowAggPattern3.R
----------------------------------------------------------------------
diff --git a/src/test/scripts/functions/codegen/rowAggPattern3.R b/src/test/scripts/functions/codegen/rowAggPattern3.R
new file mode 100644
index 0000000..760620a
--- /dev/null
+++ b/src/test/scripts/functions/codegen/rowAggPattern3.R
@@ -0,0 +1,31 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+# 
+#   http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+args<-commandArgs(TRUE)  # args[2] is used below as the prefix of the output path
+options(digits=22)
+library("Matrix")
+X = matrix(  c(1,2,3,4,5,6,7,8,9,10,11,12,13,14,15), nrow=5, ncol=3, byrow = TRUE)  # fixed 5x3 input
+v=matrix(1,3,1)  # 3x1 vector of ones
+w=matrix(  c(1,2,3,4,5), nrow=5, ncol=1, byrow = TRUE)  # 5x1 column vector
+z=matrix(  c(3,3,3,3,3), nrow=5, ncol=1, byrow = TRUE)  # 5x1 column vector of 3s
+
+S=t(X)%*%(z+(2-(w*(X%*%v))))  # element-wise ops on the 5x1 product X%*%v, then multiplied by t(X)
+writeMM(as(S, "CsparseMatrix"), paste(args[2], "S", sep="")); # MatrixMarket file named args[2] followed by "S"
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/bbefe96b/src/test/scripts/functions/codegen/rowAggPattern3.dml
----------------------------------------------------------------------
diff --git a/src/test/scripts/functions/codegen/rowAggPattern3.dml b/src/test/scripts/functions/codegen/rowAggPattern3.dml
new file mode 100644
index 0000000..7fbfb87
--- /dev/null
+++ b/src/test/scripts/functions/codegen/rowAggPattern3.dml
@@ -0,0 +1,30 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+# 
+#   http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+X= matrix( "1 2 3 4 5 6 7 8 9 10 11 12 13 14 15", rows=5, cols=3)  # fixed 5x3 input
+v=matrix(1,rows=3,cols=1)  # 3x1 vector of ones
+w=matrix( "1 2 3 4 5", rows=5, cols=1)  # 5x1 column vector
+z=matrix( "3 3 3 3 3", rows=5, cols=1)  # 5x1 column vector of 3s
+
+S=t(X)%*%(z+(2-(w*(X%*%v))))  # element-wise ops on the product X%*%v, then multiplied by t(X)
+write(S,$1)  # output location is the first script argument
+
+

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/bbefe96b/src/test/scripts/functions/codegen/rowAggPattern4.R
----------------------------------------------------------------------
diff --git a/src/test/scripts/functions/codegen/rowAggPattern4.R b/src/test/scripts/functions/codegen/rowAggPattern4.R
new file mode 100644
index 0000000..65774b4
--- /dev/null
+++ b/src/test/scripts/functions/codegen/rowAggPattern4.R
@@ -0,0 +1,27 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+# 
+#   http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+args<-commandArgs(TRUE)  # args[2] is used below as the prefix of the output path
+options(digits=22)
+library("Matrix")
+X = matrix( c(1,2,3,4,5,6,7,8,9,10,11,12,13,14,15), nrow=5, ncol=3, byrow = TRUE)  # fixed 5x3 input
+S=t(colSums(X/rowSums(X)))  # rowSums(X) has one entry per row, so X/rowSums(X) divides each row by its sum; t() turns the colSums vector into a 1x3 row matrix
+writeMM(as(S, "CsparseMatrix"), paste(args[2], "S", sep="")); # MatrixMarket file named args[2] followed by "S"
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/bbefe96b/src/test/scripts/functions/codegen/rowAggPattern4.dml
----------------------------------------------------------------------
diff --git a/src/test/scripts/functions/codegen/rowAggPattern4.dml b/src/test/scripts/functions/codegen/rowAggPattern4.dml
new file mode 100644
index 0000000..4c65efd
--- /dev/null
+++ b/src/test/scripts/functions/codegen/rowAggPattern4.dml
@@ -0,0 +1,25 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+# 
+#   http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+X= matrix( "1 2 3 4 5 6 7 8 9 10 11 12 13 14 15", rows=5, cols=3)  # fixed 5x3 input
+S=colSums(X/rowSums(X))  # divide X by its row sums, then sum each column
+write(S,$1)  # output location is the first script argument
+

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/bbefe96b/src/test/scripts/functions/codegen/wcemm.R
----------------------------------------------------------------------
diff --git a/src/test/scripts/functions/codegen/wcemm.R b/src/test/scripts/functions/codegen/wcemm.R
new file mode 100644
index 0000000..f228db3
--- /dev/null
+++ b/src/test/scripts/functions/codegen/wcemm.R
@@ -0,0 +1,35 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+# 
+#   http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+args<-commandArgs(TRUE)  # args[1]: input path prefix, args[2]: output path prefix (see uses below)
+options(digits=22)
+library("Matrix")
+
+X = as.matrix(readMM(paste(args[1], "A.mtx", sep="")))  # MatrixMarket input "A.mtx" under args[1], converted to a dense matrix
+
+U= matrix( 1, 2000, 10)  # 2000x10, all ones
+V= matrix( 2, 2000, 10)  # 2000x10, all twos
+
+eps = 0.1
+S= sum(X*log(U%*%t(V)+eps))  # U%*%t(V) is 2000x2000 with every cell 1*2*10 = 20; the result is a scalar
+print(S)
+write(S, paste(args[2], "S", sep="")); # scalar result, written with base write() to args[2] followed by "S"
+ 
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/bbefe96b/src/test/scripts/functions/codegen/wcemm.dml
----------------------------------------------------------------------
diff --git a/src/test/scripts/functions/codegen/wcemm.dml b/src/test/scripts/functions/codegen/wcemm.dml
new file mode 100644
index 0000000..32ff880
--- /dev/null
+++ b/src/test/scripts/functions/codegen/wcemm.dml
@@ -0,0 +1,30 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+# 
+#   http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+X= read($2)  # input matrix comes from the second script argument
+U= matrix( 1, rows=2000, cols=10)  # 2000x10, all ones
+V= matrix( 2, rows=2000, cols=10)  # 2000x10, all twos
+if(1==1){}  # empty, always-true branch; NOTE(review): presumably inserted to split the script into separate statement blocks before the expression below - confirm
+
+eps = 0.1
+S= sum(X*log(U%*%t(V)+eps))  # scalar result
+write(S,$1)  # output location is the first script argument
+print(S)

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/bbefe96b/src/test/scripts/functions/codegen/wdivmm.R
----------------------------------------------------------------------
diff --git a/src/test/scripts/functions/codegen/wdivmm.R b/src/test/scripts/functions/codegen/wdivmm.R
new file mode 100644
index 0000000..37e2d44
--- /dev/null
+++ b/src/test/scripts/functions/codegen/wdivmm.R
@@ -0,0 +1,32 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+# 
+#   http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+args<-commandArgs(TRUE)  # trailing command-line arguments only
+options(digits=22)  # use up to 22 significant digits when printing numbers
+library("Matrix")  # provides writeMM and the CsparseMatrix class used below
+
+X= matrix( 3, 20000,2000)  # 20000 x 2000 matrix filled with 3
+U= matrix( 4, 20000,10)  # 20000 x 10 matrix filled with 4
+V= matrix( 5, 2000,10)  # 2000 x 10 matrix filled with 5
+eps = 0.1  # constant added to the denominator U %*% t(V)
+S= t(t(U) %*% (X/(U%*%t(V)+eps)));  # 2000 x 10 result: transpose of t(U) times the elementwise quotient
+writeMM(as(S, "CsparseMatrix"), paste(args[2], "S", sep=""));  # MatrixMarket output at path args[2] with "S" appended
+ 
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/bbefe96b/src/test/scripts/functions/codegen/wdivmm.dml
----------------------------------------------------------------------
diff --git a/src/test/scripts/functions/codegen/wdivmm.dml b/src/test/scripts/functions/codegen/wdivmm.dml
new file mode 100644
index 0000000..dc030f4
--- /dev/null
+++ b/src/test/scripts/functions/codegen/wdivmm.dml
@@ -0,0 +1,29 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+# 
+#   http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+X= matrix( 3, rows=20000, cols=2000)  # 20000 x 2000 matrix filled with 3
+U= matrix( 4, rows=20000, cols=10)  # 20000 x 10 matrix filled with 4
+V= matrix( 5, rows=2000, cols=10)  # 2000 x 10 matrix filled with 5
+if(1==1){}  # empty always-true branch; NOTE(review): presumably forces a statement-block cut before the expression under test - confirm
+eps = 0.1  # constant added to the denominator U %*% t(V)
+S= t(t(U) %*% (X/(U%*%t(V)+eps)))  # 2000 x 10 result: transpose of t(U) times the elementwise quotient
+
+write(S,$1)  # result written to the path given as first script argument

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/bbefe96b/src/test/scripts/functions/codegen/wdivmmRight.R
----------------------------------------------------------------------
diff --git a/src/test/scripts/functions/codegen/wdivmmRight.R b/src/test/scripts/functions/codegen/wdivmmRight.R
new file mode 100644
index 0000000..cc3159a
--- /dev/null
+++ b/src/test/scripts/functions/codegen/wdivmmRight.R
@@ -0,0 +1,32 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+# 
+#   http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+args<-commandArgs(TRUE)  # trailing command-line arguments only
+options(digits=22)  # use up to 22 significant digits when printing numbers
+library("Matrix")  # provides writeMM and the CsparseMatrix class used below
+
+X= matrix( 3, 2000,2000)  # 2000 x 2000 matrix filled with 3
+U= matrix( 4, 2000,10)  # 2000 x 10 matrix filled with 4
+V= matrix( 5, 2000,10)  # 2000 x 10 matrix filled with 5
+eps = 0.1  # NOTE: assigned but not used by the expression below
+S= (X/(U%*%t(V)))%*%V  # 2000 x 10 result: elementwise quotient multiplied by V from the right
+writeMM(as(S, "CsparseMatrix"), paste(args[2], "S", sep=""));  # MatrixMarket output at path args[2] with "S" appended
+ 
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/bbefe96b/src/test/scripts/functions/codegen/wdivmmRight.dml
----------------------------------------------------------------------
diff --git a/src/test/scripts/functions/codegen/wdivmmRight.dml b/src/test/scripts/functions/codegen/wdivmmRight.dml
new file mode 100644
index 0000000..488d744
--- /dev/null
+++ b/src/test/scripts/functions/codegen/wdivmmRight.dml
@@ -0,0 +1,32 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+# 
+#   http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+X= matrix( 3, rows=2000, cols=2000)  # 2000 x 2000 matrix filled with 3
+U= matrix( 4, rows=2000, cols=10)  # 2000 x 10 matrix filled with 4
+V= matrix( 5, rows=2000, cols=10)  # 2000 x 10 matrix filled with 5
+
+if(1==1){}  # empty always-true branch; NOTE(review): presumably forces a statement-block cut before the expression under test - confirm
+
+eps = 0.1  # NOTE: assigned but not used by the expression below
+S= (X/(U%*%t(V)))%*%V  # 2000 x 10 result: elementwise quotient multiplied by V from the right
+print(sum(S))  # prints the sum over all cells of S
+write(S,$1)  # result written to the path given as first script argument
+

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/bbefe96b/src/test/scripts/functions/codegen/wdivmmRightNotranspose.R
----------------------------------------------------------------------
diff --git a/src/test/scripts/functions/codegen/wdivmmRightNotranspose.R b/src/test/scripts/functions/codegen/wdivmmRightNotranspose.R
new file mode 100644
index 0000000..e541154
--- /dev/null
+++ b/src/test/scripts/functions/codegen/wdivmmRightNotranspose.R
@@ -0,0 +1,32 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+# 
+#   http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+args<-commandArgs(TRUE)  # trailing command-line arguments only
+options(digits=22)  # use up to 22 significant digits when printing numbers
+library("Matrix")  # provides writeMM and the CsparseMatrix class used below
+
+X= matrix( c(1,0,1,2,2,0,0,3,3,0,0,4), nrow=4, ncol=3, byrow = TRUE)  # 4 x 3, filled row by row, contains zeros
+U= matrix( c(1,2,3,4,5,6,7,8), nrow=4, ncol=2, byrow = TRUE)  # 4 x 2, filled row by row
+V= matrix( c(9,12,10,13,11,14), nrow=2, ncol=3, byrow = TRUE)  # 2 x 3, so U %*% V needs no transpose
+eps = 0.1  # constant added to the denominator U %*% V
+S= (X/((U%*%V)+eps))%*%t(V)  # 4 x 2 result: elementwise quotient multiplied by t(V) from the right
+writeMM(as(S, "CsparseMatrix"), paste(args[2], "S", sep=""));  # MatrixMarket output at path args[2] with "S" appended
+ 
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/bbefe96b/src/test/scripts/functions/codegen/wdivmmRightNotranspose.dml
----------------------------------------------------------------------
diff --git a/src/test/scripts/functions/codegen/wdivmmRightNotranspose.dml b/src/test/scripts/functions/codegen/wdivmmRightNotranspose.dml
new file mode 100644
index 0000000..1938832
--- /dev/null
+++ b/src/test/scripts/functions/codegen/wdivmmRightNotranspose.dml
@@ -0,0 +1,31 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+# 
+#   http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+X= matrix( "1 0 1 2 2 0 0 3 3 0 0 4", rows=4, cols=3)  # 4 x 3 matrix initialized from the value string, contains zeros
+U= matrix( "1 2 3 4 5 6 7 8", rows=4, cols=2)  # 4 x 2 matrix initialized from the value string
+V= matrix( "9 12 10 13 11 14", rows=2, cols=3)  # 2 x 3, so U %*% V needs no transpose
+
+if(1==1){}  # empty always-true branch; NOTE(review): presumably forces a statement-block cut before the expression under test - confirm
+
+eps = 0.1  # constant added to the denominator U %*% V
+S= (X/((U%*%V)+eps))%*%t(V)  # 4 x 2 result: elementwise quotient multiplied by t(V) from the right
+write(S,$1)  # result written to the path given as first script argument
+

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/bbefe96b/src/test/scripts/functions/codegen/wdivmmTransposeOut.R
----------------------------------------------------------------------
diff --git a/src/test/scripts/functions/codegen/wdivmmTransposeOut.R b/src/test/scripts/functions/codegen/wdivmmTransposeOut.R
new file mode 100644
index 0000000..ba1da27
--- /dev/null
+++ b/src/test/scripts/functions/codegen/wdivmmTransposeOut.R
@@ -0,0 +1,32 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+# 
+#   http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+args<-commandArgs(TRUE)  # trailing command-line arguments only
+options(digits=22)  # use up to 22 significant digits when printing numbers
+library("Matrix")  # provides writeMM and the CsparseMatrix class used below
+
+X= matrix( c(1,0,1,2,2,0,0,3,3,0,0,4), nrow=4, ncol=3, byrow = TRUE)  # 4 x 3, filled row by row, contains zeros
+U= matrix( c(1,2,3,4,5,6,7,8), nrow=4, ncol=2, byrow = TRUE)  # 4 x 2, filled row by row
+V= matrix( c(9,12,10,13,11,14), nrow=2, ncol=3, byrow = TRUE)  # 2 x 3, so U %*% V needs no transpose
+eps = 0.1  # constant added to the denominator U %*% V
+S= (t(U) %*% (X/((U%*%V)+eps)));  # 2 x 3 result: t(U) times the elementwise quotient, not transposed back
+writeMM(as(S, "CsparseMatrix"), paste(args[2], "S", sep=""));  # MatrixMarket output at path args[2] with "S" appended
+ 
\ No newline at end of file

[4/9] incubator-systemml git commit: [SYSTEMML-1287] Code generator runtime integration

Posted by mb...@apache.org.

[SYSTEMML-1287] Code generator runtime integration 

Project: http://git-wip-us.apache.org/repos/asf/incubator-systemml/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-systemml/commit/982ecb1a
Tree: http://git-wip-us.apache.org/repos/asf/incubator-systemml/tree/982ecb1a
Diff: http://git-wip-us.apache.org/repos/asf/incubator-systemml/diff/982ecb1a

Branch: refs/heads/master
Commit: 982ecb1a4be69685a8e124eccfa3a12331f998b0
Parents: d7fd587
Author: Matthias Boehm <mb...@gmail.com>
Authored: Sun Feb 26 19:01:36 2017 -0800
Committer: Matthias Boehm <mb...@gmail.com>
Committed: Sun Feb 26 19:01:36 2017 -0800

----------------------------------------------------------------------
 .../instructions/CPInstructionParser.java       |  19 +-
 .../instructions/SPInstructionParser.java       |  14 +-
 .../runtime/instructions/cp/CPInstruction.java  |   8 +-
 .../instructions/cp/SpoofCPInstruction.java     |  98 +++++
 .../instructions/spark/SPInstruction.java       |   2 +-
 .../instructions/spark/SpoofSPInstruction.java  | 407 +++++++++++++++++++
 .../spark/utils/RDDAggregateUtils.java          |   8 +-
 7 files changed, 541 insertions(+), 15 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/982ecb1a/src/main/java/org/apache/sysml/runtime/instructions/CPInstructionParser.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/instructions/CPInstructionParser.java b/src/main/java/org/apache/sysml/runtime/instructions/CPInstructionParser.java
index f3c1605..f0603b4 100644
--- a/src/main/java/org/apache/sysml/runtime/instructions/CPInstructionParser.java
+++ b/src/main/java/org/apache/sysml/runtime/instructions/CPInstructionParser.java
@@ -61,6 +61,7 @@ import org.apache.sysml.runtime.instructions.cp.QuantileSortCPInstruction;
 import org.apache.sysml.runtime.instructions.cp.QuaternaryCPInstruction;
 import org.apache.sysml.runtime.instructions.cp.RelationalBinaryCPInstruction;
 import org.apache.sysml.runtime.instructions.cp.ReorgCPInstruction;
+import org.apache.sysml.runtime.instructions.cp.SpoofCPInstruction;
 import org.apache.sysml.runtime.instructions.cp.StringInitCPInstruction;
 import org.apache.sysml.runtime.instructions.cp.TernaryCPInstruction;
 import org.apache.sysml.runtime.instructions.cp.UaggOuterChainCPInstruction;
@@ -271,8 +272,9 @@ public class CPInstructionParser extends InstructionParser
 		String2CPInstructionType.put( "lu",    CPINSTRUCTION_TYPE.MultiReturnBuiltin);
 		String2CPInstructionType.put( "eigen", CPINSTRUCTION_TYPE.MultiReturnBuiltin);
 		
-		String2CPInstructionType.put( "partition", CPINSTRUCTION_TYPE.Partition);
-		String2CPInstructionType.put( "compress", CPINSTRUCTION_TYPE.Compression);
+		String2CPInstructionType.put( "partition", 	CPINSTRUCTION_TYPE.Partition);
+		String2CPInstructionType.put( "compress", 	CPINSTRUCTION_TYPE.Compression);
+		String2CPInstructionType.put( "spoof", 		CPINSTRUCTION_TYPE.SpoofFused);
 		
 		//CP FILE instruction
 		String2CPFileInstructionType = new HashMap<String, CPINSTRUCTION_TYPE>();
@@ -424,16 +426,19 @@ public class CPInstructionParser extends InstructionParser
 			
 			case Partition:
 				return DataPartitionCPInstruction.parseInstruction(str);	
-	
-			case Compression:
-				return (CPInstruction) CompressionCPInstruction.parseInstruction(str);	
-				
+		
 			case CentralMoment:
 				return CentralMomentCPInstruction.parseInstruction(str);
 	
 			case Covariance:
 				return CovarianceCPInstruction.parseInstruction(str);
-				
+	
+			case Compression:
+				return (CPInstruction) CompressionCPInstruction.parseInstruction(str);	
+			
+			case SpoofFused:
+				return SpoofCPInstruction.parseInstruction(str);
+			
 			case INVALID:
 			
 			default: 

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/982ecb1a/src/main/java/org/apache/sysml/runtime/instructions/SPInstructionParser.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/instructions/SPInstructionParser.java b/src/main/java/org/apache/sysml/runtime/instructions/SPInstructionParser.java
index 6658a88..5ca3847 100644
--- a/src/main/java/org/apache/sysml/runtime/instructions/SPInstructionParser.java
+++ b/src/main/java/org/apache/sysml/runtime/instructions/SPInstructionParser.java
@@ -73,6 +73,7 @@ import org.apache.sysml.runtime.instructions.spark.ReorgSPInstruction;
 import org.apache.sysml.runtime.instructions.spark.RmmSPInstruction;
 import org.apache.sysml.runtime.instructions.spark.SPInstruction;
 import org.apache.sysml.runtime.instructions.spark.SPInstruction.SPINSTRUCTION_TYPE;
+import org.apache.sysml.runtime.instructions.spark.SpoofSPInstruction;
 import org.apache.sysml.runtime.instructions.spark.TernarySPInstruction;
 import org.apache.sysml.runtime.instructions.spark.Tsmm2SPInstruction;
 import org.apache.sysml.runtime.instructions.spark.TsmmSPInstruction;
@@ -277,10 +278,12 @@ public class SPInstructionParser extends InstructionParser
 		
 		String2SPInstructionType.put( "binuaggchain", SPINSTRUCTION_TYPE.BinUaggChain);
 		
-		String2SPInstructionType.put( "write"   , SPINSTRUCTION_TYPE.Write);
+		String2SPInstructionType.put( "write"	, SPINSTRUCTION_TYPE.Write);
 	
-		String2SPInstructionType.put( "castdtm"   , SPINSTRUCTION_TYPE.Cast);
-		String2SPInstructionType.put( "castdtf"   , SPINSTRUCTION_TYPE.Cast);
+		String2SPInstructionType.put( "castdtm" , SPINSTRUCTION_TYPE.Cast);
+		String2SPInstructionType.put( "castdtf"	, SPINSTRUCTION_TYPE.Cast);
+		
+		String2SPInstructionType.put( "spoof"	, SPINSTRUCTION_TYPE.SpoofFused);
 	}
 
 	public static SPInstruction parseSingleInstruction (String str ) 
@@ -443,10 +446,13 @@ public class SPInstructionParser extends InstructionParser
 				
 			case Checkpoint:
 				return CheckpointSPInstruction.parseInstruction(str);
-			
+
 			case Compression:
 				return CompressionSPInstruction.parseInstruction(str);
 			
+			case SpoofFused:
+				return SpoofSPInstruction.parseInstruction(str);
+				
 			case Cast:
 				return CastSPInstruction.parseInstruction(str);
 				

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/982ecb1a/src/main/java/org/apache/sysml/runtime/instructions/cp/CPInstruction.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/instructions/cp/CPInstruction.java b/src/main/java/org/apache/sysml/runtime/instructions/cp/CPInstruction.java
index 1d192d5..dcd8d89 100644
--- a/src/main/java/org/apache/sysml/runtime/instructions/cp/CPInstruction.java
+++ b/src/main/java/org/apache/sysml/runtime/instructions/cp/CPInstruction.java
@@ -29,7 +29,13 @@ import org.apache.sysml.runtime.matrix.operators.Operator;
 
 public abstract class CPInstruction extends Instruction 
 {
-	public enum CPINSTRUCTION_TYPE { INVALID, AggregateUnary, AggregateBinary, AggregateTernary, ArithmeticBinary, Ternary, Quaternary, BooleanBinary, BooleanUnary, BuiltinBinary, BuiltinUnary, BuiltinMultiple, MultiReturnParameterizedBuiltin, ParameterizedBuiltin, MultiReturnBuiltin, Builtin, Reorg, RelationalBinary, File, Variable, External, Append, Rand, QSort, QPick, MatrixIndexing, MMTSJ, PMMJ, MMChain, MatrixReshape, Partition, Compression, StringInit, CentralMoment, Covariance, UaggOuterChain, Convolution };
+	public enum CPINSTRUCTION_TYPE { INVALID, 
+		AggregateUnary, AggregateBinary, AggregateTernary, ArithmeticBinary, 
+		Ternary, Quaternary, BooleanBinary, BooleanUnary, BuiltinBinary, BuiltinUnary, 
+		BuiltinMultiple, MultiReturnParameterizedBuiltin, ParameterizedBuiltin, MultiReturnBuiltin, 
+		Builtin, Reorg, RelationalBinary, File, Variable, External, Append, Rand, QSort, QPick, 
+		MatrixIndexing, MMTSJ, PMMJ, MMChain, MatrixReshape, Partition, Compression, SpoofFused,
+		StringInit, CentralMoment, Covariance, UaggOuterChain, Convolution };
 	
 	protected CPINSTRUCTION_TYPE _cptype;
 	protected Operator _optr;

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/982ecb1a/src/main/java/org/apache/sysml/runtime/instructions/cp/SpoofCPInstruction.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/instructions/cp/SpoofCPInstruction.java b/src/main/java/org/apache/sysml/runtime/instructions/cp/SpoofCPInstruction.java
new file mode 100644
index 0000000..61313d7
--- /dev/null
+++ b/src/main/java/org/apache/sysml/runtime/instructions/cp/SpoofCPInstruction.java
@@ -0,0 +1,98 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ * 
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.sysml.runtime.instructions.cp;
+
+import java.util.ArrayList;
+
+import org.apache.sysml.parser.Expression.DataType;
+import org.apache.sysml.runtime.DMLRuntimeException;
+import org.apache.sysml.runtime.codegen.CodegenUtils;
+import org.apache.sysml.runtime.codegen.SpoofOperator;
+import org.apache.sysml.runtime.controlprogram.context.ExecutionContext;
+import org.apache.sysml.runtime.instructions.InstructionUtils;
+import org.apache.sysml.runtime.instructions.cp.CPOperand;
+import org.apache.sysml.runtime.instructions.cp.ComputationCPInstruction;
+import org.apache.sysml.runtime.instructions.cp.ScalarObject;
+import org.apache.sysml.runtime.matrix.data.MatrixBlock;
+
+/** CP instruction that runs a generated (fused) spoof operator class on its matrix and scalar inputs. */
+public class SpoofCPInstruction extends ComputationCPInstruction
+{
+	private Class<?> _class = null; //operator class, instantiated on every execution
+	private int _numThreads = 1; //thread count handed to the operator
+	private CPOperand[] _in = null; //matrix and scalar input operands
+	
+	public SpoofCPInstruction(Class<?> cla, int k, CPOperand[] in, CPOperand out, String opcode, String str) {
+		super(null, null, null, out, opcode, str);
+		_class = cla;
+		_numThreads = k;
+		_in = in;
+	}
+
+	/** Parses parts [0] opcode prefix, [1] class name, [2..n-3] inputs, [n-2] output, [n-1] thread count. */
+	public static SpoofCPInstruction parseInstruction(String str) throws DMLRuntimeException 
+	{
+		String[] parts = InstructionUtils.getInstructionPartsWithValueType(str);
+		ArrayList<CPOperand> inlist = new ArrayList<CPOperand>();
+		Class<?> cla = CodegenUtils.loadClass(parts[1], null);
+		String opcode =  parts[0] + CodegenUtils.getSpoofType(cla);
+		
+		for( int i=2; i<parts.length-2; i++ )
+			inlist.add(new CPOperand(parts[i]));
+		CPOperand out = new CPOperand(parts[parts.length-2]);
+		int k = Integer.parseInt(parts[parts.length-1]);
+		return new SpoofCPInstruction(cla, k, inlist.toArray(new CPOperand[0]), out, opcode, str);
+	}
+
+	/** Executes the operator and binds its result; acquired matrix inputs are released even on failure. */
+	@Override
+	public void processInstruction(ExecutionContext ec) throws DMLRuntimeException 
+	{
+		SpoofOperator op = (SpoofOperator) CodegenUtils.createInstance(_class);
+		ArrayList<MatrixBlock> inputs = new ArrayList<MatrixBlock>();
+		ArrayList<ScalarObject> scalars = new ArrayList<ScalarObject>();
+		int pinned = 0; //operands [0,pinned) were fetched; only those are released in finally
+		try {
+			//get input matrices and scalars, incl pinning of matrices
+			for( ; pinned < _in.length; pinned++ ) {
+				CPOperand input = _in[pinned];
+				if(input.getDataType()==DataType.MATRIX)
+					inputs.add(ec.getMatrixInput(input.getName()));
+				else if(input.getDataType()==DataType.SCALAR)
+					scalars.add(ec.getScalarInput(input.getName(), input.getValueType(), input.isLiteral()));
+			}
+			//run the operator and bind its matrix or scalar result to the output variable
+			if( output.getDataType() == DataType.MATRIX) {
+				MatrixBlock out = new MatrixBlock();
+				op.execute(inputs, scalars, out, _numThreads);
+				ec.setMatrixOutput(output.getName(), out);
+			}
+			else if (output.getDataType() == DataType.SCALAR) {
+				ScalarObject out = op.execute(inputs, scalars, _numThreads);
+				ec.setScalarOutput(output.getName(), out);
+			}
+		}
+		finally {
+			for( int i=0; i<pinned; i++ )
+				if(_in[i].getDataType()==DataType.MATRIX)
+					ec.releaseMatrixInput(_in[i].getName());
+		}
+	}
+}

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/982ecb1a/src/main/java/org/apache/sysml/runtime/instructions/spark/SPInstruction.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/instructions/spark/SPInstruction.java b/src/main/java/org/apache/sysml/runtime/instructions/spark/SPInstruction.java
index b28e408..17d1561 100644
--- a/src/main/java/org/apache/sysml/runtime/instructions/spark/SPInstruction.java
+++ b/src/main/java/org/apache/sysml/runtime/instructions/spark/SPInstruction.java
@@ -37,7 +37,7 @@ public abstract class SPInstruction extends Instruction
 		CentralMoment, Covariance, QSort, QPick, 
 		ParameterizedBuiltin, MAppend, RAppend, GAppend, GAlignedAppend, Rand, 
 		MatrixReshape, Ternary, Quaternary, CumsumAggregate, CumsumOffset, BinUaggChain, UaggOuterChain, 
-		Write, INVALID, 
+		Write, SpoofFused, INVALID, 
 		Convolution
 	};
 	

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/982ecb1a/src/main/java/org/apache/sysml/runtime/instructions/spark/SpoofSPInstruction.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/instructions/spark/SpoofSPInstruction.java b/src/main/java/org/apache/sysml/runtime/instructions/spark/SpoofSPInstruction.java
new file mode 100644
index 0000000..15b0751
--- /dev/null
+++ b/src/main/java/org/apache/sysml/runtime/instructions/spark/SpoofSPInstruction.java
@@ -0,0 +1,407 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ * 
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.sysml.runtime.instructions.spark;
+
+import java.util.ArrayList;
+import java.util.Iterator;
+import java.util.List;
+
+import org.apache.spark.api.java.JavaPairRDD;
+import org.apache.spark.api.java.function.PairFlatMapFunction;
+import org.apache.spark.api.java.function.PairFunction;
+import org.apache.sysml.parser.Expression.DataType;
+import org.apache.sysml.runtime.DMLRuntimeException;
+import org.apache.sysml.runtime.codegen.CodegenUtils;
+import org.apache.sysml.runtime.codegen.SpoofCellwise;
+import org.apache.sysml.runtime.codegen.SpoofCellwise.CellType;
+import org.apache.sysml.runtime.codegen.SpoofOperator;
+import org.apache.sysml.runtime.codegen.SpoofOuterProduct;
+import org.apache.sysml.runtime.codegen.SpoofOuterProduct.OutProdType;
+import org.apache.sysml.runtime.codegen.SpoofRowAggregate;
+import org.apache.sysml.runtime.controlprogram.context.ExecutionContext;
+import org.apache.sysml.runtime.controlprogram.context.SparkExecutionContext;
+import org.apache.sysml.runtime.instructions.InstructionUtils;
+import org.apache.sysml.runtime.instructions.cp.CPOperand;
+import org.apache.sysml.runtime.instructions.cp.DoubleObject;
+import org.apache.sysml.runtime.instructions.cp.ScalarObject;
+import org.apache.sysml.runtime.instructions.spark.SPInstruction;
+import org.apache.sysml.runtime.instructions.spark.data.PartitionedBroadcast;
+import org.apache.sysml.runtime.instructions.spark.utils.RDDAggregateUtils;
+import org.apache.sysml.runtime.matrix.MatrixCharacteristics;
+import org.apache.sysml.runtime.matrix.data.MatrixBlock;
+import org.apache.sysml.runtime.matrix.data.MatrixIndexes;
+
+import scala.Tuple2;
+
+public class SpoofSPInstruction extends SPInstruction
+{
+	private final Class<?> _class;
+	private final byte[] _classBytes;
+	private final CPOperand[] _in;
+	private final CPOperand _out;
+	
+	public SpoofSPInstruction(Class<?> cls , byte[] classBytes, CPOperand[] in, CPOperand out, String opcode, String str) {
+		super(opcode, str);
+		this._sptype = SPINSTRUCTION_TYPE.SpoofFused;
+		this._in = in; //in[0] is the main input, see processInstruction
+		this._out = out;
+		this._class = cls;
+		this._classBytes = classBytes; //handed to the Spark functions together with the class name
+	}
+	
+	/**
+	 * Creates a Spark spoof instruction from its string form, where parts[0] is the opcode
+	 * prefix and parts[1] the generated class name, followed by the inputs and the output.
+	 */
+	public static SpoofSPInstruction parseInstruction(String str) throws DMLRuntimeException {
+		final String[] tokens = InstructionUtils.getInstructionPartsWithValueType(str);
+		final int outPos = tokens.length - 2; //the last token (number of threads) is always ignored
+		//load the generated operator class and obtain its byte code
+		final Class<?> opClass = CodegenUtils.loadClass(tokens[1], null);
+		final byte[] opBytes = CodegenUtils.getClassAsByteArray(tokens[1]);
+		final String fullOpcode = tokens[0] + CodegenUtils.getSpoofType(opClass);
+		
+		//all tokens between the class name and the output are input operands
+		final CPOperand[] inputs = new CPOperand[Math.max(outPos-2, 0)];
+		for( int pos=2; pos<outPos; pos++ )
+			inputs[pos-2] = new CPOperand(tokens[pos]);
+		return new SpoofSPInstruction(opClass, opBytes, inputs, new CPOperand(tokens[outPos]), fullOpcode, str);
+	}
+
+	/** Executes the generated operator on Spark: the first input is consumed as RDD, all other matrix inputs
+	 *  are broadcast, and partial result blocks are summed per key wherever the operator changes block keys. */
+	@Override
+	public void processInstruction(ExecutionContext ec) throws DMLRuntimeException {
+		SparkExecutionContext sec = (SparkExecutionContext)ec;
+
+		//get characteristics and rdd handle of the main input (first operand)
+		ArrayList<String> bcVars = new ArrayList<String>();
+		MatrixCharacteristics mcIn = sec.getMatrixCharacteristics(_in[0].getName());
+		JavaPairRDD<MatrixIndexes,MatrixBlock> in = sec.getBinaryBlockRDDHandleForVariable( _in[0].getName() );
+		JavaPairRDD<MatrixIndexes, MatrixBlock> out = null;
+				
+		//simple case: map-side only operation (one rdd input, broadcast all)
+		//keep track of broadcast variables
+		ArrayList<PartitionedBroadcast<MatrixBlock>> bcMatrices = new ArrayList<PartitionedBroadcast<MatrixBlock>>();
+		ArrayList<ScalarObject> scalars = new ArrayList<ScalarObject>();
+		for( int i=1; i<_in.length; i++ ) {
+			if( _in[i].getDataType()==DataType.MATRIX) {
+				bcMatrices.add(sec.getBroadcastForVariable(_in[i].getName()));
+				bcVars.add(_in[i].getName());
+			}
+			else if(_in[i].getDataType()==DataType.SCALAR) {
+				scalars.add(sec.getScalarInput(_in[i].getName(), _in[i].getValueType(), _in[i].isLiteral()));
+			}
+		}
+		
+		//dispatch on the template type (direct superclass) of the generated operator class
+		if(_class.getSuperclass() == SpoofCellwise.class) // cellwise operator
+		{
+			if( _out.getDataType()==DataType.MATRIX ) {
+				SpoofOperator op = (SpoofOperator) CodegenUtils.createInstance(_class); 	
+				boolean aggCols = ((SpoofCellwise)op).getCellType()==CellType.ROW_AGG && mcIn.getCols() > mcIn.getColsPerBlock();
+				out = in.mapPartitionsToPair(new CellwiseFunction(_class.getName(), _classBytes, bcMatrices, scalars), !aggCols);
+				if( aggCols ) {
+					//row aggregates over multiple column blocks re-key all blocks of a row to (row,1); since keys change,
+					//the partitioner must not be declared as preserved above, otherwise the sum could skip its shuffle
+					if( out.partitions().size() > mcIn.getNumRowBlocks() ) //cap reducers at number of row blocks
+						out = RDDAggregateUtils.sumByKeyStable(out, (int)mcIn.getNumRowBlocks());
+					else
+						out = RDDAggregateUtils.sumByKeyStable(out);
+				}
+				sec.setRDDHandleForVariable(_out.getName(), out);
+				
+				//maintain lineage information for output rdd
+				sec.addLineageRDD(_out.getName(), _in[0].getName());
+				for( String bcVar : bcVars )
+					sec.addLineageBroadcast(_out.getName(), bcVar);
+				
+				//update matrix characteristics
+				updateOutputMatrixCharacteristics(sec, op);	
+			}
+			else { //SCALAR
+				out = in.mapPartitionsToPair(new CellwiseFunction(_class.getName(), _classBytes, bcMatrices, scalars), true);
+				MatrixBlock tmpMB = RDDAggregateUtils.sumStable(out);
+				sec.setVariable(_out.getName(), new DoubleObject(tmpMB.getValue(0, 0)));
+			}
+		}
+		else if(_class.getSuperclass() == SpoofOuterProduct.class) // outer product operator
+		{
+			if( _out.getDataType()==DataType.MATRIX ) {
+				SpoofOperator op = (SpoofOperator) CodegenUtils.createInstance(_class); 	
+				OutProdType type = ((SpoofOuterProduct)op).getOuterProdType();
+				boolean aggBlocks = (type == OutProdType.LEFT_OUTER_PRODUCT || type == OutProdType.RIGHT_OUTER_PRODUCT);
+				//update matrix characteristics
+				updateOutputMatrixCharacteristics(sec, op);			
+				MatrixCharacteristics mcOut = sec.getMatrixCharacteristics(_out.getName());
+				
+				out = in.mapPartitionsToPair(new OuterProductFunction(_class.getName(), _classBytes, bcMatrices, scalars), !aggBlocks);
+				if( aggBlocks ) {
+					//left/right outer products re-key their blocks to (col,1) or (row,1); since keys change, the
+					//partitioner must not be declared as preserved above, otherwise the sum could skip its shuffle
+					if( in.partitions().size() > mcOut.getNumRowBlocks()*mcOut.getNumColBlocks() ) //cap reducers at number of output blocks
+						out = RDDAggregateUtils.sumByKeyStable( out, (int)(mcOut.getNumRowBlocks()*mcOut.getNumColBlocks()) );
+					else
+						out = RDDAggregateUtils.sumByKeyStable( out );	
+				}
+				sec.setRDDHandleForVariable(_out.getName(), out);
+				
+				//maintain lineage information for output rdd
+				sec.addLineageRDD(_out.getName(), _in[0].getName());
+				for( String bcVar : bcVars )
+					sec.addLineageBroadcast(_out.getName(), bcVar);
+				
+			}
+			else {
+				out = in.mapPartitionsToPair(new OuterProductFunction(_class.getName(), _classBytes, bcMatrices, scalars), true);
+				MatrixBlock tmp = RDDAggregateUtils.sumStable(out);
+				sec.setVariable(_out.getName(), new DoubleObject(tmp.getValue(0, 0)));
+			}
+		}
+		else if( _class.getSuperclass() == SpoofRowAggregate.class ) { //row aggregate operator
+			RowAggregateFunction fmmc = new RowAggregateFunction(_class.getName(), _classBytes, bcMatrices, scalars);
+			JavaPairRDD<MatrixIndexes,MatrixBlock> tmpRDD = in.mapToPair(fmmc); //all results are keyed (1,1)
+			MatrixBlock tmpMB = RDDAggregateUtils.sumStable(tmpRDD);		
+			sec.setMatrixOutput(_out.getName(), tmpMB);
+			return;
+		}
+		else {
+			throw new DMLRuntimeException("Operator " + _class.getSuperclass() + " is not supported on Spark");
+		}
+	}
+	
+	private void updateOutputMatrixCharacteristics(SparkExecutionContext sec, SpoofOperator op) 
+		throws DMLRuntimeException 
+	{
+		if(op instanceof SpoofCellwise)
+		{
+			MatrixCharacteristics mcIn = sec.getMatrixCharacteristics(_in[0].getName());
+			MatrixCharacteristics mcOut = sec.getMatrixCharacteristics(_out.getName());
+			if( ((SpoofCellwise)op).getCellType()==CellType.ROW_AGG )
+				mcOut.set(mcIn.getRows(), 1, mcIn.getRowsPerBlock(), mcIn.getColsPerBlock());
+			else if( ((SpoofCellwise)op).getCellType()==CellType.NO_AGG )
+				mcOut.set(mcIn);
+		}
+		else if(op instanceof SpoofOuterProduct)
+		{
+			MatrixCharacteristics mcIn1 = sec.getMatrixCharacteristics(_in[0].getName()); //X
+			MatrixCharacteristics mcIn2 = sec.getMatrixCharacteristics(_in[1].getName()); //U
+			MatrixCharacteristics mcIn3 = sec.getMatrixCharacteristics(_in[2].getName()); //V
+			MatrixCharacteristics mcOut = sec.getMatrixCharacteristics(_out.getName());
+			OutProdType type = ((SpoofOuterProduct)op).getOuterProdType();
+			
+			if( type == OutProdType.CELLWISE_OUTER_PRODUCT)
+				mcOut.set(mcIn1.getRows(), mcIn1.getCols(), mcIn1.getRowsPerBlock(), mcIn1.getColsPerBlock());
+			else if( type == OutProdType.LEFT_OUTER_PRODUCT) 		
+				mcOut.set(mcIn3.getRows(), mcIn3.getCols(), mcIn3.getRowsPerBlock(), mcIn3.getColsPerBlock());		
+			else if( type == OutProdType.RIGHT_OUTER_PRODUCT )
+				mcOut.set(mcIn2.getRows(), mcIn2.getCols(), mcIn2.getRowsPerBlock(), mcIn2.getColsPerBlock());
+		}
+	}
+		
+	/**
+	 * Spark map function for row-aggregate operators: executes the operator on a single
+	 * input block and emits the result block under the constant key (1,1).
+	 */
+	private static class RowAggregateFunction implements PairFunction<Tuple2<MatrixIndexes, MatrixBlock>, MatrixIndexes, MatrixBlock> {
+		private static final long serialVersionUID = -7926980450209760212L;
+
+		private ArrayList<PartitionedBroadcast<MatrixBlock>> _vectors = null;
+		private ArrayList<ScalarObject> _scalars = null;
+		private byte[] _classBytes = null;
+		private String _className = null;
+		private SpoofOperator _op = null; //instantiated on first call
+		
+		public RowAggregateFunction(String className, byte[] classBytes, ArrayList<PartitionedBroadcast<MatrixBlock>> bcMatrices, ArrayList<ScalarObject> scalars) 
+			throws DMLRuntimeException
+		{
+			_vectors = bcMatrices;
+			_scalars = scalars;
+			_className = className;
+			_classBytes = classBytes;
+		}
+		
+		/**
+		 * Executes the operator on the given block plus the matching broadcast blocks.
+		 */
+		@Override
+		public Tuple2<MatrixIndexes, MatrixBlock> call( Tuple2<MatrixIndexes, MatrixBlock> arg0 ) throws Exception {
+			//the operator class is loaded from its shipped byte code on first use
+			if( _op == null )
+				_op = (SpoofOperator) CodegenUtils.createInstance(CodegenUtils.loadClass(_className, _classBytes));
+			
+			//assemble inputs: main block first, then one block per broadcast
+			final int rowBlock = (int) arg0._1().getRowIndex();
+			final ArrayList<MatrixBlock> inputs = getVectorInputsFromBroadcast(arg0._2(), rowBlock);
+			
+			//single-threaded execution into a fresh output block
+			final MatrixBlock result = new MatrixBlock();
+			_op.execute(inputs, _scalars, result);
+			return new Tuple2<MatrixIndexes, MatrixBlock>(new MatrixIndexes(1,1), result);
+		}
+		
+		/**
+		 * Returns the main block followed by, per broadcast, its block at rowIndex or block 1 if it has fewer row blocks.
+		 */
+		private ArrayList<MatrixBlock> getVectorInputsFromBroadcast(MatrixBlock blkIn, int rowIndex) throws DMLRuntimeException {
+			final ArrayList<MatrixBlock> blocks = new ArrayList<MatrixBlock>();
+			blocks.add(blkIn);
+			for( PartitionedBroadcast<MatrixBlock> bc : _vectors ) {
+				final int rix = (bc.getNumRowBlocks() < rowIndex) ? 1 : rowIndex;
+				blocks.add(bc.getBlock(rix, 1));
+			}
+			return blocks;
+		}
+	}
+	
+	/**
+	 * Spark partition function for cellwise operators: every input block is processed on its own;
+	 * full aggregates yield a 1x1 block, row aggregates are re-keyed to column block 1.
+	 */
+	private static class CellwiseFunction implements PairFlatMapFunction<Iterator<Tuple2<MatrixIndexes, MatrixBlock>>, MatrixIndexes, MatrixBlock> {
+		private static final long serialVersionUID = -8209188316939435099L;
+		
+		private ArrayList<PartitionedBroadcast<MatrixBlock>> _vectors = null;
+		private ArrayList<ScalarObject> _scalars = null;
+		private byte[] _classBytes = null;
+		private String _className = null;
+		private SpoofOperator _op = null; //instantiated on first call
+		
+		public CellwiseFunction(String className, byte[] classBytes, ArrayList<PartitionedBroadcast<MatrixBlock>> bcMatrices, ArrayList<ScalarObject> scalars) 
+			throws DMLRuntimeException
+		{
+			_vectors = bcMatrices;
+			_scalars = scalars;
+			_className = className;
+			_classBytes = classBytes;
+		}
+		
+		/** Executes the operator for all blocks of a partition and returns the collected output blocks. */
+		@Override
+		public Iterator<Tuple2<MatrixIndexes, MatrixBlock>> call(Iterator<Tuple2<MatrixIndexes, MatrixBlock>> arg) throws Exception {
+			//the operator class is loaded from its shipped byte code on first use
+			if( _op == null )
+				_op = (SpoofOperator) CodegenUtils.createInstance(CodegenUtils.loadClass(_className, _classBytes));
+			
+			final List<Tuple2<MatrixIndexes, MatrixBlock>> results = new ArrayList<Tuple2<MatrixIndexes,MatrixBlock>>();
+			while( arg.hasNext() ) {
+				final Tuple2<MatrixIndexes,MatrixBlock> entry = arg.next();
+				final MatrixBlock blkOut = new MatrixBlock();
+				final ArrayList<MatrixBlock> inputs = getVectorInputsFromBroadcast(entry._2(), (int)entry._1().getRowIndex());
+				final CellType type = ((SpoofCellwise)_op).getCellType();
+				
+				MatrixIndexes key = entry._1(); //output key defaults to the input key
+				if( type == CellType.FULL_AGG ) {
+					//wrap the scalar aggregate into a 1x1 block
+					final ScalarObject agg = _op.execute(inputs, _scalars, 1);
+					blkOut.reset(1, 1);
+					blkOut.quickSetValue(0, 0, agg.getDoubleValue());	
+				}
+				else {
+					//row aggregates map all blocks of a row to column block 1
+					if( type == CellType.ROW_AGG )
+						key = new MatrixIndexes(key.getRowIndex(), 1);
+					_op.execute(inputs, _scalars, blkOut);
+				}
+				results.add(new Tuple2<MatrixIndexes,MatrixBlock>(key, blkOut));
+			}
+			return results.iterator();
+		}
+		
+		/** Returns the main block followed by, per broadcast, its block at rowIndex or block 1 if it has fewer row blocks. */
+		private ArrayList<MatrixBlock> getVectorInputsFromBroadcast(MatrixBlock blkIn, int rowIndex) throws DMLRuntimeException {
+			final ArrayList<MatrixBlock> blocks = new ArrayList<MatrixBlock>();
+			blocks.add(blkIn);
+			for( PartitionedBroadcast<MatrixBlock> bc : _vectors ) {
+				final int rix = (bc.getNumRowBlocks() < rowIndex) ? 1 : rowIndex;
+				blocks.add(bc.getBlock(rix, 1));
+			}
+			return blocks;
+		}
+	}	
+	
+	/**
+	 * Spark partition function for outer-product operators over the blocks of X, with U and V
+	 * taken from the first and second broadcast, respectively.
+	 */
+	private static class OuterProductFunction implements PairFlatMapFunction<Iterator<Tuple2<MatrixIndexes, MatrixBlock>>, MatrixIndexes, MatrixBlock> {
+		private static final long serialVersionUID = -8209188316939435099L; //NOTE(review): same value as in CellwiseFunction
+		
+		private ArrayList<PartitionedBroadcast<MatrixBlock>> _bcMatrices = null;
+		private ArrayList<ScalarObject> _scalars = null;
+		private byte[] _classBytes = null;
+		private String _className = null;
+		private SpoofOperator _op = null; //instantiated on first call
+		
+		public OuterProductFunction(String className, byte[] classBytes, ArrayList<PartitionedBroadcast<MatrixBlock>> bcMatrices, ArrayList<ScalarObject> scalars) 
+			throws DMLRuntimeException
+		{
+			_bcMatrices = bcMatrices;
+			_scalars = scalars;
+			_className = className;
+			_classBytes = classBytes;
+		}
+		
+		/** Executes the operator for all blocks of a partition and returns the collected output blocks. */
+		@Override
+		public Iterator<Tuple2<MatrixIndexes, MatrixBlock>> call(Iterator<Tuple2<MatrixIndexes, MatrixBlock>> arg) throws Exception {
+			//the operator class is loaded from its shipped byte code on first use
+			if( _op == null )
+				_op = (SpoofOperator) CodegenUtils.createInstance(CodegenUtils.loadClass(_className, _classBytes));
+			
+			final List<Tuple2<MatrixIndexes, MatrixBlock>> results = new ArrayList<Tuple2<MatrixIndexes,MatrixBlock>>();
+			while( arg.hasNext() ) {
+				final Tuple2<MatrixIndexes,MatrixBlock> entry = arg.next();
+				final MatrixIndexes ix = entry._1();
+				final MatrixBlock blkOut = new MatrixBlock();
+				
+				//inputs: block of X, matching row block of U, and matching column block of V
+				final ArrayList<MatrixBlock> inputs = new ArrayList<MatrixBlock>();
+				inputs.add(entry._2());
+				inputs.add(_bcMatrices.get(0).getBlock((int)ix.getRowIndex(), 1));
+				inputs.add(_bcMatrices.get(1).getBlock((int)ix.getColumnIndex(), 1));
+				
+				final OutProdType type = ((SpoofOuterProduct)_op).getOuterProdType();
+				if( type != OutProdType.AGG_OUTER_PRODUCT ) {
+					_op.execute(inputs, _scalars, blkOut);
+				}
+				else { //wrap the scalar aggregate into a 1x1 block
+					final ScalarObject agg = _op.execute(inputs, _scalars,1);
+					blkOut.reset(1, 1);
+					blkOut.quickSetValue(0, 0, agg.getDoubleValue());
+				}
+				results.add(new Tuple2<MatrixIndexes,MatrixBlock>(createOutputIndexes(ix,_op), blkOut));				
+			}
+			return results.iterator();
+		}
+		
+		/**
+		 * Maps input to output block indexes: (col,1) for left and (row,1) for right outer products, otherwise unchanged.
+		 */
+		private MatrixIndexes createOutputIndexes(MatrixIndexes in, SpoofOperator spoofOp) {
+			final OutProdType type = ((SpoofOuterProduct)spoofOp).getOuterProdType();
+			if( type == OutProdType.LEFT_OUTER_PRODUCT )
+				return new MatrixIndexes(in.getColumnIndex(), 1);
+			if( type == OutProdType.RIGHT_OUTER_PRODUCT )
+				return new MatrixIndexes(in.getRowIndex(), 1);
+			return in;
+		}		
+	}
+}

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/982ecb1a/src/main/java/org/apache/sysml/runtime/instructions/spark/utils/RDDAggregateUtils.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/instructions/spark/utils/RDDAggregateUtils.java b/src/main/java/org/apache/sysml/runtime/instructions/spark/utils/RDDAggregateUtils.java
index 61c950a..2dfff74 100644
--- a/src/main/java/org/apache/sysml/runtime/instructions/spark/utils/RDDAggregateUtils.java
+++ b/src/main/java/org/apache/sysml/runtime/instructions/spark/utils/RDDAggregateUtils.java
@@ -69,13 +69,17 @@ public class RDDAggregateUtils
 		}
 	}
 
-	public static JavaPairRDD<MatrixIndexes, MatrixBlock> sumByKeyStable( JavaPairRDD<MatrixIndexes, MatrixBlock> in )
+	public static JavaPairRDD<MatrixIndexes, MatrixBlock> sumByKeyStable( JavaPairRDD<MatrixIndexes, MatrixBlock> in ) {
+		return sumByKeyStable(in, in.getNumPartitions());
+	}
+	
+	public static JavaPairRDD<MatrixIndexes, MatrixBlock> sumByKeyStable( JavaPairRDD<MatrixIndexes, MatrixBlock> in, int numPartitions  )
 	{
 		//stable sum of blocks per key, by passing correction blocks along with aggregates 		
 		JavaPairRDD<MatrixIndexes, CorrMatrixBlock> tmp = 
 				in.combineByKey( new CreateCorrBlockCombinerFunction(), 
 							     new MergeSumBlockValueFunction(), 
-							     new MergeSumBlockCombinerFunction() );
+							     new MergeSumBlockCombinerFunction(), numPartitions );
 		
 		//strip-off correction blocks from 					     
 		JavaPairRDD<MatrixIndexes, MatrixBlock> out =

[3/9] incubator-systemml git commit: [SYSTEMML-1285] New basic code generator for operator fusion

Posted by mb...@apache.org.

[SYSTEMML-1285] New basic code generator for operator fusion

This patch introduces a cleaned-up version of SPOOF's basic code
generator, covering its core compiler and runtime operators as well as
its basic integration into the stats and explain tools (SYSTEMML-1296
and SYSTEMML-1297).

Furthermore, this also includes the following minor fixes and
improvements of existing components:

* Fix of rewrite utils for creating binary scalar operations with
boolean outputs
* Cleanup instruction generation convolution lop
* Fix lop dag compilation (removed constraint of max 7 input lops)
* Improved value type handling of scalar comparison instructions
* Fix various gpu-related src and javadoc warnings


Project: http://git-wip-us.apache.org/repos/asf/incubator-systemml/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-systemml/commit/d7fd5879
Tree: http://git-wip-us.apache.org/repos/asf/incubator-systemml/tree/d7fd5879
Diff: http://git-wip-us.apache.org/repos/asf/incubator-systemml/diff/d7fd5879

Branch: refs/heads/master
Commit: d7fd58795c06dea8db6fb55a045a8b312547f398
Parents: b78c125
Author: Matthias Boehm <mb...@gmail.com>
Authored: Sun Feb 26 18:53:46 2017 -0800
Committer: Matthias Boehm <mb...@gmail.com>
Committed: Sun Feb 26 18:53:46 2017 -0800

----------------------------------------------------------------------
 src/main/java/org/apache/sysml/hops/Hop.java    |   5 +
 .../sysml/hops/codegen/SpoofCompiler.java       | 407 ++++++++++++++
 .../apache/sysml/hops/codegen/SpoofFusedOp.java | 212 ++++++++
 .../apache/sysml/hops/codegen/cplan/CNode.java  | 167 ++++++
 .../sysml/hops/codegen/cplan/CNodeBinary.java   | 260 +++++++++
 .../sysml/hops/codegen/cplan/CNodeCell.java     | 144 +++++
 .../sysml/hops/codegen/cplan/CNodeData.java     |  94 ++++
 .../hops/codegen/cplan/CNodeOuterProduct.java   | 165 ++++++
 .../hops/codegen/cplan/CNodeRowAggVector.java   | 111 ++++
 .../sysml/hops/codegen/cplan/CNodeTpl.java      | 201 +++++++
 .../sysml/hops/codegen/cplan/CNodeUnary.java    | 206 +++++++
 .../sysml/hops/codegen/template/BaseTpl.java    |  63 +++
 .../sysml/hops/codegen/template/CellTpl.java    | 289 ++++++++++
 .../hops/codegen/template/CplanRegister.java    | 168 ++++++
 .../hops/codegen/template/OuterProductTpl.java  | 489 +++++++++++++++++
 .../sysml/hops/codegen/template/RowAggTpl.java  | 321 +++++++++++
 .../hops/codegen/template/TemplateUtils.java    | 313 +++++++++++
 .../sysml/hops/rewrite/HopRewriteUtils.java     |  10 +-
 .../apache/sysml/lops/ConvolutionTransform.java |  49 +-
 src/main/java/org/apache/sysml/lops/Lop.java    | 107 ++--
 .../java/org/apache/sysml/lops/SpoofFused.java  | 119 ++++
 .../java/org/apache/sysml/lops/compile/Dag.java |  63 +--
 .../sysml/runtime/codegen/ByteClassLoader.java  |  40 ++
 .../sysml/runtime/codegen/CodegenUtils.java     | 268 +++++++++
 .../runtime/codegen/LibSpoofPrimitives.java     | 257 +++++++++
 .../sysml/runtime/codegen/SpoofCellwise.java    | 430 +++++++++++++++
 .../sysml/runtime/codegen/SpoofOperator.java    |  74 +++
 .../runtime/codegen/SpoofOuterProduct.java      | 541 +++++++++++++++++++
 .../runtime/codegen/SpoofRowAggregate.java      | 188 +++++++
 .../controlprogram/parfor/util/IDSequence.java  |  21 +-
 .../cp/RelationalBinaryCPInstruction.java       |  52 +-
 .../cp/ScalarScalarRelationalCPInstruction.java |  22 +-
 .../instructions/gpu/context/GPUContext.java    |   2 +
 .../instructions/gpu/context/GPUObject.java     |   1 +
 .../instructions/gpu/context/JCudaObject.java   |   2 +
 .../runtime/matrix/data/LibMatrixMult.java      |  18 +-
 .../sysml/runtime/util/LocalFileUtils.java      |  24 +
 .../java/org/apache/sysml/utils/Statistics.java |  73 +++
 38 files changed, 5742 insertions(+), 234 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/d7fd5879/src/main/java/org/apache/sysml/hops/Hop.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/hops/Hop.java b/src/main/java/org/apache/sysml/hops/Hop.java
index 3aa3dab..4021a1a 100644
--- a/src/main/java/org/apache/sysml/hops/Hop.java
+++ b/src/main/java/org/apache/sysml/hops/Hop.java
@@ -789,6 +789,11 @@ public abstract class Hop
 	public ArrayList<Hop> getInput() {
 		return _input;
 	}
+	
+	public void addInput( Hop h ) { //adds h as input of this hop and this hop as parent of h
+		h._parent.add(this); //dereference h first: a null h now fails before _input is modified
+		_input.add(h);
+	}
 
 	public long getRowsInBlock() {
 		return _rows_in_block;

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/d7fd5879/src/main/java/org/apache/sysml/hops/codegen/SpoofCompiler.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/hops/codegen/SpoofCompiler.java b/src/main/java/org/apache/sysml/hops/codegen/SpoofCompiler.java
new file mode 100644
index 0000000..dd24703
--- /dev/null
+++ b/src/main/java/org/apache/sysml/hops/codegen/SpoofCompiler.java
@@ -0,0 +1,407 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ * 
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.sysml.hops.codegen;
+
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.LinkedHashMap;
+import java.util.Map.Entry;
+import java.util.concurrent.ConcurrentHashMap;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.sysml.api.DMLException;
+import org.apache.sysml.api.DMLScript;
+import org.apache.sysml.hops.codegen.cplan.CNode;
+import org.apache.sysml.hops.codegen.cplan.CNodeCell;
+import org.apache.sysml.hops.codegen.cplan.CNodeData;
+import org.apache.sysml.hops.codegen.cplan.CNodeOuterProduct;
+import org.apache.sysml.hops.codegen.cplan.CNodeTpl;
+import org.apache.sysml.hops.codegen.cplan.CNodeUnary;
+import org.apache.sysml.hops.codegen.template.BaseTpl;
+import org.apache.sysml.hops.codegen.template.CellTpl;
+import org.apache.sysml.hops.codegen.template.CplanRegister;
+import org.apache.sysml.hops.codegen.template.OuterProductTpl;
+import org.apache.sysml.hops.codegen.template.RowAggTpl;
+import org.apache.sysml.hops.Hop;
+import org.apache.sysml.hops.HopsException;
+import org.apache.sysml.hops.rewrite.HopRewriteUtils;
+import org.apache.sysml.parser.DMLProgram;
+import org.apache.sysml.parser.ForStatement;
+import org.apache.sysml.parser.ForStatementBlock;
+import org.apache.sysml.parser.FunctionStatement;
+import org.apache.sysml.parser.FunctionStatementBlock;
+import org.apache.sysml.parser.IfStatement;
+import org.apache.sysml.parser.IfStatementBlock;
+import org.apache.sysml.parser.LanguageException;
+import org.apache.sysml.parser.StatementBlock;
+import org.apache.sysml.parser.WhileStatement;
+import org.apache.sysml.parser.WhileStatementBlock;
+import org.apache.sysml.runtime.DMLRuntimeException;
+import org.apache.sysml.runtime.codegen.CodegenUtils;
+import org.apache.sysml.runtime.codegen.SpoofCellwise.CellType;
+import org.apache.sysml.runtime.matrix.data.Pair;
+import org.apache.sysml.utils.Explain;
+import org.apache.sysml.utils.Explain.ExplainType;
+import org.apache.sysml.utils.Statistics;
+
+public class SpoofCompiler 
+{
+	private static final Log LOG = LogFactory.getLog(SpoofCompiler.class.getName());
+	
+	public static boolean OPTIMIZE = true;
+	
+	//internal configuration flags
+	public static final boolean LDEBUG = false;
+	public static final boolean SUM_PRODUCT = false;
+	public static final boolean RECOMPILE = true;
+	public static boolean USE_PLAN_CACHE = true;
+	public static boolean ALWAYS_COMPILE_LITERALS = false;
+	public static final boolean ALLOW_SPARK_OPS = false;
+	
+	//plan cache for cplan->compiled source to avoid unnecessary codegen/source code compile
+	//for equal operators from (1) different hop dags and (2) repeated recompilation 
+	private static ConcurrentHashMap<CNode, Class<?>> planCache = new ConcurrentHashMap<CNode, Class<?>>();
+	
+	/**
+	 * Generates fused operators for an entire program: first for the function statement
+	 * blocks of all namespaces, then for the statement blocks of the main program.
+	 * The static plan cache is cleared before any block is processed.
+	 *
+	 * @param dmlp program whose statement blocks are processed
+	 * @throws DMLRuntimeException if the optimization of a hop dag failed
+	 */
+	public static void generateCode(DMLProgram dmlp) throws LanguageException, HopsException, DMLRuntimeException {
+		planCache.clear();
+		
+		//function statement blocks of all namespaces
+		for( String ns : dmlp.getNamespaces().keySet() )
+			for( String fn : dmlp.getFunctionStatementBlocks(ns).keySet() )
+				generateCodeFromStatementBlock(dmlp.getFunctionStatementBlock(ns, fn));
+		
+		//regular statement blocks of the "main" method
+		for( int pos = 0; pos < dmlp.getNumStatementBlocks(); pos++ )
+			generateCodeFromStatementBlock(dmlp.getStatementBlock(pos));
+	}
+	
+	public static void generateCodeFromStatementBlock(StatementBlock current)
+		throws HopsException, DMLRuntimeException
+	{		
+		if (current instanceof FunctionStatementBlock)
+		{
+			FunctionStatementBlock fsb = (FunctionStatementBlock)current;
+			FunctionStatement fstmt = (FunctionStatement)fsb.getStatement(0);
+			for (StatementBlock sb : fstmt.getBody())
+				generateCodeFromStatementBlock(sb);
+		}
+		else if (current instanceof WhileStatementBlock)
+		{
+			WhileStatementBlock wsb = (WhileStatementBlock) current;
+			WhileStatement wstmt = (WhileStatement)wsb.getStatement(0);
+			wsb.setPredicateHops(optimize(wsb.getPredicateHops(), true));
+			for (StatementBlock sb : wstmt.getBody())
+				generateCodeFromStatementBlock(sb);
+		}	
+		else if (current instanceof IfStatementBlock)
+		{
+			IfStatementBlock isb = (IfStatementBlock) current;
+			IfStatement istmt = (IfStatement)isb.getStatement(0);
+			isb.setPredicateHops(optimize(isb.getPredicateHops(), true));
+			for (StatementBlock sb : istmt.getIfBody())
+				generateCodeFromStatementBlock(sb);
+			for (StatementBlock sb : istmt.getElseBody())
+				generateCodeFromStatementBlock(sb);
+		}
+		else if (current instanceof ForStatementBlock) //incl parfor
+		{
+			ForStatementBlock fsb = (ForStatementBlock) current;
+			ForStatement fstmt = (ForStatement)fsb.getStatement(0);
+			fsb.setFromHops(optimize(fsb.getFromHops(), true));
+			fsb.setToHops(optimize(fsb.getToHops(), true));
+			fsb.setIncrementHops(optimize(fsb.getIncrementHops(), true));
+			for (StatementBlock sb : fstmt.getBody())
+				generateCodeFromStatementBlock(sb);
+		}
+		else //generic (last-level)
+		{
+			current.set_hops( generateCodeFromHopDAGs(current.get_hops()) );
+			current.updateRecompilationFlag();
+		}
+	}
+
+	/**
+	 * Optimizes the given hop dags with literals compiled as constants and resets the
+	 * visit status of both the given and the returned roots.
+	 */
+	public static ArrayList<Hop> generateCodeFromHopDAGs(ArrayList<Hop> roots) throws HopsException, DMLRuntimeException {
+		if( roots == null ) //nothing to optimize
+			return null;
+		final ArrayList<Hop> result = optimize(roots, true);
+		Hop.resetVisitStatus(roots);
+		Hop.resetVisitStatus(result);
+		return result;
+	}
+	
+	
+	/**
+	 * Main interface of sum-product optimizer for a single dag such as a predicate.
+	 * @param root dag root node, may be null
+	 * @param compileLiterals if true literals compiled as constants, otherwise as scalar variables
+	 * @return dag root node of modified dag, or null if the given root was null
+	 * @throws DMLRuntimeException if optimization failed
+	 */
+	public static Hop optimize( Hop root, boolean compileLiterals ) throws DMLRuntimeException {
+		if( root != null ) {
+			ArrayList<Hop> single = new ArrayList<Hop>(Arrays.asList(root));
+			return optimize(single, compileLiterals).get(0);
+		}
+		return null;
+	}
+	
+	/**
+	 * Main interface of sum-product optimizer, statement block dag. Constructs the codegen plans,
+	 * compiles a class per plan unless available in the plan cache, and modifies the hop dag accordingly.
+	 * @param roots dag root nodes; returned unmodified if null, empty, or if OPTIMIZE is disabled
+	 * @param compileLiterals if true literals compiled as constants, otherwise as scalar variables
+	 * @return dag root nodes of modified dag 
+	 * @throws DMLRuntimeException if optimization failed; wraps any exception raised in the process
+	 */
+	@SuppressWarnings("unused") //presumably due to the constant LDEBUG flag - TODO confirm
+	public static ArrayList<Hop> optimize(ArrayList<Hop> roots, boolean compileLiterals) 
+		throws DMLRuntimeException 
+	{
+		if( roots == null || roots.isEmpty() || !OPTIMIZE )
+			return roots;
+	
+		long t0 = DMLScript.STATISTICS ? System.nanoTime() : 0; //start time, only taken if statistics are enabled
+		ArrayList<Hop> ret = roots;
+		
+		try
+		{
+			//construct codegen plans
+			HashMap<Long, Pair<Hop[],CNodeTpl>>  cplans = constructCPlans(roots, compileLiterals);
+			
+			//cleanup codegen plans (remove unnecessary inputs, fix hop-cnodedata mapping,
+			//remove empty templates with single cnodedata input)
+			cplans = cleanupCPlans(cplans);
+					
+			//explain before modification
+			if( LDEBUG && cplans.size() > 0 ) { //existing cplans
+				LOG.info("Codegen EXPLAIN (before optimize): \n"+Explain.explainHops(roots));
+			}
+			
+			//source code generation for all cplans
+			HashMap<Long, Pair<Hop[],Class<?>>> clas = new HashMap<Long, Pair<Hop[],Class<?>>>();
+			for( Entry<Long, Pair<Hop[],CNodeTpl>> cplan : cplans.entrySet() ) {
+				Pair<Hop[],CNodeTpl> tmp = cplan.getValue();
+				
+				if( !USE_PLAN_CACHE || !planCache.containsKey(tmp.getValue()) ) { //plan cache disabled or cache miss
+					//generate java source code
+					String src = tmp.getValue().codegen(false);
+					
+					//explain debug output generated source code
+					if( LDEBUG || DMLScript.EXPLAIN != ExplainType.NONE ) {
+						LOG.info("Codegen EXPLAIN (generated code for HopID: " +  cplan.getKey() +"):");
+						LOG.info(src);
+					}
+					
+					//compile generated java source code
+					Class<?> cla = CodegenUtils.compileClass(tmp.getValue().getClassname(), src);
+					planCache.put(tmp.getValue(), cla); //note: stored even if USE_PLAN_CACHE is false
+				}
+				else if( LDEBUG || DMLScript.STATISTICS ) {
+					Statistics.incrementCodegenPlanCacheHits();
+				}
+				
+				Class<?> cla = planCache.get(tmp.getValue()); //the class is always obtained via the plan cache
+				if(cla != null)
+					clas.put(cplan.getKey(), new Pair<Hop[],Class<?>>(tmp.getKey(),cla));
+				
+				if( LDEBUG || DMLScript.STATISTICS )
+					Statistics.incrementCodegenPlanCacheTotal();
+			}
+			
+			//generate final hop dag
+			ret = constructModifiedHopDag(roots, cplans, clas);
+			
+			//explain after modification
+			//NOTE(review): explains roots rather than ret - presumably the dag is modified in place; confirm
+			if( LDEBUG && cplans.size() > 0 ) { //existing cplans
+				LOG.info("Codegen EXPLAIN (after optimize): \n"+Explain.explainHops(roots));
+			}
+		}
+		catch( Exception ex ) {
+			throw new DMLRuntimeException(ex);
+		}
+		
+		if( DMLScript.STATISTICS ) {
+			Statistics.incrementCodegenDAGCompile();
+			Statistics.incrementCodegenCompileTime(System.nanoTime()-t0);
+		}
+			
+		return ret;
+	}
+
+	
+	////////////////////
+	// Codegen plan construction
+
+	private static HashMap<Long, Pair<Hop[],CNodeTpl>> constructCPlans(ArrayList<Hop> roots, boolean compileLiterals) throws DMLException
+	{
+		//collect top-level cplans of all roots by hop id (first plan per id wins)
+		LinkedHashMap<Long, Pair<Hop[],CNodeTpl>> plans = new LinkedHashMap<Long, Pair<Hop[],CNodeTpl>>();
+		for( Hop root : roots ) {
+			//each root is explored with its own register and memo
+			CplanRegister reg = new CplanRegister();
+			rConstructCPlans(root, reg, new HashSet<Long>(), compileLiterals);
+			for( Entry<Long, Pair<Hop[],CNodeTpl>> e : reg.getTopLevelCplans().entrySet() )
+				if( !plans.containsKey(e.getKey()) )
+					plans.put(e.getKey(), e.getValue());
+		}
+		return plans;
+	}
+	
+	private static void rConstructCPlans(Hop hop, CplanRegister cplanReg, HashSet<Long> memo, boolean compileLiterals) throws DMLException
+	{		
+		if( memo.contains(hop.getHopID()) )
+			return;
+		//not visited yet: try to open every template at this hop, then descend
+		//construct fresh template instances for this hop (tried in this order)
+		BaseTpl[] templates = new BaseTpl[]{
+				new RowAggTpl(), new CellTpl(), new OuterProductTpl()};
+		
+		//process hop with all templates; a plan is registered only if the
+		for( BaseTpl tpl : templates ) { //template opens and finds its boundaries
+			if( tpl.openTpl(hop) && tpl.findTplBoundaries(hop,cplanReg) ) {
+				cplanReg.insertCpplans(tpl.getType(), 
+					tpl.constructTplCplan(compileLiterals));
+			}		
+		}
+		
+		//mark hop as processed and process its inputs recursively
+		memo.add(hop.getHopID());
+		for( Hop c : hop.getInput() )
+			rConstructCPlans(c, cplanReg, memo, compileLiterals);
+	}
+	
+	////////////////////
+	// Codegen hop dag construction
+
+	private static ArrayList<Hop> constructModifiedHopDag(ArrayList<Hop> orig, 
+			HashMap<Long, Pair<Hop[],CNodeTpl>> cplans, HashMap<Long, Pair<Hop[],Class<?>>> cla)
+	{
+		//memo of processed hop ids, shared across all roots
+		HashSet<Long> visited = new HashSet<Long>();
+		//index-based traversal w/o iterator because the dag is modified
+		for( int pos=0; pos<orig.size(); pos++ )
+			rConstructModifiedHopDag(orig.get(pos), cplans, cla, visited);
+		return orig;
+	}
+	
+	private static void rConstructModifiedHopDag(Hop hop,  HashMap<Long, Pair<Hop[],CNodeTpl>> cplans,
+			HashMap<Long, Pair<Hop[],Class<?>>> clas, HashSet<Long> memo)
+	{
+		if( memo.contains(hop.getHopID()) )
+			return; //already processed
+		//hnew is the hop to descend into: the original hop or its replacement
+		Hop hnew = hop;
+		if( clas.containsKey(hop.getHopID()) ) 
+		{
+			//replace sub-dag with generated operator (created with dist=false)
+			Pair<Hop[], Class<?>> tmpCla = clas.get(hop.getHopID());
+			CNodeTpl tmpCNode = cplans.get(hop.getHopID()).getValue();
+			hnew = new SpoofFusedOp(hop.getName(), hop.getDataType(), hop.getValueType(), 
+					tmpCla.getValue(), false, tmpCNode.getOutputDimType());
+			for( Hop in : tmpCla.getKey() ) {
+				hnew.addInput(in); //add inputs
+			}
+			hnew.setOutputBlocksizes(hop.getRowsInBlock() , hop.getColsInBlock());
+			hnew.setDim1(hop.getDim1());
+			hnew.setDim2(hop.getDim2());
+			if(tmpCNode instanceof CNodeOuterProduct && ((CNodeOuterProduct)tmpCNode).isTransposeOutput() ) {
+				hnew = HopRewriteUtils.createTranspose(hnew);
+			}
+			//presumably redirects all parents of hop to hnew - confirm in HopRewriteUtils
+			HopRewriteUtils.rewireAllParentChildReferences(hop, hnew);
+			memo.add(hnew.getHopID());
+		}
+		
+		//process hops recursively (parent-child links modified)
+		for( int i=0; i<hnew.getInput().size(); i++ ) {
+			Hop c = hnew.getInput().get(i);
+			rConstructModifiedHopDag(c, cplans, clas, memo);
+		}
+		memo.add(hnew.getHopID());
+	}
+	
+	/**
+	 * Cleanup generated cplans in order to remove unnecessary inputs created
+	 * during incremental construction. This is important as it avoids unnecessary 
+	 * redundant computation. Trivial and empty cplans are dropped from the result.
+	 * @param cplans set of cplans by hop id
+	 * @return new map with the retained, cleaned-up cplans
+	 */
+	private static HashMap<Long, Pair<Hop[],CNodeTpl>> cleanupCPlans(HashMap<Long, Pair<Hop[],CNodeTpl>> cplans) {
+		HashMap<Long, Pair<Hop[],CNodeTpl>> cplans2 = new HashMap<Long, Pair<Hop[],CNodeTpl>>();
+		for( Entry<Long, Pair<Hop[],CNodeTpl>> e : cplans.entrySet() ) {
+			CNodeTpl tpl = e.getValue().getValue();
+			Hop[] inHops = e.getValue().getKey();
+			
+			//collect hop ids of all cplan leaf (data) nodes
+			HashSet<Long> leafs = new HashSet<Long>();
+			rCollectLeafIDs(tpl.getOutput(), leafs);
+			
+			//create clean cplan w/ minimal inputs (kept as-is if input and leaf counts match)
+			if( inHops.length == leafs.size() )
+				cplans2.put(e.getKey(), e.getValue());
+			else {
+				tpl.cleanupInputs(leafs);
+				ArrayList<Hop> tmp = new ArrayList<Hop>();
+				for( Hop hop : inHops )
+					if( leafs.contains(hop.getHopID()) )
+						tmp.add(hop);
+				cplans2.put(e.getKey(), new Pair<Hop[],CNodeTpl>(
+						tmp.toArray(new Hop[0]),tpl));
+			}
+			
+			//remove cplan w/ single op and w/o agg (unary over a data node)
+			if( tpl instanceof CNodeCell && ((CNodeCell)tpl).getCellType()==CellType.NO_AGG
+				&& tpl.getOutput() instanceof CNodeUnary && tpl.getOutput().getInput().get(0) instanceof CNodeData) 
+				cplans2.remove(e.getKey());
+		
+			//remove cplan if empty (output is a plain data node)
+			if( tpl.getOutput() instanceof CNodeData )
+				cplans2.remove(e.getKey());
+		}
+		
+		return cplans2;
+	}
+	
+	private static void rCollectLeafIDs(CNode node, HashSet<Long> leafs) {
+		//data nodes contribute the id of the hop they were created from
+		if( node instanceof CNodeData )
+			leafs.add(((CNodeData) node).getHopID());
+		//descend into all inputs of the current node
+		ArrayList<CNode> inputs = node.getInput();
+		for( int i=0; i<inputs.size(); i++ )
+			rCollectLeafIDs(inputs.get(i), leafs);
+	}
+}

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/d7fd5879/src/main/java/org/apache/sysml/hops/codegen/SpoofFusedOp.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/hops/codegen/SpoofFusedOp.java b/src/main/java/org/apache/sysml/hops/codegen/SpoofFusedOp.java
new file mode 100644
index 0000000..357d41c
--- /dev/null
+++ b/src/main/java/org/apache/sysml/hops/codegen/SpoofFusedOp.java
@@ -0,0 +1,212 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ * 
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.sysml.hops.codegen;
+
+import java.util.ArrayList;
+
+import org.apache.sysml.hops.Hop;
+import org.apache.sysml.hops.Hop.MultiThreadedHop;
+import org.apache.sysml.hops.HopsException;
+import org.apache.sysml.hops.MemoTable;
+import org.apache.sysml.hops.OptimizerUtils;
+import org.apache.sysml.lops.Lop;
+import org.apache.sysml.lops.LopProperties.ExecType;
+import org.apache.sysml.lops.LopsException;
+import org.apache.sysml.lops.SpoofFused;
+import org.apache.sysml.parser.Expression.DataType;
+import org.apache.sysml.parser.Expression.ValueType;
+
+public class SpoofFusedOp extends Hop implements MultiThreadedHop
+{ //hop of a fused operator that is backed by a generated class (see constructLops)
+	public enum SpoofOutputDimsType {
+		INPUT_DIMS, //dims of first input
+		ROW_DIMS, //rows of first input x 1
+		COLUMN_DIMS_ROWS, //cols of first input x 1
+		COLUMN_DIMS_COLS, //1 x cols of first input
+		SCALAR, //0 x 0
+		ROW_RANK_DIMS, // right wdivmm: rows of first input x cols of second input
+		COLUMN_RANK_DIMS  // left wdivmm: cols of first input x cols of second input
+	}
+	//generated operator class, exec type flag, thread constraint, and output dims rule
+	private Class<?> _class = null;
+	private boolean _distSupported = false;
+	private int _numThreads = -1;
+	private SpoofOutputDimsType _dimsType;
+	
+	public SpoofFusedOp ( ) {
+		//default constructor, used by clone()
+	}
+	
+	public SpoofFusedOp( String name, DataType dt, ValueType vt, Class<?> cla, boolean dist, SpoofOutputDimsType type ) {
+		super(name, dt, vt);
+		_class = cla;
+		_distSupported = dist;
+		_dimsType = type;
+	}
+	
+	@Override
+	public void setMaxNumThreads(int k) {
+		_numThreads = k;
+	}
+
+	@Override
+	public int getMaxNumThreads() {
+		return _numThreads;
+	}
+
+	@Override
+	public boolean allowsAllExecTypes() {
+		return _distSupported;
+	}
+
+	@Override
+	protected double computeOutputMemEstimate(long dim1, long dim2, long nnz) {
+		return OptimizerUtils.estimateSize(dim1, dim2); //nnz is not used
+	}
+
+	@Override
+	protected double computeIntermediateMemEstimate(long dim1, long dim2, long nnz) {
+		return 0;
+	}
+
+	@Override
+	protected long[] inferOutputCharacteristics(MemoTable memo) {
+		return null;
+	}
+
+	@Override
+	public Lop constructLops() throws HopsException, LopsException {
+		if( getLops() != null ) //reuse existing lops
+			return getLops();
+		
+		ExecType et = optFindExecType();
+		//construct input lops in input order
+		ArrayList<Lop> inputs = new ArrayList<Lop>();
+		for( Hop c : getInput() )
+			inputs.add(c.constructLops());
+		//create the fused lop over the generated class and the constrained number of threads
+		int k = OptimizerUtils.getConstrainedNumThreads(_numThreads);
+		SpoofFused lop = new SpoofFused(inputs, getDataType(), getValueType(), _class, k, et);
+		setOutputDimensions(lop);
+		setLineNumbers(lop);
+		setLops(lop);
+	
+		return lop;
+	}
+	
+	@Override
+	protected ExecType optFindExecType() throws HopsException {
+		
+		checkAndSetForcedPlatform();
+		
+		if( _etypeForced != null ) {		
+			_etype = _etypeForced;
+		}
+		else {
+			_etype = findExecTypeByMemEstimate();
+			checkAndSetInvalidCPDimsAndSize();
+		}
+		
+		//ensure valid execution plans (MR is always replaced by CP)
+		if( _etype == ExecType.MR )
+			_etype = ExecType.CP;
+		
+		return _etype;
+	}
+
+	@Override
+	public String getOpString() {
+		return "spoof("+_class.getSimpleName()+")";
+	}
+
+	@Override
+	public void refreshSizeInformation() {
+		switch(_dimsType) //derive output dims from the inputs according to the dims type
+		{
+			case ROW_DIMS:
+				setDim1(getInput().get(0).getDim1());
+				setDim2(1);
+				break;
+			case COLUMN_DIMS_ROWS:
+				setDim1(getInput().get(0).getDim2());
+				setDim2(1);
+				break;
+			case COLUMN_DIMS_COLS:
+				setDim1(1);
+				setDim2(getInput().get(0).getDim2());
+				break;
+			case INPUT_DIMS:
+				setDim1(getInput().get(0).getDim1());
+				setDim2(getInput().get(0).getDim2());
+				break;
+			case SCALAR:
+				setDim1(0);
+				setDim2(0);
+				break;
+			case ROW_RANK_DIMS:
+				setDim1(getInput().get(0).getDim1());
+				setDim2(getInput().get(1).getDim2());
+				break;
+			case COLUMN_RANK_DIMS:
+				setDim1(getInput().get(0).getDim2());
+				setDim2(getInput().get(1).getDim2());
+				break;
+			default:
+				throw new RuntimeException("Failed to refresh size information "
+						+ "for type: "+_dimsType.toString());
+		}
+	}
+
+	@Override
+	public Object clone() throws CloneNotSupportedException 
+	{
+		SpoofFusedOp ret = new SpoofFusedOp();	
+		
+		//copy generic attributes
+		ret.clone(this, false);
+		
+		//copy specific attributes
+		ret._class = _class;
+		ret._distSupported = _distSupported;
+		ret._numThreads = _numThreads;
+		ret._dimsType = _dimsType;
+		return ret;
+	}
+	
+	@Override
+	public boolean compare( Hop that )
+	{
+		if( !(that instanceof SpoofFusedOp) )
+			return false;
+		//equal if same class, flags, and thread constraint; _dimsType is not compared
+		SpoofFusedOp that2 = (SpoofFusedOp)that;		
+		boolean ret = ( _class.equals(that2._class)
+				&& _distSupported == that2._distSupported
+				&& _numThreads == that2._numThreads
+				&& getInput().size() == that2.getInput().size());
+		//inputs are compared by object identity, position by position
+		if( ret ) {
+			for( int i=0; i<getInput().size(); i++ )
+				ret &= (getInput().get(i) == that2.getInput().get(i));
+		}
+		
+		return ret;
+	}
+}

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/d7fd5879/src/main/java/org/apache/sysml/hops/codegen/cplan/CNode.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/hops/codegen/cplan/CNode.java b/src/main/java/org/apache/sysml/hops/codegen/cplan/CNode.java
new file mode 100644
index 0000000..46637cc
--- /dev/null
+++ b/src/main/java/org/apache/sysml/hops/codegen/cplan/CNode.java
@@ -0,0 +1,167 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ * 
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.sysml.hops.codegen.cplan;
+
+import java.util.ArrayList;
+import java.util.Arrays;
+
+import org.apache.sysml.parser.Expression.DataType;
+import org.apache.sysml.runtime.controlprogram.parfor.util.IDSequence;
+
+public abstract class CNode
+{ //base class of all codegen plan nodes: inputs, output, and code generation state
+	private static final IDSequence _seq = new IDSequence();
+	
+	protected ArrayList<CNode> _inputs = null; 
+	protected CNode _output = null; 
+	protected boolean _generated = false;
+	protected String _genVar = null;
+	protected long _rows = -1;
+	protected long _cols = -1;
+	protected DataType _dataType;
+	protected boolean _literal = false;
+	
+	//cached hash to allow memoization in DAG structures and repeated 
+	//recursive hash computation over all inputs (w/ reset on updates)
+	protected int _hash = 0;
+	
+	public CNode() {
+		_inputs = new ArrayList<CNode>();
+		_generated = false;
+	}
+
+	public ArrayList<CNode> getInput() {
+		return _inputs;
+	}
+	
+	public String createVarname() {
+		_genVar = "TMP"+_seq.getNextID();
+		return _genVar; 
+	}
+	//name built from the current sequence id minus one, i.e., not necessarily this node's variable
+	protected String getCurrentVarName() {
+		return "TMP"+(_seq.getCurrentID()-1);
+	}
+	
+	public String getVarname() {
+		return _genVar;
+	}
+	
+	public String getClassname() {
+		return getVarname();
+	}
+	//clears the generated flag; inputs are only visited if this node itself is flagged
+	public void resetGenerated() {
+		if( _generated )
+			for( CNode cn : _inputs )
+				cn.resetGenerated();
+		_generated = false;
+	}
+	
+	public void setNumRows(long rows) {
+		_rows = rows;
+	}
+	
+	public long getNumRows() {
+		return _rows;
+	}
+	
+	public void setNumCols(long cols) {
+		_cols = cols;
+	}
+	
+	public long getNumCols() {
+		return _cols;
+	}
+	
+	public DataType getDataType() {
+		return _dataType;
+	}
+	
+	public void setDataType(DataType dt) {
+		_dataType = dt;
+		_hash = 0;
+	}
+	
+	public boolean isLiteral() {
+		return _literal;
+	}
+	
+	public void setLiteral(boolean literal) {
+		_literal = literal;
+		_hash = 0;
+	}
+	
+	public CNode getOutput() {
+		return _output;
+	}
+	
+	public void setOutput(CNode output) {
+		_output = output;
+		_hash = 0;
+	}
+	
+	public abstract String codegen(boolean sparse) ;
+	
+	public abstract void setOutputDims();
+	
+	///////////////////////////////////////
+	// Functionality for plan cache
+	
+	//note: genvar/generated changed on codegen and not considered,
+	//rows and cols also not include to increase reuse potential
+	
+	@Override
+	public int hashCode() {
+		if( _hash == 0 ) {
+			int numIn = _inputs.size();
+			int[] tmp = new int[numIn + 3];
+			//include inputs, partitioned by matrices and scalars to increase 
+			//reuse in case of interleaved inputs (see CNodeTpl.renameInputs)
+			int pos = 0;
+			for( CNode c : _inputs )
+				if( c.getDataType()==DataType.MATRIX )
+					tmp[pos++] = c.hashCode();
+			for( CNode c : _inputs )
+				if( c.getDataType()!=DataType.MATRIX )
+					tmp[pos++] = c.hashCode();
+			tmp[numIn+0] = (_output!=null)?_output.hashCode():0;
+			tmp[numIn+1] = (_dataType!=null)?_dataType.hashCode():0;
+			tmp[numIn+2] = Boolean.hashCode(_literal);		
+			_hash = Arrays.hashCode(tmp);
+		}
+		return _hash;
+	}
+	
+	@Override 
+	public boolean equals(Object that) {
+		if( !(that instanceof CNode) )
+			return false;
+		//compare inputs position-wise against the inputs of the other node
+		CNode cthat = (CNode) that;
+		boolean ret = _inputs.size() == cthat._inputs.size();
+		for( int i=0; i<_inputs.size() && ret; i++ )
+			ret &= _inputs.get(i).equals(cthat._inputs.get(i));
+		return ret //null-safe output comparison (both null, same instance, or equal)
+			&& (_output == cthat._output || (_output != null && _output.equals(cthat._output)))
+			&& _dataType == cthat._dataType
+			&& _literal == cthat._literal;
+	}
+}

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/d7fd5879/src/main/java/org/apache/sysml/hops/codegen/cplan/CNodeBinary.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/hops/codegen/cplan/CNodeBinary.java b/src/main/java/org/apache/sysml/hops/codegen/cplan/CNodeBinary.java
new file mode 100644
index 0000000..1bfaab4
--- /dev/null
+++ b/src/main/java/org/apache/sysml/hops/codegen/cplan/CNodeBinary.java
@@ -0,0 +1,260 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ * 
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.sysml.hops.codegen.cplan;
+
+import java.util.Arrays;
+
+import org.apache.sysml.parser.Expression.DataType;
+
+
+public class CNodeBinary extends CNode
+{ //cplan node of a binary operation; code is generated from per-type string templates
+	public enum BinType {
+		DOT_PRODUCT,
+		VECT_MULT_ADD, VECT_DIV_ADD,
+		VECT_MULT_SCALAR, VECT_DIV_SCALAR, 
+		MULT, DIV, PLUS, MINUS, MODULUS, INTDIV, 
+		LESS, LESSEQUAL, GREATER, GREATEREQUAL, EQUAL,NOTEQUAL,
+		MIN, MAX, AND, OR, LOG, POW,
+		MINUS1_MULT;
+		
+		public static boolean contains(String value) {
+			for( BinType bt : values()  )
+				if( bt.toString().equals(value) )
+					return true;
+			return false;
+		}
+		
+		public boolean isCommutative() {
+			return ( this == EQUAL || this == NOTEQUAL 
+				|| this == PLUS || this == MULT 
+				|| this == MIN || this == MAX );
+		}
+		//returns the code template of this type; types w/o template (e.g., AND, OR) raise an exception
+		public String getTemplate(boolean sparse) {
+			switch (this) {
+				case DOT_PRODUCT:   
+					return sparse ? "    double %TMP% = LibSpoofPrimitives.dotProduct(%IN1v%, %IN2%, %IN1i%, %POS1%, %POS2%, %LEN%);\n" :
+									"    double %TMP% = LibSpoofPrimitives.dotProduct(%IN1%, %IN2%, %POS1%, %POS2%, %LEN%);\n";
+			
+				case VECT_MULT_ADD: 
+					return sparse ? "    LibSpoofPrimitives.vectMultiplyAdd(%IN1%, %IN2v%, %OUT%, %IN2i%, %POS2%, %POSOUT%, %LEN%);\n" : 
+									"    LibSpoofPrimitives.vectMultiplyAdd(%IN1%, %IN2%, %OUT%, %POS2%, %POSOUT%, %LEN%);\n";
+				
+				case VECT_DIV_ADD: 
+					return sparse ? "    LibSpoofPrimitives.vectDivAdd(%IN1v%, %IN2%, %OUT%, %IN1i%, %POS1%, %POSOUT%, %LEN%);\n" : 
+									"    LibSpoofPrimitives.vectDivAdd(%IN1%, %IN2%, %OUT%, %POS1%, %POSOUT%, %LEN%);\n";
+				
+				case VECT_DIV_SCALAR: 
+					return sparse ? "    LibSpoofPrimitives.vectDivWrite(%IN1v%, %IN1i%, %IN2%,  %OUT%, %POS1%, %POSOUT%, %LEN%);\n" : 
+									"    LibSpoofPrimitives.vectDivWrite(%IN1%, %IN2%, %OUT%, %POS1%, %POSOUT%, %LEN%);\n";
+				
+				case VECT_MULT_SCALAR: 
+					return "    LibSpoofPrimitives.vectMultiplyWrite(%IN2%, %IN1%, %POS1%, %OUT%, 0, %LEN%);\n";
+							
+				
+				/*Can be replaced by function objects*/
+				case MULT:
+					return "    double %TMP% = %IN1% * %IN2%;\n" ;
+				
+				case DIV:
+					return "    double %TMP% = %IN1% / %IN2%;\n" ;
+				case PLUS:
+					return "    double %TMP% = %IN1% + %IN2%;\n" ;
+				case MINUS:
+					return "    double %TMP% = %IN1% - %IN2%;\n" ;
+				case MODULUS: //NOTE(review): Java remainder semantics - confirm against the runtime modulus
+					return "    double %TMP% = %IN1% % %IN2%;\n" ;
+				case INTDIV: //truncate the quotient (a cast w/o parentheses binds to the first operand only)
+					return "    double %TMP% = (double)((long)(%IN1% / %IN2%));\n" ;
+				case LESS:
+					return "    double %TMP% = (%IN1% < %IN2%) ? 1 : 0;\n" ;
+				case LESSEQUAL:
+					return "    double %TMP% = (%IN1% <= %IN2%) ? 1 : 0;\n" ;
+				case GREATER:
+					return "    double %TMP% = (%IN1% > %IN2%) ? 1 : 0;\n" ;
+				case GREATEREQUAL: 
+					return "    double %TMP% = (%IN1% >= %IN2%) ? 1 : 0;\n" ;
+				case EQUAL:
+					return "    double %TMP% = (%IN1% == %IN2%) ? 1 : 0;\n" ;
+				case NOTEQUAL: 
+					return "    double %TMP% = (%IN1% != %IN2%) ? 1 : 0;\n" ;
+				
+				case MIN:
+					return "    double %TMP% = Math.min(%IN1%, %IN2%);\n" ;
+				case MAX:
+					return "    double %TMP% = Math.max(%IN1%, %IN2%);\n" ;
+				case LOG:
+					return "    double %TMP% = Math.log(%IN1%)/Math.log(%IN2%);\n" ;
+				case POW:
+					return "    double %TMP% = Math.pow(%IN1%, %IN2%);\n" ;
+				case MINUS1_MULT:
+					return "    double %TMP% = 1 - %IN1% * %IN2%;\n" ;
+					
+				default: 
+					throw new RuntimeException("Invalid binary type: "+this.toString());
+			}
+		}
+	}
+	
+	private final BinType _type;
+	
+	public CNodeBinary( CNode in1, CNode in2, BinType type ) {
+		//canonicalize commutative matrix-scalar operations
+		//to increase reuse potential
+		if( type.isCommutative() && in1 instanceof CNodeData 
+			&& in1.getDataType()==DataType.SCALAR ) {
+			CNode tmp = in1;
+			in1 = in2; 
+			in2 = tmp;
+		}
+		
+		_inputs.add(in1);
+		_inputs.add(in2);
+		_type = type;
+		setOutputDims();
+	}
+
+	public BinType getType() {
+		return _type;
+	}
+	
+	@Override
+	public String codegen(boolean sparse) {
+		if( _generated )
+			return "";
+			
+		StringBuilder sb = new StringBuilder();
+		
+		//generate children
+		sb.append(_inputs.get(0).codegen(sparse));
+		sb.append(_inputs.get(1).codegen(sparse));
+		
+		//generate binary operation
+		String var = createVarname();
+		String tmp = _type.getTemplate(sparse);
+		tmp = tmp.replaceAll("%TMP%", var);
+		for( int j=1; j<=2; j++ ) {
+			String varj = _inputs.get(j-1).getVarname();
+			if( sparse && !tmp.contains("%IN"+j+"%") ) {
+				tmp = tmp.replaceAll("%IN"+j+"v%", varj+"vals");
+				tmp = tmp.replaceAll("%IN"+j+"i%", varj+"ix");
+			}
+			else
+				tmp = tmp.replaceAll("%IN"+j+"%", varj );
+			
+			if(varj.startsWith("_b")  ) //i.e. b.get(index)
+				tmp = tmp.replaceAll("%POS"+j+"%", "_bi");
+			else
+				tmp = tmp.replaceAll("%POS"+j+"%", varj+"i");
+		}
+		sb.append(tmp);
+		
+		//mark as generated
+		_generated = true;
+		
+		return sb.toString();
+	}
+	
+	@Override
+	public String toString() {
+		switch(_type) {
+			case DOT_PRODUCT: return "b(dot)";
+			case VECT_MULT_ADD: return "b(vma)";
+			case VECT_DIV_ADD: return "b(vda)";
+			case MULT: return "b(*)";
+			case DIV: return "b(/)";
+			case VECT_DIV_SCALAR:  return "b(vector/)";
+			case VECT_MULT_SCALAR:  return "b(vector*)";
+			default:
+				return super.toString();	
+		}
+	}
+	
+	public void setOutputDims()
+	{
+		switch(_type) {
+			//VECT
+			case VECT_MULT_ADD: 
+			case VECT_DIV_ADD:
+				_rows = _inputs.get(1)._rows;
+				_cols = _inputs.get(1)._cols;
+				_dataType= DataType.MATRIX;
+				break;
+				
+			case VECT_DIV_SCALAR: 	
+			case VECT_MULT_SCALAR:
+				_rows = _inputs.get(0)._rows;
+				_cols = _inputs.get(0)._cols;
+				_dataType= DataType.MATRIX;
+				break;
+				
+		
+			case DOT_PRODUCT: 
+			
+			//SCALAR Arithmetic
+			case MULT: 
+			case DIV: 
+			case PLUS: 
+			case MINUS: 
+			case MINUS1_MULT:	
+			case MODULUS: 
+			case INTDIV: 	
+			//SCALAR Comparison
+			case LESS: 
+			case LESSEQUAL: 
+			case GREATER: 
+			case GREATEREQUAL: 
+			case EQUAL: 
+			case NOTEQUAL: 
+			//SCALAR LOGIC
+			case MIN: 
+			case MAX: 
+			case AND: 
+			case OR: 			
+			case LOG: 
+			case POW: 
+				_rows = 0;
+				_cols = 0;
+				_dataType= DataType.SCALAR;
+				break;	
+		}
+	}
+	
+	@Override
+	public int hashCode() {
+		if( _hash == 0 ) {
+			int h1 = super.hashCode();
+			int h2 = _type.hashCode();
+			_hash = Arrays.hashCode(new int[]{h1,h2});
+		}
+		return _hash;
+	}
+	
+	@Override 
+	public boolean equals(Object o) {
+		if( !(o instanceof CNodeBinary) )
+			return false;
+		
+		CNodeBinary that = (CNodeBinary) o;
+		return super.equals(that)
+			&& _type == that._type;
+	}
+}

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/d7fd5879/src/main/java/org/apache/sysml/hops/codegen/cplan/CNodeCell.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/hops/codegen/cplan/CNodeCell.java b/src/main/java/org/apache/sysml/hops/codegen/cplan/CNodeCell.java
new file mode 100644
index 0000000..a9408ca
--- /dev/null
+++ b/src/main/java/org/apache/sysml/hops/codegen/cplan/CNodeCell.java
@@ -0,0 +1,144 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ * 
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.sysml.hops.codegen.cplan;
+
+import java.util.ArrayList;
+import java.util.Arrays;
+
+import org.apache.sysml.hops.codegen.SpoofFusedOp.SpoofOutputDimsType;
+import org.apache.sysml.runtime.codegen.SpoofCellwise.CellType;
+
+public class CNodeCell extends CNodeTpl 
+{	//cplan template that generates the source of a SpoofCellwise subclass
+	private static final String TEMPLATE = 
+			  "package codegen;\n"
+			+ "import java.util.Arrays;\n"
+			+ "import java.io.Serializable;\n"
+			+ "import java.util.ArrayList;\n"
+			+ "import org.apache.sysml.runtime.codegen.LibSpoofPrimitives;\n"
+			+ "import org.apache.sysml.runtime.codegen.SpoofCellwise;\n"
+			+ "import org.apache.sysml.runtime.codegen.SpoofCellwise.CellType;\n"
+			+ "import org.apache.commons.math3.util.FastMath;\n"
+			+ "\n"
+			+ "public final class %TMP% extends SpoofCellwise {\n" 
+			+ "  public %TMP%() {\n"
+			+ "    _type = CellType.%TYPE%;\n"
+			+ "  }\n"
+			+ "  protected double genexecDense( double _a, double[][] _b, double[] _scalars, int _n, int _m, int _rowIndex, int _colIndex) { \n"
+			+ "%BODY_dense%"
+			+ "    return %OUT%;\n"
+			+ "  } \n"
+			+ "}";
+	//aggregation type of the generated operator and multi-consumer flag
+	private CellType _type = null;
+	private boolean _multipleConsumers = false;
+	
+	public CNodeCell(ArrayList<CNode> inputs, CNode output ) {
+		super(inputs,output);
+	}
+	
+	public void setMultipleConsumers(boolean flag) {
+		_multipleConsumers = flag;
+	}
+	
+	public boolean hasMultipleConsumers() {
+		return _multipleConsumers;
+	}
+	
+	public void setCellType(CellType type) {
+		_type = type;
+		_hash = 0;
+	}
+	
+	public CellType getCellType() {
+		return _type;
+	}
+	
+	@Override
+	public String codegen(boolean sparse) {
+		String tmp = TEMPLATE;
+		
+		//rename inputs (first input becomes "_a", remaining inputs renamed from position 1)
+		rReplaceDataNode(_output, _inputs.get(0), "_a");
+		renameInputs(_inputs, 1);
+		
+		//generate dense body only (the sparse argument is not used here)
+		String tmpDense = _output.codegen(false);
+		_output.resetGenerated();
+		//the class name is created after the body, see %OUT% below
+		tmp = tmp.replaceAll("%TMP%", createVarname());
+		tmp = tmp.replaceAll("%BODY_dense%", tmpDense);
+		
+		//return last TMP
+		tmp = tmp.replaceAll("%OUT%", getCurrentVarName());
+		
+		//replace aggregate information
+		tmp = tmp.replaceAll("%TYPE%", getCellType().toString());
+		
+		return tmp;
+	}
+
+	@Override
+	public void setOutputDims() {
+		//intentionally empty
+		
+	}
+
+	@Override
+	public CNodeTpl clone() {
+		CNodeCell tmp = new CNodeCell(_inputs, _output); //passes the same inputs list and output
+		tmp.setDataType(getDataType());
+		tmp.setCellType(getCellType());
+		tmp.setMultipleConsumers(hasMultipleConsumers());
+		return tmp;
+	}
+	
+	@Override
+	public SpoofOutputDimsType getOutputDimType() {
+		switch( _type ) {
+			case NO_AGG: return SpoofOutputDimsType.INPUT_DIMS;
+			case ROW_AGG: return SpoofOutputDimsType.ROW_DIMS;
+			case FULL_AGG: return SpoofOutputDimsType.SCALAR;
+			default:
+				throw new RuntimeException("Unsupported cell type: "+_type.toString());
+		}
+	}
+
+	@Override
+	public int hashCode() {
+		if( _hash == 0 ) {
+			int h1 = super.hashCode();
+			int h2 = _type.hashCode();
+			//note: _multipleConsumers irrelevant for plan comparison
+			_hash = Arrays.hashCode(new int[]{h1,h2});
+		}
+		return _hash;
+	}
+	
+	@Override 
+	public boolean equals(Object o) {
+		if(!(o instanceof CNodeCell))
+			return false;
+		
+		CNodeCell that = (CNodeCell)o;
+		return super.equals(that)
+			&& _type == that._type;
+	}
+}

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/d7fd5879/src/main/java/org/apache/sysml/hops/codegen/cplan/CNodeData.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/hops/codegen/cplan/CNodeData.java b/src/main/java/org/apache/sysml/hops/codegen/cplan/CNodeData.java
new file mode 100644
index 0000000..d5457e8
--- /dev/null
+++ b/src/main/java/org/apache/sysml/hops/codegen/cplan/CNodeData.java
@@ -0,0 +1,94 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ * 
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.sysml.hops.codegen.cplan;
+
+import java.util.Arrays;
+
+import org.apache.sysml.hops.Hop;
+import org.apache.sysml.parser.Expression.DataType;
+
+/**
+ * Leaf node of a code generation plan that refers to an input by its
+ * variable name and by the ID of the hop it originates from. The node
+ * itself does not emit any code.
+ */
+public class CNodeData extends CNode 
+{
+	protected final String _name;
+	protected final long _hopID;
+	
+	/**
+	 * Creates a data node that takes name, ID, dimensions, and data type
+	 * from the given hop.
+	 * 
+	 * @param hop source hop
+	 */
+	public CNodeData(Hop hop) {
+		this(hop, hop.getDim1(), hop.getDim2(), hop.getDataType());
+	}
+	
+	/**
+	 * Creates a data node that takes name and ID from the given hop but
+	 * uses the explicitly passed dimensions and data type.
+	 * 
+	 * @param hop source hop
+	 * @param rows number of rows
+	 * @param cols number of columns
+	 * @param dt data type
+	 */
+	public CNodeData(Hop hop, long rows, long cols, DataType dt) {
+		//note: earlier rewrites may have produced hops that share a name,
+		//so the hopID is kept as well to tell inputs apart
+		this._name = hop.getName();
+		this._hopID = hop.getHopID();
+		this._rows = rows;
+		this._cols = cols;
+		this._dataType = dt;
+	}
+	
+	/**
+	 * Creates a renamed copy: hop ID, dimensions, and data type come from
+	 * the given node, the name is replaced.
+	 * 
+	 * @param node node to copy from
+	 * @param newName name of the new node
+	 */
+	public CNodeData(CNodeData node, String newName) {
+		this._name = newName;
+		this._hopID = node.getHopID();
+		this._rows = node.getNumRows();
+		this._cols = node.getNumCols();
+		this._dataType = node.getDataType();
+	}
+	
+	@Override
+	public String getVarname() {
+		return _name;
+	}
+	
+	public long getHopID() {
+		return _hopID;
+	}
+	
+	/**
+	 * Data nodes contribute no code; always returns the empty string.
+	 */
+	@Override
+	public String codegen(boolean sparse) {
+		return "";
+	}
+
+	/**
+	 * No-op: dimensions are assigned by the constructors.
+	 */
+	@Override
+	public void setOutputDims() {
+		
+	}
+	
+	@Override
+	public String toString() {
+		StringBuilder sb = new StringBuilder();
+		sb.append("CdataNode[name=");
+		sb.append(_name);
+		sb.append(", id=");
+		sb.append(_hopID);
+		sb.append("]");
+		return sb.toString();
+	}
+	
+	/**
+	 * Combines the hash of the super class with the hash of the name; the
+	 * name only contributes for literals. The value is stored in _hash and
+	 * reused as long as _hash is non-zero.
+	 */
+	@Override
+	public int hashCode() {
+		if( _hash != 0 )
+			return _hash;
+		
+		final int superHash = super.hashCode();
+		final int nameHash = isLiteral() ? _name.hashCode() : 0;
+		_hash = Arrays.hashCode(new int[]{superHash,nameHash});
+		return _hash;
+	}
+	
+	/**
+	 * Two data nodes are equal if the super class considers them equal;
+	 * for literals the names have to match in addition.
+	 */
+	@Override 
+	public boolean equals(Object o) {
+		if( !(o instanceof CNodeData) || !super.equals(o) )
+			return false;
+		return !isLiteral() || _name.equals(((CNodeData)o)._name);
+	}
+}

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/d7fd5879/src/main/java/org/apache/sysml/hops/codegen/cplan/CNodeOuterProduct.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/hops/codegen/cplan/CNodeOuterProduct.java b/src/main/java/org/apache/sysml/hops/codegen/cplan/CNodeOuterProduct.java
new file mode 100644
index 0000000..8c2e38c
--- /dev/null
+++ b/src/main/java/org/apache/sysml/hops/codegen/cplan/CNodeOuterProduct.java
@@ -0,0 +1,165 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ * 
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.sysml.hops.codegen.cplan;
+
+import java.util.ArrayList;
+import java.util.Arrays;
+
+import org.apache.sysml.hops.codegen.SpoofFusedOp.SpoofOutputDimsType;
+import org.apache.sysml.runtime.codegen.SpoofOuterProduct.OutProdType;
+
+
+/**
+ * Code generation template for fused outer-product operators. The
+ * generated class extends SpoofOuterProduct and, depending on the outer
+ * product type, fills either the dense or the cell-wise method body.
+ */
+public class CNodeOuterProduct extends CNodeTpl
+{	
+	private static final String TEMPLATE = 
+			  "package codegen;\n"
+			+ "import java.util.Arrays;\n"
+			+ "import java.util.ArrayList;\n"
+			+ "import org.apache.sysml.runtime.codegen.LibSpoofPrimitives;\n"
+			+ "import org.apache.sysml.runtime.codegen.SpoofOuterProduct;\n"
+			+ "import org.apache.sysml.runtime.codegen.SpoofOuterProduct.OutProdType;\n"
+			+ "import org.apache.commons.math3.util.FastMath;\n"
+			+ "\n"
+			+ "public final class %TMP% extends SpoofOuterProduct { \n"
+			+ "  public %TMP%() {\n"
+			+ "    _outerProductType = OutProdType.%TYPE%;\n"
+			+ "  }\n"
+			+ "  protected void genexecDense( double _a, double[] _a1, int _a1i, double[] _a2, int _a2i, double[][] _b, double[] _scalars, double[] _c, int _ci, int _n, int _m, int _k, int _rowIndex, int _colIndex) { \n"
+			+ "%BODY_dense%"
+			+ "  } \n"
+			+ "  protected double genexecCellwise( double _a, double[] _a1, int _a1i, double[] _a2, int _a2i, double[][] _b, double[] _scalars, int _n, int _m, int _k, int _rowIndex, int _colIndex) { \n"
+			+ "%BODY_cellwise%"
+			+ "    return %OUT_cellwise%;\n"
+			+ "  } \n"
+			
+			+ "}";
+	
+	private OutProdType _type = null;
+	private boolean _transposeOutput = false;
+	
+	public CNodeOuterProduct(ArrayList<CNode> inputs, CNode output ) {
+		super(inputs,output);
+	}
+	
+	/**
+	 * Generates the source of the fused operator class. The outer product
+	 * type has to be set beforehand; otherwise the final type substitution
+	 * fails with a NullPointerException.
+	 * 
+	 * @param sparse ignored by this template
+	 * @return source code of the generated class
+	 */
+	@Override
+	public String codegen(boolean sparse) {
+		// note: ignore sparse flag, generate both
+		String tmp = TEMPLATE;
+		
+		//rename inputs
+		rReplaceDataNode(_output, _inputs.get(0), "_a");
+		rReplaceDataNode(_output, _inputs.get(1), "_a1"); // u
+		rReplaceDataNode(_output, _inputs.get(2), "_a2"); // v
+		renameInputs(_inputs, 3);
+
+		//generate the body once; it is placed into the dense or the
+		//cell-wise method depending on the outer product type
+		String tmpDense = _output.codegen(false);
+		_output.resetGenerated();
+
+		tmp = tmp.replaceAll("%TMP%", createVarname());
+
+		if(_type == OutProdType.LEFT_OUTER_PRODUCT || _type == OutProdType.RIGHT_OUTER_PRODUCT) {
+			//left/right: body goes into genexecDense and writes to _c,
+			//the cell-wise method is reduced to returning 0
+			tmp = tmp.replaceAll("%BODY_dense%", tmpDense);
+			tmp = tmp.replaceAll("%OUT%", "_c");
+			tmp = tmp.replaceAll("%BODY_cellwise%", "");
+			tmp = tmp.replaceAll("%OUT_cellwise%", "0");
+		}
+		else {
+			//all other types: body goes into genexecCellwise, which
+			//returns the variable of the last generated statement
+			tmp = tmp.replaceAll("%BODY_dense%", "");
+			tmp = tmp.replaceAll("%BODY_cellwise%", tmpDense);
+			tmp = tmp.replaceAll("%OUT_cellwise%", getCurrentVarName());
+		}
+		//replace size information
+		tmp = tmp.replaceAll("%LEN%", "_k");
+		
+		tmp = tmp.replaceAll("%POSOUT%", "_ci");
+		
+		tmp = tmp.replaceAll("%TYPE%", _type.toString());
+
+		return tmp;
+	}
+
+	/**
+	 * Sets the outer product type and invalidates the cached hash.
+	 * 
+	 * @param type outer product type
+	 */
+	public void setOutProdType(OutProdType type) {
+		_type = type;
+		_hash = 0;
+	}
+	
+	public OutProdType getOutProdType() {
+		return _type;
+	}
+
+	/**
+	 * No-op: this implementation has an empty body.
+	 */
+	@Override
+	public void setOutputDims() {
+		
+	}
+
+	/**
+	 * Sets the transpose-output flag and invalidates the cached hash.
+	 * 
+	 * @param transposeOutput new flag value
+	 */
+	public void setTransposeOutput(boolean transposeOutput) {
+		_transposeOutput = transposeOutput;
+		_hash = 0;
+	}
+
+	
+	public boolean isTransposeOutput() {
+		return _transposeOutput;
+	}
+
+	/**
+	 * Maps the outer product type to the output dimension type.
+	 * 
+	 * @return output dimension type for the current outer product type
+	 * @throws RuntimeException for an unsupported outer product type
+	 */
+	@Override
+	public SpoofOutputDimsType getOutputDimType() {
+		switch( _type ) {
+			case LEFT_OUTER_PRODUCT:
+				return SpoofOutputDimsType.COLUMN_RANK_DIMS;
+			case RIGHT_OUTER_PRODUCT:
+				return SpoofOutputDimsType.ROW_RANK_DIMS;
+			case CELLWISE_OUTER_PRODUCT:
+				return SpoofOutputDimsType.INPUT_DIMS;
+			case AGG_OUTER_PRODUCT:
+				return SpoofOutputDimsType.SCALAR;
+			default:
+				throw new RuntimeException("Unsupported outer product type: "+_type.toString());
+		}
+	}
+
+	/**
+	 * Builds a new outer-product template over the same inputs and output
+	 * node and carries over the data type, the outer product type, and the
+	 * transpose-output flag.
+	 * 
+	 * @return the newly constructed template
+	 */
+	@Override
+	public CNodeTpl clone() {
+		CNodeOuterProduct tmp = new CNodeOuterProduct(_inputs, _output);
+		//copy all state that drives code generation, hashing, and equality;
+		//without it the copy keeps a null type and fails with a
+		//NullPointerException in hashCode, getOutputDimType, and codegen
+		tmp._dataType = _dataType;
+		tmp._type = _type;
+		tmp._transposeOutput = _transposeOutput;
+		return tmp;
+	}
+	
+	/**
+	 * Combines the hash of the super class with the hashes of the outer
+	 * product type and the transpose-output flag. The value is stored in
+	 * _hash and reused as long as _hash is non-zero.
+	 */
+	@Override
+	public int hashCode() {
+		if( _hash == 0 ) {
+			int h1 = super.hashCode();
+			int h2 = _type.hashCode();
+			int h3 = Boolean.hashCode(_transposeOutput);
+			_hash = Arrays.hashCode(new int[]{h1,h2,h3});
+		}
+		return _hash;
+	}
+	
+	/**
+	 * Two outer-product templates are equal if the super class considers
+	 * them equal and both agree on type and transpose-output flag.
+	 */
+	@Override 
+	public boolean equals(Object o) {
+		if(!(o instanceof CNodeOuterProduct))
+			return false;
+		
+		CNodeOuterProduct that = (CNodeOuterProduct)o;
+		return super.equals(that)
+			&& _type == that._type
+			&& _transposeOutput == that._transposeOutput;
+	}
+}

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/d7fd5879/src/main/java/org/apache/sysml/hops/codegen/cplan/CNodeRowAggVector.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/hops/codegen/cplan/CNodeRowAggVector.java b/src/main/java/org/apache/sysml/hops/codegen/cplan/CNodeRowAggVector.java
new file mode 100644
index 0000000..147615f
--- /dev/null
+++ b/src/main/java/org/apache/sysml/hops/codegen/cplan/CNodeRowAggVector.java
@@ -0,0 +1,111 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ * 
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.sysml.hops.codegen.cplan;
+
+import java.util.ArrayList;
+
+import org.apache.sysml.hops.codegen.SpoofFusedOp.SpoofOutputDimsType;
+
+/**
+ * Code generation template for fused row-aggregate operators. The
+ * generated class extends SpoofRowAggregate and contains one method body
+ * for dense and one for sparse input rows.
+ */
+public class CNodeRowAggVector extends CNodeTpl
+{
+	private static final String TEMPLATE = 
+			  "package codegen;\n"
+			+ "import java.util.Arrays;\n"
+			+ "import java.util.ArrayList;\n"
+			+ "import org.apache.sysml.runtime.codegen.LibSpoofPrimitives;\n"
+			+ "import org.apache.sysml.runtime.codegen.SpoofRowAggregate;\n"
+			+ "\n"
+			+ "public final class %TMP% extends SpoofRowAggregate { \n"
+			+ "  public %TMP%() {\n"
+			+ "    _colVector = %FLAG%;\n"
+			+ "  }\n"
+			+ "  protected void genexecRowDense( double[] _a, int _ai, double[][] _b, double[] _scalars, double[] _c, int _len, int _rowIndex ) { \n"
+			+ "%BODY_dense%"
+			+ "  } \n"
+			+ "  protected void genexecRowSparse( double[] _avals, int[] _aix, int _ai, double[][] _b, double[] _scalars, double[] _c, int _len, int _rowIndex ) { \n"
+			+ "%BODY_sparse%"
+			+ "  } \n"			
+			+ "}\n";
+
+	public CNodeRowAggVector(ArrayList<CNode> inputs, CNode output ) {
+		super(inputs, output);
+	}
+	
+	/**
+	 * Generates the source of the fused operator class with both the
+	 * dense and the sparse row body.
+	 * 
+	 * @param sparse ignored by this template
+	 * @return source code of the generated class
+	 */
+	@Override
+	public String codegen(boolean sparse) {
+		//bind the first input (input matrix) to _a and rename the
+		//remaining inputs starting at position 1
+		rReplaceDataNode(_output, _inputs.get(0), "_a");
+		renameInputs(_inputs, 1);
+		
+		//emit the dense body, reset the output node, emit the sparse body
+		final String denseBody = _output.codegen(false);
+		_output.resetGenerated();
+		final String sparseBody = _output.codegen(true);
+		
+		//apply the substitutions in a fixed order: class name, bodies,
+		//output name and position, length, column-vector flag, and
+		//finally the start position placeholder _bi
+		return TEMPLATE
+			.replaceAll("%TMP%", createVarname())
+			.replaceAll("%BODY_dense%", denseBody)
+			.replaceAll("%BODY_sparse%", sparseBody)
+			.replaceAll("%OUT%", "_c")
+			.replaceAll("%POSOUT%", "0")
+			.replaceAll("%LEN%", "_len")
+			.replaceAll("%FLAG%", String.valueOf(_output._cols==1))
+			.replaceAll("_bi", "0");
+	}
+
+	/**
+	 * No-op: this implementation has an empty body.
+	 */
+	@Override
+	public void setOutputDims() {
+		
+	}
+
+	/**
+	 * Returns COLUMN_DIMS_ROWS if the output node has exactly one column
+	 * (column vector) and COLUMN_DIMS_COLS otherwise (row vector).
+	 */
+	@Override
+	public SpoofOutputDimsType getOutputDimType() {
+		if( _output._cols==1 )
+			return SpoofOutputDimsType.COLUMN_DIMS_ROWS;
+		return SpoofOutputDimsType.COLUMN_DIMS_COLS;
+	}
+	
+	/**
+	 * Builds a new row-aggregate template over the same inputs and
+	 * output node.
+	 */
+	@Override
+	public CNodeTpl clone() {
+		return new CNodeRowAggVector(_inputs, _output);
+	}
+	
+	@Override
+	public int hashCode() {
+		return super.hashCode();
+	}
+	
+	/**
+	 * Equal to any other row-aggregate template that the super class
+	 * considers equal.
+	 */
+	@Override 
+	public boolean equals(Object o) {
+		if( !(o instanceof CNodeRowAggVector) )
+			return false;
+		return super.equals(o);
+	}
+}

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/d7fd5879/src/main/java/org/apache/sysml/hops/codegen/cplan/CNodeTpl.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/hops/codegen/cplan/CNodeTpl.java b/src/main/java/org/apache/sysml/hops/codegen/cplan/CNodeTpl.java
new file mode 100644
index 0000000..719770b
--- /dev/null
+++ b/src/main/java/org/apache/sysml/hops/codegen/cplan/CNodeTpl.java
@@ -0,0 +1,201 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ * 
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.sysml.hops.codegen.cplan;
+
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.HashSet;
+
+import org.apache.sysml.hops.codegen.SpoofFusedOp.SpoofOutputDimsType;
+import org.apache.sysml.hops.codegen.cplan.CNodeUnary.UnaryType;
+import org.apache.sysml.parser.Expression.DataType;
+
+/**
+ * Base class of the code generation templates. A template holds a list of
+ * input nodes and the output node of the expression to generate, and
+ * offers helpers to rename, replace, and wrap data nodes in the tree
+ * below a given node.
+ */
+public abstract class CNodeTpl extends CNode implements Cloneable
+{
+	/**
+	 * Creates a template over the given inputs and output node. Inputs
+	 * are registered via {@link #addInput(CNode)}, hence duplicates and
+	 * literals are dropped.
+	 * 
+	 * @param inputs input nodes, must not be empty
+	 * @param output output node of the expression
+	 * @throws RuntimeException if the input list is empty
+	 */
+	public CNodeTpl(ArrayList<CNode> inputs, CNode output ) {
+		if(inputs.size() < 1)
+			throw new RuntimeException("Cannot pass empty inputs to the CNodeTpl");
+
+		for(CNode input : inputs)
+			addInput(input);
+		_output = output;	
+	}
+	
+	/**
+	 * Appends the given node to the inputs unless it is a literal or an
+	 * already registered data node.
+	 * 
+	 * @param in input node to add
+	 */
+	public void addInput(CNode in) {
+		//check for duplicate entries or literals
+		if( containsInput(in) || in.isLiteral() )
+			return;
+		
+		_inputs.add(in);
+	}
+	
+	/**
+	 * Retains only those inputs that are data nodes whose hop ID is
+	 * contained in the given filter; all other inputs are removed.
+	 * 
+	 * @param filter hop IDs of the inputs to keep
+	 */
+	public void cleanupInputs(HashSet<Long> filter) {
+		ArrayList<CNode> tmp = new ArrayList<CNode>();
+		for( CNode in : _inputs )
+			if( in instanceof CNodeData && filter.contains(((CNodeData) in).getHopID()) )
+				tmp.add(in);
+		_inputs = tmp;
+	}
+	
+	/**
+	 * Shorthand for {@code codegen(false)}.
+	 * 
+	 * @return generated source code
+	 */
+	public String codegen() {
+		return codegen(false);
+	}
+	
+	public abstract CNodeTpl clone();
+	
+	public abstract SpoofOutputDimsType getOutputDimType();
+	
+	/**
+	 * Renames all non-literal data nodes among the inputs, starting at
+	 * the given index, and applies the new names to the tree below the
+	 * output node. Inputs with data type SCALAR, or with zero rows and
+	 * zero columns, become {@code _scalars[k]}; all other inputs become
+	 * {@code _b[k]}. Both kinds are numbered independently from 0.
+	 * 
+	 * @param inputs input nodes
+	 * @param startIndex position of the first input to rename
+	 */
+	protected void renameInputs(ArrayList<CNode> inputs, int startIndex) {
+		//create map of hopID to data nodes with new names, used for CSE
+		HashMap<Long, CNode> nodes = new HashMap<Long, CNode>();
+		//note: mPos counts the _scalars entries, sPos counts the _b entries
+		for(int i=startIndex, sPos=0, mPos=0; i < inputs.size(); i++) {
+			CNode cnode = inputs.get(i);
+			if( !(cnode instanceof CNodeData) || ((CNodeData)cnode).isLiteral())
+				continue;
+			CNodeData cdata = (CNodeData)cnode;
+			if( cdata.getDataType() == DataType.SCALAR  || ( cdata.getNumCols() == 0 && cdata.getNumRows() == 0) ) 
+				nodes.put(cdata.getHopID(), new CNodeData(cdata, "_scalars["+ mPos++ +"]"));
+			else
+				nodes.put(cdata.getHopID(), new CNodeData(cdata, "_b["+ sPos++ +"]"));
+		}
+		
+		//single pass to replace all names
+		rReplaceDataNode(_output, nodes, new HashMap<Long, CNode>());
+	}
+	
+	/**
+	 * Replaces, in the tree below root, every data node that has the hop
+	 * ID of the given input by a copy with the new name. Does nothing if
+	 * the input is not a data node.
+	 * 
+	 * @param root root of the tree to process
+	 * @param input input node whose occurrences are renamed
+	 * @param newName new variable name
+	 */
+	protected void rReplaceDataNode( CNode root, CNode input, String newName ) {
+		if( !(input instanceof CNodeData) )
+			return;
+			
+		//create temporary name mapping
+		HashMap<Long, CNode> names = new HashMap<Long, CNode>();
+		CNodeData tmp = (CNodeData)input;
+		names.put(tmp.getHopID(), new CNodeData(tmp, newName));
+		
+		rReplaceDataNode(root, names, new HashMap<Long,CNode>());
+	}
+	
+	/**
+	 * Recursively searches for data nodes and replaces them if found.
+	 * Lookup nodes on top of data nodes are shared: the first lookup seen
+	 * for a hop ID is memoized and reused for all later occurrences.
+	 * 
+	 * @param node current node in recursive descend
+	 * @param dnodes prepared data nodes, identified by own hop id
+	 * @param lnodes memoized lookup nodes, identified by data node hop id
+	 */
+	protected void rReplaceDataNode( CNode node, HashMap<Long, CNode> dnodes, HashMap<Long, CNode> lnodes ) 
+	{	
+		for( int i=0; i<node._inputs.size(); i++ ) {
+			//recursively process children
+			rReplaceDataNode(node._inputs.get(i), dnodes, lnodes);
+			
+			//replace leaf data node
+			if( node._inputs.get(i) instanceof CNodeData ) {
+				CNodeData tmp = (CNodeData)node._inputs.get(i);
+				if( dnodes.containsKey(tmp.getHopID()) )
+					node._inputs.set(i, dnodes.get(tmp.getHopID()));
+			}
+			
+			//replace lookup on top of leaf data node
+			//NOTE(review): the cast below assumes that the input of every
+			//lookup is a data node - confirm against the callers
+			if( node._inputs.get(i) instanceof CNodeUnary
+				&& ((CNodeUnary)node._inputs.get(i)).getType()==UnaryType.LOOKUP) {
+				CNodeData tmp = (CNodeData)node._inputs.get(i)._inputs.get(0);
+				if( !lnodes.containsKey(tmp.getHopID()) )
+					lnodes.put(tmp.getHopID(), node._inputs.get(i));
+				else
+					node._inputs.set(i, lnodes.get(tmp.getHopID()));	
+			}
+		}
+	}
+	
+	/**
+	 * Recursively replaces every data node with the given hop ID by the
+	 * new node and afterwards drops lookup nodes whose input has data
+	 * type SCALAR, putting the input in place of the lookup.
+	 * 
+	 * @param node current node in recursive descend
+	 * @param hopID hop ID of the data nodes to replace
+	 * @param newNode replacement node
+	 */
+	public void rReplaceDataNode( CNode node, long hopID, CNode newNode ) 
+	{
+		for( int i=0; i<node._inputs.size(); i++ ) {
+			//replace leaf node
+			if( node._inputs.get(i) instanceof CNodeData ) {
+				CNodeData tmp = (CNodeData)node._inputs.get(i);
+				if( tmp.getHopID() == hopID )
+					node._inputs.set(i, newNode);
+			}
+			//recursively process children
+			rReplaceDataNode(node._inputs.get(i), hopID, newNode);
+			
+			//remove unnecessary lookups
+			if( node._inputs.get(i) instanceof CNodeUnary 
+				&& ((CNodeUnary)node._inputs.get(i)).getType()==UnaryType.LOOKUP 
+				&& node._inputs.get(i)._inputs.get(0).getDataType()==DataType.SCALAR)
+				node._inputs.set(i, node._inputs.get(i)._inputs.get(0));
+		}
+	}
+	
+	/**
+	 * Recursively wraps every data node with the given hop ID into a
+	 * lookup node. A single lookup node per hop ID is created and shared
+	 * via the memo.
+	 * 
+	 * @param node current node in recursive descend
+	 * @param hopID hop ID of the data nodes to wrap
+	 * @param memo created lookup nodes, identified by hop ID
+	 */
+	public void rInsertLookupNode( CNode node, long hopID, HashMap<Long, CNode> memo ) 
+	{
+		for( int i=0; i<node._inputs.size(); i++ ) {
+			//recursively process children
+			rInsertLookupNode(node._inputs.get(i), hopID, memo);
+			
+			//replace leaf node
+			if( node._inputs.get(i) instanceof CNodeData ) {
+				CNodeData tmp = (CNodeData)node._inputs.get(i);
+				if( tmp.getHopID() == hopID ) {
+					//use memo structure to retain DAG structure
+					CNode lookup = memo.get(hopID);
+					if( lookup == null ) {
+						lookup = new CNodeUnary(tmp, UnaryType.LOOKUP);
+						memo.put(hopID, lookup);
+					}
+					node._inputs.set(i, lookup);
+				}
+			}
+		}
+	}
+	
+	/**
+	 * Checks for duplicates: an input counts as duplicate if it is a data
+	 * node and an already registered data node has both the same name and
+	 * the same hop ID. Inputs that are not data nodes are never reported
+	 * as duplicates.
+	 * 
+	 * @param input new input node
+	 * @return true if duplicate, false otherwise
+	 */
+	private boolean containsInput(CNode input) {
+		if( !(input instanceof CNodeData) )
+			return false;
+		
+		CNodeData input2 = (CNodeData)input;
+		for( CNode cnode : _inputs ) {
+			if( !(cnode instanceof CNodeData) )
+				continue;
+			CNodeData cnode2 = (CNodeData)cnode;
+			if( cnode2._name.equals(input2._name) && cnode2._hopID==input2._hopID )
+				return true;
+		}
+		
+		return false;
+	}
+	
+	@Override
+	public int hashCode() {
+		return super.hashCode();
+	}
+	
+	/**
+	 * Equal to any other template that the super class considers equal.
+	 */
+	@Override 
+	public boolean equals(Object o) {
+		return (o instanceof CNodeTpl
+			&& super.equals(o));
+	}
+}

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/d7fd5879/src/main/java/org/apache/sysml/hops/codegen/cplan/CNodeUnary.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/hops/codegen/cplan/CNodeUnary.java b/src/main/java/org/apache/sysml/hops/codegen/cplan/CNodeUnary.java
new file mode 100644
index 0000000..f08769e
--- /dev/null
+++ b/src/main/java/org/apache/sysml/hops/codegen/cplan/CNodeUnary.java
@@ -0,0 +1,206 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ * 
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.sysml.hops.codegen.cplan;
+
+import java.util.Arrays;
+
+import org.apache.sysml.parser.Expression.DataType;
+
+
+/**
+ * Node of a code generation plan that applies a unary operation to its
+ * single input and emits one statement that assigns the result to a new
+ * temporary variable.
+ */
+public class CNodeUnary extends CNode
+{
+	/**
+	 * Supported unary operations together with their code templates.
+	 */
+	public enum UnaryType {
+		ROW_SUMS, LOOKUP, LOOKUP0,
+		EXP, POW2, MULT2, SQRT, LOG,
+		ABS, ROUND, CEIL,FLOOR, SIGN, 
+		SIN, COS, TAN, ASIN, ACOS, ATAN,
+		IQM, STOP,
+		DOTPRODUCT_ROW_SUMS; //row sums via dot product for debugging purposes
+		
+		/**
+		 * Checks if the given string is the name of a unary type.
+		 * 
+		 * @param value name to look up
+		 * @return true if a unary type with this name exists
+		 */
+		public static boolean contains(String value) {
+			for( UnaryType ut : values()  )
+				if( ut.toString().equals(value) )
+					return true;
+			return false;
+		}
+		
+		/**
+		 * Returns the statement template of this operation. Only
+		 * ROW_SUMS has a distinct sparse variant; all other templates
+		 * are independent of the flag.
+		 * 
+		 * @param sparse true to obtain the sparse variant
+		 * @return statement template with placeholders
+		 * @throws RuntimeException for types without a template
+		 *         (IQM, STOP, DOTPRODUCT_ROW_SUMS)
+		 */
+		public String getTemplate(boolean sparse) {
+			switch (this) {
+				case ROW_SUMS:
+					return sparse ? "    double %TMP% = LibSpoofPrimitives.vectSum( %IN1v%, %IN1i%, %POS1%, %LEN%);\n": 
+									"    double %TMP% = LibSpoofPrimitives.vectSum( %IN1%, %POS1%,  %LEN%);\n"; 
+				case EXP:
+					return "    double %TMP% = FastMath.exp(%IN1%);\n";
+				case LOOKUP:
+					return "    double %TMP% = %IN1%[_rowIndex];\n" ;
+				case LOOKUP0:
+					return "    double %TMP% = %IN1%[0];\n" ;
+				case POW2:
+					return "    double %TMP% = %IN1% * %IN1%;\n" ;
+				case MULT2:
+					return "    double %TMP% = %IN1% + %IN1%;\n" ;
+				case ABS:
+					return "    double %TMP% = Math.abs(%IN1%);\n";
+				case SIN:
+					return "    double %TMP% = Math.sin(%IN1%);\n";
+				case COS: 
+					return "    double %TMP% = Math.cos(%IN1%);\n";
+				case TAN:
+					return "    double %TMP% = Math.tan(%IN1%);\n";
+				case ASIN:
+					return "    double %TMP% = Math.asin(%IN1%);\n";
+				case ACOS:
+					return "    double %TMP% = Math.acos(%IN1%);\n";
+				case ATAN:
+					return "    double %TMP% = Math.atan(%IN1%);\n";
+				case SIGN:
+					return "    double %TMP% = Math.signum(%IN1%);\n";
+				case SQRT:
+					return "    double %TMP% = Math.sqrt(%IN1%);\n";
+				case LOG:
+					return "    double %TMP% = FastMath.log(%IN1%);\n";
+				case ROUND: 
+					return "    double %TMP% = Math.round(%IN1%);\n";
+				case CEIL:
+					return "    double %TMP% = Math.ceil(%IN1%);\n";
+				case FLOOR:
+					return "    double %TMP% = Math.floor(%IN1%);\n";
+				default: 
+					throw new RuntimeException("Invalid unary type: "+this.toString());
+			}
+		}
+	}
+	
+	private final UnaryType _type;
+	
+	/**
+	 * Creates a unary node over the given input and derives the output
+	 * dimensions from the operation type.
+	 * 
+	 * @param in1 input node
+	 * @param type unary operation
+	 * @throws RuntimeException if the type has no defined output dimensions
+	 */
+	public CNodeUnary( CNode in1, UnaryType type ) {
+		_inputs.add(in1);
+		_type = type;
+		setOutputDims();
+	}
+	
+	public UnaryType getType() {
+		return _type;
+	}
+
+	/**
+	 * Generates the code of the input followed by the statement of this
+	 * operation. A node that is already marked as generated contributes
+	 * the empty string.
+	 * 
+	 * @param sparse true to generate the sparse variant
+	 * @return generated statements
+	 */
+	@Override
+	public String codegen(boolean sparse) {
+		if( _generated )
+			return "";
+			
+		StringBuilder sb = new StringBuilder();
+		
+		//generate children
+		sb.append(_inputs.get(0).codegen(sparse));
+		
+		//generate unary operation
+		String var = createVarname();
+		String tmp = _type.getTemplate(sparse);
+		tmp = tmp.replaceAll("%TMP%", var);
+		
+		//sparse templates without a plain input placeholder take the
+		//values and index arrays of the input instead
+		String varj = _inputs.get(0).getVarname();
+		if( sparse && !tmp.contains("%IN1%") ) {
+			tmp = tmp.replaceAll("%IN1v%", varj+"vals");
+			tmp = tmp.replaceAll("%IN1i%", varj+"ix");
+		}
+		else
+			tmp = tmp.replaceAll("%IN1%", varj );
+		
+		//inputs named _b... share the position placeholder _bi; for all
+		//other inputs the position is the input name suffixed with i
+		if(varj.startsWith("_b")  ) //i.e. b.get(index)
+		{
+			tmp = tmp.replaceAll("%POS1%", "_bi");
+			tmp = tmp.replaceAll("%POS2%", "_bi");
+		}
+		tmp = tmp.replaceAll("%POS1%", varj+"i");
+		tmp = tmp.replaceAll("%POS2%", varj+"i");
+		
+		sb.append(tmp);
+		
+		//mark as generated
+		_generated = true;
+		
+		return sb.toString();
+	}
+	
+	@Override
+	public String toString() {
+		switch(_type) {
+			case ROW_SUMS: return "u(R+)";
+			default:
+				return super.toString();
+		}
+	}
+
+	/**
+	 * Marks the output as scalar with zero rows and columns for all
+	 * listed types. DOTPRODUCT_ROW_SUMS is not listed and therefore ends
+	 * up in the default branch.
+	 * 
+	 * @throws RuntimeException for types that are not listed
+	 */
+	@Override
+	public void setOutputDims() {
+		switch(_type)
+		{
+			case ROW_SUMS:
+			case EXP:
+			case LOOKUP:
+			case LOOKUP0:	
+			case POW2:
+			case MULT2:	
+			case ABS:
+			case SIN:
+			case COS: 
+			case TAN:
+			case ASIN:
+			case ACOS:
+			case ATAN:
+			case SIGN:
+			case SQRT:
+			case LOG:
+			case ROUND: 
+			case IQM:
+			case STOP:
+			case CEIL:
+			case FLOOR:
+				_rows = 0;
+				_cols = 0;
+				_dataType= DataType.SCALAR;
+				break;
+			default:
+				throw new RuntimeException("Operation " + _type.toString() + " has no "
+					+ "output dimensions, dimensions needs to be specified for the CNode " );
+		}
+		
+	}
+	
+	/**
+	 * Combines the hash of the super class with the hash of the
+	 * operation type. The value is stored in _hash and reused as long
+	 * as _hash is non-zero.
+	 */
+	@Override
+	public int hashCode() {
+		if( _hash == 0 ) {
+			int h1 = super.hashCode();
+			int h2 = _type.hashCode();
+			_hash = Arrays.hashCode(new int[]{h1,h2});
+		}
+		return _hash;
+	}
+	
+	/**
+	 * Two unary nodes are equal if the super class considers them equal
+	 * and both have the same operation type.
+	 */
+	@Override 
+	public boolean equals(Object o) {
+		if( !(o instanceof CNodeUnary) )
+			return false;
+		
+		CNodeUnary that = (CNodeUnary) o;
+		return super.equals(that)
+			&& _type == that._type;
+	}
+}

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/d7fd5879/src/main/java/org/apache/sysml/hops/codegen/template/BaseTpl.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/hops/codegen/template/BaseTpl.java b/src/main/java/org/apache/sysml/hops/codegen/template/BaseTpl.java
new file mode 100644
index 0000000..4b7ecbf
--- /dev/null
+++ b/src/main/java/org/apache/sysml/hops/codegen/template/BaseTpl.java
@@ -0,0 +1,63 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ * 
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.sysml.hops.codegen.template;
+
+import java.util.ArrayList;
+import java.util.LinkedHashMap;
+
+import org.apache.sysml.api.DMLException;
+import org.apache.sysml.hops.Hop;
+import org.apache.sysml.hops.codegen.cplan.CNodeData;
+import org.apache.sysml.hops.codegen.cplan.CNodeTpl;
+import org.apache.sysml.runtime.matrix.data.Pair;
+
+/**
+ * Common base class of the fusion templates listed in
+ * {@link TemplateType}. It holds the template type and state shared with
+ * the subclasses; opening a template, finding its boundaries, and
+ * constructing the cplans are left to the subclasses.
+ */
+public abstract class BaseTpl 
+{	
+	/** Kinds of templates. */
+	public enum TemplateType {
+		CellTpl,
+		OuterProductTpl,
+		RowAggTpl
+	}
+	
+	//assigned once by the constructor
+	private final TemplateType _type;
+	
+	//state shared with the concrete templates, which define its use
+	protected ArrayList<Hop> _matrixInputs = new ArrayList<>();
+	protected Hop _initialHop;
+	protected Hop _endHop;
+	protected ArrayList<CNodeData> _initialCnodes = new ArrayList<>();
+	protected ArrayList<Hop> _adddedMatrices = new ArrayList<>();
+	protected boolean _endHopReached = false;
+
+	protected LinkedHashMap<Long, Pair<Hop[],CNodeTpl>> _cpplans = new LinkedHashMap<>();
+	
+	/**
+	 * @param type kind of the concrete template
+	 */
+	protected BaseTpl(TemplateType type) {
+		this._type = type;
+	}
+	
+	/**
+	 * @return kind of this template as passed to the constructor
+	 */
+	public TemplateType getType() {
+		return this._type;
+	}
+	
+	public abstract boolean openTpl(Hop hop);
+
+	public abstract boolean findTplBoundaries(Hop initialHop, CplanRegister cplanRegister);
+	
+	public abstract LinkedHashMap<Long, Pair<Hop[],CNodeTpl>> constructTplCplan(boolean compileLiterals) throws DMLException;
+}

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/d7fd5879/src/main/java/org/apache/sysml/hops/codegen/template/CellTpl.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/hops/codegen/template/CellTpl.java b/src/main/java/org/apache/sysml/hops/codegen/template/CellTpl.java
new file mode 100644
index 0000000..0c841e8
--- /dev/null
+++ b/src/main/java/org/apache/sysml/hops/codegen/template/CellTpl.java
@@ -0,0 +1,289 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ * 
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.sysml.hops.codegen.template;
+
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.LinkedHashMap;
+import java.util.Map.Entry;
+
+import org.apache.sysml.api.DMLException;
+import org.apache.sysml.hops.AggUnaryOp;
+import org.apache.sysml.hops.BinaryOp;
+import org.apache.sysml.hops.Hop;
+import org.apache.sysml.hops.UnaryOp;
+import org.apache.sysml.hops.Hop.AggOp;
+import org.apache.sysml.hops.Hop.Direction;
+import org.apache.sysml.hops.Hop.OpOp2;
+import org.apache.sysml.hops.codegen.cplan.CNode;
+import org.apache.sysml.hops.codegen.cplan.CNodeBinary;
+import org.apache.sysml.hops.codegen.cplan.CNodeBinary.BinType;
+import org.apache.sysml.hops.codegen.cplan.CNodeCell;
+import org.apache.sysml.hops.codegen.cplan.CNodeData;
+import org.apache.sysml.hops.codegen.cplan.CNodeTpl;
+import org.apache.sysml.hops.codegen.cplan.CNodeUnary;
+import org.apache.sysml.hops.codegen.cplan.CNodeUnary.UnaryType;
+import org.apache.sysml.parser.Expression.DataType;
+import org.apache.sysml.runtime.codegen.SpoofCellwise.CellType;
+import org.apache.sysml.runtime.matrix.data.Pair;
+
+public class CellTpl extends BaseTpl 
+{
+	//template that fuses chains of cell-wise operations, optionally topped by a sum/rowsums
+	public CellTpl() {
+		super(TemplateType.CellTpl);
+	}
+	
+	@Override
+	public boolean openTpl(Hop hop) {
+		return isValidOperation(hop); //open only at operations accepted by isValidOperation
+	}
+
+	@Override
+	public boolean findTplBoundaries(Hop initialHop, CplanRegister cplanRegister) {
+		_initialHop = initialHop;
+		rFindCellwisePattern(initialHop, new HashMap<Long, Hop>()); //sets _endHop as a side effect
+		
+		//if cplanRegister has the initial hop then no need to reconstruct
+		if(cplanRegister.containsHop(TemplateType.CellTpl, _initialHop.getHopID()))
+			return false;
+			
+		//re-assign initialHop to fuse the sum/rowsums (before checking for chains)
+		for (Hop h : _initialHop.getParent())
+			if( h instanceof AggUnaryOp && ((AggUnaryOp) h).getOp() == AggOp.SUM 
+				&& ((AggUnaryOp) h).getDirection()!= Direction.Col ) {
+				_initialHop = h;  //no break: if several parents qualify, the last one wins
+			}
+		
+		//endHop found && endHop differs from the initialHop (i.e., a chain of operators rather than a single one)
+		if(_endHop != null && _endHop != _initialHop)
+		{
+			//select the main matrix input of the template among the inputs of the endHop
+			// if final hop is unary add its child to the input 
+			if(_endHop instanceof UnaryOp)
+				_matrixInputs.add(_endHop.getInput().get(0));
+			//if one input is scalar then add the other as major input
+			else if(_endHop.getInput().get(0).getDataType() == DataType.SCALAR)
+				_matrixInputs.add(_endHop.getInput().get(1));
+			else if(_endHop.getInput().get(1).getDataType() == DataType.SCALAR)
+				_matrixInputs.add(_endHop.getInput().get(0));
+			//if one is matrix and the other is vector add the matrix
+			else if(TemplateUtils.isMatrix(_endHop.getInput().get(0)) && TemplateUtils.isVector(_endHop.getInput().get(1)) )
+				_matrixInputs.add(_endHop.getInput().get(0));
+			else if(TemplateUtils.isMatrix(_endHop.getInput().get(1)) && TemplateUtils.isVector(_endHop.getInput().get(0)) )
+				_matrixInputs.add(_endHop.getInput().get(1));
+			//remaining cases, e.g., both are vectors (add the first one)
+			else
+				_matrixInputs.add(_endHop.getInput().get(0));
+				
+			return true;
+		}
+		
+		return false;
+	}
+	
+	private void rFindCellwisePattern(Hop h, HashMap<Long,Hop> memo)
+	{
+		if(memo.containsKey(h.getHopID())) //already visited
+			return;
+		
+		//stop recursion if stopping operator
+		if(h.getDataType() == DataType.SCALAR || !isValidOperation(h))
+			return;
+		
+		//h is part of the pattern: tentatively make it the end hop, then process children recursively
+		_endHop = h;
+		for( Hop in : h.getInput() )
+		{
+			//propagate the _endHop from bottom to top
+			if(memo.containsKey(in.getHopID()))
+				_endHop=memo.get(in.getHopID());
+			else
+				rFindCellwisePattern(in,memo);
+		}
+		//memoize the end hop in effect after visiting the inputs of h
+		memo.put(h.getHopID(), _endHop);	
+	}
+
+	@Override
+	public LinkedHashMap<Long, Pair<Hop[],CNodeTpl>> constructTplCplan(boolean compileLiterals)
+			throws DMLException {
+		//represent the main input as a 1x1 scalar cnode to match the generated code dimensions
+		_initialCnodes.add(new CNodeData(_matrixInputs.get(0), 1, 1, DataType.SCALAR));
+		
+		rConstructCellCplan(_initialHop,_initialHop, new HashSet<Long>(), compileLiterals);
+		return _cpplans;
+	}
+	
+	public CNode fuseCellWise(Hop initialHop,Hop matrixInput, boolean compileLiterals)
+			throws DMLException {
+		//set the root hop and the main matrix input explicitly (findTplBoundaries is not called on this path)
+		_initialHop = initialHop;
+		_matrixInputs.add(matrixInput);
+		
+		constructTplCplan(compileLiterals);
+		Entry<Long, Pair<Hop[],CNodeTpl>> toplevel = TemplateUtils.getTopLevelCpplan(_cpplans);
+		if(toplevel != null)
+			return toplevel.getValue().getValue().getOutput(); //output cnode of the top-level cplan
+		else 
+			return null; //no top-level cplan available
+	}
+	
+	private void rConstructCellCplan(Hop root, Hop hop, HashSet<Long> memo, boolean compileLiterals) 
+		throws DMLException
+	{
+		if( memo.contains(hop.getHopID()) )
+			return;
+		
+		
+		//process children recursively (post-order: inputs are handled before the hop itself)
+		for( Hop c : hop.getInput() )
+			rConstructCellCplan(root, c, memo, compileLiterals);
+		
+		 // first hop to enter here should be _endHop
+		if(TemplateUtils.inputsAreGenerated(hop,_matrixInputs,_cpplans))  
+		// if direct children are DataGenOps, literals, or already in the cpplans then we are ready to generate code
+		{
+			CNodeCell cellTmpl = null;
+			
+			//Fetch operands
+			CNode out = null;
+			ArrayList<CNode> addedCNodes = new ArrayList<CNode>();
+			ArrayList<Hop> addedHops = new ArrayList<Hop>();
+			ArrayList<CNode> cnodeData = TemplateUtils.fetchOperands(hop, _cpplans, addedCNodes, addedHops, _initialCnodes, compileLiterals);
+			
+			//a non-root hop that is scalar or whose operands are independent from X is passed in as a plain input
+			boolean independentOperands = hop != root && (hop.getDataType() == DataType.SCALAR || TemplateUtils.isOperandsIndependent(cnodeData, addedHops, new String[] {_matrixInputs.get(0).getName()}));
+			if(!independentOperands)
+			{
+				if(hop instanceof UnaryOp)
+				{
+					CNode cdata1 = cnodeData.get(0);
+					
+					//Primitive Operation has the same name as Hop Type OpOp1
+					String primitiveOpName = ((UnaryOp)hop).getOp().toString();
+					out = new CNodeUnary(cdata1, UnaryType.valueOf(primitiveOpName)); //valueOf throws IllegalArgumentException if no constant has that name
+				}
+				else if(hop instanceof BinaryOp)
+				{
+					BinaryOp bop = (BinaryOp) hop;
+					CNode cdata1 = cnodeData.get(0);
+					CNode cdata2 = cnodeData.get(1);
+					
+					//Primitive Operation has the same name as Hop Type OpOp2
+					String primitiveOpName = bop.getOp().toString();
+					
+					//cdata1 is vector
+					if( TemplateUtils.isColVector(cdata1) )
+						cdata1 = new CNodeUnary(cdata1, UnaryType.LOOKUP);
+					
+					//cdata2 is vector
+					if( TemplateUtils.isColVector(cdata2) )
+						cdata2 = new CNodeUnary(cdata2, UnaryType.LOOKUP);
+					
+					//power/multiply by the literal 2 as second operand map to dedicated unary primitives
+					if( bop.getOp()==OpOp2.POW && cdata2.isLiteral() && cdata2.getVarname().equals("2") )
+						out = new CNodeUnary(cdata1, UnaryType.POW2);
+					else if( bop.getOp()==OpOp2.MULT && cdata2.isLiteral() && cdata2.getVarname().equals("2") )
+						out = new CNodeUnary(cdata1, UnaryType.MULT2);
+					else //default binary	
+						out = new CNodeBinary(cdata1, cdata2, BinType.valueOf(primitiveOpName));
+				}
+				else if (hop instanceof AggUnaryOp && ((AggUnaryOp)hop).getOp() == AggOp.SUM
+					&& (((AggUnaryOp) hop).getDirection() == Direction.RowCol 
+					|| ((AggUnaryOp) hop).getDirection() == Direction.Row) && root == hop)
+				{
+					out = cnodeData.get(0); //the root sum adds no cnode of its own; its input cnode becomes the output
+				}
+			}
+			// wire output to the template
+			if(out != null || independentOperands)
+			{
+				if(_cpplans.isEmpty())
+				{
+					//first initialization has to have the first variable as input
+					ArrayList<CNode> initialInputs = new ArrayList<CNode>();					
+					
+					if(independentOperands) // pass the hop itself as an input instead of its children
+					{
+						CNode c =  new CNodeData(hop);
+						initialInputs.addAll(_initialCnodes);
+						initialInputs.add(c);
+						cellTmpl = new CNodeCell(initialInputs, c); 
+						cellTmpl.setDataType(hop.getDataType());
+						cellTmpl.setCellType(CellType.NO_AGG);
+						cellTmpl.setMultipleConsumers(hop.getParent().size()>1);
+						
+						_cpplans.put(hop.getHopID(), new Pair<Hop[],CNodeTpl>(new Hop[] {_matrixInputs.get(0),hop} ,cellTmpl));
+					}
+					else
+					{
+						initialInputs.addAll(_initialCnodes);
+						initialInputs.addAll(cnodeData);
+						cellTmpl =  new CNodeCell(initialInputs, out); 
+						cellTmpl.setDataType(hop.getDataType());
+						cellTmpl.setCellType(CellType.NO_AGG);
+						cellTmpl.setMultipleConsumers(hop.getParent().size()>1);
+						
+						//input hops of the cplan: slot 0 holds the main input, followed by the hops of the fetched operands
+						Hop[] hopArray = new Hop[addedHops.size()+1];
+						hopArray[0] = _matrixInputs.get(0);
+						
+						//copy the operand hops behind the main input
+						System.arraycopy( addedHops.toArray(), 0, hopArray, 1, addedHops.size());
+						
+						_cpplans.put(hop.getHopID(), new Pair<Hop[],CNodeTpl>(hopArray,cellTmpl));
+					}
+				}
+				else
+				{
+					if(independentOperands)
+					{
+						CNode c =  new CNodeData(hop);
+						//clear Operands
+						addedCNodes.clear();
+						addedHops.clear();
+						
+						//added the current hop as the input
+						addedCNodes.add(c);
+						addedHops.add(hop);
+						out = c;
+					}
+					//wire the output to existing or new template	
+					TemplateUtils.setOutputToExistingTemplate(hop, out, _cpplans, addedCNodes, addedHops);
+				}
+			}
+			memo.add(hop.getHopID()); //marked as done only if its inputs were ready
+		}
+	}
+
+	private boolean isValidOperation(Hop hop) { //accepts matrix-valued, supported unary, matrix-scalar, and matrix-vector operations
+		boolean isBinaryMatrixScalar = hop instanceof BinaryOp && hop.getDataType()==DataType.MATRIX &&
+			(hop.getInput().get(0).getDataType()==DataType.SCALAR || hop.getInput().get(1).getDataType()==DataType.SCALAR);	
+		boolean isBinaryMatrixVector = hop instanceof BinaryOp && hop.dimsKnown() &&
+			((hop.getInput().get(0).getDataType() == DataType.MATRIX
+				&& TemplateUtils.isVectorOrScalar(hop.getInput().get(1)) && !TemplateUtils.isBinaryMatrixRowVector(hop)) 
+			||(TemplateUtils.isVectorOrScalar( hop.getInput().get(0))  
+				&& hop.getInput().get(1).getDataType() == DataType.MATRIX && !TemplateUtils.isBinaryMatrixRowVector(hop)) );
+		return hop.getDataType() == DataType.MATRIX && TemplateUtils.isOperationSupported(hop)
+			&& (hop instanceof UnaryOp || isBinaryMatrixScalar || isBinaryMatrixVector);	
+	}
+}

[5/9] incubator-systemml git commit: [SYSTEMML-1286] Code generator compiler integration, incl tests

Posted by mb...@apache.org.

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/bbefe96b/src/test/scripts/functions/codegen/wdivmmTransposeOut.dml
----------------------------------------------------------------------
diff --git a/src/test/scripts/functions/codegen/wdivmmTransposeOut.dml b/src/test/scripts/functions/codegen/wdivmmTransposeOut.dml
new file mode 100644
index 0000000..519d4bb
--- /dev/null
+++ b/src/test/scripts/functions/codegen/wdivmmTransposeOut.dml
@@ -0,0 +1,30 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+# 
+#   http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+X= matrix( "1 0 1 2 2 0 0 3 3 0 0 4", rows=4, cols=3)
+U= matrix( "1 2 3 4 5 6 7 8", rows=4, cols=2)
+V= matrix( "9 12 10 13 11 14", rows=2, cols=3)
+# NOTE(review): the empty always-true branch below has no visible effect here; presumably a deliberate cut between inputs and expression - confirm
+if(1==1){}
+# S = t(U) %*% (X / (U %*% V + eps)); with the shapes above this is a 2x3 matrix, written to the location passed as $1
+eps = 0.1
+S= (t(U) %*% (X/((U%*%V)+eps)))
+write(S,$1)

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/bbefe96b/src/test/scripts/functions/codegen/wdivmmbasic.R
----------------------------------------------------------------------
diff --git a/src/test/scripts/functions/codegen/wdivmmbasic.R b/src/test/scripts/functions/codegen/wdivmmbasic.R
new file mode 100644
index 0000000..7343307
--- /dev/null
+++ b/src/test/scripts/functions/codegen/wdivmmbasic.R
@@ -0,0 +1,32 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+# 
+#   http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+args<-commandArgs(TRUE) # trailing command-line arguments only
+options(digits=22)
+library("Matrix") # provides writeMM and the CsparseMatrix class used below
+# R counterpart of wdivmmbasic.dml (same literals and expression); presumably the reference result - confirm
+X= matrix( c(1,0,1,2,2,0,0,3,3,0,0,4), nrow=4, ncol=3, byrow = TRUE)
+U= matrix( c(0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8), nrow=4, ncol=2, byrow = TRUE)
+V= matrix( c(0.9,0.12,0.10,0.13,0.11,0.14), nrow=3, ncol=2, byrow = TRUE)
+eps = 0.1
+S= X/((U%*%t(V))+eps); # element-wise division of X (4x3) by U %*% t(V) + eps (4x3)
+writeMM(as(S, "CsparseMatrix"), paste(args[2], "S", sep="")); # output file name is the second argument with "S" appended
+ 
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/bbefe96b/src/test/scripts/functions/codegen/wdivmmbasic.dml
----------------------------------------------------------------------
diff --git a/src/test/scripts/functions/codegen/wdivmmbasic.dml b/src/test/scripts/functions/codegen/wdivmmbasic.dml
new file mode 100644
index 0000000..0db04cf
--- /dev/null
+++ b/src/test/scripts/functions/codegen/wdivmmbasic.dml
@@ -0,0 +1,30 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+# 
+#   http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+X= matrix( "1 0 1 2 2 0 0 3 3 0 0 4", rows=4, cols=3)
+U= matrix( "0.1 0.2 0.3 0.4 0.5 0.6 0.7 0.8", rows=4, cols=2)
+V= matrix( "0.9 0.12 0.10 0.13 0.11 0.14", rows=3, cols=2)
+# NOTE(review): the empty always-true branch below has no visible effect here; presumably a deliberate cut between inputs and expression - confirm
+if(1==1){}
+# S = X / (U %*% t(V) + eps), element-wise on a 4x3 matrix, written to the location passed as $1
+eps = 0.1
+S= X/((U%*%t(V))+eps);
+write(S,$1)

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/bbefe96b/src/test/scripts/functions/codegen/wsigmoid.R
----------------------------------------------------------------------
diff --git a/src/test/scripts/functions/codegen/wsigmoid.R b/src/test/scripts/functions/codegen/wsigmoid.R
new file mode 100644
index 0000000..68ef61b
--- /dev/null
+++ b/src/test/scripts/functions/codegen/wsigmoid.R
@@ -0,0 +1,33 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+# 
+#   http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+
+args<-commandArgs(TRUE) # trailing command-line arguments only
+options(digits=22)
+library("Matrix") # provides writeMM and the CsparseMatrix class used below
+# R counterpart of wsigmoid.dml (same literals and expression); presumably the reference result - confirm
+X= matrix( c(1,0,1,2,2,0,0,3,3,0,0,4), nrow=4, ncol=3, byrow = TRUE)
+U= matrix( c(0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8), nrow=4, ncol=2, byrow = TRUE)
+V= matrix( c(0.9,0.12,0.10,0.13,0.11,0.14), nrow=3, ncol=2, byrow = TRUE)
+eps = 0.1 # NOTE(review): assigned but not used by the expression below
+S= X*(1/(1+exp(-(U%*%t(V))))); # X times the element-wise logistic function of U %*% t(V)
+writeMM(as(S, "CsparseMatrix"), paste(args[2], "S", sep="")); # output file name is the second argument with "S" appended
+ 
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/bbefe96b/src/test/scripts/functions/codegen/wsigmoid.dml
----------------------------------------------------------------------
diff --git a/src/test/scripts/functions/codegen/wsigmoid.dml b/src/test/scripts/functions/codegen/wsigmoid.dml
new file mode 100644
index 0000000..78ed611
--- /dev/null
+++ b/src/test/scripts/functions/codegen/wsigmoid.dml
@@ -0,0 +1,30 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+# 
+#   http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+X= matrix( "1 0 1 2 2 0 0 3 3 0 0 4", rows=4, cols=3)
+U= matrix( "0.1 0.2 0.3 0.4 0.5 0.6 0.7 0.8", rows=4, cols=2)
+V= matrix( "0.9 0.12 0.10 0.13 0.11 0.14", rows=3, cols=2)
+# NOTE(review): the empty always-true branch below has no visible effect here; presumably a deliberate cut between inputs and expression - confirm
+if(1==1){}
+# NOTE(review): eps is assigned but not used by S; S = X * (1 / (1 + exp(-(U %*% t(V))))) is written to the location passed as $1
+eps = 0.1
+S= X*(1/(1+exp(-(U%*%t(V)))));
+write(S,$1)

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/bbefe96b/src/test_suites/java/org/apache/sysml/test/integration/functions/codegen/ZPackageSuite.java
----------------------------------------------------------------------
diff --git a/src/test_suites/java/org/apache/sysml/test/integration/functions/codegen/ZPackageSuite.java b/src/test_suites/java/org/apache/sysml/test/integration/functions/codegen/ZPackageSuite.java
new file mode 100644
index 0000000..644751b
--- /dev/null
+++ b/src/test_suites/java/org/apache/sysml/test/integration/functions/codegen/ZPackageSuite.java
@@ -0,0 +1,45 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ * 
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.sysml.test.integration.functions.codegen;
+
+import org.junit.runner.RunWith;
+import org.junit.runners.Suite;
+
+/** Groups the tests in this package into a single suite so that the Maven build
+ *  won't run two of them at once. */
+@RunWith(Suite.class)
+@Suite.SuiteClasses({
+	AlgorithmGLM.class,
+	AlgorithmKMeans.class,
+	AlgorithmL2SVM.class,
+	AlgorithmLinregCG.class,
+	AlgorithmMLogreg.class,
+	AlgorithmPNMF.class,
+	CellwiseTmplTest.class,
+	DAGCellwiseTmplTest.class,
+	OuterProdTmplTest.class,
+	RowAggTmplTest.class,
+})
+
+
+/** This class is just a holder for the above JUnit annotations. */
+public class ZPackageSuite {
+	//intentionally empty: the suite is defined entirely by the annotations on this class
+}