You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@systemml.apache.org by mb...@apache.org on 2016/01/26 19:37:13 UTC

incubator-systemml git commit: [SYSTEMML-488] Generalized wdivmm operator w/ 4 operands (all backends)

Repository: incubator-systemml
Updated Branches:
  refs/heads/master 52e95f66e -> 401e982bf


[SYSTEMML-488] Generalized wdivmm operator w/ 4 operands (all backends)

This patch extends the existing wdivmm operator (with seven patterns for
three operands) by two patterns for four operands, i.e.,
t(t(U)%*%(W*(U%*%t(V)-X))) and (W*(U%*%t(V)-X))%*%V, where W does not
need to be a non-zero indicator of X. This change also includes the
related tests, rewrites, and documentation changes. On sparse
data (W and X), the performance improvements scale with the sparsity due
to improved asymptotic behavior (in the number of non-zeros rather than
in the dimension sizes), e.g., up to 1000x for a sparsity of 0.001.
Use case: GloVe.

Project: http://git-wip-us.apache.org/repos/asf/incubator-systemml/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-systemml/commit/401e982b
Tree: http://git-wip-us.apache.org/repos/asf/incubator-systemml/tree/401e982b
Diff: http://git-wip-us.apache.org/repos/asf/incubator-systemml/diff/401e982b

Branch: refs/heads/master
Commit: 401e982bf4cbd41154d5ce217c7baebff8afff06
Parents: 52e95f6
Author: Matthias Boehm <mb...@us.ibm.com>
Authored: Mon Jan 25 23:23:44 2016 -0800
Committer: Matthias Boehm <mb...@us.ibm.com>
Committed: Tue Jan 26 10:35:37 2016 -0800

----------------------------------------------------------------------
 docs/devdocs/MatrixMultiplicationOperators.txt  |   5 +-
 .../org/apache/sysml/hops/QuaternaryOp.java     |  51 +++-
 .../RewriteAlgebraicSimplificationDynamic.java  |  80 +++++-
 .../org/apache/sysml/lops/WeightedDivMM.java    |  32 ++-
 .../org/apache/sysml/lops/WeightedDivMMR.java   |  16 +-
 .../cp/QuaternaryCPInstruction.java             |  17 +-
 .../instructions/mr/QuaternaryInstruction.java  |  55 ++--
 .../spark/QuaternarySPInstruction.java          |  61 +++--
 .../runtime/matrix/data/LibMatrixMult.java      | 121 +++++++--
 .../sysml/runtime/matrix/data/MatrixBlock.java  |   5 +-
 .../matrix/operators/QuaternaryOperator.java    |   9 +
 .../quaternary/WeightedDivMatrixMultTest.java   | 264 ++++++++++---------
 .../quaternary/WeightedDivMM4MultMinusLeft.R    |  37 +++
 .../quaternary/WeightedDivMM4MultMinusLeft.dml  |  32 +++
 .../quaternary/WeightedDivMM4MultMinusRight.R   |  38 +++
 .../quaternary/WeightedDivMM4MultMinusRight.dml |  32 +++
 16 files changed, 639 insertions(+), 216 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/401e982b/docs/devdocs/MatrixMultiplicationOperators.txt
----------------------------------------------------------------------
diff --git a/docs/devdocs/MatrixMultiplicationOperators.txt b/docs/devdocs/MatrixMultiplicationOperators.txt
index 962951c..f10672d 100644
--- a/docs/devdocs/MatrixMultiplicationOperators.txt
+++ b/docs/devdocs/MatrixMultiplicationOperators.txt
@@ -114,9 +114,10 @@ C) CORE MATRIX MULT PRIMITIVES LibMatrixMult (incl related script patterns)
 * 7) wdivmm   ((a) t(t(U)%*%(W/(U%*%t(V)))), (b) (W/(U%*%t(V)))%*%V,
                (c) t(t(U)%*%(W*(U%*%t(V)))), (d) (W*(U%*%t(V)))%*%V, 
                (e) W*(U%*%t(V)), (f) t(t(U)%*%((X!=0)*(U%*%t(V)-X))),
-               (g) ((X!=0)*(U%*%t(V)-X)%*%V)
+               (g) (X!=0)*(U%*%t(V)-X)%*%V, (h) t(t(U)%*%(W*(U%*%t(V)-X))),  
+               (i) (W*(U%*%t(V)-X)%*%V
   - sequential / multi-threaded (same block ops, par over rows in X)                 
-  - all dense, sparse-dense factors, sparse/dense-* x 7 patterns
+  - all dense, sparse-dense factors, sparse/dense-* x 9 patterns
 
 * 8) wcemm    (sum(X*log(U%*%t(V))))  
   - sequential / multi-threaded (same block ops, par over rows in X)                 

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/401e982b/src/main/java/org/apache/sysml/hops/QuaternaryOp.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/hops/QuaternaryOp.java b/src/main/java/org/apache/sysml/hops/QuaternaryOp.java
index 1f072e5..6cfd32c 100644
--- a/src/main/java/org/apache/sysml/hops/QuaternaryOp.java
+++ b/src/main/java/org/apache/sysml/hops/QuaternaryOp.java
@@ -132,10 +132,27 @@ public class QuaternaryOp extends Hop implements MultiThreadedHop
 		_minusin = flag2;
 	}
 	
+	/**
+	 * 
+	 * @param l
+	 * @param dt
+	 * @param vt
+	 * @param o
+	 * @param inW
+	 * @param inU
+	 * @param inV
+	 * @param baseType
+	 * @param flag1
+	 * @param flag2
+	 */
 	public QuaternaryOp(String l, DataType dt, ValueType vt, Hop.OpOp4 o,
-			Hop inW, Hop inU, Hop inV, int baseType, boolean flag1, boolean flag2) 
+			Hop inX, Hop inU, Hop inV, Hop inW, int baseType, boolean flag1, boolean flag2) 
 	{
-		this(l, dt, vt, o, inW, inU, inV);
+		this(l, dt, vt, o, inX, inU, inV);
+		if( inW != null ) { //four inputs
+			getInput().add(3, inW);
+			inW.getParent().add(this);
+		}
 		
 		_baseType = baseType;
 		_mult = flag1;
@@ -769,6 +786,7 @@ public class QuaternaryOp extends Hop implements MultiThreadedHop
 				getInput().get(0).constructLops(),
 				getInput().get(1).constructLops(),
 				getInput().get(2).constructLops(),
+				getInput().get(3).constructLops(),
 				getDataType(), getValueType(), wtype, ExecType.CP);
 		
 		//set degree of parallelism
@@ -796,11 +814,13 @@ public class QuaternaryOp extends Hop implements MultiThreadedHop
 		Hop W = getInput().get(0);
 		Hop U = getInput().get(1);
 		Hop V = getInput().get(2);
+		Hop X = getInput().get(3);
 		
 		//MR operator selection, part1
 		double m1Size = OptimizerUtils.estimateSize(U.getDim1(), U.getDim2()); //size U
 		double m2Size = OptimizerUtils.estimateSize(V.getDim1(), V.getDim2()); //size V
-		boolean isMapWsloss = (m1Size+m2Size < OptimizerUtils.getRemoteMemBudgetMap(true)); 
+		boolean isMapWsloss = (!wtype.hasFourInputs() &&
+				m1Size+m2Size < OptimizerUtils.getRemoteMemBudgetMap(true)); 
 		
 		if( !FORCE_REPLICATION && isMapWsloss ) //broadcast
 		{
@@ -823,7 +843,7 @@ public class QuaternaryOp extends Hop implements MultiThreadedHop
 			}
 			
 			//map-side wsloss always with broadcast
-			Lop wdivmm = new WeightedDivMM( W.constructLops(), lU, lV, 
+			Lop wdivmm = new WeightedDivMM( W.constructLops(), lU, lV, X.constructLops(), 
 					DataType.MATRIX, ValueType.DOUBLE, wtype, ExecType.MR);
 			setOutputDimensions(wdivmm);
 			setLineNumbers(wdivmm);
@@ -840,6 +860,12 @@ public class QuaternaryOp extends Hop implements MultiThreadedHop
 			grpW.getOutputParameters().setDimensions(W.getDim1(), W.getDim2(), W.getRowsInBlock(), W.getColsInBlock(), W.getNnz());
 			setLineNumbers(grpW);
 			
+			Lop grpX = X.constructLops();
+			if( wtype.hasFourInputs() )
+				grpX = new Group(grpX, Group.OperationTypes.Sort, DataType.MATRIX, ValueType.DOUBLE);
+			grpX.getOutputParameters().setDimensions(X.getDim1(), X.getDim2(), X.getRowsInBlock(), X.getColsInBlock(), X.getNnz());
+			setLineNumbers(grpX);
+			
 			Lop lU = null;
 			if( cacheU ) {
 				//partitioning of U for read through distributed cache
@@ -896,7 +922,7 @@ public class QuaternaryOp extends Hop implements MultiThreadedHop
 			}
 			
 			//reduce-side wsloss w/ or without broadcast
-			Lop wdivmm = new WeightedDivMMR( grpW, lU, lV, 
+			Lop wdivmm = new WeightedDivMMR( grpW, lU, lV, grpX, 
 					DataType.MATRIX, ValueType.DOUBLE, wtype, cacheU, cacheV, ExecType.MR);
 			setOutputDimensions(wdivmm);
 			setLineNumbers(wdivmm);
@@ -939,18 +965,19 @@ public class QuaternaryOp extends Hop implements MultiThreadedHop
 		Hop W = getInput().get(0);
 		Hop U = getInput().get(1);
 		Hop V = getInput().get(2);
+		Hop X = getInput().get(3);
 		
 		//MR operator selection, part1
 		double m1Size = OptimizerUtils.estimateSize(U.getDim1(), U.getDim2()); //size U
 		double m2Size = OptimizerUtils.estimateSize(V.getDim1(), V.getDim2()); //size V
-		boolean isMapWsloss = (m1Size+m2Size < memBudgetExec
+		boolean isMapWsloss = (!wtype.hasFourInputs() && m1Size+m2Size < memBudgetExec
 				&& 2*m1Size<memBudgetLocal && 2*m2Size<memBudgetLocal); 
 		
 		if( !FORCE_REPLICATION && isMapWsloss ) //broadcast
 		{
 			//map-side wsloss always with broadcast
 			Lop wdivmm = new WeightedDivMM( W.constructLops(), U.constructLops(), V.constructLops(), 
-					DataType.MATRIX, ValueType.DOUBLE, wtype, ExecType.SPARK);
+					X.constructLops(), DataType.MATRIX, ValueType.DOUBLE, wtype, ExecType.SPARK);
 			setOutputDimensions(wdivmm);
 			setLineNumbers(wdivmm);
 			setLops( wdivmm );
@@ -964,7 +991,7 @@ public class QuaternaryOp extends Hop implements MultiThreadedHop
 			
 			//reduce-side wsloss w/ or without broadcast
 			Lop wdivmm = new WeightedDivMMR( 
-					W.constructLops(), U.constructLops(), V.constructLops(), 
+					W.constructLops(), U.constructLops(), V.constructLops(), X.constructLops(),
 					DataType.MATRIX, ValueType.DOUBLE, wtype, cacheU, cacheV, ExecType.SPARK);
 			setOutputDimensions(wdivmm);
 			setLineNumbers(wdivmm);
@@ -1468,12 +1495,16 @@ public class QuaternaryOp extends Hop implements MultiThreadedHop
 			case 0: //BASIC
 				return WDivMMType.MULT_BASIC;
 			case 1: //LEFT
-				if( _minus )
+				if( getInput().get(3).getDataType()==DataType.MATRIX )
+					return WDivMMType.MULT_MINUS_4_LEFT;
+				else if( _minus )
 					return WDivMMType.MULT_MINUS_LEFT;
 				else
 					return _mult ? WDivMMType.MULT_LEFT : WDivMMType.DIV_LEFT;
 			case 2: //RIGHT	
-				if( _minus )
+				if( getInput().get(3).getDataType()==DataType.MATRIX )
+					return WDivMMType.MULT_MINUS_4_RIGHT;
+				else if( _minus )
 					return WDivMMType.MULT_MINUS_RIGHT;
 				else
 					return _mult ? WDivMMType.MULT_RIGHT : WDivMMType.DIV_RIGHT;		

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/401e982b/src/main/java/org/apache/sysml/hops/rewrite/RewriteAlgebraicSimplificationDynamic.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/hops/rewrite/RewriteAlgebraicSimplificationDynamic.java b/src/main/java/org/apache/sysml/hops/rewrite/RewriteAlgebraicSimplificationDynamic.java
index 4dd5f87..918b671 100644
--- a/src/main/java/org/apache/sysml/hops/rewrite/RewriteAlgebraicSimplificationDynamic.java
+++ b/src/main/java/org/apache/sysml/hops/rewrite/RewriteAlgebraicSimplificationDynamic.java
@@ -1858,7 +1858,7 @@ public class RewriteAlgebraicSimplificationDynamic extends HopRewriteRule
 					
 					boolean mult = ((BinaryOp)right).getOp() == OpOp2.MULT;
 					hnew = new QuaternaryOp(hi.getName(), DataType.MATRIX, ValueType.DOUBLE, 
-							  OpOp4.WDIVMM, W, U, V, 1, mult, false);
+							  OpOp4.WDIVMM, W, U, V, new LiteralOp(-1), 1, mult, false);
 					HopRewriteUtils.setOutputBlocksizes(hnew, W.getRowsInBlock(), W.getColsInBlock());
 					
 					//add output transpose for efficient target indexing (redundant t() removed by other rewrites)
@@ -1890,7 +1890,7 @@ public class RewriteAlgebraicSimplificationDynamic extends HopRewriteRule
 					
 					boolean mult = ((BinaryOp)left).getOp() == OpOp2.MULT;
 					hnew = new QuaternaryOp(hi.getName(), DataType.MATRIX, ValueType.DOUBLE, 
-							  OpOp4.WDIVMM, W, U, V, 2, mult, false);
+							  OpOp4.WDIVMM, W, U, V, new LiteralOp(-1), 2, mult, false);
 					HopRewriteUtils.setOutputBlocksizes(hnew, W.getRowsInBlock(), W.getColsInBlock());
 
 					appliedPattern = true;
@@ -1899,7 +1899,8 @@ public class RewriteAlgebraicSimplificationDynamic extends HopRewriteRule
 			}
 			
 			//Pattern 3) t(U) %*% ((X!=0)*(U%*%t(V)-X))
-			if( right instanceof BinaryOp && ((BinaryOp)right).getOp()==LOOKUP_VALID_WDIVMM_BINARY[0] //MULT
+			if( !appliedPattern
+				&& right instanceof BinaryOp && ((BinaryOp)right).getOp()==LOOKUP_VALID_WDIVMM_BINARY[0] //MULT
 				&& right.getInput().get(1) instanceof BinaryOp && ((BinaryOp)right.getInput().get(1)).getOp()==OpOp2.MINUS	
 				&& right.getInput().get(1).getInput().get(0) instanceof AggBinaryOp
                 && right.getInput().get(1).getInput().get(1).getDataType() == DataType.MATRIX
@@ -1919,7 +1920,7 @@ public class RewriteAlgebraicSimplificationDynamic extends HopRewriteRule
 						V = V.getInput().get(0);
 					
 					hnew = new QuaternaryOp(hi.getName(), DataType.MATRIX, ValueType.DOUBLE, 
-							  OpOp4.WDIVMM, X, U, V, 1, true, true);
+							  OpOp4.WDIVMM, X, U, V, new LiteralOp(-1), 1, true, true);
 					HopRewriteUtils.setOutputBlocksizes(hnew, W.getRowsInBlock(), W.getColsInBlock());
 					
 					//add output transpose for efficient target indexing (redundant t() removed by other rewrites)
@@ -1952,16 +1953,79 @@ public class RewriteAlgebraicSimplificationDynamic extends HopRewriteRule
 						V = V.getInput().get(0);
 					
 					hnew = new QuaternaryOp(hi.getName(), DataType.MATRIX, ValueType.DOUBLE, 
-							  OpOp4.WDIVMM, X, U, V, 2, true, true);
+							  OpOp4.WDIVMM, X, U, V, new LiteralOp(-1), 2, true, true);
 					HopRewriteUtils.setOutputBlocksizes(hnew, W.getRowsInBlock(), W.getColsInBlock());
 
 					appliedPattern = true;
 					LOG.debug("Applied simplifyWeightedDivMM4 (line "+hi.getBeginLine()+")");	
 				}
 			}
+			
+			//Pattern 5) t(U) %*% (W*(U%*%t(V)-X))
+			if( !appliedPattern
+				&& right instanceof BinaryOp && ((BinaryOp)right).getOp()==LOOKUP_VALID_WDIVMM_BINARY[0] //MULT
+				&& right.getInput().get(1) instanceof BinaryOp && ((BinaryOp)right.getInput().get(1)).getOp()==OpOp2.MINUS	
+				&& right.getInput().get(1).getInput().get(0) instanceof AggBinaryOp
+                && right.getInput().get(1).getInput().get(1).getDataType() == DataType.MATRIX
+				&& HopRewriteUtils.isSingleBlock(right.getInput().get(1).getInput().get(0).getInput().get(0),true) ) //BLOCKSIZE CONSTRAINT
+			{
+				Hop W = right.getInput().get(0); 
+				Hop U = right.getInput().get(1).getInput().get(0).getInput().get(0);
+				Hop V = right.getInput().get(1).getInput().get(0).getInput().get(1);
+				Hop X = right.getInput().get(1).getInput().get(1);
+				
+				if( HopRewriteUtils.isTransposeOfItself(left, U) )  //t(U)-U constraint
+				{
+					if( !HopRewriteUtils.isTransposeOperation(V) )
+						V = HopRewriteUtils.createTranspose(V);
+					else 
+						V = V.getInput().get(0);
+					
+					//note: x and w exchanged compared to patterns 1-4, 7
+					hnew = new QuaternaryOp(hi.getName(), DataType.MATRIX, ValueType.DOUBLE, 
+							  OpOp4.WDIVMM, W, U, V, X, 1, true, true);
+					HopRewriteUtils.setOutputBlocksizes(hnew, W.getRowsInBlock(), W.getColsInBlock());
+					
+					//add output transpose for efficient target indexing (redundant t() removed by other rewrites)
+					hnew = HopRewriteUtils.createTranspose(hnew);
+					
+					appliedPattern = true;
+					LOG.debug("Applied simplifyWeightedDivMM5 (line "+hi.getBeginLine()+")");					
+				}
+			}	
+			
+			//Pattern 6) (W*(U%*%t(V)-X)) %*% V
+			if( !appliedPattern
+				&& left instanceof BinaryOp && ((BinaryOp)left).getOp()==LOOKUP_VALID_WDIVMM_BINARY[0] //MULT	
+				&& left.getInput().get(1) instanceof BinaryOp && ((BinaryOp)left.getInput().get(1)).getOp()==OpOp2.MINUS	
+				&& left.getInput().get(1).getInput().get(0) instanceof AggBinaryOp
+                && left.getInput().get(1).getInput().get(1).getDataType() == DataType.MATRIX
+				&& HopRewriteUtils.isSingleBlock(left.getInput().get(1).getInput().get(0).getInput().get(0),true) ) //BLOCKSIZE CONSTRAINT
+			{
+				Hop W = left.getInput().get(0); 
+				Hop U = left.getInput().get(1).getInput().get(0).getInput().get(0);
+				Hop V = left.getInput().get(1).getInput().get(0).getInput().get(1);
+				Hop X = left.getInput().get(1).getInput().get(1);
+				
+				if( HopRewriteUtils.isTransposeOfItself(right, V) )  //V-t(V) constraint
+				{
+					if( !HopRewriteUtils.isTransposeOperation(V) )
+						V = right;
+					else 
+						V = V.getInput().get(0);
+					
+					//note: x and w exchanged compared to patterns 1-4, 7
+					hnew = new QuaternaryOp(hi.getName(), DataType.MATRIX, ValueType.DOUBLE, 
+							  OpOp4.WDIVMM, W, U, V, X, 2, true, true);
+					HopRewriteUtils.setOutputBlocksizes(hnew, W.getRowsInBlock(), W.getColsInBlock());
+
+					appliedPattern = true;
+					LOG.debug("Applied simplifyWeightedDivMM6 (line "+hi.getBeginLine()+")");	
+				}
+			}
 		}
 		
-		//Pattern 5) (W*(U%*%t(V)))
+		//Pattern 7) (W*(U%*%t(V)))
 		if( !appliedPattern
 			&& hi instanceof BinaryOp && ((BinaryOp)hi).getOp()==LOOKUP_VALID_WDIVMM_BINARY[0] //MULT	
 			&& HopRewriteUtils.isEqualSize(hi.getInput().get(0), hi.getInput().get(1)) //prevent mv
@@ -1982,11 +2046,11 @@ public class RewriteAlgebraicSimplificationDynamic extends HopRewriteRule
 				V = V.getInput().get(0);
 				
 			hnew = new QuaternaryOp(hi.getName(), DataType.MATRIX, ValueType.DOUBLE, 
-					  OpOp4.WDIVMM, W, U, V, 0, true, false);
+					  OpOp4.WDIVMM, W, U, V, new LiteralOp(-1), 0, true, false);
 			HopRewriteUtils.setOutputBlocksizes(hnew, W.getRowsInBlock(), W.getColsInBlock());
 
 			appliedPattern = true;
-			LOG.debug("Applied simplifyWeightedDivMM5 (line "+hi.getBeginLine()+")");	
+			LOG.debug("Applied simplifyWeightedDivMM7 (line "+hi.getBeginLine()+")");	
 		}
 		
 		//relink new hop into original position

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/401e982b/src/main/java/org/apache/sysml/lops/WeightedDivMM.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/lops/WeightedDivMM.java b/src/main/java/org/apache/sysml/lops/WeightedDivMM.java
index c47418d..94f5c19 100644
--- a/src/main/java/org/apache/sysml/lops/WeightedDivMM.java
+++ b/src/main/java/org/apache/sysml/lops/WeightedDivMM.java
@@ -42,23 +42,33 @@ public class WeightedDivMM extends Lop
 		MULT_LEFT,			//t(t(U) %*% (W * U%*%t(V)))
 		MULT_RIGHT,			//(W * U%*%t(V)) %*% V
 		MULT_MINUS_LEFT,	//t(t(U) %*% ((X!=0) * (U%*%t(V) - X)))
-		MULT_MINUS_RIGHT;	//((X!=0) * (U%*%t(V) - X)) %*% V
+		MULT_MINUS_RIGHT,	//((X!=0) * (U%*%t(V) - X)) %*% V
+		MULT_MINUS_4_LEFT,	//t(t(U) %*% (W * (U%*%t(V) - X)))
+		MULT_MINUS_4_RIGHT;	//(W * (U%*%t(V) - X)) %*% V
+		
 		
 		public boolean isBasic(){
 			return (this == MULT_BASIC);
 		}
 		public boolean isLeft() {
-			return (this == DIV_LEFT || this == MULT_LEFT || this == MULT_MINUS_LEFT);
+			return (this == DIV_LEFT || this == MULT_LEFT 
+					|| this == MULT_MINUS_LEFT || this == MULT_MINUS_4_LEFT);
 		}
 		public boolean isRight() {
 			return !(isLeft() || isBasic());
 		}
 		public boolean isMult() {
-			return (this == MULT_LEFT || this == MULT_RIGHT || this == MULT_MINUS_LEFT || this == MULT_MINUS_RIGHT);
+			return (this == MULT_LEFT || this == MULT_RIGHT || this == MULT_MINUS_LEFT || this == MULT_MINUS_RIGHT
+					|| this == MULT_MINUS_4_LEFT || this == MULT_MINUS_4_RIGHT);
 		}		
 		public boolean isMinus(){
-			return (this == MULT_MINUS_LEFT || this == MULT_MINUS_RIGHT);
+			return (this == MULT_MINUS_LEFT || this == MULT_MINUS_RIGHT 
+					|| this == MULT_MINUS_4_LEFT || this == MULT_MINUS_4_RIGHT);
+		}
+		public boolean hasFourInputs() {
+			return (this == MULT_MINUS_4_LEFT || this == MULT_MINUS_4_RIGHT);
 		}
+		
 		public MatrixCharacteristics computeOutputCharacteristics(long Xrlen, long Xclen, long rank) {
 			if( isBasic() )
 				return new MatrixCharacteristics( Xrlen, Xclen, -1, -1);
@@ -69,17 +79,19 @@ public class WeightedDivMM extends Lop
 	
 	private WDivMMType _weightsType = null;
 	
-	public WeightedDivMM(Lop input1, Lop input2, Lop input3, DataType dt, ValueType vt, WDivMMType wt, ExecType et) 
+	public WeightedDivMM(Lop input1, Lop input2, Lop input3, Lop input4, DataType dt, ValueType vt, WDivMMType wt, ExecType et) 
 		throws LopsException 
 	{
 		super(Lop.Type.WeightedDivMM, dt, vt);		
 		addInput(input1); //W
 		addInput(input2); //U
 		addInput(input3); //V
+		addInput(input4); //X (optional)
 		input1.addOutput(this); 
 		input2.addOutput(this);
 		input3.addOutput(this);
-
+		input4.addOutput(this);
+		
 		_weightsType = wt;
 		setupLopProperties(et);
 	}
@@ -117,18 +129,19 @@ public class WeightedDivMM extends Lop
 	
 	/* MR instruction generation */
 	@Override
-	public String getInstructions(int input1, int input2, int input3, int output)
+	public String getInstructions(int input1, int input2, int input3, int input4, int output)
 	{
 		return getInstructions(
 				String.valueOf(input1), 
 				String.valueOf(input2), 
 				String.valueOf(input3), 
+				String.valueOf(input4), 
 				String.valueOf(output));
 	}
 
 	/* CP/SPARK instruction generation */
 	@Override
-	public String getInstructions(String input1, String input2, String input3, String output)
+	public String getInstructions(String input1, String input2, String input3, String input4, String output)
 	{
 		StringBuilder sb = new StringBuilder();
 		
@@ -150,6 +163,9 @@ public class WeightedDivMM extends Lop
 		sb.append( getInputs().get(2).prepInputOperand(input3));
 		
 		sb.append(Lop.OPERAND_DELIMITOR);
+		sb.append( getInputs().get(3).prepInputOperand(input4));
+		
+		sb.append(Lop.OPERAND_DELIMITOR);
 		sb.append( prepOutputOperand(output));
 		
 		sb.append(Lop.OPERAND_DELIMITOR);

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/401e982b/src/main/java/org/apache/sysml/lops/WeightedDivMMR.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/lops/WeightedDivMMR.java b/src/main/java/org/apache/sysml/lops/WeightedDivMMR.java
index 0e7caf3..790d90d 100644
--- a/src/main/java/org/apache/sysml/lops/WeightedDivMMR.java
+++ b/src/main/java/org/apache/sysml/lops/WeightedDivMMR.java
@@ -37,17 +37,19 @@ public class WeightedDivMMR extends Lop
 	private boolean _cacheU = false;
 	private boolean _cacheV = false;
 	
-	public WeightedDivMMR(Lop input1, Lop input2, Lop input3, DataType dt, ValueType vt, WDivMMType wt, boolean cacheU, boolean cacheV, ExecType et) 
+	public WeightedDivMMR(Lop input1, Lop input2, Lop input3, Lop input4, DataType dt, ValueType vt, WDivMMType wt, boolean cacheU, boolean cacheV, ExecType et) 
 		throws LopsException 
 	{
 		super(Lop.Type.WeightedDivMM, dt, vt);		
 		addInput(input1); //W
 		addInput(input2); //U
 		addInput(input3); //V
+		addInput(input4); //X
 		input1.addOutput(this); 
 		input2.addOutput(this);
 		input3.addOutput(this);
-			
+		input4.addOutput(this);	
+		
 		_weightsType = wt;
 		_cacheU = cacheU;
 		_cacheV = cacheV;
@@ -89,18 +91,19 @@ public class WeightedDivMMR extends Lop
 	
 	/* MR instruction generation */
 	@Override 
-	public String getInstructions(int input1, int input2, int input3, int output)
+	public String getInstructions(int input1, int input2, int input3, int input4, int output)
 	{
 		return getInstructions(
 				String.valueOf(input1), 
 				String.valueOf(input2), 
-				String.valueOf(input3), 
+				String.valueOf(input3),
+				String.valueOf(input4),
 				String.valueOf(output));
 	}
 	
 	/* CP/SPARK instruction generation */
 	@Override
-	public String getInstructions(String input1, String input2, String input3, String output)
+	public String getInstructions(String input1, String input2, String input3, String input4, String output)
 	{
 		StringBuilder sb = new StringBuilder();
 		
@@ -119,6 +122,9 @@ public class WeightedDivMMR extends Lop
 		sb.append( getInputs().get(2).prepInputOperand(input3));
 		
 		sb.append(Lop.OPERAND_DELIMITOR);
+		sb.append( getInputs().get(3).prepInputOperand(input4));
+		
+		sb.append(Lop.OPERAND_DELIMITOR);
 		sb.append( prepOutputOperand(output));
 		
 		sb.append(Lop.OPERAND_DELIMITOR);

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/401e982b/src/main/java/org/apache/sysml/runtime/instructions/cp/QuaternaryCPInstruction.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/instructions/cp/QuaternaryCPInstruction.java b/src/main/java/org/apache/sysml/runtime/instructions/cp/QuaternaryCPInstruction.java
index 29b132b..535684c 100644
--- a/src/main/java/org/apache/sysml/runtime/instructions/cp/QuaternaryCPInstruction.java
+++ b/src/main/java/org/apache/sysml/runtime/instructions/cp/QuaternaryCPInstruction.java
@@ -64,7 +64,7 @@ public class QuaternaryCPInstruction extends ComputationCPInstruction
 		String[] parts = InstructionUtils.getInstructionPartsWithValueType(inst);
 		String opcode = parts[0];
 		
-		if( opcode.equalsIgnoreCase("wsloss") ) 
+		if( opcode.equalsIgnoreCase("wsloss") || opcode.equalsIgnoreCase("wdivmm") ) 
 		{
 			InstructionUtils.checkNumFields ( parts, 7 );
 			
@@ -73,13 +73,14 @@ public class QuaternaryCPInstruction extends ComputationCPInstruction
 			CPOperand in3 = new CPOperand(parts[3]);
 			CPOperand in4 = new CPOperand(parts[4]);
 			CPOperand out = new CPOperand(parts[5]);
-			
-			WeightsType wtype = WeightsType.valueOf(parts[6]);
 			int k = Integer.parseInt(parts[7]);
 			
-			return new QuaternaryCPInstruction(new QuaternaryOperator(wtype), in1, in2, in3, in4, out, k, opcode, inst);	
+			if( opcode.equalsIgnoreCase("wsloss") )
+				return new QuaternaryCPInstruction(new QuaternaryOperator(WeightsType.valueOf(parts[6])), in1, in2, in3, in4, out, k, opcode, inst);	
+			else if( opcode.equalsIgnoreCase("wdivmm") )
+				return new QuaternaryCPInstruction(new QuaternaryOperator(WDivMMType.valueOf(parts[6])), in1, in2, in3, in4, out, k, opcode, inst);				
 		}
-		else if( opcode.equalsIgnoreCase("wsigmoid") || opcode.equalsIgnoreCase("wdivmm") || opcode.equalsIgnoreCase("wcemm") )
+		else if( opcode.equalsIgnoreCase("wsigmoid") || opcode.equalsIgnoreCase("wcemm") )
 		{
 			InstructionUtils.checkNumFields ( parts, 6 );
 			
@@ -91,8 +92,6 @@ public class QuaternaryCPInstruction extends ComputationCPInstruction
 			
 			if( opcode.equalsIgnoreCase("wsigmoid") )
 				return new QuaternaryCPInstruction(new QuaternaryOperator(WSigmoidType.valueOf(parts[5])), in1, in2, in3, null, out, k, opcode, inst);
-			else if( opcode.equalsIgnoreCase("wdivmm") )
-				return new QuaternaryCPInstruction(new QuaternaryOperator(WDivMMType.valueOf(parts[5])), in1, in2, in3, null, out, k, opcode, inst);
 			else if( opcode.equalsIgnoreCase("wcemm") ) 		
 				return new QuaternaryCPInstruction(new QuaternaryOperator(WCeMMType.valueOf(parts[5])), in1, in2, in3, null, out, k, opcode, inst);
 		}
@@ -124,7 +123,7 @@ public class QuaternaryCPInstruction extends ComputationCPInstruction
 		MatrixBlock matBlock2 = ec.getMatrixInput(input2.getName());
 		MatrixBlock matBlock3 = ec.getMatrixInput(input3.getName());
 		MatrixBlock matBlock4 = null;
-		if( qop.wtype1 != null && qop.wtype1.hasFourInputs() ) {
+		if( qop.hasFourInputs() ) {
 			matBlock4 = ec.getMatrixInput(input4.getName());
 		}
 		
@@ -141,6 +140,8 @@ public class QuaternaryCPInstruction extends ComputationCPInstruction
 			ec.setVariable(output.getName(), new DoubleObject(out.getValue(0, 0)));
 		}
 		else { //wsigmoid / wdivmm / wumm
+			if( qop.wtype3 != null && qop.wtype3.hasFourInputs() )
+				ec.releaseMatrixInput(input4.getName());
 			ec.setMatrixOutput(output.getName(), (MatrixBlock)out);
 		}
 	}	

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/401e982b/src/main/java/org/apache/sysml/runtime/instructions/mr/QuaternaryInstruction.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/instructions/mr/QuaternaryInstruction.java b/src/main/java/org/apache/sysml/runtime/instructions/mr/QuaternaryInstruction.java
index 9d5782a..58aa63d 100644
--- a/src/main/java/org/apache/sysml/runtime/instructions/mr/QuaternaryInstruction.java
+++ b/src/main/java/org/apache/sysml/runtime/instructions/mr/QuaternaryInstruction.java
@@ -22,6 +22,7 @@ package org.apache.sysml.runtime.instructions.mr;
 import java.util.ArrayList;
 
 import org.apache.sysml.lops.WeightedCrossEntropy.WCeMMType;
+import org.apache.sysml.lops.WeightedDivMM;
 import org.apache.sysml.lops.WeightedDivMM.WDivMMType;
 import org.apache.sysml.lops.WeightedDivMMR;
 import org.apache.sysml.lops.WeightedSigmoid.WSigmoidType;
@@ -200,36 +201,56 @@ public class QuaternaryInstruction extends MRInstruction implements IDistributed
 			
 			return new QuaternaryInstruction(new QuaternaryOperator(wtype,uopcode), in1, in2, in3, (byte)-1, out, cacheU, cacheV, str);	
 		}
-		else //wsigmoid / wdivmm / wcemm
+		else if(    WeightedDivMM.OPCODE.equalsIgnoreCase(opcode)  //wdivmm
+				|| WeightedDivMMR.OPCODE.equalsIgnoreCase(opcode) )
 		{
 			boolean isRed = opcode.startsWith("red");
 			
 			//check number of fields (3 inputs, output, type)
-			if( WeightedDivMMR.OPCODE.equalsIgnoreCase(opcode) )
-				InstructionUtils.checkNumFields ( str, 7, 8 );
-			else if( isRed )
+			if( isRed )
+				InstructionUtils.checkNumFields ( str, 8 );
+			else
+				InstructionUtils.checkNumFields ( str, 6 );
+				
+			//parse instruction parts (without exec type)
+			String[] parts = InstructionUtils.getInstructionParts(str);
+			
+			byte in1 = Byte.parseByte(parts[1]);
+			byte in2 = Byte.parseByte(parts[2]);
+			byte in3 = Byte.parseByte(parts[3]);
+			byte in4 = Byte.parseByte(parts[4]);
+			byte out = Byte.parseByte(parts[5]);
+			
+			//in mappers always through distcache, in reducers through distcache/shuffle
+			boolean cacheU = isRed ? Boolean.parseBoolean(parts[7]) : true;
+			boolean cacheV = isRed ? Boolean.parseBoolean(parts[8]) : true;
+			
+			return new QuaternaryInstruction(new QuaternaryOperator(WDivMMType.valueOf(parts[6])), in1, in2, in3, in4, out, cacheU, cacheV, str);
+		}
+		else //wsigmoid / wcemm
+		{
+			boolean isRed = opcode.startsWith("red");
+			
+			//check number of fields (3 inputs, output, type)
+			if( isRed )
 				InstructionUtils.checkNumFields ( str, 7 );
 			else
 				InstructionUtils.checkNumFields ( str, 5 );
 				
 			//parse instruction parts (without exec type)
 			String[] parts = InstructionUtils.getInstructionParts(str);
-			boolean wdivmmMinus = (parts.length==9);
 			
 			byte in1 = Byte.parseByte(parts[1]);
 			byte in2 = Byte.parseByte(parts[2]);
 			byte in3 = Byte.parseByte(parts[3]);
-			byte in4 = wdivmmMinus?Byte.parseByte(parts[4]):-1;
-			byte out = Byte.parseByte(parts[wdivmmMinus?5:4]);
+			byte out = Byte.parseByte(parts[4]);
 			
 			//in mappers always through distcache, in reducers through distcache/shuffle
-			boolean cacheU = isRed ? Boolean.parseBoolean(parts[wdivmmMinus?7:6]) : true;
-			boolean cacheV = isRed ? Boolean.parseBoolean(parts[wdivmmMinus?8:7]) : true;
+			boolean cacheU = isRed ? Boolean.parseBoolean(parts[6]) : true;
+			boolean cacheV = isRed ? Boolean.parseBoolean(parts[7]) : true;
 			
 			if( opcode.endsWith("wsigmoid") )
 				return new QuaternaryInstruction(new QuaternaryOperator(WSigmoidType.valueOf(parts[5])), in1, in2, in3, (byte)-1, out, cacheU, cacheV, str);
-			else if( opcode.endsWith("wdivmm") )
-				return new QuaternaryInstruction(new QuaternaryOperator(WDivMMType.valueOf(parts[wdivmmMinus?6:5])), in1, in2, in3, in4, out, cacheU, cacheV, str);
 			else if( opcode.endsWith("wcemm") )
 				return new QuaternaryInstruction(new QuaternaryOperator(WCeMMType.valueOf(parts[5])), in1, in2, in3, (byte)-1, out, cacheU, cacheV, str);
 		}
@@ -265,20 +286,20 @@ public class QuaternaryInstruction extends MRInstruction implements IDistributed
 	public byte[] getInputIndexes() 
 	{
 		QuaternaryOperator qop = (QuaternaryOperator)optr;
-		if( qop.wtype1 == null || !qop.wtype1.hasFourInputs() )
-			return new byte[]{_input1, _input2, _input3};
+		if( qop.hasFourInputs() )
+			return new byte[]{_input1, _input2, _input3, _input4};		
 		else
-			return new byte[]{_input1, _input2, _input3, _input4};
+			return new byte[]{_input1, _input2, _input3};
 	}
 
 	@Override
 	public byte[] getAllIndexes() 
 	{
 		QuaternaryOperator qop = (QuaternaryOperator)optr;
-		if( qop.wtype1 == null || !qop.wtype1.hasFourInputs() )
-			return new byte[]{_input1, _input2, _input3, output};
-		else
+		if( qop.hasFourInputs() )
 			return new byte[]{_input1, _input2, _input3, _input4, output};		
+		else
+			return new byte[]{_input1, _input2, _input3, output};
 	}
 
 	@Override

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/401e982b/src/main/java/org/apache/sysml/runtime/instructions/spark/QuaternarySPInstruction.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/instructions/spark/QuaternarySPInstruction.java b/src/main/java/org/apache/sysml/runtime/instructions/spark/QuaternarySPInstruction.java
index 500cc01..3792743 100644
--- a/src/main/java/org/apache/sysml/runtime/instructions/spark/QuaternarySPInstruction.java
+++ b/src/main/java/org/apache/sysml/runtime/instructions/spark/QuaternarySPInstruction.java
@@ -26,7 +26,6 @@ import java.util.Iterator;
 import java.util.LinkedList;
 
 import org.apache.spark.api.java.JavaPairRDD;
-import org.apache.spark.api.java.function.Function;
 import org.apache.spark.api.java.function.PairFlatMapFunction;
 import org.apache.spark.api.java.function.PairFunction;
 
@@ -35,6 +34,7 @@ import scala.Tuple2;
 import org.apache.sysml.lops.WeightedCrossEntropy;
 import org.apache.sysml.lops.WeightedDivMM;
 import org.apache.sysml.lops.WeightedDivMM.WDivMMType;
+import org.apache.sysml.lops.WeightedDivMMR;
 import org.apache.sysml.lops.WeightedSigmoid;
 import org.apache.sysml.lops.WeightedSquaredLoss;
 import org.apache.sysml.lops.WeightedSquaredLossR;
@@ -147,7 +147,30 @@ public class QuaternarySPInstruction extends ComputationSPInstruction
 			
 			return new QuaternarySPInstruction(new QuaternaryOperator(wtype, uopcode), in1, in2, in3, null, out, cacheU, cacheV, opcode, str);	
 		}
-		else //map/redwsigmoid, map/redwdivmm, map/redwcemm
+		else if(    WeightedDivMM.OPCODE.equalsIgnoreCase(opcode)    //wdivmm
+				|| WeightedDivMMR.OPCODE.equalsIgnoreCase(opcode) )
+		{
+			boolean isRed = opcode.startsWith("red");
+			
+			//check number of fields (4 inputs, output, type)
+			if( isRed )
+				InstructionUtils.checkNumFields( parts, 8 );
+			else
+				InstructionUtils.checkNumFields( parts, 6 );
+			
+			CPOperand in1 = new CPOperand(parts[1]);
+			CPOperand in2 = new CPOperand(parts[2]);
+			CPOperand in3 = new CPOperand(parts[3]);
+			CPOperand in4 = new CPOperand(parts[4]);
+			CPOperand out = new CPOperand(parts[5]);
+			
+			//in mappers always through distcache, in reducers through distcache/shuffle
+			boolean cacheU = isRed ? Boolean.parseBoolean(parts[7]) : true;
+			boolean cacheV = isRed ? Boolean.parseBoolean(parts[8]) : true;
+		
+			return new QuaternarySPInstruction(new QuaternaryOperator(WDivMMType.valueOf(parts[6])), in1, in2, in3, in4, out, cacheU, cacheV, opcode, str);
+		} 
+		else //map/redwsigmoid, map/redwcemm
 		{
 			boolean isRed = opcode.startsWith("red");
 			
@@ -168,8 +191,6 @@ public class QuaternarySPInstruction extends ComputationSPInstruction
 		
 			if( opcode.endsWith("wsigmoid") )
 				return new QuaternarySPInstruction(new QuaternaryOperator(WSigmoidType.valueOf(parts[5])), in1, in2, in3, null, out, cacheU, cacheV, opcode, str);
-			else if( opcode.endsWith("wdivmm") )
-				return new QuaternarySPInstruction(new QuaternaryOperator(WDivMMType.valueOf(parts[5])), in1, in2, in3, null, out, cacheU, cacheV, opcode, str);
 			else if( opcode.endsWith("wcemm") )
 				return new QuaternarySPInstruction(new QuaternaryOperator(WCeMMType.valueOf(parts[5])), in1, in2, in3, null, out, cacheU, cacheV, opcode, str);
 		}
@@ -228,7 +249,7 @@ public class QuaternarySPInstruction extends ComputationSPInstruction
 			PartitionedBroadcastMatrix bc2 = _cacheV ? sec.getBroadcastForVariable( input3.getName() ) : null;
 			JavaPairRDD<MatrixIndexes,MatrixBlock> inU = (!_cacheU) ? sec.getBinaryBlockRDDHandleForVariable( input2.getName() ) : null;
 			JavaPairRDD<MatrixIndexes,MatrixBlock> inV = (!_cacheV) ? sec.getBinaryBlockRDDHandleForVariable( input3.getName() ) : null;
-			JavaPairRDD<MatrixIndexes,MatrixBlock> inW = (qop.wtype1!=null && qop.wtype1.hasFourInputs()) ? 
+			JavaPairRDD<MatrixIndexes,MatrixBlock> inW = qop.hasFourInputs() ? 
 					sec.getBinaryBlockRDDHandleForVariable( _input4.getName() ) : null;
 
 			//preparation of transposed and replicated U
@@ -261,9 +282,9 @@ public class QuaternarySPInstruction extends ComputationSPInstruction
 				out = in.join(inV).join(inW)
 				        .mapToPair(new RDDQuaternaryFunction3(qop, bc1, bc2));
 			//function call w/ four rdd inputs
-			else 
+			else //need keys in case of wdivmm 
 				out = in.join(inU).join(inV).join(inW)
-				        .mapValues(new RDDQuaternaryFunction4(qop));
+				        .mapToPair(new RDDQuaternaryFunction4(qop));
 			
 			//keep variable names for lineage maintenance
 			if( inU == null ) bcVars.add(input2.getName()); else rddVars.add(input2.getName());
@@ -424,13 +445,13 @@ public class QuaternarySPInstruction extends ComputationSPInstruction
 			
 			MatrixBlock mbU = (_pmU!=null)?_pmU.getMatrixBlock((int)ixIn.getRowIndex(), 1) : blkIn2;
 			MatrixBlock mbV = (_pmV!=null)?_pmV.getMatrixBlock((int)ixIn.getColumnIndex(), 1) : blkIn2;
-			MatrixBlock mbW = (_qop.wtype1!=null && _qop.wtype1.hasFourInputs()) ? blkIn2 : null;
+			MatrixBlock mbW = (_qop.hasFourInputs()) ? blkIn2 : null;
 			
 			//execute core operation
 			blkIn1.quaternaryOperations(_qop, mbU, mbV, mbW, blkOut);
 			
 			//create return tuple
-			MatrixIndexes ixOut = createOutputIndexes(ixIn);
+			MatrixIndexes ixOut = createOutputIndexes(ixIn);			
 			return new Tuple2<MatrixIndexes,MatrixBlock>(ixOut, blkOut);
 		}
 	}
@@ -463,7 +484,7 @@ public class QuaternarySPInstruction extends ComputationSPInstruction
 			MatrixBlock mbU = (_pmU!=null)?_pmU.getMatrixBlock((int)ixIn.getRowIndex(), 1) : blkIn2;
 			MatrixBlock mbV = (_pmV!=null)?_pmV.getMatrixBlock((int)ixIn.getColumnIndex(), 1) : 
 				              (_pmU!=null)? blkIn2 : blkIn3;
-			MatrixBlock mbW = (_qop.wtype1!=null && _qop.wtype1.hasFourInputs())? blkIn3 : null;
+			MatrixBlock mbW = (_qop.hasFourInputs())? blkIn3 : null;
 			
 			//execute core operation
 			blkIn1.quaternaryOperations(_qop, mbU, mbV, mbW, blkOut);
@@ -478,31 +499,33 @@ public class QuaternarySPInstruction extends ComputationSPInstruction
 	 * Note: never called for wsigmoid/wdivmm (only wsloss)
 	 */
 	private static class RDDQuaternaryFunction4 extends RDDQuaternaryBaseFunction //four rdd input
-		implements Function<Tuple2<Tuple2<Tuple2<MatrixBlock,MatrixBlock>,MatrixBlock>,MatrixBlock>,MatrixBlock>
+		implements PairFunction<Tuple2<MatrixIndexes,Tuple2<Tuple2<Tuple2<MatrixBlock,MatrixBlock>,MatrixBlock>,MatrixBlock>>,MatrixIndexes,MatrixBlock>
 	{
 		private static final long serialVersionUID = 7328911771600289250L;
 		
 		public RDDQuaternaryFunction4( QuaternaryOperator qop ) 
 			throws DMLRuntimeException, DMLUnsupportedOperationException
-		{
+		{ 
 			super(qop, null, null);		
 		}
-	
+
 		@Override
-		public MatrixBlock call(Tuple2<Tuple2<Tuple2<MatrixBlock, MatrixBlock>, MatrixBlock>, MatrixBlock> arg0)
+		public Tuple2<MatrixIndexes, MatrixBlock> call(Tuple2<MatrixIndexes, Tuple2<Tuple2<Tuple2<MatrixBlock, MatrixBlock>, MatrixBlock>, MatrixBlock>> arg0)
 			throws Exception 
 		{
-			MatrixBlock blkIn1 = arg0._1()._1()._1();
-			MatrixBlock mbU = arg0._1()._1()._2();
-			MatrixBlock mbV = arg0._1()._2();
-			MatrixBlock mbW = arg0._2();
+			MatrixIndexes ixIn1 = arg0._1();
+			MatrixBlock blkIn1 = arg0._2()._1()._1()._1();
+			MatrixBlock mbU = arg0._2()._1()._1()._2();
+			MatrixBlock mbV = arg0._2()._1()._2();
+			MatrixBlock mbW = arg0._2()._2();
 			MatrixBlock blkOut = new MatrixBlock();
 			
 			//execute core operation
 			blkIn1.quaternaryOperations(_qop, mbU, mbV, mbW, blkOut);
 			
 			//create return tuple
-			return blkOut;
+			MatrixIndexes ixOut = createOutputIndexes(ixIn1);
+			return new Tuple2<MatrixIndexes,MatrixBlock>(ixOut, blkOut);
 		}
 	}
 	

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/401e982b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixMult.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixMult.java b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixMult.java
index 7e6ec48..78ce8f0 100644
--- a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixMult.java
+++ b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixMult.java
@@ -708,7 +708,7 @@ public class LibMatrixMult
 	 * @param wt
 	 * @throws DMLRuntimeException
 	 */
-	public static void matrixMultWDivMM(MatrixBlock mW, MatrixBlock mU, MatrixBlock mV, MatrixBlock ret, WDivMMType wt) 
+	public static void matrixMultWDivMM(MatrixBlock mW, MatrixBlock mU, MatrixBlock mV, MatrixBlock mX, MatrixBlock ret, WDivMMType wt) 
 		throws DMLRuntimeException 
 	{
 		//check for empty result 
@@ -727,12 +727,12 @@ public class LibMatrixMult
 		ret.allocateDenseOrSparseBlock();
 		
 		//core weighted div mm computation
-		if( !mW.sparse && !mU.sparse && !mV.sparse && !mU.isEmptyBlock() && !mV.isEmptyBlock() )
-			matrixMultWDivMMDense(mW, mU, mV, ret, wt, 0, mW.rlen, 0, mW.clen);
-		else if( mW.sparse && !mU.sparse && !mV.sparse && !mU.isEmptyBlock() && !mV.isEmptyBlock())
-			matrixMultWDivMMSparseDense(mW, mU, mV, ret, wt, 0, mW.rlen, 0, mW.clen);
+		if( !mW.sparse && !mU.sparse && !mV.sparse && (mX==null || !mX.sparse) && !mU.isEmptyBlock() && !mV.isEmptyBlock() )
+			matrixMultWDivMMDense(mW, mU, mV, mX, ret, wt, 0, mW.rlen, 0, mW.clen);
+		else if( mW.sparse && !mU.sparse && !mV.sparse && (mX==null || mX.sparse) && !mU.isEmptyBlock() && !mV.isEmptyBlock())
+			matrixMultWDivMMSparseDense(mW, mU, mV, mX, ret, wt, 0, mW.rlen, 0, mW.clen);
 		else
-			matrixMultWDivMMGeneric(mW, mU, mV, ret, wt, 0, mW.rlen, 0, mW.clen);
+			matrixMultWDivMMGeneric(mW, mU, mV, mX, ret, wt, 0, mW.rlen, 0, mW.clen);
 		
 		//post-processing
 		ret.recomputeNonZeros();
@@ -757,7 +757,7 @@ public class LibMatrixMult
 	 * @param k
 	 * @throws DMLRuntimeException
 	 */
-	public static void matrixMultWDivMM(MatrixBlock mW, MatrixBlock mU, MatrixBlock mV, MatrixBlock ret, WDivMMType wt, int k) 
+	public static void matrixMultWDivMM(MatrixBlock mW, MatrixBlock mU, MatrixBlock mV, MatrixBlock mX, MatrixBlock ret, WDivMMType wt, int k) 
 		throws DMLRuntimeException 
 	{
 		//check for empty result 
@@ -784,12 +784,12 @@ public class LibMatrixMult
 			if( wt.isLeft() ) {
 				int blklen = (int)(Math.ceil((double)mW.clen/k));
 				for( int j=0; j<k & j*blklen<mW.clen; j++ )
-					tasks.add(new MatrixMultWDivTask(mW, mU, mV, ret, wt, 0, mW.rlen, j*blklen, Math.min((j+1)*blklen, mW.clen)));
+					tasks.add(new MatrixMultWDivTask(mW, mU, mV, mX, ret, wt, 0, mW.rlen, j*blklen, Math.min((j+1)*blklen, mW.clen)));
 			}
 			else { //basic/right
 				int blklen = (int)(Math.ceil((double)mW.rlen/k));
 				for( int i=0; i<k & i*blklen<mW.rlen; i++ )
-					tasks.add(new MatrixMultWDivTask(mW, mU, mV, ret, wt, i*blklen, Math.min((i+1)*blklen, mW.rlen), 0, mW.clen));
+					tasks.add(new MatrixMultWDivTask(mW, mU, mV, mX, ret, wt, i*blklen, Math.min((i+1)*blklen, mW.rlen), 0, mW.clen));
 			}
 			//execute tasks
 			pool.invokeAll(tasks);
@@ -2557,19 +2557,21 @@ public class LibMatrixMult
 	 * @param ru
 	 * @throws DMLRuntimeException
 	 */
-	private static void matrixMultWDivMMDense(MatrixBlock mW, MatrixBlock mU, MatrixBlock mV, MatrixBlock ret, WDivMMType wt, int rl, int ru, int cl, int cu) 
+	private static void matrixMultWDivMMDense(MatrixBlock mW, MatrixBlock mU, MatrixBlock mV, MatrixBlock mX, MatrixBlock ret, WDivMMType wt, int rl, int ru, int cl, int cu) 
 		throws DMLRuntimeException 
 	{	
 		final boolean basic = wt.isBasic();
 		final boolean left = wt.isLeft();
 		final boolean mult = wt.isMult();
 		final boolean minus = wt.isMinus();
+		final boolean four = wt.hasFourInputs();
 		final int n = mW.clen;
 		final int cd = mU.clen;
 		
 		double[] w = mW.denseBlock;
 		double[] u = mU.denseBlock;
 		double[] v = mV.denseBlock;
+		double[] x = (mX==null) ? null : mX.denseBlock;
 		double[] c = ret.denseBlock;
 		
 		//approach: iterate over non-zeros of w, selective mm computation
@@ -2590,6 +2592,8 @@ public class LibMatrixMult
 						if( w[ix+j] != 0 ) {
 							if( basic ) 
 								c[ix+j] = w[ix+j] * dotProduct(u, v, uix, vix, cd);	
+							else if( four ) //left/right 
+								wdivmm(w[ix+j], x[ix+j], u, v, c, uix, vix, left, cd);
 							else //left/right minus/default
 								wdivmm(w[ix+j], u, v, c, uix, vix, left, mult, minus, cd);
 						}
@@ -2608,19 +2612,21 @@ public class LibMatrixMult
 	 * @param ru
 	 * @throws DMLRuntimeException 
 	 */
-	private static void matrixMultWDivMMSparseDense(MatrixBlock mW, MatrixBlock mU, MatrixBlock mV, MatrixBlock ret, WDivMMType wt, int rl, int ru, int cl, int cu) 
+	private static void matrixMultWDivMMSparseDense(MatrixBlock mW, MatrixBlock mU, MatrixBlock mV, MatrixBlock mX, MatrixBlock ret, WDivMMType wt, int rl, int ru, int cl, int cu) 
 		throws DMLRuntimeException
 	{
 		final boolean basic = wt.isBasic();
 		final boolean left = wt.isLeft();
 		final boolean mult = wt.isMult();
 		final boolean minus = wt.isMinus();
+		final boolean four = wt.hasFourInputs();
 		final int cd = mU.clen;
 		
 		SparseBlock w = mW.sparseBlock;
 		double[] u = mU.denseBlock;
 		double[] v = mV.denseBlock;
 		double[] c = ret.denseBlock;
+		SparseBlock x = (mX==null) ? null : mX.sparseBlock;
 		
 		//approach: iterate over non-zeros of w, selective mm computation
 		for( int i=rl, uix=rl*cd; i<ru; i++, uix+=cd ) {
@@ -2634,6 +2640,15 @@ public class LibMatrixMult
 					for( int k=wpos; k<wpos+wlen; k++ )
 						ret.appendValue( i, wix[k], wval[k] * dotProduct(u, v, uix, wix[k]*cd, cd));
 				}
+				else if( four ) { //left/right
+					//TODO perf: check for alignment and avoid binary search on X
+					int k = (cl==0) ? wpos : w.posFIndexGTE(i,cl);
+					k = (k>=0) ? k : wpos+wlen;
+					for( ; k<wpos+wlen && wix[k]<cu; k++ ) {
+						double xij = x.get(i, wix[k]);
+						wdivmm(wval[k], xij, u, v, c, uix, wix[k]*cd, left, cd);
+					}
+				}
 				else { //left/right minus default
 					int k = (cl==0) ? wpos : w.posFIndexGTE(i,cl);
 					k = (k>=0) ? k : wpos+wlen;
@@ -2656,13 +2671,14 @@ public class LibMatrixMult
 	 * @param ru
 	 * @throws DMLRuntimeException 
 	 */
-	private static void matrixMultWDivMMGeneric(MatrixBlock mW, MatrixBlock mU, MatrixBlock mV, MatrixBlock ret, WDivMMType wt, int rl, int ru, int cl, int cu) 
+	private static void matrixMultWDivMMGeneric(MatrixBlock mW, MatrixBlock mU, MatrixBlock mV, MatrixBlock mX, MatrixBlock ret, WDivMMType wt, int rl, int ru, int cl, int cu) 
 		throws DMLRuntimeException
 	{
 		final boolean basic = wt.isBasic();
 		final boolean left = wt.isLeft(); 
 		final boolean mult = wt.isMult();
 		final boolean minus = wt.isMinus();
+		final boolean four = wt.hasFourInputs();
 		final int n = mW.clen; 
 		final int cd = mU.clen;
 
@@ -2687,6 +2703,10 @@ public class LibMatrixMult
 							double uvij = dotProductGeneric(mU,mV, i, wix[k], cd);
 							ret.appendValue(i, wix[k], uvij);
 						}
+						else if( four ) { //left/right
+							double xij = mX.quickGetValue(i, wix[k]);
+							wdivmm(wval[k], xij, mU, mV, c, i, wix[k], left, cd);
+						}
 						else { //left/right minus/default
 							wdivmm(wval[k], mU, mV, c, i, wix[k], left, mult, minus, cd);
 						}
@@ -2704,6 +2724,10 @@ public class LibMatrixMult
 						if( basic ) {
 							c[ix+j] = dotProductGeneric(mU,mV, i, j, cd);
 						}
+						else if( four ) { //left/right
+							double xij = mX.quickGetValue(i, j);
+							wdivmm(w[ix+j], xij, mU, mV, c, i, j, left, cd);
+						}
 						else { //left/right minus/default
 							wdivmm(w[ix+j], mU, mV, c, i, j, left, mult, minus, cd);
 						}
@@ -3598,6 +3622,35 @@ public class LibMatrixMult
 		//compute final mm output
 		vectMultiplyAdd(tmpval, b, c, bix, cix, len);
 	}
+	
+	/**
+	 * 
+	 * @param wij
+	 * @param xij
+	 * @param u
+	 * @param v
+	 * @param c
+	 * @param uix
+	 * @param vix
+	 * @param left
+	 * @param len
+	 */
+	private static void wdivmm( final double wij, final double xij, double[] u, double[] v, double[] c, final int uix, final int vix, final boolean left, final int len )
+	{
+		//compute dot product over ui vj 
+		double uvij = dotProduct(u, v, uix, vix, len);
+		
+		//compute core wdivmm  
+		double tmpval = wij * (uvij - xij);
+		
+		//prepare inputs for final mm
+		int bix = left ? uix : vix;
+		int cix = left ? vix : uix;
+		double[] b = left ? u : v;		
+		
+		//compute final mm output
+		vectMultiplyAdd(tmpval, b, c, bix, cix, len);
+	}
 
 
 	/**
@@ -3633,6 +3686,36 @@ public class LibMatrixMult
 	/**
 	 * 
 	 * @param wij
+	 * @param xij
+	 * @param u
+	 * @param v
+	 * @param c
+	 * @param uix
+	 * @param vix
+	 * @param left
+	 * @param len
+	 */
+	private static void wdivmm( final double wij, final double xij, MatrixBlock u, MatrixBlock v, double[] c, final int uix, final int vix, final boolean left, final int len )
+	{
+		//compute dot product over ui vj 
+		double uvij = dotProductGeneric(u, v, uix, vix, len);
+		
+		//compute core wdivmm
+		double wtmp = wij * (uvij - xij);
+		
+		//prepare inputs for final mm
+		int bix = left ? uix : vix;
+		int cix = left ? vix*len : uix*len;
+		MatrixBlock b = left ? u : v;		
+		
+		//compute final mm
+		for( int k2=0; k2<len; k2++ )
+			c[cix+k2] += b.quickGetValue(bix, k2) * wtmp;
+	}
+	
+	/**
+	 * 
+	 * @param wij
 	 * @param u
 	 * @param v
 	 * @param uix
@@ -4188,6 +4271,7 @@ public class LibMatrixMult
 		private MatrixBlock _mW = null;
 		private MatrixBlock _mU = null;
 		private MatrixBlock _mV = null;
+		private MatrixBlock _mX = null;
 		private MatrixBlock _ret = null;
 		private WDivMMType _wt = null;
 		private int _rl = -1;
@@ -4196,12 +4280,13 @@ public class LibMatrixMult
 		private int _cu = -1;
 		private long _nnz = -1;
 		
-		protected MatrixMultWDivTask(MatrixBlock mW, MatrixBlock mU, MatrixBlock mV, MatrixBlock ret, WDivMMType wt, int rl, int ru, int cl, int cu) 
+		protected MatrixMultWDivTask(MatrixBlock mW, MatrixBlock mU, MatrixBlock mV, MatrixBlock mX, MatrixBlock ret, WDivMMType wt, int rl, int ru, int cl, int cu) 
 			throws DMLRuntimeException
 		{
 			_mW = mW;
 			_mU = mU;
 			_mV = mV;
+			_mX = mX;
 			_wt = wt;
 			_rl = rl;
 			_ru = ru;
@@ -4214,12 +4299,12 @@ public class LibMatrixMult
 		public Object call() throws DMLRuntimeException
 		{
 			//core weighted div mm computation
-			if( !_mW.sparse && !_mU.sparse && !_mV.sparse && !_mU.isEmptyBlock() && !_mV.isEmptyBlock() )
-				matrixMultWDivMMDense(_mW, _mU, _mV, _ret, _wt, _rl, _ru, _cl, _cu);
-			else if( _mW.sparse && !_mU.sparse && !_mV.sparse && !_mU.isEmptyBlock() && !_mV.isEmptyBlock())
-				matrixMultWDivMMSparseDense(_mW, _mU, _mV, _ret, _wt, _rl, _ru, _cl, _cu);
+			if( !_mW.sparse && !_mU.sparse && !_mV.sparse && (_mX==null || !_mX.sparse) && !_mU.isEmptyBlock() && !_mV.isEmptyBlock() )
+				matrixMultWDivMMDense(_mW, _mU, _mV, _mX, _ret, _wt, _rl, _ru, _cl, _cu);
+			else if( _mW.sparse && !_mU.sparse && !_mV.sparse && (_mX==null || _mX.sparse) && !_mU.isEmptyBlock() && !_mV.isEmptyBlock())
+				matrixMultWDivMMSparseDense(_mW, _mU, _mV, _mX, _ret, _wt, _rl, _ru, _cl, _cu);
 			else
-				matrixMultWDivMMGeneric(_mW, _mU, _mV, _ret, _wt, _rl, _ru, _cl, _cu);
+				matrixMultWDivMMGeneric(_mW, _mU, _mV, _mX, _ret, _wt, _rl, _ru, _cl, _cu);
 		
 			//maintain partial nnz for right (upper bounds inclusive)
 			int rl = _wt.isLeft() ? _cl : _rl;

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/401e982b/src/main/java/org/apache/sysml/runtime/matrix/data/MatrixBlock.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/matrix/data/MatrixBlock.java b/src/main/java/org/apache/sysml/runtime/matrix/data/MatrixBlock.java
index 23fdb85..8f47bc3 100644
--- a/src/main/java/org/apache/sysml/runtime/matrix/data/MatrixBlock.java
+++ b/src/main/java/org/apache/sysml/runtime/matrix/data/MatrixBlock.java
@@ -5791,10 +5791,11 @@ public class MatrixBlock extends MatrixValue implements Externalizable
 		}	
 		else if( qop.wtype3 != null ){ //wdivmm
 			//note: for wdivmm-minus X and W interchanged because W always present 
+			MatrixBlock W = qop.wtype3.hasFourInputs() ? checkType(wm) : null;
 			if( k > 1 )
-				LibMatrixMult.matrixMultWDivMM(X, U, V, R, qop.wtype3, k);
+				LibMatrixMult.matrixMultWDivMM(X, U, V, W, R, qop.wtype3, k);
 			else
-				LibMatrixMult.matrixMultWDivMM(X, U, V, R, qop.wtype3);	
+				LibMatrixMult.matrixMultWDivMM(X, U, V, W, R, qop.wtype3);	
 		}
 		else if( qop.wtype4 != null ){ //wcemm
 			if( k > 1 )

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/401e982b/src/main/java/org/apache/sysml/runtime/matrix/operators/QuaternaryOperator.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/matrix/operators/QuaternaryOperator.java b/src/main/java/org/apache/sysml/runtime/matrix/operators/QuaternaryOperator.java
index 5c1b6a2..78ec7af 100644
--- a/src/main/java/org/apache/sysml/runtime/matrix/operators/QuaternaryOperator.java
+++ b/src/main/java/org/apache/sysml/runtime/matrix/operators/QuaternaryOperator.java
@@ -96,4 +96,13 @@ public class QuaternaryOperator extends Operator
 		else
 			fn = Builtin.getBuiltinFnObject(op);
 	}
+	
+	/**
+	 * 
+	 * @return
+	 */
+	public boolean hasFourInputs() {
+		return (wtype1 != null && wtype1.hasFourInputs())
+			|| (wtype3 != null && wtype3.hasFourInputs());
+	}
 }

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/401e982b/src/test/java/org/apache/sysml/test/integration/functions/quaternary/WeightedDivMatrixMultTest.java
----------------------------------------------------------------------
diff --git a/src/test/java/org/apache/sysml/test/integration/functions/quaternary/WeightedDivMatrixMultTest.java b/src/test/java/org/apache/sysml/test/integration/functions/quaternary/WeightedDivMatrixMultTest.java
index 1239699..3704678 100644
--- a/src/test/java/org/apache/sysml/test/integration/functions/quaternary/WeightedDivMatrixMultTest.java
+++ b/src/test/java/org/apache/sysml/test/integration/functions/quaternary/WeightedDivMatrixMultTest.java
@@ -54,6 +54,8 @@ public class WeightedDivMatrixMultTest extends AutomatedTestBase
 	private final static String TEST_NAME5 = "WeightedDivMMMultRight";
 	private final static String TEST_NAME6 = "WeightedDivMMMultMinusLeft";
 	private final static String TEST_NAME7 = "WeightedDivMMMultMinusRight";
+	private final static String TEST_NAME8 = "WeightedDivMM4MultMinusLeft";
+	private final static String TEST_NAME9 = "WeightedDivMM4MultMinusRight";
 	private final static String TEST_DIR = "functions/quaternary/";
 	private final static String TEST_CLASS_DIR = TEST_DIR + WeightedDivMatrixMultTest.class.getSimpleName() + "/";
 	
@@ -76,6 +78,8 @@ public class WeightedDivMatrixMultTest extends AutomatedTestBase
 		addTestConfiguration(TEST_NAME5,new TestConfiguration(TEST_CLASS_DIR, TEST_NAME5,new String[]{"R"}));
 		addTestConfiguration(TEST_NAME6,new TestConfiguration(TEST_CLASS_DIR, TEST_NAME6,new String[]{"R"}));
 		addTestConfiguration(TEST_NAME7,new TestConfiguration(TEST_CLASS_DIR, TEST_NAME7,new String[]{"R"}));
+		addTestConfiguration(TEST_NAME8,new TestConfiguration(TEST_CLASS_DIR, TEST_NAME8,new String[]{"R"}));
+		addTestConfiguration(TEST_NAME9,new TestConfiguration(TEST_CLASS_DIR, TEST_NAME9,new String[]{"R"}));
 	
 		if (TEST_CACHE_ENABLED) {
 			setOutAndExpectedDeletionDisabled(true);
@@ -99,334 +103,329 @@ public class WeightedDivMatrixMultTest extends AutomatedTestBase
 	//a) testcases for wdivmm w/ DIVIDE LEFT/RIGHT
 	
 	@Test
-	public void testWeightedDivMMLeftDenseCP() 
-	{
+	public void testWeightedDivMMLeftDenseCP() {
 		runWeightedDivMMTest(TEST_NAME1, false, true, false, ExecType.CP);
 	}
 	
 	@Test
-	public void testWeightedDivMMLeftSparseCP() 
-	{
+	public void testWeightedDivMMLeftSparseCP() {
 		runWeightedDivMMTest(TEST_NAME1, true, true, false, ExecType.CP);
 	}
 	
 	@Test
-	public void testWeightedDivMMRightDenseCP() 
-	{
+	public void testWeightedDivMMRightDenseCP() {
 		runWeightedDivMMTest(TEST_NAME2, false, true, false, ExecType.CP);
 	}
 	
 	@Test
-	public void testWeightedDivMMRightSparseCP() 
-	{
+	public void testWeightedDivMMRightSparseCP() {
 		runWeightedDivMMTest(TEST_NAME2, true, true, false, ExecType.CP);
 	}
 	
 	@Test
-	public void testWeightedDivMMLeftDenseMR() 
-	{
+	public void testWeightedDivMMLeftDenseMR() {
 		runWeightedDivMMTest(TEST_NAME1, false, true, false, ExecType.MR);
 	}
 	
 	@Test
-	public void testWeightedDivMMLeftSparseMR() 
-	{
+	public void testWeightedDivMMLeftSparseMR() {
 		runWeightedDivMMTest(TEST_NAME1, true, true, false, ExecType.MR);
 	}
 	
 	@Test
-	public void testWeightedDivMMLeftDenseMRRep() 
-	{
+	public void testWeightedDivMMLeftDenseMRRep() {
 		runWeightedDivMMTest(TEST_NAME1, false, true, true, ExecType.MR);
 	}
 	
 	@Test
-	public void testWeightedDivMMRightDenseMR() 
-	{
+	public void testWeightedDivMMRightDenseMR() {
 		runWeightedDivMMTest(TEST_NAME2, false, true, false, ExecType.MR);
 	}
 	
 	@Test
-	public void testWeightedDivMMRightSparseMR() 
-	{
+	public void testWeightedDivMMRightSparseMR() {
 		runWeightedDivMMTest(TEST_NAME2, true, true, false, ExecType.MR);
 	}
 	
 	@Test
-	public void testWeightedDivMMRightDenseMRRep() 
-	{
+	public void testWeightedDivMMRightDenseMRRep() {
 		runWeightedDivMMTest(TEST_NAME2, false, true, true, ExecType.MR);
 	}
 	
 	@Test
-	public void testWeightedDivMMLeftDenseSP() 
-	{
+	public void testWeightedDivMMLeftDenseSP() {
 		runWeightedDivMMTest(TEST_NAME1, false, true, false, ExecType.SPARK);
 	}
 	
 	@Test
-	public void testWeightedDivMMLeftSparseSP() 
-	{
+	public void testWeightedDivMMLeftSparseSP() {
 		runWeightedDivMMTest(TEST_NAME1, true, true, false, ExecType.SPARK);
 	}
 	
 	@Test
-	public void testWeightedDivMMLeftDenseSPRep() 
-	{
+	public void testWeightedDivMMLeftDenseSPRep() {
 		runWeightedDivMMTest(TEST_NAME1, false, true, true, ExecType.SPARK);
 	}
 	
 	@Test
-	public void testWeightedDivMMRightDenseSP() 
-	{
+	public void testWeightedDivMMRightDenseSP() {
 		runWeightedDivMMTest(TEST_NAME2, false, true, false, ExecType.SPARK);
 	}
 	
 	@Test
-	public void testWeightedDivMMRightSparseSP() 
-	{
+	public void testWeightedDivMMRightSparseSP() {
 		runWeightedDivMMTest(TEST_NAME2, true, true, false, ExecType.SPARK);
 	}
 	
 	@Test
-	public void testWeightedDivMMRightDenseSPRep() 
-	{
+	public void testWeightedDivMMRightDenseSPRep() {
 		runWeightedDivMMTest(TEST_NAME2, false, true, true, ExecType.SPARK);
 	}
 
 	//b) testcases for wdivmm w/ MULTIPLY BASIC/LEFT/RIGHT
 	
 	@Test
-	public void testWeightedDivMMMultBasicDenseCP() 
-	{
+	public void testWeightedDivMMMultBasicDenseCP() {
 		runWeightedDivMMTest(TEST_NAME3, false, true, false, ExecType.CP);
 	}
 	
 	@Test
-	public void testWeightedDivMMMultBasicSparseCP() 
-	{
+	public void testWeightedDivMMMultBasicSparseCP() {
 		runWeightedDivMMTest(TEST_NAME3, true, true, false, ExecType.CP);
 	}
 	
 	@Test
-	public void testWeightedDivMMMultLeftDenseCP() 
-	{
+	public void testWeightedDivMMMultLeftDenseCP() {
 		runWeightedDivMMTest(TEST_NAME4, false, true, false, ExecType.CP);
 	}
 	
 	@Test
-	public void testWeightedDivMMMultLeftSparseCP() 
-	{
+	public void testWeightedDivMMMultLeftSparseCP() {
 		runWeightedDivMMTest(TEST_NAME4, true, true, false, ExecType.CP);
 	}
 	
 	@Test
-	public void testWeightedDivMMMultRightDenseCP() 
-	{
+	public void testWeightedDivMMMultRightDenseCP() {
 		runWeightedDivMMTest(TEST_NAME5, false, true, false, ExecType.CP);
 	}
 	
 	@Test
-	public void testWeightedDivMMMultRightSparseCP() 
-	{
+	public void testWeightedDivMMMultRightSparseCP() {
 		runWeightedDivMMTest(TEST_NAME5, true, true, false, ExecType.CP);
 	}
 	
 	@Test
-	public void testWeightedDivMMMultMinusLeftDenseCP() 
-	{
+	public void testWeightedDivMMMultMinusLeftDenseCP() {
 		runWeightedDivMMTest(TEST_NAME6, false, true, false, ExecType.CP);
 	}
 	
 	@Test
-	public void testWeightedDivMMMultMinusLeftSparseCP() 
-	{
+	public void testWeightedDivMMMultMinusLeftSparseCP() {
 		runWeightedDivMMTest(TEST_NAME6, true, true, false, ExecType.CP);
 	}
 	
 	@Test
-	public void testWeightedDivMMMultMinusRightDenseCP() 
-	{
+	public void testWeightedDivMMMultMinusRightDenseCP() {
 		runWeightedDivMMTest(TEST_NAME7, false, true, false, ExecType.CP);
 	}
 	
 	@Test
-	public void testWeightedDivMMMultMinusRightSparseCP() 
-	{
+	public void testWeightedDivMMMultMinusRightSparseCP() {
 		runWeightedDivMMTest(TEST_NAME7, true, true, false, ExecType.CP);
 	}
 	
 	@Test
-	public void testWeightedDivMMMultBasicDenseMR() 
-	{
+	public void testWeightedDivMM4MultMinusLeftDenseCP() {
+		runWeightedDivMMTest(TEST_NAME8, false, true, false, ExecType.CP);
+	}
+	
+	@Test
+	public void testWeightedDivMM4MultMinusLeftSparseCP() {
+		runWeightedDivMMTest(TEST_NAME8, true, true, false, ExecType.CP);
+	}
+	
+	@Test
+	public void testWeightedDivMM4MultMinusRightDenseCP() {
+		runWeightedDivMMTest(TEST_NAME9, false, true, false, ExecType.CP);
+	}
+	
+	@Test
+	public void testWeightedDivMM4MultMinusRightSparseCP() {
+		runWeightedDivMMTest(TEST_NAME9, true, true, false, ExecType.CP);
+	}
+	
+	@Test
+	public void testWeightedDivMMMultBasicDenseMR() {
 		runWeightedDivMMTest(TEST_NAME3, false, true, false, ExecType.MR);
 	}
 	
 	@Test
-	public void testWeightedDivMMMultBasicSparseMR() 
-	{
+	public void testWeightedDivMMMultBasicSparseMR() {
 		runWeightedDivMMTest(TEST_NAME3, true, true, false, ExecType.MR);
 	}
 	
 	@Test
-	public void testWeightedDivMMMultBasicDenseMRRep() 
-	{
+	public void testWeightedDivMMMultBasicDenseMRRep() {
 		runWeightedDivMMTest(TEST_NAME3, false, true, true, ExecType.MR);
 	}
 	
 	@Test
-	public void testWeightedDivMMMultLeftDenseMR() 
-	{
+	public void testWeightedDivMMMultLeftDenseMR() {
 		runWeightedDivMMTest(TEST_NAME4, false, true, false, ExecType.MR);
 	}
 	
 	@Test
-	public void testWeightedDivMMMultLeftSparseMR() 
-	{
+	public void testWeightedDivMMMultLeftSparseMR() {
 		runWeightedDivMMTest(TEST_NAME4, true, true, false, ExecType.MR);
 	}
 	
 	@Test
-	public void testWeightedDivMMMultLeftDenseMRRep() 
-	{
+	public void testWeightedDivMMMultLeftDenseMRRep() {
 		runWeightedDivMMTest(TEST_NAME4, false, true, true, ExecType.MR);
 	}
 	
 	@Test
-	public void testWeightedDivMMMultRightDenseMR() 
-	{
+	public void testWeightedDivMMMultRightDenseMR() {
 		runWeightedDivMMTest(TEST_NAME5, false, true, false, ExecType.MR);
 	}
 	
 	@Test
-	public void testWeightedDivMMMultRightSparseMR() 
-	{
+	public void testWeightedDivMMMultRightSparseMR() {
 		runWeightedDivMMTest(TEST_NAME5, true, true, false, ExecType.MR);
 	}
 	
 	@Test
-	public void testWeightedDivMMMultRightDenseMRRep() 
-	{
+	public void testWeightedDivMMMultRightDenseMRRep() {
 		runWeightedDivMMTest(TEST_NAME5, false, true, true, ExecType.MR);
 	}
 	
 	@Test
-	public void testWeightedDivMMMultMinusLeftDenseMR() 
-	{
+	public void testWeightedDivMMMultMinusLeftDenseMR() {
 		runWeightedDivMMTest(TEST_NAME6, false, true, false, ExecType.MR);
 	}
 	
 	@Test
-	public void testWeightedDivMMMultMinusLeftSparseMR() 
-	{
+	public void testWeightedDivMMMultMinusLeftSparseMR() {
 		runWeightedDivMMTest(TEST_NAME6, true, true, false, ExecType.MR);
 	}
 	
 	@Test
-	public void testWeightedDivMMMultMinusLeftDenseMRRep() 
-	{
+	public void testWeightedDivMMMultMinusLeftDenseMRRep() {
 		runWeightedDivMMTest(TEST_NAME6, false, true, true, ExecType.MR);
 	}
 	
 	@Test
-	public void testWeightedDivMMMultMinusRightDenseMR() 
-	{
+	public void testWeightedDivMMMultMinusRightDenseMR() {
 		runWeightedDivMMTest(TEST_NAME7, false, true, false, ExecType.MR);
 	}
 	
 	@Test
-	public void testWeightedDivMMMultMinusRightSparseMR() 
-	{
+	public void testWeightedDivMMMultMinusRightSparseMR() {
 		runWeightedDivMMTest(TEST_NAME7, true, true, false, ExecType.MR);
 	}
 	
 	@Test
-	public void testWeightedDivMMMultMinusRightDenseMRRep() 
-	{
+	public void testWeightedDivMMMultMinusRightDenseMRRep() {
 		runWeightedDivMMTest(TEST_NAME7, false, true, true, ExecType.MR);
 	}
 	
 	@Test
-	public void testWeightedDivMMMultBasicDenseSP() 
-	{
+	public void testWeightedDivMM4MultMinusLeftDenseMR() {
+		runWeightedDivMMTest(TEST_NAME8, false, true, false, ExecType.MR);
+	}
+	
+	@Test
+	public void testWeightedDivMM4MultMinusLeftSparseMR() {
+		runWeightedDivMMTest(TEST_NAME8, true, true, false, ExecType.MR);
+	}
+	
+	@Test
+	public void testWeightedDivMM4MultMinusLeftDenseMRRep() {
+		runWeightedDivMMTest(TEST_NAME8, false, true, true, ExecType.MR);
+	}
+	
+	@Test
+	public void testWeightedDivMM4MultMinusRightDenseMR() {
+		runWeightedDivMMTest(TEST_NAME9, false, true, false, ExecType.MR);
+	}
+	
+	@Test
+	public void testWeightedDivMM4MultMinusRightSparseMR() {
+		runWeightedDivMMTest(TEST_NAME9, true, true, false, ExecType.MR);
+	}
+	
+	@Test
+	public void testWeightedDivMM4MultMinusRightDenseMRRep() {
+		runWeightedDivMMTest(TEST_NAME9, false, true, true, ExecType.MR);
+	}
+	
+	@Test
+	public void testWeightedDivMMMultBasicDenseSP() {
 		runWeightedDivMMTest(TEST_NAME3, false, true, false, ExecType.SPARK);
 	}
 	
 	@Test
-	public void testWeightedDivMMMultBasicSparseSP() 
-	{
+	public void testWeightedDivMMMultBasicSparseSP() {
 		runWeightedDivMMTest(TEST_NAME3, true, true, false, ExecType.SPARK);
 	}
 	
 	@Test
-	public void testWeightedDivMMMultBasicDenseSPRep() 
-	{
+	public void testWeightedDivMMMultBasicDenseSPRep() {
 		runWeightedDivMMTest(TEST_NAME3, false, true, true, ExecType.SPARK);
 	}
 	
 	@Test
-	public void testWeightedDivMMMultLeftDenseSP() 
-	{
+	public void testWeightedDivMMMultLeftDenseSP() {
 		runWeightedDivMMTest(TEST_NAME4, false, true, false, ExecType.SPARK);
 	}
 	
 	@Test
-	public void testWeightedDivMMMultLeftSparseSP() 
-	{
+	public void testWeightedDivMMMultLeftSparseSP() {
 		runWeightedDivMMTest(TEST_NAME4, true, true, false, ExecType.SPARK);
 	}
 	
 	@Test
-	public void testWeightedDivMMMultLeftDenseSPRep() 
-	{
+	public void testWeightedDivMMMultLeftDenseSPRep() {
 		runWeightedDivMMTest(TEST_NAME4, false, true, true, ExecType.SPARK);
 	}
 	
 	@Test
-	public void testWeightedDivMMMultRightDenseSP() 
-	{
+	public void testWeightedDivMMMultRightDenseSP() {
 		runWeightedDivMMTest(TEST_NAME5, false, true, false, ExecType.SPARK);
 	}
 	
 	@Test
-	public void testWeightedDivMMMultRightSparseSP() 
-	{
+	public void testWeightedDivMMMultRightSparseSP() {
 		runWeightedDivMMTest(TEST_NAME5, true, true, false, ExecType.SPARK);
 	}
 	
 	@Test
-	public void testWeightedDivMMMultRightDenseSPRep() 
-	{
+	public void testWeightedDivMMMultRightDenseSPRep() {
 		runWeightedDivMMTest(TEST_NAME5, false, true, true, ExecType.SPARK);
 	}
 	
 	@Test
-	public void testWeightedDivMMMultMinusLeftDenseSP() 
-	{
+	public void testWeightedDivMMMultMinusLeftDenseSP() {
 		runWeightedDivMMTest(TEST_NAME6, false, true, false, ExecType.SPARK);
 	}
 	
 	@Test
-	public void testWeightedDivMMMultMinusLeftSparseSP() 
-	{
+	public void testWeightedDivMMMultMinusLeftSparseSP() {
 		runWeightedDivMMTest(TEST_NAME6, true, true, false, ExecType.SPARK);
 	}
 	
 	@Test
-	public void testWeightedDivMMMultMinusLeftDenseSPRep() 
-	{
+	public void testWeightedDivMMMultMinusLeftDenseSPRep() {
 		runWeightedDivMMTest(TEST_NAME6, false, true, true, ExecType.SPARK);
 	}
 	
 	@Test
-	public void testWeightedDivMMMultMinusRightDenseSP() 
-	{
+	public void testWeightedDivMMMultMinusRightDenseSP() {
 		runWeightedDivMMTest(TEST_NAME7, false, true, false, ExecType.SPARK);
 	}
 	
 	@Test
-	public void testWeightedDivMMMultMinusRightSparseSP() 
-	{
+	public void testWeightedDivMMMultMinusRightSparseSP() {
 		runWeightedDivMMTest(TEST_NAME7, true, true, false, ExecType.SPARK);
 	}
 	
@@ -435,6 +434,36 @@ public class WeightedDivMatrixMultTest extends AutomatedTestBase
 	{
 		runWeightedDivMMTest(TEST_NAME7, false, true, true, ExecType.SPARK);
 	}
+	// Scripts TEST_NAME8 (...4MultMinusLeft) and TEST_NAME9 (...4MultMinusRight) on ExecType.SPARK: dense, sparse, and "Rep" variants.
+	@Test
+	public void testWeightedDivMM4MultMinusLeftDenseSP() {
+		runWeightedDivMMTest(TEST_NAME8, false, true, false, ExecType.SPARK);
+	}
+	
+	@Test
+	public void testWeightedDivMM4MultMinusLeftSparseSP() {
+		runWeightedDivMMTest(TEST_NAME8, true, true, false, ExecType.SPARK);
+	}
+	
+	@Test
+	public void testWeightedDivMM4MultMinusLeftDenseSPRep() {
+		runWeightedDivMMTest(TEST_NAME8, false, true, true, ExecType.SPARK);
+	}
+	
+	@Test
+	public void testWeightedDivMM4MultMinusRightDenseSP()  {
+		runWeightedDivMMTest(TEST_NAME9, false, true, false, ExecType.SPARK);
+	}
+	
+	@Test
+	public void testWeightedDivMM4MultMinusRightSparseSP() {
+		runWeightedDivMMTest(TEST_NAME9, true, true, false, ExecType.SPARK);
+	}
+	
+	@Test
+	public void testWeightedDivMM4MultMinusRightDenseSPRep() {
+		runWeightedDivMMTest(TEST_NAME9, false, true, true, ExecType.SPARK);
+	}
 	
 	/**
 	 * 
@@ -464,18 +493,14 @@ public class WeightedDivMatrixMultTest extends AutomatedTestBase
 		try
 		{
 			boolean basic = testname.equals(TEST_NAME3);
-			boolean left = testname.equals(TEST_NAME1) || testname.equals(TEST_NAME4) || testname.equals(TEST_NAME6);
+			boolean left = testname.equals(TEST_NAME1) || testname.equals(TEST_NAME4) 
+					|| testname.equals(TEST_NAME6) || testname.equals(TEST_NAME8);
 			double sparsity = (sparse) ? spSparse : spDense;
 			String TEST_NAME = testname;
+			String TEST_CACHE_DIR = TEST_CACHE_ENABLED ? // "<testname>_<sparsity>/" if enabled, else ""
+					TEST_NAME + "_" + sparsity + "/" : "";
 			
 			TestConfiguration config = getTestConfiguration(TEST_NAME);
-			
-			String TEST_CACHE_DIR = "";
-			if (TEST_CACHE_ENABLED)
-			{
-				TEST_CACHE_DIR = TEST_NAME + "_" + sparsity + "/";
-			}
-			
 			loadTestConfiguration(config, TEST_CACHE_DIR);
 			
 			// This is for running the junit test the new way, i.e., construct the arguments directly
@@ -509,7 +534,8 @@ public class WeightedDivMatrixMultTest extends AutomatedTestBase
 				Assert.assertTrue("Missing opcode wdivmm", Statistics.getCPHeavyHitterOpCodes().contains(WeightedDivMM.OPCODE_CP));
 			}
 			else if( instType == ExecType.SPARK && rewrites ) {
-				String opcode = Instruction.SP_INST_PREFIX + ((rep)?WeightedDivMMR.OPCODE:WeightedDivMM.OPCODE);
+				boolean reduce = rep || testname.equals(TEST_NAME8) || testname.equals(TEST_NAME9);
+				String opcode = Instruction.SP_INST_PREFIX + ((reduce)?WeightedDivMMR.OPCODE:WeightedDivMM.OPCODE);
 				Assert.assertTrue("Missing opcode sp_wdivmm", Statistics.getCPHeavyHitterOpCodes().contains(opcode) );
 			}
 		}

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/401e982b/src/test/scripts/functions/quaternary/WeightedDivMM4MultMinusLeft.R
----------------------------------------------------------------------
diff --git a/src/test/scripts/functions/quaternary/WeightedDivMM4MultMinusLeft.R b/src/test/scripts/functions/quaternary/WeightedDivMM4MultMinusLeft.R
new file mode 100644
index 0000000..654c625
--- /dev/null
+++ b/src/test/scripts/functions/quaternary/WeightedDivMM4MultMinusLeft.R
@@ -0,0 +1,37 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+# 
+#   http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+
+args <- commandArgs(TRUE)  # args[1]: path prefix of the inputs, args[2]: path prefix of the output
+options(digits=22)
+
+library("Matrix")
+# Read W, U, and V from Matrix Market files below args[1] and convert them to dense matrices.
+W = as.matrix(readMM(paste(args[1], "W.mtx", sep="")))
+U = as.matrix(readMM(paste(args[1], "U.mtx", sep="")))
+V = as.matrix(readMM(paste(args[1], "V.mtx", sep="")))
+# X is derived from W by a constant scaling; R is the four-operand expression over W, U, V, X.
+X = W/0.7;
+R = t(t(U) %*% (W*(U%*%t(V)-X)));
+# Write R as a sparse (CsparseMatrix) Matrix Market file named "R" below args[2].
+writeMM(as(R, "CsparseMatrix"), paste(args[2], "R", sep="")); 
+
+

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/401e982b/src/test/scripts/functions/quaternary/WeightedDivMM4MultMinusLeft.dml
----------------------------------------------------------------------
diff --git a/src/test/scripts/functions/quaternary/WeightedDivMM4MultMinusLeft.dml b/src/test/scripts/functions/quaternary/WeightedDivMM4MultMinusLeft.dml
new file mode 100644
index 0000000..87c035f
--- /dev/null
+++ b/src/test/scripts/functions/quaternary/WeightedDivMM4MultMinusLeft.dml
@@ -0,0 +1,32 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+# 
+#   http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+
+
+W = read($1);  # inputs W, U, V are read from script arguments $1..$3
+U = read($2);
+V = read($3);
+# X is derived from W by a constant scaling.
+X = W/0.7;
+if(1==1){}  # NOTE(review): empty branch; presumably separates X from the expression below - confirm
+R = t(t(U) %*% (W*(U%*%t(V)-X)));
+# The result R is written to the location given by $4.
+write(R, $4);

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/401e982b/src/test/scripts/functions/quaternary/WeightedDivMM4MultMinusRight.R
----------------------------------------------------------------------
diff --git a/src/test/scripts/functions/quaternary/WeightedDivMM4MultMinusRight.R b/src/test/scripts/functions/quaternary/WeightedDivMM4MultMinusRight.R
new file mode 100644
index 0000000..b300b59
--- /dev/null
+++ b/src/test/scripts/functions/quaternary/WeightedDivMM4MultMinusRight.R
@@ -0,0 +1,38 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+# 
+#   http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+
+args <- commandArgs(TRUE)  # args[1]: path prefix of the inputs, args[2]: path prefix of the output
+options(digits=22)
+
+library("Matrix")
+# Read W, U, and V from Matrix Market files below args[1] and convert them to dense matrices.
+W = as.matrix(readMM(paste(args[1], "W.mtx", sep="")))
+U = as.matrix(readMM(paste(args[1], "U.mtx", sep="")))
+V = as.matrix(readMM(paste(args[1], "V.mtx", sep="")))
+# X is derived from W by a constant scaling; R is the four-operand expression over W, U, V, X.
+X = W/0.3
+R = (W*(U%*%t(V)-X)) %*% V;
+
+# Write R as a sparse (CsparseMatrix) Matrix Market file named "R" below args[2].
+writeMM(as(R, "CsparseMatrix"), paste(args[2], "R", sep="")); 
+
+

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/401e982b/src/test/scripts/functions/quaternary/WeightedDivMM4MultMinusRight.dml
----------------------------------------------------------------------
diff --git a/src/test/scripts/functions/quaternary/WeightedDivMM4MultMinusRight.dml b/src/test/scripts/functions/quaternary/WeightedDivMM4MultMinusRight.dml
new file mode 100644
index 0000000..7376abd
--- /dev/null
+++ b/src/test/scripts/functions/quaternary/WeightedDivMM4MultMinusRight.dml
@@ -0,0 +1,32 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+# 
+#   http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+
+
+W = read($1);  # inputs W, U, V are read from script arguments $1..$3
+U = read($2);
+V = read($3);
+# X is derived from W by a constant scaling.
+X = W/0.3
+if(1==1){}  # NOTE(review): empty branch; presumably separates X from the expression below - confirm
+R = (W*(U%*%t(V)-X)) %*% V;
+# The result R is written to the location given by $4.
+write(R, $4);