You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@systemml.apache.org by ni...@apache.org on 2017/08/22 21:57:01 UTC

systemml git commit: [SYSTEMML-445] Integrate GPU exectype selection into our existing infrastructure

Repository: systemml
Updated Branches:
  refs/heads/master 4d5a82ecf -> 3ca053535


[SYSTEMML-445] Integrate GPU exectype selection into our existing infrastructure

Closes #627.


Project: http://git-wip-us.apache.org/repos/asf/systemml/repo
Commit: http://git-wip-us.apache.org/repos/asf/systemml/commit/3ca05353
Tree: http://git-wip-us.apache.org/repos/asf/systemml/tree/3ca05353
Diff: http://git-wip-us.apache.org/repos/asf/systemml/diff/3ca05353

Branch: refs/heads/master
Commit: 3ca05353593e7847dc6d6a7e862e323ffa96bfcc
Parents: 4d5a82e
Author: Niketan Pansare <np...@us.ibm.com>
Authored: Tue Aug 22 14:55:37 2017 -0700
Committer: Niketan Pansare <np...@us.ibm.com>
Committed: Tue Aug 22 14:55:37 2017 -0700

----------------------------------------------------------------------
 .../java/org/apache/sysml/hops/AggBinaryOp.java | 53 +++++++++-----
 .../java/org/apache/sysml/hops/AggUnaryOp.java  | 53 ++++++++------
 .../java/org/apache/sysml/hops/BinaryOp.java    | 77 +++++++++++++-------
 .../org/apache/sysml/hops/ConvolutionOp.java    | 12 ++-
 .../java/org/apache/sysml/hops/DataGenOp.java   |  6 ++
 src/main/java/org/apache/sysml/hops/DataOp.java |  5 ++
 .../java/org/apache/sysml/hops/FunctionOp.java  |  5 ++
 src/main/java/org/apache/sysml/hops/Hop.java    | 32 +++++---
 .../java/org/apache/sysml/hops/IndexingOp.java  |  5 ++
 .../org/apache/sysml/hops/LeftIndexingOp.java   |  5 ++
 .../java/org/apache/sysml/hops/LiteralOp.java   |  5 ++
 .../java/org/apache/sysml/hops/MultipleOp.java  |  5 ++
 .../sysml/hops/ParameterizedBuiltinOp.java      |  5 ++
 .../org/apache/sysml/hops/QuaternaryOp.java     |  5 ++
 .../java/org/apache/sysml/hops/ReorgOp.java     | 34 +++++++--
 .../java/org/apache/sysml/hops/TernaryOp.java   | 30 +++++---
 .../java/org/apache/sysml/hops/UnaryOp.java     | 38 +++++++---
 .../apache/sysml/hops/codegen/SpoofFusedOp.java |  5 ++
 18 files changed, 278 insertions(+), 102 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/systemml/blob/3ca05353/src/main/java/org/apache/sysml/hops/AggBinaryOp.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/hops/AggBinaryOp.java b/src/main/java/org/apache/sysml/hops/AggBinaryOp.java
index 4f709b4..11a2399 100644
--- a/src/main/java/org/apache/sysml/hops/AggBinaryOp.java
+++ b/src/main/java/org/apache/sysml/hops/AggBinaryOp.java
@@ -48,7 +48,6 @@ import org.apache.sysml.parser.Expression.DataType;
 import org.apache.sysml.parser.Expression.ValueType;
 import org.apache.sysml.runtime.controlprogram.ParForProgramBlock.PDataPartitionFormat;
 import org.apache.sysml.runtime.controlprogram.context.SparkExecutionContext;
-import org.apache.sysml.runtime.instructions.gpu.context.GPUContextPool;
 import org.apache.sysml.runtime.matrix.MatrixCharacteristics;
 import org.apache.sysml.runtime.matrix.data.MatrixBlock;
 import org.apache.sysml.runtime.matrix.mapred.DistributedCacheInput;
@@ -143,6 +142,33 @@ public class AggBinaryOp extends Hop implements MultiThreadedHop
 		return _method;
 	}
 	
+	@Override
+	public boolean isGPUEnabled() {
+		if(!DMLScript.USE_ACCELERATOR)
+			return false;
+		
+		Hop input1 = getInput().get(0);
+		Hop input2 = getInput().get(1);
+		//matrix mult operation selection part 2 (specific pattern)
+		MMTSJType mmtsj = checkTransposeSelf(); //determine tsmm pattern
+		ChainType chain = checkMapMultChain(); //determine mmchain pattern
+		
+		_method = optFindMMultMethodCP ( input1.getDim1(), input1.getDim2(),   
+			      input2.getDim1(), input2.getDim2(), mmtsj, chain, _hasLeftPMInput );
+		switch( _method ){
+			case TSMM: 
+				return true;
+			case MAPMM_CHAIN:
+				return false;
+			case PMM:
+				return false;
+			case MM:
+				return true;
+			default:
+				throw new RuntimeException("Unsupported method:" + _method);
+		}
+	}
+	
 	/**
 	 * NOTE: overestimated mem in case of transpose-identity matmult, but 3/2 at worst
 	 *       and existing mem estimate advantageous in terms of consistency hops/lops,
@@ -169,7 +195,7 @@ public class AggBinaryOp extends Hop implements MultiThreadedHop
 			MMTSJType mmtsj = checkTransposeSelf(); //determine tsmm pattern
 			ChainType chain = checkMapMultChain(); //determine mmchain pattern
 			
-			if( et == ExecType.CP ) 
+			if( et == ExecType.CP || et == ExecType.GPU ) 
 			{
 				//matrix mult operation selection part 3 (CP type)
 				_method = optFindMMultMethodCP ( input1.getDim1(), input1.getDim2(),   
@@ -178,7 +204,7 @@ public class AggBinaryOp extends Hop implements MultiThreadedHop
 				//dispatch CP lops construction 
 				switch( _method ){
 					case TSMM: 
-						constructCPLopsTSMM( mmtsj );
+						constructCPLopsTSMM( mmtsj, et );
 						break;
 					case MAPMM_CHAIN:
 						constructCPLopsMMChain( chain );
@@ -187,7 +213,7 @@ public class AggBinaryOp extends Hop implements MultiThreadedHop
 						constructCPLopsPMM();
 						break;
 					case MM:
-						constructCPLopsMM();
+						constructCPLopsMM(et);
 						break;
 					default:
 						throw new HopsException(this.printErrorLocation() + "Invalid Matrix Mult Method (" + _method + ") while constructing CP lops.");
@@ -344,7 +370,7 @@ public class AggBinaryOp extends Hop implements MultiThreadedHop
 	{
 		double ret = 0;
 
-		if (DMLScript.USE_ACCELERATOR) {
+		if (isGPUEnabled()) {
 			// In GPU Mode, intermediate memory is only needed in case of one of the matrix blocks is sparse
 			// When sparse block is converted to dense and a dense MM takes place, we need (dim1 * dim2)
 			// When dense block is converted to sparse and a sparse MM takes place, we need (dim1 * dim2 * 2)
@@ -581,17 +607,11 @@ public class AggBinaryOp extends Hop implements MultiThreadedHop
 	// CP Lops generation
 	/////////////////////////
 	
-	private void constructCPLopsTSMM( MMTSJType mmtsj ) 
+	private void constructCPLopsTSMM( MMTSJType mmtsj, ExecType et ) 
 		throws HopsException, LopsException
 	{
 		int k = OptimizerUtils.getConstrainedNumThreads(_maxNumThreads);
 		
-		ExecType et = ExecType.CP;
-		if (DMLScript.USE_ACCELERATOR && (DMLScript.FORCE_ACCELERATOR
-				|| getMemEstimate() < Math.min(GPUContextPool.initialGPUMemBudget(), OptimizerUtils.getLocalMemBudget()))) {
-			et = ExecType.GPU;
-		}
-		
 		Lop matmultCP = new MMTSJ(getInput().get(mmtsj.isLeft()?1:0).constructLops(),
 				                 getDataType(), getValueType(), et, mmtsj, false, k);
 	
@@ -662,13 +682,12 @@ public class AggBinaryOp extends Hop implements MultiThreadedHop
 		HopRewriteUtils.removeChildReference(pmInput, nrow);
 	}
 
-	private void constructCPLopsMM() 
+	private void constructCPLopsMM(ExecType et) 
 		throws HopsException, LopsException
 	{	
 		Lop matmultCP = null;
 
-		if (DMLScript.USE_ACCELERATOR && (DMLScript.FORCE_ACCELERATOR
-				|| getMemEstimate() < Math.min(GPUContextPool.initialGPUMemBudget(), OptimizerUtils.getLocalMemBudget()))) {
+		if (et == ExecType.GPU) {
 			Hop h1 = getInput().get(0);
 			Hop h2 = getInput().get(1);
 			Lop left; Lop right;
@@ -691,7 +710,7 @@ public class AggBinaryOp extends Hop implements MultiThreadedHop
 			}
 			
 			matmultCP = new Binary(left, right, 
-									 Binary.OperationTypes.MATMULT, getDataType(), getValueType(), ExecType.GPU, isLeftTransposed, isRightTransposed);
+									 Binary.OperationTypes.MATMULT, getDataType(), getValueType(), et, isLeftTransposed, isRightTransposed);
 			setOutputDimensions(matmultCP);
 			setNnz(-1);
 		}
@@ -702,7 +721,7 @@ public class AggBinaryOp extends Hop implements MultiThreadedHop
 			else { 
 				int k = OptimizerUtils.getConstrainedNumThreads(_maxNumThreads);
 				matmultCP = new Binary(getInput().get(0).constructLops(),getInput().get(1).constructLops(), 
-										 Binary.OperationTypes.MATMULT, getDataType(), getValueType(), ExecType.CP, k);
+										 Binary.OperationTypes.MATMULT, getDataType(), getValueType(), et, k);
 			}
 			setOutputDimensions(matmultCP);
 		}

http://git-wip-us.apache.org/repos/asf/systemml/blob/3ca05353/src/main/java/org/apache/sysml/hops/AggUnaryOp.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/hops/AggUnaryOp.java b/src/main/java/org/apache/sysml/hops/AggUnaryOp.java
index 7a6d463..4f5e2bc 100644
--- a/src/main/java/org/apache/sysml/hops/AggUnaryOp.java
+++ b/src/main/java/org/apache/sysml/hops/AggUnaryOp.java
@@ -38,7 +38,6 @@ import org.apache.sysml.lops.LopProperties.ExecType;
 import org.apache.sysml.parser.Expression.DataType;
 import org.apache.sysml.parser.Expression.ValueType;
 import org.apache.sysml.runtime.controlprogram.context.SparkExecutionContext;
-import org.apache.sysml.runtime.instructions.gpu.context.GPUContextPool;
 import org.apache.sysml.runtime.matrix.MatrixCharacteristics;
 
 
@@ -109,6 +108,30 @@ public class AggUnaryOp extends Hop implements MultiThreadedHop
 	}
 	
 	@Override
+	public boolean isGPUEnabled() {
+		if(!DMLScript.USE_ACCELERATOR)
+			return false;
+		
+		try {
+			if( isTernaryAggregateRewriteApplicable() || isUnaryAggregateOuterCPRewriteApplicable() ) {
+				return false;
+			}
+			else if ((_op == AggOp.SUM    && (_direction == Direction.RowCol || _direction == Direction.Row || _direction == Direction.Col))
+					 || (_op == AggOp.SUM_SQ && (_direction == Direction.RowCol || _direction == Direction.Row || _direction == Direction.Col))
+					 || (_op == AggOp.MAX    && (_direction == Direction.RowCol || _direction == Direction.Row || _direction == Direction.Col))
+					 || (_op == AggOp.MIN    && (_direction == Direction.RowCol || _direction == Direction.Row || _direction == Direction.Col))
+					 || (_op == AggOp.MEAN   && (_direction == Direction.RowCol || _direction == Direction.Row || _direction == Direction.Col))
+					 || (_op == AggOp.VAR    && (_direction == Direction.RowCol || _direction == Direction.Row || _direction == Direction.Col))
+					 || (_op == AggOp.PROD   && (_direction == Direction.RowCol))){
+				return true;
+			}
+		} catch (HopsException e) {
+			throw new RuntimeException(e);
+		}
+		return false;
+	}
+	
+	@Override
 	public Lop constructLops()
 		throws HopsException, LopsException 
 	{	
@@ -121,10 +144,10 @@ public class AggUnaryOp extends Hop implements MultiThreadedHop
 			ExecType et = optFindExecType();
 			Hop input = getInput().get(0);
 			
-			if ( et == ExecType.CP ) 
+			if ( et == ExecType.CP || et == ExecType.GPU ) 
 			{
 				Lop agg1 = null;
-				if( isTernaryAggregateRewriteApplicable(et) ) {
+				if( isTernaryAggregateRewriteApplicable() ) {
 					agg1 = constructLopsTernaryAggregateRewrite(et);
 				}
 				else if( isUnaryAggregateOuterCPRewriteApplicable() )
@@ -149,20 +172,6 @@ public class AggUnaryOp extends Hop implements MultiThreadedHop
 				}				
 				else { //general case		
 					int k = OptimizerUtils.getConstrainedNumThreads(_maxNumThreads);
-					if (DMLScript.USE_ACCELERATOR && (DMLScript.FORCE_ACCELERATOR
-							|| getMemEstimate() < Math.min(GPUContextPool.initialGPUMemBudget(), OptimizerUtils.getLocalMemBudget()))) {
-						// Only implemented methods for GPU
-						if ((_op == AggOp.SUM    && (_direction == Direction.RowCol || _direction == Direction.Row || _direction == Direction.Col))
-						 || (_op == AggOp.SUM_SQ && (_direction == Direction.RowCol || _direction == Direction.Row || _direction == Direction.Col))
-						 || (_op == AggOp.MAX    && (_direction == Direction.RowCol || _direction == Direction.Row || _direction == Direction.Col))
-						 || (_op == AggOp.MIN    && (_direction == Direction.RowCol || _direction == Direction.Row || _direction == Direction.Col))
-						 || (_op == AggOp.MEAN   && (_direction == Direction.RowCol || _direction == Direction.Row || _direction == Direction.Col))
-						 || (_op == AggOp.VAR    && (_direction == Direction.RowCol || _direction == Direction.Row || _direction == Direction.Col))
-						 || (_op == AggOp.PROD   && (_direction == Direction.RowCol))){
-							et = ExecType.GPU;
-							k = 1;
-						}
-					}
 					agg1 = new PartialAggregate(input.constructLops(), 
 							HopsAgg2Lops.get(_op), HopsDirection2Lops.get(_direction), getDataType(),getValueType(), et, k);
 				}
@@ -251,7 +260,7 @@ public class AggUnaryOp extends Hop implements MultiThreadedHop
 				DirectionTypes dir = HopsDirection2Lops.get(_direction);
 
 				//unary aggregate
-				if( isTernaryAggregateRewriteApplicable(et) ) 
+				if( isTernaryAggregateRewriteApplicable() ) 
 				{
 					Lop aggregate = constructLopsTernaryAggregateRewrite(et);
 					setOutputDimensions(aggregate); //0x0 (scalar)
@@ -330,7 +339,7 @@ public class AggUnaryOp extends Hop implements MultiThreadedHop
 	protected double computeOutputMemEstimate( long dim1, long dim2, long nnz )
 	{
 		double sparsity = -1;
-		if (DMLScript.USE_ACCELERATOR) {
+		if (isGPUEnabled()) {
 			// The GPU version (for the time being) only does dense outputs
 			sparsity = 1.0;
 		} else {
@@ -373,7 +382,7 @@ public class AggUnaryOp extends Hop implements MultiThreadedHop
 				break;
 			case VAR:
 				//worst-case correction LASTFOURROWS / LASTFOURCOLUMNS
-				if (DMLScript.USE_ACCELERATOR) {
+				if (isGPUEnabled()) {
 					// The GPU implementation only operates on dense data
 					// It allocates 2 dense blocks to help with these ops:
 					// Assume Y = var(X) Or colVars(X), Or rowVars(X)
@@ -506,7 +515,7 @@ public class AggUnaryOp extends Hop implements MultiThreadedHop
 			return SparkAggType.MULTI_BLOCK;
 	}
 
-	private boolean isTernaryAggregateRewriteApplicable(ExecType et) 
+	private boolean isTernaryAggregateRewriteApplicable() 
 		throws HopsException 
 	{
 		boolean ret = false;
@@ -726,6 +735,8 @@ public class AggUnaryOp extends Hop implements MultiThreadedHop
 		// The execution type of a unary aggregate instruction should depend on the execution type of inputs to avoid OOM
 		// Since we only support matrix-vector and not vector-matrix, checking the execution type of input1 should suffice.
 		ExecType et_input = input1.optFindExecType();
+		// Ternary aggregates are not supported on GPU, so fall back to CP in that case
+		et_input = et_input == ExecType.GPU ? ExecType.CP :  et_input;
 		DirectionTypes dir = HopsDirection2Lops.get(_direction);
 		
 		return new TernaryAggregate(in1, in2, in3, Aggregate.OperationTypes.KahanSum, 

http://git-wip-us.apache.org/repos/asf/systemml/blob/3ca05353/src/main/java/org/apache/sysml/hops/BinaryOp.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/hops/BinaryOp.java b/src/main/java/org/apache/sysml/hops/BinaryOp.java
index 54c06f7..ad9f0ad 100644
--- a/src/main/java/org/apache/sysml/hops/BinaryOp.java
+++ b/src/main/java/org/apache/sysml/hops/BinaryOp.java
@@ -53,7 +53,6 @@ import org.apache.sysml.lops.UnaryCP;
 import org.apache.sysml.parser.Expression.DataType;
 import org.apache.sysml.parser.Expression.ValueType;
 import org.apache.sysml.runtime.controlprogram.ParForProgramBlock.PDataPartitionFormat;
-import org.apache.sysml.runtime.instructions.gpu.context.GPUContextPool;
 import org.apache.sysml.runtime.matrix.MatrixCharacteristics;
 import org.apache.sysml.runtime.matrix.mapred.DistributedCacheInput;
 
@@ -134,6 +133,56 @@ public class BinaryOp extends Hop
 	}
 	
 	@Override
+	public boolean isGPUEnabled() {
+		if(!DMLScript.USE_ACCELERATOR)
+			return false;
+		
+		switch(op) 
+		{
+			case IQM:
+			case CENTRALMOMENT:
+			case COVARIANCE:
+			case QUANTILE:
+			case INTERQUANTILE:
+			case MEDIAN:
+				return false;
+			case CBIND: 
+			case RBIND: {
+				DataType dt1 = getInput().get(0).getDataType();
+				return dt1 == DataType.MATRIX; // only matrix cbind, rbind supported on GPU
+			}
+			default: {
+				DataType dt1 = getInput().get(0).getDataType();
+				DataType dt2 = getInput().get(1).getDataType();
+				
+				boolean isMatrixScalar = (dt1 == DataType.MATRIX && dt2 == DataType.SCALAR) || (dt1 == DataType.SCALAR && dt2 == DataType.MATRIX);
+				boolean isMatrixMatrix = (dt1 == DataType.MATRIX && dt2 == DataType.MATRIX);
+				
+				OpOp2 [] supportedOps = { OpOp2.MULT, OpOp2.PLUS, OpOp2.MINUS, OpOp2.DIV, OpOp2.POW, OpOp2.MINUS1_MULT, 
+						OpOp2.MODULUS, OpOp2.INTDIV, OpOp2.LESS, OpOp2.LESSEQUAL, OpOp2.EQUAL, OpOp2.NOTEQUAL, OpOp2.GREATER, OpOp2.GREATEREQUAL};
+			
+				if(isMatrixScalar && op == OpOp2.MINUS_NZ) {
+					// Only supported for matrix scalar:
+					return true;
+				}
+				else if(isMatrixMatrix && op == OpOp2.SOLVE) {
+					// Only supported for matrix matrix:
+					return true;
+				}
+				else if(isMatrixScalar || isMatrixMatrix) {
+					for(OpOp2 supportedOp : supportedOps) {
+						if(op == supportedOp)
+							return true;
+					}
+					return false;
+				}
+				else
+					return false;
+			}
+		}
+	}
+	
+	@Override
 	public Lop constructLops() 
 		throws HopsException, LopsException 
 	{	
@@ -527,11 +576,6 @@ public class BinaryOp extends Hop
 			}
 			else //CP
 			{
-				if (DMLScript.USE_ACCELERATOR && dt1 == DataType.MATRIX && (DMLScript.FORCE_ACCELERATOR
-						|| getMemEstimate() < GPUContextPool.initialGPUMemBudget())) {
-					et = ExecType.GPU;
-				}
-
 				Lop offset = createOffsetLop( getInput().get(0), cbind ); //offset 1st input
 				append = new Append(getInput().get(0).constructLops(), getInput().get(1).constructLops(), offset, getDataType(), getValueType(), cbind, et);
 				append.getOutputParameters().setDimensions(rlen, clen, getRowsInBlock(), getColsInBlock(), getNnz());
@@ -582,14 +626,6 @@ public class BinaryOp extends Hop
 			else //general case
 				ot = HopsOpOp2LopsU.get(op);
 
-			if (DMLScript.USE_ACCELERATOR && (DMLScript.FORCE_ACCELERATOR
-					|| getMemEstimate() < Math.min(GPUContextPool.initialGPUMemBudget(), OptimizerUtils.getLocalMemBudget()))
-					&& (op == OpOp2.MULT || op == OpOp2.PLUS || op == OpOp2.MINUS || op == OpOp2.DIV || op == OpOp2.POW
-					|| op == OpOp2.MINUS_NZ || op == OpOp2.MINUS1_MULT || op == OpOp2.MODULUS || op == OpOp2.INTDIV
-					|| op == OpOp2.LESS || op == OpOp2.LESSEQUAL || op == OpOp2.EQUAL || op == OpOp2.NOTEQUAL
-					|| op == OpOp2.GREATER || op == OpOp2.GREATEREQUAL)) {
-				et = ExecType.GPU;
-			}
 			Unary unary1 = new Unary(getInput().get(0).constructLops(),
 						   getInput().get(1).constructLops(), ot, getDataType(), getValueType(), et);
 		
@@ -602,17 +638,8 @@ public class BinaryOp extends Hop
 		{
 			// Both operands are Matrixes
 			ExecType et = optFindExecType();
-			if ( et == ExecType.CP ) 
+			if ( et == ExecType.CP || et == ExecType.GPU ) 
 			{
-				if(DMLScript.USE_ACCELERATOR && (DMLScript.FORCE_ACCELERATOR
-						|| getMemEstimate() < Math.min(GPUContextPool.initialGPUMemBudget(), OptimizerUtils.getLocalMemBudget()))
-						&& (op == OpOp2.MULT || op == OpOp2.PLUS || op == OpOp2.MINUS || op == OpOp2.DIV || op == OpOp2.POW
-						|| op == OpOp2.SOLVE || op == OpOp2.MINUS1_MULT || op == OpOp2.MODULUS || op == OpOp2.INTDIV
-						|| op == OpOp2.LESS || op == OpOp2.LESSEQUAL || op == OpOp2.EQUAL || op == OpOp2.NOTEQUAL
-						|| op == OpOp2.GREATER || op == OpOp2.GREATEREQUAL)) {
-					et = ExecType.GPU;
-				}
-				
 				Lop binary = null;
 				
 				boolean isLeftXGt = (getInput().get(0) instanceof BinaryOp) && ((BinaryOp) getInput().get(0)).getOp() == OpOp2.GREATER;
@@ -827,7 +854,7 @@ public class BinaryOp extends Hop
 			ret = getInput().get(0).getMemEstimate() * 3; 
 		}
 		else if ( op == OpOp2.SOLVE ) {
-			if (DMLScript.USE_ACCELERATOR) {
+			if (isGPUEnabled()) {
 				// Solve on the GPU takes an awful lot of intermediate space
 				// First the inputs are converted from row-major to column major
 				// Then a workspace and a temporary output (workSize, tauSize) are needed

http://git-wip-us.apache.org/repos/asf/systemml/blob/3ca05353/src/main/java/org/apache/sysml/hops/ConvolutionOp.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/hops/ConvolutionOp.java b/src/main/java/org/apache/sysml/hops/ConvolutionOp.java
index a3d8a81..2b9335c 100644
--- a/src/main/java/org/apache/sysml/hops/ConvolutionOp.java
+++ b/src/main/java/org/apache/sysml/hops/ConvolutionOp.java
@@ -19,6 +19,7 @@
 
 package org.apache.sysml.hops;
 
+import org.apache.sysml.api.DMLScript;
 import org.apache.sysml.hops.Hop.MultiThreadedHop;
 import org.apache.sysml.lops.ConvolutionTransform;
 import org.apache.sysml.lops.ConvolutionTransform.OperationTypes;
@@ -79,6 +80,13 @@ public class ConvolutionOp extends Hop  implements MultiThreadedHop
 	}
 	
 	@Override
+	public boolean isGPUEnabled() {
+		if(!DMLScript.USE_ACCELERATOR)
+			return false;
+		return true;
+	}
+	
+	@Override
 	public Lop constructLops()
 		throws HopsException, LopsException 
 	{
@@ -315,12 +323,12 @@ public class ConvolutionOp extends Hop  implements MultiThreadedHop
 		
 		if( _etypeForced != null ) 			
 		{
-			_etype = findGPUExecTypeByMemEstimate(_etypeForced);
+			_etype = _etypeForced;
 		}
 		else 
 		{	
 			if ( OptimizerUtils.isMemoryBasedOptLevel() ) {
-				_etype = findGPUExecTypeByMemEstimate(findExecTypeByMemEstimate());
+				_etype = findExecTypeByMemEstimate();
 			}
 			else {
 				_etype = REMOTE;

http://git-wip-us.apache.org/repos/asf/systemml/blob/3ca05353/src/main/java/org/apache/sysml/hops/DataGenOp.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/hops/DataGenOp.java b/src/main/java/org/apache/sysml/hops/DataGenOp.java
index ce08dbc..89a5814 100644
--- a/src/main/java/org/apache/sysml/hops/DataGenOp.java
+++ b/src/main/java/org/apache/sysml/hops/DataGenOp.java
@@ -146,6 +146,11 @@ public class DataGenOp extends Hop implements MultiThreadedHop
 	}
 	
 	@Override
+	public boolean isGPUEnabled() {
+		return false;
+	}
+	
+	@Override
 	public Lop constructLops() 
 		throws HopsException, LopsException
 	{
@@ -502,4 +507,5 @@ public class DataGenOp extends Hop implements MultiThreadedHop
 		
 		return ret;
 	}
+
 }

http://git-wip-us.apache.org/repos/asf/systemml/blob/3ca05353/src/main/java/org/apache/sysml/hops/DataOp.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/hops/DataOp.java b/src/main/java/org/apache/sysml/hops/DataOp.java
index bcded04..f410210 100644
--- a/src/main/java/org/apache/sysml/hops/DataOp.java
+++ b/src/main/java/org/apache/sysml/hops/DataOp.java
@@ -241,6 +241,11 @@ public class DataOp extends Hop
 	}
 	
 	@Override
+	public boolean isGPUEnabled() {
+		return false;
+	}
+	
+	@Override
 	public Lop constructLops()
 			throws HopsException, LopsException 
 	{	

http://git-wip-us.apache.org/repos/asf/systemml/blob/3ca05353/src/main/java/org/apache/sysml/hops/FunctionOp.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/hops/FunctionOp.java b/src/main/java/org/apache/sysml/hops/FunctionOp.java
index c677bb8..3ad2d15 100644
--- a/src/main/java/org/apache/sysml/hops/FunctionOp.java
+++ b/src/main/java/org/apache/sysml/hops/FunctionOp.java
@@ -209,6 +209,11 @@ public class FunctionOp extends Hop
 	}
 	
 	@Override
+	public boolean isGPUEnabled() {
+		return false;
+	}
+	
+	@Override
 	public Lop constructLops() 
 		throws HopsException, LopsException 
 	{

http://git-wip-us.apache.org/repos/asf/systemml/blob/3ca05353/src/main/java/org/apache/sysml/hops/Hop.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/hops/Hop.java b/src/main/java/org/apache/sysml/hops/Hop.java
index bfbdbaf..1cf875f 100644
--- a/src/main/java/org/apache/sysml/hops/Hop.java
+++ b/src/main/java/org/apache/sysml/hops/Hop.java
@@ -192,7 +192,9 @@ public abstract class Hop
 	
 	public void checkAndSetForcedPlatform()
 	{
-		if ( DMLScript.rtplatform == RUNTIME_PLATFORM.SINGLE_NODE )
+		if(DMLScript.USE_ACCELERATOR && DMLScript.FORCE_ACCELERATOR && isGPUEnabled())
+			_etypeForced = ExecType.GPU;
+		else if ( DMLScript.rtplatform == RUNTIME_PLATFORM.SINGLE_NODE )
 			_etypeForced = ExecType.CP;
 		else if ( DMLScript.rtplatform == RUNTIME_PLATFORM.HADOOP )
 			_etypeForced = ExecType.MR;
@@ -768,8 +770,12 @@ public abstract class Hop
 	protected ExecType findExecTypeByMemEstimate() {
 		ExecType et = null;
 		char c = ' ';
-		if ( getMemEstimate() < OptimizerUtils.getLocalMemBudget() ) {
-			et = ExecType.CP;
+		double memEst = getMemEstimate();
+		if ( memEst < OptimizerUtils.getLocalMemBudget() ) {
+			if (DMLScript.USE_ACCELERATOR && isGPUEnabled() && memEst < GPUContextPool.initialGPUMemBudget())
+				et = ExecType.GPU;
+			else
+				et = ExecType.CP;
 		}
 		else {
 			if( DMLScript.rtplatform == DMLScript.RUNTIME_PLATFORM.HYBRID )
@@ -788,14 +794,6 @@ public abstract class Hop
 		
 		return et;
 	}
-	
-	protected ExecType findGPUExecTypeByMemEstimate(ExecType et) {
-		if (DMLScript.USE_ACCELERATOR && (DMLScript.FORCE_ACCELERATOR
-				|| getMemEstimate() < Math.min(GPUContextPool.initialGPUMemBudget(), OptimizerUtils.getLocalMemBudget()))) {
-			return ExecType.GPU;
-		}
-		return et;
-	}
 
 	public ArrayList<Hop> getParent() {
 		return _parent;
@@ -850,6 +848,18 @@ public abstract class Hop
 	
 	public abstract String getOpString();
 
+	/**
+	 * In memory-based optimizer mode (see OptimizerUtils.isMemoryBasedOptLevel()),
+	 * the exectype is determined by this method together with the memory estimate of this Hop.
+	 * Please see findExecTypeByMemEstimate for more details.
+	 * 
+	 * This method is necessary because not all operators are supported efficiently
+	 * on the GPU (for example, operations on frames and scalars, as well as operations such as table).
+	 * 
+	 * @return true if the Hop is eligible for GPU Exectype.
+	 */
+	public abstract boolean isGPUEnabled();
+	
 	protected boolean isVector() {
 		return (dimsKnown() && (_dim1 == 1 || _dim2 == 1) );
 	}

http://git-wip-us.apache.org/repos/asf/systemml/blob/3ca05353/src/main/java/org/apache/sysml/hops/IndexingOp.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/hops/IndexingOp.java b/src/main/java/org/apache/sysml/hops/IndexingOp.java
index 5a27ed6..5f2ce34 100644
--- a/src/main/java/org/apache/sysml/hops/IndexingOp.java
+++ b/src/main/java/org/apache/sysml/hops/IndexingOp.java
@@ -94,6 +94,11 @@ public class IndexingOp extends Hop
 	public void setColLowerEqualsUpper(boolean passed) {
 		_colLowerEqualsUpper = passed;
 	}
+	
+	@Override
+	public boolean isGPUEnabled() {
+		return false;
+	}
 
 	@Override
 	public Lop constructLops()

http://git-wip-us.apache.org/repos/asf/systemml/blob/3ca05353/src/main/java/org/apache/sysml/hops/LeftIndexingOp.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/hops/LeftIndexingOp.java b/src/main/java/org/apache/sysml/hops/LeftIndexingOp.java
index a641622..02e7753 100644
--- a/src/main/java/org/apache/sysml/hops/LeftIndexingOp.java
+++ b/src/main/java/org/apache/sysml/hops/LeftIndexingOp.java
@@ -99,6 +99,11 @@ public class LeftIndexingOp  extends Hop
 	}
 	
 	@Override
+	public boolean isGPUEnabled() {
+		return false;
+	}
+	
+	@Override
 	public Lop constructLops()
 		throws HopsException, LopsException 
 	{			

http://git-wip-us.apache.org/repos/asf/systemml/blob/3ca05353/src/main/java/org/apache/sysml/hops/LiteralOp.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/hops/LiteralOp.java b/src/main/java/org/apache/sysml/hops/LiteralOp.java
index b96d032..16ebf1b 100644
--- a/src/main/java/org/apache/sysml/hops/LiteralOp.java
+++ b/src/main/java/org/apache/sysml/hops/LiteralOp.java
@@ -73,6 +73,11 @@ public class LiteralOp extends Hop
 	public void checkArity() throws HopsException {
 		HopsException.check(_input.isEmpty(), this, "should have 0 inputs but has %d inputs", _input.size());
 	}
+	
+	@Override
+	public boolean isGPUEnabled() {
+		return false;
+	}
 
 	@Override
 	public Lop constructLops()

http://git-wip-us.apache.org/repos/asf/systemml/blob/3ca05353/src/main/java/org/apache/sysml/hops/MultipleOp.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/hops/MultipleOp.java b/src/main/java/org/apache/sysml/hops/MultipleOp.java
index 5fb6b29..5c178c0 100644
--- a/src/main/java/org/apache/sysml/hops/MultipleOp.java
+++ b/src/main/java/org/apache/sysml/hops/MultipleOp.java
@@ -80,6 +80,11 @@ public class MultipleOp extends Hop {
 	public String getOpString() {
 		return "m(" + _op.name().toLowerCase() + ")";
 	}
+	
+	@Override
+	public boolean isGPUEnabled() {
+		return false;
+	}
 
 	/**
 	 * Construct the corresponding Lops for this Hop

http://git-wip-us.apache.org/repos/asf/systemml/blob/3ca05353/src/main/java/org/apache/sysml/hops/ParameterizedBuiltinOp.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/hops/ParameterizedBuiltinOp.java b/src/main/java/org/apache/sysml/hops/ParameterizedBuiltinOp.java
index ab276d7..a611893 100644
--- a/src/main/java/org/apache/sysml/hops/ParameterizedBuiltinOp.java
+++ b/src/main/java/org/apache/sysml/hops/ParameterizedBuiltinOp.java
@@ -175,6 +175,11 @@ public class ParameterizedBuiltinOp extends Hop implements MultiThreadedHop
 	}
 	
 	@Override
+	public boolean isGPUEnabled() {
+		return false;
+	}
+	
+	@Override
 	public Lop constructLops() 
 		throws HopsException, LopsException 
 	{		

http://git-wip-us.apache.org/repos/asf/systemml/blob/3ca05353/src/main/java/org/apache/sysml/hops/QuaternaryOp.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/hops/QuaternaryOp.java b/src/main/java/org/apache/sysml/hops/QuaternaryOp.java
index 6517de6..17188be 100644
--- a/src/main/java/org/apache/sysml/hops/QuaternaryOp.java
+++ b/src/main/java/org/apache/sysml/hops/QuaternaryOp.java
@@ -189,6 +189,11 @@ public class QuaternaryOp extends Hop implements MultiThreadedHop
 	}
 	
 	@Override
+	public boolean isGPUEnabled() {
+		return false;
+	}
+	
+	@Override
 	public Lop constructLops() 
 		throws HopsException, LopsException 
 	{	

http://git-wip-us.apache.org/repos/asf/systemml/blob/3ca05353/src/main/java/org/apache/sysml/hops/ReorgOp.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/hops/ReorgOp.java b/src/main/java/org/apache/sysml/hops/ReorgOp.java
index 3e27eb3..f0560d3 100644
--- a/src/main/java/org/apache/sysml/hops/ReorgOp.java
+++ b/src/main/java/org/apache/sysml/hops/ReorgOp.java
@@ -34,7 +34,6 @@ import org.apache.sysml.lops.LopProperties.ExecType;
 import org.apache.sysml.lops.Transform.OperationTypes;
 import org.apache.sysml.parser.Expression.DataType;
 import org.apache.sysml.parser.Expression.ValueType;
-import org.apache.sysml.runtime.instructions.gpu.context.GPUContextPool;
 import org.apache.sysml.runtime.matrix.MatrixCharacteristics;
 
 /**
@@ -129,6 +128,35 @@ public class ReorgOp extends Hop implements MultiThreadedHop
 		s += "r(" + HopsTransf2String.get(op) + ")";
 		return s;
 	}
+	
+	@Override
+	public boolean isGPUEnabled() {
+		if(!DMLScript.USE_ACCELERATOR)
+			return false;
+		switch( op ) {
+			case TRANSPOSE: {
+				Lop lin;
+				try {
+					lin = getInput().get(0).constructLops();
+				} catch (HopsException | LopsException e) {
+					throw new RuntimeException("Unable to create child lop", e);
+				}
+				if( lin instanceof Transform && ((Transform)lin).getOperationType()==OperationTypes.Transpose )
+					return false; //if input is already a transpose, avoid redundant transpose ops
+				else if( getDim1()==1 && getDim2()==1 )
+					return false; //if input of size 1x1, avoid unnecessary transpose
+				else
+					return true;
+			}
+			case DIAG:
+			case REV:
+			case RESHAPE:
+			case SORT:
+				return false;
+			default:
+				throw new RuntimeException("Unsupported operator:" + op.name());
+		}
+	}
 
 	@Override
 	public Lop constructLops()
@@ -151,10 +179,6 @@ public class ReorgOp extends Hop implements MultiThreadedHop
 					setLops(lin); //if input of size 1x1, avoid unnecessary transpose
 				else { //general case
 					int k = OptimizerUtils.getConstrainedNumThreads(_maxNumThreads);
-					if (DMLScript.USE_ACCELERATOR && (DMLScript.FORCE_ACCELERATOR
-							|| getMemEstimate() < Math.min(GPUContextPool.initialGPUMemBudget(), OptimizerUtils.getLocalMemBudget()))) {
-						et = ExecType.GPU;
-					}
 					Transform transform1 = new Transform( lin, 
 							HopsTransf2Lops.get(op), getDataType(), getValueType(), et, k);
 					setOutputDimensions(transform1);

http://git-wip-us.apache.org/repos/asf/systemml/blob/3ca05353/src/main/java/org/apache/sysml/hops/TernaryOp.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/hops/TernaryOp.java b/src/main/java/org/apache/sysml/hops/TernaryOp.java
index 98c8ad3..47b012e 100644
--- a/src/main/java/org/apache/sysml/hops/TernaryOp.java
+++ b/src/main/java/org/apache/sysml/hops/TernaryOp.java
@@ -42,7 +42,6 @@ import org.apache.sysml.lops.PartialAggregate.CorrectionLocationType;
 import org.apache.sysml.parser.Statement;
 import org.apache.sysml.parser.Expression.DataType;
 import org.apache.sysml.parser.Expression.ValueType;
-import org.apache.sysml.runtime.instructions.gpu.context.GPUContextPool;
 import org.apache.sysml.runtime.matrix.MatrixCharacteristics;
 
 /** Primary use cases for now, are
@@ -128,6 +127,25 @@ public class TernaryOp extends Hop
 	}
 	
 	@Override
+	public boolean isGPUEnabled() {
+		if(!DMLScript.USE_ACCELERATOR)
+			return false;
+		switch( _op ) {
+			case CENTRALMOMENT:
+			case COVARIANCE:
+			case CTABLE:
+			case INTERQUANTILE:
+			case QUANTILE:
+				return false;
+			case MINUS_MULT:
+			case PLUS_MULT:
+				return true;
+			default:
+				throw new RuntimeException("Unsupported operator:" + _op.name());
+		}
+	}
+	
+	@Override
 	public Lop constructLops() 
 		throws HopsException, LopsException 
 	{	
@@ -631,13 +649,7 @@ public class TernaryOp extends Hop
 		if ( _op != OpOp3.PLUS_MULT && _op != OpOp3.MINUS_MULT )
 			throw new HopsException("Unexpected operation: " + _op + ", expecting " + OpOp3.PLUS_MULT + " or" +  OpOp3.MINUS_MULT);
 		
-		ExecType et = null;
-		if (DMLScript.USE_ACCELERATOR && (DMLScript.FORCE_ACCELERATOR
-				|| getMemEstimate() < Math.min(GPUContextPool.initialGPUMemBudget(), OptimizerUtils.getLocalMemBudget()))) {
-			et = ExecType.GPU;
-		} else {
-			et = optFindExecType();
-		}
+		ExecType et = optFindExecType();
 		PlusMult plusmult = null;
 		
 		if( et == ExecType.CP || et == ExecType.SPARK || et == ExecType.GPU ) {
@@ -711,7 +723,7 @@ public class TernaryOp extends Hop
 				return OptimizerUtils.estimateSizeExactSparsity(dim1, dim2, 1.0);
 			case PLUS_MULT:
 			case MINUS_MULT: {
-				if (DMLScript.USE_ACCELERATOR) {
+				if (isGPUEnabled()) {
 					// For the GPU, the input is converted to dense
 					sparsity = 1.0;
 				} else {

http://git-wip-us.apache.org/repos/asf/systemml/blob/3ca05353/src/main/java/org/apache/sysml/hops/UnaryOp.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/hops/UnaryOp.java b/src/main/java/org/apache/sysml/hops/UnaryOp.java
index 2b31247..0a5bc65 100644
--- a/src/main/java/org/apache/sysml/hops/UnaryOp.java
+++ b/src/main/java/org/apache/sysml/hops/UnaryOp.java
@@ -99,6 +99,29 @@ public class UnaryOp extends Hop implements MultiThreadedHop
 	}
 	
 	@Override
+	public boolean isGPUEnabled() {
+		if(!DMLScript.USE_ACCELERATOR)
+			return false;
+		boolean isScalar = (    getDataType() == DataType.SCALAR //value type casts or matrix to scalar
+				|| (_op == OpOp1.CAST_AS_MATRIX && getInput().get(0).getDataType()==DataType.SCALAR)
+				|| (_op == OpOp1.CAST_AS_FRAME && getInput().get(0).getDataType()==DataType.SCALAR));
+		if(!isScalar) {
+			switch(_op) {
+				case SELP:case EXP:case SQRT:case LOG:case ABS:
+				case ROUND:case FLOOR:case CEIL:
+				case SIN:case COS: case TAN:case ASIN:case ACOS:case ATAN:
+				case SIGN:
+					return true;
+				default:
+					return false;
+			}
+		}
+		else  {
+			return false;
+		}
+	}
+	
+	@Override
 	public Lop constructLops()
 		throws HopsException, LopsException 
 	{		
@@ -149,7 +172,7 @@ public class UnaryOp extends Hop implements MultiThreadedHop
 				ExecType et = optFindExecType();
 				
 				//special handling cumsum/cumprod/cummin/cumsum
-				if( isCumulativeUnaryOperation() && et != ExecType.CP )  
+				if( isCumulativeUnaryOperation() && !(et == ExecType.CP || et == ExecType.GPU) )  
 				{
 					//TODO additional physical operation if offsets fit in memory
 					Lop cumsumLop = null;
@@ -162,15 +185,6 @@ public class UnaryOp extends Hop implements MultiThreadedHop
 				else //default unary 
 				{
 					int k = isCumulativeUnaryOperation() ? OptimizerUtils.getConstrainedNumThreads( _maxNumThreads ) : 1;
-					switch(_op) {
-						case SELP:case EXP:case SQRT:case LOG:case ABS:
-						case ROUND:case FLOOR:case CEIL:
-						case SIN:case COS: case TAN:case ASIN:case ACOS:case ATAN:
-						case SIGN:
-							et = findGPUExecTypeByMemEstimate(et);
-							break;
-						default:
-					}
 					Unary unary1 = new Unary(input.constructLops(), HopsOpOp1LopsU.get(_op), 
 							                 getDataType(), getValueType(), et, k);
 					setOutputDimensions(unary1);
@@ -550,7 +564,7 @@ public class UnaryOp extends Hop implements MultiThreadedHop
 	protected double computeOutputMemEstimate( long dim1, long dim2, long nnz )
 	{
 		double sparsity = -1;
-		if (DMLScript.USE_ACCELERATOR) {
+		if (isGPUEnabled()) {
 			sparsity = 1.0; // Output is always dense (for now) on the GPU
 		} else {
 			sparsity = OptimizerUtils.getSparsity(dim1, dim2, nnz);
@@ -569,7 +583,7 @@ public class UnaryOp extends Hop implements MultiThreadedHop
 			ret = getInput().get(0).getMemEstimate() * 3; 
 		}
 
-		if (DMLScript.USE_ACCELERATOR) {
+		if (isGPUEnabled()) {
 			OptimizerUtils.estimateSize(dim1, dim2); // Intermediate memory required to convert sparse to dense
 		}
 		

http://git-wip-us.apache.org/repos/asf/systemml/blob/3ca05353/src/main/java/org/apache/sysml/hops/codegen/SpoofFusedOp.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/hops/codegen/SpoofFusedOp.java b/src/main/java/org/apache/sysml/hops/codegen/SpoofFusedOp.java
index 0d4b8db..247a142 100644
--- a/src/main/java/org/apache/sysml/hops/codegen/SpoofFusedOp.java
+++ b/src/main/java/org/apache/sysml/hops/codegen/SpoofFusedOp.java
@@ -283,4 +283,9 @@ public class SpoofFusedOp extends Hop implements MultiThreadedHop
 		
 		return ret;
 	}
+
+	@Override
+	public boolean isGPUEnabled() {
+		return false;
+	}
 }