Posted to commits@systemml.apache.org by ni...@apache.org on 2017/09/07 19:50:00 UTC
[3/5] systemml git commit: [SYSTEMML-540] Support sparse GPU conv2d
as well as fix memory estimation of convolution operations
http://git-wip-us.apache.org/repos/asf/systemml/blob/772d9302/src/main/java/org/apache/sysml/hops/ConvolutionOp.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/hops/ConvolutionOp.java b/src/main/java/org/apache/sysml/hops/ConvolutionOp.java
index 2b9335c..59ac29e 100644
--- a/src/main/java/org/apache/sysml/hops/ConvolutionOp.java
+++ b/src/main/java/org/apache/sysml/hops/ConvolutionOp.java
@@ -191,7 +191,7 @@ public class ConvolutionOp extends Hop implements MultiThreadedHop
// // TODO: Inserting reblock requires knowing columns apriori
// ConvolutionTransform transform1 = new ConvolutionTransform(addReblockIfNecessary(et, lopOp, in), lopOp, getDataType(), getValueType(), et, k);
// setReblockedOutputDimension(et, transform1);
- ConvolutionTransform transform1 = new ConvolutionTransform(in, lopOp, getDataType(), getValueType(), et, k);
+ ConvolutionTransform transform1 = new ConvolutionTransform(in, lopOp, getDataType(), getValueType(), et, k, computeIntermediateMemEstimate(-1, -1, -1));
setOutputDimensions(transform1);
setLineNumbers(transform1);
@@ -223,13 +223,171 @@ public class ConvolutionOp extends Hop implements MultiThreadedHop
return OptimizerUtils.estimateSizeExactSparsity(dim1, dim2, sparsity);
}
+ // ---------------------------------------------------------------
+ // Utility methods to guard the computation of memory estimates in the presence of unknowns
+ private static class IntermediateDimensions {
+ int dim1; int dim2; double sp;
+ public IntermediateDimensions(ConvolutionOp h, String dim1Str, String dim2Str, double sp) {
+ dim1 = (int) h.getDim(dim1Str);
+ dim2 = (int) h.getDim(dim2Str);
+ this.sp = sp;
+ }
+ public IntermediateDimensions(ConvolutionOp h, String dim1Str, String dim2Str) {
+ dim1 = (int) h.getDim(dim1Str);
+ dim2 = (int) h.getDim(dim2Str);
+ sp = 1;
+ }
+ public IntermediateDimensions(ConvolutionOp h, int dim1, String dim2Str) {
+ this.dim1 = dim1;
+ dim2 = (int) h.getDim(dim2Str);
+ sp = 1;
+ }
+
+ /**
+ * Add two computed memory estimates.
+ *
+ * @param val1 memory estimate 1
+ * @param val2 memory estimate 2
+ * @return sum of the estimates, or OptimizerUtils.DEFAULT_SIZE if either estimate is unknown (negative) or the sum reaches it
+ */
+ static double guardedAdd(double val1, double val2) {
+ if(val1 < 0 || val2 < 0) return OptimizerUtils.DEFAULT_SIZE;
+ double ret = val1 + val2;
+ if(ret >= OptimizerUtils.DEFAULT_SIZE) return OptimizerUtils.DEFAULT_SIZE;
+ else return ret;
+ }
+
+ /**
+ * Compute the total memory estimate for the given intermediate matrices.
+ *
+ * @param intermediates list of intermediates
+ * @param numWorkers number of workers
+ * @return total memory estimate across all intermediates, scaled by numWorkers
+ */
+ public static double addEstimateSizes(ArrayList<IntermediateDimensions> intermediates, int numWorkers) {
+ double memBudget = 0;
+ for(int i = 0; i < intermediates.size(); i++) {
+ memBudget = guardedAdd(memBudget, OptimizerUtils.estimateSizeExactSparsity(
+ intermediates.get(i).dim1, intermediates.get(i).dim2, intermediates.get(i).sp)*numWorkers);
+ }
+ return memBudget;
+ }
+
+ /**
+ * Compute the maximum of two computed memory estimates.
+ * @param val1 memory estimate 1
+ * @param val2 memory estimate 2
+ * @return maximum of the estimates, or OptimizerUtils.DEFAULT_SIZE if either estimate is unknown (negative)
+ */
+ public static double guardedMax(double val1, double val2) {
+ if(val1 < 0 || val2 < 0) return OptimizerUtils.DEFAULT_SIZE;
+ double ret = Math.max(val1, val2);
+ if(ret >= OptimizerUtils.DEFAULT_SIZE) return OptimizerUtils.DEFAULT_SIZE;
+ else return ret;
+ }
+ }
+
+ /**
+ * Helper utility to compute the intermediate memory estimate.
+ *
+ * @param gpuIntermediates intermediates for GPU
+ * @param cpIntermediates intermediates for CP
+ * @return memory estimate
+ */
+ private double computeIntermediateMemEstimateHelper(
+ ArrayList<IntermediateDimensions> gpuIntermediates,
+ ArrayList<IntermediateDimensions> cpIntermediates) {
+ // CP operators use row-level parallelism by default, so bound the number of workers by the number of rows
+ int numWorkers = (int) Math.min(OptimizerUtils.getConstrainedNumThreads(_maxNumThreads), Math.max(getDim("N"), 1));
+ if(DMLScript.USE_ACCELERATOR) {
+ // Account for potential sparse-to-dense conversion
+ double gpuMemBudget = IntermediateDimensions.addEstimateSizes(gpuIntermediates, 1);
+ double cpMemoryBudget = IntermediateDimensions.addEstimateSizes(cpIntermediates, numWorkers);
+ if(cpMemoryBudget > gpuMemBudget) {
+ double oneThreadCPMemBudget = IntermediateDimensions.addEstimateSizes(cpIntermediates, 1);
+ if(oneThreadCPMemBudget <= gpuMemBudget) {
+ // Why limit CP? To give the compiler more opportunity to select GPU operators
+ cpMemoryBudget = oneThreadCPMemBudget;
+ }
+ }
+ // Finally, use the maximum of CP and GPU memory budget
+ return IntermediateDimensions.guardedMax(cpMemoryBudget, gpuMemBudget);
+ }
+ else {
+ // When -gpu flag is not provided, the memory estimates for CP are not affected.
+ return IntermediateDimensions.addEstimateSizes(cpIntermediates, numWorkers);
+ }
+ }
+
@Override
- protected double computeIntermediateMemEstimate( long dim1, long dim2, long nnz )
+ protected double computeIntermediateMemEstimate( long ignoreDim1, long ignoreDim2, long ignoreNnz )
{
- //default: no intermediate memory requirements
- return 0;
+ ArrayList<IntermediateDimensions> gpuIntermediates = new ArrayList<IntermediateDimensions>();
+ ArrayList<IntermediateDimensions> cpIntermediates = new ArrayList<IntermediateDimensions>();
+ if(getOp() == ConvOp.DIRECT_CONV2D) {
+ // Assumption: To compile a GPU conv2d operator, the following should fit in GPU memory:
+ // 1. output in dense format (i.e. computeOutputMemEstimate)
+ // 2. input in any format
+ // 3. at least one input row in dense format
+ // 4. filter in dense format
+
+ // Account for potential sparse-to-dense conversion of at least one input row and the filter
+ gpuIntermediates.add(new IntermediateDimensions(this, 1, "CHW"));
+ gpuIntermediates.add(new IntermediateDimensions(this, "K", "CRS"));
+
+ // im2col operation preserves the worst-case sparsity of the input.
+ cpIntermediates.add(new IntermediateDimensions(this, "CRS", "PQ", getInput().get(0).getSparsity()));
+ }
+ else if(getOp() == ConvOp.DIRECT_CONV2D_BACKWARD_DATA) {
+ // Assumption: To compile a GPU conv2d_backward_data operator, the following should fit in GPU memory:
+ // 1. output in dense format (i.e. computeOutputMemEstimate)
+ // 2. dout in any format
+ // 3. at least one dout row in dense format
+ // 4. filter in dense format
+
+ // Account for potential sparse-to-dense conversion of at least one dout row and the filter
+ gpuIntermediates.add(new IntermediateDimensions(this, 1, "KPQ"));
+ gpuIntermediates.add(new IntermediateDimensions(this, "K", "CRS"));
+
+ // There are 2 intermediates: rotate180 and input to col2im for conv2d_backward_data
+ // rotate180 preserves the "exact" sparsity of the dout matrix
+ cpIntermediates.add(new IntermediateDimensions(this, "PQ", "K", getInput().get(1).getSparsity()));
+ // Note: worst-case (dense) sparsity is assumed for the input of col2im; its per-worker size is PQ x CRS (NPQ x CRS overall, where N is determined by the degree of parallelism)
+ cpIntermediates.add(new IntermediateDimensions(this, "PQ", "CRS"));
+ }
+ else if(getOp() == ConvOp.DIRECT_CONV2D_BACKWARD_FILTER) {
+ // Assumption: To compile a GPU conv2d_backward_filter operator, the following should fit in GPU memory:
+ // 1. output in dense format (i.e. computeOutputMemEstimate)
+ // 2. dout in any format
+ // 3. at least one input row and one dout row in dense format
+
+ // Account for potential sparse-to-dense conversion of at least one input row + one dout row
+ gpuIntermediates.add(new IntermediateDimensions(this, 1, "CHW"));
+ gpuIntermediates.add(new IntermediateDimensions(this, 1, "KPQ"));
+
+ // There are 2 intermediates: im2col and rotate180 for conv2d_backward_filter
+ // rotate180 preserves the "exact" sparsity of the dout matrix
+ cpIntermediates.add(new IntermediateDimensions(this, "PQ", "K", getInput().get(1).getSparsity()));
+ // im2col operation preserves the worst-case sparsity of the input.
+ cpIntermediates.add(new IntermediateDimensions(this, "CRS", "PQ", getInput().get(0).getSparsity()));
+ }
+ else if(getOp() == ConvOp.MAX_POOLING) {
+ // Account for potential sparse-to-dense conversion of at least one input row
+ gpuIntermediates.add(new IntermediateDimensions(this, 1, "CHW"));
+ }
+ else if(getOp() == ConvOp.MAX_POOLING_BACKWARD) {
+ // Account for potential sparse-to-dense conversion of at least one input row + one dout row
+ gpuIntermediates.add(new IntermediateDimensions(this, 1, "CHW"));
+ gpuIntermediates.add(new IntermediateDimensions(this, 1, "CPQ"));
+ }
+
+ if(gpuIntermediates.size() > 0 || cpIntermediates.size() > 0)
+ return computeIntermediateMemEstimateHelper(gpuIntermediates, cpIntermediates);
+ else
+ return 0;
}
+
@Override
protected long[] inferOutputCharacteristics( MemoTable memo )
{
@@ -243,65 +401,9 @@ public class ConvolutionOp extends Hop implements MultiThreadedHop
ret[2] = -1;
return (ret[0]>0 && ret[1]>0) ? ret : null;
}
-
- ConvolutionParameters params;
- try {
- params = parseInput();
- } catch (DMLRuntimeException e) {
- throw new RuntimeException(e);
- }
- switch(op)
- {
- case MAX_POOLING: {
- // input
- long N = getInput().get(0)._dim1;
- ret[0] = N;
- ret[1] = getExtractedVal(params.C, params.P, params.Q);
- ret[2] = -1;
- break;
- }
- case DIRECT_CONV2D: {
- // input, filter
- long N = getInput().get(0)._dim1;
- ret[0] = N;
- ret[1] = getExtractedVal(params.K, params.P, params.Q);
- ret[2] = -1;
- break;
- }
- case DIRECT_CONV2D_BACKWARD_FILTER: {
- // input, dout
- ret[0] = params.K;
- ret[1] = getExtractedVal(params.C, params.R, params.S);
- ret[2] = -1;
- break;
- }
- case MAX_POOLING_BACKWARD: {
- // input, dout
- ret[0] = getInput().get(0)._dim1;
- ret[1] = getInput().get(0)._dim2;
- ret[2] = -1;
- break;
- }
- case DIRECT_CONV2D_BACKWARD_DATA: {
- // filter, dout
- long N = getInput().get(1)._dim1;
- ret[0] = N;
- ret[1] = getExtractedVal(params.C, params.H, params.W);
- ret[2] = -1;
- break;
- }
- default:
- throw new RuntimeException("Unsupported op:" + op.name());
- }
-
- if(LOG.isDebugEnabled() && (ret[0] <= 0 || ret[1] <= 0)) {
- LOG.debug("Unknown dimensions for ConvolutionOp in inferOutputCharacteristics:" + op.name() + " " + ret[0] + " " + ret[1] +
- " img_dim=[" + params.N + " " + params.C + " " + params.H + " " + params.W + "]" +
- " filter_dim=[" + params.K + " " + params.C + " " + params.H + " " + params.W + "]" +
- " output_feature_map=[" + params.P + " " + params.Q + "] stride=[" + params.stride_h + " " + params.stride_w + "]" +
- " pad=[" + params.pad_h + " " + params.pad_w + "]");
- }
+ refreshSizeInformation();
+ ret[0] = _dim1; ret[1] = _dim2; ret[2] = _nnz;
//safe return (create entry only if at least dims known)
return (ret[0]>0 && ret[1]>0) ? ret : null;
@@ -347,50 +449,44 @@ public class ConvolutionOp extends Hop implements MultiThreadedHop
return _etype;
}
+ // Caching the parameters speeds up dynamic recompilation by avoiding unnecessary computeSizeInformation calls
+ private ConvolutionParameters _cachedParams = new ConvolutionParameters(-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, _maxNumThreads);
// stride1, stride2, padding1, padding2
// input_shape1, input_shape2, input_shape3, input_shape4,
// filter_shape1, filter_shape2, filter_shape3, filter_shape4
ConvolutionParameters parseInput() throws DMLRuntimeException {
- ConvolutionParameters params = null;
if(op == ConvOp.MAX_POOLING_BACKWARD
|| op == ConvOp.DIRECT_CONV2D
|| op == ConvOp.DIRECT_CONV2D_BACKWARD_FILTER
|| op == ConvOp.DIRECT_CONV2D_BACKWARD_DATA) {
- params = new ConvolutionParameters(
- computeSizeInformation(getInput().get(6)),
- computeSizeInformation(getInput().get(7)),
- computeSizeInformation(getInput().get(8)),
- computeSizeInformation(getInput().get(9)),
- computeSizeInformation(getInput().get(10)),
- computeSizeInformation(getInput().get(12)),
- computeSizeInformation(getInput().get(13)),
- computeSizeInformation(getInput().get(2)),
- computeSizeInformation(getInput().get(3)),
- computeSizeInformation(getInput().get(4)),
- computeSizeInformation(getInput().get(5)), _maxNumThreads);
+ _cachedParams.setIfUnknown(
+ getInput().get(6),
+ getInput().get(7),
+ getInput().get(8),
+ getInput().get(9),
+ getInput().get(10),
+ getInput().get(12),
+ getInput().get(13),
+ getInput().get(2),
+ getInput().get(3),
+ getInput().get(4),
+ getInput().get(5), _maxNumThreads);
}
else {
- params = new ConvolutionParameters(
- computeSizeInformation(getInput().get(5)),
- computeSizeInformation(getInput().get(6)),
- computeSizeInformation(getInput().get(7)),
- computeSizeInformation(getInput().get(8)),
- computeSizeInformation(getInput().get(9)),
- computeSizeInformation(getInput().get(11)),
- computeSizeInformation(getInput().get(12)),
- computeSizeInformation(getInput().get(1)),
- computeSizeInformation(getInput().get(2)),
- computeSizeInformation(getInput().get(3)),
- computeSizeInformation(getInput().get(4)), _maxNumThreads);
- }
- return params;
- }
-
- public static long getExtractedVal(long val1, long val2, long val3) {
- if(val1 == -1 || val2 == -1 || val3 == -1) {
- return -1;
+ _cachedParams.setIfUnknown(
+ getInput().get(5),
+ getInput().get(6),
+ getInput().get(7),
+ getInput().get(8),
+ getInput().get(9),
+ getInput().get(11),
+ getInput().get(12),
+ getInput().get(1),
+ getInput().get(2),
+ getInput().get(3),
+ getInput().get(4), _maxNumThreads);
}
- return val1*val2*val3;
+ return _cachedParams;
}
@Override
@@ -400,72 +496,50 @@ public class ConvolutionOp extends Hop implements MultiThreadedHop
Hop input1 = getInput().get(0);
setDim1(input1.getDim1());
setDim2(input1.getDim2());
+ _nnz = -1; // cannot infer stats
return;
}
- ConvolutionParameters params;
- try {
- params = parseInput();
- } catch (DMLRuntimeException e) {
- throw new RuntimeException(e);
- }
-
switch(op)
{
case MAX_POOLING:
{
- // input
- long N = getInput().get(0)._dim1;
- _dim1 = N;
- _dim2 = getExtractedVal(params.C, params.P, params.Q);
+ _dim1 = getDim("N");
+ _dim2 = getDim("CPQ");
_nnz = -1; // cannot infer stats
break;
}
case MAX_POOLING_BACKWARD:
{
- // input, dout
- _dim1 = getInput().get(0)._dim1;
- _dim2 = getInput().get(0)._dim2;
+ _dim1 = getDim("N");
+ _dim2 = getDim("CHW");
_nnz = -1;
break;
}
case DIRECT_CONV2D:
{
- // input, filter
- long N = getInput().get(0)._dim1;
- _dim1 = N;
- _dim2 = getExtractedVal(params.K, params.P, params.Q);
+ _dim1 = getDim("N");
+ _dim2 = getDim("KPQ");
_nnz = -1; // cannot infer stats
break;
}
case DIRECT_CONV2D_BACKWARD_DATA:
{
- // filter, dout
- long N = getInput().get(1)._dim1;
- _dim1 = N;
- _dim2 = getExtractedVal(params.C, params.H, params.W);
+ _dim1 = getDim("N");
+ _dim2 = getDim("CHW");
_nnz = -1; // cannot infer stats
break;
}
case DIRECT_CONV2D_BACKWARD_FILTER:
{
- // input, dout
- _dim1 = params.K;
- _dim2 = getExtractedVal(params.C, params.R, params.S);
+ _dim1 = getDim("K");
+ _dim2 = getDim("CRS");
_nnz = -1; // cannot infer stats
break;
}
default:
throw new RuntimeException("The sizes are not refreshed for " + op.name());
}
-
- if(LOG.isDebugEnabled() && (_dim1 <= 0 || _dim2 <= 0)) {
- LOG.debug("Unknown dimensions for ConvolutionOp in refreshSizeInformation:" + op.name() + " " + _dim1 + " " + _dim2 +
- " img_dim=[" + params.N + " " + params.C + " " + params.H + " " + params.W + "]" +
- " filter_dim=[" + params.K + " " + params.C + " " + params.H + " " + params.W + "]" +
- " output_feature_map=[" + params.P + " " + params.Q + "] stride=[" + params.stride_h + " " + params.stride_w + "]" +
- " pad=[" + params.pad_h + " " + params.pad_w + "]");
- }
}
@Override
@@ -511,4 +585,132 @@ public class ConvolutionOp extends Hop implements MultiThreadedHop
public int getMaxNumThreads() {
return _maxNumThreads;
}
+
+
+ // ------------------------------------------------------------------------------------------------------
+ // Utility methods to get the dimensions taking into account unknown dimensions
+
+ /**
+ * Convenience method to get the dimensions required by ConvolutionOp.
+ *
+ * @param dimString can be K, CRS, N, CHW, KPQ, CPQ, or PQ
+ * @return either -1 (if unknown) or the value associated with the dimString
+ */
+ private long getDim(String dimString) {
+ if(op == ConvOp.BIAS_ADD || op == ConvOp.BIAS_MULTIPLY) {
+ throw new RuntimeException("getDim method should not be invoked for bias_add and bias_multiply");
+ }
+ ConvolutionParameters params;
+ try {
+ params = parseInput();
+ } catch (DMLRuntimeException e) {
+ throw new RuntimeException(e);
+ }
+ Hop filter = null; // shape: K x CRS
+ Hop input = null; // shape: N x CHW
+ Hop dout = null; // shape: N x KPQ
+ Hop dout1 = null; // shape: N x CPQ
+
+ if(getOp() == ConvOp.DIRECT_CONV2D) {
+ input = getInput().get(0);
+ filter = getInput().get(1);
+ }
+ else if(getOp() == ConvOp.DIRECT_CONV2D_BACKWARD_DATA) {
+ filter = getInput().get(0);
+ dout = getInput().get(1);
+ }
+ else if(getOp() == ConvOp.DIRECT_CONV2D_BACKWARD_FILTER) {
+ input = getInput().get(0);
+ dout = getInput().get(1);
+ }
+ else if(getOp() == ConvOp.MAX_POOLING) {
+ input = getInput().get(0);
+ }
+ else if(getOp() == ConvOp.MAX_POOLING_BACKWARD) {
+ input = getInput().get(0);
+ dout1 = getInput().get(1);
+ }
+
+ long ret = -1;
+ if(dimString.equals("K") && filter != null) {
+ ret = getNonNegative(ret, getNonNegative(params.K, filter._dim1));
+ }
+ else if(dimString.equals("CRS") && filter != null) {
+ ret = getNonNegative(ret, getNonNegative(nonNegativeMultiply(params.C, params.R, params.S), filter._dim2));
+ }
+ else if(dimString.equals("N") && input != null) {
+ ret = getNonNegative(ret, getNonNegative(params.N, input._dim1));
+ }
+ else if(dimString.equals("CHW") && input != null) {
+ ret = getNonNegative(ret, getNonNegative(nonNegativeMultiply(params.C, params.H, params.W), input._dim2));
+ }
+ else if(dimString.equals("N") && dout != null) {
+ ret = getNonNegative(ret, getNonNegative(params.N, dout._dim1));
+ }
+ else if(dimString.equals("KPQ") && dout != null) {
+ ret = getNonNegative(ret, getNonNegative(nonNegativeMultiply(params.K, params.P, params.Q), dout._dim2));
+ }
+ else if(dimString.equals("N") && dout1 != null) {
+ ret = getNonNegative(ret, getNonNegative(params.N, dout1._dim1));
+ }
+ else if(dimString.equals("CPQ") && dout1 != null) {
+ ret = getNonNegative(ret, getNonNegative(nonNegativeMultiply(params.C, params.P, params.Q), dout1._dim2));
+ }
+ else if(dimString.equals("K")) {
+ ret = getNonNegative(ret, params.K >= 0 ? params.K : -1);
+ }
+ else if(dimString.equals("CRS")) {
+ ret = getNonNegative(ret, nonNegativeMultiply(params.C, params.R, params.S));
+ }
+ else if(dimString.equals("N")) {
+ ret = getNonNegative(ret, params.N >= 0 ? params.N : -1);
+ }
+ else if(dimString.equals("CHW")) {
+ ret = getNonNegative(ret, nonNegativeMultiply(params.C, params.H, params.W));
+ }
+ else if(dimString.equals("KPQ")) {
+ ret = getNonNegative(ret, nonNegativeMultiply(params.K, params.P, params.Q));
+ }
+ else if(dimString.equals("PQ")) {
+ ret = getNonNegative(ret, nonNegativeMultiply(params.P, params.Q));
+ }
+ else if(dimString.equals("CPQ")) {
+ ret = getNonNegative(ret, nonNegativeMultiply(params.C, params.P, params.Q));
+ }
+ else {
+ throw new RuntimeException("Unsupported dimension:" + dimString + " for operator " + getOp().name());
+ }
+
+ if(LOG.isDebugEnabled() && ret < 0) {
+ LOG.debug("Unknown dimension " + dimString + " for ConvolutionOp:" + op.name() +
+ " img_dim=[" + params.N + " " + params.C + " " + params.H + " " + params.W + "]" +
+ " filter_dim=[" + params.K + " " + params.C + " " + params.H + " " + params.W + "]" +
+ " output_feature_map=[" + params.P + " " + params.Q + "] stride=[" + params.stride_h + " " + params.stride_w + "]" +
+ " pad=[" + params.pad_h + " " + params.pad_w + "]");
+ }
+ return ret;
+ }
+
+ private long nonNegativeMultiply(long val1, long val2, long val3) {
+ if(val1 >= 0 && val2 >= 0 && val3 >= 0) {
+ return val1 * val2 * val3;
+ }
+ else return -1;
+ }
+ private long nonNegativeMultiply(long val1, long val2) {
+ if(val1 >= 0 && val2 >= 0) {
+ return val1 * val2;
+ }
+ else return -1;
+ }
+ private long getNonNegative(long val1, long val2) {
+ if(val1 >= 0 && val2 >= 0) {
+ if(val1 == val2) return val1;
+ else throw new RuntimeException("Incorrect dimensions in Convolution Hop: " + val1 + " != " + val2);
+ }
+ else if(val1 >= 0) return val1;
+ else if(val2 >= 0) return val2;
+ else return -1;
+ }
+ // ------------------------------------------------------------------------------------------------------
}
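
For intuition, the guarded helpers above treat OptimizerUtils.DEFAULT_SIZE as a saturating sentinel: any unknown (negative) operand, and any result that reaches the sentinel, collapses to DEFAULT_SIZE, so a single unknown dimension forces the worst-case estimate. A minimal standalone sketch of this pattern (the constant below is illustrative, not the real OptimizerUtils value):

    // Sketch of the saturating "guarded" arithmetic used by IntermediateDimensions
    public class GuardedEstimateSketch {
        static final double DEFAULT_SIZE = 8e9; // illustrative worst-case sentinel

        static double guardedAdd(double val1, double val2) {
            if (val1 < 0 || val2 < 0) return DEFAULT_SIZE; // unknown propagates to worst case
            return Math.min(val1 + val2, DEFAULT_SIZE);    // saturate at the sentinel
        }

        public static void main(String[] args) {
            System.out.println(guardedAdd(1e6, 2e6)); // 3000000.0
            System.out.println(guardedAdd(-1, 2e6));  // 8.0E9 (unknown dimension)
        }
    }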
http://git-wip-us.apache.org/repos/asf/systemml/blob/772d9302/src/main/java/org/apache/sysml/hops/Hop.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/hops/Hop.java b/src/main/java/org/apache/sysml/hops/Hop.java
index eeaa5f1..b454771 100644
--- a/src/main/java/org/apache/sysml/hops/Hop.java
+++ b/src/main/java/org/apache/sysml/hops/Hop.java
@@ -708,31 +708,8 @@ public abstract class Hop implements ParseInfo
_validCPSizeEstimate = (wstats!=null) ? OptimizerUtils.isValidCPMatrixSize(
wstats[0], wstats[1], OptimizerUtils.getSparsity(wstats[0], wstats[1], wstats[2])) : false;
}
-
/**
- * Computes the hop-specific output memory estimate in bytes. Should be 0 if not
- * applicable.
- *
- * @param dim1 dimension 1
- * @param dim2 dimension 2
- * @param nnz number of non-zeros
- * @return memory estimate
- */
- protected abstract double computeOutputMemEstimate( long dim1, long dim2, long nnz );
-
- /**
- * Computes the hop-specific intermediate memory estimate in bytes. Should be 0 if not
- * applicable.
- *
- * @param dim1 dimension 1
- * @param dim2 dimension 2
- * @param nnz number of non-zeros
- * @return memory estimate
- */
- protected abstract double computeIntermediateMemEstimate( long dim1, long dim2, long nnz );
-
- /**
* Computes the output matrix characteristics (rows, cols, nnz) based on worst-case output
* and/or input estimates. Should return null if dimensions are unknown.
*
@@ -849,6 +826,21 @@ public abstract class Hop implements ParseInfo
public abstract String getOpString();
+ // ========================================================================================
+ // Design doc: Memory estimation for GPU
+ // 1. Since not all operators are supported on GPU, isGPUEnabled indicates whether an operation
+ // is enabled for GPU. This method does not take any memory estimates into account.
+ // 2. To simplify the memory estimation logic, the methods computeOutputMemEstimate and computeIntermediateMemEstimate
+ // should return the maximum of the memory required by the GPU and CP operators.
+ // 3. Additionally, these methods are guarded so that when the -gpu flag is not provided, the additional memory overheads
+ // specific to GPU (for example, sparse-to-dense conversion on the GPU) are ignored.
+ // 4. (WIP) Every GPU operator should respect the memory returned by computeIntermediateMemEstimate (and computeOutputMemEstimate - see the next point).
+ // 5. (WIP) Every GPU operator should create its output in the same format as the corresponding CP operator. That is, computeOutputMemEstimate
+ // is consistent across CP and GPU in terms of the worst case.
+ // 6. The drawbacks of using the maximum memory (mem = Math.max(mem_cpu, mem_gpu)) are:
+ // - A GPU operator is not selected when mem_gpu < total memory available on GPU < mem
+ // - A CP operator is not selected (i.e. a distributed operator is compiled) when mem_cpu < driver memory budget < mem
+
/**
* In memory-based optimizer mode (see OptimizerUtils.isMemoryBasedOptLevel()),
* the exectype is determined by checking this method as well as memory budget of this Hop.
@@ -861,6 +853,31 @@ public abstract class Hop implements ParseInfo
*/
public abstract boolean isGPUEnabled();
+ /**
+ * Computes the hop-specific output memory estimate in bytes. Should be 0 if not
+ * applicable.
+ *
+ * @param dim1 dimension 1
+ * @param dim2 dimension 2
+ * @param nnz number of non-zeros
+ * @return memory estimate
+ */
+ protected abstract double computeOutputMemEstimate( long dim1, long dim2, long nnz );
+
+ /**
+ * Computes the hop-specific intermediate memory estimate in bytes. Should be 0 if not
+ * applicable.
+ *
+ * @param dim1 dimension 1
+ * @param dim2 dimension 2
+ * @param nnz number of non-zeros
+ * @return memory estimate
+ */
+ protected abstract double computeIntermediateMemEstimate( long dim1, long dim2, long nnz );
+
+ // ========================================================================================
+
+
protected boolean isVector() {
return (dimsKnown() && (_dim1 == 1 || _dim2 == 1) );
}
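
To make drawback 6 above concrete with illustrative numbers: suppose mem_cpu = 20 GB (e.g., due to many CP worker threads) and mem_gpu = 4 GB, so mem = max(20, 4) = 20 GB. A device with 12 GB of GPU memory could execute the operator (mem_gpu = 4 GB < 12 GB), yet the GPU operator is not compiled because the combined estimate mem = 20 GB exceeds the available GPU memory; the symmetric situation holds for CP versus the driver memory budget.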
http://git-wip-us.apache.org/repos/asf/systemml/blob/772d9302/src/main/java/org/apache/sysml/lops/ConvolutionTransform.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/lops/ConvolutionTransform.java b/src/main/java/org/apache/sysml/lops/ConvolutionTransform.java
index 8784956..121112b 100644
--- a/src/main/java/org/apache/sysml/lops/ConvolutionTransform.java
+++ b/src/main/java/org/apache/sysml/lops/ConvolutionTransform.java
@@ -37,6 +37,7 @@ public class ConvolutionTransform extends Lop
private OperationTypes operation = null;
private int numThreads = -1;
+ private double intermediateMemBudget = 0;
/**
* Constructor when we have one input.
@@ -47,12 +48,14 @@ public class ConvolutionTransform extends Lop
* @param vt value type
* @param et execution type
* @param k number of threads
+ * @param intermediateMemBudget intermediate memory budget
*/
- public ConvolutionTransform(Lop input, ConvolutionTransform.OperationTypes op, DataType dt, ValueType vt, ExecType et, int k)
+ public ConvolutionTransform(Lop input, ConvolutionTransform.OperationTypes op, DataType dt, ValueType vt, ExecType et, int k, double intermediateMemBudget)
{
super(Lop.Type.Transform, dt, vt);
init(input, op, dt, vt, et);
numThreads = k;
+ this.intermediateMemBudget = intermediateMemBudget;
}
public ConvolutionTransform(Lop input1, Lop input2, ConvolutionTransform.OperationTypes op, DataType dt, ValueType vt, ExecType et, int k)
@@ -165,6 +168,9 @@ public class ConvolutionTransform extends Lop
sb.append( OPERAND_DELIMITOR );
sb.append( numThreads );
}
+
+ sb.append( OPERAND_DELIMITOR );
+ sb.append( intermediateMemBudget );
return sb.toString();
}
else {
@@ -210,6 +216,9 @@ public class ConvolutionTransform extends Lop
sb.append( OPERAND_DELIMITOR );
sb.append( numThreads );
}
+
+ sb.append( OPERAND_DELIMITOR );
+ sb.append( intermediateMemBudget );
}
}
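
The net effect on the serialized lop is a single extra trailing operand: where a conv2d instruction string previously ended with the thread count, it now ends with the thread count followed by the intermediate memory budget, which the instruction parsers below read back from the last field. This is also why every checkNumFields count in the CP and GPU parsers is incremented by one. A sketch of the round trip (operand values illustrative):

    // Build side: one more delimited operand at the end of the instruction string
    sb.append( OPERAND_DELIMITOR );
    sb.append( intermediateMemBudget );   // e.g. "...16" followed by "1.6777216E7"
    // Parse side: recovered as the last field
    double budget = Double.parseDouble(parts[parts.length - 1]);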
http://git-wip-us.apache.org/repos/asf/systemml/blob/772d9302/src/main/java/org/apache/sysml/runtime/instructions/cp/ConvolutionCPInstruction.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/instructions/cp/ConvolutionCPInstruction.java b/src/main/java/org/apache/sysml/runtime/instructions/cp/ConvolutionCPInstruction.java
index 629b688..e91029e 100644
--- a/src/main/java/org/apache/sysml/runtime/instructions/cp/ConvolutionCPInstruction.java
+++ b/src/main/java/org/apache/sysml/runtime/instructions/cp/ConvolutionCPInstruction.java
@@ -22,6 +22,10 @@ package org.apache.sysml.runtime.instructions.cp;
import java.util.ArrayList;
import java.util.Arrays;
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.sysml.api.DMLScript;
+import org.apache.sysml.hops.OptimizerUtils;
import org.apache.sysml.runtime.DMLRuntimeException;
import org.apache.sysml.runtime.controlprogram.context.ExecutionContext;
import org.apache.sysml.runtime.functionobjects.SwapIndex;
@@ -41,24 +45,25 @@ public class ConvolutionCPInstruction extends UnaryCPInstruction {
private ArrayList<CPOperand> _filter_shape;
private ArrayList<CPOperand> _stride = new ArrayList<CPOperand>();
private ArrayList<CPOperand> _padding = new ArrayList<CPOperand>();
- private int _numThreads = -1;
-
- private ConvolutionCPInstruction(CPOperand in, CPOperand in2, CPOperand out, String opcode, String istr,
- int numThreads) throws DMLRuntimeException {
- super(new ReorgOperator(SwapIndex.getSwapIndexFnObject()), in, out, opcode, istr);
- if (!(opcode.equals("bias_add") || opcode.equals("relu_backward") || opcode.equals("bias_multiply"))) {
- throw new DMLRuntimeException(
- "Incorrect usage. Expected the opcode to be bias_add or bias_multiply or relu_backward, but found "
- + opcode);
+ private int _numThreads = -1;
+ private double _intermediateMemoryBudget = 0;
+ private static final Log LOG = LogFactory.getLog(ConvolutionCPInstruction.class.getName());
+ private static boolean warnedUnderUtilization = false;
+
+ public ConvolutionCPInstruction(CPOperand in, CPOperand in2, CPOperand out, String opcode, String istr, int numThreads, double intermediateMemoryBudget) throws DMLRuntimeException {
+ super(new ReorgOperator(SwapIndex.getSwapIndexFnObject()), in, out,
+ opcode, istr);
+ if( !(opcode.equals("bias_add") || opcode.equals("relu_backward") || opcode.equals("bias_multiply") ) ) {
+ throw new DMLRuntimeException("Incorrect usage. Expected the opcode to be bias_add or bias_multiply or relu_backward, but found " + opcode);
}
_in2 = in2;
_cptype = CPINSTRUCTION_TYPE.Convolution;
_numThreads = numThreads;
+ _intermediateMemoryBudget = intermediateMemoryBudget;
}
private ConvolutionCPInstruction(CPOperand in, CPOperand out, String opcode, String istr,
ArrayList<CPOperand> stride, ArrayList<CPOperand> padding, ArrayList<CPOperand> input_shape,
- ArrayList<CPOperand> filter_shape, int numThreads) {
+ ArrayList<CPOperand> filter_shape, int numThreads, double intermediateMemoryBudget) {
super(new ReorgOperator(SwapIndex.getSwapIndexFnObject()), in, out, opcode, istr);
_cptype = CPINSTRUCTION_TYPE.Convolution;
_stride = stride;
@@ -66,12 +71,15 @@ public class ConvolutionCPInstruction extends UnaryCPInstruction {
_input_shape = input_shape;
_filter_shape = filter_shape;
_numThreads = numThreads;
+ _intermediateMemoryBudget = intermediateMemoryBudget;
}
-
- private ConvolutionCPInstruction(CPOperand in, CPOperand in2, CPOperand out, String opcode, String istr,
- ArrayList<CPOperand> stride, ArrayList<CPOperand> padding, ArrayList<CPOperand> input_shape,
- ArrayList<CPOperand> filter_shape, int numThreads) {
- super(new ReorgOperator(SwapIndex.getSwapIndexFnObject()), in, out, opcode, istr);
+
+ public ConvolutionCPInstruction(CPOperand in, CPOperand in2, CPOperand out, String opcode,
+ String istr, ArrayList<CPOperand> stride,
+ ArrayList<CPOperand> padding, ArrayList<CPOperand> input_shape,
+ ArrayList<CPOperand> filter_shape, int numThreads, double intermediateMemoryBudget) {
+ super(new ReorgOperator(SwapIndex.getSwapIndexFnObject()), in, out,
+ opcode, istr);
_in2 = in2;
_cptype = CPINSTRUCTION_TYPE.Convolution;
_stride = stride;
@@ -79,12 +87,15 @@ public class ConvolutionCPInstruction extends UnaryCPInstruction {
_input_shape = input_shape;
_filter_shape = filter_shape;
_numThreads = numThreads;
+ _intermediateMemoryBudget = intermediateMemoryBudget;
}
-
- private ConvolutionCPInstruction(CPOperand in, CPOperand in2, CPOperand in3, CPOperand out, String opcode,
- String istr, ArrayList<CPOperand> stride, ArrayList<CPOperand> padding, ArrayList<CPOperand> input_shape,
- ArrayList<CPOperand> filter_shape, int numThreads) {
- super(new ReorgOperator(SwapIndex.getSwapIndexFnObject()), in, out, opcode, istr);
+
+ public ConvolutionCPInstruction(CPOperand in, CPOperand in2, CPOperand in3, CPOperand out, String opcode,
+ String istr, ArrayList<CPOperand> stride,
+ ArrayList<CPOperand> padding, ArrayList<CPOperand> input_shape,
+ ArrayList<CPOperand> filter_shape, int numThreads, double intermediateMemoryBudget) {
+ super(new ReorgOperator(SwapIndex.getSwapIndexFnObject()), in, out,
+ opcode, istr);
_in2 = in2;
_in3 = in3;
_cptype = CPINSTRUCTION_TYPE.Convolution;
@@ -93,6 +104,7 @@ public class ConvolutionCPInstruction extends UnaryCPInstruction {
_input_shape = input_shape;
_filter_shape = filter_shape;
_numThreads = numThreads;
+ _intermediateMemoryBudget = intermediateMemoryBudget;
}
public static ConvolutionCPInstruction parseInstruction(String str)
@@ -101,7 +113,7 @@ public class ConvolutionCPInstruction extends UnaryCPInstruction {
String[] parts = InstructionUtils.getInstructionPartsWithValueType(str);
String opcode = parts[0];
if (opcode.equalsIgnoreCase("maxpooling") || opcode.equalsIgnoreCase("relu_maxpooling")) {
- InstructionUtils.checkNumFields(parts, 15);
+ InstructionUtils.checkNumFields(parts, 16);
// stride1, stride2, padding1, padding2
// input_shape1, input_shape2, input_shape3, input_shape4,
// filter_shape1, filter_shape2, filter_shape3, filter_shape4, k
@@ -127,13 +139,13 @@ public class ConvolutionCPInstruction extends UnaryCPInstruction {
int k = Integer.parseInt(parts[15]);
return new ConvolutionCPInstruction(in, out, opcode, str, stride,
- padding, input_shape, filter_shape, k);
+ padding, input_shape, filter_shape, k, Double.parseDouble(parts[16]));
}
else if (opcode.equalsIgnoreCase("maxpooling_backward") || opcode.equalsIgnoreCase("relu_maxpooling_backward")
|| opcode.equalsIgnoreCase("conv2d")
|| opcode.equalsIgnoreCase("conv2d_backward_filter")
|| opcode.equalsIgnoreCase("conv2d_backward_data")) {
- InstructionUtils.checkNumFields(parts, 16);
+ InstructionUtils.checkNumFields(parts, 17);
// dout, stride1, stride2, padding1, padding2
// input_shape1, input_shape2, input_shape3, input_shape4,
// filter_shape1, filter_shape2, filter_shape3, filter_shape4, k
@@ -160,10 +172,10 @@ public class ConvolutionCPInstruction extends UnaryCPInstruction {
int k = Integer.parseInt(parts[16]);
return new ConvolutionCPInstruction(in, in2, out, opcode, str, stride,
- padding, input_shape, filter_shape, k);
+ padding, input_shape, filter_shape, k, Double.parseDouble(parts[17]));
}
else if (opcode.equalsIgnoreCase("conv2d_bias_add")) {
- InstructionUtils.checkNumFields(parts, 17);
+ InstructionUtils.checkNumFields(parts, 18);
// dout, stride1, stride2, padding1, padding2
// input_shape1, input_shape2, input_shape3, input_shape4,
// filter_shape1, filter_shape2, filter_shape3, filter_shape4, k
@@ -191,15 +203,15 @@ public class ConvolutionCPInstruction extends UnaryCPInstruction {
int k = Integer.parseInt(parts[17]);
return new ConvolutionCPInstruction(in, in2, in3, out, opcode, str, stride,
- padding, input_shape, filter_shape, k);
+ padding, input_shape, filter_shape, k, Double.parseDouble(parts[18]));
}
else if (opcode.equalsIgnoreCase("bias_add") || opcode.equals("relu_backward") || opcode.equalsIgnoreCase("bias_multiply") ) {
- InstructionUtils.checkNumFields(parts, 4);
+ InstructionUtils.checkNumFields(parts, 5);
CPOperand in = new CPOperand(parts[1]);
CPOperand in2 = new CPOperand(parts[2]);
CPOperand out = new CPOperand(parts[3]);
int k = Integer.parseInt(parts[4]);
- return new ConvolutionCPInstruction(in, in2, out, opcode, str, k);
+ return new ConvolutionCPInstruction(in, in2, out, opcode, str, k, Double.parseDouble(parts[5]));
}
else {
throw new DMLRuntimeException("Unknown opcode while parsing a ConvolutionCPInstruction: " + str);
@@ -363,6 +375,7 @@ public class ConvolutionCPInstruction extends UnaryCPInstruction {
ec.releaseMatrixInput(_in2.getName(), getExtendedOpcode());
}
else if (instOpcode.equalsIgnoreCase("conv2d")) {
+ resetNumThreads(params, C*R*S, P*Q, matBlock.getNonZeros() / ((double)matBlock.getNumRows()*matBlock.getNumColumns()));
MatrixBlock filter = ec.getMatrixInput(_in2.getName(), getExtendedOpcode());
if(filter.isEmpty() || matBlock.isEmpty()) {
outputBlock = new MatrixBlock(N, K*P*Q, true);
@@ -377,6 +390,7 @@ public class ConvolutionCPInstruction extends UnaryCPInstruction {
ec.releaseMatrixInput(_in2.getName(), getExtendedOpcode());
}
else if (instOpcode.equalsIgnoreCase("conv2d_bias_add")) {
+ resetNumThreads(params, C*R*S, P*Q, matBlock.getNonZeros() / ((double)matBlock.getNumRows()*matBlock.getNumColumns()));
MatrixBlock filter = ec.getMatrixInput(_in3.getName(), getExtendedOpcode());
MatrixBlock bias = ec.getMatrixInput(_in2.getName(), getExtendedOpcode());
if(bias.getNumRows() != params.K || bias.getNumColumns() != 1) {
@@ -446,6 +460,27 @@ public class ConvolutionCPInstruction extends UnaryCPInstruction {
ec.setMatrixOutput(getOutputVariableName(), outputBlock, getExtendedOpcode());
}
+ /**
+ * Reset the number of threads to respect the intermediate CP memory budget.
+ *
+ * @param params convolution parameters
+ * @param numRows number of rows of the intermediate matrix used per thread
+ * @param numCols number of columns of the intermediate matrix used per thread
+ * @param sparsity sparsity of the intermediate matrix used per thread
+ */
+ private void resetNumThreads(ConvolutionParameters params, int numRows, int numCols, double sparsity) {
+ if(DMLScript.USE_ACCELERATOR) {
+ double memBudget1Thread = OptimizerUtils.estimateSizeExactSparsity(numRows, numCols, sparsity);
+ int limitedDegreeOfParallelism = (int) Math.floor(_intermediateMemoryBudget / memBudget1Thread);
+ if(params.numThreads > limitedDegreeOfParallelism) {
+ params.numThreads = limitedDegreeOfParallelism;
+ if(!warnedUnderUtilization)
+ LOG.warn("Potential CPU under-utilization to respect the intermediate memory budget. To avoid this, consider reducing the mini-batch size or forcing GPU execution.");
+ warnedUnderUtilization = true;
+ }
+ }
+ }
+
private MatrixBlock getDenseOutputBlock(int numRows, int numCols) throws DMLRuntimeException {
MatrixBlock outputBlock = new MatrixBlock(numRows, numCols, false);
outputBlock.allocateDenseBlock();
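
resetNumThreads caps the operator's degree of parallelism so that numThreads times the per-thread intermediate stays within the budget handed down by the compiler. A worked example with illustrative numbers (approximating the dense estimate as 8 bytes per cell):

    // Per-thread im2col intermediate of CRS x PQ = 576 x 3136 doubles (~14.5 MB at sparsity 1.0)
    double memBudget1Thread = 576 * 3136 * 8d;
    double intermediateMemoryBudget = 64e6; // 64 MB budget from the compiler
    int limitedDegreeOfParallelism = (int) Math.floor(intermediateMemoryBudget / memBudget1Thread); // 4
    // With 16 hardware threads, params.numThreads is reduced from 16 to 4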
http://git-wip-us.apache.org/repos/asf/systemml/blob/772d9302/src/main/java/org/apache/sysml/runtime/instructions/gpu/ConvolutionGPUInstruction.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/instructions/gpu/ConvolutionGPUInstruction.java b/src/main/java/org/apache/sysml/runtime/instructions/gpu/ConvolutionGPUInstruction.java
index 5b37576..b25f787 100644
--- a/src/main/java/org/apache/sysml/runtime/instructions/gpu/ConvolutionGPUInstruction.java
+++ b/src/main/java/org/apache/sysml/runtime/instructions/gpu/ConvolutionGPUInstruction.java
@@ -27,6 +27,7 @@ import org.apache.sysml.runtime.functionobjects.SwapIndex;
import org.apache.sysml.runtime.instructions.InstructionUtils;
import org.apache.sysml.runtime.instructions.cp.CPOperand;
import org.apache.sysml.runtime.matrix.data.LibMatrixCUDA;
+import org.apache.sysml.runtime.matrix.data.LibMatrixCuDNN;
import org.apache.sysml.runtime.matrix.operators.ReorgOperator;
import org.apache.sysml.runtime.util.ConvolutionUtils;
import org.apache.sysml.utils.GPUStatistics;
@@ -40,9 +41,9 @@ public class ConvolutionGPUInstruction extends GPUInstruction {
private ArrayList<CPOperand> _filter_shape;
private ArrayList<CPOperand> _stride = new ArrayList<CPOperand>();
private ArrayList<CPOperand> _padding = new ArrayList<CPOperand>();
-
- private ConvolutionGPUInstruction(CPOperand in1, CPOperand in2, CPOperand out, String opcode, String istr)
- throws DMLRuntimeException {
+ private double _intermediateMemoryBudget = 0;
+
+ public ConvolutionGPUInstruction(CPOperand in1, CPOperand in2, CPOperand out, String opcode, String istr, double intermediateMemoryBudget) throws DMLRuntimeException {
super(new ReorgOperator(SwapIndex.getSwapIndexFnObject()), opcode, istr);
if (!(opcode.equals("bias_add") || opcode.equals("bias_multiply") || opcode.equals("relu_backward"))) {
throw new DMLRuntimeException(
@@ -53,18 +54,23 @@ public class ConvolutionGPUInstruction extends GPUInstruction {
_input2 = in2;
_gputype = GPUINSTRUCTION_TYPE.Convolution;
_output = out;
+ _intermediateMemoryBudget = intermediateMemoryBudget;
}
-
- private ConvolutionGPUInstruction(CPOperand in1, CPOperand in2, CPOperand in3, CPOperand out, String opcode,
- String istr, ArrayList<CPOperand> stride, ArrayList<CPOperand> padding, ArrayList<CPOperand> input_shape,
- ArrayList<CPOperand> filter_shape) {
- this(in1, in2, out, opcode, istr, stride, padding, input_shape, filter_shape);
+
+ public ConvolutionGPUInstruction(CPOperand in1, CPOperand in2, CPOperand in3, CPOperand out, String opcode,
+ String istr, ArrayList<CPOperand> stride,
+ ArrayList<CPOperand> padding, ArrayList<CPOperand> input_shape,
+ ArrayList<CPOperand> filter_shape, double intermediateMemoryBudget)
+ {
+ this(in1, in2, out, opcode, istr, stride, padding, input_shape, filter_shape, intermediateMemoryBudget);
_input3 = in3;
}
-
- private ConvolutionGPUInstruction(CPOperand in1, CPOperand in2, CPOperand out, String opcode, String istr,
- ArrayList<CPOperand> stride, ArrayList<CPOperand> padding, ArrayList<CPOperand> input_shape,
- ArrayList<CPOperand> filter_shape) {
+
+ public ConvolutionGPUInstruction(CPOperand in1, CPOperand in2, CPOperand out, String opcode,
+ String istr, ArrayList<CPOperand> stride,
+ ArrayList<CPOperand> padding, ArrayList<CPOperand> input_shape,
+ ArrayList<CPOperand> filter_shape, double intermediateMemoryBudget)
+ {
super(new ReorgOperator(SwapIndex.getSwapIndexFnObject()), opcode, istr);
_gputype = GPUINSTRUCTION_TYPE.Convolution;
@@ -75,6 +81,7 @@ public class ConvolutionGPUInstruction extends GPUInstruction {
_padding = padding;
_input_shape = input_shape;
_filter_shape = filter_shape;
+ _intermediateMemoryBudget = intermediateMemoryBudget;
}
public static ConvolutionGPUInstruction parseInstruction(String str)
@@ -87,7 +94,7 @@ public class ConvolutionGPUInstruction extends GPUInstruction {
|| opcode.equalsIgnoreCase("conv2d_backward_filter")
|| opcode.equalsIgnoreCase("conv2d_backward_data")
|| opcode.equalsIgnoreCase("maxpooling_backward")) ) {
- InstructionUtils.checkNumFields(parts, 15);
+ InstructionUtils.checkNumFields(parts, 16);
CPOperand in1 = new CPOperand(parts[1]);
CPOperand in2 = new CPOperand(parts[2]);
CPOperand out = new CPOperand(parts[15]);
@@ -110,10 +117,10 @@ public class ConvolutionGPUInstruction extends GPUInstruction {
filter_shape.add(new CPOperand(parts[14]));
return new ConvolutionGPUInstruction(in1, in2, out, opcode, str, stride,
- padding, input_shape, filter_shape);
+ padding, input_shape, filter_shape, Double.parseDouble(parts[16]));
}
else if (opcode.equalsIgnoreCase("conv2d_bias_add")) {
- InstructionUtils.checkNumFields(parts, 16);
+ InstructionUtils.checkNumFields(parts, 17);
CPOperand in1 = new CPOperand(parts[1]);
CPOperand in2 = new CPOperand(parts[2]);
CPOperand in3 = new CPOperand(parts[3]);
@@ -137,10 +144,10 @@ public class ConvolutionGPUInstruction extends GPUInstruction {
filter_shape.add(new CPOperand(parts[15]));
return new ConvolutionGPUInstruction(in1, in2, in3, out, opcode, str, stride,
- padding, input_shape, filter_shape);
+ padding, input_shape, filter_shape, Double.parseDouble(parts[17]));
}
else if (opcode.equalsIgnoreCase("maxpooling")) {
- InstructionUtils.checkNumFields(parts, 14);
+ InstructionUtils.checkNumFields(parts, 15);
CPOperand in1 = new CPOperand(parts[1]);
CPOperand out = new CPOperand(parts[14]);
@@ -162,14 +169,14 @@ public class ConvolutionGPUInstruction extends GPUInstruction {
filter_shape.add(new CPOperand(parts[13]));
return new ConvolutionGPUInstruction(in1, null, out, opcode, str, stride,
- padding, input_shape, filter_shape);
+ padding, input_shape, filter_shape, Double.parseDouble(parts[15]));
}
else if( opcode.equalsIgnoreCase("bias_add") || opcode.equalsIgnoreCase("relu_backward") || opcode.equalsIgnoreCase("bias_multiply") ) {
- InstructionUtils.checkNumFields(parts, 3);
+ InstructionUtils.checkNumFields(parts, 4);
CPOperand in1 = new CPOperand(parts[1]);
CPOperand in2 = new CPOperand(parts[2]);
CPOperand out = new CPOperand(parts[3]);
- return new ConvolutionGPUInstruction(in1, in2, out, opcode, str);
+ return new ConvolutionGPUInstruction(in1, in2, out, opcode, str, Double.parseDouble(parts[4]));
}
else {
throw new DMLRuntimeException("Unknown opcode while parsing a ConvolutionGPUInstruction: " + str);
@@ -251,8 +258,8 @@ public class ConvolutionGPUInstruction extends GPUInstruction {
MatrixObject out = getDenseMatrixOutputForGPUInstruction(ec, _output.getName(), N, K * P * Q);
- LibMatrixCUDA.conv2d(ec.getGPUContext(0), getExtendedOpcode(), image, filter, out, N, C, H, W,
- K, R, S, pad_h, pad_w, stride_h, stride_w, P, Q);
+ LibMatrixCuDNN.conv2d(ec.getGPUContext(0), getExtendedOpcode(), image, filter, out, N, C, H, W,
+ K, R, S, pad_h, pad_w, stride_h, stride_w, P, Q, _intermediateMemoryBudget);
}
else if (instOpcode.equalsIgnoreCase("conv2d_bias_add")) {
MatrixObject image = getMatrixInputForGPUInstruction(ec, _input1.getName());
@@ -266,8 +273,8 @@ public class ConvolutionGPUInstruction extends GPUInstruction {
MatrixObject out = getDenseMatrixOutputForGPUInstruction(ec, _output.getName(), N, K * P * Q);
- LibMatrixCUDA.conv2dBiasAdd(ec.getGPUContext(0), getExtendedOpcode(), image, bias, filter, out, N, C, H, W,
- K, R, S, pad_h, pad_w, stride_h, stride_w, P, Q);
+ LibMatrixCuDNN.conv2dBiasAdd(ec.getGPUContext(0), getExtendedOpcode(), image, bias, filter, out, N, C, H, W,
+ K, R, S, pad_h, pad_w, stride_h, stride_w, P, Q, _intermediateMemoryBudget);
}
else if (instOpcode.equalsIgnoreCase("conv2d_backward_filter")) {
MatrixObject image = getMatrixInputForGPUInstruction(ec, _input1.getName());
@@ -281,8 +288,8 @@ public class ConvolutionGPUInstruction extends GPUInstruction {
MatrixObject out = getDenseMatrixOutputForGPUInstruction(ec, _output.getName(), K, C * R * S);
- LibMatrixCUDA.conv2dBackwardFilter(ec.getGPUContext(0), getExtendedOpcode(), image, dout, out, N, C, H, W,
- K, R, S, pad_h, pad_w, stride_h, stride_w, P, Q);
+ LibMatrixCuDNN.conv2dBackwardFilter(ec.getGPUContext(0), getExtendedOpcode(), image, dout, out, N, C, H, W,
+ K, R, S, pad_h, pad_w, stride_h, stride_w, P, Q, _intermediateMemoryBudget);
// TODO: For now always copy the device data to host
// ec.gpuCtx.copyDeviceToHost(outputBlock);
}
@@ -298,8 +305,8 @@ public class ConvolutionGPUInstruction extends GPUInstruction {
MatrixObject out = getDenseMatrixOutputForGPUInstruction(ec, _output.getName(), N, C * H * W);
- LibMatrixCUDA.conv2dBackwardData(ec.getGPUContext(0), getExtendedOpcode(), filter, dout, out, N, C, H, W,
- K, R, S, pad_h, pad_w, stride_h, stride_w, P, Q);
+ LibMatrixCuDNN.conv2dBackwardData(ec.getGPUContext(0), getExtendedOpcode(), filter, dout, out, N, C, H, W,
+ K, R, S, pad_h, pad_w, stride_h, stride_w, P, Q, _intermediateMemoryBudget);
}
else if (instOpcode.equalsIgnoreCase("maxpooling")) {
MatrixObject image = getMatrixInputForGPUInstruction(ec, _input1.getName());
@@ -311,8 +318,8 @@ public class ConvolutionGPUInstruction extends GPUInstruction {
MatrixObject out = getDenseMatrixOutputForGPUInstruction(ec, _output.getName(), N, C * P * Q);
if(instOpcode.equalsIgnoreCase("maxpooling"))
- LibMatrixCUDA.maxpooling(ec.getGPUContext(0), getExtendedOpcode(), image, out, N, C, H, W,
- K, R, S, pad_h, pad_w, stride_h, stride_w, P, Q);
+ LibMatrixCuDNN.maxpooling(ec.getGPUContext(0), getExtendedOpcode(), image, out, N, C, H, W,
+ K, R, S, pad_h, pad_w, stride_h, stride_w, P, Q, _intermediateMemoryBudget);
}
else if (instOpcode.equalsIgnoreCase("maxpooling_backward")) {
MatrixObject image = getMatrixInputForGPUInstruction(ec, _input1.getName());
@@ -326,8 +333,8 @@ public class ConvolutionGPUInstruction extends GPUInstruction {
MatrixObject out = getDenseMatrixOutputForGPUInstruction(ec, _output.getName(), N, C * H * W);
- LibMatrixCUDA.maxpoolingBackward(ec.getGPUContext(0), getExtendedOpcode(), image, dout, out, N, C, H, W,
- K, R, S, pad_h, pad_w, stride_h, stride_w, P, Q);
+ LibMatrixCuDNN.maxpoolingBackward(ec.getGPUContext(0), getExtendedOpcode(), image, dout, out, N, C, H, W,
+ K, R, S, pad_h, pad_w, stride_h, stride_w, P, Q, _intermediateMemoryBudget);
}
else {
throw new DMLRuntimeException("Unsupported GPU context for " + instOpcode);
@@ -345,6 +352,7 @@ public class ConvolutionGPUInstruction extends GPUInstruction {
ec.releaseMatrixOutputForGPUInstruction(_output.getName());
}
+
private int getScalarInput(ExecutionContext ec, ArrayList<CPOperand> aL, int index)
throws DMLRuntimeException
{
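
All cuDNN-backed operations now receive the compiler's intermediate memory budget, which is what makes sparse GPU conv2d (the subject of this commit) safe to compile: the runtime can decide whether a sparse input may be densified on the device within the budgeted space. A hypothetical decision sketch (not the actual LibMatrixCuDNN logic):

    // Densify the sparse image on the device only if it fits in the budgeted space
    double denseSize = (double) N * C * H * W * 8; // bytes, assuming double precision
    if (denseSize <= intermediateMemoryBudget) {
        // sparse-to-dense conversion followed by the cuDNN kernel
    } else {
        // memory-constrained fallback, e.g. processing a batch of rows at a time
    }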
http://git-wip-us.apache.org/repos/asf/systemml/blob/772d9302/src/main/java/org/apache/sysml/runtime/instructions/gpu/MatrixBuiltinGPUInstruction.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/instructions/gpu/MatrixBuiltinGPUInstruction.java b/src/main/java/org/apache/sysml/runtime/instructions/gpu/MatrixBuiltinGPUInstruction.java
index af27dc6..5096566 100644
--- a/src/main/java/org/apache/sysml/runtime/instructions/gpu/MatrixBuiltinGPUInstruction.java
+++ b/src/main/java/org/apache/sysml/runtime/instructions/gpu/MatrixBuiltinGPUInstruction.java
@@ -24,6 +24,7 @@ import org.apache.sysml.runtime.controlprogram.caching.MatrixObject;
import org.apache.sysml.runtime.controlprogram.context.ExecutionContext;
import org.apache.sysml.runtime.instructions.cp.CPOperand;
import org.apache.sysml.runtime.matrix.data.LibMatrixCUDA;
+import org.apache.sysml.runtime.matrix.data.LibMatrixCuDNN;
import org.apache.sysml.runtime.matrix.operators.Operator;
import org.apache.sysml.utils.GPUStatistics;
@@ -44,7 +45,7 @@ public class MatrixBuiltinGPUInstruction extends BuiltinUnaryGPUInstruction {
switch(opcode) {
case "sel+":
- LibMatrixCUDA.relu(ec, ec.getGPUContext(0), getExtendedOpcode(), mat, _output.getName()); break;
+ LibMatrixCuDNN.relu(ec, ec.getGPUContext(0), getExtendedOpcode(), mat, _output.getName()); break;
case "exp":
LibMatrixCUDA.exp(ec, ec.getGPUContext(0), getExtendedOpcode(), mat, _output.getName()); break;
case "sqrt":
http://git-wip-us.apache.org/repos/asf/systemml/blob/772d9302/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUContext.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUContext.java b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUContext.java
index c6b82c4..197daaf 100644
--- a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUContext.java
+++ b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUContext.java
@@ -49,6 +49,7 @@ import org.apache.commons.logging.LogFactory;
import org.apache.sysml.api.DMLScript;
import org.apache.sysml.conf.ConfigurationManager;
import org.apache.sysml.conf.DMLConfig;
+import org.apache.sysml.hops.OptimizerUtils;
import org.apache.sysml.runtime.DMLRuntimeException;
import org.apache.sysml.runtime.controlprogram.caching.MatrixObject;
import org.apache.sysml.runtime.instructions.gpu.GPUInstruction;
@@ -151,6 +152,11 @@ public class GPUContext {
LOG.info(" GPU memory - Total: " + (total[0] * (1e-6)) + " MB, Available: " + (free[0] * (1e-6)) + " MB on "
+ this);
+ if(GPUContextPool.initialGPUMemBudget() > OptimizerUtils.getLocalMemBudget()) {
+ LOG.warn("Potential under-utilization: GPU memory (" + GPUContextPool.initialGPUMemBudget()
+ + ") > driver memory budget (" + OptimizerUtils.getLocalMemBudget() + "). "
+ + "Consider increasing the driver memory budget.");
+ }
}
private void initializeCudaLibraryHandles() throws DMLRuntimeException {
http://git-wip-us.apache.org/repos/asf/systemml/blob/772d9302/src/main/java/org/apache/sysml/runtime/matrix/data/ConvolutionParameters.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/matrix/data/ConvolutionParameters.java b/src/main/java/org/apache/sysml/runtime/matrix/data/ConvolutionParameters.java
index 78b6e3b..6d06ee5 100644
--- a/src/main/java/org/apache/sysml/runtime/matrix/data/ConvolutionParameters.java
+++ b/src/main/java/org/apache/sysml/runtime/matrix/data/ConvolutionParameters.java
@@ -21,6 +21,7 @@ package org.apache.sysml.runtime.matrix.data;
import java.io.Serializable;
+import org.apache.sysml.hops.Hop;
import org.apache.sysml.runtime.DMLRuntimeException;
import org.apache.sysml.runtime.util.ConvolutionUtils;
@@ -34,7 +35,9 @@ public class ConvolutionParameters implements Serializable {
public int K; public int R; public int S; public int stride_h; public int stride_w; public int pad_h; public int pad_w;
public int P; public int Q; public int numThreads;
+ // Optional variables used by ConvolutionCPInstruction
public boolean enableNative = false;
+
public MatrixBlock input1; public MatrixBlock input2; public MatrixBlock output;
public MatrixBlock bias;
@@ -62,6 +65,28 @@ public class ConvolutionParameters implements Serializable {
"], pad=[" + pad_h + "," + pad_w + "])";
}
+ public void setIfUnknown(Hop N, Hop C, Hop H, Hop W,
+ Hop K, Hop R, Hop S, Hop stride_h, Hop stride_w, Hop pad_h, Hop pad_w, int numThreads) throws DMLRuntimeException {
+ if(this.N < 0) this.N = convertToInt(Hop.computeSizeInformation(N));
+ if(this.C < 0) this.C = convertToInt(Hop.computeSizeInformation(C));
+ if(this.H < 0) this.H = convertToInt(Hop.computeSizeInformation(H));
+ if(this.W < 0) this.W = convertToInt(Hop.computeSizeInformation(W));
+ if(this.K < 0) this.K = convertToInt(Hop.computeSizeInformation(K));
+ if(this.R < 0) this.R = convertToInt(Hop.computeSizeInformation(R));
+ if(this.S < 0) this.S = convertToInt(Hop.computeSizeInformation(S));
+ if(this.stride_h < 0) this.stride_h = convertToInt(Hop.computeSizeInformation(stride_h));
+ if(this.stride_w < 0) this.stride_w = convertToInt(Hop.computeSizeInformation(stride_w));
+ if(this.pad_h < 0) this.pad_h = convertToInt(Hop.computeSizeInformation(pad_h));
+ if(this.pad_w < 0) this.pad_w = convertToInt(Hop.computeSizeInformation(pad_w));
+ if(this.P < 0 && this.H >= 0 && this.R >= 0 && this.stride_h >= 0 && this.pad_h >= 0) {
+ this.P = (int) ConvolutionUtils.getP(this.H, this.R, this.stride_h, this.pad_h);
+ }
+ if(this.Q < 0 && this.W >= 0 && this.S >= 0 && this.stride_w >= 0 && this.pad_w >= 0) {
+ this.Q = (int) ConvolutionUtils.getQ(this.W, this.S, this.stride_w, this.pad_w);
+ }
+ this.numThreads = numThreads;
+ }
+
public ConvolutionParameters(long N, long C, long H, long W,
long K, long R, long S, long stride_h, long stride_w, long pad_h, long pad_w, int numThreads) throws DMLRuntimeException {
this.N = convertToInt(N);
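
For reference, setIfUnknown derives P and Q from the convolution output-size formula implemented by ConvolutionUtils.getP and getQ; assuming the usual definition, that is:

    // Standard convolution output dimensions
    // P = (H + 2*pad_h - R) / stride_h + 1
    // Q = (W + 2*pad_w - S) / stride_w + 1
    // Example: H = 224, R = 3, stride_h = 1, pad_h = 1  =>  P = (224 + 2 - 3)/1 + 1 = 224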