You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@systemml.apache.org by ni...@apache.org on 2016/08/13 00:25:54 UTC
incubator-systemml git commit: [SYSTEMML-540] Cleaned up memory-less
operators, fixed bufferpool bug and added direct conv2d_backward_data operator
Repository: incubator-systemml
Updated Branches:
refs/heads/master b03950c32 -> d79dea926
[SYSTEMML-540] Cleaned up memory-less operators, fixed bufferpool bug and
added direct conv2d_backward_data operator
Project: http://git-wip-us.apache.org/repos/asf/incubator-systemml/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-systemml/commit/d79dea92
Tree: http://git-wip-us.apache.org/repos/asf/incubator-systemml/tree/d79dea92
Diff: http://git-wip-us.apache.org/repos/asf/incubator-systemml/diff/d79dea92
Branch: refs/heads/master
Commit: d79dea9266bd1177a8643f2f4525ca918b0e8b97
Parents: b03950c
Author: Niketan Pansare <np...@us.ibm.com>
Authored: Fri Aug 12 17:22:23 2016 -0700
Committer: Niketan Pansare <np...@us.ibm.com>
Committed: Fri Aug 12 17:24:19 2016 -0700
----------------------------------------------------------------------
.../controlprogram/caching/CacheableData.java | 2 +-
.../instructions/CPInstructionParser.java | 1 +
.../cp/ConvolutionCPInstruction.java | 18 +-
.../sysml/runtime/matrix/data/LibMatrixDNN.java | 684 +++++--------------
.../sysml/runtime/util/ConvolutionUtils.java | 73 +-
5 files changed, 233 insertions(+), 545 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/d79dea92/src/main/java/org/apache/sysml/runtime/controlprogram/caching/CacheableData.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/controlprogram/caching/CacheableData.java b/src/main/java/org/apache/sysml/runtime/controlprogram/caching/CacheableData.java
index d043879..bf22fa3 100644
--- a/src/main/java/org/apache/sysml/runtime/controlprogram/caching/CacheableData.java
+++ b/src/main/java/org/apache/sysml/runtime/controlprogram/caching/CacheableData.java
@@ -790,7 +790,7 @@ public abstract class CacheableData<T extends CacheBlock> extends Data
LOG.trace("Exporting " + this.getDebugName() + " to " + fName + " in format " + outputFormat);
//TODO remove
- if( getGPUObject() != null ) {
+ if( getGPUObject() != null && getGPUObject().isAllocated() ) {
getGPUObject().acquireHostRead();
}
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/d79dea92/src/main/java/org/apache/sysml/runtime/instructions/CPInstructionParser.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/instructions/CPInstructionParser.java b/src/main/java/org/apache/sysml/runtime/instructions/CPInstructionParser.java
index b6e0c50..ae13d3d 100644
--- a/src/main/java/org/apache/sysml/runtime/instructions/CPInstructionParser.java
+++ b/src/main/java/org/apache/sysml/runtime/instructions/CPInstructionParser.java
@@ -226,6 +226,7 @@ public class CPInstructionParser extends InstructionParser
String2CPInstructionType.put( "maxpooling_backward" , CPINSTRUCTION_TYPE.Convolution);
String2CPInstructionType.put( "conv2d" , CPINSTRUCTION_TYPE.Convolution);
String2CPInstructionType.put( "conv2d_backward_filter" , CPINSTRUCTION_TYPE.Convolution);
+ String2CPInstructionType.put( "conv2d_backward_data" , CPINSTRUCTION_TYPE.Convolution);
// Quaternary instruction opcodes
String2CPInstructionType.put( "wsloss" , CPINSTRUCTION_TYPE.Quaternary);
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/d79dea92/src/main/java/org/apache/sysml/runtime/instructions/cp/ConvolutionCPInstruction.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/instructions/cp/ConvolutionCPInstruction.java b/src/main/java/org/apache/sysml/runtime/instructions/cp/ConvolutionCPInstruction.java
index c0a1af5..8324ff2 100644
--- a/src/main/java/org/apache/sysml/runtime/instructions/cp/ConvolutionCPInstruction.java
+++ b/src/main/java/org/apache/sysml/runtime/instructions/cp/ConvolutionCPInstruction.java
@@ -118,7 +118,8 @@ public class ConvolutionCPInstruction extends UnaryCPInstruction {
else if (opcode.equalsIgnoreCase("pooling_backward_reshape")
|| opcode.equalsIgnoreCase("maxpooling_backward")
|| opcode.equalsIgnoreCase("conv2d")
- || opcode.equalsIgnoreCase("conv2d_backward_filter")) {
+ || opcode.equalsIgnoreCase("conv2d_backward_filter")
+ || opcode.equalsIgnoreCase("conv2d_backward_data")) {
InstructionUtils.checkNumFields(parts, 16);
// dout, stride1, stride2, padding1, padding2
// input_shape1, input_shape2, input_shape3, input_shape4,
@@ -236,16 +237,21 @@ public class ConvolutionCPInstruction extends UnaryCPInstruction {
MatrixBlock filter = ec.getMatrixInput(_in2.getName());
outputBlock = getDenseOutputBlock(ec, N, K*P*Q, false);
params.setReuseNonZeroedOutput(_reuseNonZeroedOutput);
- boolean useMemoryLessConvolution = false;
- LibMatrixDNN.conv2d(matBlock, filter, outputBlock, params, useMemoryLessConvolution);
+ LibMatrixDNN.conv2d(matBlock, filter, outputBlock, params);
ec.releaseMatrixInput(_in2.getName());
}
else if (instOpcode.equalsIgnoreCase("conv2d_backward_filter")) {
- MatrixBlock filter = ec.getMatrixInput(_in2.getName());
+ MatrixBlock dout = ec.getMatrixInput(_in2.getName());
outputBlock = getDenseOutputBlock(ec, K, C*R*S, false);
params.setReuseNonZeroedOutput(_reuseNonZeroedOutput);
- boolean useMemoryLessConvolution = false;
- LibMatrixDNN.conv2d_backward_filter(matBlock, filter, outputBlock, params, useMemoryLessConvolution);
+ LibMatrixDNN.conv2d_backward_filter(matBlock, dout, outputBlock, params);
+ ec.releaseMatrixInput(_in2.getName());
+ }
+ else if (instOpcode.equalsIgnoreCase("conv2d_backward_data")) {
+ MatrixBlock dout = ec.getMatrixInput(_in2.getName());
+ outputBlock = getDenseOutputBlock(ec, N, C * H * W, false);
+ params.setReuseNonZeroedOutput(_reuseNonZeroedOutput);
+ LibMatrixDNN.conv2d_backward_data(matBlock, dout, outputBlock, params);
ec.releaseMatrixInput(_in2.getName());
}
else {
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/d79dea92/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNN.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNN.java b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNN.java
index 2374931..e657d18 100644
--- a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNN.java
+++ b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNN.java
@@ -75,7 +75,7 @@ public class LibMatrixDNN {
enum TaskType {
ReshapeCol, Rotate180, Im2Col, Col2Im, MaxPooling_Forward, MaxPooling_Backward,
- LoopBasedConv2d, LoopedIm2ColConv2d, LoopBasedConv2dBwdFilter, LoopedIm2ColConv2dBwdFilter
+ LoopedIm2ColConv2d, LoopedIm2ColConv2dBwdFilter, LoopedIm2ColConv2dBwdData
}
public static class TemporaryConvolutionData {
@@ -99,8 +99,10 @@ public class LibMatrixDNN {
private static AtomicLong maxPoolBwdDenseCount = new AtomicLong(0);
private static AtomicLong loopedConvMatMultTime = new AtomicLong(0);
private static AtomicLong loopedConvIm2ColTime = new AtomicLong(0);
- private static AtomicLong loopedConvBwdMatMultTime = new AtomicLong(0);
- private static AtomicLong loopedConvBwdIm2ColTime = new AtomicLong(0);
+ private static AtomicLong loopedConvBwdFilterMatMultTime = new AtomicLong(0);
+ private static AtomicLong loopedConvBwdFilterIm2ColTime = new AtomicLong(0);
+ private static AtomicLong loopedConvBwdDataMatMultTime = new AtomicLong(0);
+ private static AtomicLong loopedConvBwdDataCol2ImTime = new AtomicLong(0);
public static void appendStatistics(StringBuilder sb) {
if(DMLScript.STATISTICS && (conv2dDenseCount.get() != 0 || conv2dSparseCount.get() != 0)) {
@@ -117,11 +119,13 @@ public class LibMatrixDNN {
+ im2colSparseCount.get() + "/"
+ maxPoolBwdSparseCount.get() + ".\n");
if(loopedConvMatMultTime.get() != 0 || loopedConvIm2ColTime.get() != 0) {
- sb.append("LibMatrixDNN conv(im2col/matmult), bwdFil (im2col/matmult) time:\t" +
+ sb.append("LibMatrixDNN conv(im2col/matmult), bwdF (im2col/matmult), bwdD (col2im/matmult) time:\t" +
String.format("%.3f", loopedConvIm2ColTime.get()*1e-9) + "/" +
String.format("%.3f", loopedConvMatMultTime.get()*1e-9) + "/" +
- String.format("%.3f", loopedConvBwdIm2ColTime.get()*1e-9) + "/" +
- String.format("%.3f", loopedConvBwdMatMultTime.get()*1e-9) + " sec.\n");
+ String.format("%.3f", loopedConvBwdFilterIm2ColTime.get()*1e-9) + "/" +
+ String.format("%.3f", loopedConvBwdFilterMatMultTime.get()*1e-9) + "/" +
+ String.format("%.3f", loopedConvBwdDataCol2ImTime.get()*1e-9) + "/" +
+ String.format("%.3f", loopedConvBwdDataMatMultTime.get()*1e-9) + " sec.\n");
}
}
}
@@ -140,8 +144,10 @@ public class LibMatrixDNN {
loopedConvIm2ColTime.set(0);
loopedConvMatMultTime.set(0);
- loopedConvBwdMatMultTime.set(0);
- loopedConvBwdIm2ColTime.set(0);
+ loopedConvBwdFilterMatMultTime.set(0);
+ loopedConvBwdFilterIm2ColTime.set(0);
+ loopedConvBwdDataMatMultTime.set(0);
+ loopedConvBwdDataCol2ImTime.set(0);
}
public static class ConvolutionParameters {
@@ -173,6 +179,10 @@ public class LibMatrixDNN {
return false;
}
+ public String toString() {
+ return "(" + N + " " + C + " " + H + " " + W + " " + K + " " + R + " " + S + ")";
+ }
+
public ConvolutionParameters(long N, long C, long H, long W,
long K, long R, long S, long stride_h, long stride_w, long pad_h, long pad_w, int numThreads) throws DMLRuntimeException {
this.N = convertToInt(N);
@@ -228,7 +238,44 @@ public class LibMatrixDNN {
}
}
- public static void conv2d_backward_filter(MatrixBlock input, MatrixBlock dout, MatrixBlock outputBlock, ConvolutionParameters params, boolean useMemoryLessConvolution) throws DMLRuntimeException {
+ public static void conv2d_backward_data(MatrixBlock filter, MatrixBlock dout, MatrixBlock outputBlock, ConvolutionParameters params) throws DMLRuntimeException {
+ params.input1 = filter;
+ params.input2 = dout;
+ params.output = outputBlock;
+ if(filter.getNumRows() != params.K || filter.getNumColumns() != params.C*params.R*params.S ||
+ dout.getNumRows() != params.N || dout.getNumColumns() != params.K*params.P*params.Q) {
+ throw new DMLRuntimeException("Incorrect input to conv2d_backward_filter");
+ }
+ if(params.stride_h <= 0 || params.stride_w <= 0) {
+ throw new DMLRuntimeException("Only positive strides supported");
+ }
+
+ if(DMLScript.STATISTICS) {
+ if(filter.isInSparseFormat() || dout.isInSparseFormat()) {
+ conv2dBwdDataSparseCount.addAndGet(1);
+ }
+ else {
+ conv2dBwdDataDenseCount.addAndGet(1);
+ }
+ }
+
+ params.reuseNonZeroedOutput = true;
+
+ int constrainedNumThreads = OptimizerUtils.getConstrainedNumThreads(params.numThreads);
+ if(!ALLOW_MULTI_THREADED_OPS || constrainedNumThreads <= 1) {
+ warnSingleThreaded();
+ MatrixBlock dout_reshaped = new MatrixBlock(params.P*params.Q, params.K, false);
+ dout_reshaped.allocateDenseBlock(true);
+ for (int n = 0; n < params.N; n++) {
+ doLoopedIm2ColConv2dBwdData(n, dout_reshaped, params);
+ }
+ }
+ else {
+ runConvTask(constrainedNumThreads, 1, TaskType.LoopedIm2ColConv2dBwdData, params);
+ }
+ }
+
+ public static void conv2d_backward_filter(MatrixBlock input, MatrixBlock dout, MatrixBlock outputBlock, ConvolutionParameters params) throws DMLRuntimeException {
params.input1 = input;
params.input2 = dout;
params.output = outputBlock;
@@ -249,42 +296,21 @@ public class LibMatrixDNN {
}
}
- if(useMemoryLessConvolution && !useMemoryLessConvolution) {
- params.reuseNonZeroedOutput = true;
- }
+ params.reuseNonZeroedOutput = true;
int constrainedNumThreads = OptimizerUtils.getConstrainedNumThreads(params.numThreads);
if(!ALLOW_MULTI_THREADED_OPS || constrainedNumThreads <= 1) {
warnSingleThreaded();
- if(useMemoryLessConvolution) {
- for (int c = 0; c < params.C; c++) {
- for (int k = 0; k < params.K; k++) {
- for (int r = 0; r < params.R; r++) {
- for (int s = 0; s < params.S; s++) {
- doConv2d_Backward_Filter(k, c, r, s, params);
- }
- }
- }
- }
- }
- else {
- MatrixBlock im2ColOutBlock = new MatrixBlock(params.C*params.R*params.S, params.P*params.Q, false);
- im2ColOutBlock.allocateDenseBlock(true);
- MatrixBlock dout_reshaped = new MatrixBlock(params.P*params.Q, params.K, false);
- dout_reshaped.allocateDenseBlock(true);
- for (int n = 0; n < params.N; n++) {
- params.output = doLoopedIm2ColConv2dBwdFilter(n, im2ColOutBlock, dout_reshaped, params.output, params);
- }
+ MatrixBlock im2ColOutBlock = new MatrixBlock(params.C*params.R*params.S, params.P*params.Q, false);
+ im2ColOutBlock.allocateDenseBlock(true);
+ MatrixBlock dout_reshaped = new MatrixBlock(params.P*params.Q, params.K, false);
+ dout_reshaped.allocateDenseBlock(true);
+ for (int n = 0; n < params.N; n++) {
+ params.output = doLoopedIm2ColConv2dBwdFilter(n, im2ColOutBlock, dout_reshaped, params.output, params);
}
}
else {
- if(useMemoryLessConvolution) {
- runConvTask(constrainedNumThreads, params.K*params.C, params.R*params.S, TaskType.LoopBasedConv2dBwdFilter, params);
- }
- else {
- runConvTask(constrainedNumThreads, 1, TaskType.LoopedIm2ColConv2dBwdFilter, params);
- }
-
+ runConvTask(constrainedNumThreads, 1, TaskType.LoopedIm2ColConv2dBwdFilter, params);
}
}
@@ -341,6 +367,24 @@ public class LibMatrixDNN {
}
}
+ private static void doLoopedIm2ColConv2dBwdData(int n, MatrixBlock dout_reshaped, ConvolutionParameters params) throws DMLRuntimeException {
+ MatrixBlock filter = params.input1;
+ MatrixBlock dout = params.input2;
+ doRotate180(n, 0, dout, dout_reshaped.denseBlock, params, true);
+ dout_reshaped.recomputeNonZeros();
+
+ MatrixBlock temp = new MatrixBlock(params.P*params.Q, params.C*params.R*params.S, false);
+ long t1 = DMLScript.STATISTICS ? System.nanoTime() : 0;
+ LibMatrixMult.matrixMult(dout_reshaped, filter, temp);
+ long t2 = DMLScript.STATISTICS ? System.nanoTime() : 0 ;
+ doCol2imOverSingleImage(n, temp, params);
+ long t3 = DMLScript.STATISTICS ? System.nanoTime() : 0 ;
+ if(DMLScript.STATISTICS) {
+ loopedConvBwdDataMatMultTime.addAndGet(t2-t1);
+ loopedConvBwdDataCol2ImTime.addAndGet(t3-t2);
+ }
+ }
+
private static MatrixBlock doLoopedIm2ColConv2dBwdFilter(int n,
MatrixBlock im2ColOutBlock, MatrixBlock dout_reshaped, MatrixBlock partialRetBlock, ConvolutionParameters params) throws DMLRuntimeException {
long nnz = 0;
@@ -359,88 +403,14 @@ public class LibMatrixDNN {
LibMatrixMult.matrixMult(im2ColOutBlock, dout_reshaped, temp);
long t4 = DMLScript.STATISTICS ? System.nanoTime() : 0 ;
if(DMLScript.STATISTICS) {
- loopedConvBwdMatMultTime.addAndGet(t4-t3);
- loopedConvBwdIm2ColTime.addAndGet(t2-t1);
+ loopedConvBwdFilterMatMultTime.addAndGet(t4-t3);
+ loopedConvBwdFilterIm2ColTime.addAndGet(t2-t1);
}
elementWiseInPlaceTransposedAddition(partialRetBlock, temp);
return partialRetBlock;
}
- private static void doConv2d_Backward_Filter(int k, int c, int r, int s, ConvolutionParameters params) throws DMLRuntimeException {
- double [] inputArray = null;
- if (!params.input1.isInSparseFormat())
- inputArray = params.input1.getDenseBlock();
- double [] doutArray = null;
- if (!params.input2.isInSparseFormat())
- doutArray = params.input2.getDenseBlock();
- double [] outputArray = params.output.getDenseBlock();
-
- double outputVal = 0;
- if(inputArray == null && doutArray == null) {
- outputVal = doConv2d_Backward_Filter_SparseSparse(k, c, r, s, params);
- }
- else if(inputArray != null && doutArray == null) {
- outputVal = doConv2d_Backward_Filter_DenseSparse(k, c, r, s, params, inputArray);
- }
- else if(inputArray == null && doutArray != null) {
- outputVal = doConv2d_Backward_Filter_SparseDense(k, c, r, s, params, doutArray);
- }
- else {
- outputVal = doConv2d_Backward_Filter_DenseDense(k, c, r, s, params, inputArray, doutArray);
- }
-
- outputArray[k*params.C*params.R*params.S + c*params.R*params.S + r*params.S + s] = outputVal;
- }
-
- private static double doConv2d_Backward_Filter_SparseDense(int k, int c, int r, int s, ConvolutionParameters params, double [] doutArray) throws DMLRuntimeException {
- double outputVal = 0;
- // To ensure h >= 0 && h < params.H
- int pMin = (int) Math.max(0, Math.ceil(((double)(params.pad_h-r))/params.stride_h));
- int qMin = (int) Math.max(0, Math.ceil(((double)(params.pad_w-s))/params.stride_w));
- // To ensure w >= 0 && w < params.W
- int pMax = (int) Math.min(params.P, Math.ceil(((double)(params.H+params.pad_h-r))/params.stride_h));
- int qMax = (int) Math.min(params.Q, Math.ceil(((double)(params.W+params.pad_w-s))/params.stride_w));
-
- // TODO: Optimize this case
- for (int n = 0; n < params.N; n++) {
- int doutOffset = n*params.K*params.P*params.Q + k*params.P*params.Q;
- for (int p = pMin; p < pMax; p++) {
- for (int q = qMin; q < qMax; q++) {
- int h = p*params.stride_h + r - params.pad_h;
- int w = q*params.stride_w + s - params.pad_w;
- outputVal += doutArray[doutOffset + p*params.Q + q]*params.input1.quickGetValue(n, c*params.H*params.W + h*params.W + w);
- }
- }
- }
-
- return outputVal;
- }
-
- private static double doConv2d_Backward_Filter_DenseDense(int k, int c, int r, int s, ConvolutionParameters params, double [] inputArray, double [] doutArray) {
- double outputVal = 0;
- // To ensure h >= 0 && h < params.H
- int pMin = (int) Math.max(0, Math.ceil(((double)(params.pad_h-r))/params.stride_h));
- int qMin = (int) Math.max(0, Math.ceil(((double)(params.pad_w-s))/params.stride_w));
- // To ensure w >= 0 && w < params.W
- int pMax = (int) Math.min(params.P, Math.ceil(((double)(params.H+params.pad_h-r))/params.stride_h));
- int qMax = (int) Math.min(params.Q, Math.ceil(((double)(params.W+params.pad_w-s))/params.stride_w));
-
- for (int n = 0; n < params.N; n++) {
- int inputOffset = n*params.C*params.H*params.W + c*params.H*params.W + s - params.pad_w;
- int doutOffset = n*params.K*params.P*params.Q + k*params.P*params.Q;
- for (int p = pMin; p < pMax; p++) {
- int h = p*params.stride_h + r - params.pad_h;
- for (int q = qMin; q < qMax; q++) {
- int w = q*params.stride_w;
- outputVal += doutArray[doutOffset + p*params.Q + q]*inputArray[inputOffset + h*params.W+w];
- }
- }
- }
-
- return outputVal;
- }
-
private static void computeTensorIndexes(int i, int j, int [] ret, int N, int C, int H, int W) throws DMLRuntimeException {
ret[0] = i;
ret[1] = j / (H*W);
@@ -448,56 +418,7 @@ public class LibMatrixDNN {
ret[3] = j % W;
}
- private static double doConv2d_Backward_Filter_DenseSparse(int k, int c, int r, int s, ConvolutionParameters params, double [] inputArray) throws DMLRuntimeException {
- MatrixBlock dout = params.input2;
- double outputVal = 0;
- Iterator<IJV> iter = dout.sparseBlock.getIterator();
- int [] tensorIndexes = new int[4];
- while(iter.hasNext()) {
- IJV ijv = iter.next();
- computeTensorIndexes(ijv.getI(), ijv.getJ(), tensorIndexes, params.N, params.K, params.P, params.Q);
- if(k == tensorIndexes[1]) {
- int n = tensorIndexes[0];
- int p = tensorIndexes[2];
- int q = tensorIndexes[3];
-
- double doutVal = ijv.getV();
- int h = p*params.stride_h + r - params.pad_h;
- int w = q*params.stride_w + s - params.pad_w;
- if(h >= 0 && h < params.H && w >= 0 && w < params.W) {
- outputVal += doutVal*inputArray[n*params.C*params.H*params.W + c*params.H*params.W + h*params.W+w];
- }
- }
- }
- return outputVal;
- }
-
- private static double doConv2d_Backward_Filter_SparseSparse(int k, int c, int r, int s, ConvolutionParameters params) throws DMLRuntimeException {
- MatrixBlock dout = params.input2;
- double outputVal = 0;
- Iterator<IJV> iter = dout.sparseBlock.getIterator();
- int [] tensorIndexes = new int[4];
-
- while(iter.hasNext()) {
- IJV ijv = iter.next();
- computeTensorIndexes(ijv.getI(), ijv.getJ(), tensorIndexes, params.N, params.K, params.P, params.Q);
- if(k == tensorIndexes[1]) {
- int n = tensorIndexes[0];
- int p = tensorIndexes[2];
- int q = tensorIndexes[3];
-
- double doutVal = ijv.getV();
- int h = p*params.stride_h + r - params.pad_h;
- int w = q*params.stride_w + s - params.pad_w;
- if(h >= 0 && h < params.H && w >= 0 && w < params.W) {
- outputVal += doutVal*params.input1.quickGetValue(n, c*params.H*params.W + h*params.W + w);
- }
- }
- }
- return outputVal;
- }
-
- public static void conv2d(MatrixBlock input, MatrixBlock filter, MatrixBlock outputBlock, ConvolutionParameters params, boolean useMemoryLessConvolution) throws DMLRuntimeException {
+ public static void conv2d(MatrixBlock input, MatrixBlock filter, MatrixBlock outputBlock, ConvolutionParameters params) throws DMLRuntimeException {
params.input1 = input;
params.input2 = filter;
params.output = outputBlock;
@@ -516,36 +437,18 @@ public class LibMatrixDNN {
}
}
- if(useMemoryLessConvolution) {
- fillInTemporaryConvolutionData(input, params);
- }
- else
- params.reuseNonZeroedOutput = true;
-
+ params.reuseNonZeroedOutput = true;
int constrainedNumThreads = OptimizerUtils.getConstrainedNumThreads(params.numThreads);
-
if(!ALLOW_MULTI_THREADED_OPS || constrainedNumThreads <= 1) {
warnSingleThreaded();
- if(useMemoryLessConvolution) {
- for (int n = 0; n < params.N; n++) {
- for (int k = 0; k < params.K; k++) {
- doLoopBasedConv2d(n, n+1, k, params);
- }
- }
- }
- else {
- MatrixBlock im2ColOutBlock = new MatrixBlock(params.C*params.R*params.S, params.P*params.Q, false);
- im2ColOutBlock.allocateDenseBlock(true);
- for (int n = 0; n < params.N; n++) {
- doLoopedIm2ColConv2d(n, im2ColOutBlock, params);
- }
+ MatrixBlock im2ColOutBlock = new MatrixBlock(params.C*params.R*params.S, params.P*params.Q, false);
+ im2ColOutBlock.allocateDenseBlock(true);
+ for (int n = 0; n < params.N; n++) {
+ doLoopedIm2ColConv2d(n, im2ColOutBlock, params);
}
}
else {
- if(useMemoryLessConvolution)
- runConvTask(constrainedNumThreads, params.K, TaskType.LoopBasedConv2d, params);
- else
- runConvTask(constrainedNumThreads, 1, TaskType.LoopedIm2ColConv2d, params);
+ runConvTask(constrainedNumThreads, 1, TaskType.LoopedIm2ColConv2d, params);
}
}
@@ -583,52 +486,6 @@ public class LibMatrixDNN {
}
- private static void fillInTemporaryConvolutionData(MatrixBlock input, ConvolutionParameters params) throws DMLRuntimeException {
- params.tmpData = new TemporaryConvolutionData();
- if(input.isInSparseFormat()) {
- params.tmpData.minIndexArrR = new int[params.H];
- params.tmpData.minIndexArrS = new int[params.W];
- for(int h = 0; h < params.H; h++) {
- for (int r = 0; r < params.R; r++) {
- // int h = p*params.stride_h + r - params.pad_h;
- if((h + params.pad_h - r) % params.stride_h == 0) {
- params.tmpData.minIndexArrR[h] = r;
- break;
- }
- }
- }
- for(int w = 0; w < params.W; w++) {
- for (int s = 0; s < params.S; s++) {
- // int h = p*params.stride_h + r - params.pad_h;
- if((w + params.pad_w - s) % params.stride_w == 0) {
- params.tmpData.minIndexArrS[w] = s;
- break;
- }
- }
- }
- }
- else {
- params.tmpData.minIndexArrR = new int[params.R];
- params.tmpData.maxIndexArrR = new int[params.R];
- params.tmpData.minIndexArrS = new int[params.S];
- params.tmpData.maxIndexArrS = new int[params.S];
- for (int r = 0; r < params.R; r++) {
- params.tmpData.minIndexArrR[r] = getMinPQ(params.pad_h, r, params.stride_h);
- params.tmpData.maxIndexArrR[r] = getMaxPQ(params.pad_h, r, params.stride_h, params.P, params.H);
- }
- for (int s = 0; s < params.S; s++) {
- params.tmpData.minIndexArrS[s] = getMinPQ(params.pad_w, s, params.stride_w);
- params.tmpData.maxIndexArrS[s] = getMaxPQ(params.pad_w, s, params.stride_w, params.Q, params.W);
- }
- params.tmpData.minCommonIndexS = params.tmpData.minIndexArrS[0];
- params.tmpData.maxCommonIndexS = params.tmpData.maxIndexArrS[0];
- for (int s = 1; s < params.S; s++) {
- params.tmpData.minCommonIndexS = Math.max(params.tmpData.minCommonIndexS, params.tmpData.minIndexArrS[s]);
- params.tmpData.maxCommonIndexS = Math.min(params.tmpData.maxCommonIndexS, params.tmpData.maxIndexArrS[s]);
- }
- }
- }
-
public static void maxpooling_backward(MatrixBlock input, MatrixBlock dout, MatrixBlock outputBlock, ConvolutionParameters params) throws DMLRuntimeException {
params.input1 = input;
params.input2 = dout;
@@ -665,236 +522,6 @@ public class LibMatrixDNN {
}
}
- private static void doLoopBasedConv2dDenseDense(int n1, int n2, int k, ConvolutionParameters params,
- double [] inputArray, double [] filterArray) {
- double [] outputArray = params.output.getDenseBlock();
- int [] minIndexArrR = params.tmpData.minIndexArrR;
- int [] maxIndexArrR = params.tmpData.maxIndexArrR;
- int [] minIndexArrS = params.tmpData.minIndexArrS;
- int [] maxIndexArrS = params.tmpData.maxIndexArrS;
-
- final int minCommonIndexS = params.tmpData.minCommonIndexS;
- final int maxCommonIndexS = params.tmpData.maxCommonIndexS;
-
- final int minS = (params.S >= 4) ? (params.S - params.S % 4) : 0;
-
- for (int n = n1; n < n2; n++) {
- for (int c = 0; c < params.C; c++) {
- for (int r = 0; r < params.R; r++) {
- final int filterOffset = k*params.C*params.R*params.S + c*params.R*params.S + r*params.S;
- for (int p = minIndexArrR[r]; p < maxIndexArrR[r]; p++) {
- final int h = p*params.stride_h + r - params.pad_h;
- final int inputOffSet = n*params.C*params.H*params.W + c*params.H*params.W + h*params.W - params.pad_w;
- final int outputOffset = n*params.K*params.P*params.Q + k*params.P*params.Q + p*params.Q;
-
- for (int q = minCommonIndexS; q < maxCommonIndexS; q++) {
- final int wOffset = inputOffSet + q*params.stride_w;
- // ------------------------------------------------------------------------
- // Efficient striding with vectorization
- final int outOffsetWithQ = outputOffset + q;
- for (int s = 0; s < minS; s += 4) {
- final int inOffsetWithS = wOffset + s;
- final int filterOffsetWithS = filterOffset + s;
- outputArray[outOffsetWithQ] += inputArray[inOffsetWithS]*filterArray[filterOffsetWithS]
- + inputArray[inOffsetWithS+1]*filterArray[filterOffsetWithS+1]
- + inputArray[inOffsetWithS+2]*filterArray[filterOffsetWithS+2]
- + inputArray[inOffsetWithS+3]*filterArray[filterOffsetWithS+3];
- }
- // ------------------------------------------------------------------------
- // Efficient striding without vectorization
- for (int s = minS; s < params.S; s++) {
- outputArray[outputOffset + q] += inputArray[wOffset + s]*filterArray[filterOffset + s];
- }
- // ------------------------------------------------------------------------
- }
- // ------------------------------------------------------------------------
- // Inefficient striding
- for (int s = 0; s < params.S; s++) {
- for (int q = minIndexArrS[s]; q < minCommonIndexS; q++) {
- final int w = q*params.stride_w + s;
- outputArray[outputOffset + q] += inputArray[inputOffSet + w]*filterArray[filterOffset + s];
- }
- for (int q = maxCommonIndexS; q < maxIndexArrS[s]; q++) {
- final int w = q*params.stride_w + s;
- outputArray[outputOffset + q] += inputArray[inputOffSet + w]*filterArray[filterOffset + s];
- }
- }
- // ------------------------------------------------------------------------
- }
- }
- }
- }
- }
-
- private static void doLoopBasedConv2dDenseSparse(int n, int k, ConvolutionParameters params, double [] inputArray) throws DMLRuntimeException {
- double [] outputArray = params.output.getDenseBlock();
- int [] minIndexArrR = params.tmpData.minIndexArrR;
- int [] maxIndexArrR = params.tmpData.maxIndexArrR;
- int [] minIndexArrS = params.tmpData.minIndexArrS;
- int [] maxIndexArrS = params.tmpData.maxIndexArrS;
- final int outputOffset = n*params.K*params.P*params.Q + k*params.P*params.Q;
-
- Iterator<IJV> iter = params.input2.sparseBlock.getIterator(k, k+1);
- int [] tensorIndexes = new int[4];
-
- while(iter.hasNext()) {
- IJV ijv = iter.next();
- computeTensorIndexes(ijv.getI(), ijv.getJ(), tensorIndexes, params.K, params.C, params.R, params.S);
- int c = tensorIndexes[1];
- int r = tensorIndexes[2];
- int s = tensorIndexes[3];
- double filterVal = ijv.getV();
- final int inputOffset = n*params.C*params.H*params.W + c*params.H*params.W + s - params.pad_w;
- for (int p = minIndexArrR[r]; p < maxIndexArrR[r]; p++) {
- final int hOffset = inputOffset + (p*params.stride_h + r - params.pad_h)*params.W;
- final int pOffset = outputOffset + p*params.Q;
- for (int q = minIndexArrS[s]; q < maxIndexArrS[s]; q++) {
- final int w = q*params.stride_w;
- outputArray[pOffset + q] += inputArray[hOffset + w]*filterVal;
- }
- }
- }
- }
-
- private static void doLoopBasedConv2dSparseDense(int n, int k, ConvolutionParameters params, double [] filterArray) throws DMLRuntimeException {
- double [] outputArray = params.output.getDenseBlock();
- int outputOffset = n*params.K*params.P*params.Q + k*params.P*params.Q;
-
- Iterator<IJV> iter = params.input1.sparseBlock.getIterator(n, n+1);
- int [] tensorIndexes = new int[4];
-
- int [] minIndexArrR = params.tmpData.minIndexArrR;
- int [] minIndexArrS = params.tmpData.minIndexArrS;
- while(iter.hasNext()) {
- IJV ijv = iter.next();
- computeTensorIndexes(ijv.getI(), ijv.getJ(), tensorIndexes, params.N, params.C, params.H, params.W);
-
- int c = tensorIndexes[1];
- int h = tensorIndexes[2];
- int w = tensorIndexes[3];
- double imgVal = ijv.getV();
- for (int r = minIndexArrR[h]; r < params.R; r += params.stride_h) {
- int filterOffset = k*params.C*params.R*params.S + c*params.R*params.S + r*params.S;
- for (int s = minIndexArrS[w]; s < params.S; s += params.stride_w) {
- int p = (int)Math.ceil(((double)(h + params.pad_h - r)) / params.stride_h);
- int q = (int)Math.ceil(((double)(w + params.pad_w - s)) / params.stride_w);
- if(p >= 0 && p < params.P && q >= 0 && q < params.Q) {
- double filterVal = filterArray[filterOffset + s];
- outputArray[outputOffset + p*params.Q + q] += imgVal*filterVal;
- }
- }
- }
- }
- }
-
- private static void doLoopBasedConv2dSparseSparse(int n, int k, ConvolutionParameters params) throws DMLRuntimeException {
- double [] outputArray = params.output.getDenseBlock();
- int [] minIndexArrR = params.tmpData.minIndexArrR;
- int [] maxIndexArrR = params.tmpData.maxIndexArrR;
- int [] minIndexArrS = params.tmpData.minIndexArrS;
- int [] maxIndexArrS = params.tmpData.maxIndexArrS;
- int outputOffset = n*params.K*params.P*params.Q + k*params.P*params.Q;
-
-
- int [] tensorIndexesImage = new int[4];
- int [] tensorIndexesFilter = new int[4];
-
- Iterator<IJV> iter = params.input1.sparseBlock.getIterator(n, n+1);
-
- while(iter.hasNext()) {
- IJV ijv = iter.next();
- computeTensorIndexes(ijv.getI(), ijv.getJ(), tensorIndexesImage, params.N, params.C, params.H, params.W);
- if(n == tensorIndexesImage[0]) {
- int c = tensorIndexesImage[1];
- int h = tensorIndexesImage[2];
- int w = tensorIndexesImage[3];
- double imgVal = ijv.getV();
-
- Iterator<IJV> iter1 = params.input2.sparseBlock.getIterator(k, k+1);
- while(iter1.hasNext()) {
- IJV ijv1 = iter1.next();
- computeTensorIndexes(ijv1.getI(), ijv1.getJ(), tensorIndexesFilter, params.K, params.C, params.R, params.S);
- if(c == tensorIndexesFilter[1]) {
- int r = tensorIndexesFilter[2];
- int s = tensorIndexesFilter[3];
- if((r-minIndexArrR[h])%params.stride_h == 0 && (s-minIndexArrS[w])%params.stride_w == 0) {
- int p = (int)Math.ceil(((double)(h + params.pad_h - r)) / params.stride_h);
- int q = (int)Math.ceil(((double)(w + params.pad_w - s)) / params.stride_w);
- if(p >= 0 && p < params.P && q >= 0 && q < params.Q) {
- double filterVal = ijv1.getV();
- outputArray[outputOffset + p*params.Q + q] += imgVal*filterVal;
- }
- }
- }
- }
- }
- }
-
- while(iter.hasNext()) {
- IJV ijv = iter.next();
- computeTensorIndexes(ijv.getI(), ijv.getJ(), tensorIndexesFilter, params.K, params.C, params.R, params.S);
- if(k == tensorIndexesFilter[0]) {
- int c = tensorIndexesFilter[1];
- int r = tensorIndexesFilter[2];
- int s = tensorIndexesFilter[3];
- double filterVal = ijv.getV();
- for (int p = minIndexArrR[r]; p < maxIndexArrR[r]; p++) {
- int h = p*params.stride_h + r - params.pad_h;
- for (int q = minIndexArrS[s]; q < maxIndexArrS[s]; q++) {
- int w = q*params.stride_w + s - params.pad_w;
- // TODO: Improve the performance of sparse sparse
- outputArray[outputOffset + p*params.Q + q] += sparseConvMultiply(filterVal, params, n, c, h, w);
- }
- }
- }
- }
- }
-
- /**
- * This is essentially memory-less operation and can be used when the memory pressure is extremely high.
- * @param n
- * @param k
- * @param params
- * @throws DMLRuntimeException
- */
- private static void doLoopBasedConv2d(int n1, int n2, int k, ConvolutionParameters params) throws DMLRuntimeException {
- double [] inputArray = null;
- if (!params.input1.isInSparseFormat())
- inputArray = params.input1.getDenseBlock();
- double [] filterArray = null;
- if (!params.input2.isInSparseFormat())
- filterArray = params.input2.getDenseBlock();
-
- if(inputArray != null && filterArray != null) {
- doLoopBasedConv2dDenseDense(n1, n2, k, params, inputArray, filterArray);
- }
- else if(inputArray != null && filterArray == null) {
- for (int n = n1; n < n2; n++)
- doLoopBasedConv2dDenseSparse(n, k, params, inputArray);
- }
- else if(inputArray == null && filterArray != null) {
- for (int n = n1; n < n2; n++)
- doLoopBasedConv2dSparseDense(n, k, params, filterArray);
- }
- else if(inputArray == null && filterArray == null) {
- for (int n = n1; n < n2; n++)
- doLoopBasedConv2dSparseSparse(n, k, params);
- }
- }
-
- private static int getMinPQ(int pad, int filterSize, int stride) {
- return Math.max(0, (int)Math.ceil(((double)(pad - filterSize))/stride));
- }
-
- private static int getMaxPQ(int pad, int filterSize, int stride, int outputSize, int inputSize) {
- return Math.min(outputSize, (int)Math.ceil(((double)(inputSize + pad - filterSize)) / stride));
- }
-
- private static double sparseConvMultiply(double filterVal, ConvolutionParameters params,
- int n, int c, int h, int w) {
- return params.input1.quickGetValue(n, c*params.H*params.W + h*params.W + w)*filterVal;
- }
-
private static void doPoolingBackward(int n, ConvolutionParameters params) throws DMLRuntimeException {
double [] inputArray = null;
if (!params.input1.isInSparseFormat())
@@ -1251,13 +878,6 @@ public class LibMatrixDNN {
}
}
- private static void runConvTask(int constrainedNumThreads, int NSize, int Z, TaskType type, ConvolutionParameters params) throws DMLRuntimeException {
- if (params.isOutputThreadSafe() && constrainedNumThreads > 1)
- runParallelConvTask(constrainedNumThreads, NSize, Z, type, params);
- else
- runSequentialConvTask(NSize, Z, type, params);
- }
-
private static void runConvTask(int constrainedNumThreads, int Z, TaskType type, ConvolutionParameters params) throws DMLRuntimeException {
if (params.isOutputThreadSafe() && constrainedNumThreads > 1)
runParallelConvTask(constrainedNumThreads, params.N, Z, type, params);
@@ -1267,13 +887,21 @@ public class LibMatrixDNN {
private static void runParallelConvTask(int constrainedNumThreads, int NSize, int Z, TaskType type, ConvolutionParameters params) throws DMLRuntimeException {
ArrayList<ConvTask> tasks = new ArrayList<ConvTask>();
- int [] taskSizes = getTaskSize(constrainedNumThreads, NSize, Z);
- for (int n = 0; n < NSize; n += taskSizes[0]) {
- for (int z = 0; z < Z; z += taskSizes[1]) {
- tasks.add(new ConvTask(n, Math.min(NSize, n+taskSizes[0]), z, Math.min(Z, z+taskSizes[1]), type, params));
+ if(NSize >= constrainedNumThreads || Z == 1) {
+ int numNTasks = (int) Math.ceil(((double) NSize) / constrainedNumThreads);
+ for (int n = 0; n < NSize; n += numNTasks) {
+ tasks.add(new ConvTask(n, Math.min(NSize, n+numNTasks), 0, Z, type, params));
+ }
+ }
+ else {
+ int [] taskSizes = getTaskSize(constrainedNumThreads, NSize, Z);
+ for (int n = 0; n < NSize; n += taskSizes[0]) {
+ for (int z = 0; z < Z; z += taskSizes[1]) {
+ tasks.add(new ConvTask(n, Math.min(NSize, n+taskSizes[0]), z, Math.min(Z, z+taskSizes[1]), type, params));
+ }
}
+ LOG.debug("Reduce number of tasks from " + (NSize*Z) + "(" + NSize + "," + Z + ") to " + tasks.size());
}
- LOG.debug("Reduce number of tasks from " + (NSize*Z) + "(" + NSize + "," + Z + ") to " + tasks.size());
ExecutorService pool = Executors.newFixedThreadPool( Math.min(constrainedNumThreads, tasks.size()) );
List<Future<Object>> taskret;
@@ -1314,19 +942,19 @@ public class LibMatrixDNN {
switch(type) {
case ReshapeCol:
for (int n = n1; n < n2; n++) {
- LibMatrixDNN.doReshapeCol(n, params);
+ doReshapeCol(n, params);
}
break;
case Rotate180:
for (int n = n1; n < n2; n++) {
- LibMatrixDNN.doRotate180(n, params);
+ doRotate180(n, params);
}
break;
case Im2Col:
long nnz = 0;
for (int n = n1; n < n2; n++) {
for (int z = z1; z < z2; z++) {
- nnz += LibMatrixDNN.doIm2colOverInputPath_NCHW(n, z, params);
+ nnz += doIm2colOverInputPath_NCHW(n, z, params);
}
}
params.outputNNZ.addAndGet(nnz);
@@ -1334,46 +962,31 @@ public class LibMatrixDNN {
case Col2Im:
for (int n = n1; n < n2; n++) {
for (int z = z1; z < z2; z++) {
- LibMatrixDNN.doCol2imOverInputPath_NCHW(n, z, params);
+ doCol2imOverInputPath_NCHW(n, z, params);
}
}
break;
case MaxPooling_Forward:
for (int n = n1; n < n2; n++) {
for (int z = z1; z < z2; z++) {
- LibMatrixDNN.doPooling(n, z, params);
+ doPooling(n, z, params);
}
}
break;
case MaxPooling_Backward:
for (int n = n1; n < n2; n++) {
- LibMatrixDNN.doPoolingBackward(n, params);
- }
- break;
- case LoopBasedConv2d:
- for (int z = z1; z < z2; z++) {
- LibMatrixDNN.doLoopBasedConv2d(n1, n2, z, params);
+ doPoolingBackward(n, params);
}
break;
case LoopedIm2ColConv2d:
MatrixBlock im2ColOutBlock = new MatrixBlock(params.C*params.R*params.S, params.P*params.Q, false);
im2ColOutBlock.allocateDenseBlock(true);
for (int n = n1; n < n2; n++) {
- LibMatrixDNN.doLoopedIm2ColConv2d(n, im2ColOutBlock, params);
- }
- break;
- case LoopBasedConv2dBwdFilter:
- for (int x = n1; x < n2; x++) {
- int k = x / params.C;
- int c = x % params.C;
- for (int y = z1; y < z2; y++) {
- int r = y / params.S;
- int s = y % params.S;
- doConv2d_Backward_Filter(k, c, r, s, params);
- }
+ doLoopedIm2ColConv2d(n, im2ColOutBlock, params);
}
break;
case LoopedIm2ColConv2dBwdFilter:
+ {
MatrixBlock im2ColOutBlock1 = new MatrixBlock(params.C*params.R*params.S, params.P*params.Q, false);
im2ColOutBlock1.allocateDenseBlock(true);
MatrixBlock partialRetBlock = new MatrixBlock(params.K, params.C*params.R*params.S, false);
@@ -1381,9 +994,19 @@ public class LibMatrixDNN {
MatrixBlock dout_reshaped = new MatrixBlock(params.P*params.Q, params.K, false);
dout_reshaped.allocateDenseBlock(true);
for (int n = n1; n < n2; n++) {
- partialRetBlock = LibMatrixDNN.doLoopedIm2ColConv2dBwdFilter(n, im2ColOutBlock1, dout_reshaped, partialRetBlock, params);
+ partialRetBlock = doLoopedIm2ColConv2dBwdFilter(n, im2ColOutBlock1, dout_reshaped, partialRetBlock, params);
}
return partialRetBlock;
+ }
+ case LoopedIm2ColConv2dBwdData:
+ {
+ MatrixBlock dout_reshaped = new MatrixBlock(params.P*params.Q, params.K, false);
+ dout_reshaped.allocateDenseBlock(true);
+ for (int n = n1; n < n2; n++) {
+ doLoopedIm2ColConv2dBwdData(n, dout_reshaped, params);
+ }
+ break;
+ }
default:
throw new DMLRuntimeException("Unsupported ConvTask:" + type.name());
}
@@ -1470,6 +1093,67 @@ public class LibMatrixDNN {
}
}
+
+ // Converts input: PQ X CRS matrix and writes to 1 X CHW
+ private static void doCol2imOverSingleImage(int n, MatrixBlock input, ConvolutionParameters params) throws DMLRuntimeException {
+ if(input.rlen != params.P*params.Q || input.clen != params.C*params.R*params.S) {
+ throw new DMLRuntimeException("Incorrect input dimensions");
+ }
+
+ double [] outputArray = null;
+ if (!params.output.isInSparseFormat())
+ outputArray = params.output.getDenseBlock();
+ else {
+ throw new DMLRuntimeException("Only dense output is implemented");
+ }
+
+ if(!input.isInSparseFormat()) {
+ double [] inputArray = input.getDenseBlock();
+ doCol2IMDenseInput(n, inputArray, outputArray, params);
+ }
+ else {
+ doCol2IMSparseInput(n, input.getSparseBlockIterator(), outputArray, params);
+ }
+ }
+
+ private static void doCol2IMSparseInput(int n, Iterator<IJV> inputIter, double [] outputArray, ConvolutionParameters params) throws DMLRuntimeException {
+ int [] tensorIndexes = new int[4];
+ while(inputIter.hasNext()) {
+ IJV ijv = inputIter.next();
+ computeTensorIndexes(ijv.getI(), ijv.getJ(), tensorIndexes, params.P*params.Q, params.C, params.R, params.S);
+ int c = tensorIndexes[1];
+ int r = tensorIndexes[2];
+ int s = tensorIndexes[3];
+ int p = ijv.getI() / params.Q;
+ int q = ijv.getI() % params.Q;
+ int h = p*params.stride_h + r - params.pad_h;
+ int w = q*params.stride_w + s - params.pad_w;
+ if(h >= 0 && h < params.H && w >= 0 && w < params.W) {
+ int outIndex = n*params.C*params.H*params.W + c*params.H*params.W + h*params.W + w;
+ outputArray[outIndex] += ijv.getV();
+ }
+ }
+ }
+
+ private static void doCol2IMDenseInput(int n, double [] inputArray, double [] outputArray, ConvolutionParameters params) throws DMLRuntimeException {
+ for (int c = 0; c < params.C; c++) {
+ for (int r = 0; r < params.R; r++) { // Get an input patch of size R X S
+ for (int s = 0; s < params.S; s++) {
+ for (int p = 0; p < params.P; p++) {
+ for (int q = 0; q < params.Q; q++) {
+ int inputIndex = (p*params.Q + q)*params.C*params.R*params.S + c*params.R*params.S + r*params.S + s;
+ int h = p*params.stride_h + r - params.pad_h;
+ int w = q*params.stride_w + s - params.pad_w;
+ if(h >= 0 && h < params.H && w >= 0 && w < params.W) {
+ int outIndex = n*params.C*params.H*params.W + c*params.H*params.W + h*params.W + w;
+ outputArray[outIndex] += inputArray[inputIndex];
+ }
+ }
+ }
+ }
+ }
+ }
+ }
private static void doCol2imOverInputPath_NCHW(int n, int c, ConvolutionParameters params) {
double [] inputArray = null;
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/d79dea92/src/main/java/org/apache/sysml/runtime/util/ConvolutionUtils.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/util/ConvolutionUtils.java b/src/main/java/org/apache/sysml/runtime/util/ConvolutionUtils.java
index db244ff..7a83278 100644
--- a/src/main/java/org/apache/sysml/runtime/util/ConvolutionUtils.java
+++ b/src/main/java/org/apache/sysml/runtime/util/ConvolutionUtils.java
@@ -190,45 +190,42 @@ public class ConvolutionUtils {
}
public static Lop constructConvolutionBackwardDataLops(Hop currentHop, ExecType et) throws HopsException, LopsException {
- return null; // Until we add CP conv2d_backward_data
+ if(DMLScript.USE_ACCELERATOR)
+ et = ExecType.GPU; // TODO: Add memory estimate checks
+ else if(et == ExecType.MR || et == ExecType.SPARK)
+ return null;
- //TODO: uncomment the following after CP conv2d_backward_data is added
-// if(DMLScript.USE_ACCELERATOR)
-// et = ExecType.GPU; // TODO: Add memory estimate checks
-// else
-// return null;
-//
-// if(currentHop != null && isConvolutionOp(currentHop, ConvOp.COL2IM)) {
-// Hop temp = currentHop.getInput().get(0);
-// if(temp != null && isTranspose(temp)) {
-// Hop matMult = temp.getInput().get(0);
-// if(matMult != null && isMatMult(matMult)) {
-// Hop rotate180 = matMult.getInput().get(0);
-// Hop filter = matMult.getInput().get(1);
-// if(isConvolutionOp(rotate180, ConvOp.ROTATE180)) {
-// ArrayList<Hop> inputs = new ArrayList<Hop>();
-// inputs.add(filter);
-// inputs.add(rotate180.getInput().get(0));
-// for(int i = 1; i < rotate180.getInput().size(); i++) {
-// inputs.add(rotate180.getInput().get(i));
-// }
-//
-// // N, C * H * W
-// long N = currentHop.computeSizeInformation(inputs.get(6));
-// long C = currentHop.computeSizeInformation(inputs.get(7));
-// long H = currentHop.computeSizeInformation(inputs.get(8));
-// long W = currentHop.computeSizeInformation(inputs.get(9));
-// long rlen = N;
-// long clen = ConvolutionOp.getExtractedVal(C, H, W);
-// return ConvolutionOp.constructFusedConvolutionLops(et, inputs, ConvOp.DIRECT_CONV2D_BACKWARD_DATA, (ConvolutionOp) rotate180, rlen, clen);
-//
-//
-// }
-// }
-// }
-// }
-//
-// return null;
+ if(currentHop != null && isConvolutionOp(currentHop, ConvOp.COL2IM)) {
+ Hop temp = currentHop.getInput().get(0);
+ if(temp != null && isTranspose(temp)) {
+ Hop matMult = temp.getInput().get(0);
+ if(matMult != null && isMatMult(matMult)) {
+ Hop rotate180 = matMult.getInput().get(0);
+ Hop filter = matMult.getInput().get(1);
+ if(isConvolutionOp(rotate180, ConvOp.ROTATE180)) {
+ ArrayList<Hop> inputs = new ArrayList<Hop>();
+ inputs.add(filter);
+ inputs.add(rotate180.getInput().get(0));
+ for(int i = 1; i < rotate180.getInput().size(); i++) {
+ inputs.add(rotate180.getInput().get(i));
+ }
+
+ // N, C * H * W
+ long N = currentHop.computeSizeInformation(inputs.get(6));
+ long C = currentHop.computeSizeInformation(inputs.get(7));
+ long H = currentHop.computeSizeInformation(inputs.get(8));
+ long W = currentHop.computeSizeInformation(inputs.get(9));
+ long rlen = N;
+ long clen = ConvolutionOp.getExtractedVal(C, H, W);
+ return ConvolutionOp.constructFusedConvolutionLops(et, inputs, ConvOp.DIRECT_CONV2D_BACKWARD_DATA, (ConvolutionOp) rotate180, rlen, clen);
+
+
+ }
+ }
+ }
+ }
+
+ return null;
}