You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@systemml.apache.org by ni...@apache.org on 2017/04/30 18:16:57 UTC
[2/3] incubator-systemml git commit: [SYSTEMML-769] Adding support
for native BLAS for Linux
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/39a37ae4/src/main/java/org/apache/sysml/runtime/instructions/cp/ConvolutionCPInstruction.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/instructions/cp/ConvolutionCPInstruction.java b/src/main/java/org/apache/sysml/runtime/instructions/cp/ConvolutionCPInstruction.java
index f839990..c794932 100644
--- a/src/main/java/org/apache/sysml/runtime/instructions/cp/ConvolutionCPInstruction.java
+++ b/src/main/java/org/apache/sysml/runtime/instructions/cp/ConvolutionCPInstruction.java
@@ -21,15 +21,20 @@ package org.apache.sysml.runtime.instructions.cp;
import java.util.ArrayList;
import java.util.Arrays;
+
+import org.apache.sysml.conf.ConfigurationManager;
+import org.apache.sysml.conf.DMLConfig;
import org.apache.sysml.runtime.DMLRuntimeException;
import org.apache.sysml.runtime.controlprogram.context.ExecutionContext;
import org.apache.sysml.runtime.functionobjects.SwapIndex;
import org.apache.sysml.runtime.instructions.InstructionUtils;
import org.apache.sysml.runtime.matrix.data.ConvolutionParameters;
import org.apache.sysml.runtime.matrix.data.LibMatrixDNN;
+import org.apache.sysml.runtime.matrix.data.LibMatrixNative;
import org.apache.sysml.runtime.matrix.data.MatrixBlock;
import org.apache.sysml.runtime.matrix.operators.ReorgOperator;
import org.apache.sysml.runtime.util.ConvolutionUtils;
+import org.apache.sysml.utils.NativeHelper;
public class ConvolutionCPInstruction extends UnaryCPInstruction
{
@@ -131,7 +136,7 @@ public class ConvolutionCPInstruction extends UnaryCPInstruction
return new ConvolutionCPInstruction(in, out, opcode, str, stride,
padding, input_shape, filter_shape, k);
}
- else if (opcode.equalsIgnoreCase("maxpooling_backward")
+ else if (opcode.equalsIgnoreCase("maxpooling_backward") || opcode.equalsIgnoreCase("relu_maxpooling_backward")
|| opcode.equalsIgnoreCase("conv2d")
|| opcode.equalsIgnoreCase("conv2d_backward_filter")
|| opcode.equalsIgnoreCase("conv2d_backward_data")) {
@@ -288,6 +293,17 @@ public class ConvolutionCPInstruction extends UnaryCPInstruction
ec.setMatrixOutput(getOutputVariableName(), outputBlock);
}
+ // Converts a small sparse filter to dense in place (so more calls can go native, e.g. sparse filter with dense input)
+ // and returns whether the filter is still sparse. Assumption: enableNative && NativeHelper.isNativeLibraryLoaded() is true.
+ private boolean isFilterSparse(MatrixBlock filter) throws DMLRuntimeException {
+ long numElems = ((long) filter.getNumRows())*filter.getNumColumns(); // widen before multiplying so the product cannot overflow int
+ // Densify only filters below 10e+6 cells; the original note describes this as "less than 10 MB in dense format"
+ // and adds that a 1 MB threshold is still sufficient for common CNNs. NOTE(review): confirm cells vs. bytes.
+ if(filter.isInSparseFormat() && numElems < 10e+6)
+ filter.sparseToDense();
+ return filter.isInSparseFormat();
+ }
+
@Override
public void processInstruction(ExecutionContext ec)
@@ -326,6 +342,7 @@ public class ConvolutionCPInstruction extends UnaryCPInstruction
int Q = (int) ConvolutionUtils.getQ(W, S, stride_w, pad_w);
ConvolutionParameters params = new ConvolutionParameters(N, C, H, W, K, R, S, stride_h, stride_w, pad_h, pad_w, _numThreads);
+ params.enableNative = ConfigurationManager.getDMLConfig().getBooleanValue(DMLConfig.NATIVE_BLAS) && NativeHelper.isNativeLibraryLoaded();
if (instOpcode.equalsIgnoreCase("maxpooling") || instOpcode.equalsIgnoreCase("relu_maxpooling")) {
if(matBlock.isEmptyBlock()) {
outputBlock = new MatrixBlock(N, C*P*Q, true);
@@ -337,14 +354,17 @@ public class ConvolutionCPInstruction extends UnaryCPInstruction
LibMatrixDNN.maxpooling(matBlock, outputBlock, params);
}
}
- else if (instOpcode.equalsIgnoreCase("maxpooling_backward")) {
+ else if (instOpcode.equalsIgnoreCase("maxpooling_backward") || instOpcode.equalsIgnoreCase("relu_maxpooling_backward")) {
MatrixBlock dout = ec.getMatrixInput(_in2.getName());
if(matBlock.isEmptyBlock() || dout.isEmptyBlock()) {
outputBlock = new MatrixBlock(N, C*H*W, true);
}
else {
outputBlock = getDenseOutputBlock(N, C*H*W);
- LibMatrixDNN.maxpoolingBackward(matBlock, dout, outputBlock, params);
+ if(instOpcode.equalsIgnoreCase("maxpooling_backward"))
+ LibMatrixDNN.maxpoolingBackward(matBlock, dout, outputBlock, params, false);
+ else
+ LibMatrixDNN.maxpoolingBackward(matBlock, dout, outputBlock, params, true);
}
ec.releaseMatrixInput(_in2.getName());
}
@@ -355,7 +375,10 @@ public class ConvolutionCPInstruction extends UnaryCPInstruction
}
else {
outputBlock = getDenseOutputBlock(N, K*P*Q);
- LibMatrixDNN.conv2d(matBlock, filter, outputBlock, params);
+ if(params.enableNative && !isFilterSparse(filter) && !matBlock.isInSparseFormat())
+ LibMatrixNative.conv2d(matBlock, filter, outputBlock, params);
+ else
+ LibMatrixDNN.conv2d(matBlock, filter, outputBlock, params);
}
ec.releaseMatrixInput(_in2.getName());
}
@@ -367,9 +390,13 @@ public class ConvolutionCPInstruction extends UnaryCPInstruction
}
else {
outputBlock = getDenseOutputBlock(N, K*P*Q);
- if(!bias.isEmptyBlock())
+ if(!bias.isEmptyBlock()) {
params.bias = bias;
- LibMatrixDNN.conv2d(matBlock, filter, outputBlock, params);
+ }
+ if(params.enableNative && !isFilterSparse(filter) && !matBlock.isInSparseFormat())
+ LibMatrixNative.conv2d(matBlock, filter, outputBlock, params);
+ else
+ LibMatrixDNN.conv2d(matBlock, filter, outputBlock, params);
}
ec.releaseMatrixInput(_in3.getName());
ec.releaseMatrixInput(_in2.getName());
@@ -381,7 +408,10 @@ public class ConvolutionCPInstruction extends UnaryCPInstruction
}
else {
outputBlock = getDenseOutputBlock(K, C*R*S);
- LibMatrixDNN.conv2dBackwardFilter(matBlock, dout, outputBlock, params);
+ if(params.enableNative && !matBlock.isInSparseFormat() && !dout.isInSparseFormat())
+ LibMatrixNative.conv2dBackwardFilter(matBlock, dout, outputBlock, params);
+ else
+ LibMatrixDNN.conv2dBackwardFilter(matBlock, dout, outputBlock, params);
}
ec.releaseMatrixInput(_in2.getName());
}
@@ -392,7 +422,10 @@ public class ConvolutionCPInstruction extends UnaryCPInstruction
}
else {
outputBlock = getDenseOutputBlock(N, C * H * W);
- LibMatrixDNN.conv2dBackwardData(matBlock, dout, outputBlock, params);
+ if(params.enableNative && !isFilterSparse(matBlock) && !dout.isInSparseFormat())
+ LibMatrixNative.conv2dBackwardData(matBlock, dout, outputBlock, params);
+ else
+ LibMatrixDNN.conv2dBackwardData(matBlock, dout, outputBlock, params);
}
ec.releaseMatrixInput(_in2.getName());
}
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/39a37ae4/src/main/java/org/apache/sysml/runtime/instructions/gpu/ConvolutionGPUInstruction.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/instructions/gpu/ConvolutionGPUInstruction.java b/src/main/java/org/apache/sysml/runtime/instructions/gpu/ConvolutionGPUInstruction.java
index 497b182..9d4cd1f 100644
--- a/src/main/java/org/apache/sysml/runtime/instructions/gpu/ConvolutionGPUInstruction.java
+++ b/src/main/java/org/apache/sysml/runtime/instructions/gpu/ConvolutionGPUInstruction.java
@@ -141,7 +141,7 @@ public class ConvolutionGPUInstruction extends GPUInstruction
return new ConvolutionGPUInstruction(in1, in2, in3, out, opcode, str, stride,
padding, input_shape, filter_shape);
}
- else if (opcode.equalsIgnoreCase("maxpooling") || opcode.equalsIgnoreCase("relu_maxpooling")) {
+ else if (opcode.equalsIgnoreCase("maxpooling")) {
InstructionUtils.checkNumFields(parts, 14);
CPOperand in1 = new CPOperand(parts[1]);
CPOperand out = new CPOperand(parts[14]);
@@ -303,7 +303,7 @@ public class ConvolutionGPUInstruction extends GPUInstruction
LibMatrixCUDA.conv2dBackwardData(ec.getGPUContext(), getExtendedOpcode(), filter, dout, out, N, C, H, W,
K, R, S, pad_h, pad_w, stride_h, stride_w, P, Q);
}
- else if (instOpcode.equalsIgnoreCase("maxpooling") || instOpcode.equalsIgnoreCase("relu_maxpooling")) {
+ else if (instOpcode.equalsIgnoreCase("maxpooling")) {
MatrixObject image = getMatrixInputForGPUInstruction(ec, _input1.getName());
if(image.getNumRows() != N || image.getNumColumns() != C*H*W)
@@ -315,9 +315,6 @@ public class ConvolutionGPUInstruction extends GPUInstruction
if(instOpcode.equalsIgnoreCase("maxpooling"))
LibMatrixCUDA.maxpooling(ec.getGPUContext(), getExtendedOpcode(), image, out, N, C, H, W,
K, R, S, pad_h, pad_w, stride_h, stride_w, P, Q);
- else
- LibMatrixCUDA.reluMaxpooling(ec.getGPUContext(), getExtendedOpcode(), image, out, N, C, H, W,
- K, R, S, pad_h, pad_w, stride_h, stride_w, P, Q);
}
else if (instOpcode.equalsIgnoreCase("maxpooling_backward")) {
MatrixObject image = getMatrixInputForGPUInstruction(ec, _input1.getName());
@@ -341,7 +338,7 @@ public class ConvolutionGPUInstruction extends GPUInstruction
// release inputs/outputs
ec.releaseMatrixInputForGPUInstruction(_input1.getName());
- if (!( instOpcode.equalsIgnoreCase("maxpooling") || instOpcode.equalsIgnoreCase("relu_maxpooling")) )
+ if ( !instOpcode.equalsIgnoreCase("maxpooling") )
ec.releaseMatrixInputForGPUInstruction(_input2.getName());
if (instOpcode.equalsIgnoreCase("conv2d_bias_add"))
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/39a37ae4/src/main/java/org/apache/sysml/runtime/instructions/spark/ConvolutionSPInstruction.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/instructions/spark/ConvolutionSPInstruction.java b/src/main/java/org/apache/sysml/runtime/instructions/spark/ConvolutionSPInstruction.java
index b485233..f687cc0 100644
--- a/src/main/java/org/apache/sysml/runtime/instructions/spark/ConvolutionSPInstruction.java
+++ b/src/main/java/org/apache/sysml/runtime/instructions/spark/ConvolutionSPInstruction.java
@@ -25,6 +25,8 @@ import java.util.Iterator;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.function.PairFlatMapFunction;
import org.apache.spark.broadcast.Broadcast;
+import org.apache.sysml.conf.ConfigurationManager;
+import org.apache.sysml.conf.DMLConfig;
import org.apache.sysml.parser.Expression.DataType;
import org.apache.sysml.parser.Expression.ValueType;
import org.apache.sysml.runtime.DMLRuntimeException;
@@ -41,6 +43,7 @@ import org.apache.sysml.runtime.matrix.MatrixFormatMetaData;
import org.apache.sysml.runtime.matrix.data.ConvolutionParameters;
import org.apache.sysml.runtime.matrix.data.InputInfo;
import org.apache.sysml.runtime.matrix.data.LibMatrixDNN;
+import org.apache.sysml.runtime.matrix.data.LibMatrixNative;
import org.apache.sysml.runtime.matrix.data.MatrixBlock;
import org.apache.sysml.runtime.matrix.data.MatrixIndexes;
import org.apache.sysml.runtime.matrix.data.OutputInfo;
@@ -281,7 +284,8 @@ public class ConvolutionSPInstruction extends UnarySPInstruction {
int Q = (int) ConvolutionUtils.getQ(W, S, stride_w, pad_w);
ConvolutionParameters params = new ConvolutionParameters(numRowsPerBlock, C, H, W, K, R, S, stride_h, stride_w, pad_h, pad_w, 1);
- JavaPairRDD<MatrixIndexes,MatrixBlock> out = inputRDD.mapPartitionsToPair(new RDDConv2dMapMMFunction(filterBroadcast, params, instOpcode, biasBroadcast, mcRdd.getRows()), true);
+ boolean enableNativeBLAS = ConfigurationManager.getDMLConfig().getBooleanValue(DMLConfig.NATIVE_BLAS);
+ JavaPairRDD<MatrixIndexes,MatrixBlock> out = inputRDD.mapPartitionsToPair(new RDDConv2dMapMMFunction(filterBroadcast, params, instOpcode, biasBroadcast, mcRdd.getRows(), enableNativeBLAS), true);
//put output RDD handle into symbol table
sec.setRDDHandleForVariable(output.getName(), out);
@@ -316,15 +320,16 @@ public class ConvolutionSPInstruction extends UnarySPInstruction {
Broadcast<MatrixBlock> filterBroadcast = null;
Broadcast<MatrixBlock> biasBroadcast = null;
ConvolutionParameters params = null;
- String instOpcode = null;
+ String instOpcode = null; boolean enableNative;
long numRows = 0;
public RDDConv2dMapMMFunction(Broadcast<MatrixBlock> filterBroadcast,
- ConvolutionParameters params, String instOpcode, Broadcast<MatrixBlock> biasBroadcast, long numRows) {
+ ConvolutionParameters params, String instOpcode, Broadcast<MatrixBlock> biasBroadcast, long numRows, boolean enableNativeBLAS) {
this.filterBroadcast = filterBroadcast;
this.params = params;
this.instOpcode = instOpcode;
this.biasBroadcast = biasBroadcast;
this.numRows = numRows;
+ this.enableNative = enableNativeBLAS;
}
private MatrixBlock processRectangularBlock(MatrixBlock matBlock) throws Exception {
@@ -336,7 +341,10 @@ public class ConvolutionSPInstruction extends UnarySPInstruction {
}
else {
outputBlock = getDenseOutputBlock(params.N, params.K*params.P*params.Q);
- LibMatrixDNN.conv2d(matBlock, filter, outputBlock, params);
+ if(enableNative)
+ LibMatrixNative.conv2d(matBlock, filter, outputBlock, params);
+ else
+ LibMatrixDNN.conv2d(matBlock, filter, outputBlock, params);
}
}
else if (instOpcode.equalsIgnoreCase("conv2d_bias_add")) {
@@ -349,7 +357,10 @@ public class ConvolutionSPInstruction extends UnarySPInstruction {
outputBlock = getDenseOutputBlock(params.N, params.K*params.P*params.Q);
if(!bias.isEmptyBlock())
params.bias = bias;
- LibMatrixDNN.conv2d(matBlock, filter, outputBlock, params);
+ if(enableNative)
+ LibMatrixNative.conv2d(matBlock, filter, outputBlock, params);
+ else
+ LibMatrixDNN.conv2d(matBlock, filter, outputBlock, params);
}
}
else if(instOpcode.equalsIgnoreCase("maxpooling") || instOpcode.equalsIgnoreCase("relu_maxpooling")) {
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/39a37ae4/src/main/java/org/apache/sysml/runtime/matrix/data/ConvolutionParameters.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/matrix/data/ConvolutionParameters.java b/src/main/java/org/apache/sysml/runtime/matrix/data/ConvolutionParameters.java
index 3f0437f..11e74ca 100644
--- a/src/main/java/org/apache/sysml/runtime/matrix/data/ConvolutionParameters.java
+++ b/src/main/java/org/apache/sysml/runtime/matrix/data/ConvolutionParameters.java
@@ -34,7 +34,7 @@ public class ConvolutionParameters implements Serializable {
public int K; public int R; public int S; public int stride_h; public int stride_w; public int pad_h; public int pad_w;
public int P; public int Q; public int numThreads;
-
+ public boolean enableNative = false;
public MatrixBlock input1; public MatrixBlock input2; public MatrixBlock output;
public MatrixBlock bias;
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/39a37ae4/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCUDA.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCUDA.java b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCUDA.java
index 9ddd87a..56360f8 100644
--- a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCUDA.java
+++ b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCUDA.java
@@ -1043,41 +1043,6 @@ public class LibMatrixCUDA {
}
/**
- * performs relu followed by maxpooling on GPU by exploiting cudnnPoolingForward(...)
- * @param gCtx a valid {@link GPUContext}
- * @param instName the invoking instruction's name for record {@link Statistics}.
- * @param image image as matrix object
- * @param outputBlock output matrix
- * @param N batch size
- * @param C number of channels
- * @param H height of image
- * @param W width of image
- * @param K number of filters
- * @param R height of filter
- * @param S width of filter
- * @param pad_h vertical padding
- * @param pad_w horizontal padding
- * @param stride_h horizontal stride
- * @param stride_w vertical stride
- * @param P (H - R + 1 + 2*pad_h)/stride_h
- * @param Q (W - S + 1 + 2*pad_w)/stride_w
- * @throws DMLRuntimeException if DMLRuntimeException occurs
- */
- public static void reluMaxpooling(GPUContext gCtx, String instName, MatrixObject image,
- MatrixObject outputBlock, int N, int C, int H, int W, int K, int R,
- int S, int pad_h, int pad_w, int stride_h, int stride_w, int P,
- int Q) throws DMLRuntimeException {
- LOG.trace("GPU : reluMaxpooling" + ", GPUContext=" + gCtx);
- cudnnTensorDescriptor srcTensorDesc = allocateTensorDescriptor(gCtx, image, N, C, H, W);
- long size = image.getNumRows() * image.getNumColumns() * Sizeof.DOUBLE;
- Pointer tmp = gCtx.allocate(size);
- performCuDNNReLU(gCtx, instName, image, tmp, srcTensorDesc);
- //cudaDeviceSynchronize; // It seemed like the cudnn operation in performCuDNNReLU was being done aysnchronously, this adds the neccesary sync
- performMaxpooling(gCtx, instName, tmp, srcTensorDesc, outputBlock, N, C, H, W, K, R, S, pad_h, pad_w, stride_h, stride_w, P, Q);
- gCtx.cudaFreeHelper(tmp);
- }
-
- /**
* Performs maxpoolingBackward on GPU by exploiting cudnnPoolingBackward(...)
* This method computes the backpropogation errors for previous layer of maxpooling operation
* @param gCtx a valid {@link GPUContext}
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/39a37ae4/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNN.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNN.java b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNN.java
index cd88b95..6a43917 100644
--- a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNN.java
+++ b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNN.java
@@ -36,6 +36,8 @@ import org.apache.sysml.runtime.DMLRuntimeException;
import org.apache.sysml.runtime.instructions.InstructionUtils;
import org.apache.sysml.runtime.matrix.operators.BinaryOperator;
import org.apache.sysml.runtime.util.ConvolutionUtils;
+import org.apache.sysml.utils.NativeHelper;
+import org.apache.sysml.utils.Statistics;
/**
* This class allows users to invoke deep learning related operations
@@ -57,10 +59,10 @@ public class LibMatrixDNN {
public static boolean DISPLAY_STATISTICS = false; //conv2d summaries in stats output
private enum TaskType {
- MaxPooling_Forward, MaxPooling_Backward,
+ MaxPooling_Forward, MaxPooling_Backward, MaxPooling_Relu_Backward,
// Alternate approaches that we tried but the performance was unsatisfactory be included: direct, non-looped im2col
LoopedIm2ColConv2d, LoopedIm2ColConv2dBwdFilter, LoopedIm2ColConv2dBwdData,
- BiasAdd, ReluBackward, BiasMultiply
+ ReluBackward
}
// ------------------------------------------------------------------------------------------------
@@ -141,6 +143,30 @@ public class LibMatrixDNN {
// ------------------------------------------------------------------------------------------------
/**
+ * This method performs convolution (i.e. cross-correlation) operation on input
+ * and writes the result into outputBlock, whose non-zero count is recomputed before returning.
+ * @param input input batch
+ * @param filter filter
+ * @param outputBlock output of convolution
+ * @param params convolution parameters
+ * @throws DMLRuntimeException if DMLRuntimeException occurs
+ */
+ public static void conv2d(MatrixBlock input, MatrixBlock filter, MatrixBlock outputBlock, ConvolutionParameters params) throws DMLRuntimeException {
+ LibMatrixDNN.checkInputsConv2d(input, filter, outputBlock, params);
+ // A sparse bias, when present, is converted to dense in place before the task runs.
+ if(params.bias != null && params.bias.isInSparseFormat())
+ params.bias.sparseToDense(); // Since bias is extremely small array
+ // NOTE(review): this counter is named for native calls but is incremented here in the Java path - confirm intent.
+ if(isEligibleForConv2dSparse(params))
+ Statistics.numNativeLibMatrixDNNCalls.increment();
+ // Dispatch the LoopedIm2ColConv2d task type using the inputs registered on params.
+ runConvTask(TaskType.LoopedIm2ColConv2d, params);
+
+ //post-processing: maintain nnz
+ outputBlock.recomputeNonZeros();
+ }
+
+ /**
* This method computes the backpropogation errors for previous layer of convolution operation
*
* @param filter filter used in conv2d
@@ -150,25 +176,10 @@ public class LibMatrixDNN {
* @throws DMLRuntimeException if DMLRuntimeException occurs
*/
public static void conv2dBackwardData(MatrixBlock filter, MatrixBlock dout, MatrixBlock outputBlock, ConvolutionParameters params) throws DMLRuntimeException {
- params.input1 = filter;
- params.input2 = dout;
- params.output = outputBlock;
- if(filter.getNumRows() != params.K || filter.getNumColumns() != params.C*params.R*params.S ||
- dout.getNumRows() != params.N || dout.getNumColumns() != params.K*params.P*params.Q) {
- throw new DMLRuntimeException("Incorrect input to conv2d_backward_filter");
- }
- if(params.stride_h <= 0 || params.stride_w <= 0) {
- throw new DMLRuntimeException("Only positive strides supported");
- }
+ checkInputsConv2dBackwardData(filter, dout, outputBlock, params);
- if(DMLScript.STATISTICS && DISPLAY_STATISTICS) {
- if(filter.isInSparseFormat() || dout.isInSparseFormat()) {
- conv2dBwdDataSparseCount.addAndGet(1);
- }
- else {
- conv2dBwdDataDenseCount.addAndGet(1);
- }
- }
+ if(isEligibleForConv2dBackwardDataDense(params))
+ Statistics.numNativeLibMatrixDNNCalls.increment();
runConvTask(TaskType.LoopedIm2ColConv2dBwdData, params);
@@ -186,17 +197,71 @@ public class LibMatrixDNN {
* @throws DMLRuntimeException if DMLRuntimeException occurs
*/
public static void conv2dBackwardFilter(MatrixBlock input, MatrixBlock dout, MatrixBlock outputBlock, ConvolutionParameters params) throws DMLRuntimeException {
- params.input1 = input;
+ checkInputsConv2dBackwardFilter(input, dout, outputBlock, params);
+
+ if(isEligibleForConv2dBackwardFilterSparseDense(params))
+ Statistics.numNativeLibMatrixDNNCalls.increment();
+
+ runConvTask(TaskType.LoopedIm2ColConv2dBwdFilter, params);
+
+ //post-processing: maintain nnz
+ outputBlock.recomputeNonZeros();
+ }
+
+ // Dimension checks: throw DMLRuntimeException (msg plus the offending values) unless lhs equals rhs, or the product rhs1*rhs2*rhs3.
+ private static void checkOrThrowException(String msg, long lhs, long rhs) throws DMLRuntimeException {
+ if(lhs != rhs) // single expected value
+ throw new DMLRuntimeException(msg + ":" + lhs + " != " + rhs);
+ }
+ private static void checkOrThrowException(String msg, long lhs, long rhs1, long rhs2, long rhs3) throws DMLRuntimeException {
+ if(lhs != (rhs1*rhs2*rhs3)) // expected value is the product of three factors, multiplied as long
+ throw new DMLRuntimeException(msg + ":" + lhs + " != (" + rhs1 + " * " + rhs2 + " * " + rhs3 + ")"); // close the parenthesis opened in the message
+ }
+ // Validates conv2d_backward_data arguments against params and registers filter, dout and outputBlock on params.
+ static void checkInputsConv2dBackwardData(MatrixBlock filter, MatrixBlock dout, MatrixBlock outputBlock, ConvolutionParameters params) throws DMLRuntimeException {
+ params.input1 = filter;
params.input2 = dout;
params.output = outputBlock;
- if(input.getNumRows() != params.N || input.getNumColumns() != params.C*params.H*params.W ||
- dout.getNumRows() != params.N || dout.getNumColumns() != params.K*params.P*params.Q) {
- throw new DMLRuntimeException("Incorrect input to conv2d_backward_filter");
- }
- if(params.stride_h <= 0 || params.stride_w <= 0) {
- throw new DMLRuntimeException("Only positive strides supported");
+ checkOrThrowException("Incorrect input to conv2d_backward_data: Number of rows of input filter != "
+ + "number of filters in filter_shape", filter.getNumRows(), params.K);
+ checkOrThrowException("Incorrect input to conv2d_backward_data: Number of columns of input filter != "
+ + "channels*filter_height*filter_width in filter_shape", filter.getNumColumns(), params.C, params.R, params.S); // R, S = filter height, width
+ checkOrThrowException("Incorrect input to conv2d_backward_data: Number of rows of input errors != "
+ + "batch size in input_shape", dout.getNumRows(), params.N);
+ checkOrThrowException("Incorrect input to conv2d_backward_data: Number of columns of input errors != "
+ + "expected input error channels*height*width", dout.getNumColumns(), params.K, params.P, params.Q);
+ if(params.stride_h <= 0 || params.stride_w <= 0)
+ throw new DMLRuntimeException("Only positive strides supported:" + params.stride_h + ", " + params.stride_w);
+ // Count sparse vs. dense invocations only when statistics display is switched on.
+ if(DMLScript.STATISTICS && DISPLAY_STATISTICS) {
+ if(filter.isInSparseFormat() || dout.isInSparseFormat()) {
+ conv2dBwdDataSparseCount.addAndGet(1);
+ }
+ else {
+ conv2dBwdDataDenseCount.addAndGet(1);
+ }
}
+ int constrainedNumThreads = OptimizerUtils.getConstrainedNumThreads(params.numThreads);
+ if (!(ALLOW_MULTI_THREADED_OPS && params.isOutputThreadSafe() && constrainedNumThreads > 1)) // otherwise force a single thread
+ params.numThreads = 1;
+ }
+
+ static void checkInputsConv2dBackwardFilter(MatrixBlock input, MatrixBlock dout, MatrixBlock outputBlock, ConvolutionParameters params) throws DMLRuntimeException {
+ params.input1 = input;
+ params.input2 = dout;
+ params.output = outputBlock;
+ checkOrThrowException("Incorrect input to conv2d_backward_filter: Number of rows of input data != "
+ + "batch size in input_shape", input.getNumRows(), params.N);
+ checkOrThrowException("Incorrect input to conv2d_backward_filter: Number of columns of input data != "
+ + "channels*input_height*input_height in input_shape", input.getNumColumns(), params.C, params.H, params.W);
+ checkOrThrowException("Incorrect input to conv2d_backward_filter: Number of rows of input errors != "
+ + "batch size in input_shape", dout.getNumRows(), params.N);
+ checkOrThrowException("Incorrect input to conv2d_backward_filter: Number of columns of input errors != "
+ + "expected input error channels*height*width", dout.getNumColumns(), params.K, params.P, params.Q);
+ if(params.stride_h <= 0 || params.stride_w <= 0)
+ throw new DMLRuntimeException("Only positive strides supported:" + params.stride_h + ", " + params.stride_w);
+
if(DMLScript.STATISTICS && DISPLAY_STATISTICS) {
if(input.isInSparseFormat() || dout.isInSparseFormat()) {
conv2dBwdFilterSparseCount.addAndGet(1);
@@ -205,11 +270,6 @@ public class LibMatrixDNN {
conv2dBwdFilterDenseCount.addAndGet(1);
}
}
-
- runConvTask(TaskType.LoopedIm2ColConv2dBwdFilter, params);
-
- //post-processing: maintain nnz
- outputBlock.recomputeNonZeros();
}
/**
@@ -252,11 +312,10 @@ public class LibMatrixDNN {
MatrixBlock filter = params.input1;
MatrixBlock dout = params.input2;
doRotate180(n, 0, dout, dout_reshaped.denseBlock, params, true);
- dout_reshaped.recomputeNonZeros();
MatrixBlock temp = new MatrixBlock(params.P*params.Q, params.C*params.R*params.S, false);
long t1 = DMLScript.STATISTICS && DISPLAY_STATISTICS ? System.nanoTime() : 0;
- LibMatrixMult.matrixMult(dout_reshaped, filter, temp, false);
+ singleThreadedMatMult(dout_reshaped, filter, temp, true, false, params);
long t2 = DMLScript.STATISTICS && DISPLAY_STATISTICS ? System.nanoTime() : 0 ;
doCol2imOverSingleImage(n, temp, params);
long t3 = DMLScript.STATISTICS && DISPLAY_STATISTICS ? System.nanoTime() : 0 ;
@@ -270,15 +329,13 @@ public class LibMatrixDNN {
MatrixBlock im2ColOutBlock, MatrixBlock dout_reshaped, MatrixBlock partialRetBlock, ConvolutionParameters params, double [] tempIm2ColArr) throws DMLRuntimeException {
long t1 = DMLScript.STATISTICS && DISPLAY_STATISTICS ? System.nanoTime() : 0;
doIm2col(n, im2ColOutBlock, params, tempIm2ColArr);
- im2ColOutBlock.recomputeNonZeros();
long t2 = DMLScript.STATISTICS && DISPLAY_STATISTICS ? System.nanoTime() : 0 ;
doRotate180(n, 0, params.input2, dout_reshaped.denseBlock, params, true);
- dout_reshaped.recomputeNonZeros();
MatrixBlock temp = new MatrixBlock(params.C*params.R*params.S, params.K, false);
long t3 = DMLScript.STATISTICS && DISPLAY_STATISTICS ? System.nanoTime() : 0 ;
- LibMatrixMult.matrixMult(im2ColOutBlock, dout_reshaped, temp, false);
+ singleThreadedMatMult(im2ColOutBlock, dout_reshaped, temp, true, true, params);
long t4 = DMLScript.STATISTICS && DISPLAY_STATISTICS ? System.nanoTime() : 0 ;
if(DMLScript.STATISTICS && DISPLAY_STATISTICS) {
loopedConvBwdFilterMatMultTime.addAndGet(t4-t3);
@@ -298,15 +355,21 @@ public class LibMatrixDNN {
ret[2] = j % W;
}
- public static void conv2d(MatrixBlock input, MatrixBlock filter, MatrixBlock outputBlock, ConvolutionParameters params) throws DMLRuntimeException {
+ static void checkInputsConv2d(MatrixBlock input, MatrixBlock filter, MatrixBlock outputBlock, ConvolutionParameters params) throws DMLRuntimeException {
params.input1 = input;
params.input2 = filter;
params.output = outputBlock;
- if(input.getNumRows() != params.N || input.getNumColumns() != params.C*params.H*params.W ||
- filter.getNumRows() != params.K || filter.getNumColumns() != params.C*params.R*params.S) {
- throw new DMLRuntimeException("Incorrect input to conv2d: " + input.getNumRows());
- }
+ checkOrThrowException("Incorrect input to conv2d: Number of rows of input filter != "
+ + "number of filters in filter_shape", filter.getNumRows(), params.K);
+ checkOrThrowException("Incorrect input to conv2d: Number of columns of input filter != "
+ + "channels*filter_height*filter_height in filter_shape", filter.getNumColumns(), params.C, params.R, params.S);
+ checkOrThrowException("Incorrect input to conv2d: Number of rows of input data != "
+ + "batch size in input_shape", input.getNumRows(), params.N);
+ checkOrThrowException("Incorrect input to conv2d: Number of columns of input data != "
+ + "channels*input_height*input_height in input_shape", input.getNumColumns(), params.C, params.H, params.W);
+ if(params.stride_h <= 0 || params.stride_w <= 0)
+ throw new DMLRuntimeException("Only positive strides supported:" + params.stride_h + ", " + params.stride_w);
if(DMLScript.STATISTICS && DISPLAY_STATISTICS) {
if(input.isInSparseFormat() || filter.isInSparseFormat()) {
@@ -316,21 +379,36 @@ public class LibMatrixDNN {
conv2dDenseCount.addAndGet(1);
}
}
-
- runConvTask(TaskType.LoopedIm2ColConv2d, params);
-
- //post-processing: maintain nnz
- outputBlock.recomputeNonZeros();
+ }
+
+ // Single-threaded matrix multiplication of m1 and m2 into ret: native dense-dense kernel when enabled and both inputs are dense, else LibMatrixMult.
+ private static void singleThreadedMatMult(MatrixBlock m1, MatrixBlock m2, MatrixBlock ret,
+ boolean recomputeNNZM1, boolean recomputeNNZM2, ConvolutionParameters params) throws DMLRuntimeException {
+ if(!params.enableNative || m1.isInSparseFormat() || m2.isInSparseFormat()) { // Java path: native disabled or a sparse operand
+ if(recomputeNNZM1)
+ m1.recomputeNonZeros(); // refresh m1's non-zero count only when the caller asked for it
+ if(recomputeNNZM2)
+ m2.recomputeNonZeros(); // same for m2
+ LibMatrixMult.matrixMult(m1, m2, ret, false);
+ }
+ else { // native path: input non-zero counts are not recomputed here
+ ret.sparse = false; // result is written into a dense block
+ if(ret.getDenseBlock() == null)
+ ret.allocateDenseBlock();
+ NativeHelper.matrixMultDenseDense(m1.denseBlock, m2.denseBlock,
+ ret.denseBlock, m1.getNumRows(), m1.getNumColumns(), m2.getNumColumns(), 1); // trailing 1 is presumably the thread count - TODO confirm
+ ret.recomputeNonZeros(); // non-zero count of ret is recomputed after the native call
+ }
}
private static void doLoopedIm2ColConv2d(int n, MatrixBlock im2ColOutBlock, ConvolutionParameters params, double [] temp) throws DMLRuntimeException {
long t1 = DMLScript.STATISTICS && DISPLAY_STATISTICS ? System.nanoTime() : 0;
doIm2col(n, im2ColOutBlock, params, temp);
- im2ColOutBlock.recomputeNonZeros();
long t2 = DMLScript.STATISTICS && DISPLAY_STATISTICS ? System.nanoTime() : 0;
MatrixBlock matMultOutBlock = new MatrixBlock(params.K, params.P*params.Q, false);
- LibMatrixMult.matrixMult(params.input2, im2ColOutBlock, matMultOutBlock, false);
+ singleThreadedMatMult(params.input2, im2ColOutBlock, matMultOutBlock, false, true, params);
+
long t3 = DMLScript.STATISTICS && DISPLAY_STATISTICS ? System.nanoTime() : 0;
if(DMLScript.STATISTICS && DISPLAY_STATISTICS) {
@@ -380,9 +458,11 @@ public class LibMatrixDNN {
* @param dout dout matrix
* @param outputBlock output matrix
* @param params convolution parameters
+ * @param performReluBackward perform ReLU backward
* @throws DMLRuntimeException if DMLRuntimeException occurs
*/
- public static void maxpoolingBackward(MatrixBlock input, MatrixBlock dout, MatrixBlock outputBlock, ConvolutionParameters params) throws DMLRuntimeException {
+ public static void maxpoolingBackward(MatrixBlock input, MatrixBlock dout, MatrixBlock outputBlock,
+ ConvolutionParameters params, boolean performReluBackward) throws DMLRuntimeException {
params.input1 = input;
params.input2 = dout;
params.output = outputBlock;
@@ -407,7 +487,10 @@ public class LibMatrixDNN {
throw new DMLRuntimeException("Sparse maxpooling_backward is not supported");
fillIndexesArray(params);
- runConvTask(TaskType.MaxPooling_Backward, params);
+ if(performReluBackward)
+ runConvTask(TaskType.MaxPooling_Relu_Backward, params);
+ else
+ runConvTask(TaskType.MaxPooling_Backward, params);
//post-processing: maintain nnz
outputBlock.recomputeNonZeros();
@@ -440,7 +523,7 @@ public class LibMatrixDNN {
}
}
- private static void doPoolingBackward(int n, ConvolutionParameters params) throws DMLRuntimeException {
+ private static void doPoolingBackward(int n, ConvolutionParameters params, boolean performReluBackward) throws DMLRuntimeException {
double [] inputArray = null;
if (!params.input1.isInSparseFormat())
inputArray = params.input1.getDenseBlock();
@@ -455,19 +538,19 @@ public class LibMatrixDNN {
if(inputArray != null) {
if(doutArray != null)
- doPoolingBackwardDenseDense(n, inputArray, doutArray, outputArray, params);
+ doPoolingBackwardDenseDense(n, inputArray, doutArray, outputArray, params, performReluBackward);
else
- doPoolingBackwardDenseSparse(n, inputArray, params.input2, outputArray, params);
+ doPoolingBackwardDenseSparse(n, inputArray, params.input2, outputArray, params, performReluBackward);
}
else {
if(doutArray != null)
- doPoolingBackwardSparseDense(n, doutArray, outputArray, params);
+ doPoolingBackwardSparseDense(n, doutArray, outputArray, params, performReluBackward);
else
- doPoolingBackwardSparseSparse(n, outputArray, params);
+ doPoolingBackwardSparseSparse(n, outputArray, params, performReluBackward);
}
}
- private static void doPoolingBackwardSparseDense(int n, double [] doutArray, double [] outputArray, ConvolutionParameters params) throws DMLRuntimeException {
+ private static void doPoolingBackwardSparseDense(int n, double [] doutArray, double [] outputArray, ConvolutionParameters params, boolean performReluBackward) throws DMLRuntimeException {
if (!params.input1.isInSparseFormat())
throw new DMLRuntimeException("Incorrect usage: Call optimized versions");
@@ -477,7 +560,7 @@ public class LibMatrixDNN {
double inVal = doutArray[n*params.C*params.P*params.Q + c*params.P*params.Q + p * params.Q + q];
if(inVal != 0) {
final int inputOffset = n*params.C*params.H*params.W + c*params.H*params.W;
- int maxIndex = getMaxIndexSparse(p, q, inputOffset, n, c, params.input1, params);
+ int maxIndex = getMaxIndexSparse(p, q, inputOffset, n, c, params.input1, params, performReluBackward);
if(maxIndex != -1)
outputArray[maxIndex] += inVal;
}
@@ -486,7 +569,7 @@ public class LibMatrixDNN {
}
}
- private static void doPoolingBackwardSparseSparse(int n, double [] outputArray, ConvolutionParameters params) throws DMLRuntimeException {
+ private static void doPoolingBackwardSparseSparse(int n, double [] outputArray, ConvolutionParameters params, boolean performReluBackward) throws DMLRuntimeException {
if (!params.input1.isInSparseFormat())
throw new DMLRuntimeException("Incorrect usage: Call optimized versions");
@@ -502,7 +585,7 @@ public class LibMatrixDNN {
int p = tensorIndexes[1];
int q = tensorIndexes[2];
final int inputOffset = n*params.C*params.H*params.W + c*params.H*params.W;
- int maxIndex = getMaxIndexSparse(p, q, inputOffset, n, c, params.input1, params);
+ int maxIndex = getMaxIndexSparse(p, q, inputOffset, n, c, params.input1, params, performReluBackward);
if(maxIndex != -1)
outputArray[maxIndex] += avals[j];
}
@@ -510,7 +593,7 @@ public class LibMatrixDNN {
}
private static void doPoolingBackwardDenseSparse(int n, double [] inputArray,
- MatrixBlock dout, double [] outputArray, ConvolutionParameters params) throws DMLRuntimeException {
+ MatrixBlock dout, double [] outputArray, ConvolutionParameters params, boolean performReluBackward) throws DMLRuntimeException {
if( !dout.sparseBlock.isEmpty(n) ) {
int [] tensorIndexes = new int[3];
int apos = dout.sparseBlock.pos(n);
@@ -523,7 +606,7 @@ public class LibMatrixDNN {
int p = tensorIndexes[1];
int q = tensorIndexes[2];
final int inputOffset = n*params.C*params.H*params.W + c*params.H*params.W;
- int maxIndex = getMaxIndex(p, q, inputOffset, inputArray, params);
+ int maxIndex = getMaxIndex(p, q, inputOffset, inputArray, params, performReluBackward);
if(maxIndex != -1)
outputArray[maxIndex] += avals[j];
}
@@ -531,14 +614,14 @@ public class LibMatrixDNN {
}
private static void doPoolingBackwardDenseDense(int n, double [] inputArray, double [] doutArray,
- double [] outputArray, ConvolutionParameters params) {
+ double [] outputArray, ConvolutionParameters params, boolean performReluBackward) {
for (int c = 0; c < params.C; c++) {
final int inputOffset = n*params.C*params.H*params.W + c*params.H*params.W;
final int outputOffset = n*params.C*params.P*params.Q + c*params.P*params.Q;
for (int p = 0; p < params.P; p++) {
for (int q = 0; q < params.Q; q++) {
- int maxIndex = getMaxIndex(p, q, inputOffset, inputArray, params);
+ int maxIndex = getMaxIndex(p, q, inputOffset, inputArray, params, performReluBackward);
if(maxIndex != -1)
outputArray[maxIndex] += doutArray[outputOffset + p * params.Q + q];
}
@@ -556,10 +639,11 @@ public class LibMatrixDNN {
* @param c number of channels
* @param input input matrix
* @param params convolution parameters
+ * @param performReluBackward if true, negative input values are treated as 0 (ReLU) when searching for the maximum
* @return index of the cell with maximum value
* @throws DMLRuntimeException if error occurs
*/
- private static int getMaxIndexSparse(int p, int q, int inputOffset, int n, int c, MatrixBlock input, ConvolutionParameters params) throws DMLRuntimeException {
+ private static int getMaxIndexSparse(int p, int q, int inputOffset, int n, int c, MatrixBlock input, ConvolutionParameters params, boolean performReluBackward) throws DMLRuntimeException {
if(!input.isInSparseFormat())
throw new DMLRuntimeException("Incorrect usage: Only sparse format supported");
@@ -591,9 +675,10 @@ public class LibMatrixDNN {
int h = tensorIndexes[1];
int w = tensorIndexes[2];
if(h >= start_index_h && h < end_index_h && w >= start_index_w && w < end_index_w) {
- if(maxVal < avals[j]) {
+ double val = performReluBackward && avals[j] < 0 ? 0 : avals[j];
+ if(maxVal < val) {
maxIndex = inputOffset + h*params.W + w;
- maxVal = avals[j];
+ maxVal = val;
}
}
}
@@ -612,9 +697,10 @@ public class LibMatrixDNN {
* @param inputOffset offset to be used for input index
* @param inputArray input array
* @param params convolution parameters
+ * @param performReluBackward if true, negative input values are treated as 0 (ReLU) when searching for the maximum
* @return index of cell with maximum value
*/
- private static int getMaxIndex(int p, int q, int inputOffset, double [] inputArray, ConvolutionParameters params) {
+ private static int getMaxIndex(int p, int q, int inputOffset, double [] inputArray, ConvolutionParameters params, boolean performReluBackward) {
int start_index_h = params.start_indexes_h[p];
int end_index_h = params.end_indexes_h[p];
int start_index_w = params.start_indexes_w[q];
@@ -632,6 +718,7 @@ public class LibMatrixDNN {
for (int h = start_index_h; h < end_index_h; h++) {
for (int w = start_index_w; w < end_index_w; w++) {
currDoutVal = inputArray[inputOffset + h*params.W + w];
+ currDoutVal = performReluBackward && currDoutVal < 0 ? 0 : currDoutVal;
if(maxVal < currDoutVal) {
maxIndex = inputOffset + h*params.W + w;
maxVal = currDoutVal;
@@ -894,11 +981,15 @@ public class LibMatrixDNN {
private static void addMatrixBlocks(int poolSize, TaskType type, ConvolutionParameters params,
ConcurrentLinkedQueue<MatrixBlock> im2ColOutBlocks, ConcurrentLinkedQueue<MatrixBlock> doutReshapedBlocks,
ConcurrentLinkedQueue<MatrixBlock> partialRetBlocks) {
+ boolean isEligibleForConv2dSparse = (type == TaskType.LoopedIm2ColConv2d) && isEligibleForConv2dSparse(params);
+ boolean isEligibleForConv2dBackwardFilterSparseDense = (type == TaskType.LoopedIm2ColConv2dBwdFilter) && isEligibleForConv2dBackwardFilterSparseDense(params) ;
for(int i = 0; i < poolSize; i++) {
if(type == TaskType.LoopedIm2ColConv2d || type == TaskType.LoopedIm2ColConv2dBwdFilter) {
- MatrixBlock im2ColOutBlock = new MatrixBlock(params.C*params.R*params.S, params.P*params.Q, false);
- im2ColOutBlock.allocateDenseBlock();
- im2ColOutBlocks.add(im2ColOutBlock);
+ if(!isEligibleForConv2dSparse && !isEligibleForConv2dBackwardFilterSparseDense) {
+ MatrixBlock im2ColOutBlock = new MatrixBlock(params.C*params.R*params.S, params.P*params.Q, false);
+ im2ColOutBlock.allocateDenseBlock();
+ im2ColOutBlocks.add(im2ColOutBlock);
+ }
}
if(type == TaskType.LoopedIm2ColConv2dBwdFilter) {
@@ -962,6 +1053,21 @@ public class LibMatrixDNN {
}
// ----------------------------------------------------------------------------------------------------------------
+ private static boolean isEligibleForConv2dBackwardFilterSparseDense(ConvolutionParameters params) {
+ // NativeHelper.conv2dBackwardFilterSparseDense only if input is sparse.
+ // dout converted to dense if sparse.
+ return params.enableNative && params.input1.isInSparseFormat();
+ }
+ private static boolean isEligibleForConv2dSparse(ConvolutionParameters params) {
+ // NativeHelper.conv2dSparse only if filter is dense and input is sparse
+ return params.enableNative && params.input1.isInSparseFormat() && !params.input2.isInSparseFormat();
+ }
+ private static boolean isEligibleForConv2dBackwardDataDense(ConvolutionParameters params) {
+ // NativeHelper.conv2dBackwardDataDense only if filter is dense.
+ // dout converted to dense if sparse.
+ return params.enableNative && !params.input1.isInSparseFormat();
+ }
+
/**
* The ConvTask allows the convolution operations (such s conv2d, conv2d_backward, maxpooling, etc)
* to be executed in multi-thread manner.
@@ -1001,55 +1107,108 @@ public class LibMatrixDNN {
break;
case MaxPooling_Backward:
for(int n = _rl; n < _ru; n++)
- doPoolingBackward(n, _params);
- break;
- case BiasAdd:
- {
- double [] dest = _params.output.getDenseBlock();
- ConvolutionUtils.binaryBiasOperations(_params.input1, _params.bias, dest, _params.K, _params.P*_params.Q,
- _rl, _ru, _binaryElementWiseAddition);
+ doPoolingBackward(n, _params, false);
break;
- }
- case BiasMultiply:
- {
- double [] dest = _params.output.getDenseBlock();
- ConvolutionUtils.binaryBiasOperations(_params.input1, _params.bias, dest, _params.K, _params.P*_params.Q,
- _rl, _ru, _binaryElementWiseMultiplication);
+ case MaxPooling_Relu_Backward:
+ for(int n = _rl; n < _ru; n++)
+ doPoolingBackward(n, _params, true);
break;
- }
case ReluBackward:
lnnz = doReluBackward(_params, _rl, _ru);
break;
case LoopedIm2ColConv2d:
{
- MatrixBlock im2ColOutBlock = _im2ColOutBlocks.remove();
- double [] temp = _params.input1.isInSparseFormat() ? new double[_params.input1.getNumColumns()] : null;
- for(int n = _rl; n < _ru; n++)
- doLoopedIm2ColConv2d(n, im2ColOutBlock, _params, temp);
- _im2ColOutBlocks.add(im2ColOutBlock);
- if(_params.bias != null)
- ConvolutionUtils.binaryBiasOperationInPlace(_params.bias, _params.output.getDenseBlock(), _params.K,
- _params.P*_params.Q, _rl, _ru, _binaryElementWiseAddition);
+ if(isEligibleForConv2dSparse(_params)) {
+ // NativeHelper.conv2dSparse only if filter is dense and input is sparse
+ int KPQ = _params.K*_params.P*_params.Q;
+ double[] temp = new double[KPQ];
+ for(int n = _rl; n < _ru; n++) {
+ if( !_params.input1.getSparseBlock().isEmpty(n) ) {
+ int apos = _params.input1.getSparseBlock().pos(n);
+ int alen = _params.input1.getSparseBlock().size(n);
+ int[] aix = _params.input1.getSparseBlock().indexes(n);
+ double[] avals = _params.input1.getSparseBlock().values(n);
+ NativeHelper.conv2dSparse(apos, alen, aix, avals, _params.input2.getDenseBlock(), temp,
+ 1, _params.C, _params.H, _params.W, _params.K, _params.R, _params.S,
+ _params.stride_h, _params.stride_w, _params.pad_h, _params.pad_w, _params.P, _params.Q, 1);
+ System.arraycopy(temp, 0, _params.output.denseBlock, n*KPQ, KPQ);
+ }
+ }
+ }
+ else {
+ // In all other cases, perform im2col in Java + matmult (either native or java).
+ MatrixBlock im2ColOutBlock = _im2ColOutBlocks.remove();
+ double [] temp = _params.input1.isInSparseFormat() ? new double[_params.input1.getNumColumns()] : null;
+ for(int n = _rl; n < _ru; n++)
+ doLoopedIm2ColConv2d(n, im2ColOutBlock, _params, temp);
+ _im2ColOutBlocks.add(im2ColOutBlock);
+ }
+ if(_params.bias != null) {
+ // bias is always converted to dense format
+ double [] biasArr = _params.bias.getDenseBlock();
+ int PQ = _params.P*_params.Q;
+ int index = _rl*_params.K*PQ;
+ for(int n = _rl; n < _ru; n++) {
+ for(int k = 0; k < _params.K; k++) {
+ for(int pq = 0; pq < PQ; pq++, index++) {
+ _params.output.denseBlock[index] += biasArr[k];
+ }
+ }
+ }
+ }
break;
}
case LoopedIm2ColConv2dBwdFilter:
{
- MatrixBlock im2ColOutBlock = _im2ColOutBlocks.remove();
MatrixBlock partialRetBlock = _partialRetBlocks.remove();
MatrixBlock doutReshapedBlock = _doutReshapedBlocks.remove();
- double [] temp = _params.input1.isInSparseFormat() ? new double[_params.input1.getNumColumns()] : null;
- for(int n = _rl; n < _ru; n++)
- partialRetBlock = doLoopedIm2ColConv2dBwdFilter(n, im2ColOutBlock, doutReshapedBlock, partialRetBlock, _params, temp);
- _im2ColOutBlocks.add(im2ColOutBlock);
- _partialRetBlocks.add(partialRetBlock);
+ if(isEligibleForConv2dBackwardFilterSparseDense(_params)) {
+ double [] dout_n = doutReshapedBlock.getDenseBlock();
+ for(int n = _rl; n < _ru; n++) {
+ if( !_params.input1.getSparseBlock().isEmpty(n) ) {
+ doRotate180(n, 0, _params.input2, dout_n, _params, true);
+ int apos = _params.input1.getSparseBlock().pos(n);
+ int alen = _params.input1.getSparseBlock().size(n);
+ int[] aix = _params.input1.getSparseBlock().indexes(n);
+ double[] avals = _params.input1.getSparseBlock().values(n);
+ NativeHelper.conv2dBackwardFilterSparseDense(apos, alen, aix, avals,
+ dout_n, partialRetBlock.getDenseBlock(), 1, _params.C, _params.H, _params.W, _params.K,
+ _params.R, _params.S, _params.stride_h, _params.stride_w, _params.pad_h, _params.pad_w, _params.P, _params.Q, 1);
+ }
+ }
+ }
+ else {
+ MatrixBlock im2ColOutBlock = _im2ColOutBlocks.remove();
+ double [] temp = _params.input1.isInSparseFormat() ? new double[_params.input1.getNumColumns()] : null;
+ for(int n = _rl; n < _ru; n++)
+ partialRetBlock = doLoopedIm2ColConv2dBwdFilter(n, im2ColOutBlock, doutReshapedBlock, partialRetBlock, _params, temp);
+ _im2ColOutBlocks.add(im2ColOutBlock);
+ }
_doutReshapedBlocks.add(doutReshapedBlock);
+ _partialRetBlocks.add(partialRetBlock);
break;
}
case LoopedIm2ColConv2dBwdData:
{
MatrixBlock doutReshapedBlock = _doutReshapedBlocks.remove();
- for(int n = _rl; n < _ru; n++)
- doLoopedIm2ColConv2dBwdData(n, doutReshapedBlock, _params);
+ if(isEligibleForConv2dBackwardDataDense(_params)) {
+ int CHW = _params.C*_params.H*_params.W;
+ double [] ret = new double[CHW];
+ double [] filterArr = _params.input1.getDenseBlock();
+ for(int n = _rl; n < _ru; n++) {
+ double [] dout_n = getRowInDenseFormat(_params.input2, n, doutReshapedBlock.getDenseBlock());
+ if(n > _rl)
+ Arrays.fill(ret, 0);
+ NativeHelper.conv2dBackwardDataDense(filterArr, dout_n, ret, 1,
+ _params.C, _params.H, _params.W, _params.K,
+ _params.R, _params.S, _params.stride_h, _params.stride_w, _params.pad_h, _params.pad_w, _params.P, _params.Q, 1);
+ System.arraycopy(ret, 0, _params.output.getDenseBlock(), n*CHW, CHW);
+ }
+ }
+ else {
+ for(int n = _rl; n < _ru; n++)
+ doLoopedIm2ColConv2dBwdData(n, doutReshapedBlock, _params);
+ }
_doutReshapedBlocks.add(doutReshapedBlock);
break;
}
@@ -1192,16 +1351,24 @@ public class LibMatrixDNN {
}
// Returns the row of matrix in dense format
- private static double [] getRowInDenseFormat(MatrixBlock input, int n, double [] temp) {
+ private static double [] getRowInDenseFormat(MatrixBlock input, int n, double [] temp) throws DMLRuntimeException {
+ if(input.getNumColumns() != temp.length) {
+ throw new DMLRuntimeException("Invalid parameters");
+ }
// Use temporary array to avoid binary search
- Arrays.fill(temp, 0);
- if( !input.sparseBlock.isEmpty(n) ) {
- int apos = input.sparseBlock.pos(n);
- int alen = input.sparseBlock.size(n);
- int[] aix = input.sparseBlock.indexes(n);
- double[] avals = input.sparseBlock.values(n);
- for(int j=apos; j<apos+alen; j++)
- temp[ aix[j] ] = avals[j];
+ if(input.isInSparseFormat()) {
+ Arrays.fill(temp, 0);
+ if( !input.sparseBlock.isEmpty(n) ) {
+ int apos = input.sparseBlock.pos(n);
+ int alen = input.sparseBlock.size(n);
+ int[] aix = input.sparseBlock.indexes(n);
+ double[] avals = input.sparseBlock.values(n);
+ for(int j=apos; j<apos+alen; j++)
+ temp[ aix[j] ] = avals[j];
+ }
+ }
+ else {
+ System.arraycopy(input.getDenseBlock(), n*input.getNumColumns(), temp, 0, input.getNumColumns());
}
return temp;
}
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/39a37ae4/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixMult.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixMult.java b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixMult.java
index 5c7c2ef..983ce53 100644
--- a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixMult.java
+++ b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixMult.java
@@ -17,7 +17,6 @@
* under the License.
*/
-
package org.apache.sysml.runtime.matrix.data;
import java.util.ArrayList;
@@ -123,7 +122,7 @@ public class LibMatrixMult
{
matrixMult(m1, m2, ret, rl, ru, true);
}
-
+
public static void matrixMult(MatrixBlock m1, MatrixBlock m2, MatrixBlock ret, int rl, int ru, boolean examSparsity)
throws DMLRuntimeException
{
@@ -132,9 +131,9 @@ public class LibMatrixMult
ret.examSparsity(); //turn empty dense into sparse
return;
}
-
+
//Timing time = new Timing(true);
-
+
//pre-processing: output allocation
boolean tm2 = checkPrepMatrixMultRightInput(m1,m2);
m2 = prepMatrixMultRightInput(m1, m2);
@@ -166,6 +165,7 @@ public class LibMatrixMult
if(examSparsity)
ret.examSparsity();
+
//System.out.println("MM ("+m1.isInSparseFormat()+","+m1.getNumRows()+","+m1.getNumColumns()+","+m1.getNonZeros()+")x" +
// "("+m2.isInSparseFormat()+","+m2.getNumRows()+","+m2.getNumColumns()+","+m2.getNonZeros()+") in "+time.stop());
}
@@ -188,7 +188,7 @@ public class LibMatrixMult
ret.examSparsity(); //turn empty dense into sparse
return;
}
-
+
//check too high additional vector-matrix memory requirements (fallback to sequential)
//check too small workload in terms of flops (fallback to sequential too)
if( m1.rlen == 1 && (8L * m2.clen * k > MEM_OVERHEAD_THRESHOLD || !LOW_LEVEL_OPTIMIZATION || m2.clen==1 || m1.isUltraSparse() || m2.isUltraSparse())
@@ -247,6 +247,7 @@ public class LibMatrixMult
throw new DMLRuntimeException(ex);
}
+
//post-processing (nnz maintained in parallel)
ret.examSparsity();
@@ -3960,4 +3961,4 @@ public class LibMatrixMult
return _ret.recomputeNonZeros(_rl, _ru-1, 0, _ret.getNumColumns()-1);
}
}
-}
\ No newline at end of file
+}
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/39a37ae4/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixNative.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixNative.java b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixNative.java
new file mode 100644
index 0000000..4b12596
--- /dev/null
+++ b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixNative.java
@@ -0,0 +1,200 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.sysml.runtime.matrix.data;
+
+import org.apache.sysml.hops.OptimizerUtils;
+import org.apache.sysml.runtime.DMLRuntimeException;
+import org.apache.sysml.utils.NativeHelper;
+import org.apache.sysml.utils.Statistics;
+
+public class LibMatrixNative {
+
+ // We could encapsulate heuristics in this function
+ // For now, we only consider vector-matrix, matrix-vector and outer-product operations to be memory bound
+ private static boolean isMatMultMemoryBound(int m1Rlen, int m1Clen, int m2Clen) {
+ return m1Rlen == 1 || m1Clen == 1 || m2Clen == 1;
+ }
+
+ /**
+ * Performs matrix multiplication using the native BLAS library if it is loaded and applicable
+ * (dense inputs, operation not memory bound); otherwise falls back to the Java implementation.
+ *
+ * @param m1 lhs matrix block
+ * @param m2 rhs matrix block
+ * @param ret output matrix block
+ * @param k number of threads
+ * @throws DMLRuntimeException if error occurs
+ */
+ public static void matrixMult(MatrixBlock m1, MatrixBlock m2, MatrixBlock ret, int k) throws DMLRuntimeException {
+ matrixMult(m1, m2, ret, k, true);
+ }
+
+ public static void matrixMult(MatrixBlock m1, MatrixBlock m2, MatrixBlock ret, int k, boolean examSparsity) throws DMLRuntimeException {
+ // Sanity check:
+ k = k <= 0 ? NativeHelper.getMaxNumThreads() : k;
+
+ // check inputs / outputs
+ if (m1.isEmptyBlock() || m2.isEmptyBlock()) {
+ ret.setNonZeros(0);
+ if(examSparsity)
+ ret.examSparsity(); // turn empty dense into sparse
+ return;
+ }
+ if (NativeHelper.isNativeLibraryLoaded() &&
+ !isMatMultMemoryBound(m1.rlen, m1.clen, m2.clen) && !m1.isInSparseFormat() && !m2.isInSparseFormat()) {
+ ret.sparse = false;
+ ret.allocateDenseBlock();
+ if (NativeHelper.matrixMultDenseDense(m1.denseBlock, m2.denseBlock,
+ ret.denseBlock, m1.getNumRows(), m1.getNumColumns(), m2.getNumColumns(), k)) {
+ Statistics.numNativeLibMatrixMultCalls.increment();
+ ret.recomputeNonZeros();
+ // post-processing (nnz already recomputed above)
+ if(examSparsity)
+ ret.examSparsity();
+ return;
+ } else {
+ // Else fall back to Java
+ Statistics.incrementNativeFailuresCounter();
+ }
+ }
+ if (k == 1)
+ LibMatrixMult.matrixMult(m1, m2, ret, examSparsity);
+ else
+ LibMatrixMult.matrixMult(m1, m2, ret, k);
+ }
+
+ /**
+ * This method performs convolution (i.e. cross-correlation) operation on input
+ *
+ * @param input input batch
+ * @param filter filter
+ * @param outputBlock output of convolution
+ * @param params convolution parameters
+ * @throws DMLRuntimeException if DMLRuntimeException occurs
+ */
+ public static void conv2d(MatrixBlock input, MatrixBlock filter, MatrixBlock outputBlock, ConvolutionParameters params) throws DMLRuntimeException {
+ LibMatrixDNN.checkInputsConv2d(input, filter, outputBlock, params);
+ params.numThreads = params.numThreads <= 0 ? NativeHelper.getMaxNumThreads() : params.numThreads;
+ if(NativeHelper.isNativeLibraryLoaded() && !input.isInSparseFormat() && !filter.isInSparseFormat()) {
+ setNumThreads(params);
+ if(params.bias == null) {
+ if(NativeHelper.conv2dDense(input.denseBlock, filter.denseBlock, outputBlock.denseBlock, params.N, params.C, params.H, params.W,
+ params.K, params.R, params.S, params.stride_h, params.stride_w, params.pad_h, params.pad_w,
+ params.P, params.Q, params.numThreads)) {
+ Statistics.numNativeLibMatrixDNNCalls.increment();
+ // post-processing: maintain nnz
+ outputBlock.recomputeNonZeros();
+ return;
+ }
+ else {
+ // Fall back to Java when failures
+ Statistics.incrementNativeFailuresCounter();
+ }
+ }
+ else {
+ if(params.bias.isInSparseFormat())
+ params.bias.sparseToDense(); // Bias matrix is usually extremely small
+ if(NativeHelper.conv2dBiasAddDense(input.denseBlock, params.bias.denseBlock, filter.denseBlock, outputBlock.denseBlock,
+ params.N, params.C, params.H, params.W,
+ params.K, params.R, params.S, params.stride_h, params.stride_w, params.pad_h, params.pad_w,
+ params.P, params.Q, params.numThreads)) {
+ Statistics.numNativeLibMatrixDNNCalls.increment();
+ // post-processing: maintain nnz
+ outputBlock.recomputeNonZeros();
+ return;
+ }
+ else {
+ // Fall back to Java when failures
+ Statistics.incrementNativeFailuresCounter();
+ }
+ }
+ }
+
+ // Fall back to Java when failures or sparse
+ LibMatrixDNN.conv2d(input, filter, outputBlock, params);
+ }
+
+ private static void setNumThreads(ConvolutionParameters params) {
+ params.numThreads = OptimizerUtils.getConstrainedNumThreads(params.numThreads);
+ if (!(params.isOutputThreadSafe() && params.numThreads > 1))
+ params.numThreads = 1;
+ }
+
+ /**
+ * This method computes the backpropagation errors for the filter of the convolution operation
+ *
+ * @param input input image
+ * @param dout errors from next layer
+ * @param outputBlock output errors
+ * @param params convolution parameters
+ * @throws DMLRuntimeException if DMLRuntimeException occurs
+ */
+ public static void conv2dBackwardFilter(MatrixBlock input, MatrixBlock dout, MatrixBlock outputBlock, ConvolutionParameters params) throws DMLRuntimeException {
+ LibMatrixDNN.checkInputsConv2dBackwardFilter(input, dout, outputBlock, params);
+ params.numThreads = params.numThreads <= 0 ? NativeHelper.getMaxNumThreads() : params.numThreads;
+ if(NativeHelper.isNativeLibraryLoaded() && !dout.isInSparseFormat() && !input.isInSparseFormat()) {
+ setNumThreads(params);
+ if(NativeHelper.conv2dBackwardFilterDense(input.denseBlock, dout.denseBlock, outputBlock.denseBlock, params.N, params.C, params.H, params.W,
+ params.K, params.R, params.S, params.stride_h, params.stride_w, params.pad_h, params.pad_w,
+ params.P, params.Q, params.numThreads)) {
+ Statistics.numNativeLibMatrixDNNCalls.increment();
+ // post-processing: maintain nnz
+ outputBlock.recomputeNonZeros();
+ return;
+ }
+ else {
+ // Fall back to Java when failures
+ Statistics.incrementNativeFailuresCounter();
+ }
+ }
+ // Fall back to Java when failures or sparse
+ LibMatrixDNN.conv2dBackwardFilter(input, dout, outputBlock, params);
+ }
+
+ /**
+ * This method computes the backpropagation errors for the previous layer of the convolution operation
+ *
+ * @param filter filter used in conv2d
+ * @param dout errors from next layer
+ * @param outputBlock output errors
+ * @param params convolution parameters
+ * @throws DMLRuntimeException if DMLRuntimeException occurs
+ */
+ public static void conv2dBackwardData(MatrixBlock filter, MatrixBlock dout, MatrixBlock outputBlock, ConvolutionParameters params) throws DMLRuntimeException {
+ LibMatrixDNN.checkInputsConv2dBackwardData(filter, dout, outputBlock, params);
+ params.numThreads = params.numThreads <= 0 ? NativeHelper.getMaxNumThreads() : params.numThreads;
+ if(NativeHelper.isNativeLibraryLoaded() && !dout.isInSparseFormat() && !filter.isInSparseFormat()) {
+ setNumThreads(params);
+ if(NativeHelper.conv2dBackwardDataDense(filter.denseBlock, dout.denseBlock, outputBlock.denseBlock, params.N, params.C, params.H, params.W,
+ params.K, params.R, params.S, params.stride_h, params.stride_w, params.pad_h, params.pad_w,
+ params.P, params.Q, params.numThreads)) {
+ Statistics.numNativeLibMatrixDNNCalls.increment();
+ // post-processing: maintain nnz
+ outputBlock.recomputeNonZeros();
+ return;
+ }
+ else {
+ // Fall back to Java when failures
+ Statistics.incrementNativeFailuresCounter();
+ }
+ }
+ // Fall back to Java when failures or sparse
+ LibMatrixDNN.conv2dBackwardData(filter, dout, outputBlock, params);
+ }
+}
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/39a37ae4/src/main/java/org/apache/sysml/runtime/matrix/data/MatrixBlock.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/matrix/data/MatrixBlock.java b/src/main/java/org/apache/sysml/runtime/matrix/data/MatrixBlock.java
index c458b07..8fac947 100644
--- a/src/main/java/org/apache/sysml/runtime/matrix/data/MatrixBlock.java
+++ b/src/main/java/org/apache/sysml/runtime/matrix/data/MatrixBlock.java
@@ -35,6 +35,7 @@ import java.util.stream.LongStream;
import org.apache.commons.math3.random.Well1024a;
import org.apache.hadoop.io.DataInputBuffer;
import org.apache.sysml.conf.ConfigurationManager;
+import org.apache.sysml.conf.DMLConfig;
import org.apache.sysml.hops.Hop.OpOp2;
import org.apache.sysml.hops.OptimizerUtils;
import org.apache.sysml.lops.MMTSJ.MMTSJType;
@@ -4873,15 +4874,26 @@ public class MatrixBlock extends MatrixValue implements CacheBlock, Externalizab
return sum_wt;
}
+
+ public MatrixValue aggregateBinaryOperations(MatrixIndexes m1Index, MatrixValue m1Value, MatrixIndexes m2Index, MatrixValue m2Value,
+ MatrixValue result, AggregateBinaryOperator op ) throws DMLRuntimeException
+ {
+ boolean enableNativeBLAS = ConfigurationManager.getDMLConfig().getBooleanValue(DMLConfig.NATIVE_BLAS);
+ return aggregateBinaryOperations(m1Value, m2Value, result, op, enableNativeBLAS);
+ }
public MatrixValue aggregateBinaryOperations(MatrixIndexes m1Index, MatrixValue m1Value, MatrixIndexes m2Index, MatrixValue m2Value,
- MatrixValue result, AggregateBinaryOperator op )
- throws DMLRuntimeException
+ MatrixValue result, AggregateBinaryOperator op, boolean enableNativeBLAS ) throws DMLRuntimeException
{
- return aggregateBinaryOperations(m1Value, m2Value, result, op);
+ return aggregateBinaryOperations(m1Value, m2Value, result, op, enableNativeBLAS);
+ }
+
+ public MatrixValue aggregateBinaryOperations(MatrixValue m1Value, MatrixValue m2Value, MatrixValue result, AggregateBinaryOperator op) throws DMLRuntimeException {
+ boolean enableNativeBLAS = ConfigurationManager.getDMLConfig().getBooleanValue(DMLConfig.NATIVE_BLAS);
+ return aggregateBinaryOperations(m1Value, m2Value, result, op, enableNativeBLAS);
}
- public MatrixValue aggregateBinaryOperations(MatrixValue m1Value, MatrixValue m2Value, MatrixValue result, AggregateBinaryOperator op)
+ public MatrixValue aggregateBinaryOperations(MatrixValue m1Value, MatrixValue m2Value, MatrixValue result, AggregateBinaryOperator op, boolean nativeMatMult)
throws DMLRuntimeException
{
//check input types, dimensions, configuration
@@ -4907,7 +4919,9 @@ public class MatrixBlock extends MatrixValue implements CacheBlock, Externalizab
ret.reset(rl, cl, sp.sparse, sp.estimatedNonZeros);
//compute matrix multiplication (only supported binary aggregate operation)
- if( op.getNumThreads() > 1 )
+ if( nativeMatMult )
+ LibMatrixNative.matrixMult(m1, m2, ret, op.getNumThreads());
+ else if( op.getNumThreads() > 1 )
LibMatrixMult.matrixMult(m1, m2, ret, op.getNumThreads());
else
LibMatrixMult.matrixMult(m1, m2, ret);
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/39a37ae4/src/main/java/org/apache/sysml/runtime/util/ConvolutionUtils.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/util/ConvolutionUtils.java b/src/main/java/org/apache/sysml/runtime/util/ConvolutionUtils.java
index b988546..8e51405 100644
--- a/src/main/java/org/apache/sysml/runtime/util/ConvolutionUtils.java
+++ b/src/main/java/org/apache/sysml/runtime/util/ConvolutionUtils.java
@@ -22,6 +22,8 @@ package org.apache.sysml.runtime.util;
import java.util.Arrays;
import org.apache.sysml.runtime.DMLRuntimeException;
+import org.apache.sysml.runtime.functionobjects.Multiply;
+import org.apache.sysml.runtime.functionobjects.Plus;
import org.apache.sysml.runtime.matrix.data.MatrixBlock;
import org.apache.sysml.runtime.matrix.operators.BinaryOperator;
import org.apache.sysml.runtime.matrix.operators.ScalarOperator;
@@ -58,12 +60,20 @@ public class ConvolutionUtils {
}
return ret;
}
+
- // Performs dest[destPos ...] <- src[src_rl:src_ru, ]
- //Assumes that dest is zeroed-out before calling
- public static void copy(MatrixBlock src, double [] dest, int destPos, int destNumCols, int src_rl, int src_ru) {
+ // Performs dest[destPos...] op= src[src_rl:src_ru,]
+ public static void binaryOperationInPlace(MatrixBlock src, double [] dest,
+ int destPos, int destNumCols, int src_rl, int src_ru, BinaryOperator op) throws DMLRuntimeException {
if(src.isInSparseFormat()) {
- if(!src.isEmptyBlock()) {
+ if(src.isEmptyBlock() && op.fn == Plus.getPlusFnObject()) {
+ // Do nothing: Inplace addition by zero
+ }
+ else if(src.isEmptyBlock() && op.fn == Multiply.getMultiplyFnObject()) {
+ // Inplace multiplication by zero
+ Arrays.fill(dest, destPos, destPos + (src_ru-src_rl)*destNumCols, 0);
+ }
+ else if(op.fn == Plus.getPlusFnObject()) {
for(int i = src_rl, cix = destPos; i < src_ru; i++, cix += destNumCols) {
if( !src.getSparseBlock().isEmpty(i) ) {
int apos = src.getSparseBlock().pos(i);
@@ -71,37 +81,54 @@ public class ConvolutionUtils {
int[] aix = src.getSparseBlock().indexes(i);
double[] avals = src.getSparseBlock().values(i);
for(int j = apos; j < apos+alen; j++) {
- dest[ cix+aix[j] ] = avals[j];
+ dest[ cix+aix[j] ] += avals[j];
}
}
}
}
- }
- else {
- System.arraycopy(src.getDenseBlock(), src_rl*src.getNumColumns(), dest, destPos, (src_ru-src_rl)*src.getNumColumns());
- }
- }
-
- // Performs dest[destPos...] op= thatValue[src_rl:src_ru,]
- public static void binaryOperationInPlace(MatrixBlock src, double [] dest,
- int destPos, int destNumCols, int src_rl, int src_ru, BinaryOperator op) throws DMLRuntimeException {
- if(src.isInSparseFormat()) {
- for(int i = src_rl, cix = destPos; i < src_ru; i++, cix += destNumCols) {
- if( !src.getSparseBlock().isEmpty(i) ) {
- int apos = src.getSparseBlock().pos(i);
- int alen = src.getSparseBlock().size(i);
- int[] aix = src.getSparseBlock().indexes(i);
- double[] avals = src.getSparseBlock().values(i);
- for(int j = apos; j < apos+alen; j++) {
- dest[ cix+aix[j] ] = op.fn.execute(dest[ cix+aix[j] ], avals[j]);
+ else if(op.fn == Multiply.getMultiplyFnObject()) {
+ // Unsafe operation
+ for(int i = src_rl, cix = destPos; i < src_ru; i++, cix += destNumCols) {
+ if( !src.getSparseBlock().isEmpty(i) ) {
+ int apos = src.getSparseBlock().pos(i);
+ int alen = src.getSparseBlock().size(i);
+ int[] aix = src.getSparseBlock().indexes(i);
+ double[] avals = src.getSparseBlock().values(i);
+ int prevDestIndex = 0;
+ for(int j = apos; j < apos+alen; j++) {
+ // Multiplication by zero. Assumption: aix is sorted.
+ Arrays.fill(dest, cix+prevDestIndex, aix[j], 0);
+ prevDestIndex = aix[j]+1;
+ dest[ cix+aix[j] ] *= avals[j];
+ }
+ Arrays.fill(dest, cix+prevDestIndex, cix+destNumCols, 0);
+ }
+ else {
+ Arrays.fill(dest, cix, cix + destNumCols, 0);
}
}
}
+ else {
+ // An arbitrary operation could be safe or unsafe, so only plus and multiply are handled for sparse input. This will be caught at development time.
+ throw new DMLRuntimeException("Unimplemented sparse operation");
+ }
}
else {
double [] inputArr = src.getDenseBlock();
- for(int i = destPos; i < src_ru*destNumCols; i++) {
- dest[i] = op.fn.execute(dest[i], inputArr[i]);
+ if(op.fn == Plus.getPlusFnObject()) {
+ for(int i = destPos; i < src_ru*destNumCols; i++) {
+ dest[i] += inputArr[i];
+ }
+ }
+ else if(op.fn == Multiply.getMultiplyFnObject()) {
+ for(int i = destPos; i < src_ru*destNumCols; i++) {
+ dest[i] *= inputArr[i];
+ }
+ }
+ else {
+ for(int i = destPos; i < src_ru*destNumCols; i++) {
+ dest[i] = op.fn.execute(dest[i], inputArr[i]);
+ }
}
}
}
@@ -130,45 +157,6 @@ public class ConvolutionUtils {
}
}
- // dest (of size N x KPQ) = input (of size N x KPQ) op bias (of size K x 1)
- public static void binaryBiasOperations(MatrixBlock input, MatrixBlock bias, double [] dest,
- int K, int PQ, int rl, int ru, BinaryOperator op) throws DMLRuntimeException {
- copy(input, dest, rl*K*PQ, K*PQ, rl, ru);
- binaryBiasOperationInPlace(bias, dest, K, PQ, rl, ru, op);
- }
-
- // dest (of size N x KPQ) op= bias (of size K x 1)
- public static void binaryBiasOperationInPlace(MatrixBlock bias, double [] dest,
- int K, int PQ, int rl, int ru, BinaryOperator op) throws DMLRuntimeException {
- // bias.getNumColumns() == 1 checked outside
- if(!bias.isInSparseFormat()) {
- double [] biasArr = bias.getDenseBlock();
- int index = rl*K*PQ;
- for(int n = rl; n < ru; n++) {
- for(int k = 0; k < K; k++) {
- for(int pq = 0; pq < PQ; pq++, index++) {
- dest[index] = op.fn.execute(dest[index], biasArr[k]);
- }
- }
- }
- }
- else {
- for(int k = 0; k < K; k++) {
- if( !bias.getSparseBlock().isEmpty(k) ) {
- int apos = bias.getSparseBlock().pos(k);
- double[] avals = bias.getSparseBlock().values(k);
- double val = avals[apos];
- for(int n = rl; n < ru; n++) {
- int index = n*K*PQ + k*PQ;
- for(int pq = 0; pq < PQ; pq++, index++) {
- dest[index] = op.fn.execute(dest[index], val);
- }
- }
- }
- }
- }
- }
-
public static void fillBias(MatrixBlock bias, double [] outputArray, int src_rl, int src_ru, int N, int K, int PQ) throws DMLRuntimeException {
// bias.getNumColumns() == 1 checked outside
if(bias.isInSparseFormat()) {
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/39a37ae4/src/main/java/org/apache/sysml/utils/EnvironmentHelper.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/utils/EnvironmentHelper.java b/src/main/java/org/apache/sysml/utils/EnvironmentHelper.java
new file mode 100644
index 0000000..c1b4c3e
--- /dev/null
+++ b/src/main/java/org/apache/sysml/utils/EnvironmentHelper.java
@@ -0,0 +1,27 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.sysml.utils;
+
+
+/**
+ * This class is used to set an environment variable before the MKL library is loaded (the loading itself is done by NativeHelper)
+ */
+public class EnvironmentHelper {
+ public static native void setEnv(String key, String value);
+}
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/39a37ae4/src/main/java/org/apache/sysml/utils/NativeHelper.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/utils/NativeHelper.java b/src/main/java/org/apache/sysml/utils/NativeHelper.java
new file mode 100644
index 0000000..2b997ed
--- /dev/null
+++ b/src/main/java/org/apache/sysml/utils/NativeHelper.java
@@ -0,0 +1,259 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.sysml.utils;
+
+import java.io.IOException;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+
+import java.util.HashMap;
+import java.io.InputStream;
+import java.io.OutputStream;
+import java.io.File;
+
+import org.apache.commons.io.FileUtils;
+import org.apache.commons.io.IOUtils;
+import org.apache.commons.lang.SystemUtils;
+import org.apache.sysml.conf.ConfigurationManager;
+import org.apache.sysml.conf.DMLConfig;
+import org.apache.sysml.hops.OptimizerUtils;
+
+/**
+ * This class helps in loading native library.
+ * By default, it first tries to load Intel MKL, else tries to load OpenBLAS.
+ */
+public class NativeHelper {
+ private static boolean isSystemMLLoaded = false;
+ private static final Log LOG = LogFactory.getLog(NativeHelper.class.getName());
+ private static HashMap<String, String> supportedArchitectures = new HashMap<String, String>();
+ public static String blasType;
+ private static int maxNumThreads = -1;
+ private static boolean setMaxNumThreads = false;
+ static {
+ // Note: we only support 64-bit Java on x86_64/amd64 machines
+ supportedArchitectures.put("x86_64", "x86_64");
+ supportedArchitectures.put("amd64", "x86_64");
+ }
+
+ private static boolean attemptedLoading = false;
+
+ // Performing loading in a method instead of a static block will throw a detailed stack trace in case of fatal errors
+ private static void init() {
+ // Only Linux supported for BLAS
+ if(!SystemUtils.IS_OS_LINUX)
+ return;
+
+ // attemptedLoading variable ensures that we don't try to load SystemML and other dependencies
+ // again and again especially in the parfor (hence the double-checking with synchronized).
+ if(!attemptedLoading) {
+ DMLConfig dmlConfig = ConfigurationManager.getDMLConfig();
+ String userSpecifiedBLAS = System.getenv("SYSTEMML_BLAS");
+ userSpecifiedBLAS = (userSpecifiedBLAS == null) ? "" : userSpecifiedBLAS.trim().toLowerCase();
+ // -------------------------------------------------------------------------------------
+ // We allow BLAS to be enabled or disabled or explicitly selected in one of the two ways:
+ // 1. DML Configuration: native.blas (boolean flag)
+ // 2. Environment variable: SYSTEMML_BLAS (can be set to mkl, openblas or none)
+ // The option 1 will be removed in later SystemML versions.
+ // The option 2 is useful for two reasons:
+ // - Developer testing of different BLAS
+ // - Provides fine-grained control. Certain machines could use mkl while others use openblas, etc.
+ boolean enabledViaConfig = (dmlConfig == null) ? true : dmlConfig.getBooleanValue(DMLConfig.NATIVE_BLAS);
+ boolean enabledViaEnvironmentVariable = userSpecifiedBLAS.equals("") || userSpecifiedBLAS.equals("mkl") || userSpecifiedBLAS.equals("openblas");
+
+ if(enabledViaConfig && enabledViaEnvironmentVariable) {
+ long start = System.nanoTime();
+ if(!supportedArchitectures.containsKey(SystemUtils.OS_ARCH)) {
+ LOG.warn("Unsupported architecture for native BLAS:" + SystemUtils.OS_ARCH);
+ return;
+ }
+ synchronized(NativeHelper.class) {
+ if(!attemptedLoading) {
+ // -----------------------------------------------------------------------------
+ // =============================================================================
+ // By default, native.blas=true and we attempt to load MKL first.
+ // If MKL is not available then we try to load OpenBLAS.
+ // If neither MKL nor OpenBLAS is available we fall back to Java BLAS.
+ if(userSpecifiedBLAS.equalsIgnoreCase("")) {
+ blasType = isMKLAvailable() ? "mkl" : isOpenBLASAvailable() ? "openblas" : null;
+ if(blasType == null)
+ LOG.warn("Unable to load either MKL or OpenBLAS");
+ }
+ else if(userSpecifiedBLAS.equalsIgnoreCase("mkl")) {
+ blasType = isMKLAvailable() ? "mkl" : null;
+ if(blasType == null)
+ LOG.warn("Unable to load MKL");
+ }
+ else if(userSpecifiedBLAS.equalsIgnoreCase("openblas")) {
+ blasType = isOpenBLASAvailable() ? "openblas" : null;
+ if(blasType == null)
+ LOG.warn("Unable to load OpenBLAS");
+ }
+ else {
+ LOG.warn("Unsupported BLAS:" + userSpecifiedBLAS);
+ }
+ // =============================================================================
+ if(blasType != null && loadLibraryHelper("libsystemml_" + blasType + "-Linux-x86_64.so")) {
+ LOG.info("Using native blas: " + blasType);
+ isSystemMLLoaded = true;
+ }
+ }
+ }
+ double timeToLoadInMilliseconds = (System.nanoTime()-start)*1e-6;
+ if(timeToLoadInMilliseconds > 100)
+ LOG.warn("Time to load native blas: " + timeToLoadInMilliseconds + " milliseconds.");
+ }
+ else {
+ if(enabledViaConfig)
+ LOG.warn("Using internal Java BLAS as native BLAS support is disabled by the configuration 'native.blas'.");
+ else
+ LOG.warn("Using internal Java BLAS as native BLAS support is disabled by the environment variable 'SYSTEMML_BLAS=" + userSpecifiedBLAS + "'.");
+ }
+ attemptedLoading = true;
+ }
+ }
+
+ public static boolean isNativeLibraryLoaded() {
+ init();
+ if(maxNumThreads == -1)
+ maxNumThreads = OptimizerUtils.getConstrainedNumThreads(-1);
+ if(isSystemMLLoaded && !setMaxNumThreads && maxNumThreads != -1) {
+ // This method helps us decide whether to use GetPrimitiveArrayCritical or GetDoubleArrayElements in JNI as each has different tradeoffs.
+ // In current implementation, we always use GetPrimitiveArrayCritical as it has proven to be fastest.
+ // We can revisit this decision later and hence I would not recommend removing this method.
+ setMaxNumThreads(maxNumThreads);
+ setMaxNumThreads = true;
+ }
+ return isSystemMLLoaded;
+ }
+
+ public static int getMaxNumThreads() {
+ if(maxNumThreads == -1)
+ maxNumThreads = OptimizerUtils.getConstrainedNumThreads(-1);
+ return maxNumThreads;
+ }
+
+
+ private static boolean isMKLAvailable() {
+ // ------------------------------------------------------------
+ // Set environment variable MKL_THREADING_LAYER to GNU on Linux for performance
+ if(!loadLibraryHelper("libpreload_systemml-Linux-x86_64.so")) {
+ LOG.warn("Unable to load preload_systemml (required for loading MKL-enabled SystemML library)");
+ return false;
+ }
+ // The most reliable way in my investigation to ensure that MKL runs smoothly with OpenMP (used by conv2d*)
+ // is setting the environment variable MKL_THREADING_LAYER to GNU
+ EnvironmentHelper.setEnv("MKL_THREADING_LAYER", "GNU");
+ if(!loadBLAS("gomp", "gomp required for loading MKL-enabled SystemML library"))
+ return false;
+
+ // ------------------------------------------------------------
+ return loadBLAS("mkl_rt", null);
+ }
+
+ private static boolean isOpenBLASAvailable() {
+ if(!loadBLAS("gomp", "gomp required for loading OpenBLAS-enabled SystemML library"))
+ return false;
+ return loadBLAS("openblas", null);
+ }
+
+ private static boolean loadBLAS(String blas, String optionalMsg) {
+ try {
+ System.loadLibrary(blas);
+ return true;
+ }
+ catch (UnsatisfiedLinkError e) {
+ if(optionalMsg != null)
+ LOG.warn("Unable to load " + blas + "(" + optionalMsg + "):" + e.getMessage());
+ else
+ LOG.warn("Unable to load " + blas + ":" + e.getMessage());
+ return false;
+ }
+ }
+
+ private static boolean loadLibraryHelper(String path) {
+ InputStream in = null; OutputStream out = null;
+ try {
+ // This logic is needed because Java does not allow loading a library directly from a resource file, so the resource is first copied to a temporary file.
+ in = NativeHelper.class.getResourceAsStream("/lib/"+path);
+ if(in != null) {
+ File temp = File.createTempFile(path, "");
+ temp.deleteOnExit();
+ out = FileUtils.openOutputStream(temp);
+ IOUtils.copy(in, out);
+ in.close(); in = null;
+ out.close(); out = null;
+ System.load(temp.getAbsolutePath());
+ return true;
+ }
+ else
+ LOG.warn("No lib available in the jar:" + path);
+ } catch(IOException e) {
+ LOG.warn("Unable to load library " + path + " from resource:" + e.getMessage());
+ } finally {
+ if(out != null)
+ try {
+ out.close();
+ } catch (IOException e) {}
+ if(in != null)
+ try {
+ in.close();
+ } catch (IOException e) {}
+ }
+ return false;
+ }
+
+ // TODO: Add pmm, wsloss, mmchain, etc.
+ public static native boolean matrixMultDenseDense(double [] m1, double [] m2, double [] ret, int m1rlen, int m1clen, int m2clen, int numThreads);
+ private static native boolean tsmm(double [] m1, double [] ret, int m1rlen, int m1clen, boolean isLeftTranspose, int numThreads);
+
+ // ----------------------------------------------------------------------------------------------------------------
+ // LibMatrixDNN operations:
+ // N = number of images, C = number of channels, H = image height, W = image width
+ // K = number of filters, R = filter height, S = filter width
+ // TODO: case not handled: sparse filters (which will only be executed in Java). Since filters are relatively smaller, this is a low priority.
+
+ // Called by ConvolutionCPInstruction if both input and filter are dense
+ public static native boolean conv2dDense(double [] input, double [] filter, double [] ret, int N, int C, int H, int W,
+ int K, int R, int S, int stride_h, int stride_w, int pad_h, int pad_w, int P, int Q, int numThreads);
+ public static native boolean conv2dBiasAddDense(double [] input, double [] bias, double [] filter, double [] ret, int N, int C, int H, int W,
+ int K, int R, int S, int stride_h, int stride_w, int pad_h, int pad_w, int P, int Q, int numThreads);
+ // Called by ConvolutionCPInstruction if both input and filter are dense
+ public static native boolean conv2dBackwardFilterDense(double [] input, double [] dout, double [] ret, int N, int C, int H, int W,
+ int K, int R, int S, int stride_h, int stride_w, int pad_h, int pad_w, int P, int Q, int numThreads);
+ // If both filter and dout are dense, then called by ConvolutionCPInstruction
+ // Else, called by LibMatrixDNN's thread if filter is dense. dout[n] is converted to dense if sparse.
+ public static native boolean conv2dBackwardDataDense(double [] filter, double [] dout, double [] ret, int N, int C, int H, int W,
+ int K, int R, int S, int stride_h, int stride_w, int pad_h, int pad_w, int P, int Q, int numThreads);
+
+ // Currently only supported with numThreads = 1 and sparse input
+ // Called by LibMatrixDNN's thread if input is sparse. dout[n] is converted to dense if sparse.
+ public static native boolean conv2dBackwardFilterSparseDense(int apos, int alen, int[] aix, double[] avals, double [] rotatedDoutPtr, double [] ret, int N, int C, int H, int W,
+ int K, int R, int S, int stride_h, int stride_w, int pad_h, int pad_w, int P, int Q, int numThreads);
+ // Called by LibMatrixDNN's thread if input is sparse and filter is dense
+ public static native boolean conv2dSparse(int apos, int alen, int[] aix, double[] avals, double [] filter, double [] ret, int N, int C, int H, int W,
+ int K, int R, int S, int stride_h, int stride_w, int pad_h, int pad_w, int P, int Q, int numThreads);
+ // ----------------------------------------------------------------------------------------------------------------
+
+ // This method helps us decide whether to use GetPrimitiveArrayCritical or GetDoubleArrayElements in JNI as each has different tradeoffs.
+ // In current implementation, we always use GetPrimitiveArrayCritical as it has proven to be fastest.
+ // We can revisit this decision later and hence I would not recommend removing this method.
+ private static native void setMaxNumThreads(int numThreads);
+}
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/39a37ae4/src/main/java/org/apache/sysml/utils/Statistics.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/utils/Statistics.java b/src/main/java/org/apache/sysml/utils/Statistics.java
index 097f58e..97888cb 100644
--- a/src/main/java/org/apache/sysml/utils/Statistics.java
+++ b/src/main/java/org/apache/sysml/utils/Statistics.java
@@ -112,6 +112,17 @@ public class Statistics
return numExecutedMRJobs.longValue();
}
+ private static LongAdder numNativeFailures = new LongAdder();
+ public static LongAdder numNativeLibMatrixMultCalls = new LongAdder();
+ public static LongAdder numNativeLibMatrixDNNCalls = new LongAdder();
+ public static void incrementNativeFailuresCounter() {
+ numNativeFailures.increment();
+ // This is very rare and I am not sure it is possible at all. Our initial experiments never encountered this case.
+ // Note: all the native calls have a fallback to Java; so if the user wants, she can recompile SystemML after
+ // commenting out this exception and everything should work fine.
+ throw new RuntimeException("Unexpected ERROR: OOM caused during JNI transfer. Please disable native BLAS by setting enviroment variable: SYSTEMML_BLAS=none");
+ }
+
public static void incrementNoOfExecutedMRJobs() {
numExecutedMRJobs.increment();
}
@@ -366,6 +377,9 @@ public class Statistics
resetCPHeavyHitters();
GPUStatistics.reset();
+ numNativeLibMatrixMultCalls.reset();
+ numNativeLibMatrixDNNCalls.reset();
+ numNativeFailures.reset();
LibMatrixDNN.resetStatistics();
}
@@ -621,6 +635,11 @@ public class Statistics
//show extended caching/compilation statistics
if( DMLScript.STATISTICS )
{
+ if(NativeHelper.blasType != null && (numNativeLibMatrixMultCalls.longValue() > 0 ||
+ numNativeLibMatrixDNNCalls.longValue() > 0)) {
+ String blas = NativeHelper.blasType != null ? NativeHelper.blasType : "";
+ sb.append("Native " + blas + " calls (LibMatrixMult/LibMatrixDNN):\t" + numNativeLibMatrixMultCalls.longValue() + "/" + numNativeLibMatrixDNNCalls.longValue() + ".\n");
+ }
sb.append("Cache hits (Mem, WB, FS, HDFS):\t" + CacheStatistics.displayHits() + ".\n");
sb.append("Cache writes (WB, FS, HDFS):\t" + CacheStatistics.displayWrites() + ".\n");
sb.append("Cache times (ACQr/m, RLS, EXP):\t" + CacheStatistics.displayTime() + " sec.\n");