You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@systemml.apache.org by ni...@apache.org on 2016/08/11 23:06:16 UTC
incubator-systemml git commit: [SYSTEMML-540] Performance improvement
of sparse convolution and maxpooling functions
Repository: incubator-systemml
Updated Branches:
refs/heads/master 96d021930 -> 4feb98f28
[SYSTEMML-540] Performance improvement of sparse convolution and
maxpooling functions
Project: http://git-wip-us.apache.org/repos/asf/incubator-systemml/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-systemml/commit/4feb98f2
Tree: http://git-wip-us.apache.org/repos/asf/incubator-systemml/tree/4feb98f2
Diff: http://git-wip-us.apache.org/repos/asf/incubator-systemml/diff/4feb98f2
Branch: refs/heads/master
Commit: 4feb98f28acbc858452d64b2229bed10241bd371
Parents: 96d0219
Author: Niketan Pansare <np...@us.ibm.com>
Authored: Thu Aug 11 16:02:09 2016 -0700
Committer: Niketan Pansare <np...@us.ibm.com>
Committed: Thu Aug 11 16:04:49 2016 -0700
----------------------------------------------------------------------
.../cp/ConvolutionCPInstruction.java | 1 -
.../sysml/runtime/matrix/data/LibMatrixDNN.java | 263 ++++++++++++-------
2 files changed, 171 insertions(+), 93 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/4feb98f2/src/main/java/org/apache/sysml/runtime/instructions/cp/ConvolutionCPInstruction.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/instructions/cp/ConvolutionCPInstruction.java b/src/main/java/org/apache/sysml/runtime/instructions/cp/ConvolutionCPInstruction.java
index 6c27128..c0a1af5 100644
--- a/src/main/java/org/apache/sysml/runtime/instructions/cp/ConvolutionCPInstruction.java
+++ b/src/main/java/org/apache/sysml/runtime/instructions/cp/ConvolutionCPInstruction.java
@@ -161,7 +161,6 @@ public class ConvolutionCPInstruction extends UnaryCPInstruction {
.getLongValue();
}
- // TODO: optimize "Sparse operations" once we are happy with the performance of single node Lenet script on dense MNIST dataset
@Override
public void processInstruction(ExecutionContext ec)
throws DMLRuntimeException {
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/4feb98f2/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNN.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNN.java b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNN.java
index 855d991..2374931 100644
--- a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNN.java
+++ b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNN.java
@@ -54,6 +54,14 @@ public class LibMatrixDNN {
non_zeroed_double_arr.put(arr.length, new SoftReference<double[]>(arr));
}
}
+ private static boolean warnedSingleThread = false;
+ // Emits a ONE-TIME performance warning when LibMatrixDNN falls back to
+ // single-threaded execution. This is a performance concern, not an error,
+ // so it must not abort execution: throwing RuntimeException here would
+ // kill every sequential conv/pooling path. The guard flag is set so the
+ // message is printed at most once per JVM.
+ private static void warnSingleThreaded() {
+ if(!warnedSingleThread) {
+ System.err.println("WARN: Single thread execution in LibMatrixDNN");
+ warnedSingleThread = true;
+ }
+ }
public static double[] getReuseableData(long length) {
if(length >= NON_ZEROED_DOUBLE_ARR_THRESHOLD) {
// Explicit "new Integer" required here for HashMap.remove
@@ -89,19 +97,33 @@ public class LibMatrixDNN {
private static AtomicLong im2colDenseCount = new AtomicLong(0);
private static AtomicLong maxPoolBwdSparseCount = new AtomicLong(0);
private static AtomicLong maxPoolBwdDenseCount = new AtomicLong(0);
+ private static AtomicLong loopedConvMatMultTime = new AtomicLong(0);
+ private static AtomicLong loopedConvIm2ColTime = new AtomicLong(0);
+ private static AtomicLong loopedConvBwdMatMultTime = new AtomicLong(0);
+ private static AtomicLong loopedConvBwdIm2ColTime = new AtomicLong(0);
+
public static void appendStatistics(StringBuilder sb) {
- sb.append("LibMatrixDNN dense count (conv/bwdF/bwdD/im2col/maxBwd):\t"
- + conv2dDenseCount.get() + "/"
- + conv2dBwdFilterDenseCount.get() + "/"
- + conv2dBwdDataDenseCount.get() + "/"
- + im2colDenseCount.get() + "/"
- + maxPoolBwdDenseCount.get() + ".\n");
- sb.append("LibMatrixDNN sparse count (conv/bwdF/bwdD/im2col/maxBwd):\t"
- + conv2dSparseCount.get() + "/"
- + conv2dBwdFilterSparseCount.get() + "/"
- + conv2dBwdDataSparseCount.get() + "/"
- + im2colSparseCount.get() + "/"
- + maxPoolBwdSparseCount.get() + ".\n");
+ if(DMLScript.STATISTICS && (conv2dDenseCount.get() != 0 || conv2dSparseCount.get() != 0)) {
+ sb.append("LibMatrixDNN dense count (conv/bwdF/bwdD/im2col/maxBwd):\t"
+ + conv2dDenseCount.get() + "/"
+ + conv2dBwdFilterDenseCount.get() + "/"
+ + conv2dBwdDataDenseCount.get() + "/"
+ + im2colDenseCount.get() + "/"
+ + maxPoolBwdDenseCount.get() + ".\n");
+ sb.append("LibMatrixDNN sparse count (conv/bwdF/bwdD/im2col/maxBwd):\t"
+ + conv2dSparseCount.get() + "/"
+ + conv2dBwdFilterSparseCount.get() + "/"
+ + conv2dBwdDataSparseCount.get() + "/"
+ + im2colSparseCount.get() + "/"
+ + maxPoolBwdSparseCount.get() + ".\n");
+ if(loopedConvMatMultTime.get() != 0 || loopedConvIm2ColTime.get() != 0) {
+ sb.append("LibMatrixDNN conv(im2col/matmult), bwdFil (im2col/matmult) time:\t" +
+ String.format("%.3f", loopedConvIm2ColTime.get()*1e-9) + "/" +
+ String.format("%.3f", loopedConvMatMultTime.get()*1e-9) + "/" +
+ String.format("%.3f", loopedConvBwdIm2ColTime.get()*1e-9) + "/" +
+ String.format("%.3f", loopedConvBwdMatMultTime.get()*1e-9) + " sec.\n");
+ }
+ }
}
public static void resetStatistics() {
conv2dDenseCount.set(0);
@@ -115,6 +137,11 @@ public class LibMatrixDNN {
conv2dBwdDataSparseCount.set(0);
im2colSparseCount.set(0);
maxPoolBwdSparseCount.set(0);
+
+ loopedConvIm2ColTime.set(0);
+ loopedConvMatMultTime.set(0);
+ loopedConvBwdMatMultTime.set(0);
+ loopedConvBwdIm2ColTime.set(0);
}
public static class ConvolutionParameters {
@@ -228,6 +255,7 @@ public class LibMatrixDNN {
int constrainedNumThreads = OptimizerUtils.getConstrainedNumThreads(params.numThreads);
if(!ALLOW_MULTI_THREADED_OPS || constrainedNumThreads <= 1) {
+ warnSingleThreaded();
if(useMemoryLessConvolution) {
for (int c = 0; c < params.C; c++) {
for (int k = 0; k < params.K; k++) {
@@ -316,16 +344,24 @@ public class LibMatrixDNN {
private static MatrixBlock doLoopedIm2ColConv2dBwdFilter(int n,
MatrixBlock im2ColOutBlock, MatrixBlock dout_reshaped, MatrixBlock partialRetBlock, ConvolutionParameters params) throws DMLRuntimeException {
long nnz = 0;
+ long t1 = DMLScript.STATISTICS ? System.nanoTime() : 0;
for (int c = 0; c < params.C; c++) {
nnz += doIm2colOverInputPath_NCHW(n, c, im2ColOutBlock, params);
}
+ long t2 = DMLScript.STATISTICS ? System.nanoTime() : 0 ;
im2ColOutBlock.setNonZeros(nnz);
doRotate180(n, 0, params.input2, dout_reshaped.denseBlock, params, true);
dout_reshaped.recomputeNonZeros();
MatrixBlock temp = new MatrixBlock(params.C*params.R*params.S, params.K, false);
+ long t3 = DMLScript.STATISTICS ? System.nanoTime() : 0 ;
LibMatrixMult.matrixMult(im2ColOutBlock, dout_reshaped, temp);
+ long t4 = DMLScript.STATISTICS ? System.nanoTime() : 0 ;
+ if(DMLScript.STATISTICS) {
+ loopedConvBwdMatMultTime.addAndGet(t4-t3);
+ loopedConvBwdIm2ColTime.addAndGet(t2-t1);
+ }
elementWiseInPlaceTransposedAddition(partialRetBlock, temp);
return partialRetBlock;
@@ -489,6 +525,7 @@ public class LibMatrixDNN {
int constrainedNumThreads = OptimizerUtils.getConstrainedNumThreads(params.numThreads);
if(!ALLOW_MULTI_THREADED_OPS || constrainedNumThreads <= 1) {
+ warnSingleThreaded();
if(useMemoryLessConvolution) {
for (int n = 0; n < params.N; n++) {
for (int k = 0; k < params.K; k++) {
@@ -514,12 +551,22 @@ public class LibMatrixDNN {
private static void doLoopedIm2ColConv2d(int n, MatrixBlock im2ColOutBlock, ConvolutionParameters params) throws DMLRuntimeException {
long nnz = 0;
+ long t1 = DMLScript.STATISTICS ? System.nanoTime() : 0;
for (int c = 0; c < params.C; c++) {
nnz += doIm2colOverInputPath_NCHW(n, c, im2ColOutBlock, params);
}
+ long t2 = DMLScript.STATISTICS ? System.nanoTime() : 0;
+
im2ColOutBlock.setNonZeros(nnz);
MatrixBlock matMultOutBlock = new MatrixBlock(params.K, params.P*params.Q, false);
LibMatrixMult.matrixMult(params.input2, im2ColOutBlock, matMultOutBlock);
+ long t3 = DMLScript.STATISTICS ? System.nanoTime() : 0;
+
+ if(DMLScript.STATISTICS) {
+ loopedConvIm2ColTime.addAndGet(t2 - t1);
+ loopedConvMatMultTime.addAndGet(t3 - t2);
+ }
+
if(matMultOutBlock.isInSparseFormat()) {
Iterator<IJV> iter = matMultOutBlock.sparseBlock.getIterator();
final int outOffset = n*params.K*params.P*params.Q;
@@ -608,6 +655,7 @@ public class LibMatrixDNN {
int constrainedNumThreads = OptimizerUtils.getConstrainedNumThreads(params.numThreads);
if(!ALLOW_MULTI_THREADED_OPS || constrainedNumThreads <= 1) {
+ warnSingleThreaded();
for (int n = 0; n < params.N; n++) {
doPoolingBackward(n, params);
}
@@ -686,25 +734,23 @@ public class LibMatrixDNN {
int [] maxIndexArrS = params.tmpData.maxIndexArrS;
final int outputOffset = n*params.K*params.P*params.Q + k*params.P*params.Q;
- Iterator<IJV> iter = params.input2.sparseBlock.getIterator();
+ Iterator<IJV> iter = params.input2.sparseBlock.getIterator(k, k+1);
int [] tensorIndexes = new int[4];
while(iter.hasNext()) {
IJV ijv = iter.next();
computeTensorIndexes(ijv.getI(), ijv.getJ(), tensorIndexes, params.K, params.C, params.R, params.S);
- if(k == tensorIndexes[0]) {
- int c = tensorIndexes[1];
- int r = tensorIndexes[2];
- int s = tensorIndexes[3];
- double filterVal = ijv.getV();
- final int inputOffset = n*params.C*params.H*params.W + c*params.H*params.W + s - params.pad_w;
- for (int p = minIndexArrR[r]; p < maxIndexArrR[r]; p++) {
- final int hOffset = inputOffset + (p*params.stride_h + r - params.pad_h)*params.W;
- final int pOffset = outputOffset + p*params.Q;
- for (int q = minIndexArrS[s]; q < maxIndexArrS[s]; q++) {
- final int w = q*params.stride_w;
- outputArray[pOffset + q] += inputArray[hOffset + w]*filterVal;
- }
+ int c = tensorIndexes[1];
+ int r = tensorIndexes[2];
+ int s = tensorIndexes[3];
+ double filterVal = ijv.getV();
+ final int inputOffset = n*params.C*params.H*params.W + c*params.H*params.W + s - params.pad_w;
+ for (int p = minIndexArrR[r]; p < maxIndexArrR[r]; p++) {
+ final int hOffset = inputOffset + (p*params.stride_h + r - params.pad_h)*params.W;
+ final int pOffset = outputOffset + p*params.Q;
+ for (int q = minIndexArrS[s]; q < maxIndexArrS[s]; q++) {
+ final int w = q*params.stride_w;
+ outputArray[pOffset + q] += inputArray[hOffset + w]*filterVal;
}
}
}
@@ -714,7 +760,7 @@ public class LibMatrixDNN {
double [] outputArray = params.output.getDenseBlock();
int outputOffset = n*params.K*params.P*params.Q + k*params.P*params.Q;
- Iterator<IJV> iter = params.input1.sparseBlock.getIterator();
+ Iterator<IJV> iter = params.input1.sparseBlock.getIterator(n, n+1);
int [] tensorIndexes = new int[4];
int [] minIndexArrR = params.tmpData.minIndexArrR;
@@ -722,22 +768,21 @@ public class LibMatrixDNN {
while(iter.hasNext()) {
IJV ijv = iter.next();
computeTensorIndexes(ijv.getI(), ijv.getJ(), tensorIndexes, params.N, params.C, params.H, params.W);
- if(n == tensorIndexes[0]) {
- int c = tensorIndexes[1];
- int h = tensorIndexes[2];
- int w = tensorIndexes[3];
- double imgVal = ijv.getV();
- for (int r = minIndexArrR[h]; r < params.R; r += params.stride_h) {
- int filterOffset = k*params.C*params.R*params.S + c*params.R*params.S + r*params.S;
- for (int s = minIndexArrS[w]; s < params.S; s += params.stride_w) {
- int p = (int)Math.ceil(((double)(h + params.pad_h - r)) / params.stride_h);
- int q = (int)Math.ceil(((double)(w + params.pad_w - s)) / params.stride_w);
- if(p >= 0 && p < params.P && q >= 0 && q < params.Q) {
- double filterVal = filterArray[filterOffset + s];
- outputArray[outputOffset + p*params.Q + q] += imgVal*filterVal;
- }
+
+ int c = tensorIndexes[1];
+ int h = tensorIndexes[2];
+ int w = tensorIndexes[3];
+ double imgVal = ijv.getV();
+ for (int r = minIndexArrR[h]; r < params.R; r += params.stride_h) {
+ int filterOffset = k*params.C*params.R*params.S + c*params.R*params.S + r*params.S;
+ for (int s = minIndexArrS[w]; s < params.S; s += params.stride_w) {
+ int p = (int)Math.ceil(((double)(h + params.pad_h - r)) / params.stride_h);
+ int q = (int)Math.ceil(((double)(w + params.pad_w - s)) / params.stride_w);
+ if(p >= 0 && p < params.P && q >= 0 && q < params.Q) {
+ double filterVal = filterArray[filterOffset + s];
+ outputArray[outputOffset + p*params.Q + q] += imgVal*filterVal;
}
- }
+ }
}
}
}
@@ -754,7 +799,7 @@ public class LibMatrixDNN {
int [] tensorIndexesImage = new int[4];
int [] tensorIndexesFilter = new int[4];
- Iterator<IJV> iter = params.input1.sparseBlock.getIterator();
+ Iterator<IJV> iter = params.input1.sparseBlock.getIterator(n, n+1);
while(iter.hasNext()) {
IJV ijv = iter.next();
@@ -765,11 +810,11 @@ public class LibMatrixDNN {
int w = tensorIndexesImage[3];
double imgVal = ijv.getV();
- Iterator<IJV> iter1 = params.input2.sparseBlock.getIterator();
+ Iterator<IJV> iter1 = params.input2.sparseBlock.getIterator(k, k+1);
while(iter1.hasNext()) {
IJV ijv1 = iter1.next();
computeTensorIndexes(ijv1.getI(), ijv1.getJ(), tensorIndexesFilter, params.K, params.C, params.R, params.S);
- if(k == tensorIndexesFilter[0] && c == tensorIndexesFilter[1]) {
+ if(c == tensorIndexesFilter[1]) {
int r = tensorIndexesFilter[2];
int s = tensorIndexesFilter[3];
if((r-minIndexArrR[h])%params.stride_h == 0 && (s-minIndexArrS[w])%params.stride_w == 0) {
@@ -870,70 +915,65 @@ public class LibMatrixDNN {
doPoolingBackwardDenseSparse(n, inputArray, params.input2, outputArray, params);
}
else {
- doPoolingBackwardUnOptimizedSparse_(n, params);
+ if(doutArray != null)
+ doPoolingBackwardSparseDense(n, doutArray, outputArray, params);
+ else
+ doPoolingBackwardSparseSparse(n, outputArray, params);
}
-
}
- private static void doPoolingBackwardUnOptimizedSparse_(int n, ConvolutionParameters params) throws DMLRuntimeException {
+ private static void doPoolingBackwardSparseDense(int n, double [] doutArray, double [] outputArray, ConvolutionParameters params) throws DMLRuntimeException {
if (!params.input1.isInSparseFormat())
throw new DMLRuntimeException("Incorrect usage: Call optimized versions");
- double [] doutArray = null;
- if (!params.input2.isInSparseFormat())
- doutArray = params.input2.getDenseBlock();
- double [] outputArray = null;
- if (!params.output.isInSparseFormat())
- outputArray = params.output.getDenseBlock();
-
+
for (int c = 0; c < params.C; c++) {
for (int p = 0; p < params.P; p++) {
for (int q = 0; q < params.Q; q++) {
- int start_index_h = p * params.stride_h - params.pad_h;
- int start_index_w = q * params.stride_w - params.pad_w;
- int end_index_h = Math.min(start_index_h + params.R, params.H);
- int end_index_w = Math.min(start_index_w + params.S, params.W);
- start_index_h = Math.max(start_index_h, 0);
- start_index_w = Math.max(start_index_w, 0);
- int maxIndex = n*params.C*params.H*params.W + c*params.H*params.W + start_index_h*params.W + start_index_w;
- double maxVal = -Double.MAX_VALUE;
-
-
- double currDoutVal = -1;
- for (int h = start_index_h; h < end_index_h; h++) {
- for (int w = start_index_w; w < end_index_w; w++) {
- currDoutVal = params.input1.quickGetValue(n, c*params.H*params.W + h*params.W + w);
-
- if(maxVal < currDoutVal) {
- maxIndex = n*params.C*params.H*params.W + c*params.H*params.W + h*params.W + w;
- maxVal = currDoutVal;
- }
- }
- }
-
- double inVal = -1;
- if(doutArray != null)
- inVal = doutArray[n*params.C*params.P*params.Q + c*params.P*params.Q + p * params.Q + q];
- else
- inVal = params.input2.quickGetValue(n, c*params.P*params.Q + p * params.Q + q);
-
- // synchronized(this) {
+ double inVal = doutArray[n*params.C*params.P*params.Q + c*params.P*params.Q + p * params.Q + q];
+ if(inVal != 0) {
+ final int inputOffset = n*params.C*params.H*params.W + c*params.H*params.W;
+ int start_index_h = p * params.stride_h - params.pad_h;
+ final int end_index_h = Math.min(start_index_h + params.R, params.H);
+ start_index_h = Math.max(start_index_h, 0);
+ int maxIndex = getMaxIndexSparse(start_index_h, end_index_h, q, inputOffset, n, c, params.input1, params);
outputArray[maxIndex] += inVal;
- // }
+ }
}
}
}
}
+ // Max-pooling backward pass for image n when both the forward input
+ // (params.input1) and the incoming gradient dout (params.input2) are in
+ // sparse format, while the output gradient buffer is dense. Iterates only
+ // over the non-zeros of dout's row n (one image per matrix row) and routes
+ // each gradient value to the position of the window maximum in input1.
+ private static void doPoolingBackwardSparseSparse(int n, double [] outputArray, ConvolutionParameters params) throws DMLRuntimeException {
+ if (!params.input1.isInSparseFormat())
+ throw new DMLRuntimeException("Incorrect usage: Call optimized versions");
+
+ // Restrict iteration to row n of the sparse dout block (avoids the old
+ // full-matrix scan with a per-entry row check).
+ Iterator<IJV> iter = params.input2.sparseBlock.getIterator(n, n+1);
+ int [] tensorIndexes = new int[4];
+
+ while(iter.hasNext()) {
+ IJV ijv = iter.next();
+ // Decode the flat (row, col) position into NCPQ tensor coordinates.
+ computeTensorIndexes(ijv.getI(), ijv.getJ(), tensorIndexes, params.N, params.C, params.P, params.Q);
+ int c = tensorIndexes[1];
+ int p = tensorIndexes[2];
+ int q = tensorIndexes[3];
+
+ // Pooling window rows: [start_index_h, end_index_h), clamped to image.
+ final int inputOffset = n*params.C*params.H*params.W + c*params.H*params.W;
+ int start_index_h = p * params.stride_h - params.pad_h;
+ final int end_index_h = Math.min(start_index_h + params.R, params.H);
+ start_index_h = Math.max(start_index_h, 0);
+ // Accumulate the gradient at the argmax of the forward-pass window.
+ int maxIndex = getMaxIndexSparse(start_index_h, end_index_h, q, inputOffset, n, c, params.input1, params);
+ outputArray[maxIndex] += ijv.getV();
+ }
+
+ }
+
private static void doPoolingBackwardDenseSparse(int n, double [] inputArray,
MatrixBlock dout, double [] outputArray, ConvolutionParameters params) throws DMLRuntimeException {
- Iterator<IJV> iter = dout.sparseBlock.getIterator();
+ Iterator<IJV> iter = dout.sparseBlock.getIterator(n, n+1);
int [] tensorIndexes = new int[4];
while(iter.hasNext()) {
IJV ijv = iter.next();
- if(ijv.getI() != n)
- continue;
-
computeTensorIndexes(ijv.getI(), ijv.getJ(), tensorIndexes, params.N, params.C, params.P, params.Q);
int c = tensorIndexes[1];
int p = tensorIndexes[2];
@@ -967,6 +1007,41 @@ public class LibMatrixDNN {
}
}
+ // Returns the flattened input index of the maximum value inside the pooling
+ // window at (p-row range [start_index_h, end_index_h), output column q) for
+ // image n / channel c, scanning only the non-zeros of the sparse input.
+ // Sparse counterpart of getMaxIndex.
+ // NOTE(review): zero cells are never scanned, so if every non-zero in the
+ // window is negative this returns the window start rather than the position
+ // of a (larger) zero entry — confirm this matches the dense semantics
+ // expected by max-pooling backward.
+ private static int getMaxIndexSparse(int start_index_h, int end_index_h,
+ int q, int inputOffset, int n, int c, MatrixBlock input, ConvolutionParameters params) throws DMLRuntimeException {
+ if(!input.isInSparseFormat())
+ throw new DMLRuntimeException("Incorrect usage: Only sparse format supported");
+
+ // Iterate only over the non-zeros of row n (one image per matrix row).
+ Iterator<IJV> iter = input.sparseBlock.getIterator(n, n+1);
+ int [] tensorIndexes = new int[4];
+
+ // BUGFIX: compute the window end from the UNCLAMPED start, as in the dense
+ // getMaxIndex. Clamping start to 0 before computing the end widened the
+ // window past the filter extent whenever pad_w > 0.
+ int start_index_w = q * params.stride_w - params.pad_w;
+ int end_index_w = Math.min(start_index_w + params.S, params.W);
+ start_index_w = Math.max(start_index_w, 0);
+
+ int maxIndex = inputOffset + start_index_h*params.W + start_index_w;
+ double maxVal = -Double.MAX_VALUE;
+
+ // Find maxIndex
+ double currDoutVal = -1;
+ while(iter.hasNext()) {
+ IJV ijv = iter.next();
+ computeTensorIndexes(ijv.getI(), ijv.getJ(), tensorIndexes, params.N, params.C, params.H, params.W);
+ if(c != tensorIndexes[1])
+ continue;
+ int h = tensorIndexes[2];
+ int w = tensorIndexes[3];
+ if(h >= start_index_h && h < end_index_h && w >= start_index_w && w < end_index_w) {
+ currDoutVal = ijv.getV();
+ if(maxVal < currDoutVal) {
+ maxIndex = inputOffset + h*params.W + w;
+ maxVal = currDoutVal;
+ }
+ }
+ }
+ return maxIndex;
+ }
+
private static int getMaxIndex(int start_index_h, int end_index_h,
int q, int inputOffset, double [] inputArray, ConvolutionParameters params) {
int start_index_w = q * params.stride_w - params.pad_w;
@@ -1002,6 +1077,7 @@ public class LibMatrixDNN {
int constrainedNumThreads = OptimizerUtils.getConstrainedNumThreads(params.numThreads);
if(!ALLOW_MULTI_THREADED_OPS || constrainedNumThreads <= 1) {
+ warnSingleThreaded();
for (int n = 0; n < params.N; n++) {
for (int c = 0; c < params.C; c++) {
doPooling(n, c, params);
@@ -1061,6 +1137,7 @@ public class LibMatrixDNN {
int constrainedNumThreads = OptimizerUtils.getConstrainedNumThreads(params.numThreads);
if(!ALLOW_MULTI_THREADED_OPS || constrainedNumThreads <= 1) {
+ warnSingleThreaded();
for (int n = 0; n < params.N; n++) {
doRotate180(n, params);
}
@@ -1102,12 +1179,10 @@ public class LibMatrixDNN {
if(zeroOutSparseOutput)
Arrays.fill(outputArray, 0);
- Iterator<IJV> iter = input.sparseBlock.getIterator();
+ Iterator<IJV> iter = input.sparseBlock.getIterator(inputN, inputN+1);
int [] tensorIndexes = new int[4];
while(iter.hasNext()) {
IJV ijv = iter.next();
- if(ijv.getI() != inputN)
- continue;
computeTensorIndexes(ijv.getI(), ijv.getJ(), tensorIndexes, params.N, params.K, params.P, params.Q);
int k = tensorIndexes[1];
int p = tensorIndexes[2];
@@ -1129,6 +1204,7 @@ public class LibMatrixDNN {
int constrainedNumThreads = OptimizerUtils.getConstrainedNumThreads(params.numThreads);
if(!ALLOW_MULTI_THREADED_OPS || constrainedNumThreads <= 1) {
+ warnSingleThreaded();
for (int n = 0; n < params.N; n++) {
doReshapeCol(n, params);
}
@@ -1167,6 +1243,7 @@ public class LibMatrixDNN {
private static void runSequentialConvTask(int NSize, int Z, TaskType type, ConvolutionParameters params) throws DMLRuntimeException {
ConvTask task = new ConvTask(0, NSize, 0, Z, type, params);
+ warnSingleThreaded();
try {
task.call();
} catch (Exception e) {
@@ -1356,6 +1433,7 @@ public class LibMatrixDNN {
int constrainedNumThreads = OptimizerUtils.getConstrainedNumThreads(params.numThreads);
if(!ALLOW_MULTI_THREADED_OPS || constrainedNumThreads <= 1) {
+ warnSingleThreaded();
long nnz = 0;
for (int n = 0; n < params.N; n++) { // Do following for all images
for (int c = 0; c < params.C; c++) { // Since format is NCHW
@@ -1378,6 +1456,7 @@ public class LibMatrixDNN {
int constrainedNumThreads = OptimizerUtils.getConstrainedNumThreads(params.numThreads);
if(!ALLOW_MULTI_THREADED_OPS || constrainedNumThreads <= 1) {
+ warnSingleThreaded();
// Sequential col2im
for (int n = 0; n < params.N; n++) { // Do following for all images
for (int c = 0; c < params.C; c++) { // Since format is NCHW