You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@systemml.apache.org by mb...@apache.org on 2017/10/15 09:52:52 UTC
[1/3] systemml git commit: [SYSTEMML-1961] Performance dense max
pooling (init, cases, nnz)
Repository: systemml
Updated Branches:
refs/heads/master 98ee9b7d8 -> 33559144c
[SYSTEMML-1961] Performance dense max pooling (init, cases, nnz)
This patch makes a number of performance improvements to the existing
maxpooling builtin functions:
1) New special cases for (a) stride=1, pad=0, and (b) stride=1, pad=0,
P=1, Q=1, W=1, without materialization of start-end index arrays and
significantly simplified loop structures.
2) Thread local output initialization and nnz maintenance to reduce the
the serial fraction and improve temporal locality for multi-threaded
operations.
On a special case scenario of a 1K x 200*2048 input (~3.3GB) and
stride=1, pad=0, P=1, Q=1, and W=1, this patch improved performance from
253ms to 131ms, which is now at peak memory bandwidth.
Project: http://git-wip-us.apache.org/repos/asf/systemml/repo
Commit: http://git-wip-us.apache.org/repos/asf/systemml/commit/17c5d5aa
Tree: http://git-wip-us.apache.org/repos/asf/systemml/tree/17c5d5aa
Diff: http://git-wip-us.apache.org/repos/asf/systemml/diff/17c5d5aa
Branch: refs/heads/master
Commit: 17c5d5aae9669049164f59fccf3820b1c822c83f
Parents: 98ee9b7
Author: Matthias Boehm <mb...@gmail.com>
Authored: Sat Oct 14 23:38:51 2017 -0700
Committer: Matthias Boehm <mb...@gmail.com>
Committed: Sat Oct 14 23:38:51 2017 -0700
----------------------------------------------------------------------
.../cp/ConvolutionCPInstruction.java | 3 -
.../matrix/data/ConvolutionParameters.java | 113 +++++++++++--------
.../sysml/runtime/matrix/data/LibMatrixDNN.java | 33 +++---
.../matrix/data/LibMatrixDNNPoolingHelper.java | 88 ++++++++++-----
.../integration/functions/tensor/PoolTest.java | 110 ++++++++----------
src/test/scripts/functions/tensor/PoolTest.R | 7 +-
src/test/scripts/functions/tensor/PoolTest.dml | 3 +
7 files changed, 193 insertions(+), 164 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/systemml/blob/17c5d5aa/src/main/java/org/apache/sysml/runtime/instructions/cp/ConvolutionCPInstruction.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/instructions/cp/ConvolutionCPInstruction.java b/src/main/java/org/apache/sysml/runtime/instructions/cp/ConvolutionCPInstruction.java
index f6ff998..2c7b972 100644
--- a/src/main/java/org/apache/sysml/runtime/instructions/cp/ConvolutionCPInstruction.java
+++ b/src/main/java/org/apache/sysml/runtime/instructions/cp/ConvolutionCPInstruction.java
@@ -20,7 +20,6 @@
package org.apache.sysml.runtime.instructions.cp;
import java.util.ArrayList;
-import java.util.Arrays;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
@@ -354,8 +353,6 @@ public class ConvolutionCPInstruction extends UnaryCPInstruction {
}
else {
outputBlock = new MatrixBlock(N, C*P*Q, false).allocateBlock();
- if(instOpcode.equalsIgnoreCase("maxpooling"))
- Arrays.fill(outputBlock.getDenseBlock(), -Double.MAX_VALUE);
LibMatrixDNN.maxpooling(matBlock, outputBlock, params);
}
}
http://git-wip-us.apache.org/repos/asf/systemml/blob/17c5d5aa/src/main/java/org/apache/sysml/runtime/matrix/data/ConvolutionParameters.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/matrix/data/ConvolutionParameters.java b/src/main/java/org/apache/sysml/runtime/matrix/data/ConvolutionParameters.java
index c66b5c6..d64a261 100644
--- a/src/main/java/org/apache/sysml/runtime/matrix/data/ConvolutionParameters.java
+++ b/src/main/java/org/apache/sysml/runtime/matrix/data/ConvolutionParameters.java
@@ -29,11 +29,13 @@ import org.apache.sysml.runtime.util.ConvolutionUtils;
* This class is container that stores parameters required for executing following operations:
* conv2d, conv2d_backward_data, conv2d_backward_filter, maxpooling, maxpooling_backward
*/
-public class ConvolutionParameters implements Serializable {
+public class ConvolutionParameters implements Serializable
+{
private static final long serialVersionUID = -212362627205772829L;
- public int N; public int C; public int H; public int W;
- public int K; public int R; public int S; public int stride_h; public int stride_w; public int pad_h; public int pad_w;
- public int P; public int Q; public int numThreads;
+
+ public int N, C, H, W, K, R, S, P, Q;
+ public int stride_h, stride_w, pad_h, pad_w;
+ public int numThreads;
// Optional variables used by ConvolutionCPInstruction
public boolean enableNative = false;
@@ -43,52 +45,9 @@ public class ConvolutionParameters implements Serializable {
public MatrixBlock bias;
public int [] start_indexes_h, end_indexes_h, start_indexes_w, end_indexes_w;
- private static int convertToInt(long val) throws DMLRuntimeException {
- if( val > Integer.MAX_VALUE )
- throw new DMLRuntimeException("The value for ConvolutionParameters is too large:" + val);
- return (int) val;
- }
-
- public boolean compare(ConvolutionParameters that) {
- if(this.N == that.N && this.C == that.C && this.H == that.H && this.W == that.W
- && this.K == that.K && this.R == that.R && this.S == that.S && this.stride_h == that.stride_h
- && this.stride_w == that.stride_w && this.pad_h == that.pad_h
- && this.pad_w == that.pad_w && this.numThreads == that.numThreads) {
- return true;
- }
- return false;
- }
-
- @Override
- public String toString() {
- return "(NCHW=[" + N + " " + C + " " + H + " " + W + "], KCRS=[" + K + " " + R + " " + S + "], stride=[" + stride_h + "," + stride_w +
- "], pad=[" + pad_h + "," + pad_w + "])";
- }
-
- public void setIfUnknown(Hop N, Hop C, Hop H, Hop W,
- Hop K, Hop R, Hop S, Hop stride_h, Hop stride_w, Hop pad_h, Hop pad_w, int numThreads) throws DMLRuntimeException {
- if(this.N < 0) this.N = convertToInt(Hop.computeSizeInformation(N));
- if(this.C < 0) this.C = convertToInt(Hop.computeSizeInformation(C));
- if(this.H < 0) this.H = convertToInt(Hop.computeSizeInformation(H));
- if(this.W < 0) this.W = convertToInt(Hop.computeSizeInformation(W));
- if(this.K < 0) this.K = convertToInt(Hop.computeSizeInformation(K));
- if(this.R < 0) this.R = convertToInt(Hop.computeSizeInformation(R));
- if(this.S < 0) this.S = convertToInt(Hop.computeSizeInformation(S));
- if(this.stride_h < 0) this.stride_h = convertToInt(Hop.computeSizeInformation(stride_h));
- if(this.stride_w < 0) this.stride_w = convertToInt(Hop.computeSizeInformation(stride_w));
- if(this.pad_h < 0) this.pad_h = convertToInt(Hop.computeSizeInformation(pad_h));
- if(this.pad_w < 0) this.pad_w = convertToInt(Hop.computeSizeInformation(pad_w));
- if(this.P < 0 && this.H >= 0 && this.R >= 0 && this.stride_h >= 0 && this.pad_h >= 0) {
- this.P = (int) ConvolutionUtils.getP(this.H, this.R, this.stride_h, this.pad_h);
- }
- if(this.Q < 0 && this.W >= 0 && this.S >= 0 && this.stride_w >= 0 && this.pad_w >= 0) {
- this.Q = (int) ConvolutionUtils.getQ(this.W, this.S, this.stride_w, this.pad_w);
- }
- this.numThreads = numThreads;
- }
-
public ConvolutionParameters(long N, long C, long H, long W,
- long K, long R, long S, long stride_h, long stride_w, long pad_h, long pad_w, int numThreads) throws DMLRuntimeException {
+ long K, long R, long S, long stride_h, long stride_w,
+ long pad_h, long pad_w, int numThreads) throws DMLRuntimeException {
this.N = convertToInt(N);
this.C = convertToInt(C);
this.H = convertToInt(H);
@@ -139,7 +98,63 @@ public class ConvolutionParameters implements Serializable {
this.numThreads = numThreads;
}
+ private static int convertToInt(long val) throws DMLRuntimeException {
+ if( val > Integer.MAX_VALUE )
+ throw new DMLRuntimeException("The value for ConvolutionParameters is too large:" + val);
+ return (int) val;
+ }
+
+ public boolean compare(ConvolutionParameters that) {
+ if(this.N == that.N && this.C == that.C && this.H == that.H && this.W == that.W
+ && this.K == that.K && this.R == that.R && this.S == that.S && this.stride_h == that.stride_h
+ && this.stride_w == that.stride_w && this.pad_h == that.pad_h
+ && this.pad_w == that.pad_w && this.numThreads == that.numThreads) {
+ return true;
+ }
+ return false;
+ }
+
+ @Override
+ public String toString() {
+ return "(NCHW=[" + N + " " + C + " " + H + " " + W + "], KCRS=[" + K + " " + R + " " + S + "], stride=[" + stride_h + "," + stride_w +
+ "], pad=[" + pad_h + "," + pad_w + "])";
+ }
+
+ public void setIfUnknown(Hop N, Hop C, Hop H, Hop W,
+ Hop K, Hop R, Hop S, Hop stride_h, Hop stride_w, Hop pad_h, Hop pad_w, int numThreads) throws DMLRuntimeException {
+ if(this.N < 0) this.N = convertToInt(Hop.computeSizeInformation(N));
+ if(this.C < 0) this.C = convertToInt(Hop.computeSizeInformation(C));
+ if(this.H < 0) this.H = convertToInt(Hop.computeSizeInformation(H));
+ if(this.W < 0) this.W = convertToInt(Hop.computeSizeInformation(W));
+ if(this.K < 0) this.K = convertToInt(Hop.computeSizeInformation(K));
+ if(this.R < 0) this.R = convertToInt(Hop.computeSizeInformation(R));
+ if(this.S < 0) this.S = convertToInt(Hop.computeSizeInformation(S));
+ if(this.stride_h < 0) this.stride_h = convertToInt(Hop.computeSizeInformation(stride_h));
+ if(this.stride_w < 0) this.stride_w = convertToInt(Hop.computeSizeInformation(stride_w));
+ if(this.pad_h < 0) this.pad_h = convertToInt(Hop.computeSizeInformation(pad_h));
+ if(this.pad_w < 0) this.pad_w = convertToInt(Hop.computeSizeInformation(pad_w));
+ if(this.P < 0 && this.H >= 0 && this.R >= 0 && this.stride_h >= 0 && this.pad_h >= 0) {
+ this.P = (int) ConvolutionUtils.getP(this.H, this.R, this.stride_h, this.pad_h);
+ }
+ if(this.Q < 0 && this.W >= 0 && this.S >= 0 && this.stride_w >= 0 && this.pad_w >= 0) {
+ this.Q = (int) ConvolutionUtils.getQ(this.W, this.S, this.stride_w, this.pad_w);
+ }
+ this.numThreads = numThreads;
+ }
+
public boolean isOutputThreadSafe() {
return output.isThreadSafe();
}
+
+ public boolean isStride1Pad0() {
+ return (stride_h==1 && stride_w==1
+ && pad_h==0 && pad_w==0);
+ }
+
+ public boolean isAllOnes(Integer...params) {
+ boolean ret = true;
+ for(int param : params)
+ ret &= (param == 1);
+ return ret;
+ }
}
http://git-wip-us.apache.org/repos/asf/systemml/blob/17c5d5aa/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNN.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNN.java b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNN.java
index 3f67b1a..b967780 100644
--- a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNN.java
+++ b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNN.java
@@ -356,19 +356,15 @@ public class LibMatrixDNN {
params.end_indexes_h = new int[params.P];
params.start_indexes_w = new int[params.Q];
params.end_indexes_w = new int[params.Q];
- for (int p = 0; p < params.P; p++) {
- int start_index_h = p * params.stride_h - params.pad_h;
- int end_index_h = start_index_h + params.R;
+ for( int p=0, ix=-params.pad_h; p < params.P; p++, ix+=params.stride_h ) {
// Note: We do not treat pad as zero
- params.start_indexes_h[p] = Math.max(start_index_h, 0);
- params.end_indexes_h[p] = Math.min(end_index_h, params.H);
+ params.start_indexes_h[p] = Math.max(ix, 0);
+ params.end_indexes_h[p] = Math.min(ix+params.R, params.H);
}
- for (int q = 0; q < params.Q; q++) {
- int start_index_w = q * params.stride_w - params.pad_w;
- int end_index_w = start_index_w + params.S;
+ for( int q=0, ix=-params.pad_w; q < params.Q; q++, ix+=params.stride_w) {
// Note: We do not treat pad as zero
- params.start_indexes_w[q] = Math.max(start_index_w, 0);
- params.end_indexes_w[q] = Math.min(end_index_w, params.W);
+ params.start_indexes_w[q] = Math.max(ix, 0);
+ params.end_indexes_w[q] = Math.min(ix+params.S, params.W);
}
}
@@ -528,21 +524,24 @@ public class LibMatrixDNN {
}
}
- public static void maxpooling(MatrixBlock input, MatrixBlock outputBlock, ConvolutionParameters params) throws DMLRuntimeException {
+ public static void maxpooling(MatrixBlock input, MatrixBlock output, ConvolutionParameters params) throws DMLRuntimeException {
params.input1 = input;
- params.output = outputBlock;
+ params.output = output;
if(input.getNumColumns() != params.C*params.H*params.W || input.getNumRows() != params.N) {
- throw new DMLRuntimeException("Incorrect input dimensions in maxpooling:" + input.getNumRows() + " " + input.getNumColumns() + " " + params.N + " " + params.C*params.H*params.W);
+ throw new DMLRuntimeException("Incorrect input dimensions in maxpooling:" + input.getNumRows() + " "
+ + input.getNumColumns() + " " + params.N + " " + params.C*params.H*params.W);
}
- fillIndexesArray(params);
+ //materialize indexes unless basic case with stride=1 and pad=0
+ if( !params.isStride1Pad0() || input.sparse )
+ fillIndexesArray(params);
- execute(LibMatrixDNNHelper.getMaxPoolingWorkers(params), params);
+ long nnz = execute(LibMatrixDNNHelper.getMaxPoolingWorkers(params), params);
// post-processing: maintain nnz
- outputBlock.recomputeNonZeros();
- outputBlock.examSparsity();
+ output.setNonZeros(nnz);
+ output.examSparsity();
}
/**
http://git-wip-us.apache.org/repos/asf/systemml/blob/17c5d5aa/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNNPoolingHelper.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNNPoolingHelper.java b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNNPoolingHelper.java
index 19c3f71..52cdbcd 100644
--- a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNNPoolingHelper.java
+++ b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNNPoolingHelper.java
@@ -31,43 +31,58 @@ public class LibMatrixDNNPoolingHelper {
*/
public static class DenseMaxPooling implements Callable<Long>
{
- public int _rl; public int _ru;
+ private final int _rl, _ru;
private final ConvolutionParameters _params;
- double [] inputArray; double [] outputArray;
- int C; int P; int Q; int W;
+
public DenseMaxPooling(int rl, int ru, ConvolutionParameters params) {
_rl = rl; _ru = ru;
_params = params;
- inputArray = params.input1.getDenseBlock();
- outputArray = params.output.getDenseBlock();
- C = params.C; P = params.P; Q = params.Q; W = params.W;
}
@Override
public Long call() throws Exception {
+ final int C = _params.C, P = _params.P, Q = _params.Q;
+ final int R = _params.R, S = _params.S, H = _params.H, W = _params.W;
final int HW = _params.H*_params.W;
final int CHW = _params.C*_params.H*_params.W;
final int CPQ = C*P*Q;
- for(int n = _rl; n < _ru; n++) {
- final int inOffset = n*CHW;
- int out_index = n*CPQ;
- for (int c = 0; c < C; c++) {
- final int inOffset1 = inOffset + c*HW;
- for (int p = 0; p < P; p++) {
- for (int q = 0; q < Q; q++, out_index++) {
- double tmp = outputArray[out_index];
- for (int h = _params.start_indexes_h[p]; h < _params.end_indexes_h[p]; h++) {
- int inputIndex = inOffset1 + h*W + _params.start_indexes_w[q];
- for (int w = _params.start_indexes_w[q]; w < _params.end_indexes_w[q]; w++, inputIndex++) {
- tmp = Math.max(tmp, inputArray[inputIndex]);
- }
- }
- outputArray[out_index] = tmp;
- }
- }
- }
+ double[] in = _params.input1.getDenseBlock();
+ double[] out = _params.output.getDenseBlock();
+
+ //thread-local initialization of output block
+ if( !(_params.isStride1Pad0() && _params.isAllOnes(P, Q, W)) )
+ Arrays.fill(out, _rl*CPQ, _ru*CPQ, -Double.MAX_VALUE);
+
+ if( _params.isStride1Pad0() && _params.isAllOnes(P, Q, W) ) {
+ //quick-path w/o materialized index arrays and
+ //simplified inner loops for P = 1, Q = 1, W = 1
+ int lenh = Math.min(R,H);
+ for(int i = _rl, oix=_rl*C; i < _ru; i++, oix+=C)
+ for (int c = 0, off=i*CHW; c < C; c++, off+=H)
+ out[oix+c] = max(-Double.MAX_VALUE, in, off, lenh);
+ }
+ else if( _params.isStride1Pad0() ) {
+ //quick-path w/o materialized index arrays
+ for(int i = _rl; i < _ru; i++)
+ for (int c = 0, off=i*CHW, oix=i*CPQ; c < C; c++, off+=HW)
+ for (int p = 0; p < P; p++, oix+=Q)
+ for (int h = p; h < Math.min(p+R,H); h++)
+ for (int q = 0, off2=off+h*W; q < Q; q++)
+ out[oix+q] = max(out[oix+q], in, off2+q, Math.min(S,W-q));
+ }
+ else { //general case
+ int[] hl = _params.start_indexes_h, hu = _params.end_indexes_h;
+ int[] wl = _params.start_indexes_w, wu = _params.end_indexes_w;
+ for(int i = _rl; i < _ru; i++)
+ for (int c = 0, off=i*CHW, oix=i*CPQ; c < C; c++, off+=HW)
+ for (int p = 0; p < P; p++, oix+=Q)
+ for (int h = hl[p]; h < hu[p]; h++)
+ for (int q = 0, off2=off+h*W; q < Q; q++)
+ out[oix+q] = max(out[oix+q], in, off2+wl[q], wu[q]-wl[q]);
}
- return 0L;
+
+ //thread-local recomputation of non-zeros
+ return _params.output.recomputeNonZeros(_rl, _ru-1);
}
}
@@ -76,24 +91,26 @@ public class LibMatrixDNNPoolingHelper {
*/
public static class SparseMaxPooling implements Callable<Long>
{
- public int _rl; public int _ru;
+ private final int _rl, _ru;
private final ConvolutionParameters _params;
- final int HW;
- double [] outputArray;
- final int C; final int P; final int Q; final int W; final int H; final int CPQ; final int PQ;
+ private double [] outputArray;
+ private final int C, P, Q, W, H, CPQ, PQ;
+
public SparseMaxPooling(int rl, int ru, ConvolutionParameters params) {
_rl = rl; _ru = ru;
_params = params;
outputArray = params.output.getDenseBlock();
C = params.C; P = params.P; Q = params.Q; H = params.H;
W = params.W;
- HW = _params.H*_params.W;
CPQ = C*P*Q;
PQ = P*Q;
}
@Override
public Long call() throws Exception {
+ //thread-local initialization of output block
+ Arrays.fill(outputArray, _rl *CPQ, _ru*CPQ, -Double.MAX_VALUE);
+
for(int n = _rl; n < _ru; n++) {
if( !_params.input1.sparseBlock.isEmpty(n) ) {
final int apos = _params.input1.sparseBlock.pos(n);
@@ -136,7 +153,16 @@ public class LibMatrixDNNPoolingHelper {
Arrays.fill(outputArray, n*CPQ, (n+1)*CPQ, 0);
}
}
- return 0L;
+
+ //thread-local recomputation of non-zeros
+ return _params.output.recomputeNonZeros(_rl, _ru-1);
}
}
+
+ private static double max(final double aval, double[] b, final int bi, final int len) {
+ double ret = aval;
+ for( int i = bi; i < bi+len; i++ )
+ ret = Math.max(ret, b[i]);
+ return ret;
+ }
}
http://git-wip-us.apache.org/repos/asf/systemml/blob/17c5d5aa/src/test/java/org/apache/sysml/test/integration/functions/tensor/PoolTest.java
----------------------------------------------------------------------
diff --git a/src/test/java/org/apache/sysml/test/integration/functions/tensor/PoolTest.java b/src/test/java/org/apache/sysml/test/integration/functions/tensor/PoolTest.java
index e1c84c5..1acb14c 100644
--- a/src/test/java/org/apache/sysml/test/integration/functions/tensor/PoolTest.java
+++ b/src/test/java/org/apache/sysml/test/integration/functions/tensor/PoolTest.java
@@ -51,146 +51,130 @@ public class PoolTest extends AutomatedTestBase
}
@Test
- public void testMaxPool2DDense2()
- {
+ public void testMaxPool2DDense2() {
int numImg = 2; int imgSize = 6; int numChannels = 1; int stride = 1; int pad = 0; int poolSize1 = 2; int poolSize2 = 2;
runPoolTest(ExecType.CP, imgSize, numImg, numChannels, stride, pad, poolSize1, poolSize2, "max", false);
}
-
@Test
- public void testMaxPool2DDense3()
- {
+ public void testMaxPool2DDense3() {
int numImg = 3; int imgSize = 7; int numChannels = 2; int stride = 2; int pad = 0; int poolSize1 = 3; int poolSize2 = 3;
runPoolTest(ExecType.CP, imgSize, numImg, numChannels, stride, pad, poolSize1, poolSize2, "max", false);
}
@Test
- public void testMaxPool2DDense4()
- {
+ public void testMaxPool2DDense4() {
int numImg = 2; int imgSize = 4; int numChannels = 2; int stride = 1; int pad = 0; int poolSize1 = 3; int poolSize2 = 3;
runPoolTest(ExecType.CP, imgSize, numImg, numChannels, stride, pad, poolSize1, poolSize2, "max", false);
}
@Test
- public void testMaxPool2DSparse1()
- {
+ public void testMaxPool2DDense5() {
+ int numImg = 2; int imgSize = 8; int numChannels = 4; int stride = 1; int pad = 0; int poolSize1 = imgSize*imgSize; int poolSize2 = 1;
+ runPoolTest(ExecType.CP, imgSize, numImg, numChannels, stride, pad, poolSize1, poolSize2, "max2", false);
+ }
+
+ @Test
+ public void testMaxPool2DSparse1() {
int numImg = 1; int imgSize = 6; int numChannels = 1; int stride = 2; int pad = 0; int poolSize1 = 2; int poolSize2 = 2;
runPoolTest(ExecType.CP, imgSize, numImg, numChannels, stride, pad, poolSize1, poolSize2, "max", true);
}
@Test
- public void testMaxPool2DSparse2()
- {
+ public void testMaxPool2DSparse2() {
int numImg = 2; int imgSize = 6; int numChannels = 1; int stride = 1; int pad = 0; int poolSize1 = 2; int poolSize2 = 2;
runPoolTest(ExecType.CP, imgSize, numImg, numChannels, stride, pad, poolSize1, poolSize2, "max", true);
}
-
@Test
- public void testMaxPool2DSparse3()
- {
+ public void testMaxPool2DSparse3() {
int numImg = 3; int imgSize = 7; int numChannels = 2; int stride = 2; int pad = 0; int poolSize1 = 3; int poolSize2 = 3;
runPoolTest(ExecType.CP, imgSize, numImg, numChannels, stride, pad, poolSize1, poolSize2, "max", true);
}
@Test
- public void testMaxPool2DSparse4()
- {
+ public void testMaxPool2DSparse4() {
int numImg = 2; int imgSize = 4; int numChannels = 2; int stride = 1; int pad = 0; int poolSize1 = 3; int poolSize2 = 3;
runPoolTest(ExecType.CP, imgSize, numImg, numChannels, stride, pad, poolSize1, poolSize2, "max", true);
}
+ @Test
+ public void testMaxPool2DSparse5() {
+ int numImg = 2; int imgSize = 32; int numChannels = 4; int stride = 1; int pad = 0; int poolSize1 = imgSize*imgSize; int poolSize2 = 1;
+ runPoolTest(ExecType.CP, imgSize, numImg, numChannels, stride, pad, poolSize1, poolSize2, "max2", true);
+ }
+
// ----------------------------------------
@Test
- public void testMaxPool2DDense1SP()
- {
+ public void testMaxPool2DDense1SP() {
int numImg = 1; int imgSize = 50; int numChannels = 1; int stride = 2; int pad = 0; int poolSize1 = 2; int poolSize2 = 2;
runPoolTest(ExecType.SPARK, imgSize, numImg, numChannels, stride, pad, poolSize1, poolSize2, "max", false);
}
@Test
- public void testMaxPool2DDense2SP()
- {
+ public void testMaxPool2DDense2SP() {
int numImg = 2; int imgSize = 6; int numChannels = 1; int stride = 1; int pad = 0; int poolSize1 = 2; int poolSize2 = 2;
runPoolTest(ExecType.SPARK, imgSize, numImg, numChannels, stride, pad, poolSize1, poolSize2, "max", false);
}
-
@Test
- public void testMaxPool2DDense3SP()
- {
+ public void testMaxPool2DDense3SP() {
int numImg = 3; int imgSize = 7; int numChannels = 2; int stride = 2; int pad = 0; int poolSize1 = 3; int poolSize2 = 3;
runPoolTest(ExecType.SPARK, imgSize, numImg, numChannels, stride, pad, poolSize1, poolSize2, "max", false);
}
@Test
- public void testMaxPool2DDense4SP()
- {
+ public void testMaxPool2DDense4SP() {
int numImg = 2; int imgSize = 4; int numChannels = 2; int stride = 1; int pad = 0; int poolSize1 = 3; int poolSize2 = 3;
runPoolTest(ExecType.SPARK, imgSize, numImg, numChannels, stride, pad, poolSize1, poolSize2, "max", false);
}
- /**
- *
- * @param et
- * @param sparse
- */
public void runPoolTest( ExecType et, int imgSize, int numImg, int numChannels, int stride,
int pad, int poolSize1, int poolSize2, String poolMode, boolean sparse)
{
- RUNTIME_PLATFORM oldRTP = rtplatform;
-
+ RUNTIME_PLATFORM platformOld = rtplatform;
+ switch( et ){
+ case MR: rtplatform = RUNTIME_PLATFORM.HADOOP; break;
+ case SPARK: rtplatform = RUNTIME_PLATFORM.SPARK; break;
+ default: rtplatform = RUNTIME_PLATFORM.HYBRID; break;
+ }
boolean sparkConfigOld = DMLScript.USE_LOCAL_SPARK_CONFIG;
+ if( rtplatform == RUNTIME_PLATFORM.SPARK )
+ DMLScript.USE_LOCAL_SPARK_CONFIG = true;
try
{
- String sparseVal = (""+sparse).toUpperCase();
- TestConfiguration config = getTestConfiguration(TEST_NAME);
- if(et == ExecType.SPARK) {
- rtplatform = RUNTIME_PLATFORM.SPARK;
- }
- else {
- rtplatform = (et==ExecType.MR)? RUNTIME_PLATFORM.HADOOP : RUNTIME_PLATFORM.SINGLE_NODE;
- }
- if( rtplatform == RUNTIME_PLATFORM.SPARK )
- DMLScript.USE_LOCAL_SPARK_CONFIG = true;
+ String sparseVal = String.valueOf(sparse).toUpperCase();
+ TestConfiguration config = getTestConfiguration(TEST_NAME);
loadTestConfiguration(config);
-
- /* This is for running the junit test the new way, i.e., construct the arguments directly */
+
String RI_HOME = SCRIPT_DIR + TEST_DIR;
fullDMLScriptName = RI_HOME + TEST_NAME + ".dml";
-
- programArgs = new String[]{"-explain", "-args", "" + imgSize, "" + numImg,
- "" + numChannels, "" + poolSize1, "" + poolSize2,
- "" + stride, "" + pad, poolMode,
- output("B"), sparseVal};
-
- boolean exceptionExpected = false;
- int expectedNumberOfJobs = -1;
- runTest(true, exceptionExpected, null, expectedNumberOfJobs);
+ programArgs = new String[]{"-explain", "-args", String.valueOf(imgSize),
+ String.valueOf(numImg), String.valueOf(numChannels),
+ String.valueOf(poolSize1), String.valueOf(poolSize2),
+ String.valueOf(stride), String.valueOf(pad), poolMode,
+ output("B"), sparseVal};
fullRScriptName = RI_HOME + TEST_NAME + ".R";
rCmd = "Rscript" + " " + fullRScriptName + " " + imgSize + " " + numImg +
- " " + numChannels + " " + poolSize1 +
- " " + poolSize2 + " " + stride + " " + pad + " " + expectedDir() + " " + sparseVal;
+ " " + numChannels + " " + poolSize1 + " " + poolSize2 + " " + stride +
+ " " + pad + " " + expectedDir() + " " + sparseVal + " " + poolMode;
- // Run comparison R script
+ // run scripts
+ runTest(true, false, null, -1);
runRScript(true);
- HashMap<CellIndex, Double> bHM = readRMatrixFromFS("B");
+ //compare results
+ HashMap<CellIndex, Double> bHM = readRMatrixFromFS("B");
HashMap<CellIndex, Double> dmlfile = readDMLMatrixFromHDFS("B");
TestUtils.compareMatrices(dmlfile, bHM, epsilon, "B-DML", "NumPy");
-
}
- finally
- {
- rtplatform = oldRTP;
+ finally {
+ rtplatform = platformOld;
DMLScript.USE_LOCAL_SPARK_CONFIG = sparkConfigOld;
}
}
-
-
}
http://git-wip-us.apache.org/repos/asf/systemml/blob/17c5d5aa/src/test/scripts/functions/tensor/PoolTest.R
----------------------------------------------------------------------
diff --git a/src/test/scripts/functions/tensor/PoolTest.R b/src/test/scripts/functions/tensor/PoolTest.R
index aef0384..fdcba56 100644
--- a/src/test/scripts/functions/tensor/PoolTest.R
+++ b/src/test/scripts/functions/tensor/PoolTest.R
@@ -28,6 +28,7 @@ poolSize1=as.integer(args[4])
poolSize2=as.integer(args[5])
stride=as.integer(args[6])
pad=as.integer(args[7])
+mode=args[10]
# Assumption: NCHW image format
x=matrix(seq(1, numImg*numChannels*imgSize*imgSize), numImg, numChannels*imgSize*imgSize, byrow=TRUE)
@@ -98,6 +99,10 @@ max_pool <- function(X, N, C, Hin, Win, Hf, Wf,
out
}
-output = max_pool(x, numImg, numChannels, imgSize, imgSize, poolSize1, poolSize2, stride, stride)
+if( mode=="max" ) {
+ output = max_pool(x, numImg, numChannels, imgSize, imgSize, poolSize1, poolSize2, stride, stride)
+} else {
+ output = max_pool(x, numImg, numChannels, imgSize*imgSize, 1, poolSize1, poolSize2, stride, stride)
+}
writeMM(as(output,"CsparseMatrix"), paste(args[8], "B", sep=""))
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/systemml/blob/17c5d5aa/src/test/scripts/functions/tensor/PoolTest.dml
----------------------------------------------------------------------
diff --git a/src/test/scripts/functions/tensor/PoolTest.dml b/src/test/scripts/functions/tensor/PoolTest.dml
index 5246a2d..5f20fae 100644
--- a/src/test/scripts/functions/tensor/PoolTest.dml
+++ b/src/test/scripts/functions/tensor/PoolTest.dml
@@ -39,6 +39,9 @@ else {
if(poolMode == "max") {
output = max_pool(x, stride=[stride, stride], padding=[pad, pad], input_shape=[numImg, numChannels, imgSize, imgSize], pool_size=[poolSize1, poolSize2])
}
+else if(poolMode == "max2") {
+ output = max_pool(x, stride=[stride, stride], padding=[pad, pad], input_shape=[numImg, numChannels, imgSize*imgSize, 1], pool_size=[poolSize1, poolSize2])
+}
#else {
#output = avg_pool(x, stride=[stride, stride], padding=[pad, pad], input_shape=[numImg, numChannels, imgSize, imgSize], pool_size=[poolSize1, poolSize2])
#}
[3/3] systemml git commit: [HOTFIX][SYSTEMML-1959] Fix sparse-sparse
transpose w/ CSR input
Posted by mb...@apache.org.
[HOTFIX][SYSTEMML-1959] Fix sparse-sparse transpose w/ CSR input
This patch fixes a remaining issue of sparse-sparse transpose related to
the correct handling of sparse blocks in CSR or COO format.
Project: http://git-wip-us.apache.org/repos/asf/systemml/repo
Commit: http://git-wip-us.apache.org/repos/asf/systemml/commit/33559144
Tree: http://git-wip-us.apache.org/repos/asf/systemml/tree/33559144
Diff: http://git-wip-us.apache.org/repos/asf/systemml/diff/33559144
Branch: refs/heads/master
Commit: 33559144cd707e324b59ed5ca417e3d5461c2f0a
Parents: a347af3
Author: Matthias Boehm <mb...@gmail.com>
Authored: Sun Oct 15 02:42:20 2017 -0700
Committer: Matthias Boehm <mb...@gmail.com>
Committed: Sun Oct 15 02:42:20 2017 -0700
----------------------------------------------------------------------
.../sysml/runtime/matrix/data/LibMatrixReorg.java | 16 ++++++++--------
1 file changed, 8 insertions(+), 8 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/systemml/blob/33559144/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixReorg.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixReorg.java b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixReorg.java
index 3ae07c5..dd86c27 100644
--- a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixReorg.java
+++ b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixReorg.java
@@ -859,8 +859,8 @@ public class LibMatrixReorg
if( cl > 0 ) {
for( int i=bi; i<bimin; i++ )
if( !a.isEmpty(i) ) {
- int pos = a.posFIndexGTE(i, cl);
- ix[i-bi] = (pos>=0) ? pos : a.size(i);
+ int j = a.posFIndexGTE(i, cl);
+ ix[i-bi] = (j>=0) ? j : a.size(i);
}
}
@@ -868,19 +868,19 @@ public class LibMatrixReorg
int bjmin = Math.min(bj+blocksizeJ, cu);
//core block transpose operation
- for( int i=bi, iix=0; i<bimin; i++, iix++ ) {
+ for( int i=bi; i<bimin; i++ ) {
if( a.isEmpty(i) ) continue;
int apos = a.pos(i);
int alen = a.size(i);
int[] aix = a.indexes(i);
double[] avals = a.values(i);
- int j = ix[iix]; //last block boundary
- for( ; j<alen && aix[j]<bjmin; j++ ) {
- c.allocate(aix[apos+j], ennz2,n2);
- c.append(aix[apos+j], i, avals[apos+j]);
+ int j = ix[i-bi] + apos; //last block boundary
+ for( ; j<apos+alen && aix[j]<bjmin; j++ ) {
+ c.allocate(aix[j], ennz2,n2);
+ c.append(aix[j], i, avals[j]);
}
- ix[iix] = j; //keep block boundary
+ ix[i-bi] = j - apos; //keep block boundary
}
}
}
[2/3] systemml git commit: [MINOR] Refactoring lib matrixmult/bincell
(instruction footprint)
Posted by mb...@apache.org.
[MINOR] Refactoring lib matrixmult/bincell (instruction footprint)
This patch makes a minor refactoring of the libraries for matrix
multiplications and binary cell-wise operations in order to reduce the
instruction footprint and simplify JIT compilation. Specifically, the
methods for dense-dense mm, sparse-dense mm, and safe binary operations
have been split into methods for the major individual cases.
On an end-to-end cnn application, this patch reduced the number of
L1-icache misses from 6,055,257,066 to 5,663,272,573 and the number of
iTLB misses from 289,601,812 to 161,268,707.
Project: http://git-wip-us.apache.org/repos/asf/systemml/repo
Commit: http://git-wip-us.apache.org/repos/asf/systemml/commit/a347af3b
Tree: http://git-wip-us.apache.org/repos/asf/systemml/tree/a347af3b
Diff: http://git-wip-us.apache.org/repos/asf/systemml/diff/a347af3b
Branch: refs/heads/master
Commit: a347af3b7b488ac3b296b9b9692f7172d60ac6f5
Parents: 17c5d5a
Author: Matthias Boehm <mb...@gmail.com>
Authored: Sun Oct 15 02:22:55 2017 -0700
Committer: Matthias Boehm <mb...@gmail.com>
Committed: Sun Oct 15 02:22:55 2017 -0700
----------------------------------------------------------------------
.../runtime/matrix/data/LibMatrixBincell.java | 446 +++++++------
.../runtime/matrix/data/LibMatrixMult.java | 665 ++++++++++---------
2 files changed, 580 insertions(+), 531 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/systemml/blob/a347af3b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixBincell.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixBincell.java b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixBincell.java
index 7622137..2878b3b 100644
--- a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixBincell.java
+++ b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixBincell.java
@@ -210,12 +210,9 @@ public class LibMatrixBincell
{
return;
}
-
- int rlen = m1.rlen;
- int clen = m1.clen;
- BinaryAccessType atype = getBinaryAccessType(m1, m2);
- if( atype == BinaryAccessType.MATRIX_COL_VECTOR //MATRIX - VECTOR
+ BinaryAccessType atype = getBinaryAccessType(m1, m2);
+ if( atype == BinaryAccessType.MATRIX_COL_VECTOR //MATRIX - VECTOR
|| atype == BinaryAccessType.MATRIX_ROW_VECTOR)
{
//note: m2 vector and hence always dense
@@ -232,213 +229,24 @@ public class LibMatrixBincell
}
else //MATRIX - MATRIX
{
- if(m1.sparse && m2.sparse)
- {
- if(ret.sparse)
- ret.allocateSparseRowsBlock();
-
- //both sparse blocks existing
- if(m1.sparseBlock!=null && m2.sparseBlock!=null)
- {
- SparseBlock lsblock = m1.sparseBlock;
- SparseBlock rsblock = m2.sparseBlock;
-
- if( ret.sparse && lsblock.isAligned(rsblock) )
- {
- SparseBlock c = ret.sparseBlock;
- for(int r=0; r<rlen; r++)
- if( !lsblock.isEmpty(r) ) {
- int alen = lsblock.size(r);
- int apos = lsblock.pos(r);
- int[] aix = lsblock.indexes(r);
- double[] avals = lsblock.values(r);
- double[] bvals = rsblock.values(r);
- c.allocate(r, alen);
- for( int j=apos; j<apos+alen; j++ ) {
- double tmp = op.fn.execute(avals[j], bvals[j]);
- c.append(r, aix[j], tmp);
- }
- ret.nonZeros += c.size(r);
- }
- }
- else //general case
- {
- for(int r=0; r<rlen; r++)
- {
- if( !lsblock.isEmpty(r) && !rsblock.isEmpty(r) ) {
- mergeForSparseBinary(op, lsblock.values(r), lsblock.indexes(r), lsblock.pos(r), lsblock.size(r),
- rsblock.values(r), rsblock.indexes(r), rsblock.pos(r), rsblock.size(r), r, ret);
- }
- else if( !rsblock.isEmpty(r) ) {
- appendRightForSparseBinary(op, rsblock.values(r), rsblock.indexes(r),
- rsblock.pos(r), rsblock.size(r), 0, r, ret);
- }
- else if( !lsblock.isEmpty(r) ){
- appendLeftForSparseBinary(op, lsblock.values(r), lsblock.indexes(r),
- lsblock.pos(r), lsblock.size(r), 0, r, ret);
- }
- // do nothing if both not existing
- }
- }
- }
- //right sparse block existing
- else if( m2.sparseBlock!=null )
- {
- SparseBlock rsblock = m2.sparseBlock;
-
- for(int r=0; r<Math.min(rlen, rsblock.numRows()); r++)
- if( !rsblock.isEmpty(r) )
- {
- appendRightForSparseBinary(op, rsblock.values(r), rsblock.indexes(r),
- rsblock.pos(r), rsblock.size(r), 0, r, ret);
- }
- }
- //left sparse block existing
- else
- {
- SparseBlock lsblock = m1.sparseBlock;
-
- for(int r=0; r<rlen; r++)
- if( !lsblock.isEmpty(r) )
- {
- appendLeftForSparseBinary(op, lsblock.values(r), lsblock.indexes(r),
- lsblock.pos(r), lsblock.size(r), 0, r, ret);
- }
- }
+ if(m1.sparse && m2.sparse) {
+ safeBinaryMMSparseSparse(m1, m2, ret, op);
}
else if( !ret.sparse && (m1.sparse || m2.sparse) &&
- (op.fn instanceof Plus || op.fn instanceof Minus ||
- op.fn instanceof PlusMultiply || op.fn instanceof MinusMultiply ||
- (op.fn instanceof Multiply && !m2.sparse )))
- {
- //specific case in order to prevent binary search on sparse inputs (see quickget and quickset)
- ret.allocateDenseBlock();
- final int m = ret.rlen;
- final int n = ret.clen;
- double[] c = ret.denseBlock;
-
- //1) process left input: assignment
-
- if( m1.sparse ) //SPARSE left
- {
- if( m1.sparseBlock != null )
- {
- SparseBlock a = m1.sparseBlock;
-
- for( int i=0, ix=0; i<m; i++, ix+=n ) {
- if( !a.isEmpty(i) )
- {
- int apos = a.pos(i);
- int alen = a.size(i);
- int[] aix = a.indexes(i);
- double[] avals = a.values(i);
- for(int k = apos; k < apos+alen; k++)
- c[ix+aix[k]] = avals[k];
- }
- }
- }
- }
- else //DENSE left
- {
- if( !m1.isEmptyBlock(false) )
- System.arraycopy(m1.denseBlock, 0, c, 0, m*n);
- else
- Arrays.fill(ret.denseBlock, 0, m*n, 0);
- }
-
- //2) process right input: op.fn (+,-,*), * only if dense
- long lnnz = 0;
- if( m2.sparse ) //SPARSE right
- {
- if(m2.sparseBlock!=null)
- {
- SparseBlock a = m2.sparseBlock;
-
- for( int i=0, ix=0; i<m; i++, ix+=n ) {
- if( !a.isEmpty(i) ) {
- int apos = a.pos(i);
- int alen = a.size(i);
- int[] aix = a.indexes(i);
- double[] avals = a.values(i);
- for(int k = apos; k < apos+alen; k++)
- c[ix+aix[k]] = op.fn.execute(c[ix+aix[k]], avals[k]);
- }
- //exploit temporal locality of rows
- lnnz += ret.recomputeNonZeros(i, i, 0, clen-1);
- }
- }
- }
- else //DENSE right
- {
- if( !m2.isEmptyBlock(false) ) {
- double[] a = m2.denseBlock;
- for( int i=0; i<m*n; i++ ) {
- c[i] = op.fn.execute(c[i], a[i]);
- lnnz += (c[i]!=0) ? 1 : 0;
- }
- }
- else if(op.fn instanceof Multiply)
- Arrays.fill(ret.denseBlock, 0, m*n, 0);
- }
-
- //3) recompute nnz
- ret.setNonZeros(lnnz);
+ (op.fn instanceof Plus || op.fn instanceof Minus ||
+ op.fn instanceof PlusMultiply || op.fn instanceof MinusMultiply ||
+ (op.fn instanceof Multiply && !m2.sparse ))) {
+ safeBinaryMMSparseDenseDense(m1, m2, ret, op);
}
else if( !ret.sparse && !m1.sparse && !m2.sparse
- && m1.denseBlock!=null && m2.denseBlock!=null )
- {
- ret.allocateDenseBlock();
- final int m = ret.rlen;
- final int n = ret.clen;
- double[] a = m1.denseBlock;
- double[] b = m2.denseBlock;
- double[] c = ret.denseBlock;
- ValueFunction fn = op.fn;
-
- //compute dense-dense binary, maintain nnz on-the-fly
- int lnnz = 0;
- for( int i=0; i<m*n; i++ ) {
- c[i] = fn.execute(a[i], b[i]);
- lnnz += (c[i]!=0)? 1 : 0;
- }
- ret.setNonZeros(lnnz);
+ && m1.denseBlock!=null && m2.denseBlock!=null ) {
+ safeBinaryMMDenseDenseDense(m1, m2, ret, op);
}
- else if( skipEmpty && (m1.sparse || m2.sparse) )
- {
- SparseBlock a = m1.sparse ? m1.sparseBlock : m2.sparseBlock;
- if( a == null )
- return;
-
- //prepare second input and allocate output
- MatrixBlock b = m1.sparse ? m2 : m1;
- ret.allocateBlock();
-
- for( int i=0; i<a.numRows(); i++ ) {
- if( a.isEmpty(i) ) continue;
- int apos = a.pos(i);
- int alen = a.size(i);
- int[] aix = a.indexes(i);
- double[] avals = a.values(i);
- if( ret.sparse && !b.sparse )
- ret.sparseBlock.allocate(i, alen);
- for(int k = apos; k < apos+alen; k++) {
- double in2 = b.quickGetValue(i, aix[k]);
- if( in2==0 ) continue;
- double val = op.fn.execute(avals[k], in2);
- ret.appendValue(i, aix[k], val);
- }
- }
+ else if( skipEmpty && (m1.sparse || m2.sparse) ) {
+ safeBinaryMMSparseDenseSkip(m1, m2, ret, op);
}
- else //generic case
- {
- for(int r=0; r<rlen; r++)
- for(int c=0; c<clen; c++) {
- double in1 = m1.quickGetValue(r, c);
- double in2 = m2.quickGetValue(r, c);
- if( in1==0 && in2==0) continue;
- double val = op.fn.execute(in1, in2);
- ret.appendValue(r, c, val);
- }
+ else { //generic case
+ safeBinaryMMGeneric(m1, m2, ret, op);
}
}
}
@@ -694,7 +502,7 @@ public class LibMatrixBincell
for( int j=0; j<blen; j++ ) {
double v1 = m1.quickGetValue(i, bix[j]);
double v = op.fn.execute( v1, bvals[j] );
- ret.appendValue(i, bix[j], v);
+ ret.appendValue(i, bix[j], v);
}
}
}
@@ -731,19 +539,231 @@ public class LibMatrixBincell
}
else {
for(int r=0; r<rlen; r++) {
- double v1 = m1.quickGetValue(r, 0);
+ double v1 = m1.quickGetValue(r, 0);
for(int c=0; c<clen; c++)
{
double v2 = m2.quickGetValue(0, c);
double v = op.fn.execute( v1, v2 );
- ret.appendValue(r, c, v);
+ ret.appendValue(r, c, v);
}
- }
- }
-
+ }
+ }
+
//no need to recomputeNonZeros since maintained in append value
}
+ private static void safeBinaryMMSparseSparse(MatrixBlock m1, MatrixBlock m2, MatrixBlock ret, BinaryOperator op)
+ throws DMLRuntimeException
+ {
+ final int rlen = m1.rlen;
+ if(ret.sparse)
+ ret.allocateSparseRowsBlock();
+
+ //both sparse blocks existing
+ if(m1.sparseBlock!=null && m2.sparseBlock!=null)
+ {
+ SparseBlock lsblock = m1.sparseBlock;
+ SparseBlock rsblock = m2.sparseBlock;
+
+ if( ret.sparse && lsblock.isAligned(rsblock) )
+ {
+ SparseBlock c = ret.sparseBlock;
+ for(int r=0; r<rlen; r++)
+ if( !lsblock.isEmpty(r) ) {
+ int alen = lsblock.size(r);
+ int apos = lsblock.pos(r);
+ int[] aix = lsblock.indexes(r);
+ double[] avals = lsblock.values(r);
+ double[] bvals = rsblock.values(r);
+ c.allocate(r, alen);
+ for( int j=apos; j<apos+alen; j++ ) {
+ double tmp = op.fn.execute(avals[j], bvals[j]);
+ c.append(r, aix[j], tmp);
+ }
+ ret.nonZeros += c.size(r);
+ }
+ }
+ else //general case
+ {
+ for(int r=0; r<rlen; r++) {
+ if( !lsblock.isEmpty(r) && !rsblock.isEmpty(r) ) {
+ mergeForSparseBinary(op, lsblock.values(r), lsblock.indexes(r), lsblock.pos(r), lsblock.size(r),
+ rsblock.values(r), rsblock.indexes(r), rsblock.pos(r), rsblock.size(r), r, ret);
+ }
+ else if( !rsblock.isEmpty(r) ) {
+ appendRightForSparseBinary(op, rsblock.values(r), rsblock.indexes(r),
+ rsblock.pos(r), rsblock.size(r), 0, r, ret);
+ }
+ else if( !lsblock.isEmpty(r) ){
+ appendLeftForSparseBinary(op, lsblock.values(r), lsblock.indexes(r),
+ lsblock.pos(r), lsblock.size(r), 0, r, ret);
+ }
+ // do nothing if both not existing
+ }
+ }
+ }
+ //right sparse block existing
+ else if( m2.sparseBlock!=null )
+ {
+ SparseBlock rsblock = m2.sparseBlock;
+ for(int r=0; r<Math.min(rlen, rsblock.numRows()); r++) {
+ if( rsblock.isEmpty(r) ) continue;
+ appendRightForSparseBinary(op, rsblock.values(r), rsblock.indexes(r),
+ rsblock.pos(r), rsblock.size(r), 0, r, ret);
+ }
+ }
+ //left sparse block existing
+ else
+ {
+ SparseBlock lsblock = m1.sparseBlock;
+ for(int r=0; r<rlen; r++) {
+ if( lsblock.isEmpty(r) ) continue;
+ appendLeftForSparseBinary(op, lsblock.values(r), lsblock.indexes(r),
+ lsblock.pos(r), lsblock.size(r), 0, r, ret);
+ }
+ }
+ }
+
+ private static void safeBinaryMMSparseDenseDense(MatrixBlock m1, MatrixBlock m2, MatrixBlock ret, BinaryOperator op)
+ throws DMLRuntimeException
+ {
+ //specific case in order to prevent binary search on sparse inputs (see quickget and quickset)
+ ret.allocateDenseBlock();
+ final int m = ret.rlen;
+ final int n = ret.clen;
+ double[] c = ret.denseBlock;
+
+ //1) process left input: assignment
+
+ if( m1.sparse ) //SPARSE left
+ {
+ if( m1.sparseBlock != null )
+ {
+ SparseBlock a = m1.sparseBlock;
+
+ for( int i=0, ix=0; i<m; i++, ix+=n ) {
+ if( !a.isEmpty(i) )
+ {
+ int apos = a.pos(i);
+ int alen = a.size(i);
+ int[] aix = a.indexes(i);
+ double[] avals = a.values(i);
+ for(int k = apos; k < apos+alen; k++)
+ c[ix+aix[k]] = avals[k];
+ }
+ }
+ }
+ }
+ else //DENSE left
+ {
+ if( !m1.isEmptyBlock(false) )
+ System.arraycopy(m1.denseBlock, 0, c, 0, m*n);
+ else
+ Arrays.fill(ret.denseBlock, 0, m*n, 0);
+ }
+
+ //2) process right input: op.fn (+,-,*), * only if dense
+ long lnnz = 0;
+ if( m2.sparse ) //SPARSE right
+ {
+ if(m2.sparseBlock!=null)
+ {
+ SparseBlock a = m2.sparseBlock;
+
+ for( int i=0, ix=0; i<m; i++, ix+=n ) {
+ if( !a.isEmpty(i) ) {
+ int apos = a.pos(i);
+ int alen = a.size(i);
+ int[] aix = a.indexes(i);
+ double[] avals = a.values(i);
+ for(int k = apos; k < apos+alen; k++)
+ c[ix+aix[k]] = op.fn.execute(c[ix+aix[k]], avals[k]);
+ }
+ //exploit temporal locality of rows
+ lnnz += ret.recomputeNonZeros(i, i, 0, n-1);
+ }
+ }
+ }
+ else //DENSE right
+ {
+ if( !m2.isEmptyBlock(false) ) {
+ double[] a = m2.denseBlock;
+ for( int i=0; i<m*n; i++ ) {
+ c[i] = op.fn.execute(c[i], a[i]);
+ lnnz += (c[i]!=0) ? 1 : 0;
+ }
+ }
+ else if(op.fn instanceof Multiply)
+ Arrays.fill(ret.denseBlock, 0, m*n, 0);
+ }
+
+ //3) recompute nnz
+ ret.setNonZeros(lnnz);
+ }
+
+ private static void safeBinaryMMDenseDenseDense(MatrixBlock m1, MatrixBlock m2, MatrixBlock ret, BinaryOperator op)
+ throws DMLRuntimeException
+ {
+ ret.allocateDenseBlock();
+ final int m = ret.rlen;
+ final int n = ret.clen;
+ double[] a = m1.denseBlock;
+ double[] b = m2.denseBlock;
+ double[] c = ret.denseBlock;
+ ValueFunction fn = op.fn;
+
+ //compute dense-dense binary, maintain nnz on-the-fly
+ int lnnz = 0;
+ for( int i=0; i<m*n; i++ ) {
+ c[i] = fn.execute(a[i], b[i]);
+ lnnz += (c[i]!=0)? 1 : 0;
+ }
+ ret.setNonZeros(lnnz);
+ }
+
+ private static void safeBinaryMMSparseDenseSkip(MatrixBlock m1, MatrixBlock m2, MatrixBlock ret, BinaryOperator op)
+ throws DMLRuntimeException
+ {
+ SparseBlock a = m1.sparse ? m1.sparseBlock : m2.sparseBlock;
+ if( a == null )
+ return;
+
+ //prepare second input and allocate output
+ MatrixBlock b = m1.sparse ? m2 : m1;
+ ret.allocateBlock();
+
+ for( int i=0; i<a.numRows(); i++ ) {
+ if( a.isEmpty(i) ) continue;
+ int apos = a.pos(i);
+ int alen = a.size(i);
+ int[] aix = a.indexes(i);
+ double[] avals = a.values(i);
+ if( ret.sparse && !b.sparse )
+ ret.sparseBlock.allocate(i, alen);
+ for(int k = apos; k < apos+alen; k++) {
+ double in2 = b.quickGetValue(i, aix[k]);
+ if( in2==0 ) continue;
+ double val = op.fn.execute(avals[k], in2);
+ ret.appendValue(i, aix[k], val);
+ }
+ }
+ }
+
+ private static void safeBinaryMMGeneric(MatrixBlock m1, MatrixBlock m2, MatrixBlock ret, BinaryOperator op)
+ throws DMLRuntimeException
+ {
+ int rlen = m1.rlen;
+ int clen = m2.clen;
+ for(int r=0; r<rlen; r++)
+ for(int c=0; c<clen; c++) {
+ double in1 = m1.quickGetValue(r, c);
+ double in2 = m2.quickGetValue(r, c);
+ if( in1==0 && in2==0) continue;
+ double val = op.fn.execute(in1, in2);
+ ret.appendValue(r, c, val);
+ }
+ }
+
/**
*
* This will do cell wise operation for <, <=, >, >=, == and != operators.
@@ -1254,6 +1274,4 @@ public class LibMatrixBincell
result.appendValue(resultRow, cols2[j], v);
}
}
-
}
-
http://git-wip-us.apache.org/repos/asf/systemml/blob/a347af3b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixMult.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixMult.java b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixMult.java
index fee73c5..eca26f6 100644
--- a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixMult.java
+++ b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixMult.java
@@ -632,7 +632,7 @@ public class LibMatrixMult
ret.allocateBlock();
try
- {
+ {
ExecutorService pool = Executors.newFixedThreadPool(k);
ArrayList<MatrixMultWSigmoidTask> tasks = new ArrayList<>();
int blklen = (int)(Math.ceil((double)mW.rlen/k));
@@ -927,169 +927,189 @@ public class LibMatrixMult
private static void matrixMultDenseDense(MatrixBlock m1, MatrixBlock m2, MatrixBlock ret, boolean tm2, boolean pm2, int rl, int ru, int cl, int cu)
throws DMLRuntimeException
- {
+ {
double[] a = m1.denseBlock;
double[] b = m2.denseBlock;
double[] c = ret.denseBlock;
final int m = m1.rlen;
final int n = m2.clen;
final int cd = m1.clen;
-
- if( LOW_LEVEL_OPTIMIZATION )
- {
- if( m==1 && n==1 ) //DOT PRODUCT
- {
+
+ if( LOW_LEVEL_OPTIMIZATION ) {
+ if( m==1 && n==1 ) { //DOT PRODUCT
c[0] = dotProduct(a, b, cd);
}
- else if( n>1 && cd == 1 ) //OUTER PRODUCT
- {
+ else if( n>1 && cd == 1 ) { //OUTER PRODUCT
for( int i=rl, cix=rl*n; i < ru; i++, cix+=n) {
if( a[i] == 1 )
System.arraycopy(b, 0, c, cix, n);
- else if( a[i] != 0 )
+ else if( a[i] != 0 )
vectMultiplyWrite(a[i], b, c, 0, cix, n);
else
Arrays.fill(c, cix, cix+n, 0);
}
}
- else if( n==1 && cd == 1 ) //VECTOR-SCALAR
- {
+ else if( n==1 && cd == 1 ) { //VECTOR-SCALAR
vectMultiplyWrite(b[0], a, c, rl, rl, ru-rl);
}
- else if( n==1 && cd<=2*1024 ) //MATRIX-VECTOR (short rhs)
- {
- for( int i=rl, aix=rl*cd; i < ru; i++, aix+=cd)
- c[i] = dotProduct(a, b, aix, 0, cd);
+ else if( n==1 && cd<=2*1024 ) { //MATRIX-VECTOR (short rhs)
+ matrixMultDenseDenseMVShortRHS(a, b, c, cd, rl, ru);
}
- else if( n==1 ) //MATRIX-VECTOR (tall rhs)
- {
- final int blocksizeI = 32;
- final int blocksizeK = 2*1024; //16KB vector blocks (L1)
- for( int bi=rl; bi<ru; bi+=blocksizeI ) {
- int bimin = Math.min(bi+blocksizeI, ru);
- for( int bk=0; bk<cd; bk+=blocksizeK ) {
- int bkmin = Math.min(bk+blocksizeK, cd);
- for( int i=bi, aix=bi*cd+bk; i<bimin; i++, aix+=cd)
- c[i] += dotProduct(a, b, aix, bk, bkmin-bk);
- }
- }
+ else if( n==1 ) { //MATRIX-VECTOR (tall rhs)
+ matrixMultDenseDenseMVTallRHS(a, b, c, cd, rl, ru);
}
- else if( pm2 && m==1 ) //VECTOR-MATRIX
- {
- //parallelization over rows in rhs matrix
- //rest not aligned to blocks of 2 rows
- final int kn = (ru-rl)%2;
- if( kn == 1 && a[rl] != 0 )
- vectMultiplyAdd(a[rl], b, c, rl*n, 0, n);
-
- //compute blocks of 2 rows (2 instead of 4 for small n<64)
- for( int k=rl+kn, bix=(rl+kn)*n; k<ru; k+=2, bix+=2*n ){
- if( a[k] != 0 && a[k+1] != 0 )
- vectMultiplyAdd2(a[k], a[k+1], b, c, bix, bix+n, 0, n);
- else if( a[k] != 0 )
- vectMultiplyAdd(a[k], b, c, bix, 0, n);
- else if( a[k+1] != 0 )
- vectMultiplyAdd(a[k+1], b, c, bix+n, 0, n);
- }
+ else if( pm2 && m==1 ) { //VECTOR-MATRIX
+ matrixMultDenseDenseVM(a, b, c, n, cd, rl, ru);
}
- else if( pm2 && m<=16 ) //MATRIX-MATRIX (short lhs)
- {
- //cache-conscious parallelization over rows in rhs matrix
- final int kn = (ru-rl)%4;
-
- //rest not aligned to blocks of 2 rows
- for( int i=0, aix=0, cix=0; i<m; i++, aix+=cd, cix+=n )
- for( int k=rl, bix=rl*n; k<rl+kn; k++, bix+=n )
- if( a[aix+k] != 0 )
- vectMultiplyAdd(a[aix+k], b, c, bix, cix, n);
-
- final int blocksizeK = 48;
- final int blocksizeJ = 1024;
-
- //blocked execution
- for( int bk = rl+kn; bk < ru; bk+=blocksizeK )
- for( int bj = 0, bkmin = Math.min(ru, bk+blocksizeK); bj < n; bj+=blocksizeJ )
- {
- //compute blocks of 4 rows in rhs w/ IKJ
- int bjlen = Math.min(n, bj+blocksizeJ)-bj;
- for( int i=0, aix=0, cix=bj; i<m; i++, aix+=cd, cix+=n )
- for( int k=bk, bix=bk*n+bj; k<bkmin; k+=4, bix+=4*n ) {
- vectMultiplyAdd4(a[aix+k], a[aix+k+1], a[aix+k+2], a[aix+k+3],
- b, c, bix, bix+n, bix+2*n, bix+3*n, cix, bjlen);
- }
- }
+ else if( pm2 && m<=16 ) { //MATRIX-MATRIX (short lhs)
+ matrixMultDenseDenseMMShortLHS(a, b, c, m, n, cd, rl, ru);
}
- else if( tm2 ) //MATRIX-MATRIX (skinny rhs)
- {
- //note: prepared rhs input via transpose for: m > n && cd > 64 && n < 64
- //however, explicit flag required since dimension change m2
- final int n2 = m2.rlen;
- for( int i=rl, aix=rl*cd, cix=rl*n2; i < ru; i++, aix+=cd, cix+=n2 )
- for( int j=0, bix=0; j<n2; j++, bix+=cd )
- c[cix+j] = dotProduct(a, b, aix, bix, cd);
- }
- else //MATRIX-MATRIX
- {
- //1) Unrolled inner loop (for better instruction-level parallelism)
- //2) Blocked execution (for less cache trashing in parallel exec)
- //3) Asymmetric block sizes (for less misses in inner loop, yet blocks in L1/L2)
-
- final int blocksizeI = 32; //64//256KB c block (typical L2 size per core), 32KB a block
- final int blocksizeK = 24; //64//256KB b block (typical L2 size per core), used while read 512B of a / read/write 4KB of c
- final int blocksizeJ = 1024; //512//4KB (typical main-memory page size), for scan
-
- //temporary arrays (nnz a, b index)
- double[] ta = new double[ blocksizeK ];
- int[] tbi = new int[ blocksizeK ];
-
- //blocked execution
- for( int bi = rl; bi < ru; bi+=blocksizeI )
- for( int bk = 0, bimin = Math.min(ru, bi+blocksizeI); bk < cd; bk+=blocksizeK )
- for( int bj = cl, bkmin = Math.min(cd, bk+blocksizeK); bj < cu; bj+=blocksizeJ )
- {
- int bklen = bkmin-bk;
- int bjlen = Math.min(cu, bj+blocksizeJ)-bj;
-
- //core sub block matrix multiplication
- for( int i = bi; i < bimin; i++)
- {
- int aixi = i * cd + bk; //start index on a
- int cixj = i * n + bj; //scan index on c
-
- //determine nnz of a (for sparsity-aware skipping of rows)
- int knnz = copyNonZeroElements(a, aixi, bk, bj, n, ta, tbi, bklen);
- //if( knnz > 0 ) //for skipping empty rows
-
- //rest not aligned to blocks of 4 rows
- final int bn = knnz % 4;
- switch( bn ){
- case 1: vectMultiplyAdd(ta[0], b, c, tbi[0], cixj, bjlen); break;
- case 2: vectMultiplyAdd2(ta[0],ta[1], b, c, tbi[0], tbi[1], cixj, bjlen); break;
- case 3: vectMultiplyAdd3(ta[0],ta[1],ta[2], b, c, tbi[0], tbi[1],tbi[2], cixj, bjlen); break;
- }
-
- //compute blocks of 4 rows (core inner loop)
- for( int k = bn; k<knnz; k+=4 ){
- vectMultiplyAdd4( ta[k], ta[k+1], ta[k+2], ta[k+3], b, c,
- tbi[k], tbi[k+1], tbi[k+2], tbi[k+3], cixj, bjlen );
- }
- }
- }
+ else if( tm2 ) { //MATRIX-MATRIX (skinny rhs)
+ matrixMultDenseDenseMMSkinnyRHS(a, b, c, m2.rlen, cd, rl, ru);
+ }
+ else { //MATRIX-MATRIX
+ matrixMultDenseDenseMM(a, b, c, n, cd, rl, ru, cl, cu);
}
}
- else
- {
- double val;
+ else {
for( int i = rl, aix=rl*cd, cix=rl*n; i < ru; i++, cix+=n)
- for( int k = 0, bix=0; k < cd; k++, aix++, bix+=n)
- {
- val = a[ aix ];
+ for( int k = 0, bix=0; k < cd; k++, aix++, bix+=n) {
+ double val = a[ aix ];
if( val != 0 )
for( int j = 0; j < n; j++)
c[ cix+j ] += val * b[ bix+j ];
- }
+ }
}
+ }
+
+ private static void matrixMultDenseDenseMVShortRHS(double[] a, double[] b, double[] c, int cd, int rl, int ru)
+ throws DMLRuntimeException
+ {
+ for( int i=rl, aix=rl*cd; i < ru; i++, aix+=cd)
+ c[i] = dotProduct(a, b, aix, 0, cd);
+ }
+
+ private static void matrixMultDenseDenseMVTallRHS(double[] a, double[] b, double[] c, int cd, int rl, int ru)
+ throws DMLRuntimeException
+ {
+ final int blocksizeI = 32;
+ final int blocksizeK = 2*1024; //16KB vector blocks (L1)
+ for( int bi=rl; bi<ru; bi+=blocksizeI ) {
+ int bimin = Math.min(bi+blocksizeI, ru);
+ for( int bk=0; bk<cd; bk+=blocksizeK ) {
+ int bkmin = Math.min(bk+blocksizeK, cd);
+ for( int i=bi, aix=bi*cd+bk; i<bimin; i++, aix+=cd)
+ c[i] += dotProduct(a, b, aix, bk, bkmin-bk);
+ }
+ }
+ }
+
+ private static void matrixMultDenseDenseVM(double[] a, double[] b, double[] c, int n, int cd, int rl, int ru)
+ throws DMLRuntimeException
+ {
+ //parallelization over rows in rhs matrix
+ //rest not aligned to blocks of 2 rows
+ final int kn = (ru-rl)%2;
+ if( kn == 1 && a[rl] != 0 )
+ vectMultiplyAdd(a[rl], b, c, rl*n, 0, n);
+ //compute blocks of 2 rows (2 instead of 4 for small n<64)
+ for( int k=rl+kn, bix=(rl+kn)*n; k<ru; k+=2, bix+=2*n ){
+ if( a[k] != 0 && a[k+1] != 0 )
+ vectMultiplyAdd2(a[k], a[k+1], b, c, bix, bix+n, 0, n);
+ else if( a[k] != 0 )
+ vectMultiplyAdd(a[k], b, c, bix, 0, n);
+ else if( a[k+1] != 0 )
+ vectMultiplyAdd(a[k+1], b, c, bix+n, 0, n);
+ }
+ }
+
+ private static void matrixMultDenseDenseMMShortLHS(double[] a, double[] b, double[] c, int m, int n, int cd, int rl, int ru)
+ throws DMLRuntimeException
+ {
+ //cache-conscious parallelization over rows in rhs matrix
+ final int kn = (ru-rl)%4;
+
+ //rest not aligned to blocks of 2 rows
+ for( int i=0, aix=0, cix=0; i<m; i++, aix+=cd, cix+=n )
+ for( int k=rl, bix=rl*n; k<rl+kn; k++, bix+=n )
+ if( a[aix+k] != 0 )
+ vectMultiplyAdd(a[aix+k], b, c, bix, cix, n);
+
+ final int blocksizeK = 48;
+ final int blocksizeJ = 1024;
+
+ //blocked execution
+ for( int bk = rl+kn; bk < ru; bk+=blocksizeK )
+ for( int bj = 0, bkmin = Math.min(ru, bk+blocksizeK); bj < n; bj+=blocksizeJ ) {
+ //compute blocks of 4 rows in rhs w/ IKJ
+ int bjlen = Math.min(n, bj+blocksizeJ)-bj;
+ for( int i=0, aix=0, cix=bj; i<m; i++, aix+=cd, cix+=n )
+ for( int k=bk, bix=bk*n+bj; k<bkmin; k+=4, bix+=4*n ) {
+ vectMultiplyAdd4(a[aix+k], a[aix+k+1], a[aix+k+2], a[aix+k+3],
+ b, c, bix, bix+n, bix+2*n, bix+3*n, cix, bjlen);
+ }
+ }
+ }
+
+ private static void matrixMultDenseDenseMMSkinnyRHS(double[] a, double[] b, double[] c, int n2, int cd, int rl, int ru)
+ throws DMLRuntimeException
+ {
+ //note: prepared rhs input via transpose for: m > n && cd > 64 && n < 64
+ //however, explicit flag required since dimension change m2
+ for( int i=rl, aix=rl*cd, cix=rl*n2; i < ru; i++, aix+=cd, cix+=n2 )
+ for( int j=0, bix=0; j<n2; j++, bix+=cd )
+ c[cix+j] = dotProduct(a, b, aix, bix, cd);
+ }
+
+ private static void matrixMultDenseDenseMM(double[] a, double[] b, double[] c, int n, int cd, int rl, int ru, int cl, int cu)
+ throws DMLRuntimeException
+ {
+ //1) Unrolled inner loop (for better instruction-level parallelism)
+ //2) Blocked execution (for less cache trashing in parallel exec)
+ //3) Asymmetric block sizes (for less misses in inner loop, yet blocks in L1/L2)
+
+ final int blocksizeI = 32; //64//256KB c block (typical L2 size per core), 32KB a block
+ final int blocksizeK = 24; //64//256KB b block (typical L2 size per core), used while read 512B of a / read/write 4KB of c
+ final int blocksizeJ = 1024; //512//4KB (typical main-memory page size), for scan
+
+ //temporary arrays (nnz a, b index)
+ double[] ta = new double[ blocksizeK ];
+ int[] tbi = new int[ blocksizeK ];
+
+ //blocked execution
+ for( int bi = rl; bi < ru; bi+=blocksizeI )
+ for( int bk = 0, bimin = Math.min(ru, bi+blocksizeI); bk < cd; bk+=blocksizeK )
+ for( int bj = cl, bkmin = Math.min(cd, bk+blocksizeK); bj < cu; bj+=blocksizeJ )
+ {
+ int bklen = bkmin-bk;
+ int bjlen = Math.min(cu, bj+blocksizeJ)-bj;
+
+ //core sub block matrix multiplication
+ for( int i = bi; i < bimin; i++)
+ {
+ int aixi = i * cd + bk; //start index on a
+ int cixj = i * n + bj; //scan index on c
+
+ //determine nnz of a (for sparsity-aware skipping of rows)
+ int knnz = copyNonZeroElements(a, aixi, bk, bj, n, ta, tbi, bklen);
+ //if( knnz > 0 ) //for skipping empty rows
+
+ //rest not aligned to blocks of 4 rows
+ final int bn = knnz % 4;
+ switch( bn ){
+ case 1: vectMultiplyAdd(ta[0], b, c, tbi[0], cixj, bjlen); break;
+ case 2: vectMultiplyAdd2(ta[0],ta[1], b, c, tbi[0], tbi[1], cixj, bjlen); break;
+ case 3: vectMultiplyAdd3(ta[0],ta[1],ta[2], b, c, tbi[0], tbi[1],tbi[2], cixj, bjlen); break;
+ }
+
+ //compute blocks of 4 rows (core inner loop)
+ for( int k = bn; k<knnz; k+=4 ){
+ vectMultiplyAdd4( ta[k], ta[k+1], ta[k+2], ta[k+3], b, c,
+ tbi[k], tbi[k+1], tbi[k+2], tbi[k+3], cixj, bjlen );
+ }
+ }
+ }
}
private static void matrixMultDenseSparse(MatrixBlock m1, MatrixBlock m2, MatrixBlock ret, boolean pm2, int rl, int ru)
@@ -1163,7 +1183,8 @@ public class LibMatrixMult
private static void matrixMultSparseDense(MatrixBlock m1, MatrixBlock m2, MatrixBlock ret, boolean pm2, int rl, int ru)
throws DMLRuntimeException
- {
+ {
+ SparseBlock a = m1.sparseBlock;
double[] b = m2.denseBlock;
double[] c = ret.denseBlock;
final int m = m1.rlen;
@@ -1171,179 +1192,195 @@ public class LibMatrixMult
final int cd = m2.rlen;
final long xsp = (long)m*cd/m1.nonZeros;
- if( LOW_LEVEL_OPTIMIZATION )
- {
- SparseBlock a = m1.sparseBlock;
-
- if( m==1 && n==1 ) //DOT PRODUCT
- {
- if( !a.isEmpty(0) ) {
+ if( LOW_LEVEL_OPTIMIZATION ) {
+ if( m==1 && n==1 ) { //DOT PRODUCT
+ if( !a.isEmpty(0) )
c[0] = dotProduct(a.values(0), b, a.indexes(0), a.pos(0), 0, a.size(0));
- }
}
- else if( n==1 && cd<=2*1024 ) //MATRIX-VECTOR (short rhs)
- {
- for( int i=rl; i<ru; i++ )
- if( !a.isEmpty(i) )
- c[i] = dotProduct(a.values(i), b, a.indexes(i), a.pos(i), 0, a.size(i));
+ else if( n==1 && cd<=2*1024 ) { //MATRIX-VECTOR (short rhs)
+ matrixMultSparseDenseMVShortRHS(a, b, c, rl, ru);
}
- else if( n==1 ) //MATRIX-VECTOR (tall rhs)
- {
- final int blocksizeI = 32;
- final int blocksizeK = (int)Math.max(2*1024,2*1024*xsp/32); //~ 16KB L1
- int[] curk = new int[blocksizeI];
-
- for( int bi = rl; bi < ru; bi+=blocksizeI ) {
- Arrays.fill(curk, 0); //reset positions
- for( int bk=0, bimin = Math.min(ru, bi+blocksizeI); bk<cd; bk+=blocksizeK ) {
- for( int i=bi, bkmin = Math.min(bk+blocksizeK, cd); i<bimin; i++) {
- if( a.isEmpty(i) ) continue;
- int apos = a.pos(i);
- int alen = a.size(i);
- int[] aix = a.indexes(i);
- double[] avals = a.values(i);
- int k = curk[i-bi] + apos;
- for( ; k<apos+alen && aix[k]<bkmin; k++ )
- c[i] += avals[k] * b[aix[k]];
- curk[i-bi] = k - apos;
- }
- }
- }
+ else if( n==1 ) { //MATRIX-VECTOR (tall rhs)
+ matrixMultSparseDenseMVTallRHS(a, b, c, cd, xsp, rl, ru);
}
- else if( pm2 && m==1 ) //VECTOR-MATRIX
- {
- //parallelization over rows in rhs matrix
- if( !a.isEmpty(0) )
- {
- int alen = a.size(0);
- int[] aix = a.indexes(0);
- double[] avals = a.values(0);
- int rlix = (rl==0) ? 0 : a.posFIndexGTE(0,rl);
- rlix = (rlix>=0) ? rlix : alen;
-
- for( int k=rlix; k<alen && aix[k]<ru; k++ ) {
- if( k+1<alen && aix[k+1]<ru )
- vectMultiplyAdd2(avals[k], avals[k+1], b, c, aix[k]*n, aix[++k]*n, 0, n);
- else
- vectMultiplyAdd(avals[k], b, c, aix[k]*n, 0, n);
- }
- }
+ else if( pm2 && m==1 ) { //VECTOR-MATRIX
+ matrixMultSparseDenseVM(a, b, c, n, rl, ru);
}
- else if( pm2 && m<=16 ) //MATRIX-MATRIX (short lhs)
- {
- int arlen = a.numRows();
- for( int i=0, cix=0; i<arlen; i++, cix+=n )
- if( !a.isEmpty(i) )
- {
- int apos = a.pos(i);
- int alen = a.size(i);
- int[] aix = a.indexes(i);
- double[] avals = a.values(i);
-
- int k1 = (rl==0) ? 0 : a.posFIndexGTE(i, rl);
- k1 = (k1>=0) ? apos+k1 : apos+alen;
- int k2 = (ru==cd) ? alen : a.posFIndexGTE(i, ru);
- k2 = (k2>=0) ? apos+k2 : apos+alen;
-
- //rest not aligned to blocks of 4 rows
- final int bn = (k2-k1) % 4;
- switch( bn ){
- case 1: vectMultiplyAdd(avals[k1], b, c, aix[k1]*n, cix, n); break;
- case 2: vectMultiplyAdd2(avals[k1],avals[k1+1], b, c, aix[k1]*n, aix[k1+1]*n, cix, n); break;
- case 3: vectMultiplyAdd3(avals[k1],avals[k1+1],avals[k1+2], b, c, aix[k1]*n, aix[k1+1]*n, aix[k1+2]*n, cix, n); break;
- }
-
- //compute blocks of 4 rows (core inner loop)
- for( int k = k1+bn; k<k2; k+=4 ) {
- vectMultiplyAdd4( avals[k], avals[k+1], avals[k+2], avals[k+3], b, c,
- aix[k]*n, aix[k+1]*n, aix[k+2]*n, aix[k+3]*n, cix, n );
- }
- }
+ else if( pm2 && m<=16 ) { //MATRIX-MATRIX (short lhs)
+ matrixMultSparseDenseMMShortLHS(a, b, c, n, cd, rl, ru);
}
- else if( n<=64 ) //MATRIX-MATRIX (skinny rhs)
- {
- //no blocking since b and c fit into cache anyway
- for( int i=rl, cix=rl*n; i<ru; i++, cix+=n ) {
- if( a.isEmpty(i) )
- continue;
- int apos = a.pos(i);
- int alen = a.size(i);
- int[] aix = a.indexes(i);
- double[] avals = a.values(i);
- //rest not aligned to blocks of 4 rows
- int bn = alen%4;
- for( int k=apos; k<apos+bn; k++ )
- vectMultiplyAdd(avals[k], b, c, aix[k]*n, cix, n);
- //compute blocks of 4 rows (core inner loop)
- for( int k=apos+bn; k<apos+alen; k+=4 )
- vectMultiplyAdd4( avals[k], avals[k+1], avals[k+2], avals[k+3], b, c,
- aix[k]*n, aix[k+1]*n, aix[k+2]*n, aix[k+3]*n, cix, n );
- }
+ else if( n<=64 ) { //MATRIX-MATRIX (skinny rhs)
+ matrixMultSparseDenseMMSkinnyRHS(a, b, c, n, rl, ru);
}
- else //MATRIX-MATRIX
- {
- //blocksizes to fit blocks of B (dense) and several rows of A/C in common L2 cache size,
- //while blocking A/C for L1/L2 yet allowing long scans (2 pages) in the inner loop over j
- //in case of almost ultra-sparse matrices, we cannot ensure the blocking for the rhs and
- //output - however, in this case it's unlikely that we consume every cache line in the rhs
-
- final int blocksizeI = (int) (8L*m*cd/m1.nonZeros);
- final int blocksizeK = (int) (8L*m*cd/m1.nonZeros);
- final int blocksizeJ = 1024;
-
- //temporary array of current sparse positions
- int[] curk = new int[blocksizeI];
-
- //blocked execution over IKJ
- for( int bi = rl; bi < ru; bi+=blocksizeI ) {
- Arrays.fill(curk, 0); //reset positions
- for( int bk = 0, bimin = Math.min(ru, bi+blocksizeI); bk < cd; bk+=blocksizeK ) {
- for( int bj = 0, bkmin = Math.min(cd, bk+blocksizeK); bj < n; bj+=blocksizeJ ) {
- int bjlen = Math.min(n, bj+blocksizeJ)-bj;
-
- //core sub block matrix multiplication
- for( int i=bi, cix=bi*n+bj; i<bimin; i++, cix+=n ) {
- if( !a.isEmpty(i) ) {
- int apos = a.pos(i);
- int alen = a.size(i);
- int[] aix = a.indexes(i);
- double[] avals = a.values(i);
-
- int k = curk[i-bi] + apos;
- //rest not aligned to blocks of 4 rows
- int bn = alen%4;
- for( ; k<apos+bn && aix[k]<bkmin; k++ )
- vectMultiplyAdd(avals[k], b, c, aix[k]*n+bj, cix, bjlen);
- //compute blocks of 4 rows (core inner loop), allowed to exceed bkmin
- for( ; k<apos+alen && aix[k]<bkmin; k+=4 )
- vectMultiplyAdd4( avals[k], avals[k+1], avals[k+2], avals[k+3], b, c,
- aix[k]*n+bj, aix[k+1]*n+bj, aix[k+2]*n+bj, aix[k+3]*n+bj, cix, bjlen );
- //update positions on last bj block
- if( bj+bjlen==n )
- curk[i-bi] = k - apos;
- }
- }
- }
- }
+ else { //MATRIX-MATRIX
+ matrixMultSparseDenseMM(a, b, c, n, cd, xsp, rl, ru);
+ }
+ }
+ else {
+ for( int i=rl, cix=rl*n; i<ru; i++, cix+=n ) {
+ if( a.isEmpty(i) ) continue;
+ int apos = a.pos(i);
+ int alen = a.size(i);
+ int[] aix = a.indexes(i);
+ double[] avals = a.values(i);
+ for(int k = apos; k < apos+alen; k++) {
+ double val = avals[k];
+ for(int j = 0, bix=aix[k]*n; j < n; j++)
+ c[cix+j] += val * b[bix+j];
}
}
}
- else
- {
- SparseBlock a = m1.sparseBlock;
- for( int i=rl, cix=rl*n; i<ru; i++, cix+=n )
- {
- if( !a.isEmpty(i) )
- {
+ }
+
+ private static void matrixMultSparseDenseMVShortRHS(SparseBlock a, double[] b, double[] c, int rl, int ru)
+ throws DMLRuntimeException
+ {
+ for( int i=rl; i<ru; i++ )
+ if( !a.isEmpty(i) )
+ c[i] = dotProduct(a.values(i), b, a.indexes(i), a.pos(i), 0, a.size(i));
+ }
+
+ private static void matrixMultSparseDenseMVTallRHS(SparseBlock a, double[] b, double[] c, int cd, long xsp, int rl, int ru)
+ throws DMLRuntimeException
+ {
+ final int blocksizeI = 32;
+ final int blocksizeK = (int)Math.max(2*1024,2*1024*xsp/32); //~ 16KB L1
+ int[] curk = new int[blocksizeI];
+
+ for( int bi = rl; bi < ru; bi+=blocksizeI ) {
+ Arrays.fill(curk, 0); //reset positions
+ for( int bk=0, bimin = Math.min(ru, bi+blocksizeI); bk<cd; bk+=blocksizeK ) {
+ for( int i=bi, bkmin = Math.min(bk+blocksizeK, cd); i<bimin; i++) {
+ if( a.isEmpty(i) ) continue;
int apos = a.pos(i);
int alen = a.size(i);
int[] aix = a.indexes(i);
double[] avals = a.values(i);
+ int k = curk[i-bi] + apos;
+ for( ; k<apos+alen && aix[k]<bkmin; k++ )
+ c[i] += avals[k] * b[aix[k]];
+ curk[i-bi] = k - apos;
+ }
+ }
+ }
+ }
+
+ private static void matrixMultSparseDenseVM(SparseBlock a, double[] b, double[] c, int n, int rl, int ru)
+ throws DMLRuntimeException
+ {
+ if( a.isEmpty(0) )
+ return;
+
+ //parallelization over rows in rhs matrix
+ int alen = a.size(0);
+ int[] aix = a.indexes(0);
+ double[] avals = a.values(0);
+ int rlix = (rl==0) ? 0 : a.posFIndexGTE(0,rl);
+ rlix = (rlix>=0) ? rlix : alen;
+
+ for( int k=rlix; k<alen && aix[k]<ru; k++ ) {
+ if( k+1<alen && aix[k+1]<ru )
+ vectMultiplyAdd2(avals[k], avals[k+1], b, c, aix[k]*n, aix[++k]*n, 0, n);
+ else
+ vectMultiplyAdd(avals[k], b, c, aix[k]*n, 0, n);
+ }
+ }
+
+ private static void matrixMultSparseDenseMMShortLHS(SparseBlock a, double[] b, double[] c, int n, int cd, int rl, int ru)
+ throws DMLRuntimeException
+ {
+ int arlen = a.numRows();
+ for( int i=0, cix=0; i<arlen; i++, cix+=n ) {
+ if( a.isEmpty(i) ) continue;
+ int apos = a.pos(i);
+ int alen = a.size(i);
+ int[] aix = a.indexes(i);
+ double[] avals = a.values(i);
+
+ int k1 = (rl==0) ? 0 : a.posFIndexGTE(i, rl);
+ k1 = (k1>=0) ? apos+k1 : apos+alen;
+ int k2 = (ru==cd) ? alen : a.posFIndexGTE(i, ru);
+ k2 = (k2>=0) ? apos+k2 : apos+alen;
+
+ //rest not aligned to blocks of 4 rows
+ final int bn = (k2-k1) % 4;
+ switch( bn ){
+ case 1: vectMultiplyAdd(avals[k1], b, c, aix[k1]*n, cix, n); break;
+ case 2: vectMultiplyAdd2(avals[k1],avals[k1+1], b, c, aix[k1]*n, aix[k1+1]*n, cix, n); break;
+ case 3: vectMultiplyAdd3(avals[k1],avals[k1+1],avals[k1+2], b, c, aix[k1]*n, aix[k1+1]*n, aix[k1+2]*n, cix, n); break;
+ }
+
+ //compute blocks of 4 rows (core inner loop)
+ for( int k = k1+bn; k<k2; k+=4 ) {
+ vectMultiplyAdd4( avals[k], avals[k+1], avals[k+2], avals[k+3], b, c,
+ aix[k]*n, aix[k+1]*n, aix[k+2]*n, aix[k+3]*n, cix, n );
+ }
+ }
+ }
+
+ private static void matrixMultSparseDenseMMSkinnyRHS(SparseBlock a, double[] b, double[] c, int n, int rl, int ru)
+ throws DMLRuntimeException
+ {
+ //no blocking since b and c fit into cache anyway
+ for( int i=rl, cix=rl*n; i<ru; i++, cix+=n ) {
+ if( a.isEmpty(i) ) continue;
+ int apos = a.pos(i);
+ int alen = a.size(i);
+ int[] aix = a.indexes(i);
+ double[] avals = a.values(i);
+ //rest not aligned to blocks of 4 rows
+ int bn = alen%4;
+ for( int k=apos; k<apos+bn; k++ )
+ vectMultiplyAdd(avals[k], b, c, aix[k]*n, cix, n);
+ //compute blocks of 4 rows (core inner loop)
+ for( int k=apos+bn; k<apos+alen; k+=4 )
+ vectMultiplyAdd4( avals[k], avals[k+1], avals[k+2], avals[k+3], b, c,
+ aix[k]*n, aix[k+1]*n, aix[k+2]*n, aix[k+3]*n, cix, n );
+ }
+ }
+
+ private static void matrixMultSparseDenseMM(SparseBlock a, double[] b, double[] c, int n, int cd, long xsp, int rl, int ru)
+ throws DMLRuntimeException
+ {
+ //blocksizes to fit blocks of B (dense) and several rows of A/C in common L2 cache size,
+ //while blocking A/C for L1/L2 yet allowing long scans (2 pages) in the inner loop over j
+ //in case of almost ultra-sparse matrices, we cannot ensure the blocking for the rhs and
+ //output - however, in this case it's unlikely that we consume every cache line in the rhs
+ final int blocksizeI = (int) (8L*xsp);
+ final int blocksizeK = (int) (8L*xsp);
+ final int blocksizeJ = 1024;
+
+ //temporary array of current sparse positions
+ int[] curk = new int[blocksizeI];
+
+ //blocked execution over IKJ
+ for( int bi = rl; bi < ru; bi+=blocksizeI ) {
+ Arrays.fill(curk, 0); //reset positions
+ for( int bk = 0, bimin = Math.min(ru, bi+blocksizeI); bk < cd; bk+=blocksizeK ) {
+ for( int bj = 0, bkmin = Math.min(cd, bk+blocksizeK); bj < n; bj+=blocksizeJ ) {
+ int bjlen = Math.min(n, bj+blocksizeJ)-bj;
- for(int k = apos; k < apos+alen; k++) {
- double val = avals[k];
- for(int j = 0, bix=aix[k]*n; j < n; j++)
- c[cix+j] += val * b[bix+j];
+ //core sub block matrix multiplication
+ for( int i=bi, cix=bi*n+bj; i<bimin; i++, cix+=n ) {
+ if( !a.isEmpty(i) ) {
+ int apos = a.pos(i);
+ int alen = a.size(i);
+ int[] aix = a.indexes(i);
+ double[] avals = a.values(i);
+
+ int k = curk[i-bi] + apos;
+ //rest not aligned to blocks of 4 rows
+ int bn = alen%4;
+ for( ; k<apos+bn && aix[k]<bkmin; k++ )
+ vectMultiplyAdd(avals[k], b, c, aix[k]*n+bj, cix, bjlen);
+ //compute blocks of 4 rows (core inner loop), allowed to exceed bkmin
+ for( ; k<apos+alen && aix[k]<bkmin; k+=4 )
+ vectMultiplyAdd4( avals[k], avals[k+1], avals[k+2], avals[k+3], b, c,
+ aix[k]*n+bj, aix[k+1]*n+bj, aix[k+2]*n+bj, aix[k+3]*n+bj, cix, bjlen );
+ //update positions on last bj block
+ if( bj+bjlen==n )
+ curk[i-bi] = k - apos;
+ }
}
}
}
@@ -1408,8 +1445,8 @@ public class LibMatrixMult
int[] aix = a.indexes(i);
double[] avals = a.values(i);
- int k = curk[i-bi] + apos;
- for(; k < apos+alen && aix[k]<bkmin; k++) {
+ int k = curk[i-bi] + apos;
+ for(; k < apos+alen && aix[k]<bkmin; k++) {
if( !b.isEmpty(aix[k]) )
vectMultiplyAdd(avals[k], b.values(aix[k]), c,
b.indexes(aix[k]), b.pos(aix[k]), cix, b.size(aix[k]));
@@ -1423,28 +1460,22 @@ public class LibMatrixMult
}
else
{
- for( int i=rl, cix=rl*n; i<Math.min(ru, a.numRows()); i++, cix+=n )
- {
- if( !a.isEmpty(i) )
- {
- int apos = a.pos(i);
- int alen = a.size(i);
- int[] aix = a.indexes(i);
- double[] avals = a.values(i);
-
- for(int k = apos; k < apos+alen; k++)
- {
- double val = avals[k];
- if( !b.isEmpty(aix[k]) )
- {
- int bpos = b.pos(aix[k]);
- int blen = b.size(aix[k]);
- int[] bix = b.indexes(aix[k]);
- double[] bvals = b.values(aix[k]);
- for(int j = bpos; j < bpos+blen; j++)
- c[cix+bix[j]] += val * bvals[j];
- }
- }
+ for( int i=rl, cix=rl*n; i<Math.min(ru, a.numRows()); i++, cix+=n ) {
+ if( a.isEmpty(i) ) continue;
+ int apos = a.pos(i);
+ int alen = a.size(i);
+ int[] aix = a.indexes(i);
+ double[] avals = a.values(i);
+
+ for(int k = apos; k < apos+alen; k++) {
+ if( b.isEmpty(aix[k]) ) continue;
+ double val = avals[k];
+ int bpos = b.pos(aix[k]);
+ int blen = b.size(aix[k]);
+ int[] bix = b.indexes(aix[k]);
+ double[] bvals = b.values(aix[k]);
+ for(int j = bpos; j < bpos+blen; j++)
+ c[cix+bix[j]] += val * bvals[j];
}
}
}