You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@systemml.apache.org by mb...@apache.org on 2018/02/03 06:11:31 UTC
systemml git commit: [SYSTEMML-2106] New single-precision native
matrix multiply
Repository: systemml
Updated Branches:
refs/heads/master 525381d51 -> c95019fd9
[SYSTEMML-2106] New single-precision native matrix multiply
This patch extends - similar to the native conv2d/conv2d_bias_add operations
- also the native matrix multiply for optional single-precision use.
This also includes cleanups of mkl imports and nnz maintenance in double
and single-precision conv2d operations.
Furthermore, this patch includes rebuilt shared libraries for both mkl and
openblas.
Project: http://git-wip-us.apache.org/repos/asf/systemml/repo
Commit: http://git-wip-us.apache.org/repos/asf/systemml/commit/c95019fd
Tree: http://git-wip-us.apache.org/repos/asf/systemml/tree/c95019fd
Diff: http://git-wip-us.apache.org/repos/asf/systemml/diff/c95019fd
Branch: refs/heads/master
Commit: c95019fd99076b4b8b7e6c5cfec85fd9949b2512
Parents: 525381d
Author: Matthias Boehm <mb...@gmail.com>
Authored: Fri Feb 2 20:04:53 2018 -0800
Committer: Matthias Boehm <mb...@gmail.com>
Committed: Fri Feb 2 22:10:33 2018 -0800
----------------------------------------------------------------------
.../cpp/lib/libsystemml_mkl-Linux-x86_64.so | Bin 32048 -> 32104 bytes
.../lib/libsystemml_openblas-Linux-x86_64.so | Bin 31288 -> 36192 bytes
src/main/cpp/libmatrixdnn.cpp | 14 +++---
src/main/cpp/libmatrixmult.cpp | 2 +
src/main/cpp/libmatrixmult.h | 5 +-
src/main/cpp/systemml.cpp | 22 +++++++--
src/main/cpp/systemml.h | 12 +++--
.../runtime/matrix/data/LibMatrixDNNHelper.java | 2 +-
.../runtime/matrix/data/LibMatrixNative.java | 48 ++++++++++++-------
.../org/apache/sysml/utils/NativeHelper.java | 7 ++-
10 files changed, 79 insertions(+), 33 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/systemml/blob/c95019fd/src/main/cpp/lib/libsystemml_mkl-Linux-x86_64.so
----------------------------------------------------------------------
diff --git a/src/main/cpp/lib/libsystemml_mkl-Linux-x86_64.so b/src/main/cpp/lib/libsystemml_mkl-Linux-x86_64.so
index 9f08870..db96497 100755
Binary files a/src/main/cpp/lib/libsystemml_mkl-Linux-x86_64.so and b/src/main/cpp/lib/libsystemml_mkl-Linux-x86_64.so differ
http://git-wip-us.apache.org/repos/asf/systemml/blob/c95019fd/src/main/cpp/lib/libsystemml_openblas-Linux-x86_64.so
----------------------------------------------------------------------
diff --git a/src/main/cpp/lib/libsystemml_openblas-Linux-x86_64.so b/src/main/cpp/lib/libsystemml_openblas-Linux-x86_64.so
index d6c9477..2fdcddf 100755
Binary files a/src/main/cpp/lib/libsystemml_openblas-Linux-x86_64.so and b/src/main/cpp/lib/libsystemml_openblas-Linux-x86_64.so differ
http://git-wip-us.apache.org/repos/asf/systemml/blob/c95019fd/src/main/cpp/libmatrixdnn.cpp
----------------------------------------------------------------------
diff --git a/src/main/cpp/libmatrixdnn.cpp b/src/main/cpp/libmatrixdnn.cpp
index 85efdfc..717cb26 100644
--- a/src/main/cpp/libmatrixdnn.cpp
+++ b/src/main/cpp/libmatrixdnn.cpp
@@ -406,8 +406,8 @@ int dconv2dBiasAddDense(double* inputPtr, double* biasPtr, double* filterPtr, do
// Step 3: Destroy the description of the operation
dnnDelete_F64(pConvolution);
+ return computeNNZ<double>(retPtr, N*KPQ);
#else
- // ------------------------------------------------------------------------------------
// First step: Avoids oversubscription and other openmp/internal blas threading issues
setNumThreadsForBLAS(1);
@@ -418,8 +418,9 @@ int dconv2dBiasAddDense(double* inputPtr, double* biasPtr, double* filterPtr, do
// Allocate temporary data structures used in parallel for
int numOpenMPThreads = MIN(numThreads, N);
double* loweredMatArrays = new double[numIm2ColElem*numOpenMPThreads];
+ int nnz = 0;
-#pragma omp parallel for num_threads(numOpenMPThreads)
+#pragma omp parallel for reduction(+: nnz) num_threads(numOpenMPThreads)
for (int n = 0; n < N; n++) {
int threadID = omp_get_thread_num();
double* loweredMat = loweredMatArrays + numIm2ColElem*threadID;
@@ -436,12 +437,13 @@ int dconv2dBiasAddDense(double* inputPtr, double* biasPtr, double* filterPtr, do
double* outputArr = retPtr + n*KPQ;
if( addBias )
biasAdd<double>(biasPtr, outputArr, K, PQ);
- } // end omp parallel for
+
+ // Step 4: thread-local nnz maintenance
+ nnz += computeNNZ<double>(retPtr + n*KPQ, KPQ);
+ }
delete [] loweredMatArrays;
- // ------------------------------------------------------------------------------------
+ return nnz;
#endif
-
- return computeNNZ<double>(retPtr, N*KPQ);
}
int sconv2dBiasAddDense(float* inputPtr, float* biasPtr, float* filterPtr, float* retPtr,
http://git-wip-us.apache.org/repos/asf/systemml/blob/c95019fd/src/main/cpp/libmatrixmult.cpp
----------------------------------------------------------------------
diff --git a/src/main/cpp/libmatrixmult.cpp b/src/main/cpp/libmatrixmult.cpp
index 6edbc67..3c669b6 100644
--- a/src/main/cpp/libmatrixmult.cpp
+++ b/src/main/cpp/libmatrixmult.cpp
@@ -25,6 +25,8 @@
#ifdef USE_OPEN_BLAS
#include <cblas.h>
+#else
+ #include <mkl_service.h>
#endif
int SYSML_CURRENT_NUM_THREADS = -1;
http://git-wip-us.apache.org/repos/asf/systemml/blob/c95019fd/src/main/cpp/libmatrixmult.h
----------------------------------------------------------------------
diff --git a/src/main/cpp/libmatrixmult.h b/src/main/cpp/libmatrixmult.h
index ca357c9..b6ea1c4 100644
--- a/src/main/cpp/libmatrixmult.h
+++ b/src/main/cpp/libmatrixmult.h
@@ -39,12 +39,11 @@
//#endif
// Since we call cblas_dgemm in openmp for loop,
-// we call "extension" APIs for setting number of threads of the given API.
-// For example: for OpenBLAS we use openblas_set_num_threads and
-// for MKL we use mkl_set_num_threads. This avoids performance degradation due to overprovisioning.
+// we call "extension" APIs for setting the number of threads.
#ifdef USE_INTEL_MKL
#include <mkl.h>
#include <mkl_service.h>
+ extern "C" void mkl_set_num_threads(int numThreads);
#else
#include <cblas.h>
extern "C" void openblas_set_num_threads(int numThreads);
http://git-wip-us.apache.org/repos/asf/systemml/blob/c95019fd/src/main/cpp/systemml.cpp
----------------------------------------------------------------------
diff --git a/src/main/cpp/systemml.cpp b/src/main/cpp/systemml.cpp
index 35a0074..b404cc9 100644
--- a/src/main/cpp/systemml.cpp
+++ b/src/main/cpp/systemml.cpp
@@ -75,14 +75,15 @@ JNIEXPORT void JNICALL Java_org_apache_sysml_utils_NativeHelper_setMaxNumThreads
maxThreads = (int) jmaxThreads;
}
-JNIEXPORT jboolean JNICALL Java_org_apache_sysml_utils_NativeHelper_matrixMultDenseDense(
+JNIEXPORT jboolean JNICALL Java_org_apache_sysml_utils_NativeHelper_dmmdd(
JNIEnv* env, jclass cls, jdoubleArray m1, jdoubleArray m2, jdoubleArray ret,
- jint m1rlen, jint m1clen, jint m2clen, jint numThreads) {
+ jint m1rlen, jint m1clen, jint m2clen, jint numThreads)
+{
double* m1Ptr = GET_DOUBLE_ARRAY(env, m1, numThreads);
double* m2Ptr = GET_DOUBLE_ARRAY(env, m2, numThreads);
double* retPtr = GET_DOUBLE_ARRAY(env, ret, numThreads);
if(m1Ptr == NULL || m2Ptr == NULL || retPtr == NULL)
- return (jboolean) false;
+ return (jboolean) false;
dmatmult(m1Ptr, m2Ptr, retPtr, (int)m1rlen, (int)m1clen, (int)m2clen, (int)numThreads);
@@ -92,6 +93,21 @@ JNIEXPORT jboolean JNICALL Java_org_apache_sysml_utils_NativeHelper_matrixMultDe
return (jboolean) true;
}
+JNIEXPORT jboolean JNICALL Java_org_apache_sysml_utils_NativeHelper_smmdd(
+ JNIEnv* env, jclass cls, jobject m1, jobject m2, jobject ret,
+ jint m1rlen, jint m1clen, jint m2clen, jint numThreads)
+{
+ float* m1Ptr = (float*) env->GetDirectBufferAddress(m1);
+ float* m2Ptr = (float*) env->GetDirectBufferAddress(m2);
+ float* retPtr = (float*) env->GetDirectBufferAddress(ret);
+ if(m1Ptr == NULL || m2Ptr == NULL || retPtr == NULL)
+ return (jboolean) false;
+
+ smatmult(m1Ptr, m2Ptr, retPtr, (int)m1rlen, (int)m1clen, (int)m2clen, (int)numThreads);
+
+ return (jboolean) true;
+}
+
JNIEXPORT jboolean JNICALL Java_org_apache_sysml_utils_NativeHelper_tsmm
(JNIEnv * env, jclass cls, jdoubleArray m1, jdoubleArray ret, jint m1rlen, jint m1clen, jboolean isLeftTranspose, jint numThreads) {
double* m1Ptr = GET_DOUBLE_ARRAY(env, m1, numThreads);
http://git-wip-us.apache.org/repos/asf/systemml/blob/c95019fd/src/main/cpp/systemml.h
----------------------------------------------------------------------
diff --git a/src/main/cpp/systemml.h b/src/main/cpp/systemml.h
index 71155fa..52a3663 100644
--- a/src/main/cpp/systemml.h
+++ b/src/main/cpp/systemml.h
@@ -28,14 +28,20 @@ extern "C" {
#endif
/*
* Class: org_apache_sysml_utils_NativeHelper
- * Method: matrixMultDenseDense
- * Signature: ([D[D[DIIII)Z
+ * Method: dmmdd
*/
-JNIEXPORT jboolean JNICALL Java_org_apache_sysml_utils_NativeHelper_matrixMultDenseDense
+JNIEXPORT jboolean JNICALL Java_org_apache_sysml_utils_NativeHelper_dmmdd
(JNIEnv *, jclass, jdoubleArray, jdoubleArray, jdoubleArray, jint, jint, jint, jint);
/*
* Class: org_apache_sysml_utils_NativeHelper
+ * Method: smmdd
+ */
+JNIEXPORT jboolean JNICALL Java_org_apache_sysml_utils_NativeHelper_smmdd
+ (JNIEnv *, jclass, jobject, jobject, jobject, jint, jint, jint, jint);
+
+/*
+ * Class: org_apache_sysml_utils_NativeHelper
* Method: tsmm
* Signature: ([D[DIIZI)Z
*/
http://git-wip-us.apache.org/repos/asf/systemml/blob/c95019fd/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNNHelper.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNNHelper.java b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNNHelper.java
index f81e929..32a0eaa 100644
--- a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNNHelper.java
+++ b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNNHelper.java
@@ -68,7 +68,7 @@ public class LibMatrixDNNHelper
ret.sparse = false;
if(ret.getDenseBlock() == null)
ret.allocateDenseBlock();
- NativeHelper.matrixMultDenseDense(m1.getDenseBlockValues(), m2.getDenseBlockValues(),
+ NativeHelper.dmmdd(m1.getDenseBlockValues(), m2.getDenseBlockValues(),
ret.getDenseBlockValues(), m1.rlen, m1.clen, m2.clen, 1);
}
http://git-wip-us.apache.org/repos/asf/systemml/blob/c95019fd/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixNative.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixNative.java b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixNative.java
index dfb8abd..9e3a6ee 100644
--- a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixNative.java
+++ b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixNative.java
@@ -34,7 +34,10 @@ import org.apache.sysml.utils.Statistics;
public class LibMatrixNative
{
- /** ThreadLocal reuse of direct buffers for inputs/outputs (extended on demand).*/
+ // ThreadLocal reuse of direct buffers for inputs/outputs (extended on demand).
+ // note: since we anyway have to convert from double to float, we use
+ // preallocated direct buffers (with thread-local reuse and resizing on demand)
+ // to ensure there are no additional copies created by the transfer over jni
private static ThreadLocal<FloatBuffer> inBuff = new ThreadLocal<FloatBuffer>();
private static ThreadLocal<FloatBuffer> biasBuff = new ThreadLocal<FloatBuffer>();
private static ThreadLocal<FloatBuffer> filterBuff = new ThreadLocal<FloatBuffer>();
@@ -65,32 +68,45 @@ public class LibMatrixNative
k = k <= 0 ? NativeHelper.getMaxNumThreads() : k;
// check inputs / outputs
- if (m1.isEmptyBlock() || m2.isEmptyBlock()) {
+ if (m1.isEmptyBlock(false) || m2.isEmptyBlock(false)){
ret.setNonZeros(0);
if(examSparsity)
ret.examSparsity(); // turn empty dense into sparse
return;
}
- if (NativeHelper.isNativeLibraryLoaded() &&
- !isMatMultMemoryBound(m1.rlen, m1.clen, m2.clen) && !m1.isInSparseFormat() && !m2.isInSparseFormat()) {
+
+ if (NativeHelper.isNativeLibraryLoaded()
+ && !isMatMultMemoryBound(m1.rlen, m1.clen, m2.clen)
+ && !m1.isInSparseFormat() && !m2.isInSparseFormat())
+ {
ret.sparse = false;
ret.allocateDenseBlock();
long start = DMLScript.STATISTICS ? System.nanoTime() : 0;
- if (NativeHelper.matrixMultDenseDense(m1.getDenseBlockValues(), m2.getDenseBlockValues(),
- ret.getDenseBlockValues(), m1.getNumRows(), m1.getNumColumns(), m2.getNumColumns(), k)) {
+ boolean rccode = false;
+ if( isSinglePrecision() ) {
+ FloatBuffer fin1 = toFloatBuffer(m1.getDenseBlockValues(), inBuff, true);
+ FloatBuffer fin2 = toFloatBuffer(m2.getDenseBlockValues(), filterBuff, true);
+ FloatBuffer fout = toFloatBuffer(ret.getDenseBlockValues(), outBuff, false);
+ rccode = NativeHelper.smmdd(fin1, fin2, fout,
+ m1.getNumRows(), m1.getNumColumns(), m2.getNumColumns(), k);
+ fromFloatBuffer(outBuff.get(), ret.getDenseBlockValues());
+ }
+ else {
+ rccode = NativeHelper.dmmdd(m1.getDenseBlockValues(), m2.getDenseBlockValues(),
+ ret.getDenseBlockValues(), m1.getNumRows(), m1.getNumColumns(), m2.getNumColumns(), k);
+ }
+ if (rccode) {
if(DMLScript.STATISTICS) {
Statistics.nativeLibMatrixMultTime += System.nanoTime() - start;
Statistics.numNativeLibMatrixMultCalls.increment();
}
ret.recomputeNonZeros();
- // post-processing (nnz maintained in parallel)
if(examSparsity)
ret.examSparsity();
return;
- } else {
- // Else fall back to Java
- Statistics.incrementNativeFailuresCounter();
}
+ //else record failure and fallback to java
+ Statistics.incrementNativeFailuresCounter();
}
if (k == 1)
LibMatrixMult.matrixMult(m1, m2, ret, examSparsity);
@@ -135,14 +151,9 @@ public class LibMatrixNative
else {
if(params.bias.isInSparseFormat())
params.bias.sparseToDense(); // Bias matrix is usually extremely small
- boolean singlePrecision = ConfigurationManager.getDMLConfig()
- .getTextValue(DMLConfig.FLOATING_POINT_PRECISION).equals("single");
long start = DMLScript.STATISTICS ? System.nanoTime() : 0;
int nnz = -1;
- if( singlePrecision ) {
- //note: since we anyway have to convert from double to float, we use
- //preallocated direct buffers (with thread-local reuse and resizing on demand)
- //to ensure there are no additional copies created by the transfer over jni
+ if( isSinglePrecision() ) {
FloatBuffer finput = toFloatBuffer(input.getDenseBlockValues(), inBuff, true);
FloatBuffer fbias = toFloatBuffer(params.bias.getDenseBlockValues(), biasBuff, true);
FloatBuffer ffilter = toFloatBuffer(filter.getDenseBlockValues(), filterBuff, true);
@@ -260,6 +271,11 @@ public class LibMatrixNative
LibMatrixDNN.conv2dBackwardData(filter, dout, outputBlock, params);
}
+ private static boolean isSinglePrecision() {
+ return ConfigurationManager.getDMLConfig()
+ .getTextValue(DMLConfig.FLOATING_POINT_PRECISION).equals("single");
+ }
+
private static FloatBuffer toFloatBuffer(double[] input, ThreadLocal<FloatBuffer> buff, boolean copy) {
//maintain thread-local buffer (resized on demand)
FloatBuffer ret = buff.get();
http://git-wip-us.apache.org/repos/asf/systemml/blob/c95019fd/src/main/java/org/apache/sysml/utils/NativeHelper.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/utils/NativeHelper.java b/src/main/java/org/apache/sysml/utils/NativeHelper.java
index 6ec990d..25e9847 100644
--- a/src/main/java/org/apache/sysml/utils/NativeHelper.java
+++ b/src/main/java/org/apache/sysml/utils/NativeHelper.java
@@ -324,7 +324,12 @@ public class NativeHelper {
}
// TODO: Add pmm, wsloss, mmchain, etc.
- public static native boolean matrixMultDenseDense(double [] m1, double [] m2, double [] ret, int m1rlen, int m1clen, int m2clen, int numThreads);
+
+ //double-precision matrix multiply dense-dense
+ public static native boolean dmmdd(double [] m1, double [] m2, double [] ret, int m1rlen, int m1clen, int m2clen, int numThreads);
+ //single-precision matrix multiply dense-dense
+ public static native boolean smmdd(FloatBuffer m1, FloatBuffer m2, FloatBuffer ret, int m1rlen, int m1clen, int m2clen, int numThreads);
+ //transpose-self matrix multiply
private static native boolean tsmm(double [] m1, double [] ret, int m1rlen, int m1clen, boolean isLeftTranspose, int numThreads);
// ----------------------------------------------------------------------------------------------------------------