You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@systemds.apache.org by ar...@apache.org on 2020/11/16 23:39:05 UTC
[systemds] branch master updated: [SYSTEMDS-2725] NNZ counting for native BLAS
This is an automated email from the ASF dual-hosted git repository.
arnabp20 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/systemds.git
The following commit(s) were added to refs/heads/master by this push:
new b350c04 [SYSTEMDS-2725] NNZ counting for native blas
b350c04 is described below
commit b350c04f53fd249cc4c916093404c5d760055327
Author: Mark Dokter <ma...@dokter.cc>
AuthorDate: Tue Nov 17 00:34:01 2020 +0100
[SYSTEMDS-2725] NNZ counting for native blas
This patch moves the NNZ count into the native BLAS code. This
should improve performance, as the native NNZ
computation leverages an OpenMP reduction.
In addition, this patch adds:
* a lot of code cleanup
* more fallback to java mat mult
---
src/main/cpp/CMakeLists.txt | 10 +-
src/main/cpp/build.sh | 4 +-
src/main/cpp/{libmatrixmult.h => common.h} | 62 ++++----
src/main/cpp/lib/libsystemds_mkl-Linux-x86_64.so | Bin 39416 -> 39384 bytes
src/main/cpp/lib/libsystemds_mkl-Windows-AMD64.dll | Bin 26624 -> 27648 bytes
.../cpp/lib/libsystemds_openblas-Linux-x86_64.so | Bin 43176 -> 43144 bytes
.../cpp/lib/libsystemds_openblas-Windows-AMD64.dll | Bin 28160 -> 140800 bytes
src/main/cpp/libmatrixdnn.cpp | 40 +++---
src/main/cpp/libmatrixdnn.h | 46 +++---
src/main/cpp/libmatrixmult.cpp | 57 ++------
src/main/cpp/libmatrixmult.h | 47 ++----
src/main/cpp/systemds.cpp | 159 ++++++++++-----------
src/main/cpp/systemds.h | 22 +--
.../sysds/runtime/matrix/data/LibMatrixMult.java | 15 +-
.../sysds/runtime/matrix/data/LibMatrixNative.java | 31 ++--
.../java/org/apache/sysds/utils/NativeHelper.java | 24 ++--
16 files changed, 224 insertions(+), 293 deletions(-)
diff --git a/src/main/cpp/CMakeLists.txt b/src/main/cpp/CMakeLists.txt
index 9489f8f..b3de917 100644
--- a/src/main/cpp/CMakeLists.txt
+++ b/src/main/cpp/CMakeLists.txt
@@ -31,7 +31,7 @@ option(USE_OPEN_BLAS "Whether to use OpenBLAS (Defaults to compiling with Intel
option(USE_INTEL_MKL "Whether to use Intel MKL (Defaults to compiling with Intel MKL)" OFF)
# Build a shared libraray
-set(HEADER_FILES libmatrixdnn.h libmatrixmult.h systemds.h)
+set(HEADER_FILES libmatrixdnn.h libmatrixmult.h systemds.h common.h)
set(SOURCE_FILES libmatrixdnn.cpp libmatrixmult.cpp systemds.cpp)
# Build a shared libraray
@@ -55,6 +55,14 @@ if (WIN32)
set(CMAKE_SHARED_LIBRARY_PREFIX lib CACHE INTERNAL "")
endif()
+#ToDo: this won't be necessary once we properly redo mkl versions of conv2d* functions
+if(${CMAKE_SYSTEM_NAME} STREQUAL "Linux")
+ set(CMAKE_CXX_FLAGS "-Wno-narrowing")
+endif()
+if(${CMAKE_SYSTEM_NAME} STREQUAL "Windows")
+ set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /wd4838")
+endif()
+
set(CMAKE_BUILD_TYPE Release)
if (USE_OPEN_BLAS)
diff --git a/src/main/cpp/build.sh b/src/main/cpp/build.sh
index 4378340..df67aba 100755
--- a/src/main/cpp/build.sh
+++ b/src/main/cpp/build.sh
@@ -47,13 +47,13 @@ if ! [ -x "$(command -v patchelf)" ]; then
fi
# configure and compile INTEL MKL
-cmake . -B INTEL -DUSE_INTEL_MKL=ON -DCMAKE_BUILD_TYPE=Release -DCMAKE_C_COMPILER=gcc -DCMAKE_CXX_COMPILER=g++ -DCMAKE_CXX_FLAGS="-DUSE_GNU_THREADING -m64"
+cmake . -B INTEL -DUSE_INTEL_MKL=ON -DCMAKE_BUILD_TYPE=Release -DCMAKE_CXX_COMPILER=g++ -DCMAKE_CXX_FLAGS="-DUSE_GNU_THREADING -m64"
cmake --build INTEL --target install --config Release
patchelf --add-needed libmkl_rt.so lib/libsystemds_mkl-Linux-x86_64.so
rm -R INTEL
# configure and compile OPENBLAS
-cmake . -B OPENBLAS -DUSE_OPEN_BLAS=ON -DCMAKE_BUILD_TYPE=Release -DCMAKE_C_COMPILER=gcc -DCMAKE_CXX_COMPILER=g++ -DCMAKE_CXX_FLAGS="-m64"
+cmake . -B OPENBLAS -DUSE_OPEN_BLAS=ON -DCMAKE_BUILD_TYPE=Release -DCMAKE_CXX_COMPILER=g++ -DCMAKE_CXX_FLAGS="-m64"
cmake --build OPENBLAS --target install --config Release
patchelf --add-needed libopenblas.so.0 lib/libsystemds_openblas-Linux-x86_64.so
rm -R OPENBLAS
diff --git a/src/main/cpp/libmatrixmult.h b/src/main/cpp/common.h
similarity index 51%
copy from src/main/cpp/libmatrixmult.h
copy to src/main/cpp/common.h
index a4d33ad..34f9801 100644
--- a/src/main/cpp/libmatrixmult.h
+++ b/src/main/cpp/common.h
@@ -17,31 +17,18 @@
* under the License.
*/
-#ifndef _libmatrixmult_h
-#define _libmatrixmult_h
-
-#define MAX(x, y) (((x) > (y)) ? (x) : (y))
-#define MIN(x, y) (((x) < (y)) ? (x) : (y))
-
-// *****************************************************************
-// We support Intel MKL (recommended) or OpenBLAS.
-// These flags are used for conditional compilation with mkl and openblas
-// #define USE_INTEL_MKL
-// #define USE_GNU_THREADING
-// #define USE_OPEN_BLAS
-// *****************************************************************
-
-//#ifdef __cplusplus
-// extern "C" {
-//#endif
-//#ifdef __cplusplus
-//}
-//#endif
+#pragma once
+#ifndef COMMON_H
+#define COMMON_H
// Since we call cblas_dgemm in openmp for loop,
// we call "extension" APIs for setting the number of threads.
#ifdef USE_INTEL_MKL
#include <mkl.h>
+ #if INTEL_MKL_VERSION < 20170000
+ // Will throw an error at development time in non-standard settings
+ PLEASE DONOT COMPILE SHARED LIBRARIES WITH OLDER MKL VERSIONS
+ #endif
#include <mkl_service.h>
extern "C" void mkl_set_num_threads(int numThreads);
#else
@@ -49,16 +36,27 @@ extern "C" void mkl_set_num_threads(int numThreads);
extern "C" void openblas_set_num_threads(int numThreads);
#endif
-void setNumThreadsForBLAS(int numThreads);
-
-// Multiplies two matrices m1Ptr and m2Ptr in row-major format of shape
-// (m1rlen, m1clen) and (m1clen, m2clen)
-void dmatmult(double *m1Ptr, double *m2Ptr, double *retPtr, int m, int k, int n,
- int numThreads);
-void smatmult(float *m1Ptr, float *m2Ptr, float *retPtr, int m, int k, int n,
- int numThreads);
-
-void tsmm(double *m1Ptr, double *retPtr, int m1rlen, int m1clen,
- bool isLeftTrans, int numThreads);
-
+template<class FP>
+size_t computeNNZ(FP* arr, int limit) {
+ size_t nnz = 0;
+#ifndef USE_INTEL_MKL
+#pragma omp parallel for reduction(+: nnz)
+#endif
+ for(int i=0; i<limit; i++)
+ nnz += (arr[i]!=0) ? 1 : 0;
+ return nnz;
+}
+
+static int SYSDS_CURRENT_NUM_THREADS = -1;
+static void setNumThreadsForBLAS(int numThreads) {
+ if (SYSDS_CURRENT_NUM_THREADS != numThreads) {
+#ifdef USE_OPEN_BLAS
+ openblas_set_num_threads(numThreads);
+#else
+ mkl_set_num_threads(numThreads);
#endif
+ SYSDS_CURRENT_NUM_THREADS = numThreads;
+ }
+}
+
+#endif // COMMON_H
diff --git a/src/main/cpp/lib/libsystemds_mkl-Linux-x86_64.so b/src/main/cpp/lib/libsystemds_mkl-Linux-x86_64.so
index 800fe13..f6b495a 100644
Binary files a/src/main/cpp/lib/libsystemds_mkl-Linux-x86_64.so and b/src/main/cpp/lib/libsystemds_mkl-Linux-x86_64.so differ
diff --git a/src/main/cpp/lib/libsystemds_mkl-Windows-AMD64.dll b/src/main/cpp/lib/libsystemds_mkl-Windows-AMD64.dll
index 2960551..b52adbe 100644
Binary files a/src/main/cpp/lib/libsystemds_mkl-Windows-AMD64.dll and b/src/main/cpp/lib/libsystemds_mkl-Windows-AMD64.dll differ
diff --git a/src/main/cpp/lib/libsystemds_openblas-Linux-x86_64.so b/src/main/cpp/lib/libsystemds_openblas-Linux-x86_64.so
index 6e45343..e953e7f 100644
Binary files a/src/main/cpp/lib/libsystemds_openblas-Linux-x86_64.so and b/src/main/cpp/lib/libsystemds_openblas-Linux-x86_64.so differ
diff --git a/src/main/cpp/lib/libsystemds_openblas-Windows-AMD64.dll b/src/main/cpp/lib/libsystemds_openblas-Windows-AMD64.dll
index a39d40c..6fe9676 100644
Binary files a/src/main/cpp/lib/libsystemds_openblas-Windows-AMD64.dll and b/src/main/cpp/lib/libsystemds_openblas-Windows-AMD64.dll differ
diff --git a/src/main/cpp/libmatrixdnn.cpp b/src/main/cpp/libmatrixdnn.cpp
index 3e3e2f3..da6975c 100644
--- a/src/main/cpp/libmatrixdnn.cpp
+++ b/src/main/cpp/libmatrixdnn.cpp
@@ -17,30 +17,21 @@
* under the License.
*/
-#include "libmatrixmult.h"
-#include "libmatrixdnn.h"
-#include <cstdlib>
-#include <iostream>
-#include <cstdio>
-#include <cmath>
+#include "common.h"
#include <cstring>
+#include <iostream>
+#include "libmatrixdnn.h"
+#include "libmatrixmult.h"
+
+
#ifdef USE_INTEL_MKL
- #include "mkl_dnn.h"
+#include "mkl_dnn.h"
#else
- #include "omp.h"
-#endif
-
-template<class FP> int computeNNZ(FP* arr, int limit) {
- int nnz = 0;
-#ifndef USE_INTEL_MKL
- #pragma omp parallel for reduction(+: nnz)
+#include "omp.h"
#endif
- for(int i=0; i<limit; i++)
- nnz += (arr[i]!=0) ? 1 : 0;
- return nnz;
-}
-template<class FP> void biasAdd(FP* bias, FP* output, int K, int PQ) {
+template<class FP>
+void biasAdd(FP* bias, FP* output, int K, int PQ) {
for(int k = 0, index=0; k < K; k++)
for(int pq = 0; pq < PQ; pq++, index++)
output[index] += bias[k];
@@ -91,7 +82,8 @@ void col2im(double* inputArray, double* outputArray, int N, int C, int H, int W,
}
}
-template<class FP> void im2col(FP* inputArray, FP* outputArray, int N, int C, int H, int W,
+template<class FP>
+void im2col(FP* inputArray, FP* outputArray, int N, int C, int H, int W,
int K, int R, int S, int stride_h, int stride_w, int pad_h, int pad_w, int P, int Q) {
int CRS = C * R * S;
std::size_t size = Q * sizeof(FP);
@@ -151,7 +143,7 @@ bool MKL_DNN_ERROR(dnnError_t code) {
}
#endif
-int conv2dBackwardFilterDense(double* inputPtr, double* doutPtr, double* retPtr,
+size_t conv2dBackwardFilterDense(double* inputPtr, double* doutPtr, double* retPtr,
int N, int C, int H, int W, int K, int R, int S,
int stride_h, int stride_w, int pad_h, int pad_w, int P, int Q, int numThreads) {
int CRS = C*R*S;
@@ -244,7 +236,7 @@ int conv2dBackwardFilterDense(double* inputPtr, double* doutPtr, double* retPtr,
return computeNNZ<double>(retPtr, K*CRS);
}
-int conv2dBackwardDataDense(double* filterPtr, double* doutPtr, double* retPtr, int N, int C, int H, int W, int K, int R, int S,
+size_t conv2dBackwardDataDense(double* filterPtr, double* doutPtr, double* retPtr, int N, int C, int H, int W, int K, int R, int S,
int stride_h, int stride_w, int pad_h, int pad_w, int P, int Q, int numThreads) {
int CHW = C * H * W;
#ifdef USE_INTEL_MKL
@@ -370,7 +362,7 @@ void conv2dBackwardFilterSparseDense(int apos, int alen, int* aix, double* avals
delete [] temp1;
}
-int dconv2dBiasAddDense(double* inputPtr, double* biasPtr, double* filterPtr, double* retPtr,
+size_t dconv2dBiasAddDense(double* inputPtr, double* biasPtr, double* filterPtr, double* retPtr,
int N, int C, int H, int W, int K, int R, int S,
int stride_h, int stride_w, int pad_h, int pad_w, int P, int Q, bool addBias, int numThreads) {
int KPQ = K * P * Q;
@@ -446,7 +438,7 @@ int dconv2dBiasAddDense(double* inputPtr, double* biasPtr, double* filterPtr, do
#endif
}
-int sconv2dBiasAddDense(float* inputPtr, float* biasPtr, float* filterPtr, float* retPtr,
+size_t sconv2dBiasAddDense(float* inputPtr, float* biasPtr, float* filterPtr, float* retPtr,
int N, int C, int H, int W, int K, int R, int S,
int stride_h, int stride_w, int pad_h, int pad_w, int P, int Q, bool addBias, int numThreads) {
int KPQ = K * P * Q;
diff --git a/src/main/cpp/libmatrixdnn.h b/src/main/cpp/libmatrixdnn.h
index 3f457b0..f3fef37 100644
--- a/src/main/cpp/libmatrixdnn.h
+++ b/src/main/cpp/libmatrixdnn.h
@@ -16,36 +16,34 @@
* specific language governing permissions and limitations
* under the License.
*/
-
-#ifndef _libmatrixdnn_h
-#define _libmatrixdnn_h
-#ifdef USE_INTEL_MKL
- #include <mkl.h>
- #if INTEL_MKL_VERSION < 20170000
- // Will throw an error at development time in non-standard settings
- PLEASE DONOT COMPILE SHARED LIBRARIES WITH OLDER MKL VERSIONS
- #endif
-#endif
+#pragma once
+#ifndef LIBMATRIXDNN_H
+#define LIBMATRIXDNN_H
-int conv2dBackwardFilterDense(double* inputPtr, double* doutPtr, double* retPtr, int N, int C, int H, int W, int K, int R, int S,
- int stride_h, int stride_w, int pad_h, int pad_w, int P, int Q, int numThreads);
+#include <cstddef>
-int conv2dBackwardDataDense(double* filterPtr, double* doutPtr, double* retPtr, int N, int C, int H, int W, int K, int R, int S,
- int stride_h, int stride_w, int pad_h, int pad_w, int P, int Q, int numThreads);
+size_t conv2dBackwardFilterDense(double* inputPtr, double* doutPtr, double* retPtr, int N, int C, int H, int W, int K,
+ int R, int S, int stride_h, int stride_w, int pad_h, int pad_w, int P, int Q,
+ int numThreads);
-int dconv2dBiasAddDense(double* inputPtr, double* biasPtr, double* filterPtr, double* retPtr,
- int N, int C, int H, int W, int K, int R, int S,
- int stride_h, int stride_w, int pad_h, int pad_w, int P, int Q, bool addBias, int numThreads);
+size_t conv2dBackwardDataDense(double* filterPtr, double* doutPtr, double* retPtr, int N, int C, int H, int W, int K,
+ int R, int S, int stride_h, int stride_w, int pad_h, int pad_w, int P, int Q,
+ int numThreads);
-int sconv2dBiasAddDense(float* inputPtr, float* biasPtr, float* filterPtr, float* retPtr,
- int N, int C, int H, int W, int K, int R, int S,
- int stride_h, int stride_w, int pad_h, int pad_w, int P, int Q, bool addBias, int numThreads);
+size_t dconv2dBiasAddDense(double* inputPtr, double* biasPtr, double* filterPtr, double* retPtr, int N, int C, int H,
+ int W, int K, int R, int S,int stride_h, int stride_w, int pad_h, int pad_w, int P, int Q,
+ bool addBias, int numThreads);
+
+size_t sconv2dBiasAddDense(float* inputPtr, float* biasPtr, float* filterPtr, float* retPtr, int N, int C, int H, int W,
+ int K, int R, int S, int stride_h, int stride_w, int pad_h, int pad_w, int P, int Q,
+ bool addBias, int numThreads);
void conv2dSparse(int apos, int alen, int* aix, double* avals, double* filter, double* ret, int N, int C, int H, int W,
- int K, int R, int S, int stride_h, int stride_w, int pad_h, int pad_w, int P, int Q, int numThreads);
+ int K, int R, int S, int stride_h, int stride_w, int pad_h, int pad_w, int P, int Q, int numThreads);
-void conv2dBackwardFilterSparseDense(int apos, int alen, int* aix, double* avals, double* rotatedDoutPtr, double* ret, int N, int C, int H, int W,
- int K, int R, int S, int stride_h, int stride_w, int pad_h, int pad_w, int P, int Q, int numThreads);
+void conv2dBackwardFilterSparseDense(int apos, int alen, int* aix, double* avals, double* rotatedDoutPtr, double* ret,
+ int N, int C, int H, int W, int K, int R, int S, int stride_h, int stride_w,
+ int pad_h, int pad_w, int P, int Q, int numThreads);
-#endif
\ No newline at end of file
+#endif // LIBMATRIXDNN_H
diff --git a/src/main/cpp/libmatrixmult.cpp b/src/main/cpp/libmatrixmult.cpp
index c0c909a..0d38511 100644
--- a/src/main/cpp/libmatrixmult.cpp
+++ b/src/main/cpp/libmatrixmult.cpp
@@ -17,71 +17,42 @@
* under the License.
*/
+#include "common.h"
#include "libmatrixmult.h"
-#include "omp.h"
-#include <cmath>
-#include <cstdlib>
-#ifdef USE_OPEN_BLAS
-#include <cblas.h>
-#else
-#include <mkl_service.h>
-#endif
-
-static int SYSDS_CURRENT_NUM_THREADS = -1;
-void setNumThreadsForBLAS(int numThreads) {
- if (SYSDS_CURRENT_NUM_THREADS != numThreads) {
-#ifdef USE_OPEN_BLAS
- openblas_set_num_threads(numThreads);
-#else
- mkl_set_num_threads(numThreads);
-#endif
- SYSDS_CURRENT_NUM_THREADS = numThreads;
- }
-}
-
-void dmatmult(double *m1Ptr, double *m2Ptr, double *retPtr, int m, int k, int n,
- int numThreads) {
+void dmatmult(double *m1Ptr, double *m2Ptr, double *retPtr, int m, int k, int n, int numThreads) {
// BLAS routine dispatch according to input dimension sizes (we don't use
// cblas_dgemv with CblasColMajor for matrix-vector because it was generally
// slower than dgemm)
setNumThreadsForBLAS(numThreads);
if (m == 1 && n == 1) // VV
- retPtr[0] = cblas_ddot(k, m1Ptr, 1, m2Ptr, 1);
+ retPtr[0] = cblas_ddot(k, m1Ptr, 1, m2Ptr, 1);
else if (n == 1) // MV
- cblas_dgemv(CblasRowMajor, CblasNoTrans, m, k, 1, m1Ptr, k, m2Ptr, 1, 0,
- retPtr, 1);
+ cblas_dgemv(CblasRowMajor, CblasNoTrans, m, k, 1, m1Ptr, k, m2Ptr, 1, 0, retPtr, 1);
else // MM
- cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, m, n, k, 1, m1Ptr, k,
- m2Ptr, n, 0, retPtr, n);
+ cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, m, n, k, 1, m1Ptr, k, m2Ptr, n, 0, retPtr, n);
}
-void smatmult(float *m1Ptr, float *m2Ptr, float *retPtr, int m, int k, int n,
- int numThreads) {
+void smatmult(float *m1Ptr, float *m2Ptr, float *retPtr, int m, int k, int n, int numThreads) {
// BLAS routine dispatch according to input dimension sizes (we don't use
// cblas_sgemv with CblasColMajor for matrix-vector because it was generally
// slower than sgemm)
setNumThreadsForBLAS(numThreads);
if (m == 1 && n == 1) // VV
- retPtr[0] = cblas_sdot(k, m1Ptr, 1, m2Ptr, 1);
+ retPtr[0] = cblas_sdot(k, m1Ptr, 1, m2Ptr, 1);
else if (n == 1) // MV
- cblas_sgemv(CblasRowMajor, CblasNoTrans, m, k, 1, m1Ptr, k, m2Ptr, 1, 0,
- retPtr, 1);
+ cblas_sgemv(CblasRowMajor, CblasNoTrans, m, k, 1, m1Ptr, k, m2Ptr, 1, 0, retPtr, 1);
else // MM
- cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, m, n, k, 1, m1Ptr, k,
- m2Ptr, n, 0, retPtr, n);
+ cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, m, n, k, 1, m1Ptr, k, m2Ptr, n, 0, retPtr, n);
}
-void tsmm(double *m1Ptr, double *retPtr, int m1rlen, int m1clen, bool leftTrans,
- int numThreads) {
+void tsmm(double *m1Ptr, double *retPtr, int m1rlen, int m1clen, bool leftTrans, int numThreads) {
setNumThreadsForBLAS(numThreads);
- if ((leftTrans && m1clen == 1) || (!leftTrans && m1rlen == 1)) {
- retPtr[0] = cblas_ddot(leftTrans ? m1rlen : m1clen, m1Ptr, 1, m1Ptr, 1);
- } else { // general case
+ if ((leftTrans && m1clen == 1) || (!leftTrans && m1rlen == 1))
+ retPtr[0] = cblas_ddot(leftTrans ? m1rlen : m1clen, m1Ptr, 1, m1Ptr, 1);
+ else { // general case
int n = leftTrans ? m1clen : m1rlen;
int k = leftTrans ? m1rlen : m1clen;
- cblas_dsyrk(CblasRowMajor, CblasUpper,
- leftTrans ? CblasTrans : CblasNoTrans, n, k, 1, m1Ptr, n, 0,
- retPtr, n);
+ cblas_dsyrk(CblasRowMajor, CblasUpper, leftTrans ? CblasTrans : CblasNoTrans, n, k, 1, m1Ptr, n, 0, retPtr, n);
}
}
diff --git a/src/main/cpp/libmatrixmult.h b/src/main/cpp/libmatrixmult.h
index a4d33ad..f38155d 100644
--- a/src/main/cpp/libmatrixmult.h
+++ b/src/main/cpp/libmatrixmult.h
@@ -17,48 +17,19 @@
* under the License.
*/
-#ifndef _libmatrixmult_h
-#define _libmatrixmult_h
+#ifndef LIBMATRIXMULT_H
+#define LIBMATRIXMULT_H
#define MAX(x, y) (((x) > (y)) ? (x) : (y))
#define MIN(x, y) (((x) < (y)) ? (x) : (y))
-// *****************************************************************
-// We support Intel MKL (recommended) or OpenBLAS.
-// These flags are used for conditional compilation with mkl and openblas
-// #define USE_INTEL_MKL
-// #define USE_GNU_THREADING
-// #define USE_OPEN_BLAS
-// *****************************************************************
+// Multiplies two matrices m1Ptr and m2Ptr in row-major format of shape (m1rlen, m1clen) and (m1clen, m2clen)
+void dmatmult(double *m1Ptr, double *m2Ptr, double *retPtr, int m, int k, int n, int numThreads);
-//#ifdef __cplusplus
-// extern "C" {
-//#endif
-//#ifdef __cplusplus
-//}
-//#endif
+// Same matrix multiply for single precision
+void smatmult(float *m1Ptr, float *m2Ptr, float *retPtr, int m, int k, int n, int numThreads);
-// Since we call cblas_dgemm in openmp for loop,
-// we call "extension" APIs for setting the number of threads.
-#ifdef USE_INTEL_MKL
-#include <mkl.h>
-#include <mkl_service.h>
-extern "C" void mkl_set_num_threads(int numThreads);
-#else
-#include <cblas.h>
-extern "C" void openblas_set_num_threads(int numThreads);
-#endif
+// Matrix transpose
+void tsmm(double *m1Ptr, double *retPtr, int m1rlen, int m1clen, bool isLeftTrans, int numThreads);
-void setNumThreadsForBLAS(int numThreads);
-
-// Multiplies two matrices m1Ptr and m2Ptr in row-major format of shape
-// (m1rlen, m1clen) and (m1clen, m2clen)
-void dmatmult(double *m1Ptr, double *m2Ptr, double *retPtr, int m, int k, int n,
- int numThreads);
-void smatmult(float *m1Ptr, float *m2Ptr, float *retPtr, int m, int k, int n,
- int numThreads);
-
-void tsmm(double *m1Ptr, double *retPtr, int m1rlen, int m1clen,
- bool isLeftTrans, int numThreads);
-
-#endif
+#endif // LIBMATRIXMULT_H
diff --git a/src/main/cpp/systemds.cpp b/src/main/cpp/systemds.cpp
index 193e847..47b6e8f 100644
--- a/src/main/cpp/systemds.cpp
+++ b/src/main/cpp/systemds.cpp
@@ -16,28 +16,11 @@
* specific language governing permissions and limitations
* under the License.
*/
-
-#include "systemds.h"
-#include "libmatrixmult.h"
-#include "libmatrixdnn.h"
-
-// Linux:
-// g++ -o lib/libsystemds_mkl-Linux-x86_64.so *.cpp -I$JAVA_HOME/include -I$MKLROOT/include -I$JAVA_HOME/include/linux -lmkl_rt -lpthread -lm -ldl -DUSE_INTEL_MKL -DUSE_MKL_DNN -L$MKLROOT/lib/intel64 -m64 -O3 -shared -fPIC
-// g++ -o lib/libsystemds_openblas-Linux-x86_64.so *.cpp -I$JAVA_HOME/include -I$JAVA_HOME/include/linux -lopenblas -lpthread -lm -ldl -DUSE_OPEN_BLAS -I/opt/OpenBLAS/include/ -L/opt/OpenBLAS/lib/ -fopenmp -O3 -shared -fPIC
-// Mac OSX:
-// g++ -o libsystemds_mkl-linux-x86_64.dylib *.cpp -I$JAVA_HOME/include -I$MKLROOT/include -I$JAVA_HOME/include/linux -lmkl_rt -lpthread -lm -ldl -DUSE_INTEL_MKL -DUSE_GNU_THREADING -L$MKLROOT/lib/intel64 -m64 -fopenmp -O3 -dynamiclib -fPIC -undefined dynamic_lookup
-// g++ -o libsystemds_openblas-linux-x86_64.dylib *.cpp -I$JAVA_HOME/include -I$JAVA_HOME/include/linux -lopenblas -lpthread -lm -ldl -DUSE_OPEN_BLAS -L/opt/OpenBLAS/lib/ -fopenmp -O3 -dynamiclib -fPIC -undefined dynamic_lookup
-
-// Windows MKL:
-// "C:\\Program Files (x86)\\Microsoft Visual Studio 12.0\\VC\\vcvarsall.bat" amd64
-// "%MKLROOT%"\bin\mklvars.bat intel64
-// set JAVA_HOME=C:\Program Files\Java\jdk1.8.0_25
-// cl *.cpp -I. -I"%MKLROOT%"\include -I"%JAVA_HOME%"\include -I"%JAVA_HOME%"\include\win32 -DUSE_INTEL_MKL -Fesystemds_mkl-windows-x86_64.dll -MD -LD "%MKLROOT%"\lib\intel64_win\mkl_intel_thread_dll.lib "%MKLROOT%"\lib\intel64_win\mkl_core_dll.lib "%MKLROOT%"\lib\intel64_win\mkl_intel_lp64_dll.lib
-// Windows OpenBLAS:
-// "C:\\Program Files (x86)\\Microsoft Visual Studio 12.0\\VC\\vcvarsall.bat" amd64
-// set JAVA_HOME=C:\Program Files\Java\jdk1.8.0_25
-// cl *.cpp -I. -I"%OPENBLASROOT%"\include -I"%JAVA_HOME%"\include -I"%JAVA_HOME%"\include\win32 -DUSE_OPEN_BLAS -Fesystemds_openblas-windows-x86_64.dll -MD -LD "%OPENBLASROOT%"\lib\libopenblas.dll.a
+#include "common.h"
+#include "libmatrixdnn.h"
+#include "libmatrixmult.h"
+#include "systemds.h"
// Results from Matrix-vector/vector-matrix 1M x 1K, dense show that GetDoubleArrayElements creates a copy on OpenJDK.
@@ -67,14 +50,12 @@
// ( maxThreads != -1 && ((int)numThreads) == maxThreads ? env->ReleasePrimitiveArrayCritical(input, inputPtr, 0) : env->ReleaseDoubleArrayElements(input, inputPtr, 0) )
// -------------------------------------------------------------------
-
-int maxThreads = -1;
JNIEXPORT void JNICALL Java_org_apache_sysds_utils_NativeHelper_setMaxNumThreads
(JNIEnv *, jclass, jint jmaxThreads) {
- maxThreads = (int) jmaxThreads;
+ setNumThreadsForBLAS(jmaxThreads);
}
-JNIEXPORT jboolean JNICALL Java_org_apache_sysds_utils_NativeHelper_dmmdd(
+JNIEXPORT jlong JNICALL Java_org_apache_sysds_utils_NativeHelper_dmmdd(
JNIEnv* env, jclass cls, jdoubleArray m1, jdoubleArray m2, jdoubleArray ret,
jint m1rlen, jint m1clen, jint m2clen, jint numThreads)
{
@@ -82,17 +63,19 @@ JNIEXPORT jboolean JNICALL Java_org_apache_sysds_utils_NativeHelper_dmmdd(
double* m2Ptr = GET_DOUBLE_ARRAY(env, m2, numThreads);
double* retPtr = GET_DOUBLE_ARRAY(env, ret, numThreads);
if(m1Ptr == NULL || m2Ptr == NULL || retPtr == NULL)
- return (jboolean) false;
+ return -1;
dmatmult(m1Ptr, m2Ptr, retPtr, (int)m1rlen, (int)m1clen, (int)m2clen, (int)numThreads);
+ size_t nnz = computeNNZ<double>(retPtr, m1rlen * m2clen);
RELEASE_INPUT_ARRAY(env, m1, m1Ptr, numThreads);
RELEASE_INPUT_ARRAY(env, m2, m2Ptr, numThreads);
RELEASE_ARRAY(env, ret, retPtr, numThreads);
- return (jboolean) true;
+
+ return static_cast<jlong>(nnz);
}
-JNIEXPORT jboolean JNICALL Java_org_apache_sysds_utils_NativeHelper_smmdd(
+JNIEXPORT jlong JNICALL Java_org_apache_sysds_utils_NativeHelper_smmdd(
JNIEnv* env, jclass cls, jobject m1, jobject m2, jobject ret,
jint m1rlen, jint m1clen, jint m2clen, jint numThreads)
{
@@ -100,38 +83,40 @@ JNIEXPORT jboolean JNICALL Java_org_apache_sysds_utils_NativeHelper_smmdd(
float* m2Ptr = (float*) env->GetDirectBufferAddress(m2);
float* retPtr = (float*) env->GetDirectBufferAddress(ret);
if(m1Ptr == NULL || m2Ptr == NULL || retPtr == NULL)
- return (jboolean) false;
+ return -1;
smatmult(m1Ptr, m2Ptr, retPtr, (int)m1rlen, (int)m1clen, (int)m2clen, (int)numThreads);
- return (jboolean) true;
+ return static_cast<jlong>(computeNNZ<float>(retPtr, m1rlen * m2clen));
}
-JNIEXPORT jboolean JNICALL Java_org_apache_sysds_utils_NativeHelper_tsmm
+JNIEXPORT jlong JNICALL Java_org_apache_sysds_utils_NativeHelper_tsmm
(JNIEnv * env, jclass cls, jdoubleArray m1, jdoubleArray ret, jint m1rlen, jint m1clen, jboolean leftTrans, jint numThreads) {
double* m1Ptr = GET_DOUBLE_ARRAY(env, m1, numThreads);
double* retPtr = GET_DOUBLE_ARRAY(env, ret, numThreads);
if(m1Ptr == NULL || retPtr == NULL)
- return (jboolean) false;
+ return -1;
tsmm(m1Ptr, retPtr, (int)m1rlen, (int)m1clen, (bool)leftTrans, (int)numThreads);
-
+ size_t nnz = computeNNZ<double>(retPtr, m1rlen * m1clen);
+
RELEASE_INPUT_ARRAY(env, m1, m1Ptr, numThreads);
RELEASE_ARRAY(env, ret, retPtr, numThreads);
- return (jboolean) true;
+
+ return static_cast<jlong>(nnz);
}
JNIEXPORT jboolean JNICALL Java_org_apache_sysds_utils_NativeHelper_conv2dSparse
- (JNIEnv * env, jclass, jint apos, jint alen, jintArray aix, jdoubleArray avals, jdoubleArray filter,
- jdoubleArray ret, jint N, jint C, jint H, jint W, jint K, jint R, jint S,
- jint stride_h, jint stride_w, jint pad_h, jint pad_w, jint P, jint Q, jint numThreads) {
+ (JNIEnv * env, jclass, jint apos, jint alen, jintArray aix, jdoubleArray avals, jdoubleArray filter,
+ jdoubleArray ret, jint N, jint C, jint H, jint W, jint K, jint R, jint S,
+ jint stride_h, jint stride_w, jint pad_h, jint pad_w, jint P, jint Q, jint numThreads) {
int* aixPtr = ((int*)env->GetPrimitiveArrayCritical(aix, NULL));
double* avalsPtr = GET_DOUBLE_ARRAY(env, avals, numThreads);
double* filterPtr = GET_DOUBLE_ARRAY(env, filter, numThreads);
double* retPtr = GET_DOUBLE_ARRAY(env, ret, numThreads);
conv2dSparse((int)apos, (int)alen, aixPtr, avalsPtr, filterPtr, retPtr, (int)N, (int)C, (int)H, (int)W,
- (int)K, (int)R, (int)S, (int)stride_h, (int)stride_w, (int)pad_h, (int)pad_w, (int)P, (int)Q, (int)numThreads);
+ (int)K, (int)R, (int)S, (int)stride_h, (int)stride_w, (int)pad_h, (int)pad_w, (int)P, (int)Q, (int)numThreads);
RELEASE_INPUT_ARRAY(env, avals, avalsPtr, numThreads);
RELEASE_INPUT_ARRAY(env, filter, filterPtr, numThreads);
@@ -141,16 +126,16 @@ JNIEXPORT jboolean JNICALL Java_org_apache_sysds_utils_NativeHelper_conv2dSparse
}
JNIEXPORT jboolean JNICALL Java_org_apache_sysds_utils_NativeHelper_conv2dBackwardFilterSparseDense
- (JNIEnv * env, jclass, jint apos, jint alen, jintArray aix, jdoubleArray avals, jdoubleArray dout,
- jdoubleArray ret, jint N, jint C, jint H, jint W, jint K, jint R, jint S,
- jint stride_h, jint stride_w, jint pad_h, jint pad_w, jint P, jint Q, jint numThreads) {
+ (JNIEnv * env, jclass, jint apos, jint alen, jintArray aix, jdoubleArray avals, jdoubleArray dout,
+ jdoubleArray ret, jint N, jint C, jint H, jint W, jint K, jint R, jint S,
+ jint stride_h, jint stride_w, jint pad_h, jint pad_w, jint P, jint Q, jint numThreads) {
int* aixPtr = ((int*)env->GetPrimitiveArrayCritical(aix, NULL));
double* avalsPtr = GET_DOUBLE_ARRAY(env, avals, numThreads);
double* doutPtr = GET_DOUBLE_ARRAY(env, dout, numThreads);
double* retPtr = GET_DOUBLE_ARRAY(env, ret, numThreads);
conv2dBackwardFilterSparseDense((int)apos, (int)alen, aixPtr, avalsPtr, doutPtr, retPtr, (int)N, (int)C, (int)H, (int)W,
- (int)K, (int)R, (int)S, (int)stride_h, (int)stride_w, (int)pad_h, (int)pad_w, (int)P, (int)Q, (int)numThreads);
+ (int)K, (int)R, (int)S, (int)stride_h, (int)stride_w, (int)pad_h, (int)pad_w, (int)P, (int)Q, (int)numThreads);
RELEASE_INPUT_ARRAY(env, avals, avalsPtr, numThreads);
RELEASE_INPUT_ARRAY(env, dout, doutPtr, numThreads);
@@ -159,104 +144,106 @@ JNIEXPORT jboolean JNICALL Java_org_apache_sysds_utils_NativeHelper_conv2dBackwa
return (jboolean) true;
}
-JNIEXPORT jint JNICALL Java_org_apache_sysds_utils_NativeHelper_conv2dDense(
- JNIEnv* env, jclass, jdoubleArray input, jdoubleArray filter,
- jdoubleArray ret, jint N, jint C, jint H, jint W, jint K, jint R, jint S,
- jint stride_h, jint stride_w, jint pad_h, jint pad_w, jint P, jint Q, jint numThreads)
+JNIEXPORT jlong JNICALL Java_org_apache_sysds_utils_NativeHelper_conv2dDense(
+ JNIEnv* env, jclass, jdoubleArray input, jdoubleArray filter,
+ jdoubleArray ret, jint N, jint C, jint H, jint W, jint K, jint R, jint S,
+ jint stride_h, jint stride_w, jint pad_h, jint pad_w, jint P, jint Q, jint numThreads)
{
double* inputPtr = GET_DOUBLE_ARRAY(env, input, numThreads);
double* filterPtr = GET_DOUBLE_ARRAY(env, filter, numThreads);
double* retPtr = GET_DOUBLE_ARRAY(env, ret, numThreads);
if(inputPtr == NULL || filterPtr == NULL || retPtr == NULL)
- return (jint) -1;
+ return -1;
- int nnz = dconv2dBiasAddDense(inputPtr, 0, filterPtr, retPtr,
- (int) N, (int) C, (int) H, (int) W, (int) K, (int) R, (int) S,
- (int) stride_h, (int) stride_w, (int) pad_h, (int) pad_w, (int) P, (int) Q, false, (int) numThreads);
+ size_t nnz = dconv2dBiasAddDense(inputPtr, 0, filterPtr, retPtr, (int) N, (int) C, (int) H, (int) W, (int) K,
+ (int) R, (int) S, (int) stride_h, (int) stride_w, (int) pad_h, (int) pad_w, (int) P,
+ (int) Q, false, (int) numThreads);
RELEASE_INPUT_ARRAY(env, input, inputPtr, numThreads);
RELEASE_INPUT_ARRAY(env, filter, filterPtr, numThreads);
RELEASE_ARRAY(env, ret, retPtr, numThreads);
- return (jint) nnz;
+ return static_cast<jlong>(nnz);
}
-JNIEXPORT jint JNICALL Java_org_apache_sysds_utils_NativeHelper_dconv2dBiasAddDense(
- JNIEnv* env, jclass, jdoubleArray input, jdoubleArray bias, jdoubleArray filter,
- jdoubleArray ret, jint N, jint C, jint H, jint W, jint K, jint R, jint S,
- jint stride_h, jint stride_w, jint pad_h, jint pad_w, jint P, jint Q, jint numThreads)
+JNIEXPORT jlong JNICALL Java_org_apache_sysds_utils_NativeHelper_dconv2dBiasAddDense(
+ JNIEnv* env, jclass, jdoubleArray input, jdoubleArray bias, jdoubleArray filter,
+ jdoubleArray ret, jint N, jint C, jint H, jint W, jint K, jint R, jint S,
+ jint stride_h, jint stride_w, jint pad_h, jint pad_w, jint P, jint Q, jint numThreads)
{
double* inputPtr = GET_DOUBLE_ARRAY(env, input, numThreads);
double* biasPtr = GET_DOUBLE_ARRAY(env, bias, numThreads);
double* filterPtr = GET_DOUBLE_ARRAY(env, filter, numThreads);
double* retPtr = GET_DOUBLE_ARRAY(env, ret, numThreads);
if(inputPtr == NULL || biasPtr == NULL || filterPtr == NULL || retPtr == NULL)
- return (jint) -1;
+ return -1;
- int nnz = dconv2dBiasAddDense(inputPtr, biasPtr, filterPtr, retPtr,
- (int) N, (int) C, (int) H, (int) W, (int) K, (int) R, (int) S,
- (int) stride_h, (int) stride_w, (int) pad_h, (int) pad_w, (int) P, (int) Q, true, (int) numThreads);
+ size_t nnz = dconv2dBiasAddDense(inputPtr, biasPtr, filterPtr, retPtr, (int) N, (int) C, (int) H, (int) W, (int) K,
+ (int) R, (int) S, (int) stride_h, (int) stride_w, (int) pad_h, (int) pad_w, (int) P,
+ (int) Q, true, (int) numThreads);
RELEASE_INPUT_ARRAY(env, input, inputPtr, numThreads);
RELEASE_INPUT_ARRAY(env, bias, biasPtr, numThreads);
RELEASE_INPUT_ARRAY(env, filter, filterPtr, numThreads);
RELEASE_ARRAY(env, ret, retPtr, numThreads);
- return (jint) nnz;
+ return static_cast<jlong>(nnz);
}
-JNIEXPORT jint JNICALL Java_org_apache_sysds_utils_NativeHelper_sconv2dBiasAddDense(
- JNIEnv* env, jclass, jobject input, jobject bias, jobject filter,
- jobject ret, jint N, jint C, jint H, jint W, jint K, jint R, jint S,
- jint stride_h, jint stride_w, jint pad_h, jint pad_w, jint P, jint Q, jint numThreads)
+JNIEXPORT jlong JNICALL Java_org_apache_sysds_utils_NativeHelper_sconv2dBiasAddDense(
+ JNIEnv* env, jclass, jobject input, jobject bias, jobject filter,
+ jobject ret, jint N, jint C, jint H, jint W, jint K, jint R, jint S,
+ jint stride_h, jint stride_w, jint pad_h, jint pad_w, jint P, jint Q, jint numThreads)
{
float* inputPtr = (float*) env->GetDirectBufferAddress(input);
float* biasPtr = (float*) env->GetDirectBufferAddress(bias);
float* filterPtr = (float*) env->GetDirectBufferAddress(filter);
float* retPtr = (float*) env->GetDirectBufferAddress(ret);
if(inputPtr == NULL || biasPtr == NULL || filterPtr == NULL || retPtr == NULL)
- return (jint) -1;
+ return -1;
- int nnz = sconv2dBiasAddDense(inputPtr, biasPtr, filterPtr, retPtr,
- (int) N, (int) C, (int) H, (int) W, (int) K, (int) R, (int) S,
- (int) stride_h, (int) stride_w, (int) pad_h, (int) pad_w, (int) P, (int) Q, true, (int) numThreads);
-
- return (jint) nnz;
+ size_t nnz = sconv2dBiasAddDense(inputPtr, biasPtr, filterPtr, retPtr, (int) N, (int) C, (int) H, (int) W, (int) K,
+ (int) R, (int) S, (int) stride_h, (int) stride_w, (int) pad_h, (int) pad_w, (int) P,
+ (int) Q, true, (int) numThreads);
+
+ return static_cast<jlong>(nnz);
}
-JNIEXPORT jint JNICALL Java_org_apache_sysds_utils_NativeHelper_conv2dBackwardDataDense(
- JNIEnv* env, jclass, jdoubleArray filter, jdoubleArray dout,
- jdoubleArray ret, jint N, jint C, jint H, jint W, jint K, jint R, jint S,
- jint stride_h, jint stride_w, jint pad_h, jint pad_w, jint P, jint Q, jint numThreads) {
+JNIEXPORT jlong JNICALL Java_org_apache_sysds_utils_NativeHelper_conv2dBackwardDataDense(
+ JNIEnv* env, jclass, jdoubleArray filter, jdoubleArray dout,
+ jdoubleArray ret, jint N, jint C, jint H, jint W, jint K, jint R, jint S,
+ jint stride_h, jint stride_w, jint pad_h, jint pad_w, jint P, jint Q, jint numThreads) {
double* filterPtr = GET_DOUBLE_ARRAY(env, filter, numThreads);
double* doutPtr = GET_DOUBLE_ARRAY(env, dout, numThreads);
double* retPtr = GET_DOUBLE_ARRAY(env, ret, numThreads);
if(doutPtr == NULL || filterPtr == NULL || retPtr == NULL)
- return (jint) -1;
+ return -1;
- int nnz = conv2dBackwardDataDense(filterPtr, doutPtr, retPtr, (int) N, (int) C, (int) H, (int) W, (int) K, (int) R, (int) S,
- (int) stride_h, (int) stride_w, (int) pad_h, (int) pad_w, (int) P, (int) Q, (int) numThreads);
+ size_t nnz = conv2dBackwardDataDense(filterPtr, doutPtr, retPtr, (int) N, (int) C, (int) H, (int) W, (int) K,
+ (int) R, (int) S, (int) stride_h, (int) stride_w, (int) pad_h, (int) pad_w,
+ (int) P, (int) Q, (int) numThreads);
RELEASE_INPUT_ARRAY(env, filter, filterPtr, numThreads);
RELEASE_INPUT_ARRAY(env, dout, doutPtr, numThreads);
RELEASE_ARRAY(env, ret, retPtr, numThreads);
- return (jint) nnz;
+ return static_cast<jlong>(nnz);
}
-JNIEXPORT jint JNICALL Java_org_apache_sysds_utils_NativeHelper_conv2dBackwardFilterDense(
- JNIEnv* env, jclass, jdoubleArray input, jdoubleArray dout,
- jdoubleArray ret, jint N, jint C, jint H, jint W, jint K, jint R, jint S,
- jint stride_h, jint stride_w, jint pad_h, jint pad_w, jint P, jint Q, jint numThreads) {
+JNIEXPORT jlong JNICALL Java_org_apache_sysds_utils_NativeHelper_conv2dBackwardFilterDense(
+ JNIEnv* env, jclass, jdoubleArray input, jdoubleArray dout,
+ jdoubleArray ret, jint N, jint C, jint H, jint W, jint K, jint R, jint S,
+ jint stride_h, jint stride_w, jint pad_h, jint pad_w, jint P, jint Q, jint numThreads) {
double* inputPtr = GET_DOUBLE_ARRAY(env, input, numThreads);
double* doutPtr = GET_DOUBLE_ARRAY(env, dout, numThreads);
double* retPtr = GET_DOUBLE_ARRAY(env, ret, numThreads);
if(doutPtr == NULL || inputPtr == NULL || retPtr == NULL)
- return (jint) -1;
+ return -1;
- int nnz = conv2dBackwardFilterDense(inputPtr, doutPtr, retPtr, (int) N, (int) C, (int) H, (int) W, (int) K, (int) R, (int) S,
- (int) stride_h, (int) stride_w, (int) pad_h, (int) pad_w, (int) P, (int) Q, (int) numThreads);
+ size_t nnz = conv2dBackwardFilterDense(inputPtr, doutPtr, retPtr, (int)N, (int) C, (int) H, (int) W, (int) K, (int) R,
+ (int) S, (int) stride_h, (int) stride_w, (int) pad_h, (int) pad_w, (int) P,
+ (int) Q, (int) numThreads);
RELEASE_INPUT_ARRAY(env, input, inputPtr, numThreads);
RELEASE_INPUT_ARRAY(env, dout, doutPtr, numThreads);
RELEASE_ARRAY(env, ret, retPtr, numThreads);
- return (jint) nnz;
+ return static_cast<jlong>(nnz);
}
diff --git a/src/main/cpp/systemds.h b/src/main/cpp/systemds.h
index d84392f..b4f741f 100644
--- a/src/main/cpp/systemds.h
+++ b/src/main/cpp/systemds.h
@@ -29,25 +29,25 @@ extern "C" {
/*
* Class: org_apache_sysds_utils_NativeHelper
* Method: dmmdd
- * Signature: ([D[D[DIIII)Z
+ * Signature: ([D[D[DIIII)I
*/
-JNIEXPORT jboolean JNICALL Java_org_apache_sysds_utils_NativeHelper_dmmdd
+JNIEXPORT jlong JNICALL Java_org_apache_sysds_utils_NativeHelper_dmmdd
(JNIEnv *, jclass, jdoubleArray, jdoubleArray, jdoubleArray, jint, jint, jint, jint);
/*
* Class: org_apache_sysds_utils_NativeHelper
* Method: smmdd
- * Signature: (Ljava/nio/FloatBuffer;Ljava/nio/FloatBuffer;Ljava/nio/FloatBuffer;IIII)Z
+ * Signature: (Ljava/nio/FloatBuffer;Ljava/nio/FloatBuffer;Ljava/nio/FloatBuffer;IIII)I
*/
-JNIEXPORT jboolean JNICALL Java_org_apache_sysds_utils_NativeHelper_smmdd
+JNIEXPORT jlong JNICALL Java_org_apache_sysds_utils_NativeHelper_smmdd
(JNIEnv *, jclass, jobject, jobject, jobject, jint, jint, jint, jint);
/*
* Class: org_apache_sysds_utils_NativeHelper
* Method: tsmm
- * Signature: ([D[DIIZI)Z
+ * Signature: ([D[DIIZI)I
*/
-JNIEXPORT jboolean JNICALL Java_org_apache_sysds_utils_NativeHelper_tsmm
+JNIEXPORT jlong JNICALL Java_org_apache_sysds_utils_NativeHelper_tsmm
(JNIEnv *, jclass, jdoubleArray, jdoubleArray, jint, jint, jboolean, jint);
/*
@@ -55,7 +55,7 @@ JNIEXPORT jboolean JNICALL Java_org_apache_sysds_utils_NativeHelper_tsmm
* Method: conv2dDense
* Signature: ([D[D[DIIIIIIIIIIIIII)I
*/
-JNIEXPORT jint JNICALL Java_org_apache_sysds_utils_NativeHelper_conv2dDense
+JNIEXPORT jlong JNICALL Java_org_apache_sysds_utils_NativeHelper_conv2dDense
(JNIEnv *, jclass, jdoubleArray, jdoubleArray, jdoubleArray, jint, jint, jint, jint, jint, jint, jint, jint, jint, jint, jint, jint, jint, jint);
/*
@@ -63,7 +63,7 @@ JNIEXPORT jint JNICALL Java_org_apache_sysds_utils_NativeHelper_conv2dDense
* Method: dconv2dBiasAddDense
* Signature: ([D[D[D[DIIIIIIIIIIIIII)I
*/
-JNIEXPORT jint JNICALL Java_org_apache_sysds_utils_NativeHelper_dconv2dBiasAddDense
+JNIEXPORT jlong JNICALL Java_org_apache_sysds_utils_NativeHelper_dconv2dBiasAddDense
(JNIEnv *, jclass, jdoubleArray, jdoubleArray, jdoubleArray, jdoubleArray, jint, jint, jint, jint, jint, jint, jint, jint, jint, jint, jint, jint, jint, jint);
/*
@@ -71,7 +71,7 @@ JNIEXPORT jint JNICALL Java_org_apache_sysds_utils_NativeHelper_dconv2dBiasAddDe
* Method: sconv2dBiasAddDense
* Signature: (Ljava/nio/FloatBuffer;Ljava/nio/FloatBuffer;Ljava/nio/FloatBuffer;Ljava/nio/FloatBuffer;IIIIIIIIIIIIII)I
*/
-JNIEXPORT jint JNICALL Java_org_apache_sysds_utils_NativeHelper_sconv2dBiasAddDense
+JNIEXPORT jlong JNICALL Java_org_apache_sysds_utils_NativeHelper_sconv2dBiasAddDense
(JNIEnv *, jclass, jobject, jobject, jobject, jobject, jint, jint, jint, jint, jint, jint, jint, jint, jint, jint, jint, jint, jint, jint);
/*
@@ -79,7 +79,7 @@ JNIEXPORT jint JNICALL Java_org_apache_sysds_utils_NativeHelper_sconv2dBiasAddDe
* Method: conv2dBackwardFilterDense
* Signature: ([D[D[DIIIIIIIIIIIIII)I
*/
-JNIEXPORT jint JNICALL Java_org_apache_sysds_utils_NativeHelper_conv2dBackwardFilterDense
+JNIEXPORT jlong JNICALL Java_org_apache_sysds_utils_NativeHelper_conv2dBackwardFilterDense
(JNIEnv *, jclass, jdoubleArray, jdoubleArray, jdoubleArray, jint, jint, jint, jint, jint, jint, jint, jint, jint, jint, jint, jint, jint, jint);
/*
@@ -87,7 +87,7 @@ JNIEXPORT jint JNICALL Java_org_apache_sysds_utils_NativeHelper_conv2dBackwardFi
* Method: conv2dBackwardDataDense
* Signature: ([D[D[DIIIIIIIIIIIIII)I
*/
-JNIEXPORT jint JNICALL Java_org_apache_sysds_utils_NativeHelper_conv2dBackwardDataDense
+JNIEXPORT jlong JNICALL Java_org_apache_sysds_utils_NativeHelper_conv2dBackwardDataDense
(JNIEnv *, jclass, jdoubleArray, jdoubleArray, jdoubleArray, jint, jint, jint, jint, jint, jint, jint, jint, jint, jint, jint, jint, jint, jint);
/*
diff --git a/src/main/java/org/apache/sysds/runtime/matrix/data/LibMatrixMult.java b/src/main/java/org/apache/sysds/runtime/matrix/data/LibMatrixMult.java
index 228aec7..f8d5436 100644
--- a/src/main/java/org/apache/sysds/runtime/matrix/data/LibMatrixMult.java
+++ b/src/main/java/org/apache/sysds/runtime/matrix/data/LibMatrixMult.java
@@ -28,6 +28,8 @@ import java.util.concurrent.ExecutorService;
import java.util.concurrent.Future;
import java.util.stream.IntStream;
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
import org.apache.commons.math3.util.FastMath;
import org.apache.sysds.hops.OptimizerUtils;
import org.apache.sysds.lops.MapMultChain.ChainType;
@@ -68,7 +70,8 @@ public class LibMatrixMult
private static final long PAR_MINFLOP_THRESHOLD2 = 128L*1024; //MIN 2 MFLOP
public static final int L2_CACHESIZE = 256 * 1024; //256KB (common size)
public static final int L3_CACHESIZE = 16 * 1024 * 1024; //16MB (common size)
-
+ private static final Log LOG = LogFactory.getLog(LibMatrixMult.class.getName());
+
private LibMatrixMult() {
//prevent instantiation via private constructor
}
@@ -2490,9 +2493,13 @@ public class LibMatrixMult
//call native matrix multiplication (only called for single-threaded and matrix-vector
//because this ensures that we can deal with the transpose mV without additional transpose)
- if(!NativeHelper.dmmdd(((m==1)?mV:mU).getDenseBlockValues(),
- ((m==1)?mU:mV).getDenseBlockValues(), c, (m==1)?n:m, cd, 1, 1) )
- throw new DMLRuntimeException("Error executing native matrix mult.");
+ long nnz =NativeHelper.dmmdd(((m==1)?mV:mU).getDenseBlockValues(),
+ ((m==1)?mU:mV).getDenseBlockValues(), c, (m==1)?n:m, cd, 1, 1);
+ if(nnz < 0) {
+ //fallback to default java implementation
+ LOG.warn("matrixMultWSigmoidDenseNative: Native mat mult failed. Falling back to java version.");
+ matrixMult(((m==1)?mV:mU), ((m==1)?mU:mV), ret, false);
+ }
//compute remaining wsigmoid for all relevant outputs
for( int i=0; i<m*n; i++ ) {
diff --git a/src/main/java/org/apache/sysds/runtime/matrix/data/LibMatrixNative.java b/src/main/java/org/apache/sysds/runtime/matrix/data/LibMatrixNative.java
index 5d33e2e..d64a936 100644
--- a/src/main/java/org/apache/sysds/runtime/matrix/data/LibMatrixNative.java
+++ b/src/main/java/org/apache/sysds/runtime/matrix/data/LibMatrixNative.java
@@ -88,25 +88,26 @@ public class LibMatrixNative
ret.sparse = false;
ret.allocateDenseBlock();
long start = DMLScript.STATISTICS ? System.nanoTime() : 0;
- boolean rccode = false;
+ long nnz;
if( isSinglePrecision() ) {
FloatBuffer fin1 = toFloatBuffer(m1.getDenseBlockValues(), inBuff, true);
FloatBuffer fin2 = toFloatBuffer(m2.getDenseBlockValues(), filterBuff, true);
FloatBuffer fout = toFloatBuffer(ret.getDenseBlockValues(), outBuff, false);
- rccode = NativeHelper.smmdd(fin1, fin2, fout,
+ nnz = NativeHelper.smmdd(fin1, fin2, fout,
m1.getNumRows(), m1.getNumColumns(), m2.getNumColumns(), k);
fromFloatBuffer(outBuff.get(), ret.getDenseBlockValues());
}
else {
- rccode = NativeHelper.dmmdd(m1.getDenseBlockValues(), m2.getDenseBlockValues(),
+ nnz = NativeHelper.dmmdd(m1.getDenseBlockValues(), m2.getDenseBlockValues(),
ret.getDenseBlockValues(), m1.getNumRows(), m1.getNumColumns(), m2.getNumColumns(), k);
}
- if (rccode) {
+
+ if(nnz > -1) {
if(DMLScript.STATISTICS) {
Statistics.nativeLibMatrixMultTime += System.nanoTime() - start;
Statistics.numNativeLibMatrixMultCalls.increment();
}
- ret.recomputeNonZeros();
+ ret.setNonZeros(nnz);
if(examSparsity)
ret.examSparsity();
return;
@@ -114,6 +115,8 @@ public class LibMatrixNative
//else record failure and fallback to java
Statistics.incrementNativeFailuresCounter();
}
+ //fallback to default java implementation
+ LOG.warn("matrixMult: Native mat mult failed. Falling back to java version.");
if (k == 1)
LibMatrixMult.matrixMult(m1, m2, ret, !examSparsity);
else
@@ -127,18 +130,16 @@ public class LibMatrixNative
&& (!m1.sparse && m1.getDenseBlock().isContiguous() ) ) {
ret.sparse = false;
ret.allocateDenseBlock();
- if( NativeHelper.tsmm(m1.getDenseBlockValues(),
- ret.getDenseBlockValues(), m1.rlen, m1.clen, leftTrans, k) )
- {
- // ToDo: add native tsmm() to stats
- long nnz = (ret.clen==1) ? ret.recomputeNonZeros() :
- LibMatrixMult.copyUpperToLowerTriangle(ret);
+
+ long nnz = NativeHelper.tsmm(m1.getDenseBlockValues(), ret.getDenseBlockValues(),
+ m1.rlen, m1.clen, leftTrans, k);
+ if(nnz > -1) {
ret.setNonZeros(nnz);
ret.examSparsity();
return;
}
//fallback to default java implementation
- LOG.info("Falling back to java TSMM()");
+ LOG.warn("Native TSMM failed. Falling back to java version.");
Statistics.incrementNativeFailuresCounter();
}
if( k > 1 )
@@ -161,7 +162,7 @@ public class LibMatrixNative
if(NativeHelper.isNativeLibraryLoaded() && !input.isInSparseFormat() && !filter.isInSparseFormat()) {
setNumThreads(params);
long start = DMLScript.STATISTICS ? System.nanoTime() : 0;
- int nnz = 0;
+ long nnz;
if(params.bias == null) {
nnz = NativeHelper.conv2dDense(input.getDenseBlockValues(), filter.getDenseBlockValues(),
outputBlock.getDenseBlockValues(), params.N, params.C, params.H, params.W,
@@ -237,7 +238,7 @@ public class LibMatrixNative
if(NativeHelper.isNativeLibraryLoaded() && !dout.isInSparseFormat() && !input.isInSparseFormat()) {
setNumThreads(params);
long start = DMLScript.STATISTICS ? System.nanoTime() : 0;
- int nnz = NativeHelper.conv2dBackwardFilterDense(input.getDenseBlockValues(), dout.getDenseBlockValues(),
+ long nnz = NativeHelper.conv2dBackwardFilterDense(input.getDenseBlockValues(), dout.getDenseBlockValues(),
outputBlock.getDenseBlockValues(), params.N, params.C, params.H, params.W,
params.K, params.R, params.S, params.stride_h, params.stride_w, params.pad_h, params.pad_w,
params.P, params.Q, params.numThreads);
@@ -273,7 +274,7 @@ public class LibMatrixNative
if(NativeHelper.isNativeLibraryLoaded() && !dout.isInSparseFormat() && !filter.isInSparseFormat()) {
setNumThreads(params);
long start = DMLScript.STATISTICS ? System.nanoTime() : 0;
- int nnz = NativeHelper.conv2dBackwardDataDense(filter.getDenseBlockValues(), dout.getDenseBlockValues(),
+ long nnz = NativeHelper.conv2dBackwardDataDense(filter.getDenseBlockValues(), dout.getDenseBlockValues(),
outputBlock.getDenseBlockValues(), params.N, params.C, params.H, params.W,
params.K, params.R, params.S, params.stride_h, params.stride_w, params.pad_h, params.pad_w,
params.P, params.Q, params.numThreads);
diff --git a/src/main/java/org/apache/sysds/utils/NativeHelper.java b/src/main/java/org/apache/sysds/utils/NativeHelper.java
index cd9f8f9..83869c2 100644
--- a/src/main/java/org/apache/sysds/utils/NativeHelper.java
+++ b/src/main/java/org/apache/sysds/utils/NativeHelper.java
@@ -333,14 +333,15 @@ public class NativeHelper {
* @return true if successfully loaded BLAS
*/
public static boolean loadLibraryHelperFromResource(String libFileName) {
- OutputStream out = null;
try(InputStream in = NativeHelper.class.getResourceAsStream("/lib/"+ libFileName)) {
// This logic is added because Java does not allow to load library from a resource file.
if(in != null) {
File temp = File.createTempFile(libFileName, "");
temp.deleteOnExit();
- out = FileUtils.openOutputStream(temp);
+ OutputStream out = FileUtils.openOutputStream(temp);
IOUtils.copy(in, out);
+ // not closing the stream here makes dll loading fail on Windows
+ IOUtilFunctions.closeSilently(out);
System.load(temp.getAbsolutePath());
return true;
}
@@ -350,22 +351,19 @@ public class NativeHelper {
catch(IOException e) {
LOG.warn("Unable to load library " + libFileName + " from resource:" + e.getMessage());
}
- finally {
- IOUtilFunctions.closeSilently(out);
- }
return false;
}
// TODO: Add pmm, wsloss, mmchain, etc.
//double-precision matrix multiply dense-dense
- public static native boolean dmmdd(double [] m1, double [] m2, double [] ret, int m1rlen, int m1clen, int m2clen,
+ public static native long dmmdd(double [] m1, double [] m2, double [] ret, int m1rlen, int m1clen, int m2clen,
int numThreads);
//single-precision matrix multiply dense-dense
- public static native boolean smmdd(FloatBuffer m1, FloatBuffer m2, FloatBuffer ret, int m1rlen, int m1clen, int m2clen,
+ public static native long smmdd(FloatBuffer m1, FloatBuffer m2, FloatBuffer ret, int m1rlen, int m1clen, int m2clen,
int numThreads);
//transpose-self matrix multiply
- public static native boolean tsmm(double[] m1, double[] ret, int m1rlen, int m1clen, boolean leftTrans, int numThreads);
+ public static native long tsmm(double[] m1, double[] ret, int m1rlen, int m1clen, boolean leftTrans, int numThreads);
// ----------------------------------------------------------------------------------------------------------------
// LibMatrixDNN operations:
@@ -376,25 +374,25 @@ public class NativeHelper {
// Returns -1 if failures or returns number of nonzeros
// Called by DnnCPInstruction if both input and filter are dense
- public static native int conv2dDense(double [] input, double [] filter, double [] ret, int N, int C, int H, int W,
+ public static native long conv2dDense(double [] input, double [] filter, double [] ret, int N, int C, int H, int W,
int K, int R, int S, int stride_h, int stride_w, int pad_h, int pad_w, int P, int Q, int numThreads);
- public static native int dconv2dBiasAddDense(double [] input, double [] bias, double [] filter, double [] ret, int N,
+ public static native long dconv2dBiasAddDense(double [] input, double [] bias, double [] filter, double [] ret, int N,
int C, int H, int W, int K, int R, int S, int stride_h, int stride_w, int pad_h, int pad_w, int P, int Q,
int numThreads);
- public static native int sconv2dBiasAddDense(FloatBuffer input, FloatBuffer bias, FloatBuffer filter, FloatBuffer ret,
+ public static native long sconv2dBiasAddDense(FloatBuffer input, FloatBuffer bias, FloatBuffer filter, FloatBuffer ret,
int N, int C, int H, int W, int K, int R, int S, int stride_h, int stride_w, int pad_h, int pad_w, int P, int Q,
int numThreads);
// Called by DnnCPInstruction if both input and filter are dense
- public static native int conv2dBackwardFilterDense(double [] input, double [] dout, double [] ret, int N, int C,
+ public static native long conv2dBackwardFilterDense(double [] input, double [] dout, double [] ret, int N, int C,
int H, int W, int K, int R, int S, int stride_h, int stride_w,
int pad_h, int pad_w, int P, int Q, int numThreads);
// If both filter and dout are dense, then called by DnnCPInstruction
// Else, called by LibMatrixDNN's thread if filter is dense. dout[n] is converted to dense if sparse.
- public static native int conv2dBackwardDataDense(double [] filter, double [] dout, double [] ret, int N, int C,
+ public static native long conv2dBackwardDataDense(double [] filter, double [] dout, double [] ret, int N, int C,
int H, int W, int K, int R, int S, int stride_h, int stride_w,
int pad_h, int pad_w, int P, int Q, int numThreads);