You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@systemds.apache.org by ar...@apache.org on 2020/11/16 23:39:05 UTC

[systemds] branch master updated: [SYSTEMDS-2725] NNZ counting for native blas

This is an automated email from the ASF dual-hosted git repository.

arnabp20 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/systemds.git


The following commit(s) were added to refs/heads/master by this push:
     new b350c04  [SYSTEMDS-2725] NNZ counting for native blas
b350c04 is described below

commit b350c04f53fd249cc4c916093404c5d760055327
Author: Mark Dokter <ma...@dokter.cc>
AuthorDate: Tue Nov 17 00:34:01 2020 +0100

    [SYSTEMDS-2725] NNZ counting for native blas
    
    This patch moves the nnz count into the native BLAS
    code. This should improve performance, as the native
    nnz computation leverages an OpenMP reduction.
    In addition, this patch adds:
    * a lot of code cleanup
    * more fallbacks to the Java matrix multiplication
---
 src/main/cpp/CMakeLists.txt                        |  10 +-
 src/main/cpp/build.sh                              |   4 +-
 src/main/cpp/{libmatrixmult.h => common.h}         |  62 ++++----
 src/main/cpp/lib/libsystemds_mkl-Linux-x86_64.so   | Bin 39416 -> 39384 bytes
 src/main/cpp/lib/libsystemds_mkl-Windows-AMD64.dll | Bin 26624 -> 27648 bytes
 .../cpp/lib/libsystemds_openblas-Linux-x86_64.so   | Bin 43176 -> 43144 bytes
 .../cpp/lib/libsystemds_openblas-Windows-AMD64.dll | Bin 28160 -> 140800 bytes
 src/main/cpp/libmatrixdnn.cpp                      |  40 +++---
 src/main/cpp/libmatrixdnn.h                        |  46 +++---
 src/main/cpp/libmatrixmult.cpp                     |  57 ++------
 src/main/cpp/libmatrixmult.h                       |  47 ++----
 src/main/cpp/systemds.cpp                          | 159 ++++++++++-----------
 src/main/cpp/systemds.h                            |  22 +--
 .../sysds/runtime/matrix/data/LibMatrixMult.java   |  15 +-
 .../sysds/runtime/matrix/data/LibMatrixNative.java |  31 ++--
 .../java/org/apache/sysds/utils/NativeHelper.java  |  24 ++--
 16 files changed, 224 insertions(+), 293 deletions(-)

diff --git a/src/main/cpp/CMakeLists.txt b/src/main/cpp/CMakeLists.txt
index 9489f8f..b3de917 100644
--- a/src/main/cpp/CMakeLists.txt
+++ b/src/main/cpp/CMakeLists.txt
@@ -31,7 +31,7 @@ option(USE_OPEN_BLAS "Whether to use OpenBLAS (Defaults to compiling with Intel
 option(USE_INTEL_MKL "Whether to use Intel MKL (Defaults to compiling with Intel MKL)" OFF)
 
 # Build a shared libraray
-set(HEADER_FILES libmatrixdnn.h libmatrixmult.h systemds.h)
+set(HEADER_FILES libmatrixdnn.h libmatrixmult.h systemds.h common.h)
 set(SOURCE_FILES libmatrixdnn.cpp libmatrixmult.cpp systemds.cpp)
 
 # Build a shared libraray
@@ -55,6 +55,14 @@ if (WIN32)
     set(CMAKE_SHARED_LIBRARY_PREFIX lib CACHE INTERNAL "")
 endif()
 
+#ToDo: this won't be necessary once we properly redo mkl versions of conv2d* functions
+if(${CMAKE_SYSTEM_NAME} STREQUAL "Linux")
+      set(CMAKE_CXX_FLAGS "-Wno-narrowing")
+endif()
+if(${CMAKE_SYSTEM_NAME} STREQUAL "Windows")
+      set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /wd4838")
+endif()
+
 set(CMAKE_BUILD_TYPE Release)
 
 if (USE_OPEN_BLAS)
diff --git a/src/main/cpp/build.sh b/src/main/cpp/build.sh
index 4378340..df67aba 100755
--- a/src/main/cpp/build.sh
+++ b/src/main/cpp/build.sh
@@ -47,13 +47,13 @@ if ! [ -x "$(command -v patchelf)" ]; then
 fi
 
 # configure and compile INTEL MKL
-cmake . -B INTEL -DUSE_INTEL_MKL=ON -DCMAKE_BUILD_TYPE=Release -DCMAKE_C_COMPILER=gcc -DCMAKE_CXX_COMPILER=g++ -DCMAKE_CXX_FLAGS="-DUSE_GNU_THREADING -m64"
+cmake . -B INTEL -DUSE_INTEL_MKL=ON -DCMAKE_BUILD_TYPE=Release -DCMAKE_CXX_COMPILER=g++ -DCMAKE_CXX_FLAGS="-DUSE_GNU_THREADING -m64"
 cmake --build INTEL --target install --config Release
 patchelf --add-needed libmkl_rt.so lib/libsystemds_mkl-Linux-x86_64.so
 rm -R INTEL
 
 # configure and compile OPENBLAS
-cmake . -B OPENBLAS -DUSE_OPEN_BLAS=ON -DCMAKE_BUILD_TYPE=Release -DCMAKE_C_COMPILER=gcc -DCMAKE_CXX_COMPILER=g++ -DCMAKE_CXX_FLAGS="-m64"
+cmake . -B OPENBLAS -DUSE_OPEN_BLAS=ON -DCMAKE_BUILD_TYPE=Release -DCMAKE_CXX_COMPILER=g++ -DCMAKE_CXX_FLAGS="-m64"
 cmake --build OPENBLAS --target install --config Release
 patchelf --add-needed libopenblas.so.0 lib/libsystemds_openblas-Linux-x86_64.so
 rm -R OPENBLAS
diff --git a/src/main/cpp/libmatrixmult.h b/src/main/cpp/common.h
similarity index 51%
copy from src/main/cpp/libmatrixmult.h
copy to src/main/cpp/common.h
index a4d33ad..34f9801 100644
--- a/src/main/cpp/libmatrixmult.h
+++ b/src/main/cpp/common.h
@@ -17,31 +17,18 @@
  * under the License.
  */
 
-#ifndef _libmatrixmult_h
-#define _libmatrixmult_h
-
-#define MAX(x, y) (((x) > (y)) ? (x) : (y))
-#define MIN(x, y) (((x) < (y)) ? (x) : (y))
-
-// *****************************************************************
-// We support Intel MKL (recommended) or OpenBLAS.
-// These flags are used for conditional compilation with mkl and openblas
-// #define USE_INTEL_MKL
-// #define USE_GNU_THREADING
-// #define USE_OPEN_BLAS
-// *****************************************************************
-
-//#ifdef __cplusplus
-// extern "C" {
-//#endif
-//#ifdef __cplusplus
-//}
-//#endif
+#pragma once
+#ifndef COMMON_H
+#define COMMON_H
 
 // Since we call cblas_dgemm in openmp for loop,
 // we call "extension" APIs for setting the number of threads.
 #ifdef USE_INTEL_MKL
 #include <mkl.h>
+	#if INTEL_MKL_VERSION < 20170000
+		// Will throw an error at development time in non-standard settings
+		PLEASE DONOT COMPILE SHARED LIBRARIES WITH OLDER MKL VERSIONS
+	#endif
 #include <mkl_service.h>
 extern "C" void mkl_set_num_threads(int numThreads);
 #else
@@ -49,16 +36,27 @@ extern "C" void mkl_set_num_threads(int numThreads);
 extern "C" void openblas_set_num_threads(int numThreads);
 #endif
 
-void setNumThreadsForBLAS(int numThreads);
-
-// Multiplies two matrices m1Ptr and m2Ptr in row-major format of shape
-// (m1rlen, m1clen) and (m1clen, m2clen)
-void dmatmult(double *m1Ptr, double *m2Ptr, double *retPtr, int m, int k, int n,
-              int numThreads);
-void smatmult(float *m1Ptr, float *m2Ptr, float *retPtr, int m, int k, int n,
-              int numThreads);
-
-void tsmm(double *m1Ptr, double *retPtr, int m1rlen, int m1clen,
-          bool isLeftTrans, int numThreads);
-
+template<class FP>
+size_t computeNNZ(FP* arr, int limit) {
+    size_t nnz = 0;
+#ifndef USE_INTEL_MKL
+#pragma omp parallel for reduction(+: nnz)
+#endif
+    for(int i=0; i<limit; i++)
+        nnz += (arr[i]!=0) ? 1 : 0;
+    return nnz;
+}
+
+static int SYSDS_CURRENT_NUM_THREADS = -1;
+static void setNumThreadsForBLAS(int numThreads) {
+	if (SYSDS_CURRENT_NUM_THREADS != numThreads) {
+#ifdef USE_OPEN_BLAS
+		openblas_set_num_threads(numThreads);
+#else
+		mkl_set_num_threads(numThreads);
 #endif
+		SYSDS_CURRENT_NUM_THREADS = numThreads;
+	}
+}
+
+#endif // COMMON_H
diff --git a/src/main/cpp/lib/libsystemds_mkl-Linux-x86_64.so b/src/main/cpp/lib/libsystemds_mkl-Linux-x86_64.so
index 800fe13..f6b495a 100644
Binary files a/src/main/cpp/lib/libsystemds_mkl-Linux-x86_64.so and b/src/main/cpp/lib/libsystemds_mkl-Linux-x86_64.so differ
diff --git a/src/main/cpp/lib/libsystemds_mkl-Windows-AMD64.dll b/src/main/cpp/lib/libsystemds_mkl-Windows-AMD64.dll
index 2960551..b52adbe 100644
Binary files a/src/main/cpp/lib/libsystemds_mkl-Windows-AMD64.dll and b/src/main/cpp/lib/libsystemds_mkl-Windows-AMD64.dll differ
diff --git a/src/main/cpp/lib/libsystemds_openblas-Linux-x86_64.so b/src/main/cpp/lib/libsystemds_openblas-Linux-x86_64.so
index 6e45343..e953e7f 100644
Binary files a/src/main/cpp/lib/libsystemds_openblas-Linux-x86_64.so and b/src/main/cpp/lib/libsystemds_openblas-Linux-x86_64.so differ
diff --git a/src/main/cpp/lib/libsystemds_openblas-Windows-AMD64.dll b/src/main/cpp/lib/libsystemds_openblas-Windows-AMD64.dll
index a39d40c..6fe9676 100644
Binary files a/src/main/cpp/lib/libsystemds_openblas-Windows-AMD64.dll and b/src/main/cpp/lib/libsystemds_openblas-Windows-AMD64.dll differ
diff --git a/src/main/cpp/libmatrixdnn.cpp b/src/main/cpp/libmatrixdnn.cpp
index 3e3e2f3..da6975c 100644
--- a/src/main/cpp/libmatrixdnn.cpp
+++ b/src/main/cpp/libmatrixdnn.cpp
@@ -17,30 +17,21 @@
  * under the License.
  */
 
-#include "libmatrixmult.h"
-#include "libmatrixdnn.h"
-#include <cstdlib>
-#include <iostream>
-#include <cstdio>
-#include <cmath>
+#include "common.h"
 #include <cstring>
+#include <iostream>
+#include "libmatrixdnn.h"
+#include "libmatrixmult.h"
+
+
 #ifdef USE_INTEL_MKL
-  #include "mkl_dnn.h"
+#include "mkl_dnn.h"
 #else
-  #include "omp.h"
-#endif
-
-template<class FP> int computeNNZ(FP* arr, int limit) {
-  int nnz = 0;
-#ifndef USE_INTEL_MKL
-  #pragma omp parallel for reduction(+: nnz)
+#include "omp.h"
 #endif
-  for(int i=0; i<limit; i++)
-    nnz += (arr[i]!=0) ? 1 : 0;
-  return nnz;
-}
 
-template<class FP> void biasAdd(FP* bias, FP* output, int K, int PQ) {
+template<class FP>
+void biasAdd(FP* bias, FP* output, int K, int PQ) {
   for(int k = 0, index=0; k < K; k++)
     for(int pq = 0; pq < PQ; pq++, index++)
       output[index] += bias[k];
@@ -91,7 +82,8 @@ void col2im(double* inputArray, double* outputArray, int N, int C, int H, int W,
 	}
 }
 
-template<class FP> void im2col(FP* inputArray, FP* outputArray, int N, int C, int H, int W,
+template<class FP>
+void im2col(FP* inputArray, FP* outputArray, int N, int C, int H, int W,
     int K, int R, int S, int stride_h, int stride_w, int pad_h, int pad_w, int P, int Q) {
   int CRS = C * R * S;
   std::size_t size = Q * sizeof(FP);
@@ -151,7 +143,7 @@ bool MKL_DNN_ERROR(dnnError_t code) {
 } 
 #endif
 
-int conv2dBackwardFilterDense(double* inputPtr, double* doutPtr, double* retPtr,
+size_t conv2dBackwardFilterDense(double* inputPtr, double* doutPtr, double* retPtr,
     int N, int C, int H, int W, int K, int R, int S,
     int stride_h, int stride_w, int pad_h, int pad_w, int P, int Q, int numThreads) {
   int CRS = C*R*S;
@@ -244,7 +236,7 @@ int conv2dBackwardFilterDense(double* inputPtr, double* doutPtr, double* retPtr,
   return computeNNZ<double>(retPtr, K*CRS);
 }
 
-int conv2dBackwardDataDense(double* filterPtr, double* doutPtr, double* retPtr, int N, int C, int H, int W, int K, int R, int S,
+size_t conv2dBackwardDataDense(double* filterPtr, double* doutPtr, double* retPtr, int N, int C, int H, int W, int K, int R, int S,
     int stride_h, int stride_w, int pad_h, int pad_w, int P, int Q, int numThreads) {
   int CHW = C * H * W;
 #ifdef USE_INTEL_MKL
@@ -370,7 +362,7 @@ void conv2dBackwardFilterSparseDense(int apos, int alen, int* aix, double* avals
 	delete [] temp1;
 }
 
-int dconv2dBiasAddDense(double* inputPtr, double* biasPtr, double* filterPtr, double* retPtr, 
+size_t dconv2dBiasAddDense(double* inputPtr, double* biasPtr, double* filterPtr, double* retPtr, 
     int N, int C, int H, int W, int K, int R, int S,
     int stride_h, int stride_w, int pad_h, int pad_w, int P, int Q, bool addBias, int numThreads) {
   int KPQ = K * P * Q;
@@ -446,7 +438,7 @@ int dconv2dBiasAddDense(double* inputPtr, double* biasPtr, double* filterPtr, do
 #endif
 }
 
-int sconv2dBiasAddDense(float* inputPtr, float* biasPtr, float* filterPtr, float* retPtr, 
+size_t sconv2dBiasAddDense(float* inputPtr, float* biasPtr, float* filterPtr, float* retPtr, 
     int N, int C, int H, int W, int K, int R, int S,
     int stride_h, int stride_w, int pad_h, int pad_w, int P, int Q, bool addBias, int numThreads) {
   int KPQ = K * P * Q;
diff --git a/src/main/cpp/libmatrixdnn.h b/src/main/cpp/libmatrixdnn.h
index 3f457b0..f3fef37 100644
--- a/src/main/cpp/libmatrixdnn.h
+++ b/src/main/cpp/libmatrixdnn.h
@@ -16,36 +16,34 @@
  * specific language governing permissions and limitations
  * under the License.
  */
- 
-#ifndef _libmatrixdnn_h
-#define _libmatrixdnn_h
 
-#ifdef USE_INTEL_MKL
-  #include <mkl.h>
-	#if INTEL_MKL_VERSION < 20170000
-		// Will throw an error at development time in non-standard settings
-		PLEASE DONOT COMPILE SHARED LIBRARIES WITH OLDER MKL VERSIONS
-	#endif
-#endif
+#pragma once
+#ifndef LIBMATRIXDNN_H
+#define LIBMATRIXDNN_H
 
-int conv2dBackwardFilterDense(double* inputPtr, double* doutPtr, double* retPtr, int N, int C, int H, int W, int K, int R, int S,
-    int stride_h, int stride_w, int pad_h, int pad_w, int P, int Q, int numThreads);
+#include <cstddef> 
 
-int conv2dBackwardDataDense(double* filterPtr, double* doutPtr, double* retPtr, int N, int C, int H, int W, int K, int R, int S,
-    int stride_h, int stride_w, int pad_h, int pad_w, int P, int Q, int numThreads);
+size_t conv2dBackwardFilterDense(double* inputPtr, double* doutPtr, double* retPtr, int N, int C, int H, int W, int K,
+							  int R, int S, int stride_h, int stride_w, int pad_h, int pad_w, int P, int Q,
+							  int numThreads);
 
-int dconv2dBiasAddDense(double* inputPtr, double* biasPtr, double* filterPtr, double* retPtr,
-   int N, int C, int H, int W, int K, int R, int S,
-   int stride_h, int stride_w, int pad_h, int pad_w, int P, int Q, bool addBias, int numThreads);
+size_t conv2dBackwardDataDense(double* filterPtr, double* doutPtr, double* retPtr, int N, int C, int H, int W, int K,
+							int R, int S, int stride_h, int stride_w, int pad_h, int pad_w, int P, int Q,
+							int numThreads);
 
-int sconv2dBiasAddDense(float* inputPtr, float* biasPtr, float* filterPtr, float* retPtr,
-   int N, int C, int H, int W, int K, int R, int S,
-   int stride_h, int stride_w, int pad_h, int pad_w, int P, int Q, bool addBias, int numThreads);
+size_t dconv2dBiasAddDense(double* inputPtr, double* biasPtr, double* filterPtr, double* retPtr, int N, int C, int H,
+						int W, int K, int R, int S,int stride_h, int stride_w, int pad_h, int pad_w, int P, int Q,
+						bool addBias, int numThreads);
+
+size_t sconv2dBiasAddDense(float* inputPtr, float* biasPtr, float* filterPtr, float* retPtr, int N, int C, int H, int W,
+						int K, int R, int S, int stride_h, int stride_w, int pad_h, int pad_w, int P, int Q,
+						bool addBias, int numThreads);
 
 void conv2dSparse(int apos, int alen, int* aix, double* avals, double* filter, double* ret, int N, int C, int H, int W, 
-			int K, int R, int S, int stride_h, int stride_w, int pad_h, int pad_w, int P, int Q, int numThreads);
+				  int K, int R, int S, int stride_h, int stride_w, int pad_h, int pad_w, int P, int Q, int numThreads);
 
-void conv2dBackwardFilterSparseDense(int apos, int alen, int* aix, double* avals, double* rotatedDoutPtr, double* ret, int N, int C, int H, int W, 
-			int K, int R, int S, int stride_h, int stride_w, int pad_h, int pad_w, int P, int Q, int numThreads);
+void conv2dBackwardFilterSparseDense(int apos, int alen, int* aix, double* avals, double* rotatedDoutPtr, double* ret,
+									 int N, int C, int H, int W, int K, int R, int S, int stride_h, int stride_w,
+									 int pad_h, int pad_w, int P, int Q, int numThreads);
 			
-#endif
\ No newline at end of file
+#endif // LIBMATRIXDNN_H
diff --git a/src/main/cpp/libmatrixmult.cpp b/src/main/cpp/libmatrixmult.cpp
index c0c909a..0d38511 100644
--- a/src/main/cpp/libmatrixmult.cpp
+++ b/src/main/cpp/libmatrixmult.cpp
@@ -17,71 +17,42 @@
  * under the License.
  */
 
+#include "common.h"
 #include "libmatrixmult.h"
-#include "omp.h"
-#include <cmath>
-#include <cstdlib>
 
-#ifdef USE_OPEN_BLAS
-#include <cblas.h>
-#else
-#include <mkl_service.h>
-#endif
-
-static int SYSDS_CURRENT_NUM_THREADS = -1;
-void setNumThreadsForBLAS(int numThreads) {
-  if (SYSDS_CURRENT_NUM_THREADS != numThreads) {
-#ifdef USE_OPEN_BLAS
-    openblas_set_num_threads(numThreads);
-#else
-    mkl_set_num_threads(numThreads);
-#endif
-    SYSDS_CURRENT_NUM_THREADS = numThreads;
-  }
-}
-
-void dmatmult(double *m1Ptr, double *m2Ptr, double *retPtr, int m, int k, int n,
-              int numThreads) {
+void dmatmult(double *m1Ptr, double *m2Ptr, double *retPtr, int m, int k, int n, int numThreads) {
   // BLAS routine dispatch according to input dimension sizes (we don't use
   // cblas_dgemv with CblasColMajor for matrix-vector because it was generally
   // slower than dgemm)
   setNumThreadsForBLAS(numThreads);
   if (m == 1 && n == 1) // VV
-    retPtr[0] = cblas_ddot(k, m1Ptr, 1, m2Ptr, 1);
+	  retPtr[0] = cblas_ddot(k, m1Ptr, 1, m2Ptr, 1);
   else if (n == 1) // MV
-    cblas_dgemv(CblasRowMajor, CblasNoTrans, m, k, 1, m1Ptr, k, m2Ptr, 1, 0,
-                retPtr, 1);
+	  cblas_dgemv(CblasRowMajor, CblasNoTrans, m, k, 1, m1Ptr, k, m2Ptr, 1, 0, retPtr, 1);
   else // MM
-    cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, m, n, k, 1, m1Ptr, k,
-                m2Ptr, n, 0, retPtr, n);
+	  cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, m, n, k, 1, m1Ptr, k, m2Ptr, n, 0, retPtr, n);
 }
 
-void smatmult(float *m1Ptr, float *m2Ptr, float *retPtr, int m, int k, int n,
-              int numThreads) {
+void smatmult(float *m1Ptr, float *m2Ptr, float *retPtr, int m, int k, int n, int numThreads) {
   // BLAS routine dispatch according to input dimension sizes (we don't use
   // cblas_sgemv with CblasColMajor for matrix-vector because it was generally
   // slower than sgemm)
   setNumThreadsForBLAS(numThreads);
   if (m == 1 && n == 1) // VV
-    retPtr[0] = cblas_sdot(k, m1Ptr, 1, m2Ptr, 1);
+	  retPtr[0] = cblas_sdot(k, m1Ptr, 1, m2Ptr, 1);
   else if (n == 1) // MV
-    cblas_sgemv(CblasRowMajor, CblasNoTrans, m, k, 1, m1Ptr, k, m2Ptr, 1, 0,
-                retPtr, 1);
+	  cblas_sgemv(CblasRowMajor, CblasNoTrans, m, k, 1, m1Ptr, k, m2Ptr, 1, 0, retPtr, 1);
   else // MM
-    cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, m, n, k, 1, m1Ptr, k,
-                m2Ptr, n, 0, retPtr, n);
+	  cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, m, n, k, 1, m1Ptr, k, m2Ptr, n, 0, retPtr, n);
 }
 
-void tsmm(double *m1Ptr, double *retPtr, int m1rlen, int m1clen, bool leftTrans,
-          int numThreads) {
+void tsmm(double *m1Ptr, double *retPtr, int m1rlen, int m1clen, bool leftTrans, int numThreads) {
   setNumThreadsForBLAS(numThreads);
-  if ((leftTrans && m1clen == 1) || (!leftTrans && m1rlen == 1)) {
-    retPtr[0] = cblas_ddot(leftTrans ? m1rlen : m1clen, m1Ptr, 1, m1Ptr, 1);
-  } else { // general case
+  if ((leftTrans && m1clen == 1) || (!leftTrans && m1rlen == 1))
+	retPtr[0] = cblas_ddot(leftTrans ? m1rlen : m1clen, m1Ptr, 1, m1Ptr, 1);
+  else { // general case
     int n = leftTrans ? m1clen : m1rlen;
     int k = leftTrans ? m1rlen : m1clen;
-    cblas_dsyrk(CblasRowMajor, CblasUpper,
-                leftTrans ? CblasTrans : CblasNoTrans, n, k, 1, m1Ptr, n, 0,
-                retPtr, n);
+    cblas_dsyrk(CblasRowMajor, CblasUpper, leftTrans ? CblasTrans : CblasNoTrans, n, k, 1, m1Ptr, n, 0, retPtr, n);
   }
 }
diff --git a/src/main/cpp/libmatrixmult.h b/src/main/cpp/libmatrixmult.h
index a4d33ad..f38155d 100644
--- a/src/main/cpp/libmatrixmult.h
+++ b/src/main/cpp/libmatrixmult.h
@@ -17,48 +17,19 @@
  * under the License.
  */
 
-#ifndef _libmatrixmult_h
-#define _libmatrixmult_h
+#ifndef LIBMATRIXMULT_H
+#define LIBMATRIXMULT_H
 
 #define MAX(x, y) (((x) > (y)) ? (x) : (y))
 #define MIN(x, y) (((x) < (y)) ? (x) : (y))
 
-// *****************************************************************
-// We support Intel MKL (recommended) or OpenBLAS.
-// These flags are used for conditional compilation with mkl and openblas
-// #define USE_INTEL_MKL
-// #define USE_GNU_THREADING
-// #define USE_OPEN_BLAS
-// *****************************************************************
+// Multiplies two matrices m1Ptr and m2Ptr in row-major format of shape (m1rlen, m1clen) and (m1clen, m2clen)
+void dmatmult(double *m1Ptr, double *m2Ptr, double *retPtr, int m, int k, int n, int numThreads);
 
-//#ifdef __cplusplus
-// extern "C" {
-//#endif
-//#ifdef __cplusplus
-//}
-//#endif
+// Same matrix multiply for single precision
+void smatmult(float *m1Ptr, float *m2Ptr, float *retPtr, int m, int k, int n, int numThreads);
 
-// Since we call cblas_dgemm in openmp for loop,
-// we call "extension" APIs for setting the number of threads.
-#ifdef USE_INTEL_MKL
-#include <mkl.h>
-#include <mkl_service.h>
-extern "C" void mkl_set_num_threads(int numThreads);
-#else
-#include <cblas.h>
-extern "C" void openblas_set_num_threads(int numThreads);
-#endif
+// Matrix transpose
+void tsmm(double *m1Ptr, double *retPtr, int m1rlen, int m1clen, bool isLeftTrans, int numThreads);
 
-void setNumThreadsForBLAS(int numThreads);
-
-// Multiplies two matrices m1Ptr and m2Ptr in row-major format of shape
-// (m1rlen, m1clen) and (m1clen, m2clen)
-void dmatmult(double *m1Ptr, double *m2Ptr, double *retPtr, int m, int k, int n,
-              int numThreads);
-void smatmult(float *m1Ptr, float *m2Ptr, float *retPtr, int m, int k, int n,
-              int numThreads);
-
-void tsmm(double *m1Ptr, double *retPtr, int m1rlen, int m1clen,
-          bool isLeftTrans, int numThreads);
-
-#endif
+#endif // LIBMATRIXMULT_H
diff --git a/src/main/cpp/systemds.cpp b/src/main/cpp/systemds.cpp
index 193e847..47b6e8f 100644
--- a/src/main/cpp/systemds.cpp
+++ b/src/main/cpp/systemds.cpp
@@ -16,28 +16,11 @@
  * specific language governing permissions and limitations
  * under the License.
  */
- 
-#include "systemds.h"
-#include "libmatrixmult.h"
-#include "libmatrixdnn.h"
-
-// Linux:
-// g++ -o lib/libsystemds_mkl-Linux-x86_64.so *.cpp  -I$JAVA_HOME/include -I$MKLROOT/include -I$JAVA_HOME/include/linux -lmkl_rt -lpthread  -lm -ldl -DUSE_INTEL_MKL -DUSE_MKL_DNN -L$MKLROOT/lib/intel64 -m64 -O3 -shared -fPIC
-// g++ -o lib/libsystemds_openblas-Linux-x86_64.so *.cpp  -I$JAVA_HOME/include  -I$JAVA_HOME/include/linux -lopenblas -lpthread -lm -ldl -DUSE_OPEN_BLAS -I/opt/OpenBLAS/include/ -L/opt/OpenBLAS/lib/ -fopenmp -O3 -shared -fPIC
 
-// Mac OSX:	
-// g++ -o libsystemds_mkl-linux-x86_64.dylib *.cpp  -I$JAVA_HOME/include -I$MKLROOT/include -I$JAVA_HOME/include/linux -lmkl_rt -lpthread  -lm -ldl -DUSE_INTEL_MKL -DUSE_GNU_THREADING -L$MKLROOT/lib/intel64 -m64 -fopenmp -O3 -dynamiclib -fPIC -undefined dynamic_lookup
-// g++ -o libsystemds_openblas-linux-x86_64.dylib *.cpp  -I$JAVA_HOME/include  -I$JAVA_HOME/include/linux -lopenblas -lpthread -lm -ldl -DUSE_OPEN_BLAS -L/opt/OpenBLAS/lib/ -fopenmp -O3 -dynamiclib -fPIC -undefined dynamic_lookup
-
-// Windows MKL: 
-// "C:\\Program Files (x86)\\Microsoft Visual Studio 12.0\\VC\\vcvarsall.bat" amd64
-// "%MKLROOT%"\bin\mklvars.bat intel64
-// set JAVA_HOME=C:\Program Files\Java\jdk1.8.0_25
-// cl *.cpp -I. -I"%MKLROOT%"\include -I"%JAVA_HOME%"\include -I"%JAVA_HOME%"\include\win32 -DUSE_INTEL_MKL -Fesystemds_mkl-windows-x86_64.dll -MD -LD  "%MKLROOT%"\lib\intel64_win\mkl_intel_thread_dll.lib "%MKLROOT%"\lib\intel64_win\mkl_core_dll.lib "%MKLROOT%"\lib\intel64_win\mkl_intel_lp64_dll.lib
-// Windows OpenBLAS:
-// "C:\\Program Files (x86)\\Microsoft Visual Studio 12.0\\VC\\vcvarsall.bat" amd64
-// set JAVA_HOME=C:\Program Files\Java\jdk1.8.0_25
-// cl *.cpp -I. -I"%OPENBLASROOT%"\include -I"%JAVA_HOME%"\include -I"%JAVA_HOME%"\include\win32 -DUSE_OPEN_BLAS -Fesystemds_openblas-windows-x86_64.dll -MD -LD "%OPENBLASROOT%"\lib\libopenblas.dll.a
+#include "common.h"
+#include "libmatrixdnn.h"
+#include "libmatrixmult.h"
+#include "systemds.h"
 
 // Results from Matrix-vector/vector-matrix 1M x 1K, dense show that GetDoubleArrayElements creates a copy on OpenJDK.
 
@@ -67,14 +50,12 @@
 // ( maxThreads != -1 && ((int)numThreads) == maxThreads ? env->ReleasePrimitiveArrayCritical(input, inputPtr, 0) :  env->ReleaseDoubleArrayElements(input, inputPtr, 0) )
   
 // -------------------------------------------------------------------
-
-int maxThreads = -1;
 JNIEXPORT void JNICALL Java_org_apache_sysds_utils_NativeHelper_setMaxNumThreads
   (JNIEnv *, jclass, jint jmaxThreads) {
-  maxThreads = (int) jmaxThreads;
+	setNumThreadsForBLAS(jmaxThreads);
 }
 
-JNIEXPORT jboolean JNICALL Java_org_apache_sysds_utils_NativeHelper_dmmdd(
+JNIEXPORT jlong JNICALL Java_org_apache_sysds_utils_NativeHelper_dmmdd(
     JNIEnv* env, jclass cls, jdoubleArray m1, jdoubleArray m2, jdoubleArray ret,
     jint m1rlen, jint m1clen, jint m2clen, jint numThreads)
 {
@@ -82,17 +63,19 @@ JNIEXPORT jboolean JNICALL Java_org_apache_sysds_utils_NativeHelper_dmmdd(
   double* m2Ptr = GET_DOUBLE_ARRAY(env, m2, numThreads);
   double* retPtr = GET_DOUBLE_ARRAY(env, ret, numThreads);
   if(m1Ptr == NULL || m2Ptr == NULL || retPtr == NULL)
-    return (jboolean) false;
+    return -1;
 
   dmatmult(m1Ptr, m2Ptr, retPtr, (int)m1rlen, (int)m1clen, (int)m2clen, (int)numThreads);
+  size_t nnz = computeNNZ<double>(retPtr, m1rlen * m2clen);
 
   RELEASE_INPUT_ARRAY(env, m1, m1Ptr, numThreads);
   RELEASE_INPUT_ARRAY(env, m2, m2Ptr, numThreads);
   RELEASE_ARRAY(env, ret, retPtr, numThreads); 
-  return (jboolean) true;
+
+  return static_cast<jlong>(nnz);
 }
 
-JNIEXPORT jboolean JNICALL Java_org_apache_sysds_utils_NativeHelper_smmdd(
+JNIEXPORT jlong JNICALL Java_org_apache_sysds_utils_NativeHelper_smmdd(
     JNIEnv* env, jclass cls, jobject m1, jobject m2, jobject ret,
     jint m1rlen, jint m1clen, jint m2clen, jint numThreads)
 {
@@ -100,38 +83,40 @@ JNIEXPORT jboolean JNICALL Java_org_apache_sysds_utils_NativeHelper_smmdd(
   float* m2Ptr = (float*) env->GetDirectBufferAddress(m2);
   float* retPtr = (float*) env->GetDirectBufferAddress(ret);
   if(m1Ptr == NULL || m2Ptr == NULL || retPtr == NULL)
-    return (jboolean) false;
+    return -1;
 
   smatmult(m1Ptr, m2Ptr, retPtr, (int)m1rlen, (int)m1clen, (int)m2clen, (int)numThreads);
 
-  return (jboolean) true;
+  return static_cast<jlong>(computeNNZ<float>(retPtr, m1rlen * m2clen));
 }
 
-JNIEXPORT jboolean JNICALL Java_org_apache_sysds_utils_NativeHelper_tsmm
+JNIEXPORT jlong JNICALL Java_org_apache_sysds_utils_NativeHelper_tsmm
   (JNIEnv * env, jclass cls, jdoubleArray m1, jdoubleArray ret, jint m1rlen, jint m1clen, jboolean leftTrans, jint numThreads) {
   double* m1Ptr = GET_DOUBLE_ARRAY(env, m1, numThreads);
   double* retPtr = GET_DOUBLE_ARRAY(env, ret, numThreads);
   if(m1Ptr == NULL || retPtr == NULL)
-  	return (jboolean) false;
+  	return -1;
 
   tsmm(m1Ptr, retPtr, (int)m1rlen, (int)m1clen, (bool)leftTrans, (int)numThreads);
-  
+  size_t nnz = computeNNZ<double>(retPtr, m1rlen * m1clen);
+
   RELEASE_INPUT_ARRAY(env, m1, m1Ptr, numThreads);
   RELEASE_ARRAY(env, ret, retPtr, numThreads);
-  return (jboolean) true;
+
+  return static_cast<jlong>(nnz);
 }
 
 JNIEXPORT jboolean JNICALL Java_org_apache_sysds_utils_NativeHelper_conv2dSparse
-  (JNIEnv * env, jclass, jint apos, jint alen, jintArray aix, jdoubleArray avals, jdoubleArray filter,
-    jdoubleArray ret, jint N, jint C, jint H, jint W, jint K, jint R, jint S,
-    jint stride_h, jint stride_w, jint pad_h, jint pad_w, jint P, jint Q, jint numThreads) {
+		(JNIEnv * env, jclass, jint apos, jint alen, jintArray aix, jdoubleArray avals, jdoubleArray filter,
+		jdoubleArray ret, jint N, jint C, jint H, jint W, jint K, jint R, jint S,
+		jint stride_h, jint stride_w, jint pad_h, jint pad_w, jint P, jint Q, jint numThreads) {
   int* aixPtr = ((int*)env->GetPrimitiveArrayCritical(aix, NULL));
   double* avalsPtr = GET_DOUBLE_ARRAY(env, avals, numThreads);
   double* filterPtr = GET_DOUBLE_ARRAY(env, filter, numThreads);
   double* retPtr = GET_DOUBLE_ARRAY(env, ret, numThreads);
   
   conv2dSparse((int)apos, (int)alen, aixPtr, avalsPtr, filterPtr, retPtr, (int)N, (int)C, (int)H, (int)W, 
-			(int)K, (int)R, (int)S, (int)stride_h, (int)stride_w, (int)pad_h, (int)pad_w, (int)P, (int)Q, (int)numThreads);
+		(int)K, (int)R, (int)S, (int)stride_h, (int)stride_w, (int)pad_h, (int)pad_w, (int)P, (int)Q, (int)numThreads);
   
   RELEASE_INPUT_ARRAY(env, avals, avalsPtr, numThreads);
   RELEASE_INPUT_ARRAY(env, filter, filterPtr, numThreads);
@@ -141,16 +126,16 @@ JNIEXPORT jboolean JNICALL Java_org_apache_sysds_utils_NativeHelper_conv2dSparse
 }
 
 JNIEXPORT jboolean JNICALL Java_org_apache_sysds_utils_NativeHelper_conv2dBackwardFilterSparseDense
-  (JNIEnv * env, jclass, jint apos, jint alen, jintArray aix, jdoubleArray avals, jdoubleArray dout,  
-  	jdoubleArray ret, jint N, jint C, jint H, jint W, jint K, jint R, jint S,
-    jint stride_h, jint stride_w, jint pad_h, jint pad_w, jint P, jint Q, jint numThreads) {
+		(JNIEnv * env, jclass, jint apos, jint alen, jintArray aix, jdoubleArray avals, jdoubleArray dout,  
+		jdoubleArray ret, jint N, jint C, jint H, jint W, jint K, jint R, jint S,
+		jint stride_h, jint stride_w, jint pad_h, jint pad_w, jint P, jint Q, jint numThreads) {
   int* aixPtr = ((int*)env->GetPrimitiveArrayCritical(aix, NULL));
   double* avalsPtr = GET_DOUBLE_ARRAY(env, avals, numThreads);
   double* doutPtr = GET_DOUBLE_ARRAY(env, dout, numThreads);
   double* retPtr = GET_DOUBLE_ARRAY(env, ret, numThreads);
   
   conv2dBackwardFilterSparseDense((int)apos, (int)alen, aixPtr, avalsPtr, doutPtr, retPtr, (int)N, (int)C, (int)H, (int)W, 
-			(int)K, (int)R, (int)S, (int)stride_h, (int)stride_w, (int)pad_h, (int)pad_w, (int)P, (int)Q, (int)numThreads);
+		(int)K, (int)R, (int)S, (int)stride_h, (int)stride_w, (int)pad_h, (int)pad_w, (int)P, (int)Q, (int)numThreads);
   
   RELEASE_INPUT_ARRAY(env, avals, avalsPtr, numThreads);
   RELEASE_INPUT_ARRAY(env, dout, doutPtr, numThreads);
@@ -159,104 +144,106 @@ JNIEXPORT jboolean JNICALL Java_org_apache_sysds_utils_NativeHelper_conv2dBackwa
   return (jboolean) true;
 }
 
-JNIEXPORT jint JNICALL Java_org_apache_sysds_utils_NativeHelper_conv2dDense(
-    JNIEnv* env, jclass, jdoubleArray input, jdoubleArray filter,
-    jdoubleArray ret, jint N, jint C, jint H, jint W, jint K, jint R, jint S,
-    jint stride_h, jint stride_w, jint pad_h, jint pad_w, jint P, jint Q, jint numThreads)
+JNIEXPORT jlong JNICALL Java_org_apache_sysds_utils_NativeHelper_conv2dDense(
+		JNIEnv* env, jclass, jdoubleArray input, jdoubleArray filter,
+		jdoubleArray ret, jint N, jint C, jint H, jint W, jint K, jint R, jint S,
+		jint stride_h, jint stride_w, jint pad_h, jint pad_w, jint P, jint Q, jint numThreads)
 {
   double* inputPtr = GET_DOUBLE_ARRAY(env, input, numThreads);
   double* filterPtr = GET_DOUBLE_ARRAY(env, filter, numThreads);
   double* retPtr = GET_DOUBLE_ARRAY(env, ret, numThreads);
   if(inputPtr == NULL || filterPtr == NULL || retPtr == NULL)
-    return (jint) -1;
+    return -1;
   
-  int nnz = dconv2dBiasAddDense(inputPtr, 0, filterPtr, retPtr,
-    (int) N, (int) C, (int) H, (int) W, (int) K, (int) R, (int) S,
-    (int) stride_h, (int) stride_w, (int) pad_h, (int) pad_w, (int) P, (int) Q, false, (int) numThreads);
+  size_t nnz = dconv2dBiasAddDense(inputPtr, 0, filterPtr, retPtr, (int) N, (int) C, (int) H, (int) W, (int) K,
+		(int) R, (int) S, (int) stride_h, (int) stride_w, (int) pad_h, (int) pad_w, (int) P,
+		(int) Q, false, (int) numThreads);
   
   RELEASE_INPUT_ARRAY(env, input, inputPtr, numThreads);
   RELEASE_INPUT_ARRAY(env, filter, filterPtr, numThreads);
   RELEASE_ARRAY(env, ret, retPtr, numThreads);
-  return (jint) nnz;
+  return static_cast<jlong>(nnz);
 }
 
-JNIEXPORT jint JNICALL Java_org_apache_sysds_utils_NativeHelper_dconv2dBiasAddDense(
-    JNIEnv* env, jclass, jdoubleArray input, jdoubleArray bias, jdoubleArray filter,
-    jdoubleArray ret, jint N, jint C, jint H, jint W, jint K, jint R, jint S,
-    jint stride_h, jint stride_w, jint pad_h, jint pad_w, jint P, jint Q, jint numThreads)
+JNIEXPORT jlong JNICALL Java_org_apache_sysds_utils_NativeHelper_dconv2dBiasAddDense(
+		JNIEnv* env, jclass, jdoubleArray input, jdoubleArray bias, jdoubleArray filter,
+		jdoubleArray ret, jint N, jint C, jint H, jint W, jint K, jint R, jint S,
+		jint stride_h, jint stride_w, jint pad_h, jint pad_w, jint P, jint Q, jint numThreads)
 {
   double* inputPtr = GET_DOUBLE_ARRAY(env, input, numThreads);
   double* biasPtr = GET_DOUBLE_ARRAY(env, bias, numThreads);
   double* filterPtr = GET_DOUBLE_ARRAY(env, filter, numThreads);
   double* retPtr = GET_DOUBLE_ARRAY(env, ret, numThreads);
   if(inputPtr == NULL || biasPtr == NULL || filterPtr == NULL || retPtr == NULL)
-    return (jint) -1;
+	return -1;
   
-  int nnz = dconv2dBiasAddDense(inputPtr, biasPtr, filterPtr, retPtr,
-    (int) N, (int) C, (int) H, (int) W, (int) K, (int) R, (int) S,
-    (int) stride_h, (int) stride_w, (int) pad_h, (int) pad_w, (int) P, (int) Q, true, (int) numThreads);
+  size_t nnz = dconv2dBiasAddDense(inputPtr, biasPtr, filterPtr, retPtr, (int) N, (int) C, (int) H, (int) W, (int) K,
+		(int) R, (int) S, (int) stride_h, (int) stride_w, (int) pad_h, (int) pad_w, (int) P,
+		(int) Q, true, (int) numThreads);
   
   RELEASE_INPUT_ARRAY(env, input, inputPtr, numThreads);
   RELEASE_INPUT_ARRAY(env, bias, biasPtr, numThreads);
   RELEASE_INPUT_ARRAY(env, filter, filterPtr, numThreads);
   RELEASE_ARRAY(env, ret, retPtr, numThreads);
-  return (jint) nnz;
+  return static_cast<jlong>(nnz);
 }
 
-JNIEXPORT jint JNICALL Java_org_apache_sysds_utils_NativeHelper_sconv2dBiasAddDense(
-    JNIEnv* env, jclass, jobject input, jobject bias, jobject filter,
-    jobject ret, jint N, jint C, jint H, jint W, jint K, jint R, jint S,
-    jint stride_h, jint stride_w, jint pad_h, jint pad_w, jint P, jint Q, jint numThreads) 
+JNIEXPORT jlong JNICALL Java_org_apache_sysds_utils_NativeHelper_sconv2dBiasAddDense(
+		JNIEnv* env, jclass, jobject input, jobject bias, jobject filter,
+		jobject ret, jint N, jint C, jint H, jint W, jint K, jint R, jint S,
+		jint stride_h, jint stride_w, jint pad_h, jint pad_w, jint P, jint Q, jint numThreads) 
 {
   float* inputPtr = (float*) env->GetDirectBufferAddress(input);
   float* biasPtr =  (float*) env->GetDirectBufferAddress(bias);
   float* filterPtr = (float*) env->GetDirectBufferAddress(filter);
   float* retPtr = (float*) env->GetDirectBufferAddress(ret);
   if(inputPtr == NULL || biasPtr == NULL || filterPtr == NULL || retPtr == NULL)
-    return (jint) -1;
+    return -1;
   
-  int nnz = sconv2dBiasAddDense(inputPtr, biasPtr, filterPtr, retPtr, 
-    (int) N, (int) C, (int) H, (int) W, (int) K, (int) R, (int) S,
-    (int) stride_h, (int) stride_w, (int) pad_h, (int) pad_w, (int) P, (int) Q, true, (int) numThreads);
-  
-  return (jint) nnz;
+  size_t nnz = sconv2dBiasAddDense(inputPtr, biasPtr, filterPtr, retPtr, (int) N, (int) C, (int) H, (int) W, (int) K,
+		(int) R, (int) S, (int) stride_h, (int) stride_w, (int) pad_h, (int) pad_w, (int) P,
+		(int) Q, true, (int) numThreads);
+
+  return static_cast<jlong>(nnz);
 }
 
-JNIEXPORT jint JNICALL Java_org_apache_sysds_utils_NativeHelper_conv2dBackwardDataDense(
-	JNIEnv* env, jclass, jdoubleArray filter, jdoubleArray dout,
-    jdoubleArray ret, jint N, jint C, jint H, jint W, jint K, jint R, jint S,
-    jint stride_h, jint stride_w, jint pad_h, jint pad_w, jint P, jint Q, jint numThreads) {
+JNIEXPORT jlong JNICALL Java_org_apache_sysds_utils_NativeHelper_conv2dBackwardDataDense(
+		JNIEnv* env, jclass, jdoubleArray filter, jdoubleArray dout,
+		jdoubleArray ret, jint N, jint C, jint H, jint W, jint K, jint R, jint S,
+		jint stride_h, jint stride_w, jint pad_h, jint pad_w, jint P, jint Q, jint numThreads) {
   
   double* filterPtr = GET_DOUBLE_ARRAY(env, filter, numThreads);
   double* doutPtr = GET_DOUBLE_ARRAY(env, dout, numThreads);
   double* retPtr = GET_DOUBLE_ARRAY(env, ret, numThreads);
   if(doutPtr == NULL || filterPtr == NULL || retPtr == NULL)
-  	return (jint) -1;
+  	return -1;
   
-  int nnz = conv2dBackwardDataDense(filterPtr, doutPtr, retPtr, (int) N, (int) C, (int) H, (int) W, (int) K, (int) R, (int) S,
-    (int) stride_h, (int) stride_w, (int) pad_h, (int) pad_w, (int) P, (int) Q, (int) numThreads);
+  size_t nnz = conv2dBackwardDataDense(filterPtr, doutPtr, retPtr, (int) N, (int) C, (int) H, (int) W, (int) K,
+		(int) R, (int) S, (int) stride_h, (int) stride_w, (int) pad_h, (int) pad_w,
+		(int) P, (int) Q, (int) numThreads);
   
   RELEASE_INPUT_ARRAY(env, filter, filterPtr, numThreads);
   RELEASE_INPUT_ARRAY(env, dout, doutPtr, numThreads);
   RELEASE_ARRAY(env, ret, retPtr, numThreads);
-  return (jint) nnz;
+  return static_cast<jlong>(nnz);
 }
 
-JNIEXPORT jint JNICALL Java_org_apache_sysds_utils_NativeHelper_conv2dBackwardFilterDense(
-	JNIEnv* env, jclass, jdoubleArray input, jdoubleArray dout,
-    jdoubleArray ret, jint N, jint C, jint H, jint W, jint K, jint R, jint S,
-    jint stride_h, jint stride_w, jint pad_h, jint pad_w, jint P, jint Q, jint numThreads) {
+JNIEXPORT jlong JNICALL Java_org_apache_sysds_utils_NativeHelper_conv2dBackwardFilterDense(
+		JNIEnv* env, jclass, jdoubleArray input, jdoubleArray dout,
+		jdoubleArray ret, jint N, jint C, jint H, jint W, jint K, jint R, jint S,
+		jint stride_h, jint stride_w, jint pad_h, jint pad_w, jint P, jint Q, jint numThreads) {
   double* inputPtr = GET_DOUBLE_ARRAY(env, input, numThreads);
   double* doutPtr = GET_DOUBLE_ARRAY(env, dout, numThreads);
   double* retPtr = GET_DOUBLE_ARRAY(env, ret, numThreads);
   if(doutPtr == NULL || inputPtr == NULL || retPtr == NULL)
-  	return (jint) -1;
+  	return -1;
   
-  int nnz = conv2dBackwardFilterDense(inputPtr, doutPtr, retPtr, (int) N, (int) C, (int) H, (int) W, (int) K, (int) R, (int) S,
-    (int) stride_h, (int) stride_w, (int) pad_h, (int) pad_w, (int) P, (int) Q, (int) numThreads);
+  size_t nnz = conv2dBackwardFilterDense(inputPtr, doutPtr, retPtr, (int)N, (int) C, (int) H, (int) W, (int) K, (int) R,
+		(int) S, (int) stride_h, (int) stride_w, (int) pad_h, (int) pad_w, (int) P,
+		(int) Q, (int) numThreads);
   
   RELEASE_INPUT_ARRAY(env, input, inputPtr, numThreads);
   RELEASE_INPUT_ARRAY(env, dout, doutPtr, numThreads);
   RELEASE_ARRAY(env, ret, retPtr, numThreads);
-  return (jint) nnz;
+  return static_cast<jlong>(nnz);
 }
diff --git a/src/main/cpp/systemds.h b/src/main/cpp/systemds.h
index d84392f..b4f741f 100644
--- a/src/main/cpp/systemds.h
+++ b/src/main/cpp/systemds.h
@@ -29,25 +29,25 @@ extern "C" {
 /*
  * Class:     org_apache_sysds_utils_NativeHelper
  * Method:    dmmdd
- * Signature: ([D[D[DIIII)Z
+ * Signature: ([D[D[DIIII)I
  */
-JNIEXPORT jboolean JNICALL Java_org_apache_sysds_utils_NativeHelper_dmmdd
+JNIEXPORT jlong JNICALL Java_org_apache_sysds_utils_NativeHelper_dmmdd
   (JNIEnv *, jclass, jdoubleArray, jdoubleArray, jdoubleArray, jint, jint, jint, jint);
 
 /*
  * Class:     org_apache_sysds_utils_NativeHelper
  * Method:    smmdd
- * Signature: (Ljava/nio/FloatBuffer;Ljava/nio/FloatBuffer;Ljava/nio/FloatBuffer;IIII)Z
+ * Signature: (Ljava/nio/FloatBuffer;Ljava/nio/FloatBuffer;Ljava/nio/FloatBuffer;IIII)J
  */
-JNIEXPORT jboolean JNICALL Java_org_apache_sysds_utils_NativeHelper_smmdd
+JNIEXPORT jlong JNICALL Java_org_apache_sysds_utils_NativeHelper_smmdd
   (JNIEnv *, jclass, jobject, jobject, jobject, jint, jint, jint, jint);
 
 /*
  * Class:     org_apache_sysds_utils_NativeHelper
  * Method:    tsmm
- * Signature: ([D[DIIZI)Z
+ * Signature: ([D[DIIZI)J
  */
-JNIEXPORT jboolean JNICALL Java_org_apache_sysds_utils_NativeHelper_tsmm
+JNIEXPORT jlong JNICALL Java_org_apache_sysds_utils_NativeHelper_tsmm
   (JNIEnv *, jclass, jdoubleArray, jdoubleArray, jint, jint, jboolean, jint);
 
 /*
@@ -55,7 +55,7 @@ JNIEXPORT jboolean JNICALL Java_org_apache_sysds_utils_NativeHelper_tsmm
  * Method:    conv2dDense
  * Signature: ([D[D[DIIIIIIIIIIIIII)I
  */
-JNIEXPORT jint JNICALL Java_org_apache_sysds_utils_NativeHelper_conv2dDense
+JNIEXPORT jlong JNICALL Java_org_apache_sysds_utils_NativeHelper_conv2dDense
   (JNIEnv *, jclass, jdoubleArray, jdoubleArray, jdoubleArray, jint, jint, jint, jint, jint, jint, jint, jint, jint, jint, jint, jint, jint, jint);
 
 /*
@@ -63,7 +63,7 @@ JNIEXPORT jint JNICALL Java_org_apache_sysds_utils_NativeHelper_conv2dDense
  * Method:    dconv2dBiasAddDense
  * Signature: ([D[D[D[DIIIIIIIIIIIIII)I
  */
-JNIEXPORT jint JNICALL Java_org_apache_sysds_utils_NativeHelper_dconv2dBiasAddDense
+JNIEXPORT jlong JNICALL Java_org_apache_sysds_utils_NativeHelper_dconv2dBiasAddDense
   (JNIEnv *, jclass, jdoubleArray, jdoubleArray, jdoubleArray, jdoubleArray, jint, jint, jint, jint, jint, jint, jint, jint, jint, jint, jint, jint, jint, jint);
 
 /*
@@ -71,7 +71,7 @@ JNIEXPORT jint JNICALL Java_org_apache_sysds_utils_NativeHelper_dconv2dBiasAddDe
  * Method:    sconv2dBiasAddDense
  * Signature: (Ljava/nio/FloatBuffer;Ljava/nio/FloatBuffer;Ljava/nio/FloatBuffer;Ljava/nio/FloatBuffer;IIIIIIIIIIIIII)I
  */
-JNIEXPORT jint JNICALL Java_org_apache_sysds_utils_NativeHelper_sconv2dBiasAddDense
+JNIEXPORT jlong JNICALL Java_org_apache_sysds_utils_NativeHelper_sconv2dBiasAddDense
   (JNIEnv *, jclass, jobject, jobject, jobject, jobject, jint, jint, jint, jint, jint, jint, jint, jint, jint, jint, jint, jint, jint, jint);
 
 /*
@@ -79,7 +79,7 @@ JNIEXPORT jint JNICALL Java_org_apache_sysds_utils_NativeHelper_sconv2dBiasAddDe
  * Method:    conv2dBackwardFilterDense
  * Signature: ([D[D[DIIIIIIIIIIIIII)I
  */
-JNIEXPORT jint JNICALL Java_org_apache_sysds_utils_NativeHelper_conv2dBackwardFilterDense
+JNIEXPORT jlong JNICALL Java_org_apache_sysds_utils_NativeHelper_conv2dBackwardFilterDense
   (JNIEnv *, jclass, jdoubleArray, jdoubleArray, jdoubleArray, jint, jint, jint, jint, jint, jint, jint, jint, jint, jint, jint, jint, jint, jint);
 
 /*
@@ -87,7 +87,7 @@ JNIEXPORT jint JNICALL Java_org_apache_sysds_utils_NativeHelper_conv2dBackwardFi
  * Method:    conv2dBackwardDataDense
  * Signature: ([D[D[DIIIIIIIIIIIIII)I
  */
-JNIEXPORT jint JNICALL Java_org_apache_sysds_utils_NativeHelper_conv2dBackwardDataDense
+JNIEXPORT jlong JNICALL Java_org_apache_sysds_utils_NativeHelper_conv2dBackwardDataDense
   (JNIEnv *, jclass, jdoubleArray, jdoubleArray, jdoubleArray, jint, jint, jint, jint, jint, jint, jint, jint, jint, jint, jint, jint, jint, jint);
 
 /*
diff --git a/src/main/java/org/apache/sysds/runtime/matrix/data/LibMatrixMult.java b/src/main/java/org/apache/sysds/runtime/matrix/data/LibMatrixMult.java
index 228aec7..f8d5436 100644
--- a/src/main/java/org/apache/sysds/runtime/matrix/data/LibMatrixMult.java
+++ b/src/main/java/org/apache/sysds/runtime/matrix/data/LibMatrixMult.java
@@ -28,6 +28,8 @@ import java.util.concurrent.ExecutorService;
 import java.util.concurrent.Future;
 import java.util.stream.IntStream;
 
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
 import org.apache.commons.math3.util.FastMath;
 import org.apache.sysds.hops.OptimizerUtils;
 import org.apache.sysds.lops.MapMultChain.ChainType;
@@ -68,7 +70,8 @@ public class LibMatrixMult
 	private static final long PAR_MINFLOP_THRESHOLD2 = 128L*1024; //MIN 2 MFLOP
 	public static final int L2_CACHESIZE = 256 * 1024; //256KB (common size)
 	public static final int L3_CACHESIZE = 16 * 1024 * 1024; //16MB (common size)
-	
+	private static final Log LOG = LogFactory.getLog(LibMatrixMult.class.getName());
+
 	private LibMatrixMult() {
 		//prevent instantiation via private constructor
 	}
@@ -2490,9 +2493,13 @@ public class LibMatrixMult
 		
 		//call native matrix multiplication (only called for single-threaded and matrix-vector
 		//because this ensures that we can deal with the transpose mV without additional transpose)
-		if(!NativeHelper.dmmdd(((m==1)?mV:mU).getDenseBlockValues(),
-			((m==1)?mU:mV).getDenseBlockValues(), c, (m==1)?n:m, cd, 1, 1) )
-			throw new DMLRuntimeException("Error executing native matrix mult.");
+		long nnz =NativeHelper.dmmdd(((m==1)?mV:mU).getDenseBlockValues(),
+			((m==1)?mU:mV).getDenseBlockValues(), c, (m==1)?n:m, cd, 1, 1);
+		if(nnz < 0) {
+			//fallback to default java implementation
+			LOG.warn("matrixMultWSigmoidDenseNative: Native mat mult failed. Falling back to java version.");
+			matrixMult(((m==1)?mV:mU), ((m==1)?mU:mV), ret, false);
+		}
 		
 		//compute remaining wsigmoid for all relevant outputs
 		for( int i=0; i<m*n; i++ ) {
diff --git a/src/main/java/org/apache/sysds/runtime/matrix/data/LibMatrixNative.java b/src/main/java/org/apache/sysds/runtime/matrix/data/LibMatrixNative.java
index 5d33e2e..d64a936 100644
--- a/src/main/java/org/apache/sysds/runtime/matrix/data/LibMatrixNative.java
+++ b/src/main/java/org/apache/sysds/runtime/matrix/data/LibMatrixNative.java
@@ -88,25 +88,26 @@ public class LibMatrixNative
 			ret.sparse = false;
 			ret.allocateDenseBlock();
 			long start = DMLScript.STATISTICS ? System.nanoTime() : 0;
-			boolean rccode = false;
+			long nnz;
 			if( isSinglePrecision() ) {
 				FloatBuffer fin1 = toFloatBuffer(m1.getDenseBlockValues(), inBuff, true);
 				FloatBuffer fin2 = toFloatBuffer(m2.getDenseBlockValues(), filterBuff, true);
 				FloatBuffer fout = toFloatBuffer(ret.getDenseBlockValues(), outBuff, false);
-				rccode = NativeHelper.smmdd(fin1, fin2, fout, 
+				nnz = NativeHelper.smmdd(fin1, fin2, fout, 
 					m1.getNumRows(), m1.getNumColumns(), m2.getNumColumns(), k);
 				fromFloatBuffer(outBuff.get(), ret.getDenseBlockValues());
 			}
 			else {
-				rccode = NativeHelper.dmmdd(m1.getDenseBlockValues(), m2.getDenseBlockValues(),
+				nnz = NativeHelper.dmmdd(m1.getDenseBlockValues(), m2.getDenseBlockValues(),
 					ret.getDenseBlockValues(), m1.getNumRows(), m1.getNumColumns(), m2.getNumColumns(), k);
 			}
-			if (rccode) {
+			
+			if(nnz > -1) {
 				if(DMLScript.STATISTICS) {
 					Statistics.nativeLibMatrixMultTime += System.nanoTime() - start;
 					Statistics.numNativeLibMatrixMultCalls.increment();
 				}
-				ret.recomputeNonZeros();
+				ret.setNonZeros(nnz);
 				if(examSparsity)
 					ret.examSparsity();
 				return;
@@ -114,6 +115,8 @@ public class LibMatrixNative
 			//else record failure and fallback to java
 			Statistics.incrementNativeFailuresCounter();
 		}
+		//fallback to default java implementation
+		LOG.warn("matrixMult: Native mat mult failed. Falling back to java version.");
 		if (k == 1)
 			LibMatrixMult.matrixMult(m1, m2, ret, !examSparsity);
 		else
@@ -127,18 +130,16 @@ public class LibMatrixNative
 			&& (!m1.sparse && m1.getDenseBlock().isContiguous() ) ) {
 			ret.sparse = false;
 			ret.allocateDenseBlock();
-			if( NativeHelper.tsmm(m1.getDenseBlockValues(), 
-				ret.getDenseBlockValues(), m1.rlen, m1.clen, leftTrans, k) ) 
-			{
-				// ToDo: add native tsmm() to stats
-				long nnz = (ret.clen==1) ? ret.recomputeNonZeros() :
-					LibMatrixMult.copyUpperToLowerTriangle(ret);
+			
+			long nnz = NativeHelper.tsmm(m1.getDenseBlockValues(), ret.getDenseBlockValues(), 
+										 m1.rlen, m1.clen, leftTrans, k); 
+			if(nnz > -1) {
 				ret.setNonZeros(nnz);
 				ret.examSparsity();
 				return;
 			}
 			//fallback to default java implementation
-			LOG.info("Falling back to java TSMM()");
+			LOG.warn("Native TSMM failed. Falling back to java version.");
 			Statistics.incrementNativeFailuresCounter();
 		}
 		if( k > 1 )
@@ -161,7 +162,7 @@ public class LibMatrixNative
 		if(NativeHelper.isNativeLibraryLoaded() && !input.isInSparseFormat() && !filter.isInSparseFormat()) {
 			setNumThreads(params);
 			long start = DMLScript.STATISTICS ? System.nanoTime() : 0;
-			int nnz = 0;
+			long nnz;
 			if(params.bias == null) {
 				nnz = NativeHelper.conv2dDense(input.getDenseBlockValues(), filter.getDenseBlockValues(),
 						outputBlock.getDenseBlockValues(), params.N, params.C, params.H, params.W, 
@@ -237,7 +238,7 @@ public class LibMatrixNative
 		if(NativeHelper.isNativeLibraryLoaded() && !dout.isInSparseFormat() && !input.isInSparseFormat()) {
 			setNumThreads(params);
 			long start = DMLScript.STATISTICS ? System.nanoTime() : 0;
-			int nnz = NativeHelper.conv2dBackwardFilterDense(input.getDenseBlockValues(), dout.getDenseBlockValues(),
+			long nnz = NativeHelper.conv2dBackwardFilterDense(input.getDenseBlockValues(), dout.getDenseBlockValues(),
 					outputBlock.getDenseBlockValues(), params.N, params.C, params.H, params.W, 
 					params.K, params.R, params.S, params.stride_h, params.stride_w, params.pad_h, params.pad_w, 
 					params.P, params.Q, params.numThreads);
@@ -273,7 +274,7 @@ public class LibMatrixNative
 		if(NativeHelper.isNativeLibraryLoaded() && !dout.isInSparseFormat() && !filter.isInSparseFormat()) {
 			setNumThreads(params);
 			long start = DMLScript.STATISTICS ? System.nanoTime() : 0;
-			int nnz = NativeHelper.conv2dBackwardDataDense(filter.getDenseBlockValues(), dout.getDenseBlockValues(),
+			long nnz = NativeHelper.conv2dBackwardDataDense(filter.getDenseBlockValues(), dout.getDenseBlockValues(),
 					outputBlock.getDenseBlockValues(), params.N, params.C, params.H, params.W, 
 					params.K, params.R, params.S, params.stride_h, params.stride_w, params.pad_h, params.pad_w, 
 					params.P, params.Q, params.numThreads);
diff --git a/src/main/java/org/apache/sysds/utils/NativeHelper.java b/src/main/java/org/apache/sysds/utils/NativeHelper.java
index cd9f8f9..83869c2 100644
--- a/src/main/java/org/apache/sysds/utils/NativeHelper.java
+++ b/src/main/java/org/apache/sysds/utils/NativeHelper.java
@@ -333,14 +333,15 @@ public class NativeHelper {
 	 * @return true if successfully loaded BLAS
 	 */
 	public static boolean loadLibraryHelperFromResource(String libFileName)  {
-		OutputStream out = null;
 		try(InputStream in = NativeHelper.class.getResourceAsStream("/lib/"+ libFileName)) {
 			// This logic is added because Java does not allow to load library from a resource file.
 			if(in != null) {
 				File temp = File.createTempFile(libFileName, "");
 				temp.deleteOnExit();
-				out = FileUtils.openOutputStream(temp);
+				OutputStream out = FileUtils.openOutputStream(temp);
 				IOUtils.copy(in, out);
+				// not closing the stream here makes dll loading fail on Windows
+				IOUtilFunctions.closeSilently(out);
 				System.load(temp.getAbsolutePath());
 				return true;
 			}
@@ -350,22 +351,19 @@ public class NativeHelper {
 		catch(IOException e) {
 			LOG.warn("Unable to load library " + libFileName + " from resource:" + e.getMessage());
 		}
-		finally {
-			IOUtilFunctions.closeSilently(out);
-		}
 		return false;
 	}
 
 	// TODO: Add pmm, wsloss, mmchain, etc.
 	
 	//double-precision matrix multiply dense-dense
-	public static native boolean dmmdd(double [] m1, double [] m2, double [] ret, int m1rlen, int m1clen, int m2clen,
+	public static native long dmmdd(double [] m1, double [] m2, double [] ret, int m1rlen, int m1clen, int m2clen,
 									   int numThreads);
 	//single-precision matrix multiply dense-dense
-	public static native boolean smmdd(FloatBuffer m1, FloatBuffer m2, FloatBuffer ret, int m1rlen, int m1clen, int m2clen,
+	public static native long smmdd(FloatBuffer m1, FloatBuffer m2, FloatBuffer ret, int m1rlen, int m1clen, int m2clen,
 									   int numThreads);
 	//transpose-self matrix multiply
-	public static native boolean tsmm(double[] m1, double[] ret, int m1rlen, int m1clen, boolean leftTrans, int numThreads);
+	public static native long tsmm(double[] m1, double[] ret, int m1rlen, int m1clen, boolean leftTrans, int numThreads);
 
 	// ----------------------------------------------------------------------------------------------------------------
 	// LibMatrixDNN operations:
@@ -376,25 +374,25 @@ public class NativeHelper {
 
 	// Returns -1 if failures or returns number of nonzeros
 	// Called by DnnCPInstruction if both input and filter are dense
-	public static native int conv2dDense(double [] input, double [] filter, double [] ret, int N, int C, int H, int W, 
+	public static native long conv2dDense(double [] input, double [] filter, double [] ret, int N, int C, int H, int W, 
 			int K, int R, int S, int stride_h, int stride_w, int pad_h, int pad_w, int P, int Q, int numThreads);
 
-	public static native int dconv2dBiasAddDense(double [] input, double [] bias, double [] filter, double [] ret, int N,
+	public static native long dconv2dBiasAddDense(double [] input, double [] bias, double [] filter, double [] ret, int N,
 		int C, int H, int W, int K, int R, int S, int stride_h, int stride_w, int pad_h, int pad_w, int P, int Q,
 												 int numThreads);
 
-	public static native int sconv2dBiasAddDense(FloatBuffer input, FloatBuffer bias, FloatBuffer filter, FloatBuffer ret,
+	public static native long sconv2dBiasAddDense(FloatBuffer input, FloatBuffer bias, FloatBuffer filter, FloatBuffer ret,
 		int N, int C, int H, int W, int K, int R, int S, int stride_h, int stride_w, int pad_h, int pad_w, int P, int Q,
 												 int numThreads);
 
 	// Called by DnnCPInstruction if both input and filter are dense
-	public static native int conv2dBackwardFilterDense(double [] input, double [] dout, double [] ret, int N, int C,
+	public static native long conv2dBackwardFilterDense(double [] input, double [] dout, double [] ret, int N, int C,
 													   int H, int W, int K, int R, int S, int stride_h, int stride_w,
 													   int pad_h, int pad_w, int P, int Q, int numThreads);
 
 	// If both filter and dout are dense, then called by DnnCPInstruction
 	// Else, called by LibMatrixDNN's thread if filter is dense. dout[n] is converted to dense if sparse.
-	public static native int conv2dBackwardDataDense(double [] filter, double [] dout, double [] ret, int N, int C,
+	public static native long conv2dBackwardDataDense(double [] filter, double [] dout, double [] ret, int N, int C,
 													 int H, int W, int K, int R, int S, int stride_h, int stride_w,
 													 int pad_h, int pad_w, int P, int Q, int numThreads);