Posted to commits@systemml.apache.org by ni...@apache.org on 2017/05/09 03:57:09 UTC

incubator-systemml git commit: [SYSTEMML-540] [MINOR] Added additional performance counters for native invocation and improved SGDNesterov

Repository: incubator-systemml
Updated Branches:
  refs/heads/master b8de68b74 -> 5489c665d


[SYSTEMML-540] [MINOR] Added additional performance counters for native invocation and improved SGDNesterov

Closes #485.
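
The main behavioral change: the dense conv2d JNI entry points now return the
number of nonzeros (nnz) in the output, or -1 on failure, instead of a boolean
success flag, so the Java side can set the nnz metadata directly rather than
rescanning the result. A minimal caller-side sketch of the new convention
(the surrounding scaffolding is illustrative; the real call sites are in
LibMatrixNative.java below):

    int nnz = NativeHelper.conv2dDense(input, filter, ret, N, C, H, W,
        K, R, S, stride_h, stride_w, pad_h, pad_w, P, Q, numThreads);
    if (nnz != -1) {
      outputBlock.setNonZeros(nnz); // nnz computed natively; no Java-side recount
    } else {
      // native call failed (e.g. array pinning returned NULL); fall back to Java
    }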


Project: http://git-wip-us.apache.org/repos/asf/incubator-systemml/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-systemml/commit/5489c665
Tree: http://git-wip-us.apache.org/repos/asf/incubator-systemml/tree/5489c665
Diff: http://git-wip-us.apache.org/repos/asf/incubator-systemml/diff/5489c665

Branch: refs/heads/master
Commit: 5489c665dcb434d88c165492557ef14ad284b69f
Parents: b8de68b
Author: Niketan Pansare <np...@us.ibm.com>
Authored: Mon May 8 20:55:24 2017 -0700
Committer: Niketan Pansare <np...@us.ibm.com>
Committed: Mon May 8 20:56:11 2017 -0700

----------------------------------------------------------------------
 .../cpp/lib/libsystemml_mkl-Linux-x86_64.so     | Bin 27408 -> 31824 bytes
 .../lib/libsystemml_openblas-Linux-x86_64.so    | Bin 27416 -> 27520 bytes
 src/main/cpp/libmatrixdnn.cpp                   |  54 ++++++++++------
 src/main/cpp/libmatrixdnn.h                     |   6 +-
 src/main/cpp/systemml.cpp                       |  32 +++++-----
 src/main/cpp/systemml.h                         |  29 ++++-----
 .../sysml/runtime/matrix/data/LibMatrixDNN.java |  24 ++++---
 .../runtime/matrix/data/LibMatrixNative.java    |  63 +++++++++++++------
 .../sysml/runtime/matrix/data/MatrixBlock.java  |  14 +++++
 .../apache/sysml/udf/lib/SGDNesterovUpdate.java |  13 ++--
 .../org/apache/sysml/utils/NativeHelper.java    |   9 +--
 .../java/org/apache/sysml/utils/Statistics.java |  45 +++++++++++--
 .../org/apache/sysml/api/dl/Caffe2DML.scala     |   2 +-
 .../org/apache/sysml/api/dl/CaffeSolver.scala   |   9 ++-
 14 files changed, 199 insertions(+), 101 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/5489c665/src/main/cpp/lib/libsystemml_mkl-Linux-x86_64.so
----------------------------------------------------------------------
diff --git a/src/main/cpp/lib/libsystemml_mkl-Linux-x86_64.so b/src/main/cpp/lib/libsystemml_mkl-Linux-x86_64.so
index 0a6427a..a740930 100755
Binary files a/src/main/cpp/lib/libsystemml_mkl-Linux-x86_64.so and b/src/main/cpp/lib/libsystemml_mkl-Linux-x86_64.so differ

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/5489c665/src/main/cpp/lib/libsystemml_openblas-Linux-x86_64.so
----------------------------------------------------------------------
diff --git a/src/main/cpp/lib/libsystemml_openblas-Linux-x86_64.so b/src/main/cpp/lib/libsystemml_openblas-Linux-x86_64.so
index ffdcd5a..dfd1ecb 100755
Binary files a/src/main/cpp/lib/libsystemml_openblas-Linux-x86_64.so and b/src/main/cpp/lib/libsystemml_openblas-Linux-x86_64.so differ

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/5489c665/src/main/cpp/libmatrixdnn.cpp
----------------------------------------------------------------------
diff --git a/src/main/cpp/libmatrixdnn.cpp b/src/main/cpp/libmatrixdnn.cpp
index a521804..ed02042 100644
--- a/src/main/cpp/libmatrixdnn.cpp
+++ b/src/main/cpp/libmatrixdnn.cpp
@@ -27,6 +27,14 @@
 #include <cstring>
 #include "omp.h"
 
+int computeNNZ(double* arr, int limit) {
+  int nnz = 0;
+  #pragma omp parallel for reduction(+: nnz)
+  for(int i=0; i<limit; i++)
+    nnz += (arr[i]!=0) ? 1 : 0;
+  return nnz;
+}
+
 void rotate180(double* inputArray, double* outputArray, int N, int C, int H, int W,
             int K, int R, int S, int stride_h, int stride_w, int pad_h,
             int pad_w, int P, int Q) {
@@ -124,7 +132,7 @@ void im2col(double* inputArray, double* outputArray, int N, int C, int H, int W,
 } 
 
 
-void conv2dBackwardFilterDense(double* inputPtr, double* doutPtr, double* retPtr, int N, int C, int H, int W, int K, int R, int S,
+int conv2dBackwardFilterDense(double* inputPtr, double* doutPtr, double* retPtr, int N, int C, int H, int W, int K, int R, int S,
     int stride_h, int stride_w, int pad_h, int pad_w, int P, int Q, int numThreads) {
   // First step: Avoids oversubscription and other openmp/internal blas threading issues
   setNumThreadsForBLAS(1);
@@ -150,7 +158,8 @@ void conv2dBackwardFilterDense(double* inputPtr, double* doutPtr, double* retPtr
 
 #pragma omp parallel for num_threads(numOpenMPThreads)
   for (int n = 0; n < N; n++) {
-  	double* loweredMat = loweredMatArrays + numIm2ColElem*omp_get_thread_num();
+    int threadID = omp_get_thread_num();
+  	double* loweredMat = loweredMatArrays + numIm2ColElem*threadID;
 
     // Step 1: Perform im2col
     im2col(inputPtr + n * CHW, loweredMat, 1, C, H, W, K,
@@ -158,18 +167,22 @@ void conv2dBackwardFilterDense(double* inputPtr, double* doutPtr, double* retPtr
            P, Q);
            
     // Step 2: Rotate dout
-    double* rotatedDoutPtr = rotatedDoutPtrArrays + numRotatedElem*omp_get_thread_num();
+    double* rotatedDoutPtr = rotatedDoutPtrArrays + numRotatedElem*threadID;
     rotate180(doutPtr + n * KPQ, rotatedDoutPtr, 1, C, H, W, K,
            R, S, stride_h, stride_w, pad_h, pad_w,
            P, Q);
     
-    // Multiply to get CRS X K
-    double* temp1 = temp + numTempElem*omp_get_thread_num();
-    // Step 3: loweredMat (CRS X PQ) %*% rotated_dout (PQ X K) 
-    matmult(loweredMat, rotatedDoutPtr, temp1, C * R * S, P * Q, K, 1);
-              
+    // Multiply to get tmp1 = CRS X K
+    double* temp1 = temp + numTempElem*threadID;
+    // Step 3: temp1 = alpha * (loweredMat (CRS X PQ) %*% rotated_dout (PQ X K)) + beta*temp1
+    int m1rlen = C * R * S; int m1clen = P * Q; int m2clen = K;
+    double* m1Ptr = loweredMat; double* m2Ptr = rotatedDoutPtr; double alpha = 1; double beta = 1;
+    cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, m1rlen, m2clen, m1clen, alpha, m1Ptr, m1clen, m2Ptr, m2clen, beta, temp1, m2clen);
   } // end omp parallel for
   
+  delete [] loweredMatArrays;
+  delete [] rotatedDoutPtrArrays;
+  
   // Inplace transpose addition
   int numRow = CRS;
   for(int t = 0; t < numOpenMPThreads; t++) {
@@ -184,11 +197,10 @@ void conv2dBackwardFilterDense(double* inputPtr, double* doutPtr, double* retPtr
   }
   
   delete [] temp;
-  delete [] loweredMatArrays;
-  delete [] rotatedDoutPtrArrays;
+  return computeNNZ(retPtr, K*CRS);
 }
 
-void conv2dBackwardDataDense(double* filterPtr, double* doutPtr, double* retPtr, int N, int C, int H, int W, int K, int R, int S,
+int conv2dBackwardDataDense(double* filterPtr, double* doutPtr, double* retPtr, int N, int C, int H, int W, int K, int R, int S,
     int stride_h, int stride_w, int pad_h, int pad_w, int P, int Q, int numThreads) {
    // First step: Avoids oversubscription and other openmp/internal blas threading issues
   setNumThreadsForBLAS(1);
@@ -207,27 +219,28 @@ void conv2dBackwardDataDense(double* filterPtr, double* doutPtr, double* retPtr,
 
 #pragma omp parallel for num_threads(numOpenMPThreads)
   for (int n = 0; n < N; n++) {
+    int threadID = omp_get_thread_num();
     // Step 1: Rotate dout
-    double* rotatedDoutPtr = rotatedDoutPtrArrays + numRotatedElem*omp_get_thread_num();
+    double* rotatedDoutPtr = rotatedDoutPtrArrays + numRotatedElem*threadID;
     rotate180(doutPtr + n * KPQ, rotatedDoutPtr, 1, C, H, W, K,
            R, S, stride_h, stride_w, pad_h, pad_w,
            P, Q);
 
     // Step 2: t(rotatedDout (PQ X K) %*% filter (K X CRS))
-    double* col2imInput = col2imInputArrays + numCol2ImElem*omp_get_thread_num();
+    double* col2imInput = col2imInputArrays + numCol2ImElem*threadID;
     matmult(rotatedDoutPtr, filterPtr, col2imInput,
             PQ, K, CRS, 1);
 
     // Step 3: Perform col2im
-    col2im(col2imInput, retPtr + n * CHW, 1, C, H, W, K,
+    double* outputArr = retPtr + n * CHW;
+    col2im(col2imInput, outputArr, 1, C, H, W, K,
            R, S, stride_h, stride_w, pad_h, pad_w,
            P, Q);
-
   } // end omp parallel for
   
   delete [] rotatedDoutPtrArrays;
   delete [] col2imInputArrays;
-    
+  return computeNNZ(retPtr, N*CHW);
 }
 
 void conv2dSparse(int apos, int alen, int* aix, double* avals, double* filterPtr, double* retPtr, int N, int C, int H, int W, 
@@ -290,7 +303,8 @@ void conv2dBackwardFilterSparseDense(int apos, int alen, int* aix, double* avals
 	delete [] temp1;
 }
 
-void conv2dBiasAddDense(double* inputPtr, double* biasPtr, double* filterPtr, double* retPtr, int N, int C, int H, int W, int K, int R, int S,
+
+int conv2dBiasAddDense(double* inputPtr, double* biasPtr, double* filterPtr, double* retPtr, int N, int C, int H, int W, int K, int R, int S,
     int stride_h, int stride_w, int pad_h, int pad_w, int P, int Q, bool addBias, int numThreads) {
   // First step:  Avoids oversubscription and other openmp/internal blas threading issues
   setNumThreadsForBLAS(1);
@@ -306,7 +320,8 @@ void conv2dBiasAddDense(double* inputPtr, double* biasPtr, double* filterPtr, do
   
 #pragma omp parallel for num_threads(numOpenMPThreads)
   for (int n = 0; n < N; n++) {
-    double* loweredMat = loweredMatArrays + numIm2ColElem*omp_get_thread_num();
+    int threadID = omp_get_thread_num();
+    double* loweredMat = loweredMatArrays + numIm2ColElem*threadID;
 
     // Step 1: Perform im2col
     im2col(inputPtr + n * CHW, loweredMat, 1, C, H, W, K,
@@ -318,8 +333,8 @@ void conv2dBiasAddDense(double* inputPtr, double* biasPtr, double* filterPtr, do
             C * R * S, P * Q, 1);
     
     // Step 3: Add bias
+    double* outputArr = retPtr + n*KPQ;
     if(addBias) {
-	    double* outputArr = retPtr + n*KPQ;
 	    int index = 0;
 		for(int k = 0; k < K; k++) {
 			for(int pq = 0; pq < PQ; pq++, index++) {
@@ -330,4 +345,5 @@ void conv2dBiasAddDense(double* inputPtr, double* biasPtr, double* filterPtr, do
   } // end omp parallel for
   
   delete [] loweredMatArrays;
+  return computeNNZ(retPtr, N*KPQ);
 }
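
Note: the new computeNNZ helper above counts nonzeros with an OpenMP
reduction while the result array is still warm in cache, which is what lets
the JNI layer return the count essentially for free. For reference, a rough
Java counterpart of the same counting logic (illustrative only; SystemML's
Java path uses the sequential loop in MatrixBlock.recomputeNonZeros):

    static int computeNNZ(double[] arr, int limit) {
      // parallel count of nonzero entries, mirroring the OpenMP reduction
      return (int) java.util.stream.IntStream.range(0, limit)
          .parallel()
          .filter(i -> arr[i] != 0)
          .count();
    }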

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/5489c665/src/main/cpp/libmatrixdnn.h
----------------------------------------------------------------------
diff --git a/src/main/cpp/libmatrixdnn.h b/src/main/cpp/libmatrixdnn.h
index bf6c113..86e7b24 100644
--- a/src/main/cpp/libmatrixdnn.h
+++ b/src/main/cpp/libmatrixdnn.h
@@ -20,13 +20,13 @@
 #ifndef _libmatrixdnn_h
 #define _libmatrixdnn_h
 
-void conv2dBackwardFilterDense(double* inputPtr, double* doutPtr, double* retPtr, int N, int C, int H, int W, int K, int R, int S,
+int conv2dBackwardFilterDense(double* inputPtr, double* doutPtr, double* retPtr, int N, int C, int H, int W, int K, int R, int S,
     int stride_h, int stride_w, int pad_h, int pad_w, int P, int Q, int numThreads);
 
-void conv2dBackwardDataDense(double* filterPtr, double* doutPtr, double* retPtr, int N, int C, int H, int W, int K, int R, int S,
+int conv2dBackwardDataDense(double* filterPtr, double* doutPtr, double* retPtr, int N, int C, int H, int W, int K, int R, int S,
     int stride_h, int stride_w, int pad_h, int pad_w, int P, int Q, int numThreads);
     
-void conv2dBiasAddDense(double* inputPtr, double* biasPtr, double* filterPtr, double* retPtr, int N, int C, int H, int W, int K, int R, int S,
+int conv2dBiasAddDense(double* inputPtr, double* biasPtr, double* filterPtr, double* retPtr, int N, int C, int H, int W, int K, int R, int S,
     int stride_h, int stride_w, int pad_h, int pad_w, int P, int Q, bool addBias, int numThreads);
     
 void conv2dSparse(int apos, int alen, int* aix, double* avals, double* filter, double* ret, int N, int C, int H, int W, 

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/5489c665/src/main/cpp/systemml.cpp
----------------------------------------------------------------------
diff --git a/src/main/cpp/systemml.cpp b/src/main/cpp/systemml.cpp
index 41ce0bc..34ae248 100644
--- a/src/main/cpp/systemml.cpp
+++ b/src/main/cpp/systemml.cpp
@@ -144,7 +144,7 @@ JNIEXPORT jboolean JNICALL Java_org_apache_sysml_utils_NativeHelper_conv2dBackwa
   return (jboolean) true;
 }
 
-JNIEXPORT jboolean JNICALL Java_org_apache_sysml_utils_NativeHelper_conv2dDense(
+JNIEXPORT jint JNICALL Java_org_apache_sysml_utils_NativeHelper_conv2dDense(
 	JNIEnv* env, jclass, jdoubleArray input, jdoubleArray filter,
     jdoubleArray ret, jint N, jint C, jint H, jint W, jint K, jint R, jint S,
     jint stride_h, jint stride_w, jint pad_h, jint pad_w, jint P, jint Q, jint numThreads) {
@@ -152,18 +152,18 @@ JNIEXPORT jboolean JNICALL Java_org_apache_sysml_utils_NativeHelper_conv2dDense(
   double* filterPtr = GET_DOUBLE_ARRAY(env, filter, numThreads);
   double* retPtr = GET_DOUBLE_ARRAY(env, ret, numThreads);
   if(inputPtr == NULL || filterPtr == NULL || retPtr == NULL)
-  	return (jboolean) false;
+  	return (jint) -1;
   
-  conv2dBiasAddDense(inputPtr, 0, filterPtr, retPtr, (int) N, (int) C, (int) H, (int) W, (int) K, (int) R, (int) S,
+  int nnz = conv2dBiasAddDense(inputPtr, 0, filterPtr, retPtr, (int) N, (int) C, (int) H, (int) W, (int) K, (int) R, (int) S,
     (int) stride_h, (int) stride_w, (int) pad_h, (int) pad_w, (int) P, (int) Q, false, (int) numThreads);
     
   RELEASE_INPUT_DOUBLE_ARRAY(env, input, inputPtr, numThreads);
   RELEASE_INPUT_DOUBLE_ARRAY(env, filter, filterPtr, numThreads);
   RELEASE_DOUBLE_ARRAY(env, ret, retPtr, numThreads); 
-  return (jboolean) true;
+  return (jint) nnz;
 }
 
-JNIEXPORT jboolean JNICALL Java_org_apache_sysml_utils_NativeHelper_conv2dBiasAddDense(
+JNIEXPORT jint JNICALL Java_org_apache_sysml_utils_NativeHelper_conv2dBiasAddDense(
 	JNIEnv* env, jclass, jdoubleArray input, jdoubleArray bias, jdoubleArray filter,
     jdoubleArray ret, jint N, jint C, jint H, jint W, jint K, jint R, jint S,
     jint stride_h, jint stride_w, jint pad_h, jint pad_w, jint P, jint Q, jint numThreads) {
@@ -173,19 +173,19 @@ JNIEXPORT jboolean JNICALL Java_org_apache_sysml_utils_NativeHelper_conv2dBiasAd
   double* filterPtr = GET_DOUBLE_ARRAY(env, filter, numThreads);
   double* retPtr = GET_DOUBLE_ARRAY(env, ret, numThreads);
   if(inputPtr == NULL || biasPtr == NULL || filterPtr == NULL || retPtr == NULL)
-  	return (jboolean) false;
+  	return (jint) -1;
   
-  conv2dBiasAddDense(inputPtr, biasPtr, filterPtr, retPtr, (int) N, (int) C, (int) H, (int) W, (int) K, (int) R, (int) S,
+  int nnz = conv2dBiasAddDense(inputPtr, biasPtr, filterPtr, retPtr, (int) N, (int) C, (int) H, (int) W, (int) K, (int) R, (int) S,
     (int) stride_h, (int) stride_w, (int) pad_h, (int) pad_w, (int) P, (int) Q, true, (int) numThreads);
     
   RELEASE_INPUT_DOUBLE_ARRAY(env, input, inputPtr, numThreads);
   RELEASE_INPUT_DOUBLE_ARRAY(env, bias, biasPtr, numThreads);
   RELEASE_INPUT_DOUBLE_ARRAY(env, filter, filterPtr, numThreads);
   RELEASE_DOUBLE_ARRAY(env, ret, retPtr, numThreads); 
-  return (jboolean) true;
+  return (jint) nnz;
 }
 
-JNIEXPORT jboolean JNICALL Java_org_apache_sysml_utils_NativeHelper_conv2dBackwardDataDense(
+JNIEXPORT jint JNICALL Java_org_apache_sysml_utils_NativeHelper_conv2dBackwardDataDense(
 	JNIEnv* env, jclass, jdoubleArray filter, jdoubleArray dout,
     jdoubleArray ret, jint N, jint C, jint H, jint W, jint K, jint R, jint S,
     jint stride_h, jint stride_w, jint pad_h, jint pad_w, jint P, jint Q, jint numThreads) {
@@ -194,18 +194,18 @@ JNIEXPORT jboolean JNICALL Java_org_apache_sysml_utils_NativeHelper_conv2dBackwa
   double* doutPtr = GET_DOUBLE_ARRAY(env, dout, numThreads);
   double* retPtr = GET_DOUBLE_ARRAY(env, ret, numThreads);
   if(doutPtr == NULL || filterPtr == NULL || retPtr == NULL)
-  	return (jboolean) false;
+  	return (jint) -1;
   
-  conv2dBackwardDataDense(filterPtr, doutPtr, retPtr, (int) N, (int) C, (int) H, (int) W, (int) K, (int) R, (int) S,
+  int nnz = conv2dBackwardDataDense(filterPtr, doutPtr, retPtr, (int) N, (int) C, (int) H, (int) W, (int) K, (int) R, (int) S,
     (int) stride_h, (int) stride_w, (int) pad_h, (int) pad_w, (int) P, (int) Q, (int) numThreads);
   
   RELEASE_INPUT_DOUBLE_ARRAY(env, filter, filterPtr, numThreads);
   RELEASE_INPUT_DOUBLE_ARRAY(env, dout, doutPtr, numThreads);
   RELEASE_DOUBLE_ARRAY(env, ret, retPtr, numThreads);
-  return (jboolean) true;
+  return (jint) nnz;
 }
 
-JNIEXPORT jboolean JNICALL Java_org_apache_sysml_utils_NativeHelper_conv2dBackwardFilterDense(
+JNIEXPORT jint JNICALL Java_org_apache_sysml_utils_NativeHelper_conv2dBackwardFilterDense(
 	JNIEnv* env, jclass, jdoubleArray input, jdoubleArray dout,
     jdoubleArray ret, jint N, jint C, jint H, jint W, jint K, jint R, jint S,
     jint stride_h, jint stride_w, jint pad_h, jint pad_w, jint P, jint Q, jint numThreads) {
@@ -213,13 +213,13 @@ JNIEXPORT jboolean JNICALL Java_org_apache_sysml_utils_NativeHelper_conv2dBackwa
   double* doutPtr = GET_DOUBLE_ARRAY(env, dout, numThreads);
   double* retPtr = GET_DOUBLE_ARRAY(env, ret, numThreads);
   if(doutPtr == NULL || inputPtr == NULL || retPtr == NULL)
-  	return (jboolean) false;
+  	return (jint) -1;
   
-  conv2dBackwardFilterDense(inputPtr, doutPtr, retPtr, (int) N, (int) C, (int) H, (int) W, (int) K, (int) R, (int) S,
+  int nnz = conv2dBackwardFilterDense(inputPtr, doutPtr, retPtr, (int) N, (int) C, (int) H, (int) W, (int) K, (int) R, (int) S,
     (int) stride_h, (int) stride_w, (int) pad_h, (int) pad_w, (int) P, (int) Q, (int) numThreads);
   
   RELEASE_INPUT_DOUBLE_ARRAY(env, input, inputPtr, numThreads);
   RELEASE_INPUT_DOUBLE_ARRAY(env, dout, doutPtr, numThreads);
   RELEASE_DOUBLE_ARRAY(env, ret, retPtr, numThreads);
-  return (jboolean) true;
+  return (jint) nnz;
 }
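
Each wrapper above now propagates the nnz count computed in libmatrixdnn.cpp,
returning -1 when GET_DOUBLE_ARRAY fails to pin one of the Java arrays. The
sentinel cannot collide with a real result because a nonzero count is never
negative, so a caller-side failure check reduces to the following sketch (the
fallback bookkeeping is an assumption, since the else branches are truncated
in the LibMatrixNative.java diff below):

    if (nnz == -1) {
      Statistics.incrementNativeFailuresCounter(); // assumed fallback bookkeeping
      // ... re-run the operation on the Java code path ...
    }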

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/5489c665/src/main/cpp/systemml.h
----------------------------------------------------------------------
diff --git a/src/main/cpp/systemml.h b/src/main/cpp/systemml.h
index ac36495..f6f5cd2 100644
--- a/src/main/cpp/systemml.h
+++ b/src/main/cpp/systemml.h
@@ -45,49 +45,49 @@ JNIEXPORT jboolean JNICALL Java_org_apache_sysml_utils_NativeHelper_tsmm
 /*
  * Class:     org_apache_sysml_utils_NativeHelper
  * Method:    conv2dDense
- * Signature: ([D[D[DIIIIIIIIIIIIII)Z
+ * Signature: ([D[D[DIIIIIIIIIIIIII)I
  */
-JNIEXPORT jboolean JNICALL Java_org_apache_sysml_utils_NativeHelper_conv2dDense
+JNIEXPORT jint JNICALL Java_org_apache_sysml_utils_NativeHelper_conv2dDense
   (JNIEnv *, jclass, jdoubleArray, jdoubleArray, jdoubleArray, jint, jint, jint, jint, jint, jint, jint, jint, jint, jint, jint, jint, jint, jint);
 
 /*
  * Class:     org_apache_sysml_utils_NativeHelper
  * Method:    conv2dBiasAddDense
- * Signature: ([D[D[D[DIIIIIIIIIIIIII)Z
+ * Signature: ([D[D[D[DIIIIIIIIIIIIII)I
  */
-JNIEXPORT jboolean JNICALL Java_org_apache_sysml_utils_NativeHelper_conv2dBiasAddDense
+JNIEXPORT jint JNICALL Java_org_apache_sysml_utils_NativeHelper_conv2dBiasAddDense
   (JNIEnv *, jclass, jdoubleArray, jdoubleArray, jdoubleArray, jdoubleArray, jint, jint, jint, jint, jint, jint, jint, jint, jint, jint, jint, jint, jint, jint);
 
 /*
  * Class:     org_apache_sysml_utils_NativeHelper
- * Method:    conv2dBackwardDataDense
- * Signature: ([D[D[DIIIIIIIIIIIIII)Z
+ * Method:    conv2dBackwardFilterDense
+ * Signature: ([D[D[DIIIIIIIIIIIIII)I
  */
-JNIEXPORT jboolean JNICALL Java_org_apache_sysml_utils_NativeHelper_conv2dBackwardDataDense
+JNIEXPORT jint JNICALL Java_org_apache_sysml_utils_NativeHelper_conv2dBackwardFilterDense
   (JNIEnv *, jclass, jdoubleArray, jdoubleArray, jdoubleArray, jint, jint, jint, jint, jint, jint, jint, jint, jint, jint, jint, jint, jint, jint);
 
 /*
  * Class:     org_apache_sysml_utils_NativeHelper
- * Method:    conv2dBackwardFilterDense
- * Signature: ([D[D[DIIIIIIIIIIIIII)Z
+ * Method:    conv2dBackwardDataDense
+ * Signature: ([D[D[DIIIIIIIIIIIIII)I
  */
-JNIEXPORT jboolean JNICALL Java_org_apache_sysml_utils_NativeHelper_conv2dBackwardFilterDense
+JNIEXPORT jint JNICALL Java_org_apache_sysml_utils_NativeHelper_conv2dBackwardDataDense
   (JNIEnv *, jclass, jdoubleArray, jdoubleArray, jdoubleArray, jint, jint, jint, jint, jint, jint, jint, jint, jint, jint, jint, jint, jint, jint);
 
 /*
  * Class:     org_apache_sysml_utils_NativeHelper
- * Method:    conv2dSparse
+ * Method:    conv2dBackwardFilterSparseDense
  * Signature: (II[I[D[D[DIIIIIIIIIIIIII)Z
  */
-JNIEXPORT jboolean JNICALL Java_org_apache_sysml_utils_NativeHelper_conv2dSparse
+JNIEXPORT jboolean JNICALL Java_org_apache_sysml_utils_NativeHelper_conv2dBackwardFilterSparseDense
   (JNIEnv *, jclass, jint, jint, jintArray, jdoubleArray, jdoubleArray, jdoubleArray, jint, jint, jint, jint, jint, jint, jint, jint, jint, jint, jint, jint, jint, jint);
 
 /*
  * Class:     org_apache_sysml_utils_NativeHelper
- * Method:    conv2dBackwardFilterSparse
+ * Method:    conv2dSparse
  * Signature: (II[I[D[D[DIIIIIIIIIIIIII)Z
  */
-JNIEXPORT jboolean JNICALL Java_org_apache_sysml_utils_NativeHelper_conv2dBackwardFilterSparseDense
+JNIEXPORT jboolean JNICALL Java_org_apache_sysml_utils_NativeHelper_conv2dSparse
   (JNIEnv *, jclass, jint, jint, jintArray, jdoubleArray, jdoubleArray, jdoubleArray, jint, jint, jint, jint, jint, jint, jint, jint, jint, jint, jint, jint, jint, jint);
 
 /*
@@ -103,4 +103,5 @@ JNIEXPORT void JNICALL Java_org_apache_sysml_utils_NativeHelper_setMaxNumThreads
 #endif
 #endif
 
+
  
\ No newline at end of file
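
Two fixes are folded into this header: the JNI type descriptors in the
Signature comments change their trailing return-type letter from Z (boolean)
to I (int), and the comment blocks are reordered so each one sits next to the
export it actually describes (they were previously swapped pairwise). For
reference, the Java declaration matching the ([D[D[DIIIIIIIIIIIIII)I
descriptor, as declared in NativeHelper.java below:

    public static native int conv2dDense(double[] input, double[] filter,
        double[] ret, int N, int C, int H, int W, int K, int R, int S,
        int stride_h, int stride_w, int pad_h, int pad_w, int P, int Q,
        int numThreads);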

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/5489c665/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNN.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNN.java b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNN.java
index e4d3ba2..ab82697 100644
--- a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNN.java
+++ b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNN.java
@@ -84,7 +84,7 @@ public class LibMatrixDNN {
 	private static AtomicLong loopedConvBwdDataCol2ImTime = new AtomicLong(0);
 	
 	public static void appendStatistics(StringBuilder sb) {
-		if(DMLScript.STATISTICS && DISPLAY_STATISTICS && (conv2dDenseCount.get() != 0 || conv2dSparseCount.get() != 0)) {
+		if(DMLScript.STATISTICS && DISPLAY_STATISTICS) {
 			sb.append("LibMatrixDNN dense count (conv/bwdF/bwdD/im2col/maxBwd):\t" 
 					+ conv2dDenseCount.get() + "/"
 					+ conv2dBwdFilterDenseCount.get() + "/"
@@ -97,15 +97,13 @@ public class LibMatrixDNN {
 					+ conv2dBwdDataSparseCount.get() + "/"
 					+ im2colSparseCount.get() + "/"
 					+ maxPoolBwdSparseCount.get() + ".\n");
-			if(loopedConvMatMultTime.get() != 0 || loopedConvIm2ColTime.get() != 0) {
-				sb.append("LibMatrixDNN conv(im2col/matmult), bwdF (im2col/matmult), bwdD (col2im/matmult) time:\t" +
-						String.format("%.3f", loopedConvIm2ColTime.get()*1e-9) + "/" +
-						String.format("%.3f", loopedConvMatMultTime.get()*1e-9) + "/" + 
-						String.format("%.3f", loopedConvBwdFilterIm2ColTime.get()*1e-9) + "/" +
-						String.format("%.3f", loopedConvBwdFilterMatMultTime.get()*1e-9) + "/" +
-						String.format("%.3f", loopedConvBwdDataCol2ImTime.get()*1e-9) + "/" +
-						String.format("%.3f", loopedConvBwdDataMatMultTime.get()*1e-9) + " sec.\n");
-			}
+			sb.append("LibMatrixDNN conv(im2col/matmult), bwdF (im2col/matmult), bwdD (col2im/matmult) time:\t" +
+					String.format("%.3f", loopedConvIm2ColTime.get()*1e-9) + "/" +
+					String.format("%.3f", loopedConvMatMultTime.get()*1e-9) + "/" + 
+					String.format("%.3f", loopedConvBwdFilterIm2ColTime.get()*1e-9) + "/" +
+					String.format("%.3f", loopedConvBwdFilterMatMultTime.get()*1e-9) + "/" +
+					String.format("%.3f", loopedConvBwdDataCol2ImTime.get()*1e-9) + "/" +
+					String.format("%.3f", loopedConvBwdDataMatMultTime.get()*1e-9) + " sec.\n");
 		}
 	}
 	public static void resetStatistics() {
@@ -158,7 +156,7 @@ public class LibMatrixDNN {
 			params.bias.sparseToDense(); // Since bias is extremely small array
 		
 		if(isEligibleForConv2dSparse(params))
-			Statistics.numNativeLibMatrixDNNCalls.increment();
+			Statistics.numNativeSparseConv2dCalls.increment();
 		
 		runConvTask(TaskType.LoopedIm2ColConv2d, params);
 		
@@ -179,7 +177,7 @@ public class LibMatrixDNN {
 		checkInputsConv2dBackwardData(filter, dout, outputBlock, params);
 		
 		if(isEligibleForConv2dBackwardDataDense(params))
-			Statistics.numNativeLibMatrixDNNCalls.increment();
+			Statistics.numNativeSparseConv2dBwdDataCalls.increment();
 		
 		runConvTask(TaskType.LoopedIm2ColConv2dBwdData, params);
 		
@@ -200,7 +198,7 @@ public class LibMatrixDNN {
 		checkInputsConv2dBackwardFilter(input, dout, outputBlock, params);
 		
 		if(isEligibleForConv2dBackwardFilterSparseDense(params))
-			Statistics.numNativeLibMatrixDNNCalls.increment();
+			Statistics.numNativeSparseConv2dBwdFilterCalls.increment();
 		
 		runConvTask(TaskType.LoopedIm2ColConv2dBwdFilter, params);
 		

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/5489c665/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixNative.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixNative.java b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixNative.java
index 4b12596..524218d 100644
--- a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixNative.java
+++ b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixNative.java
@@ -18,6 +18,7 @@
  */
 package org.apache.sysml.runtime.matrix.data;
 
+import org.apache.sysml.api.DMLScript;
 import org.apache.sysml.hops.OptimizerUtils;
 import org.apache.sysml.runtime.DMLRuntimeException;
 import org.apache.sysml.utils.NativeHelper;
@@ -60,9 +61,13 @@ public class LibMatrixNative {
 				!isMatMultMemoryBound(m1.rlen, m1.clen, m2.clen) && !m1.isInSparseFormat() && !m2.isInSparseFormat()) {
 			ret.sparse = false;
 			ret.allocateDenseBlock();
+			long start = DMLScript.STATISTICS ? System.nanoTime() : 0;
 			if (NativeHelper.matrixMultDenseDense(m1.denseBlock, m2.denseBlock, 
 					ret.denseBlock, m1.getNumRows(), m1.getNumColumns(), m2.getNumColumns(), k)) {
-				Statistics.numNativeLibMatrixMultCalls.increment();
+				if(DMLScript.STATISTICS) {
+					Statistics.nativeLibMatrixMultTime += System.nanoTime() - start;
+					Statistics.numNativeLibMatrixMultCalls.increment();
+				}
 				ret.recomputeNonZeros();
 				// post-processing (nnz maintained in parallel)
 				if(examSparsity)
@@ -94,12 +99,17 @@ public class LibMatrixNative {
 		if(NativeHelper.isNativeLibraryLoaded() && !input.isInSparseFormat() && !filter.isInSparseFormat()) {
 			setNumThreads(params);
 			if(params.bias == null) {
-				if(NativeHelper.conv2dDense(input.denseBlock, filter.denseBlock, outputBlock.denseBlock, params.N, params.C, params.H, params.W, 
+				long start = DMLScript.STATISTICS ? System.nanoTime() : 0;
+				int nnz = NativeHelper.conv2dDense(input.denseBlock, filter.denseBlock, outputBlock.denseBlock, params.N, params.C, params.H, params.W, 
 						params.K, params.R, params.S, params.stride_h, params.stride_w, params.pad_h, params.pad_w, 
-						params.P, params.Q, params.numThreads)) {
-					Statistics.numNativeLibMatrixDNNCalls.increment();
+						params.P, params.Q, params.numThreads);
+				if(nnz != -1) {
+					if(DMLScript.STATISTICS) {
+						Statistics.nativeConv2dTime += System.nanoTime() - start;
+						Statistics.numNativeConv2dCalls.increment();
+					}
 					// post-processing: maintain nnz
-					outputBlock.recomputeNonZeros();
+					outputBlock.setNonZeros(nnz);
 					return;
 				}
 				else {
@@ -110,13 +120,18 @@ public class LibMatrixNative {
 			else {
 				if(params.bias.isInSparseFormat())
 					params.bias.sparseToDense(); // Bias matrix is usually extremely small
-				if(NativeHelper.conv2dBiasAddDense(input.denseBlock, params.bias.denseBlock, filter.denseBlock, outputBlock.denseBlock, 
+				long start = DMLScript.STATISTICS ? System.nanoTime() : 0;
+				int nnz = NativeHelper.conv2dBiasAddDense(input.denseBlock, params.bias.denseBlock, filter.denseBlock, outputBlock.denseBlock, 
 						params.N, params.C, params.H, params.W, 
 						params.K, params.R, params.S, params.stride_h, params.stride_w, params.pad_h, params.pad_w, 
-						params.P, params.Q, params.numThreads)) {
-					Statistics.numNativeLibMatrixDNNCalls.increment();
+						params.P, params.Q, params.numThreads);
+				if(nnz != -1) {
+					if(DMLScript.STATISTICS) {
+						Statistics.nativeConv2dTime += System.nanoTime() - start;
+						Statistics.numNativeConv2dCalls.increment();
+					}
 					// post-processing: maintain nnz
-					outputBlock.recomputeNonZeros();
+					outputBlock.setNonZeros(nnz);
 					return;
 				}
 				else {
@@ -150,12 +165,17 @@ public class LibMatrixNative {
 		params.numThreads = params.numThreads <= 0 ? NativeHelper.getMaxNumThreads() : params.numThreads;
 		if(NativeHelper.isNativeLibraryLoaded() && !dout.isInSparseFormat() && !input.isInSparseFormat()) {
 			setNumThreads(params);
-			if(NativeHelper.conv2dBackwardFilterDense(input.denseBlock, dout.denseBlock, outputBlock.denseBlock, params.N, params.C, params.H, params.W, 
-						params.K, params.R, params.S, params.stride_h, params.stride_w, params.pad_h, params.pad_w, 
-						params.P, params.Q, params.numThreads)) {
-				Statistics.numNativeLibMatrixDNNCalls.increment();
+			long start = DMLScript.STATISTICS ? System.nanoTime() : 0;
+			int nnz = NativeHelper.conv2dBackwardFilterDense(input.denseBlock, dout.denseBlock, outputBlock.denseBlock, params.N, params.C, params.H, params.W, 
+					params.K, params.R, params.S, params.stride_h, params.stride_w, params.pad_h, params.pad_w, 
+					params.P, params.Q, params.numThreads);
+			if(nnz != -1) {
+				if(DMLScript.STATISTICS) {
+					Statistics.nativeConv2dBwdFilterTime += System.nanoTime() - start;
+					Statistics.numNativeConv2dBwdFilterCalls.increment();
+				}
 				// post-processing: maintain nnz
-				outputBlock.recomputeNonZeros();
+				outputBlock.setNonZeros(nnz);
 				return;
 			}
 			else {
@@ -181,12 +201,17 @@ public class LibMatrixNative {
 		params.numThreads = params.numThreads <= 0 ? NativeHelper.getMaxNumThreads() : params.numThreads;
 		if(NativeHelper.isNativeLibraryLoaded() && !dout.isInSparseFormat() && !filter.isInSparseFormat()) {
 			setNumThreads(params);
-			if(NativeHelper.conv2dBackwardDataDense(filter.denseBlock, dout.denseBlock, outputBlock.denseBlock, params.N, params.C, params.H, params.W, 
-						params.K, params.R, params.S, params.stride_h, params.stride_w, params.pad_h, params.pad_w, 
-						params.P, params.Q, params.numThreads)) {
-				Statistics.numNativeLibMatrixDNNCalls.increment();
+			long start = DMLScript.STATISTICS ? System.nanoTime() : 0;
+			int nnz = NativeHelper.conv2dBackwardDataDense(filter.denseBlock, dout.denseBlock, outputBlock.denseBlock, params.N, params.C, params.H, params.W, 
+					params.K, params.R, params.S, params.stride_h, params.stride_w, params.pad_h, params.pad_w, 
+					params.P, params.Q, params.numThreads);
+			if(nnz != -1) {
+				if(DMLScript.STATISTICS) {
+					Statistics.nativeConv2dBwdDataTime += System.nanoTime() - start;
+					Statistics.numNativeConv2dBwdDataCalls.increment();
+				}
 				// post-processing: maintain nnz
-				outputBlock.recomputeNonZeros();
+				outputBlock.setNonZeros(nnz);
 				return;
 			}
 			else {
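
All four call sites in this file use the same timing-guard idiom:
System.nanoTime() is only taken when statistics are enabled, so the hot path
pays no timing overhead when -stats is off. A condensed sketch:

    long start = DMLScript.STATISTICS ? System.nanoTime() : 0;
    int nnz = NativeHelper.conv2dDense(/* ... */);
    if (nnz != -1) {
      if (DMLScript.STATISTICS) {
        Statistics.nativeConv2dTime += System.nanoTime() - start;
        Statistics.numNativeConv2dCalls.increment();
      }
      outputBlock.setNonZeros(nnz);
    }

Note that the time accumulators are plain static longs, so concurrent updates
from parallel instructions could lose increments; presumably an accepted
trade-off for approximate profiling counters.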

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/5489c665/src/main/java/org/apache/sysml/runtime/matrix/data/MatrixBlock.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/matrix/data/MatrixBlock.java b/src/main/java/org/apache/sysml/runtime/matrix/data/MatrixBlock.java
index ff6a007..233350a 100644
--- a/src/main/java/org/apache/sysml/runtime/matrix/data/MatrixBlock.java
+++ b/src/main/java/org/apache/sysml/runtime/matrix/data/MatrixBlock.java
@@ -34,6 +34,7 @@ import java.util.stream.LongStream;
 
 import org.apache.commons.math3.random.Well1024a;
 import org.apache.hadoop.io.DataInputBuffer;
+import org.apache.sysml.api.DMLScript;
 import org.apache.sysml.conf.ConfigurationManager;
 import org.apache.sysml.hops.Hop.OpOp2;
 import org.apache.sysml.hops.OptimizerUtils;
@@ -86,6 +87,7 @@ import org.apache.sysml.runtime.util.FastBufferedDataOutputStream;
 import org.apache.sysml.runtime.util.IndexRange;
 import org.apache.sysml.runtime.util.UtilFunctions;
 import org.apache.sysml.utils.NativeHelper;
+import org.apache.sysml.utils.Statistics;
 
 
 
@@ -104,6 +106,8 @@ public class MatrixBlock extends MatrixValue implements CacheBlock, Externalizab
 	//basic header (int rlen, int clen, byte type)
 	public static final int HEADER_SIZE = 9;
 	
+	private static final boolean DISPLAY_STATISTICS = false; // Developer flag to measure performance overhead of various functions in this class
+	
 	public enum BlockType{
 		EMPTY_BLOCK,  
 		ULTRA_SPARSE_BLOCK, //ultra sparse representation, in-mem same as sparse
@@ -336,6 +340,7 @@ public class MatrixBlock extends MatrixValue implements CacheBlock, Externalizab
 			allocateDenseBlock();
 	}
 	
+	@SuppressWarnings("unused")
 	public void allocateDenseBlock(boolean clearNNZ) 
 			throws RuntimeException 
 	{
@@ -350,7 +355,9 @@ public class MatrixBlock extends MatrixValue implements CacheBlock, Externalizab
 		
 		//allocate block if non-existing or too small (guaranteed to be 0-initialized),
 		if(denseBlock == null || denseBlock.length < limit) {
+			long start = DISPLAY_STATISTICS && DMLScript.STATISTICS ? System.nanoTime() : 0;
 			denseBlock = new double[(int)limit];
+			Statistics.allocateDoubleArrTime += DISPLAY_STATISTICS && DMLScript.STATISTICS ? (System.nanoTime() - start) : 0;
 		}
 		
 		//clear nnz if necessary
@@ -986,9 +993,11 @@ public class MatrixBlock extends MatrixValue implements CacheBlock, Externalizab
 	 * 
 	 * @throws DMLRuntimeException if DMLRuntimeException occurs
 	 */
+	@SuppressWarnings("unused")
 	public void examSparsity() 
 		throws DMLRuntimeException
 	{
+		long start = DISPLAY_STATISTICS && DMLScript.STATISTICS ? System.nanoTime() : 0;
 		//determine target representation
 		boolean sparseDst = evalSparseFormatInMemory(); 
 				
@@ -1002,6 +1011,8 @@ public class MatrixBlock extends MatrixValue implements CacheBlock, Externalizab
 			sparseToDense();
 		else if( !sparse && sparseDst )
 			denseToSparse();
+		
+		Statistics.examSparsityTime += DISPLAY_STATISTICS && DMLScript.STATISTICS ? (System.nanoTime() - start) : 0;
 	}
 	
 	/**
@@ -1141,6 +1152,7 @@ public class MatrixBlock extends MatrixValue implements CacheBlock, Externalizab
 	 * of the entire matrix block.
 	 * 
 	 */
+	@SuppressWarnings("unused")
 	public void recomputeNonZeros()
 	{
 		if( sparse && sparseBlock!=null ) //SPARSE (max long)
@@ -1150,12 +1162,14 @@ public class MatrixBlock extends MatrixValue implements CacheBlock, Externalizab
 		}
 		else if( !sparse && denseBlock!=null ) //DENSE (max int)
 		{
+			long start = DISPLAY_STATISTICS && DMLScript.STATISTICS ? System.nanoTime() : 0;
 			double[] a = denseBlock;
 			final int limit=rlen*clen;
 			int nnz = 0;
 			for(int i=0; i<limit; i++)
 				nnz += (a[i]!=0) ? 1 : 0;
 			nonZeros = nnz;
+			Statistics.recomputeNNZTime += DISPLAY_STATISTICS && DMLScript.STATISTICS ? (System.nanoTime() - start) : 0;
 		}
 	}
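
This dense branch is exactly the full scan that the native nnz return value
avoids: setNonZeros(nnz) is an O(1) metadata update, whereas
recomputeNonZeros() walks all rlen*clen cells. The two consumer-side options,
side by side:

    outputBlock.setNonZeros(nnz);    // O(1): trust the count from the native kernel
    outputBlock.recomputeNonZeros(); // O(rlen*clen): full rescan of denseBlock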
 	

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/5489c665/src/main/java/org/apache/sysml/udf/lib/SGDNesterovUpdate.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/udf/lib/SGDNesterovUpdate.java b/src/main/java/org/apache/sysml/udf/lib/SGDNesterovUpdate.java
index 9c027d3..a88f230 100644
--- a/src/main/java/org/apache/sysml/udf/lib/SGDNesterovUpdate.java
+++ b/src/main/java/org/apache/sysml/udf/lib/SGDNesterovUpdate.java
@@ -39,7 +39,7 @@ import org.apache.sysml.udf.Matrix.ValueType;
  * Assumption: the input batch fits in CP (which is also the assumption of most deep learning systems).
  * 
  * Usage:
- * update_nesterov = externalFunction(matrix[double] X, matrix[double] dX, double lr, double mu, matrix[double] v) return (matrix[double] X, matrix[double] v) implemented in (classname="org.apache.sysml.udf.lib.SGDNesterovUpdate",exectype="mem");
+ * update_nesterov = externalFunction(matrix[double] X, matrix[double] dX, double lr, double mu, matrix[double] v, double lambda) return (matrix[double] X, matrix[double] v) implemented in (classname="org.apache.sysml.udf.lib.SGDNesterovUpdate",exectype="mem");
 * [X, v] = update_nesterov(X, dX, lr, mu, v, lambda);
  * 
  * 
@@ -81,16 +81,19 @@ public class SGDNesterovUpdate extends PackageFunction {
 			double mu = Double.parseDouble(((Scalar)getFunctionInput(3)).getValue());
 			MatrixBlock v = ((Matrix) getFunctionInput(4)).getMatrixObject().acquireRead();
 			
-			// v = mu * v - lr * dX
+			double lambda = Double.parseDouble(((Scalar)getFunctionInput(5)).getValue());
+			
+			// v = mu * v - lr * dX - lr*lambda*X
 			updatedV = new Matrix( "tmp_" + rand.nextLong(), v.getNumRows(), v.getNumColumns(), ValueType.Double );
 			MatrixBlock updatedVMB = allocateDenseMatrixBlock(updatedV);
 			double [] updatedVData = updatedVMB.getDenseBlock();
-			if(isDense(v) && isDense(dX)) {
+			if(isDense(v) && isDense(dX) && isDense(X)) {
 				double [] vArr = v.getDenseBlock();
 				double [] dXArr = dX.getDenseBlock();
+				double [] XArr = X.getDenseBlock();
 				int nnz = 0;
 				for(int i = 0; i < updatedVData.length; i++) {
-					updatedVData[i] = mu*vArr[i] - lr*dXArr[i];
+					updatedVData[i] = mu*vArr[i] - lr*dXArr[i] - lr*lambda*XArr[i];
 					nnz += (updatedVData[i]!=0) ? 1 : 0;
 				}
 				updatedVMB.setNonZeros(nnz); 
@@ -98,8 +101,10 @@ public class SGDNesterovUpdate extends PackageFunction {
 			else {
 				multiplyByConstant(v, mu, updatedVData);
 				multiplyByConstant(dX, -lr, updatedVData);
+				multiplyByConstant(X, -lr*lambda, updatedVData);
 				updatedVMB.recomputeNonZeros();
 			}
+			
 			updatedV.setMatrixDoubleArray(updatedVMB, OutputInfo.BinaryBlockOutputInfo, InputInfo.BinaryBlockInputInfo);
 			
 			// X = X - mu * v_prev + (1 + mu) * v
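
Putting the two steps together, the UDF now implements Nesterov momentum with
L2 regularization (weight decay lambda) folded into the velocity update:
v = mu*v - lr*(dX + lambda*X), followed by X = X - mu*v_prev + (1 + mu)*v. A
minimal sketch of the fused dense update (the actual UDF writes into freshly
allocated output blocks rather than updating in place):

    for (int i = 0; i < n; i++) {
      double vNew = mu * vArr[i] - lr * dXArr[i] - lr * lambda * xArr[i];
      xArr[i] = xArr[i] - mu * vArr[i] + (1 + mu) * vNew; // uses v_prev before overwrite
      vArr[i] = vNew;
    }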

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/5489c665/src/main/java/org/apache/sysml/utils/NativeHelper.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/utils/NativeHelper.java b/src/main/java/org/apache/sysml/utils/NativeHelper.java
index 129824b..fe5e085 100644
--- a/src/main/java/org/apache/sysml/utils/NativeHelper.java
+++ b/src/main/java/org/apache/sysml/utils/NativeHelper.java
@@ -254,17 +254,18 @@ public class NativeHelper {
 	// K = number of filters, R = filter height, S = filter width
 	// TODO: case not handled: sparse filters (which will only be executed in Java). Since filters are relatively smaller, this is a low priority.
 	
+	// Returns the number of nonzeros in the output on success, or -1 on failure
 	// Called by ConvolutionCPInstruction if both input and filter are dense
-	public static native boolean conv2dDense(double [] input, double [] filter, double [] ret, int N, int C, int H, int W, 
+	public static native int conv2dDense(double [] input, double [] filter, double [] ret, int N, int C, int H, int W, 
 			int K, int R, int S, int stride_h, int stride_w, int pad_h, int pad_w, int P, int Q, int numThreads);
-	public static native boolean conv2dBiasAddDense(double [] input, double [] bias, double [] filter, double [] ret, int N, int C, int H, int W, 
+	public static native int conv2dBiasAddDense(double [] input, double [] bias, double [] filter, double [] ret, int N, int C, int H, int W, 
 			int K, int R, int S, int stride_h, int stride_w, int pad_h, int pad_w, int P, int Q, int numThreads);
 	// Called by ConvolutionCPInstruction if both input and filter are dense
-	public static native boolean conv2dBackwardFilterDense(double [] input, double [] dout, double [] ret, int N, int C, int H, int W, 
+	public static native int conv2dBackwardFilterDense(double [] input, double [] dout, double [] ret, int N, int C, int H, int W, 
 			int K, int R, int S, int stride_h, int stride_w, int pad_h, int pad_w, int P, int Q, int numThreads);
 	// If both filter and dout are dense, then called by ConvolutionCPInstruction
 	// Else, called by LibMatrixDNN's thread if filter is dense. dout[n] is converted to dense if sparse.
-	public static native boolean conv2dBackwardDataDense(double [] filter, double [] dout, double [] ret, int N, int C, int H, int W, 
+	public static native int conv2dBackwardDataDense(double [] filter, double [] dout, double [] ret, int N, int C, int H, int W, 
 			int K, int R, int S, int stride_h, int stride_w, int pad_h, int pad_w, int P, int Q, int numThreads);
 	
 	// Currently only supported with numThreads = 1 and sparse input

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/5489c665/src/main/java/org/apache/sysml/utils/Statistics.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/utils/Statistics.java b/src/main/java/org/apache/sysml/utils/Statistics.java
index 97888cb..49633d8 100644
--- a/src/main/java/org/apache/sysml/utils/Statistics.java
+++ b/src/main/java/org/apache/sysml/utils/Statistics.java
@@ -114,7 +114,21 @@ public class Statistics
 	
 	private static LongAdder numNativeFailures = new LongAdder();
 	public static LongAdder numNativeLibMatrixMultCalls = new LongAdder();
-	public static LongAdder numNativeLibMatrixDNNCalls = new LongAdder();
+	public static LongAdder numNativeConv2dCalls = new LongAdder();
+	public static LongAdder numNativeConv2dBwdDataCalls = new LongAdder();
+	public static LongAdder numNativeConv2dBwdFilterCalls = new LongAdder();
+	public static LongAdder numNativeSparseConv2dCalls = new LongAdder();
+	public static LongAdder numNativeSparseConv2dBwdFilterCalls = new LongAdder();
+	public static LongAdder numNativeSparseConv2dBwdDataCalls = new LongAdder();
+	public static long nativeLibMatrixMultTime = 0;
+	public static long nativeConv2dTime = 0;
+	public static long nativeConv2dBwdDataTime = 0;
+	public static long nativeConv2dBwdFilterTime = 0;
+	
+	public static long recomputeNNZTime = 0;
+	public static long examSparsityTime = 0;
+	public static long allocateDoubleArrTime = 0;
+	
 	public static void incrementNativeFailuresCounter() {
 		numNativeFailures.increment();
 	// This is very rare and we are not sure it is possible at all. Our initial experiments never encountered this case.
@@ -378,8 +392,17 @@ public class Statistics
 
 		GPUStatistics.reset();
 		numNativeLibMatrixMultCalls.reset();
-		numNativeLibMatrixDNNCalls.reset();
+		numNativeSparseConv2dCalls.reset();
+		numNativeSparseConv2dBwdDataCalls.reset();
+		numNativeSparseConv2dBwdFilterCalls.reset();
+		numNativeConv2dCalls.reset();
+		numNativeConv2dBwdDataCalls.reset();
+		numNativeConv2dBwdFilterCalls.reset();
 		numNativeFailures.reset();
+		nativeLibMatrixMultTime = 0;
+		nativeConv2dTime = 0;
+		nativeConv2dBwdFilterTime = 0;
+		nativeConv2dBwdDataTime = 0;
 		LibMatrixDNN.resetStatistics();
 	}
 
@@ -635,11 +658,23 @@ public class Statistics
 		//show extended caching/compilation statistics
 		if( DMLScript.STATISTICS ) 
 		{
-			if(NativeHelper.blasType != null && (numNativeLibMatrixMultCalls.longValue() > 0 || 
-					numNativeLibMatrixDNNCalls.longValue() > 0)) {
+			if(NativeHelper.blasType != null) {
 				String blas = NativeHelper.blasType != null ? NativeHelper.blasType : ""; 
-				sb.append("Native " + blas + " calls (LibMatrixMult/LibMatrixDNN):\t" + numNativeLibMatrixMultCalls.longValue()  + "/" + numNativeLibMatrixDNNCalls.longValue() + ".\n");
+				sb.append("Native " + blas + " calls (dense mult/conv/bwdF/bwdD):\t" + numNativeLibMatrixMultCalls.longValue()  + "/" + 
+						numNativeConv2dCalls.longValue() + "/" + numNativeConv2dBwdFilterCalls.longValue()
+						+ "/" + numNativeConv2dBwdDataCalls.longValue() + ".\n");
+				sb.append("Native " + blas + " calls (sparse conv/bwdF/bwdD):\t" +  
+						numNativeSparseConv2dCalls.longValue() + "/" + numNativeSparseConv2dBwdFilterCalls.longValue()
+						+ "/" + numNativeSparseConv2dBwdDataCalls.longValue() + ".\n");
+				sb.append("Native " + blas + " times (dense mult/conv/bwdF/bwdD):\t" + String.format("%.3f", nativeLibMatrixMultTime*1e-9) + "/" +
+						String.format("%.3f", nativeConv2dTime*1e-9) + "/" + String.format("%.3f", nativeConv2dBwdFilterTime*1e-9) + "/" + 
+						String.format("%.3f", nativeConv2dBwdDataTime*1e-9) + ".\n");
+			}
+			if(recomputeNNZTime != 0 || examSparsityTime != 0 || allocateDoubleArrTime != 0) {
+				sb.append("MatrixBlock times (recomputeNNZ/examSparsity/allocateDoubleArr):\t" + String.format("%.3f", recomputeNNZTime*1e-9) + "/" +
+					String.format("%.3f", examSparsityTime*1e-9) + "/" + String.format("%.3f", allocateDoubleArrTime*1e-9)  + ".\n");
 			}
+			
 			sb.append("Cache hits (Mem, WB, FS, HDFS):\t" + CacheStatistics.displayHits() + ".\n");
 			sb.append("Cache writes (WB, FS, HDFS):\t" + CacheStatistics.displayWrites() + ".\n");
 			sb.append("Cache times (ACQr/m, RLS, EXP):\t" + CacheStatistics.displayTime() + " sec.\n");

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/5489c665/src/main/scala/org/apache/sysml/api/dl/Caffe2DML.scala
----------------------------------------------------------------------
diff --git a/src/main/scala/org/apache/sysml/api/dl/Caffe2DML.scala b/src/main/scala/org/apache/sysml/api/dl/Caffe2DML.scala
index 377ebf3..f7f85c3 100644
--- a/src/main/scala/org/apache/sysml/api/dl/Caffe2DML.scala
+++ b/src/main/scala/org/apache/sysml/api/dl/Caffe2DML.scala
@@ -291,7 +291,7 @@ class Caffe2DML(val sc: SparkContext, val solverParam:Caffe.SolverParameter,
 	  appendVisualizationHeaders(dmlScript, numTabs)
 	  
 	  if(Caffe2DML.USE_NESTEROV_UDF) {
-	    tabDMLScript(dmlScript, numTabs).append("update_nesterov = externalFunction(matrix[double] X, matrix[double] dX, double lr, double mu, matrix[double] v) return (matrix[double] X, matrix[double] v) implemented in (classname=\"org.apache.sysml.udf.lib.SGDNesterovUpdate\",exectype=\"mem\");  \n")
+	    tabDMLScript(dmlScript, numTabs).append("update_nesterov = externalFunction(matrix[double] X, matrix[double] dX, double lr, double mu, matrix[double] v, double lambda) return (matrix[double] X, matrix[double] v) implemented in (classname=\"org.apache.sysml.udf.lib.SGDNesterovUpdate\",exectype=\"mem\");  \n")
 	  }
 	  
 	  // Read and convert to one-hot encoding

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/5489c665/src/main/scala/org/apache/sysml/api/dl/CaffeSolver.scala
----------------------------------------------------------------------
diff --git a/src/main/scala/org/apache/sysml/api/dl/CaffeSolver.scala b/src/main/scala/org/apache/sysml/api/dl/CaffeSolver.scala
index ae3d21d..0620e44 100644
--- a/src/main/scala/org/apache/sysml/api/dl/CaffeSolver.scala
+++ b/src/main/scala/org/apache/sysml/api/dl/CaffeSolver.scala
@@ -144,12 +144,15 @@ class AdaGrad(lambda:Double=5e-04, epsilon:Double=1e-6) extends CaffeSolver {
  */
 class Nesterov(lambda:Double=5e-04, momentum:Double=0.9) extends CaffeSolver {
   def update(dmlScript:StringBuilder, layer:CaffeLayer):Unit = {
-    l2reg_update(lambda, dmlScript, layer)
     val fn = if(Caffe2DML.USE_NESTEROV_UDF) "update_nesterov" else "sgd_nesterov::update"
+    val lastParameter = if(Caffe2DML.USE_NESTEROV_UDF) (", " + lambda) else ""
+    if(!Caffe2DML.USE_NESTEROV_UDF) {
+      l2reg_update(lambda, dmlScript, layer)
+    }
     if(layer.shouldUpdateWeight) dmlScript.append("\t").append("["+ commaSep(layer.weight, layer.weight+"_v") + "] " + 
-        "= " + fn + "(" + commaSep(layer.weight, layer.dWeight, getWeightLr(layer), momentum.toString, layer.weight+"_v") + ")\n")
+        "= " + fn + "(" + commaSep(layer.weight, layer.dWeight, getWeightLr(layer), momentum.toString, layer.weight+"_v") + lastParameter + ")\n")
     if(layer.shouldUpdateBias) dmlScript.append("\t").append("["+ commaSep(layer.bias, layer.bias+"_v") + "] " + 
-        "= " + fn + "(" + commaSep(layer.bias, layer.dBias, getBiasLr(layer), momentum.toString, layer.bias+"_v") + ")\n")
+        "= " + fn + "(" + commaSep(layer.bias, layer.dBias, getBiasLr(layer), momentum.toString, layer.bias+"_v") + lastParameter + ")\n")
   }
   def init(dmlScript:StringBuilder, layer:CaffeLayer):Unit = {
     if(layer.shouldUpdateWeight) dmlScript.append(layer.weight+"_v = sgd_nesterov::init(" + layer.weight + ")\n")
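
With USE_NESTEROV_UDF enabled, the solver now passes the regularization
constant as the sixth argument instead of emitting a separate l2reg_update
statement, so the generated DML looks roughly like this (hypothetical layer
name W1; the learning-rate expression actually comes from getWeightLr(layer),
and lambda is shown at its default of 5e-04):

    [W1, W1_v] = update_nesterov(W1, dW1, lr_W1, 0.9, W1_v, 5.0E-4)

With the UDF disabled, l2reg_update is still emitted and sgd_nesterov::update
is called with the original five arguments.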